Example #1
    def test_website_patterns_condition(self) -> None:
        doc = etk.create_document(sample_input)
        default_doc_selector = DefaultDocumentSelector()
        res_true = default_doc_selector.select_document(
            doc, website_patterns=[".*unittest", ".*abc"])
        res_false = default_doc_selector.select_document(
            doc, website_patterns=[".*ABc", ".*hhhh"])
        self.assertEqual(True, res_true)
        self.assertEqual(False, res_false)
Example #2
    def test_url_patterns_condition(self) -> None:
        etk = ETK()
        doc = etk.create_document(sample_input)
        default_doc_selector = DefaultDocumentSelector()
        res_true = default_doc_selector.select_document(
            doc, url_patterns=[".*unittest", ".*zxc"])
        res_false = default_doc_selector.select_document(
            doc, url_patterns=[".*ZXc", ".*hhhh"])
        self.assertEqual(True, res_true)
        self.assertEqual(False, res_false)
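These tests (and Examples #3 and #5 below) reference a module-level `sample_input` dict and an `etk` instance without showing their setup. A minimal sketch of that fixture; the import paths and the field values are assumptions chosen so the ".*unittest" patterns match and the ".*hhhh" patterns do not, not code taken from the source:

    from etk.etk import ETK
    from etk.document_selector import DefaultDocumentSelector

    # Hypothetical fixture: field values picked to satisfy the
    # patterns used in the tests above.
    sample_input = {
        "dataset": "unittest",
        "website": "www.unittest.com",
        "url": "http://www.unittest.com/zxc",
    }
    etk = ETK()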
Example #3
    def test_json_paths_and_json_paths_regex(self) -> None:
        doc = etk.create_document(sample_input)
        default_doc_selector = DefaultDocumentSelector()
        res_true = default_doc_selector.select_document(
            doc,
            json_paths=["$.website"],
            json_paths_regex=[".*unittest", ".*abc"])
        res_false = default_doc_selector.select_document(
            doc, json_paths=["$.website"], json_paths_regex=[".*hhhh"])
        self.assertEqual(True, res_true)
        self.assertEqual(False, res_false)
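Judging by the paired assertions, `select_document` evaluates each JSONPath in `json_paths` against the document and returns True only when the extracted values match the `json_paths_regex` patterns.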
Example #4
    def document_selector(self, doc: Document) -> bool:
        """
        Boolean function for selecting a document for processing.
        Args:
            doc: Document

        Returns:
            True if this module should process the document.
        """
        return DefaultDocumentSelector().select_document(doc)
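Called with no constraint arguments, `select_document` has nothing to check, so this selector accepts every document handed to the module. A sketch of a more selective variant, reusing the `website_patterns` argument shown in Example #1 (the pattern itself is illustrative):

    def document_selector(self, doc: Document) -> bool:
        # Only process documents whose website matches the pattern.
        return DefaultDocumentSelector().select_document(
            doc, website_patterns=[".*unittest"])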
Example #5
    def test_all_condition(self) -> None:
        doc = etk.create_document(sample_input)
        default_doc_selector = DefaultDocumentSelector()
        res_true = default_doc_selector.select_document(
            doc,
            datasets=[".*unittest", ".*abc"],
            url_patterns=[".*unittest", ".*zxc"],
            website_patterns=[".*unittest", ".*abc"],
            json_paths=["$.website"],
            json_paths_regex=[".*unittest", ".*abc"])
        res_false = default_doc_selector.select_document(
            doc,
            datasets=[".*abc", ".*hhhh"],
            url_patterns=[".*ZXc", ".*hhhh"],
            website_patterns=[".*ABc", ".*hhhh"],
            json_paths=["$.website"],
            json_paths_regex=[".*hhhh"])
        self.assertEqual(True, res_true)
        self.assertEqual(False, res_false)
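The paired assertions in Examples #1 through #5 suggest the matching semantics: within a single pattern list, any one match suffices (res_true passes even when only ".*unittest" can match), every condition supplied as a keyword argument must hold for `select_document` to return True, and matching is case-sensitive (".*ABc" fails where ".*abc" succeeds).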
Example #6
    def document_selector(self, doc: Document) -> bool:
        """
        Boolean function for selecting a document for processing.
        Args:
            doc: Document

        Returns:
            True if this module should process the document.
        """
        # Match every IFP against this news article; record the article as
        # relevant to each IFP whose similarity exceeds the threshold.
        return DefaultDocumentSelector().select_document(doc)
Example #7
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.doc_selector = DefaultDocumentSelector()
        # Each decoder translates raw field values through a class-level
        # mapping dict (self.incomp_type, self.int_event_type, ...);
        # default_action="delete" presumably drops unmapped values.
        self.incomp_decoder = DecodingValueExtractor(self.incomp_type, 'Incomp Decoder')
        self.int_decoder = DecodingValueExtractor(self.int_event_type, 'Int Decoder')
        self.int_fatalities_decoder = DecodingValueExtractor(self.int_fatalities, 'Int Fatalities Decoder')
        self.int_fatalities_size_lower_decoder = DecodingValueExtractor(self.int_fatalities_size_lower,
                                                                        'Int Fatalities Lower Bound Size Decoder')
        self.int_fatalities_size_upper_decoder = DecodingValueExtractor(self.int_fatalities_size_upper,
                                                                        'Int Fatalities Upper Bound Size Decoder',
                                                                        default_action="delete")
        self.int_causeex_decoder = DecodingValueExtractor(self.int_causeex_type,
                                                          'Int CauseEx Type',
                                                          default_action="delete")
Example #8
import json
import re
import sys
from heapq import heappush, heapreplace

import spacy

# etk import paths assumed; adjust to match the local package layout.
from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from etk.extractors.date_extractor import DateExtractor
from etk.document_selector import DefaultDocumentSelector
from etk.doc_retrieve_processor import DocRetrieveProcessor


def main():
    filename = sys.argv[1]
    query_title = sys.argv[2]
    ranking_criteria = sys.argv[3]
    top_k = int(sys.argv[4])  # argv values are strings; the heap-size check below needs an int

    if ranking_criteria not in ('TITLE', 'SENTENCE'):
        print('Invalid ranking criteria: expected TITLE or SENTENCE.')
        return

    master_config = {
        "fields": {
            "developer": {
                "type": "string"
            },
            "student_developer": {
                "type": "string"
            },
            "spacy_name": {
                "type": "string"
            },
            "date": {
                "type": "date"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./extraction_modules/"])
    nlp = spacy.load('en_core_web_lg')

    date_extractor = DateExtractor(etk=etk)

    queries = dict()
    queries_ent_map = dict()

    with open(query_title) as f:
        for line in f:
            # Strip the trailing newline so the title is safe to reuse
            # (it later names the per-query output file).
            line = line.strip()
            orig_ifp_title = line
            # remove date information from query term
            res = date_extractor.extract(text=line)
            start, end = float('inf'), -1
            for i in res:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete date from query term
            if len(res) != 0:
                line = line[:start] + line[end+1:]

            queries[orig_ifp_title] = line
            queries_ent_map[line] = list()
            # extract entities from query term
            doc = nlp(line)
            for ent in doc.ents:
                queries_ent_map[line].append(re.escape(ent.text.strip()))
            # remove empty entities
            queries_ent_map[line] = list(filter(bool, queries_ent_map[line]))

    # the list of selected docs for given query term
    query_docs_mapping = dict()

    docs = list()
    with open(filename) as f:
        for line in f:
            json_obj = json.loads(line)
            docs.append(etk.create_document(json_obj))

    ds = DefaultDocumentSelector()

    for orig_query, proc_query in queries.items():
        content_regex = queries_ent_map[proc_query]
        query_docs_mapping[proc_query] = list()
        for doc in docs:
            if len(content_regex) == 0 or ds.select_document(
                    document=doc,
                    json_paths=['$.lexisnexis.doc_description'],
                    json_paths_regex=content_regex):
                query_docs_mapping[proc_query].append(doc)

    # TODO: pass ifp_id in
    for orig_query, proc_query in queries.items():
        # print(len(query_docs_mapping[proc_query]))
        dr_processor = DocRetrieveProcessor(etk=etk, ifp_id="1233", ifp_title=proc_query, orig_ifp_title=orig_query)
        heap = list()
        for doc in query_docs_mapping[proc_query]:
            processed_doc = dict()

            if ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(doc=doc, threshold=0).cdr_document
            elif ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(doc=doc, threshold=0).cdr_document

            entry = (processed_doc['similarity'], processed_doc['date'], processed_doc)
            if len(heap) < top_k:
                heappush(heap, entry)
            elif entry[0] > heap[0][0]:
                # Replace the current lowest-similarity entry so the heap
                # never grows past top_k items.
                heapreplace(heap, entry)

        heap.sort(reverse=True)

        # orig_query (not the stale orig_ifp_title left over from the
        # file-reading loop) names the output file for this query.
        output_filename = './resources/output/' + orig_query + '_result.jl'

        with open(output_filename, 'a+b') as f:
            for item in heap:
                print(item[0])  # log the similarity score of each emitted document
                jl_str = json.dumps(item[2]) + '\n'
                f.write(jl_str.encode())


if __name__ == '__main__':
    main()
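A sketch of how the script would be invoked (the file names are illustrative):

    python main.py news_docs.jl ifp_titles.txt SENTENCE 20

The first argument is a JSON-lines file of documents, the second a file with one IFP title per line, the third the ranking criteria (TITLE or SENTENCE), and the fourth the number of top-ranked documents to keep per query.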