def test_website_patterns_condition(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc, website_patterns=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc, website_patterns=[".*ABc", ".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
def test_url_patterns_condition(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc, url_patterns=[".*unittest", ".*zxc"])
    res_false = default_doc_selector.select_document(
        doc, url_patterns=[".*ZXc", ".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
def test_json_paths_and_json_paths_regex(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc, json_paths=["$.website"],
        json_paths_regex=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc, json_paths=["$.website"],
        json_paths_regex=[".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
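# A minimal pure-Python sketch (not the etk implementation) of the matching
# semantics the tests above suggest: within one pattern list a single match is
# enough (logical OR -- ".*abc" fails yet res_true is True), and matching is
# case-sensitive (".*ABc" does not hit). The value "unittest" below is an
# assumption about sample_input, and re.fullmatch is only one plausible
# reading of how etk applies the ".*foo"-style patterns.
import re

def any_pattern_matches(value: str, patterns: list) -> bool:
    # True as soon as any pattern in the list matches the whole value
    return any(re.fullmatch(p, value) for p in patterns)

assert any_pattern_matches("unittest", [".*unittest", ".*abc"])   # first pattern hits
assert not any_pattern_matches("unittest", [".*ABc", ".*hhhh"])   # case-sensitive, no hit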
def document_selector(self, doc: Document) -> bool:
    """
    Boolean function for selecting documents to process.

    Args:
        doc: Document

    Returns:
        True if the document should be processed, False otherwise.
    """
    return DefaultDocumentSelector().select_document(doc)
def test_all_condition(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc,
        datasets=[".*unittest", ".*abc"],
        url_patterns=[".*unittest", ".*zxc"],
        website_patterns=[".*unittest", ".*abc"],
        json_paths=["$.website"],
        json_paths_regex=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc,
        datasets=[".*abc", ".*hhhh"],
        url_patterns=[".*ZXc", ".*hhhh"],
        website_patterns=[".*ABc", ".*hhhh"],
        json_paths=["$.website"],
        json_paths_regex=[".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
def document_selector(self, doc) -> bool:
    """
    Boolean function for selecting documents to process.

    Args:
        doc: Document

    Returns:
        True if the document should be processed, False otherwise.
    """
    # Match all the IFPs to this news article; record this news article as
    # relevant for all IFPs with similarity above the threshold.
    return DefaultDocumentSelector().select_document(doc)
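# With no keyword arguments, select_document presumably accepts every
# document. A hedged sketch of a narrower selector built on the same API:
# the keyword arguments mirror those exercised in the tests above; the
# website pattern and regex shown here are hypothetical placeholders, while
# the "$.website" path is the one used in the tests.
def selective_document_selector(self, doc) -> bool:
    return DefaultDocumentSelector().select_document(
        doc,
        website_patterns=[".*example-news-site"],   # hypothetical pattern
        json_paths=["$.website"],                   # path from the tests above
        json_paths_regex=[".*example"])             # hypothetical regex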
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.doc_selector = DefaultDocumentSelector()
    self.incomp_decoder = DecodingValueExtractor(
        self.incomp_type, 'Incomp Decoder')
    self.int_decoder = DecodingValueExtractor(
        self.int_event_type, 'Int Decoder')
    self.int_fatalities_decoder = DecodingValueExtractor(
        self.int_fatalities, 'Int Fatalities Decoder')
    self.int_fatalities_size_lower_decoder = DecodingValueExtractor(
        self.int_fatalities_size_lower,
        'Int Fatalities Lower Bound Size Decoder')
    self.int_fatalities_size_upper_decoder = DecodingValueExtractor(
        self.int_fatalities_size_upper,
        'Int Fatalities Upper Bound Size Decoder',
        default_action="delete")
    self.int_causeex_decoder = DecodingValueExtractor(
        self.int_causeex_type,
        'Int CauseEx Type',
        default_action="delete")
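# The mappings handed to DecodingValueExtractor above (self.incomp_type,
# self.int_event_type, ...) are class attributes not shown in this snippet.
# A minimal sketch of their presumed shape, with purely hypothetical keys and
# labels: DecodingValueExtractor decodes a raw value by looking it up in such
# a mapping, and default_action="delete" presumably drops values that have no
# entry rather than passing them through.
incomp_type = {
    "0": "decoded_label_a",  # hypothetical code -> label pair
    "1": "decoded_label_b",  # hypothetical code -> label pair
}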
import json
import re
import sys
from heapq import heappush, heapreplace

import spacy

# Assumes ETK, KGSchema, DateExtractor, DefaultDocumentSelector and
# DocRetrieveProcessor are imported from the etk package.


def main():
    filename = sys.argv[1]
    query_title = sys.argv[2]
    ranking_criteria = sys.argv[3]
    top_k = int(sys.argv[4])  # argv values are strings; cast for the heap-size comparison below

    if ranking_criteria not in ('TITLE', 'SENTENCE'):
        print('Wrong mode! Please check the input argument!')
        return

    master_config = {
        "fields": {
            "developer": {"type": "string"},
            "student_developer": {"type": "string"},
            "spacy_name": {"type": "string"},
            "date": {"type": "date"}
        }
    }

    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./extraction_modules/"])
    nlp = spacy.load('en_core_web_lg')
    date_extractor = DateExtractor(etk=etk)

    queries = dict()
    queries_ent_map = dict()
    with open(query_title) as f:
        for line in f:
            line = line.strip()  # drop the trailing newline so it cannot leak into the output filename
            orig_ifp_title = line

            # remove date information from the query term
            res = date_extractor.extract(text=line)
            start, end = float('inf'), -1
            for i in res:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete the date span from the query term
            if len(res) != 0:
                line = line[:start] + line[end + 1:]
            queries[orig_ifp_title] = line
            queries_ent_map[line] = list()

            # extract entities from the query term
            doc = nlp(line)
            for ent in doc.ents:
                queries_ent_map[line].append(re.escape(ent.text.strip()))
            # remove empty entities
            queries_ent_map[line] = list(filter(bool, queries_ent_map[line]))

    # the list of selected docs for a given query term
    query_docs_mapping = dict()

    docs = list()
    with open(filename) as f:
        for line in f:
            json_obj = json.loads(line)
            docs.append(etk.create_document(json_obj))

    ds = DefaultDocumentSelector()
    for orig_query, proc_query in queries.items():
        content_regex = queries_ent_map[proc_query]
        query_docs_mapping[proc_query] = list()
        for doc in docs:
            if len(content_regex) == 0 \
                    or ds.select_document(document=doc,
                                          json_paths=['$.lexisnexis.doc_description'],
                                          json_paths_regex=content_regex):
                query_docs_mapping[proc_query].append(doc)

    # TODO: pass ifp_id in
    for orig_query, proc_query in queries.items():
        # print(len(query_docs_mapping[proc_query]))
        dr_processor = DocRetrieveProcessor(etk=etk, ifp_id="1233",
                                            ifp_title=proc_query,
                                            orig_ifp_title=orig_query)
        heap = list()
        for doc in query_docs_mapping[proc_query]:
            processed_doc = dict()
            if ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(doc=doc, threshold=0).cdr_document
            elif ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(doc=doc, threshold=0).cdr_document

            if len(heap) < top_k:
                heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))
            elif processed_doc['similarity'] > heap[0][0]:
                # replace the current minimum so the heap never grows past top_k
                heapreplace(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))

        heap.sort(reverse=True)

        # name the output file after this query's original IFP title
        output_filename = './resources/output/' + orig_query + "_result.jl"
        with open(output_filename, 'a+b') as f:
            for item in heap:
                print(item[0])
                jl_str = json.dumps(item[2]) + '\n'
                f.write(jl_str.encode())
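# Example invocation (file names are hypothetical), matching the four
# positional arguments read at the top of main():
#
#   python retrieve_docs.py news_docs.jl ifp_titles.txt SENTENCE 10

if __name__ == '__main__':
    main()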