class Wind:
    def __init__(self):
        # Entity extractors; names are extracted with ThuItemGetter as well,
        # as in the original source. Relation extraction is currently disabled.
        self.name_getter = ThuItemGetter()
        self.location_getter = ThuLocationGetter()
        self.item_getter = ThuItemGetter()
        self.relation_getter = None
        self.doc_reader = DocReader()

    def process(self, dir_name='data'):
        """Read every file in dir_name and yield the entities found in it."""
        if os.path.exists(dir_name) and os.path.isdir(dir_name):
            for file_name in os.listdir(dir_name):
                relative_file_name = dir_name + '/' + file_name
                self.doc_reader.load(relative_file_name)
                names = set()
                locations = set()
                items = set()
                relations = set()
                params = []
                for param in self.doc_reader.process():
                    params.append(param)
                    names = names.union(self.name_getter.process(param))
                    locations = locations.union(self.location_getter.process(param))
                    items = items.union(self.item_getter.process(param))
                # relations = relations.union(self.relation_getter.process(params, names))
                yield (file_name, names, locations, items, relations)

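# A minimal usage sketch (not part of the original source): Wind.process() is a
# generator, so results are consumed by iterating over it. The 'data' directory
# and the printing below are illustrative assumptions only.
if __name__ == '__main__':
    wind = Wind()
    for file_name, names, locations, items, relations in wind.process('data'):
        print(file_name)
        print('  names:     {}'.format(sorted(names)))
        print('  locations: {}'.format(sorted(locations)))
        print('  items:     {}'.format(sorted(items)))
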
def web_doc_similarity(self):
    """Compare each document paragraph against the scraped web content and keep
    the best-matching web page for every paragraph."""
    start = time.time()
    df = pd.read_csv(DOC_TEXTS)
    df2 = pd.read_csv(SCRAPE_RESULTS_CSV)
    dr = DocReader()

    # Convert web content and document paragraphs to synsets
    webSynSets = []
    docSynSets = []
    for r, t in zip(df2.content.tolist(), df.ParaText.tolist()):
        webSynSets.append(dr.str_to_synsets(str(r)))
        docSynSets.append(dr.str_to_synsets(str(t)))

    result_scores = 0.0
    indiv_scores = []
    simi_content = []
    simi_links = []

    # Keep the original df2 row index alongside each web synset list so lookups
    # stay correct after matched entries are removed below.
    web_entries = list(enumerate(webSynSets))
    for docsets in docSynSets:
        best_score = 0.0
        best_content = ""
        best_link = ""
        best_pos = 0
        for pos, (row_idx, websets) in enumerate(web_entries):
            simi = dr.quick_compare(docsets, websets)
            print(simi)
            if simi > 0.01 and simi > best_score:
                best_score = simi
                best_content = df2.loc[row_idx, 'content']
                best_link = df2.loc[row_idx, 'url']
                best_pos = pos
        if best_score > 0.0:
            # Remove the matched web page so it cannot be matched again
            del web_entries[best_pos]
        indiv_scores.append(best_score)
        result_scores += best_score
        simi_links.append(best_link)
        simi_content.append(best_content)

    end = time.time()
    print(end - start)
    return indiv_scores, simi_links, simi_content

class DocReaderTest(unittest.TestCase):
    def setUp(self):
        self.aq_root = '/dropbox/17-18/573/AQUAINT'
        self.aq2_root = '/dropbox/17-18/573/AQUAINT-2'
        self.eng_gw_root = '/dropbox/17-18/573/ENG-GW'
        self.doc_reader = DocReader(self.aq_root, self.aq2_root, self.eng_gw_root)

    def test_read_docs(self):
        # Read all docs from training test set
        input_xml_filename = 'src/test_data/aquaint_topics.xml'
        parsed = self.doc_reader.read_docs(input_xml_filename)
        print(json.dumps(parsed))

    def test_resolve_path(self):
        test_data = [
            ('XIE19990529.0166',
             (self.aq_root + '/xie/1999/19990529_XIN_ENG', 'AQUAINT')),
            ('APW19990421.0284',
             (self.aq_root + '/apw/1999/19990421_APW_ENG', 'AQUAINT')),
            ('NYT19990421.0284',
             (self.aq_root + '/nyt/1999/19990421_NYT', 'AQUAINT')),
            ('XIN_ENG_20050415.0040',
             (self.aq2_root + '/data/xin_eng/xin_eng_200504.xml', 'AQUAINT-2')),
            ('APW_ENG_20061002.1245',
             (self.eng_gw_root + '/data/apw_eng/apw_eng_200610.gz', 'ENG-GW')),
            ('APW_ENG_20061003.0134',
             (self.eng_gw_root + '/data/apw_eng/apw_eng_200610.gz', 'ENG-GW')),
        ]
        for doc_id, path in test_data:
            with self.subTest():
                self.assertEqual(path, self.doc_reader.resolve_path(doc_id))
        # Test invalid doc_id
        with self.assertRaises(ValueError):
            self.doc_reader.resolve_path('foo')

    def test_parse_doc(self):
        test_data = [
            ('/corpora/LDC/LDC02T31/apw/1998/19980601_APW_ENG',
             'AQUAINT', 'APW19980601.0007'),
            ('/corpora/LDC/LDC08T25/data/apw_eng/apw_eng_200601.xml',
             'AQUAINT-2', 'APW_ENG_20060101.0027'),
            ('/corpora/LDC/LDC11T07/data/apw_eng/apw_eng_200610.gz',
             'ENG-GW', 'APW_ENG_20061003.0134'),
        ]
        for path, format_name, doc_id in test_data:
            content = self.doc_reader.parse_doc(path, format_name, doc_id)
            print(content)

    def test_read(self):
        # input_xml_filename = '/dropbox/17-18/573/Data/Documents/devtest/GuidedSumm10_test_topics.xml'
        test_data_dir = os.path.dirname(os.path.realpath(__file__)) + '/test_data'
        input_xml_filename = test_data_dir + '/test_topics.xml'
        topics_data = self.doc_reader.read_docs(input_xml_filename)
        print(topics_data)

generate_data(dataset_folder=dataset_folder)

# Read corpus
with open(data_file, 'rb') as f:
    frame = pd.read_csv(f)

# Document retriever based on BM25
retriever = DocRetriever(top_n=10)
# Fit the BM25 retriever on the corpus
retriever.fit_retriever(frame)

# Reader based on the Hugging Face BertForQuestionAnswering transformer
# reader = DocReader('./model/')
reader = DocReader('bert-large-uncased-whole-word-masking-finetuned-squad')

# Find the top_n documents for the query using BM25
query = input("Enter the query (type exit to quit) : ")
while query != 'exit' and query != 'Exit':
    print("Processing...")
    doc_scores = retriever.compute_scores(query)
    # Select the top_n documents
    index = [score[0] for score in doc_scores]
    text = frame.loc[index]
    # Predict the n_best answers using BERT
    answers = reader.predict(df=text, query=query, n_best=5)
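    # A hedged sketch of how the loop could be completed (the original excerpt ends
    # at reader.predict): print the predicted answers and re-read the query so the
    # loop does not repeat the same question forever. The exact structure of
    # 'answers' depends on DocReader.predict and is an assumption here.
    print(answers)
    query = input("Enter the query (type exit to quit) : ")
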
init_dirs()

input_xml_filename = sys.argv[1]
output_base_dir = sys.argv[2]
core_nlp_port = int(sys.argv[3])
print(
    "Starting summarizer with input_xml_filename='{}', output_base_dir='{}', core_nlp_port='{}'"
    .format(input_xml_filename, output_base_dir, core_nlp_port))

os.makedirs(output_base_dir, exist_ok=True)

# Initialize document reader with AQUAINT, AQUAINT-2, and ENG-GW root paths
doc_reader = DocReader('/dropbox/17-18/573/AQUAINT',
                       '/dropbox/17-18/573/AQUAINT-2',
                       '/dropbox/17-18/573/ENG-GW')
summarizer = build_summarizer(core_nlp_port)

print('Reading in documents from "{}"...'.format(input_xml_filename))
topics_data = doc_reader.read_docs(input_xml_filename)['topics']

for i, topic in enumerate(topics_data):
    topic_id = topic['id']
    topic_title = topic['title']
    topic_category = topic['category']
    docset = topic['docset']
    print('Summarizing topic "{}" (topic {} of {}, {} documents)...'.format(
        topic_title, i + 1, len(topics_data), len(docset)))

class DocuBot():
    THRESH = 0.0
    doc = ""
    doc_local = ""
    doc_reader = DocReader()

    def __init__(self):
        self.THRESH = 0.5

    # SET FUNCTIONS
    def set_file_path(self, filename, option):
        if filename.lower().endswith('.docx') and os.path.isfile(filename):
            if option == 1:
                self.doc = filename
                if self.doc is not None:
                    print("File uploaded successfully")
                    return True
            if option == 2:
                self.doc_local = filename
                if self.doc_local is not None:
                    print("File 2 uploaded successfully")
                    return True
        else:
            print("Error: file type not supported; please make sure the file is a .docx file")
            return False

    def set_thresh(self, x):
        # Map a sensitivity level (1-3) to a similarity threshold
        sens = [0, 0.1, 0.5, 0.70]
        if (x > 0) and (x < 4):
            self.THRESH = sens[x]

    # GET FUNCTIONS
    def get_thresh(self):
        return self.THRESH

    # RUN ANALYZE
    def merge_duplicates(self, data):
        # FIX THIS MERGE PART
        try:
            dataset = data.to_frame()
        except Exception:
            dataset = data
        merged_dataset = dataset.groupby(
            ['simi_links', 'simi_content'])['indiv_scores'].mean().reset_index()
        return merged_dataset

    def analyze_simi_online(self):
        if os.path.exists(SIMI_RESULT):
            os.remove(SIMI_RESULT)
        start = time.time()

        filename = self.doc
        se = SearchEngine()
        se.get_simi_link(filename)
        sp = ScrapeProcessor()
        df = sp.check_simi()

        clean_df = self.merge_duplicates(df)
        indiv_scores = clean_df['indiv_scores'].astype(str).astype(float)
        scoreList = []
        for iscore in indiv_scores:
            scoreList.append(iscore)

        total_score = 0.0
        url_list = []
        for link, score in zip(clean_df['simi_links'].astype(str), scoreList):
            if score > self.THRESH:
                total_score += score
                url_list.append(link)

        clean_df.to_csv(SIMI_RESULT)
        end = time.time()
        print("Time taken to analyze: {}".format(end - start))
        try:
            output = (total_score / len(url_list)) * 100
            return output, url_list
        except ZeroDivisionError:
            # No links scored above the threshold
            return 0.0, ["None Found"]

    def analyze_simi_local(self, d1, d2):
        return self.doc_reader.doc_similarity(d1, d2)

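# A minimal usage sketch (assumptions only: 'report.docx' and the chosen sensitivity
# level are illustrative, and analyze_simi_online relies on SearchEngine and
# ScrapeProcessor being configured as elsewhere in the project).
if __name__ == '__main__':
    bot = DocuBot()
    if bot.set_file_path('report.docx', option=1):
        bot.set_thresh(2)  # 0.5 threshold, per the 'sens' table above
        score, urls = bot.analyze_simi_online()
        print('Similarity score: {:.1f}%'.format(score))
        for url in urls:
            print(url)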