import os
import re
from datetime import datetime
from typing import Dict, List

import pandas as pd

# Project-local helpers and types (get_html_preprocessor, get_query_generator,
# save_results, train_data_generator, PipelineSentence, LeadersPrizePipeline,
# etc.) are assumed importable from the repository's own modules; their import
# paths are not shown in this excerpt.


def process_all_html_files():
    """ Get all files in ../html_files and preprocess, then dump results in a CSV """
    article_filenames = []
    processed_articles = []
    processor = get_html_preprocessor()
    html_dir = '../html_files'
    # Iterate through all HTML files
    for f in os.listdir(html_dir):
        filepath = f"{html_dir}/{f}"
        with open(filepath, 'r', errors='ignore') as c:
            print(f)
            html_content = c.read()
            processed = processor.process(html_content).text.strip()
            if len(processed) == 0:
                print(f"{f} processing gave empty body")
        article_filenames.append(f)
        processed_articles.append(processed)
    results_df = pd.DataFrame(data={
        "file": article_filenames,
        "processed": processed_articles
    })
    save_results(results_df, "html_processor", "html_processor")
def create_and_export_queries():
    """ Output the generated queries only """
    num_examples = 100
    bqg = get_query_generator()
    truth_tup_extractor = get_truth_tuple_extractor()
    ids = []
    original_claims = []
    processed = []
    for idx, claim in enumerate(train_data_generator()):
        if idx == num_examples:
            break
        ids.append(claim.id)
        original_claims.append(claim.claim)
        claim_truth_tuples = truth_tup_extractor.extract(
            claim.claimant + " " + claim.claim)
        processed.append(bqg.get_query(claim, truth_tuples=claim_truth_tuples))
    export_df = pd.DataFrame(data={
        "id": ids,
        "original": original_claims,
        "queries": processed
    })
    save_results(export_df, "basic_query_generator", "queries")
def main():
    # Get text from HTML preprocess output
    text_df = pd.read_csv(
        "../output/html_processor/html_processor_01-23-2020_20-08-47.csv")
    preprocessor = get_text_preprocessor()
    texts = list(text_df["processed"])
    processed = __preprocess(preprocessor, texts)
    export_df = pd.DataFrame(data={"original": texts, "processed": processed})
    save_results(export_df, "text_preprocessor", "text_preprocessor_alphanum")
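# `__preprocess` is referenced above but not defined in this excerpt. Below is
# a minimal sketch of what it is assumed to do, based on how the text
# preprocessor's `bert_sentences` output is consumed in the relevance-scorer
# script further down; the actual helper may differ.
def __preprocess(preprocessor, texts: List[str]) -> List[str]:
    processed = []
    for text in texts:
        result = preprocessor.process(text)
        # Rejoin the preprocessed sentences into a single string per document
        processed.append(' '.join(result.bert_sentences))
    return processed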
def execute_queries_export_urls():
    """
    Output the queries as well as the resulting URLs from the search client.
    Will also create the HTML files returned from the query.
    """
    num_examples = 30  # Limit # of examples so this runs faster
    bqg = get_query_generator()
    client = get_search_client()
    timestamp = get_timestamp()
    ids = []
    original_claims = []
    data = []
    for idx, claim in enumerate(train_data_generator()):
        if idx == num_examples:
            break
        ids.append(claim.id)
        original_claims.append(claim.claim)
        q = bqg.get_query(claim)
        res = client.search(q)
        export_str = ""
        for i, r in enumerate(res.results):
            # Add to URLs
            export_str += f"{r.score}: {r.url} | "
            # Write result
            filepath = f"output/basic_query_generator/query_html_{timestamp}/{claim.id}_{i}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'a') as f:
                f.write(r.content)
        data.append(export_str)
    export_df = pd.DataFrame(data={
        "id": ids,
        "original": original_claims,
        "results": data
    })
    save_results(export_df, "basic_query_generator", "query_to_url",
                 time_str=timestamp)
def test_extraction_from_preprocessed_train_data(processed_pkl_path: str):
    # Get relevance scorer
    relevance_scorer = get_word2vec_relevance_scorer()
    # Get extractor
    sentence_extractor = get_relevant_info_extractor()
    # Read the train data
    preprocessed_df = get_preprocessed_train_data(processed_pkl_path)
    claims = []
    extracted_info = []
    for idx, row in preprocessed_df.iterrows():
        claim: str = row["claim"]
        related_articles: List[str] = row["related_articles"]
        pipeline_sentences: List[PipelineSentence] = []
        # Run relevance scorer - put all the articles together
        for article in related_articles:
            article_sentences = tokenize_by_sentence(article)
            for sent in article_sentences:
                relevance = relevance_scorer.get_relevance(claim, sent)
                pipeline_sent = PipelineSentence(sent)
                pipeline_sent.relevance = relevance
                pipeline_sentences.append(pipeline_sent)
        # Extract sentences from annotated items
        extracted_sentences = sentence_extractor.extract(pipeline_sentences,
                                                         window=1)
        claims.append(claim)
        extracted_info.append(' . '.join(
            [x.sentence for x in extracted_sentences]))
    # Export results
    export_df = pd.DataFrame(data={
        "claim": claims,
        "extracted": extracted_info
    })
    save_results(export_df, "relevant_information_extractor", "w2v_windowsize1")
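# `get_preprocessed_train_data` is not defined in this excerpt. Given the .pkl
# path it receives and the DataFrame it must return, it is assumed to be a thin
# wrapper around pandas' pickle reader; the real helper may do more.
def get_preprocessed_train_data(processed_pkl_path: str) -> pd.DataFrame:
    # Expected columns: "claim" (str) and "related_articles" (List[str])
    return pd.read_pickle(processed_pkl_path)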
def main():
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    # Create the appropriate relevance scorer
    relevance_scorer = get_infersent_relevance_scorer() \
        if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER \
        else get_word2vec_relevance_scorer()
    # Outputs
    ids = []
    original_claims = []
    processed_claims = []
    queries = []
    processed_sentences = []
    true_labels = []
    for idx, claim in enumerate(train_data_generator()):
        if idx == NUM_EXAMPLES:
            break
        print(idx)
        query = query_generator.get_query(claim)
        search_res = client.search(query)
        processed_claim = ' '.join(
            text_preprocessor.process(claim.claim).bert_sentences)  # Preprocessed claim
        ids.append(claim.id)
        original_claims.append(claim.claim)
        true_labels.append(claim.label)
        queries.append(query)
        processed_claims.append(processed_claim)
        # Continue in case of error
        if search_res.error:
            processed_sentences.append(f"Error: {search_res.error}")
            continue
        # Create master list of sentences
        sentences = []
        for article in search_res.results:
            html_processed = html_preprocessor.process(article.content).text
            text_processed = text_preprocessor.process(html_processed)
            sentences += text_processed.bert_sentences
        # Run relevance scores
        if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER:
            relevances = get_infersent_relevances(claim.claim, sentences,
                                                  relevance_scorer)
        else:
            relevances = get_word2vec_relevances(claim.claim, sentences,
                                                 relevance_scorer)
        # Combine the two results
        processed_sentences_with_relevance = list(zip(relevances, sentences))
        # Construct final string
        processed_sentences_with_relevance.sort(key=lambda item: item[0],
                                                reverse=True)
        process_result = ""
        for rel, sent in processed_sentences_with_relevance:
            if len(process_result) > 10000:
                # Some basic truncation to limit file size
                break
            process_result += f"|SEP| {rel}: {sent} \n"
        processed_sentences.append(process_result)
    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "label": true_labels,
            "original": original_claims,
            "query": queries,
            "processed_claim": processed_claims,
            "processed_sentences": processed_sentences
        })
    save_results(export_df, "sentence_relevance_scorer",
                 f"claim_to_{RELEVANCE_TYPE}_relevance")
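# Neither relevance helper called above is defined in this excerpt. Below is a
# minimal sketch of the word2vec variant, assuming the scorer exposes the same
# per-sentence `get_relevance(claim, sentence)` method used in the extraction
# test earlier; the InferSent variant presumably batches sentences instead.
def get_word2vec_relevances(claim: str, sentences: List[str],
                            relevance_scorer) -> List[float]:
    # Score each candidate sentence against the claim individually
    return [relevance_scorer.get_relevance(claim, sent) for sent in sentences]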
def main():
    """
    For each claim, run the query generator and rank the results by relevance
    using the LSA document relevance scorer
    """
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    relevance_scorer = get_lsa_relevance_scorer()
    # Outputs
    ids = []
    original_claims = []
    ranked_articles_for_claims = []
    for idx, claim in train_data_generator(
            "/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
        if idx == NUM_EXAMPLES:
            break
        print(idx)
        query = query_generator.get_query(claim)
        search_res = client.search(query)
        ids.append(claim.id)
        original_claims.append(claim.claim)
        # Create master list of article texts
        article_texts = []
        article_urls = []
        for article in search_res.results:
            # Write the article for future checking
            url_filename = re.sub(
                r"([/:.])+", "_", article.url)  # Create a save-friendly filename
            filepath = f"output/{claim.id}/{url_filename}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'a') as f:
                f.write(article.content)
            # Process the articles
            html_processed_text = html_preprocessor.process(article.content).text
            article_urls.append(article.url)
            article_texts.append(html_processed_text)
        # Both claim and article_texts are unpreprocessed - the LSA class
        # currently does the preprocessing
        relevances = relevance_scorer.analyze(claim.claim, article_texts)
        print(relevances)
        articles_with_relevances = list(zip(article_urls, relevances))
        articles_with_relevances.sort(key=lambda x: x[1], reverse=True)
        # Create an export string with the URL and the relevance
        article_rank_result = ""
        for url, rel in articles_with_relevances:
            article_rank_result += f"( {rel}: {url} )"
        ranked_articles_for_claims.append(article_rank_result)
    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "claim": original_claims,
            "ranked_articles": ranked_articles_for_claims
        })
    save_results(export_df, "document_relevance_scorer",
                 "claim_to_ranked_articles")
def test_pipeline(process_range: range, config: Dict, train_data_path: str):
    raw_claims: List[LeadersPrizeClaim] = []
    raw_claim_dicts: List[dict] = []
    init_articles = not config.get(PipelineConfigKeys.RETRIEVE_ARTICLES, True)
    if init_articles:
        print("Reading articles from training data. Will not call search client")
    for idx, claim_dict, claim in train_data_generator(
            train_data_path + "trial_combined_data_long.json"):
        if idx < process_range.start:
            continue
        elif idx >= process_range.stop:
            break
        # Add the articles if we're not retrieving from search client
        if init_articles:
            articles: List[SearchQueryResult] = []
            for related_article in claim.related_articles:
                article_html = get_train_article(train_data_path,
                                                 related_article.filepath)
                articles.append(
                    SearchQueryResult(content=article_html,
                                      url=related_article.url))
            claim.mock_search_results = articles
        raw_claims.append(claim)
        raw_claim_dicts.append(claim_dict)

    start_time = datetime.now()
    # Create pipeline
    pipeline = LeadersPrizePipeline(config)
    # Run the prediction
    results = pipeline.predict(raw_claims)
    print(f"{len(results)} processed in {datetime.now() - start_time}")

    # Export results
    claims = []
    labels = []
    reasoner_inputs = []
    pred_labels = []
    supporting_article_urls = []
    explanations = []
    for res in results:
        claims.append(res.preprocessed_claim)
        labels.append(res.original_claim.label)
        reasoner_input = ""
        for idx, sent in enumerate(res.sentences_for_transformer):
            if idx == 10:
                break
            reasoner_input += " " + sent.preprocessed_text
        reasoner_inputs.append(reasoner_input)
        pred_labels.append(res.submission_label)
        supporting_article_urls.append(", ".join(
            res.submission_article_urls.values()))
        explanations.append(res.submission_explanation)
    results_df = pd.DataFrame(
        data={
            "claim": claims,
            "label": labels,
            "reasoner_input": reasoner_inputs,
            "predicted": pred_labels,
            "article_urls": supporting_article_urls,
            "explanation": explanations
        })
    # Get accuracies
    eval_predictions(labels, pred_labels)
    # Get datacup score
    eval_datacup(raw_claim_dicts, results)
    save_results(results_df, "pipeline_test", "full_pipeline")
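# Hypothetical invocation sketch: runs the pipeline test over the first 10
# claims with article retrieval disabled, so articles are read from the
# training data instead of fetched via the search client. The config key is
# taken from the function body above; the train data path is an assumption
# (it must end in a path separator, since the filename is concatenated onto it).
if __name__ == "__main__":
    config = {PipelineConfigKeys.RETRIEVE_ARTICLES: False}
    test_pipeline(range(0, 10), config, train_data_path="../train/")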