def process_all_html_files():
    """ Get all files in ../html_files and preprocess, then dump results in a CSV """
    article_filenames = []
    processed_articles = []
    processor = get_html_preprocessor()
    html_dir = '../html_files'
    # Iterate through all HTML files
    for f in os.listdir(html_dir):
        filepath = f"{html_dir}/{f}"
        with open(filepath, 'r', errors='ignore') as c:
            print(f)
            html_content = c.read()
        processed = processor.process(html_content).text.strip()
        if len(processed) == 0:
            print(f"{f} processing gave empty body")
        article_filenames.append(f)
        processed_articles.append(processed)
    results_df = pd.DataFrame(data={
        "file": article_filenames,
        "processed": processed_articles
    })
    save_results(results_df, "html_processor", "html_processor")
def main():
    html_preprocessor = get_html_preprocessor()
    error_files = []
    for idx, claim in train_data_generator(TRAIN_DATA_PATH + "train.json"):
        if idx < PROCESS_RANGE.start:
            continue
        elif idx >= PROCESS_RANGE.stop:
            break
        # Check each locally stored related article and record files that yield no text
        for related_article in claim.related_articles:
            article_html = get_train_article(TRAIN_DATA_PATH, related_article.filepath)
            if not html_preprocessor.process(article_html).text:
                print(related_article.filepath)
                error_files.append(related_article.filepath)
    output = '\n'.join(error_files)
    with open('../output/html_processor/html_unprocessable_files.txt', 'a') as f:
        f.write(output)
def preprocess_articles_from_data_pkl(pkl_path, save_path):
    """ Preprocess related_articles from raw html pickle """
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    df = pd.read_pickle(pkl_path)
    processed_articles_col = []
    for idx, row in df.iterrows():
        print_ex = idx % 100 == 0
        if print_ex:
            print(idx)
        html_articles = row["related_articles"]
        processed_articles = []
        # Use a separate index for the inner loop so it does not shadow the row index
        for article_idx, article in enumerate(html_articles):
            html_processed = html_preprocessor.process(article)
            if not html_processed.text:
                row_id = row["id"]
                print(f"No text found in one article for claim ID {row_id}")
            sentences = text_util.tokenize_by_sentence(html_processed.text)
            text_preprocessed_sentences = text_preprocessor.process_sentences(sentences)
            processed_articles.append(' '.join(text_preprocessed_sentences))
            if print_ex and article_idx == 0:
                print("== HTML Preprocessed ==")
                print(html_processed.text)
                print("== Text Preprocessed ==")
                print(' '.join(text_preprocessed_sentences))
                print("\n")
        processed_articles_col.append(processed_articles)
    # Drop HTML articles
    df = df.drop(['related_articles'], axis=1)
    # Add processed articles
    df['related_articles'] = processed_articles_col
    df.to_pickle(save_path)
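# Usage sketch for the helper above (both pickle paths are hypothetical examples,
# not filenames taken from this repository):
#   preprocess_articles_from_data_pkl("../output/train_raw_html.pkl",
#                                     "../output/train_preprocessed.pkl")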
def main():
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    # Create the appropriate relevance scorer
    relevance_scorer = (get_infersent_relevance_scorer()
                        if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER
                        else get_word2vec_relevance_scorer())
    # Outputs
    ids = []
    original_claims = []
    processed_claims = []
    queries = []
    processed_sentences = []
    true_labels = []
    for idx, claim in enumerate(train_data_generator()):
        if idx == NUM_EXAMPLES:
            break
        print(idx)
        query = query_generator.get_query(claim)
        search_res = client.search(query)
        # Preprocessed claim
        processed_claim = ' '.join(text_preprocessor.process(claim.claim).bert_sentences)
        ids.append(claim.id)
        original_claims.append(claim.claim)
        true_labels.append(claim.label)
        queries.append(query)
        processed_claims.append(processed_claim)
        # Continue in case of error
        if search_res.error:
            processed_sentences.append(f"Error: {search_res.error}")
            continue
        # Create master list of sentences
        sentences = []
        for article in search_res.results:
            html_processed = html_preprocessor.process(article.content).text
            text_processed = text_preprocessor.process(html_processed)
            sentences += text_processed.bert_sentences
        # Run relevance scores
        if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER:
            relevances = get_infersent_relevances(claim.claim, sentences, relevance_scorer)
        else:
            relevances = get_word2vec_relevances(claim.claim, sentences, relevance_scorer)
        # Combine the two results
        processed_sentences_with_relevance = list(zip(relevances, sentences))
        # Construct final string, most relevant sentences first
        processed_sentences_with_relevance.sort(key=lambda item: item[0], reverse=True)
        process_result = ""
        for rel, sent in processed_sentences_with_relevance:
            if len(process_result) > 10000:
                # Some basic truncation to limit file size
                break
            process_result += f"|SEP| {rel}: {sent} \n"
        processed_sentences.append(process_result)
    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(data={
        "id": ids,
        "label": true_labels,
        "original": original_claims,
        "query": queries,
        "processed_claim": processed_claims,
        "processed_sentences": processed_sentences
    })
    save_results(export_df, "sentence_relevance_scorer",
                 f"claim_to_{RELEVANCE_TYPE}_relevance")
def process_file(filepath):
    """ Preprocess a single HTML file and print the extracted text """
    processor = get_html_preprocessor()
    with open(filepath, 'r', errors='ignore') as c:
        html_content = c.read()
    processed = processor.process(html_content).text.strip()
    print(processed)
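# Minimal usage sketch, assuming this module is run directly as a script;
# the file path below is a hypothetical example, not taken from the original source.
if __name__ == "__main__":
    process_file("../html_files/example_article.html")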
def test_document_relevance_scorer():
    """
    Runs the pipeline up to retrieving articles and ranking them by relevance.
    If any retrieved article also appears in the claim's related_articles,
    records the average rank of those shared articles.
    """
    num_examples = 1000  # Limit # of examples so this runs faster
    bqg = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    article_relevance_scorer = get_lsa_relevance_scorer()
    total_searched = 0
    average_rankings = []
    for idx, claim in train_data_generator(
            "/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
        if idx == num_examples:
            break
        print(idx)
        # Execute query
        q = bqg.get_query(claim)
        searched_articles = client.search(q).results
        # Process articles from raw HTML to parsed text
        pipeline_articles: List[PipelineArticle] = []
        for raw_article in searched_articles:
            if raw_article and raw_article.content:
                pipeline_article = PipelineArticle(raw_article)
                # Extract data from HTML
                html_process_result = html_preprocessor.process(raw_article.content)
                pipeline_article.html_attributes = html_process_result.html_atts
                pipeline_article.raw_body_text = html_process_result.text
                pipeline_articles.append(pipeline_article)
        # Get Article Relevance
        preprocessed_claim_sentences = text_preprocessor.process(
            claim.claim + " " + claim.claimant).sentences
        preprocessed_claim = claim.claim + " " + claim.claimant
        if preprocessed_claim_sentences:
            preprocessed_claim = preprocessed_claim_sentences[0]
        pipeline_article_texts: List[str] = [p.raw_body_text for p in pipeline_articles]
        article_relevances = article_relevance_scorer.analyze(
            preprocessed_claim, pipeline_article_texts)
        for article_relevance, pipeline_article in zip(article_relevances, pipeline_articles):
            # Sometimes we get nan from numpy operations
            pipeline_article.relevance = article_relevance if math.isfinite(article_relevance) else 0
        # Sort articles by relevance, most relevant first
        pipeline_articles.sort(key=lambda article: article.relevance, reverse=True)
        sorted_urls = [article.url for article in pipeline_articles]
        claim_urls = [article.url for article in claim.related_articles]
        common_urls = list(set(sorted_urls).intersection(claim_urls))
        total_searched += 1
        if common_urls:
            # Determine the average index of the shared urls in the sorted urls
            index_sum = 0
            for url in common_urls:
                index_sum += sorted_urls.index(url)
            average_rankings.append(float(index_sum) / len(common_urls))
    print("RESULTS")
    print(total_searched)
    print(len(average_rankings))
    if average_rankings:  # Guard against no shared articles being found
        print(float(sum(average_rankings)) / len(average_rankings))
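# Toy walkthrough of the ranking metric above (illustrative values only):
# if the retrieved articles sort to [urlA, urlB, urlC] and the claim's
# related_articles contain urlB and urlC, the shared indices are 1 and 2,
# so this claim contributes (1 + 2) / 2 = 1.5 to average_rankings.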
def main(): """ For each claim, run query generator and rank the results by relevance using the LSA document relevance scorer """ # Services query_generator = get_query_generator() client = get_search_client() html_preprocessor = get_html_preprocessor() relevance_scorer = get_lsa_relevance_scorer() # Outputs ids = [] original_claims = [] ranked_articles_for_claims = [] for idx, claim in train_data_generator( "/Users/frankjia/Desktop/LeadersPrize/train/train.json"): if idx == NUM_EXAMPLES: break print(idx) query = query_generator.get_query(claim) search_res = client.search(query) ids.append(claim.id) original_claims.append(claim.claim) # Create master list of sentences article_texts = [] article_urls = [] for article in search_res.results: # Write the article for future checking url_filename = re.sub( r"([/:.])+", "_", article.url) # Create a save-friendly filename filepath = f"output/{claim.id}/{url_filename}.html" os.makedirs(os.path.dirname(filepath), exist_ok=True) with open(filepath, 'a') as f: f.write(article.content) # Process the articles html_processed_text = html_preprocessor.process( article.content).text article_urls.append(article.url) article_texts.append(html_processed_text) # Both claim and article_texts are unpreprocessed - LSA class currently does the preprocessing relevances = relevance_scorer.analyze(claim.claim, article_texts) print(relevances) articles_with_relevances = list(zip(article_urls, relevances)) articles_with_relevances.sort(key=lambda x: x[1], reverse=True) # Create an export string with the URL and the relevance: article_rank_result = "" for url, rel in articles_with_relevances: article_rank_result += f"( {rel}: {url} )" ranked_articles_for_claims.append(article_rank_result) # Export the result, with relevance scores and the processed text export_df = pd.DataFrame( data={ "id": ids, "claim": original_claims, "ranked_articles": ranked_articles_for_claims }) save_results(export_df, "document_relevance_scorer", "claim_to_ranked_articles")