예제 #1
0
def process_all_html_files():
    """
    Preprocess every HTML file in ../html_files and dump the results to a CSV.

    Each file is read with decode errors ignored, run through the HTML
    preprocessor, and recorded (filename, processed text) via save_results.
    Files whose processing yields an empty body are reported but still saved.
    """
    article_filenames = []
    processed_articles = []
    processor = get_html_preprocessor()

    html_dir = '../html_files'
    # Iterate through all HTML files
    for f in os.listdir(html_dir):
        filepath = os.path.join(html_dir, f)
        # Skip subdirectories and other non-file entries: opening a
        # directory would raise IsADirectoryError and abort the run.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', errors='ignore') as c:
            print(f)
            html_content = c.read()
            processed = processor.process(html_content).text.strip()
            if not processed:
                print(f"{f} processing gave empty body")
            article_filenames.append(f)
            processed_articles.append(processed)

    results_df = pd.DataFrame(data={
        "file": article_filenames,
        "processed": processed_articles
    })
    save_results(results_df, "html_processor", "html_processor")
def main():
    """
    Scan the training claims and record which related-article HTML files
    the preprocessor cannot extract any text from.

    Filepaths of unprocessable articles are printed and appended to
    ../output/html_processor/html_unprocessable_files.txt.
    """
    html_preprocessor = get_html_preprocessor()
    error_files = []
    for idx, claim in train_data_generator(TRAIN_DATA_PATH + "train.json"):
        # Only process claims inside the configured index window.
        if idx < PROCESS_RANGE.start:
            continue
        elif idx >= PROCESS_RANGE.stop:
            break
        # Add the articles if we're not retrieving from search client
        for related_article in claim.related_articles:
            article_html = get_train_article(TRAIN_DATA_PATH, related_article.filepath)
            if not html_preprocessor.process(article_html).text:
                print(related_article.filepath)
                error_files.append(related_article.filepath)

    output_path = '../output/html_processor/html_unprocessable_files.txt'
    # Ensure the output directory exists; open() alone would fail otherwise.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'a') as f:
        # One path per line with a trailing newline, so entries from
        # successive runs (append mode) do not run together on one line.
        f.writelines(fp + '\n' for fp in error_files)
def preprocess_articles_from_data_pkl(pkl_path, save_path):
    """
    Preprocess related_articles from raw html pickle.

    Args:
        pkl_path: path to a pickled DataFrame whose "related_articles"
            column holds lists of raw HTML strings.
        save_path: where to pickle the DataFrame after the HTML column is
            replaced with the processed text.
    """
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()

    df = pd.read_pickle(pkl_path)
    processed_articles_col = []
    for idx, row in df.iterrows():
        # Log progress (and a worked example below) every 100 rows.
        print_ex = idx % 100 == 0
        if print_ex:
            print(idx)

        html_articles = row["related_articles"]
        processed_articles = []
        # Use a distinct inner loop variable: the original shadowed the
        # outer row index `idx`, which made the code easy to misread.
        for article_idx, article in enumerate(html_articles):
            html_processed = html_preprocessor.process(article)
            if not html_processed.text:
                row_id = row["id"]
                print(f"No text found in one article for claim ID {row_id}")
            sentences = text_util.tokenize_by_sentence(html_processed.text)
            text_preprocessed_sentences = text_preprocessor.process_sentences(
                sentences)
            processed_articles.append(' '.join(text_preprocessed_sentences))

            # Show one fully-processed example article per logged row.
            if print_ex and article_idx == 0:
                print("== HTML Preprocessed ==")
                print(html_processed.text)
                print("== Text Preprocessed ==")
                print(' '.join(text_preprocessed_sentences))
                print("\n")

        processed_articles_col.append(processed_articles)

    # Drop HTML articles
    df = df.drop(['related_articles'], axis=1)
    # Add processed articles
    df['related_articles'] = processed_articles_col
    df.to_pickle(save_path)
def main():
    """
    For each training claim: generate a search query, retrieve articles,
    preprocess them into sentences, score each sentence's relevance to the
    claim, and export everything (claim, query, ranked sentences) via
    save_results.
    """
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    # Create the appropriate relevance scorer
    relevance_scorer = get_infersent_relevance_scorer() if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER \
        else get_word2vec_relevance_scorer()

    # Outputs: parallel lists, one entry per processed claim
    ids = []
    original_claims = []
    processed_claims = []
    queries = []
    processed_sentences = []
    true_labels = []

    for idx, claim in enumerate(train_data_generator()):
        if idx == NUM_EXAMPLES:
            break
        print(idx)

        query = query_generator.get_query(claim)
        search_res = client.search(query)
        processed_claim = ' '.join(
            text_preprocessor.process(
                claim.claim).bert_sentences)  # Preprocessed claim

        ids.append(claim.id)
        original_claims.append(claim.claim)
        true_labels.append(claim.label)
        queries.append(query)
        processed_claims.append(processed_claim)
        # Continue in case of error; record the error string so the
        # parallel lists stay the same length for the DataFrame below.
        if search_res.error:
            processed_sentences.append(f"Error: {search_res.error}")
            continue

        # Create master list of sentences across all retrieved articles
        sentences = []
        for article in search_res.results:
            html_processed = html_preprocessor.process(article.content).text
            text_processed = text_preprocessor.process(html_processed)
            sentences += text_processed.bert_sentences

        # Run relevance scores with whichever scorer was constructed above
        if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER:
            relevances = get_infersent_relevances(claim.claim, sentences,
                                                  relevance_scorer)
        else:
            relevances = get_word2vec_relevances(claim.claim, sentences,
                                                 relevance_scorer)

        # Combine the two results
        processed_sentences_with_relevance = list(zip(relevances, sentences))
        # Construct final string, most relevant sentences first
        processed_sentences_with_relevance.sort(key=lambda item: item[0],
                                                reverse=True)
        process_result = ""
        for rel, sent in processed_sentences_with_relevance:
            if len(process_result) > 10000:
                # Some basic truncation to limit file size
                break
            process_result += f"|SEP| {rel}: {sent} \n"

        processed_sentences.append(process_result)

    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "label": true_labels,
            "original": original_claims,
            "query": queries,
            "processed_claim": processed_claims,
            "processed_sentences": processed_sentences
        })
    save_results(export_df, "sentence_relevance_scorer",
                 f"claim_to_{RELEVANCE_TYPE}_relevance")
예제 #5
0
def process_file(filepath):
    """Run the HTML preprocessor over one file and print the cleaned text."""
    html_preprocessor = get_html_preprocessor()
    with open(filepath, 'r', errors='ignore') as source:
        raw_html = source.read()
    cleaned = html_preprocessor.process(raw_html).text.strip()
    print(cleaned)
예제 #6
0
def test_document_relevance_scorer():
    """
    This runs the pipeline up to retrieving articles and ranking them by relevance.
    Determines, if there is shared articles between what is retrieved and what is given as related_articles,
    the ranking of the related article
    """
    num_examples = 1000  # Limit # of examples so this runs faster
    bqg = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_prepreprocessor = get_text_preprocessor()
    article_relevance_scorer = get_lsa_relevance_scorer()

    total_searched = 0
    average_rankings = []

    for idx, claim in train_data_generator(
            "/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
        if idx == num_examples:
            break
        print(idx)
        # Execute query
        q = bqg.get_query(claim)
        searched_articles = client.search(q).results

        # Process articles from raw HTML to parsed text
        pipeline_articles: List[PipelineArticle] = []
        for raw_article in searched_articles:
            if raw_article and raw_article.content:
                pipeline_article = PipelineArticle(raw_article)
                # Extract data from HTML
                html_process_result = html_preprocessor.process(
                    raw_article.content)
                pipeline_article.html_attributes = html_process_result.html_atts
                pipeline_article.raw_body_text = html_process_result.text
                pipeline_articles.append(pipeline_article)

        # Get Article Relevance: score against the first preprocessed claim
        # sentence, falling back to the raw claim + claimant text.
        preprocessed_claim_sentences = text_prepreprocessor.process(
            claim.claim + " " + claim.claimant).sentences
        preprocessed_claim = claim.claim + " " + claim.claimant
        if preprocessed_claim_sentences:
            preprocessed_claim = preprocessed_claim_sentences[0]
        pipeline_article_texts: List[str] = [
            p.raw_body_text for p in pipeline_articles
        ]
        article_relevances = article_relevance_scorer.analyze(
            preprocessed_claim, pipeline_article_texts)
        for article_relevance, pipeline_article in zip(article_relevances,
                                                       pipeline_articles):
            # Sometimes we get nan from numpy operations
            pipeline_article.relevance = article_relevance if math.isfinite(
                article_relevance) else 0

        # Based on article relevance, only consider the top relevances
        pipeline_articles.sort(key=lambda article: article.relevance,
                               reverse=True)

        sorted_urls = [article.url for article in pipeline_articles]
        claim_urls = [article.url for article in claim.related_articles]
        common_urls = list(set(sorted_urls).intersection(claim_urls))

        total_searched += 1
        if common_urls:
            # Average index of the shared urls within the sorted ranking
            index_sum = 0
            for url in common_urls:
                index_sum += sorted_urls.index(url)
            average_rankings.append(float(index_sum) / len(common_urls))

    print("RESULTS")
    print(total_searched)
    print(len(average_rankings))
    # BUG FIX: the unguarded division raised ZeroDivisionError whenever no
    # retrieved article overlapped with the given related_articles.
    if average_rankings:
        print(float(sum(average_rankings)) / len(average_rankings))
    else:
        print("No overlapping articles found")
예제 #7
0
def main():
    """
    For each claim, run query generator and rank the results by relevance using the LSA document relevance scorer
    """
    # Services
    query_generator = get_query_generator()
    search_client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    lsa_scorer = get_lsa_relevance_scorer()

    # Accumulated export columns
    claim_ids = []
    claim_texts = []
    ranked_article_strings = []

    for claim_idx, claim in train_data_generator(
            "/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
        if claim_idx == NUM_EXAMPLES:
            break
        print(claim_idx)

        generated_query = query_generator.get_query(claim)
        search_response = search_client.search(generated_query)

        claim_ids.append(claim.id)
        claim_texts.append(claim.claim)

        # Collect URL/text pairs, archiving each raw article as we go
        urls = []
        raw_texts = []
        for result_article in search_response.results:
            # Write the article for future checking
            safe_name = re.sub(
                r"([/:.])+", "_",
                result_article.url)  # Create a save-friendly filename
            save_path = f"output/{claim.id}/{safe_name}.html"
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            with open(save_path, 'a') as html_out:
                html_out.write(result_article.content)

            # Reduce the raw HTML to plain text
            urls.append(result_article.url)
            raw_texts.append(
                html_preprocessor.process(result_article.content).text)

        # Both claim and article_texts are unpreprocessed - LSA class currently does the preprocessing
        scores = lsa_scorer.analyze(claim.claim, raw_texts)
        print(scores)
        ranked = sorted(zip(urls, scores), key=lambda pair: pair[1],
                        reverse=True)

        # Render "( score: url )" segments, highest relevance first
        ranked_article_strings.append(
            ''.join(f"( {rel}: {url} )" for url, rel in ranked))

    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": claim_ids,
            "claim": claim_texts,
            "ranked_articles": ranked_article_strings
        })
    save_results(export_df, "document_relevance_scorer",
                 "claim_to_ranked_articles")