Example #1

# Common imports assumed throughout these examples:
import os
import re
from datetime import datetime
from typing import Dict, List

import pandas as pd

# Project-specific helpers (the get_* factory functions, save_results,
# train_data_generator, LeadersPrizeClaim, PipelineSentence, etc.) are
# assumed to come from the project's own modules, which aren't shown here.
def process_all_html_files():
    """
    Get all files in ../html_files and preprocess, then dump results in a CSV
    """
    article_filenames = []
    processed_articles = []
    processor = get_html_preprocessor()

    html_dir = '../html_files'
    # Iterate through all files in the directory (assumed to contain only HTML)
    for f in os.listdir(html_dir):
        filepath = os.path.join(html_dir, f)
        with open(filepath, 'r', errors='ignore') as c:
            print(f)
            html_content = c.read()
            processed = processor.process(html_content).text.strip()
            if len(processed) == 0:
                print(f"{f} processing gave empty body")
            article_filenames.append(f)
            processed_articles.append(processed)

    results_df = pd.DataFrame(data={
        "file": article_filenames,
        "processed": processed_articles
    })
    save_results(results_df, "html_processor", "html_processor")
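
# save_results() is used by every example but isn't defined here. A minimal
# sketch, assuming it writes a timestamped CSV under ../output/<directory>/;
# the timestamp format is inferred from the CSV filename read in a later
# example. This is a guess, not the project's actual helper:
def save_results(df: pd.DataFrame, directory: str, name: str,
                 time_str: str = None):
    time_str = time_str or datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
    out_dir = f"../output/{directory}"
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(f"{out_dir}/{name}_{time_str}.csv", index=False)
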
def create_and_export_queries():
    """
    Output the generated queries only
    """
    num_examples = 100
    bqg = get_query_generator()
    truth_tup_extractor = get_truth_tuple_extractor()

    ids = []
    original_claims = []
    processed = []

    for idx, claim in enumerate(train_data_generator()):
        if idx == num_examples:
            break
        ids.append(claim.id)
        original_claims.append(claim.claim)
        claim_truth_tuples = truth_tup_extractor.extract(claim.claimant + " " +
                                                         claim.claim)
        processed.append(bqg.get_query(claim, truth_tuples=claim_truth_tuples))

    export_df = pd.DataFrame(data={
        "id": ids,
        "original": original_claims,
        "queries": processed
    })
    save_results(export_df, "basic_query_generator", "queries")

Example #3
def main():
    # Get text from the HTML preprocessor output
    text_df = pd.read_csv(
        "../output/html_processor/html_processor_01-23-2020_20-08-47.csv")
    preprocessor = get_text_preprocessor()
    texts = list(text_df["processed"])
    processed = __preprocess(preprocessor, texts)
    export_df = pd.DataFrame(data={"original": texts, "processed": processed})
    save_results(export_df, "text_preprocessor", "text_preprocessor_alphanum")
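
# __preprocess isn't shown above. A minimal sketch, assuming the text
# preprocessor exposes the same bert_sentences attribute the later examples
# use (hypothetical, not the project's helper):
def __preprocess(preprocessor, texts: List[str]) -> List[str]:
    processed = []
    for text in texts:
        # Empty CSV cells come back from pandas as NaN floats, not strings
        if not isinstance(text, str):
            processed.append("")
            continue
        processed.append(' '.join(preprocessor.process(text).bert_sentences))
    return processed
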
def execute_queries_export_urls():
    """
    Output the queries as well as the resulting URL's from the search client
    Will also create the HTML files returned from the query
    """
    num_examples = 30  # Limit # of examples so this runs faster
    bqg = get_query_generator()
    client = get_search_client()
    timestamp = get_timestamp()

    ids = []
    original_claims = []
    data = []

    for idx, claim in enumerate(train_data_generator()):
        if idx == num_examples:
            break

        ids.append(claim.id)
        original_claims.append(claim.claim)
        q = bqg.get_query(claim)
        res = client.search(q)
        export_str = ""
        for i, r in enumerate(res.results):
            # Add to URLs
            export_str += f"{r.score}: {r.url} | "
            # Write Result
            filepath = f"output/basic_query_generator/query_html_{timestamp}/{claim.id}_{i}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'a') as f:
                f.write(r.content)
        data.append(export_str)

    export_df = pd.DataFrame(data={
        "id": ids,
        "original": original_claims,
        "results": data
    })
    save_results(export_df,
                 "basic_query_generator",
                 "query_to_url",
                 time_str=timestamp)
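
# The next example builds PipelineSentence objects. A minimal sketch of the
# shape relied on here (a sentence string plus a mutable relevance score);
# the project's real class may carry more fields:
class PipelineSentence:
    def __init__(self, sentence: str):
        self.sentence = sentence
        self.relevance: float = 0.0
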
def test_extraction_from_preprocessed_train_data(processed_pkl_path: str):
    # Get relevance scorer
    relevance_scorer = get_word2vec_relevance_scorer()

    # Get extractor
    sentence_extractor = get_relevant_info_extractor()

    # Read the train data
    preprocessed_df = get_preprocessed_train_data(processed_pkl_path)

    claims = []
    extracted_info = []

    for idx, row in preprocessed_df.iterrows():
        claim: str = row["claim"]
        related_articles: List[str] = row["related_articles"]
        pipeline_sentences: List[PipelineSentence] = []

        # Run relevance scorer - score sentences from all the articles together
        for article in related_articles:
            article_sentences = tokenize_by_sentence(article)
            for sent in article_sentences:
                relevance = relevance_scorer.get_relevance(claim, sent)
                pipeline_sent = PipelineSentence(sent)
                pipeline_sent.relevance = relevance
                pipeline_sentences.append(pipeline_sent)

        # Extract the most relevant sentences, with a context window of 1
        extracted_sentences = sentence_extractor.extract(pipeline_sentences,
                                                         window=1)

        claims.append(claim)
        extracted_info.append(' . '.join(
            [x.sentence for x in extracted_sentences]))

    # Export results
    export_df = pd.DataFrame(data={
        "claim": claims,
        "extracted": extracted_info
    })
    save_results(export_df, "relevant_information_extractor",
                 "w2v_windowsize1")
def main():
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    # Create the appropriate relevance scorer
    relevance_scorer = get_infersent_relevance_scorer() if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER \
        else get_word2vec_relevance_scorer()

    # Outputs
    ids = []
    original_claims = []
    processed_claims = []
    queries = []
    processed_sentences = []
    true_labels = []

    for idx, claim in enumerate(train_data_generator()):
        if idx == NUM_EXAMPLES:
            break
        print(idx)

        query = query_generator.get_query(claim)
        search_res = client.search(query)
        # Preprocess the claim into BERT-ready sentences
        processed_claim = ' '.join(
            text_preprocessor.process(claim.claim).bert_sentences)

        ids.append(claim.id)
        original_claims.append(claim.claim)
        true_labels.append(claim.label)
        queries.append(query)
        processed_claims.append(processed_claim)
        # Continue in case of error
        if search_res.error:
            processed_sentences.append(f"Error: {search_res.error}")
            continue

        # Create master list of sentences
        sentences = []
        for article in search_res.results:
            html_processed = html_preprocessor.process(article.content).text
            text_processed = text_preprocessor.process(html_processed)
            sentences += text_processed.bert_sentences

        # Run relevance scores
        if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER:
            relevances = get_infersent_relevances(claim.claim, sentences,
                                                  relevance_scorer)
        else:
            relevances = get_word2vec_relevances(claim.claim, sentences,
                                                 relevance_scorer)

        # Combine the two results
        processed_sentences_with_relevance = list(zip(relevances, sentences))
        # Construct final string
        processed_sentences_with_relevance.sort(key=lambda item: item[0],
                                                reverse=True)
        process_result = ""
        for rel, sent in processed_sentences_with_relevance:
            if len(process_result) > 10000:
                # Some basic truncation to limit file size
                break
            process_result += f"|SEP| {rel}: {sent} \n"

        processed_sentences.append(process_result)

    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "label": true_labels,
            "original": original_claims,
            "query": queries,
            "processed_claim": processed_claims,
            "processed_sentences": processed_sentences
        })
    save_results(export_df, "sentence_relevance_scorer",
                 f"claim_to_{RELEVANCE_TYPE}_relevance")

Example #7
def main():
    """
    For each claim, run query generator and rank the results by relevance using the LSA document relevance scorer
    """
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    relevance_scorer = get_lsa_relevance_scorer()

    # Outputs
    ids = []
    original_claims = []
    ranked_articles_for_claims = []

    for idx, claim in enumerate(train_data_generator(
            "/Users/frankjia/Desktop/LeadersPrize/train/train.json")):
        if idx == NUM_EXAMPLES:
            break
        print(idx)

        query = query_generator.get_query(claim)
        search_res = client.search(query)

        ids.append(claim.id)
        original_claims.append(claim.claim)

        # Create master list of sentences
        article_texts = []
        article_urls = []
        for article in search_res.results:
            # Write the article to disk for future inspection
            # Create a save-friendly filename from the URL
            url_filename = re.sub(r"[/:.]+", "_", article.url)
            filepath = f"output/{claim.id}/{url_filename}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            # Use 'w' so repeated runs overwrite instead of appending duplicates
            with open(filepath, 'w') as f:
                f.write(article.content)

            # Process the articles
            html_processed_text = html_preprocessor.process(
                article.content).text
            article_urls.append(article.url)
            article_texts.append(html_processed_text)

        # Both claim and article_texts are unpreprocessed - LSA class currently does the preprocessing
        relevances = relevance_scorer.analyze(claim.claim, article_texts)
        print(relevances)
        articles_with_relevances = list(zip(article_urls, relevances))
        articles_with_relevances.sort(key=lambda x: x[1], reverse=True)

        # Create an export string with the URL and the relevance:
        article_rank_result = ""
        for url, rel in articles_with_relevances:
            article_rank_result += f"( {rel}: {url} )"
        ranked_articles_for_claims.append(article_rank_result)

    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "claim": original_claims,
            "ranked_articles": ranked_articles_for_claims
        })
    save_results(export_df, "document_relevance_scorer",
                 "claim_to_ranked_articles")

Example #8
def test_pipeline(process_range: range, config: Dict, train_data_path: str):
    raw_claims: List[LeadersPrizeClaim] = []
    raw_claim_dicts: List[dict] = []
    init_articles = not config.get(PipelineConfigKeys.RETRIEVE_ARTICLES, True)
    if init_articles:
        print(
            "Reading articles from training data. Will not call search client")
    for idx, claim_dict, claim in train_data_generator(
            train_data_path + "trial_combined_data_long.json"):
        if idx < process_range.start:
            continue
        elif idx >= process_range.stop:
            break
        # Add the articles if we're not retrieving from search client
        if init_articles:
            articles: List[SearchQueryResult] = []
            for related_article in claim.related_articles:
                article_html = get_train_article(train_data_path,
                                                 related_article.filepath)
                articles.append(
                    SearchQueryResult(content=article_html,
                                      url=related_article.url))
            claim.mock_search_results = articles
        raw_claims.append(claim)
        raw_claim_dicts.append(claim_dict)

    start_time = datetime.now()

    # Create pipeline
    pipeline = LeadersPrizePipeline(config)

    # Run the prediction
    results = pipeline.predict(raw_claims)

    print(f"{len(results)} processed in {datetime.now() - start_time}")

    # Export results
    claims = []
    labels = []
    reasoner_inputs = []
    pred_labels = []
    supporting_article_urls = []
    explanations = []
    for res in results:
        claims.append(res.preprocessed_claim)
        labels.append(res.original_claim.label)
        reasoner_input = ""
        for idx, sent in enumerate(res.sentences_for_transformer):
            if idx == 10:
                break
            reasoner_input += " " + sent.preprocessed_text
        reasoner_inputs.append(reasoner_input)
        pred_labels.append(res.submission_label)
        supporting_article_urls.append(", ".join(
            res.submission_article_urls.values()))
        explanations.append(res.submission_explanation)
    results_df = pd.DataFrame(
        data={
            "claim": claims,
            "label": labels,
            "reasoner_input": reasoner_inputs,
            "predicted": pred_labels,
            "article_urls": supporting_article_urls,
            "explanation": explanations
        })

    # Get accuracies
    eval_predictions(labels, pred_labels)
    # Get datacup score
    eval_datacup(raw_claim_dicts, results)

    save_results(results_df, "pipeline_test", "full_pipeline")
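
# A hypothetical invocation of the pipeline test above (the range, config
# key, and path are assumptions based on how test_pipeline reads them; note
# train_data_path must end with a separator, since it is concatenated
# directly with the JSON filename):
if __name__ == "__main__":
    test_pipeline(process_range=range(0, 50),
                  config={PipelineConfigKeys.RETRIEVE_ARTICLES: False},
                  train_data_path="../train/")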