def sync_vs_async(query: str):
    client = get_search_client()
    now = datetime.now()
    async_results = client.search_async(query, num_results=30)
    print(f"{len(async_results)} async results in {datetime.now() - now}")
    now = datetime.now()
    sync_results = client.search(query, num_results=30)
    print(f"{len(sync_results)} sync results in {datetime.now() - now}")
def execute_query_export_html(query: str, save_html=False):
    client = get_search_client()
    results = client.search_async(query)
    time_str = get_timestamp()
    for i, r in enumerate(results):
        print(r.url)
        if save_html:
            # Write Result
            filepath = f"output/manual_client_search/{time_str}_{i}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'a') as f:
                f.write(r.content)
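get_timestamp() is a project helper that is not shown in these examples; a minimal stand-in (an assumption, not the project's actual implementation) only needs to produce a filesystem-safe string:

from datetime import datetime

def get_timestamp() -> str:
    # e.g. "20200131_154210" - safe to embed in file and directory names
    return datetime.now().strftime("%Y%m%d_%H%M%S")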
Example #3
def compare_query_results_with_train_data():
    """
    Compare the resulting URLs from the searched articles with the ground-truth articles given in the training data
    """
    num_examples = 1000  # Limit # of examples so this runs faster
    bqg = get_query_generator()
    client = get_search_client()

    ids = []
    claims = []
    queries = []
    training_urls = []
    client_urls = []
    shared_urls_for_claim = []

    # TODO: Finish this off
    for idx, claim in train_data_generator("/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
        if idx == num_examples:
            break
        print(idx)
        # Execute query
        q = bqg.get_query(claim)
        res = client.search(q)
        searched_urls = []
        # Get URLs from the result
        for r in res.results:
            searched_urls.append(r.url)
        # Get URLs from the training data
        training_article_urls = []
        for item in claim.related_articles:
            training_article_urls.append(item.url)
        # Get shared items
        shared_urls = list(set(training_article_urls).intersection(searched_urls))

        ids.append(claim.id)
        claims.append(claim.claim)
        queries.append(q)
        training_urls.append(training_article_urls)
        client_urls.append(searched_urls)
        shared_urls_for_claim.append(shared_urls)

    # Get stats
    num_claims_with_shared_articles = sum([1 for item in shared_urls_for_claim if item])
    frac_shared = float(num_claims_with_shared_articles) / len(ids)
    print(f"{num_claims_with_shared_articles} claims searched with shared articles out of {len(ids)}: {frac_shared}")

    num_shared_articles = sum([len(item) for item in shared_urls_for_claim])
    total_train_articles = sum([len(item) for item in training_urls])
    print(f"{num_shared_articles} shared articles found in {total_train_articles} total: {float(num_shared_articles) / total_train_articles}")
def execute_queries_export_urls():
    """
    Output the queries as well as the resulting URLs from the search client.
    Also writes the HTML files returned by each query to disk.
    """
    num_examples = 30  # Limit # of examples so this runs faster
    bqg = get_query_generator()
    client = get_search_client()
    timestamp = get_timestamp()

    ids = []
    original_claims = []
    data = []

    for idx, claim in enumerate(train_data_generator()):
        if idx == num_examples:
            break

        ids.append(claim.id)
        original_claims.append(claim.claim)
        q = bqg.get_query(claim)
        res = client.search(q)
        export_str = ""
        for i, r in enumerate(res.results):
            # Add to URLs
            export_str += f"{r.score}: {r.url} | "
            # Write Result
            filepath = f"output/basic_query_generator/query_html_{timestamp}/{claim.id}_{i}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'a') as f:
                f.write(r.content)
        data.append(export_str)

    export_df = pd.DataFrame(data={
        "id": ids,
        "original": original_claims,
        "results": data
    })
    save_results(export_df,
                 "basic_query_generator",
                 "query_to_url",
                 time_str=timestamp)
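save_results is another project helper that is not shown here; a plausible minimal version, assuming it simply writes the DataFrame to a timestamped CSV under output/ (an assumption, not the project's actual code):

import os
import pandas as pd

def save_results(df: pd.DataFrame, experiment: str, name: str, time_str: str = "") -> str:
    # Assumed layout: output/<experiment>/<name>_<time_str>.csv
    filepath = f"output/{experiment}/{name}_{time_str}.csv"
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    df.to_csv(filepath, index=False)
    return filepath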
def main():
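    """
    For each claim: generate a query, run the search, preprocess the claim and the returned HTML,
    score each article sentence for relevance against the claim, and export the results.
    """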
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    # Create the appropriate relevance scorer
    relevance_scorer = get_infersent_relevance_scorer() if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER \
        else get_word2vec_relevance_scorer()

    # Outputs
    ids = []
    original_claims = []
    processed_claims = []
    queries = []
    processed_sentences = []
    true_labels = []

    for idx, claim in enumerate(train_data_generator()):
        if idx == NUM_EXAMPLES:
            break
        print(idx)

        query = query_generator.get_query(claim)
        search_res = client.search(query)
        processed_claim = ' '.join(
            text_preprocessor.process(
                claim.claim).bert_sentences)  # Preprocessed claim

        ids.append(claim.id)
        original_claims.append(claim.claim)
        true_labels.append(claim.label)
        queries.append(query)
        processed_claims.append(processed_claim)
        # Continue in case of error
        if search_res.error:
            processed_sentences.append(f"Error: {search_res.error}")
            continue

        # Create master list of sentences
        sentences = []
        for article in search_res.results:
            html_processed = html_preprocessor.process(article.content).text
            text_processed = text_preprocessor.process(html_processed)
            sentences += text_processed.bert_sentences

        # Run relevance scores
        if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER:
            relevances = get_infersent_relevances(claim.claim, sentences,
                                                  relevance_scorer)
        else:
            relevances = get_word2vec_relevances(claim.claim, sentences,
                                                 relevance_scorer)

        # Combine the two results
        processed_sentences_with_relevance = list(zip(relevances, sentences))
        # Construct final string
        processed_sentences_with_relevance.sort(key=lambda item: item[0],
                                                reverse=True)
        process_result = ""
        for rel, sent in processed_sentences_with_relevance:
            if len(process_result) > 10000:
                # Some basic truncation to limit file size
                break
            process_result += f"|SEP| {rel}: {sent} \n"

        processed_sentences.append(process_result)

    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "label": true_labels,
            "original": original_claims,
            "query": queries,
            "processed_claim": processed_claims,
            "processed_sentences": processed_sentences
        })
    save_results(export_df, "sentence_relevance_scorer",
                 f"claim_to_{RELEVANCE_TYPE}_relevance")
Example #6
from experiments.util.experiment_util import get_search_client
import json
search_client = get_search_client()

root_folder_path = "/Users/frankjia/Desktop/LeadersPrize/train_phase2_val"
train_json_filepath = "train_val_phase2.json"
new_train_json_path = "train.json"
with open(f"{root_folder_path}/{train_json_filepath}", "r") as f:
    train_json = json.load(f)

new_train_json = []
article_key = 0
for item in train_json:
    print(item["id"])
    new_rel_articles = {}
    given_rel_articles = item["related_articles"]
    for _, article_url in given_rel_articles.items():
        results = search_client.search(article_url, num_results=30)
        url_matched_result = None
        for result in results:
            if result.url == article_url:
                url_matched_result = result
                break
        if url_matched_result:
            # Write the matched article to disk
            article_filepath = f"train_articles/{article_key}.html"
            with open(f"{root_folder_path}/{article_filepath}", "w") as f:
                f.write(url_matched_result.content)
            article_key += 1
            # Add to the new related articles mapping (filepath -> URL)
            new_rel_articles[article_filepath] = url_matched_result.url
    # Keep the claim with its related articles remapped to the locally saved copies
    item["related_articles"] = new_rel_articles
    new_train_json.append(item)

# Write the updated training JSON next to the saved articles
with open(f"{root_folder_path}/{new_train_json_path}", "w") as f:
    json.dump(new_train_json, f)
Example #7
def test_document_relevance_scorer():
    """
    Runs the pipeline up to retrieving articles and ranking them by relevance.
    If there are shared articles between what is retrieved and what is given as related_articles,
    determines the ranking of those shared articles.
    """
    num_examples = 1000  # Limit # of examples so this runs faster
    bqg = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    article_relevance_scorer = get_lsa_relevance_scorer()

    total_searched = 0
    average_rankings = []

    for idx, claim in train_data_generator(
            "/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
        if idx == num_examples:
            break
        print(idx)
        # Execute query
        q = bqg.get_query(claim)
        searched_articles = client.search(q).results

        # Process articles from raw HTML to parsed text
        pipeline_articles: List[PipelineArticle] = []
        for raw_article in searched_articles:
            if raw_article and raw_article.content:
                pipeline_article = PipelineArticle(raw_article)
                # Extract data from HTML
                html_process_result = html_preprocessor.process(
                    raw_article.content)
                pipeline_article.html_attributes = html_process_result.html_atts
                pipeline_article.raw_body_text = html_process_result.text
                pipeline_articles.append(pipeline_article)

        # Get Article Relevance
        preprocessed_claim_sentences = text_preprocessor.process(
            claim.claim + " " + claim.claimant).sentences
        preprocessed_claim = claim.claim + " " + claim.claimant
        if preprocessed_claim_sentences:
            preprocessed_claim = preprocessed_claim_sentences[0]
        pipeline_article_texts: List[str] = [
            p.raw_body_text for p in pipeline_articles
        ]
        article_relevances = article_relevance_scorer.analyze(
            preprocessed_claim, pipeline_article_texts)
        for article_relevance, pipeline_article in zip(article_relevances,
                                                       pipeline_articles):
            # Sometimes we get nan from numpy operations
            pipeline_article.relevance = article_relevance if math.isfinite(
                article_relevance) else 0

        # Based on article relevance, only consider the top relevances
        pipeline_articles.sort(key=lambda article: article.relevance,
                               reverse=True)

        sorted_urls = [article.url for article in pipeline_articles]
        claim_urls = [article.url for article in claim.related_articles]
        common_urls = list(set(sorted_urls).intersection(claim_urls))

        total_searched += 1
        if common_urls:
            # Determine index of shared url in the sorted urls
            index_sum = 0
            for url in common_urls:
                index_sum += sorted_urls.index(url)
            average_rankings.append(float(index_sum) / len(common_urls))

    print("RESULTS")
    print(total_searched)
    print(len(average_rankings))
    print(float(sum(average_rankings)) / len(average_rankings))
Example #8
def main():
    """
    For each claim, run query generator and rank the results by relevance using the LSA document relevance scorer
    """
    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    relevance_scorer = get_lsa_relevance_scorer()

    # Outputs
    ids = []
    original_claims = []
    ranked_articles_for_claims = []

    for idx, claim in train_data_generator(
            "/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
        if idx == NUM_EXAMPLES:
            break
        print(idx)

        query = query_generator.get_query(claim)
        search_res = client.search(query)

        ids.append(claim.id)
        original_claims.append(claim.claim)

        # Create master list of sentences
        article_texts = []
        article_urls = []
        for article in search_res.results:
            # Write the article for future checking
            url_filename = re.sub(
                r"([/:.])+", "_",
                article.url)  # Create a save-friendly filename
            filepath = f"output/{claim.id}/{url_filename}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'a') as f:
                f.write(article.content)

            # Process the articles
            html_processed_text = html_preprocessor.process(
                article.content).text
            article_urls.append(article.url)
            article_texts.append(html_processed_text)

        # Both claim and article_texts are unpreprocessed - LSA class currently does the preprocessing
        relevances = relevance_scorer.analyze(claim.claim, article_texts)
        print(relevances)
        articles_with_relevances = list(zip(article_urls, relevances))
        articles_with_relevances.sort(key=lambda x: x[1], reverse=True)

        # Create an export string with the URL and the relevance:
        article_rank_result = ""
        for url, rel in articles_with_relevances:
            article_rank_result += f"( {rel}: {url} )"
        ranked_articles_for_claims.append(article_rank_result)

    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "claim": original_claims,
            "ranked_articles": ranked_articles_for_claims
        })
    save_results(export_df, "document_relevance_scorer",
                 "claim_to_ranked_articles")