def generate_partial_subindex_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_wiki_pages = filter_documents(all_articles)
    if args.debug:
        filtered_wiki_pages = filtered_wiki_pages[:42]

    subindex = {}
    for _, raw_document in filtered_wiki_pages.iterrows():
        page_id = raw_document['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_document['text'])
        words_counter = Counter(filtered_tokens)

        # Invert word -> doc and add raw and relative term count
        for term, raw_count in words_counter.items():
            tf = raw_count if args.variant == 'raw_count' else raw_count / len(filtered_tokens)
            idf = words_with_idf.loc[term]['idf']
            tfidf = tf * idf
            subindex.setdefault(term, {'docs': []})['docs'].append((page_id, raw_count, tfidf))

    print('Finished processing batch #{}'.format(batch_id))
    return subindex
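# Illustrative sketch (a hypothetical helper, not part of the original pipeline):
# the partial subindexes returned per batch can be combined into one inverted
# index by concatenating the per-term postings lists, assuming each partial
# subindex has the shape {term: {'docs': [(page_id, raw_count, tfidf), ...]}}
# produced by generate_partial_subindex_for_batch above.
def merge_partial_subindexes(partial_subindexes: list) -> dict:
    merged = {}
    for subindex in partial_subindexes:
        for term, entry in subindex.items():
            merged.setdefault(term, {'docs': []})['docs'].extend(entry['docs'])
    return merged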
def process_count_batch(batch_id: int) -> Counter:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    combined_tokens = []
    for raw_article in article_texts:
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        combined_tokens.extend(filtered_tokens)

    return get_word_counts(combined_tokens)
def generate_batch_mappings(batch_id: int):
    print('Processing batch #{}...'.format(batch_id))
    partial_result = {}
    batch_file_path = get_wiki_batch_path(batch_id)
    batch_df = read_jsonl_and_map_to_df(batch_file_path, ['id'])

    for line_index, row in batch_df.iterrows():
        page_id = row[0]
        # account for some special cases, like u'Beyonce\u0301' != 'Beyoncé'
        page_id = unicodedata.normalize('NFC', page_id)
        partial_result[page_id] = (batch_id, line_index)

    return partial_result
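# Illustrative sketch (hypothetical helper, not from the original code): the
# partial mappings produced per batch can be combined into a single lookup
# table of page_id -> (batch_id, line_index), which locates the batch file and
# line on which a given article is stored.
def merge_batch_mappings(partial_mappings: list) -> dict:
    combined = {}
    for partial in partial_mappings:
        combined.update(partial)
    return combined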
def generate_document_length_mapping_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_articles = filter_documents(all_articles)
    if args.debug:
        filtered_articles = filtered_articles.head(n=3)

    partial_document_length_mappings = {}
    for _, raw_doc_row in filtered_articles.iterrows():
        page_id = raw_doc_row['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_doc_row['text'])
        partial_document_length_mappings[page_id] = len(filtered_tokens)

    return partial_document_length_mappings
def process_generate_df_batch(id: int) -> Counter:
    colour = TERM_COLOURS[id % len(TERM_COLOURS)]
    print(colored('Start processing batch #{}'.format(id), colour, attrs=['bold']))
    start_time = time.time()

    batch_file_path = '{}wiki-{:03}.jsonl'.format(DATA_WIKI_PATH, id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    accumulated_batch_dfs = Counter()
    for index, raw_article in enumerate(article_texts):
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        # use set to prevent multiple occurrences of word in doc
        words_set = set(filtered_tokens)

        if index % 5000 == 0:
            print(colored('Processing document [{} / {}] of batch #{}...'.format(
                index, len(article_texts), id), colour))

        # count for included words will be one, so this accumulates document frequencies
        words_in_doc = Counter(words_set)
        accumulated_batch_dfs += words_in_doc

    print(colored('Finished processing batch #{} after {:.2f} seconds'.format(
        id, time.time() - start_time), colour, attrs=['bold']))
    return accumulated_batch_dfs
        word = word_count[0]
        df = word_count[1]
        idf = math.log10(COLLECTION_DOCUMENTS_NUMBER / df)
        result.append((word, idf))
    return result


def export_result(result: list):
    write_list_to_jsonl(GENERATED_IDF_PATH, result)


if __name__ == '__main__':
    start_time = time.time()

    words_with_df = generate_df_all()
    print(colored('Counted frequencies of {:,} words'.format(len(words_with_df)), attrs=['bold']))

    words_with_idf = get_words_with_idf(words_with_df)
    print('Added inverse document frequencies')
    print('Top 10 extract: {}'.format(words_with_idf[0:10]))

    print('Finished processing after {:.2f} seconds'.format(time.time() - start_time))
    export_result(words_with_idf)

    # Vocabulary size should be equal to the word frequency count from task #1
    vocabulary = read_jsonl_and_map_to_df(GENERATED_COUNTS_PATH)[0]
    assert len(vocabulary) == len(words_with_idf)
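# Worked example of the IDF formula above (numbers are illustrative only, not
# the real collection statistics): with 5,000,000 documents in the collection,
# a word that appears in 50,000 of them gets
# idf = log10(5_000_000 / 50_000) = log10(100) = 2.0.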
def count_documents_batch(batch_id: int) -> int:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    return len(filtered_articles)
import math

import matplotlib.pyplot as plt
from scipy import stats

from dataaccess.files_constants import GENERATED_COUNTS_PATH
from dataaccess.files_io import read_jsonl_and_map_to_df
from util.plots import show_plot_and_save_figure, prepare_seaborn_plots

if __name__ == '__main__':
    counts = read_jsonl_and_map_to_df(GENERATED_COUNTS_PATH)
    x_ranks = range(1, len(counts[1]) + 1)
    y_counts = [count for count in counts[1]]

    x_ranks_log = [math.log10(rank) for rank in x_ranks]
    y_counts_log = [math.log10(count) for count in y_counts]

    slope, intercept, r_value, p_value, std_err = stats.linregress(x_ranks_log, y_counts_log)
    r_squared = r_value ** 2
    print('slope: {}; intercept: {}; r-squared: {}, p: {}'.format(
        slope, intercept, r_squared, p_value))

    prepare_seaborn_plots()
    plt.plot(x_ranks_log, y_counts_log, 'o')
    plt.plot(x_ranks_log, [intercept + slope * rank for rank in x_ranks_log], 'red')
    plt.xlabel('log(rank)')
    plt.ylabel('log(frequency)')
    plt.figtext(0.2, 0.45, 'R$^2$ = {:.5f}'.format(r_squared))
import pandas as pd

from dataaccess.files_constants import DATA_TRAINING_PATH, DATA_DEV_LABELED_PATH, DATA_TEST_UNLABELED_PATH
from dataaccess.files_io import read_jsonl_and_map_to_df
from documentretrieval.data_constants import CLAIMS_COLUMNS_LABELED, CLAIMS_COLUMNS_UNLABELED

claims_training = read_jsonl_and_map_to_df(
    DATA_TRAINING_PATH, CLAIMS_COLUMNS_LABELED).set_index('id', drop=False)
claims_dev = read_jsonl_and_map_to_df(
    DATA_DEV_LABELED_PATH, CLAIMS_COLUMNS_LABELED).set_index('id', drop=False)
claims_test = read_jsonl_and_map_to_df(
    DATA_TEST_UNLABELED_PATH, CLAIMS_COLUMNS_UNLABELED).set_index('id', drop=False)


def get_claim(id: int, dataset: str = 'train') -> str:
    return get_corresponding_dataset(dataset).loc[id]['claim']


def get_claim_row(id: int, dataset: str = 'train') -> pd.Series:
    return get_corresponding_dataset(dataset).loc[id]


def get_all_claims(dataset: str = 'train') -> pd.DataFrame:
    return get_corresponding_dataset(dataset)


def claim_is_verifiable(claim_id: int, dataset: str = 'train') -> bool:
    return get_corresponding_dataset(dataset).loc[claim_id]['verifiable'] == 'VERIFIABLE'
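# Hypothetical usage sketch (the claim id is assumed to exist in the training
# split and is used for illustration only); it relies solely on the accessors
# defined above.
if __name__ == '__main__':
    example_id = 75397  # assumed training claim id
    print(get_claim(example_id))            # prints the claim text
    print(claim_is_verifiable(example_id))  # True if labelled 'VERIFIABLE'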
from dataaccess.files_constants import GENERATED_DOCUMENT_LENGTH_MAPPING
from dataaccess.files_io import read_jsonl_and_map_to_df

doc_length_mapping = read_jsonl_and_map_to_df(
    GENERATED_DOCUMENT_LENGTH_MAPPING, ['page_id', 'length']).set_index('page_id', drop=False)


def get_length_of_doc(page_id: str) -> int:
    return doc_length_mapping.loc[page_id]['length']
from dataaccess.files_constants import GENERATED_DOCUMENT_NORMS_MAPPING
from dataaccess.files_io import read_jsonl_and_map_to_df

docs_norms = read_jsonl_and_map_to_df(
    GENERATED_DOCUMENT_NORMS_MAPPING, ['doc', 'norm']).set_index('doc', drop=False)


def get_norm_for_doc_text(page_id: str) -> float:
    return docs_norms.loc[page_id]['norm']
import pandas as pd

from dataaccess.files_constants import GENERATED_COUNTS_PATH
from dataaccess.files_io import read_jsonl_and_map_to_df
from documentretrieval.data_constants import COLLECTION_TOTAL_WORDS

terms_with_occurrences = read_jsonl_and_map_to_df(
    GENERATED_COUNTS_PATH, ['term', 'occurrences']).set_index('term', drop=False)


def get_collection_occurrences_of_term(term: str) -> int:
    return terms_with_occurrences.loc[term]['occurrences']


def get_collection_probability_for_term(term: str) -> float:
    occurrences = get_collection_occurrences_of_term(term)
    return occurrences / COLLECTION_TOTAL_WORDS


def get_terms_with_occurrences_mapping() -> pd.DataFrame:
    return terms_with_occurrences
from dataaccess.files_constants import GENERATED_IDF_PATH
from dataaccess.files_io import read_jsonl_and_map_to_df

words_with_idf = read_jsonl_and_map_to_df(
    GENERATED_IDF_PATH, ['word', 'idf']).set_index('word', drop=False)


def get_idf_for_term(term: str) -> float:
    try:
        return words_with_idf.loc[term]['idf']
    except KeyError:
        # this can happen for tokens from doc titles, as the IDF values are only generated for doc text
        return 0
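# Minimal sketch of how this lookup combines with a term frequency to give a
# TF-IDF weight, mirroring the computation in the indexing step above; the
# function name and arguments are hypothetical and chosen for illustration.
def example_tfidf(term: str, raw_count: int, doc_length: int) -> float:
    tf = raw_count / doc_length  # relative term frequency variant
    return tf * get_idf_for_term(term)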