def generate_partial_subindex_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_wiki_pages = filter_documents(all_articles)
    if args.debug:
        filtered_wiki_pages = filtered_wiki_pages[:42]

    subindex = {}
    for _, raw_document in filtered_wiki_pages.iterrows():
        page_id = raw_document['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_document['text'])
        words_counter = Counter(filtered_tokens)
        # Invert word -> doc and add raw and relative term count
        for term, raw_count in words_counter.items():
            tf = raw_count if args.variant == 'raw_count' else raw_count / len(filtered_tokens)
            idf = words_with_idf.loc[term]['idf']
            tfidf = tf * idf
            subindex.setdefault(term, {'docs': []})['docs'].append((page_id, raw_count, tfidf))
    print('Finished processing batch #{}'.format(batch_id))
    return subindex

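# Sketch: merging the per-batch partial subindices into one inverted index.
# The multiprocessing.Pool usage and the num_batches argument are assumptions
# for illustration; this module only defines the per-batch worker above.
def build_full_subindex_sketch(num_batches: int) -> dict:
    from multiprocessing import Pool

    full_index = {}
    with Pool() as pool:
        for subindex in pool.imap_unordered(
                generate_partial_subindex_for_batch, range(num_batches)):
            # Posting lists are disjoint across batches, so they can simply
            # be concatenated term by term.
            for term, entry in subindex.items():
                full_index.setdefault(term, {'docs': []})['docs'].extend(entry['docs'])
    return full_index
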
def process_count_batch(batch_id: int) -> Counter:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    combined_tokens = []
    for raw_article in article_texts:
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        combined_tokens.extend(filtered_tokens)
    return get_word_counts(combined_tokens)

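# Sketch: reducing the per-batch Counters to corpus-wide term frequencies.
# Counter supports '+=', so the partial results combine naturally; the
# num_batches argument is a placeholder, not a name defined in this module.
def count_all_words_sketch(num_batches: int) -> Counter:
    total_counts = Counter()
    for batch_id in range(num_batches):
        total_counts += process_count_batch(batch_id)
    return total_counts
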
def generate_batch_mappings(batch_id: int):
    print('Processing batch #{}...'.format(batch_id))
    partial_result = {}
    batch_file_path = get_wiki_batch_path(batch_id)
    batch_df = read_jsonl_and_map_to_df(batch_file_path, ['id'])
    for line_index, row in batch_df.iterrows():
        # account for some special cases, like u'Beyonce\u0301' != 'Beyoncé'
        page_id = unicodedata.normalize('NFC', row['id'])
        partial_result[page_id] = (batch_id, line_index)
    return partial_result

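# Sketch: combining the per-batch mappings into the wiki_page_mapping lookup
# table that retrieve_wiki_page() below queries via .loc[page_id]. The pandas
# construction and the column names ('batch_id', 'line') are assumptions.
def build_wiki_page_mapping_sketch(num_batches: int):
    import pandas as pd

    combined = {}
    for batch_id in range(num_batches):
        combined.update(generate_batch_mappings(batch_id))
    # Index by page id so that .loc[page_id].values yields (batch_id, line)
    return pd.DataFrame.from_dict(combined, orient='index',
                                  columns=['batch_id', 'line'])
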
def generate_document_length_mapping_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_articles = filter_documents(all_articles)
    if args.debug:
        filtered_articles = filtered_articles.head(n=3)

    partial_document_length_mappings = {}
    for _, raw_doc_row in filtered_articles.iterrows():
        page_id = raw_doc_row['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_doc_row['text'])
        partial_document_length_mappings[page_id] = len(filtered_tokens)
    return partial_document_length_mappings

def retrieve_wiki_page(page_id: str) -> WikiDocument:
    page_id = page_id.strip()
    # account for some special cases, like u'Beyonce\u0301' != 'Beyoncé'
    page_id = unicodedata.normalize('NFC', page_id)

    # Find correct batch file and read only the relevant line
    batch_id, line = wiki_page_mapping.loc[page_id].values
    wiki_batch_path = get_wiki_batch_path(batch_id)
    with open(wiki_batch_path) as fp:
        for i, json_line in enumerate(fp):
            if i == line:
                return WikiDocument(json_line)

    # If this code runs, a mapping error occurred
    print(colored('Error: Line {} not found in batch {}'.format(line, batch_id), 'red'))

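# Usage sketch for retrieve_wiki_page(). The page id is an illustrative value,
# and printing a 'text' attribute assumes WikiDocument exposes one.
def retrieve_wiki_page_demo():
    page = retrieve_wiki_page('Beyoncé')
    if page is not None:
        print(page.text)
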
def count_documents_batch(batch_id: int) -> int:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    return len(filtered_articles)