def _extract_tokens(row: List[str], index: Index, question_text: str, doc_iid: int) -> Tuple[List[int], List[int]]:
    """Append the JSON-encoded query and document token ids to row and return both token lists."""
    doc_tokens = list(index.get_document_by_int_id(doc_iid))
    query_tokens = [index.token2id.get(token, 0) for token in index.tokenize(question_text)]
    row.extend([json.dumps(query_tokens), json.dumps(doc_tokens)])
    return doc_tokens, query_tokens
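# A minimal sketch of what _extract_tokens appends to a row, using a stand-in
# index object. StubIndex, its vocabulary, and the document contents are
# illustrative only and not part of the repository.
import json

class StubIndex:
    token2id = {'paris': 7, 'france': 12}

    def tokenize(self, text):
        return text.lower().replace('?', '').split()

    def get_document_by_int_id(self, iid):
        return (7, 12)  # token ids of a toy two-token document

row = ['question-id', 'doc-id']
_extract_tokens(row, StubIndex(), 'Paris France?', 0)
print(row)  # ['question-id', 'doc-id', '[7, 12]', '[7, 12]']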
def build():
    """Build the int-id <-> wiki-id maps and persist the index vocabulary and statistics to disk."""
    global INDEX
    INDEX = Index(env='default')
    batches = parallel.chunk(CHUNK_SIZE, INDEX.document_int_ids())
    helpers.log(f'Building maps for {INDEX.count()} documents.')

    int2wid = {}
    wid2int = {}
    for batch_maps in parallel.execute(_process_batch, batches):
        batch_int2wid, batch_wid2int = batch_maps
        int2wid.update(batch_int2wid)
        wid2int.update(batch_wid2int)

    with open(TOKEN2ID, 'wb') as file:
        pickle.dump(INDEX.token2id, file)
    with open(ID2TOKEN, 'wb') as file:
        pickle.dump(INDEX.id2token, file)
    with open(ID2DF, 'wb') as file:
        pickle.dump(INDEX.id2df, file)
    with open(ID2TF, 'wb') as file:
        pickle.dump(INDEX.id2tf, file)
    with open(INT2WID, 'wb') as file:
        pickle.dump(int2wid, file)
    with open(WID2INT, 'wb') as file:
        pickle.dump(wid2int, file)

    helpers.log(f'Finished building maps. Mapped {len(int2wid)}/{INDEX.index.document_count()}')
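# Consumers can restore the persisted maps with pickle. A minimal sketch,
# assuming the same TOKEN2ID / INT2WID path constants used above.
import pickle

with open(INT2WID, 'rb') as file:
    int2wid = pickle.load(file)
with open(TOKEN2ID, 'rb') as file:
    token2id = pickle.load(file)

# translate an internal document int id to its Wikipedia id
some_int_id = next(iter(int2wid))
print(some_int_id, '->', int2wid[some_int_id])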
def accuracies(database: str, table: str):
    """Compute accuracy over a dataset given a retrieval database.

    Statistics are split over question types and levels.
    """
    dataset = Dataset.from_file(TRAIN_HOTPOT_SET)
    global INDEX
    INDEX = Index()

    # shape: (question type [comparison, bridge], level [hard, medium, easy],
    #         number of found gold articles [0, 1, 2])
    hits = np.zeros((2, 3, 3))
    with sqlite3.connect(database) as conn:
        for question in tqdm(dataset, unit='questions'):
            target = _extract_target(question)
            prediction = _fetch_prediction(table, conn, question)
            if prediction is None:
                continue
            (prediction,) = prediction
            prediction = pickle.loads(prediction)
            _update_hits(prediction, target, question, hits)

    full, half, full_hard, full_medium, full_easy, full_comparison, full_bridge = _accuracies(hits)
    logging.info(f'[{datetime.now()}]\t[Full Accuracy: {round(full, 4)}]')
    logging.info(f'[{datetime.now()}]\t[Half Accuracy: {round(half, 4)}]')
    logging.info(f'[{"-" * 10}]')
    logging.info(f'[{datetime.now()}]\t[Easy Question Full Accuracy: {round(full_easy, 4)}]')
    logging.info(f'[{datetime.now()}]\t[Medium Question Full Accuracy: {round(full_medium, 4)}]')
    logging.info(f'[{datetime.now()}]\t[Hard Question Full Accuracy: {round(full_hard, 4)}]')
    logging.info(f'[{"-" * 10}]')
    logging.info(f'[{datetime.now()}]\t[Comparison Question Full Accuracy: {round(full_comparison, 4)}]')
    logging.info(f'[{datetime.now()}]\t[Bridge Question Full Accuracy: {round(full_bridge, 4)}]')

    return {
        'full': full,
        'half': half,
        'full_hard': full_hard,
        'full_medium': full_medium,
        'full_easy': full_easy,
        'full_comparison': full_comparison,
        'full_bridge': full_bridge
    }
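# The reductions inside _accuracies are not shown here. One plausible
# implementation, consistent with the hits layout documented above
# (question type x level x number of gold articles found), is sketched below;
# the definition of "half" as "at least one gold article found" is an
# assumption.
import numpy as np

def _accuracies_sketch(hits: np.ndarray):
    total = hits.sum()
    full = hits[:, :, 2].sum() / total                     # both gold articles found
    half = hits[:, :, 1:].sum() / total                    # at least one gold article found
    full_hard = hits[:, 0, 2].sum() / hits[:, 0].sum()     # per-level full accuracy
    full_medium = hits[:, 1, 2].sum() / hits[:, 1].sum()
    full_easy = hits[:, 2, 2].sum() / hits[:, 2].sum()
    full_comparison = hits[0, :, 2].sum() / hits[0].sum()  # per-type full accuracy
    full_bridge = hits[1, :, 2].sum() / hits[1].sum()
    return full, half, full_hard, full_medium, full_easy, full_comparison, full_bridge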
def _bigram_unigram_5000(query: str, index: Index, n: int = 5000) -> List[Tuple[int, float]]:
    """Retrieve at most n candidates from the full set of articles based on
    query-document bigram/unigram matches, using the pre-built inverted index.

    Assumed to be equivalent to Algorithm 2, Appendix C of the HotpotQA paper.
    Possible mismatches:
      -- unigram/bigram counts are taken over at most the first 500 characters
         of each article; it is unclear whether the paper uses the full article.
      -- the implementation does not follow the algorithm verbatim, since a
         literal translation seemed very inefficient; some edge cases may
         therefore yield different results.

    :param query: A string of words to match.
    :param index: The pre-built inverted index.
    :param n: The candidate-count threshold.
    :return: A list of at most n candidates.
    """
    # tokenize, stem, filter stopwords, and collect unigrams and bigrams
    tokenized_query = index.tokenize(query)
    query_unigrams = set(tokenized_query)
    query_bigrams = set(nltk.bigrams(tokenized_query))

    # count the overlapping n-grams for each query-document pair
    overlap_set = Counter()
    for bigram in query_bigrams:
        for (doc_id, _) in index.bigram_query(bigram[0], bigram[1], request=10000):
            overlap_set[doc_id] += 1
    for unigram in query_unigrams:
        for (doc_id, _) in index.unigram_query(unigram, request=10000):
            overlap_set[doc_id] += 1

    # Take the best n + 1 documents and drop every one whose count equals the
    # smallest count in the list, so that ties at the cut-off are excluded.
    most_common = overlap_set.most_common(n + 1)
    if not most_common:
        return []
    return [candidate for candidate in most_common if candidate[1] > most_common[-1][1]]
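# Toy illustration of the cut-off rule above: documents tied with the lowest
# count among the top n + 1 are discarded, so only strictly better matches
# survive. The document ids and counts are fabricated for the example.
from collections import Counter

toy_overlap = Counter({'doc_a': 5, 'doc_b': 3, 'doc_c': 2, 'doc_d': 2, 'doc_e': 2})
n = 3
most_common = toy_overlap.most_common(n + 1)  # [('doc_a', 5), ('doc_b', 3), ('doc_c', 2), ('doc_d', 2)]
kept = [c for c in most_common if c[1] > most_common[-1][1]]
print(kept)  # [('doc_a', 5), ('doc_b', 3)]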
def process():
    """Filter the collection of 5 million documents down to at most the top
    5000 per question, according to bigram/unigram filtering. Processed in
    parallel.
    """
    global_start = datetime.now()
    global INDEX
    INDEX = Index(env='tfidf')

    os.makedirs(DIR_NAME)
    (batches, no_batches, no_queries), total_retrieved = retrieve.load_dataset_batches(), 0
    retrieve.create_retrieval_db(DB_NAME)

    helpers.log(f'Retrieving documents. Workers: {os.cpu_count()}')
    start = datetime.now()
    for batch_retrieval in parallel.execute(_process_question_batch, batches):
        total_retrieved += batch_retrieval
    end = datetime.now()
    helpers.log(f'Finished retrieval in {end - start}. Filtered {total_retrieved}/{no_queries}')

    global_end = datetime.now()
    helpers.log(f'Finished process in {global_end - global_start}.')
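# The parallel helpers used throughout are not shown. Assuming chunk slices an
# iterable into fixed-size batches and execute maps a worker over those
# batches with a process pool, they could look roughly like this sketch.
import multiprocessing
from itertools import islice

def chunk(size, iterable):
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch

def execute(worker, batches):
    with multiprocessing.Pool() as pool:
        yield from pool.imap_unordered(worker, batches)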
def build(skip_relevant: bool = True):
    global INDEX, COLUMNS, QUESTION_COUNTS
    global SKIP_RELEVANT
    SKIP_RELEVANT = skip_relevant
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    os.makedirs(ct.CANDIDATES_DIR, exist_ok=True)
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
    train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
    dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)

    iterator: List[Tuple[list, str, str, int]] = [
        # (train_question_set, 'train', ct.TRAIN_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        # (dev_question_set, 'dev', ct.DEV_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        (test_question_set, 'test', ct.TEST_CANDIDATES_DB, ct.CANDIDATES_CHUNK)
    ]
    for (_set, split, candidate_db_path, chunk) in iterator:
        start = datetime.now()

        db = sqlite3.connect(candidate_db_path)
        cursor = db.cursor()
        cursor.execute(sql.create_candidate_table)
        db.commit()
        helpers.log('Created candidates table.')
        QUESTION_COUNTS = cursor.execute(sql.count_question_rows).fetchall()
        QUESTION_COUNTS = {json.loads(_id): _count for (_id, _count) in QUESTION_COUNTS}
        helpers.log(f'Retrieved question counts for {len(QUESTION_COUNTS)} questions.')
        cursor.close()
        db.close()

        helpers.log(f'Creating {split} candidate set with {len(_set)} questions.')
        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([split] * len(_set), _set))
        for batch_count in parallel.execute(_build_candidates, _set_generator):
            total_count += batch_count
        helpers.log(f'Created {split} candidate set with {total_count} questions in {datetime.now() - start}')
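# Illustration of the QUESTION_COUNTS construction above with a toy in-memory
# table; the schema and the count statement stand in for the repository's sql
# module and are assumptions.
import json
import sqlite3

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE candidates (question_id TEXT)')
db.executemany('INSERT INTO candidates VALUES (?)', [('"q1"',), ('"q1"',), ('"q2"',)])
rows = db.execute('SELECT question_id, COUNT(*) FROM candidates GROUP BY question_id').fetchall()
counts = {json.loads(_id): _count for (_id, _count) in rows}
print(counts)  # {'q1': 2, 'q2': 1} -- ids are stored JSON-encoded, hence json.loads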
def build():
    assert constants.TRAIN_FEATURES_CHUNK > 1
    assert constants.DEV_FEATURES_CHUNK > 1

    global INDEX
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    global EXTRACTORS
    EXTRACTORS = []
    if 'entity' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(EntityExtractor(INDEX))
    if 'ibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=False))
    if 'nibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=True))
    if 'bigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=False))
    if 'nbigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=True))
    if 'qword' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(QuestionWordFeatureExtractor())
    if 'doclen' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(DocumentLengthFeatureExtractor())
    helpers.log('Loaded extractors.')

    global COLUMNS
    COLUMNS = copy.copy(constants.FEATURE_BASE_COLUMN_NAMES)
    COLUMNS.extend(feature for extractor in EXTRACTORS for feature in extractor.feature_name)
    COLUMNS.append(constants.FEATURE_TARGET_COLUMN_NAME)
    helpers.log('Loaded column names.')

    os.makedirs(constants.FEATURES_DIR, exist_ok=True)
    iterator: List[Tuple[str, str, int]] = [
        # (constants.TRAIN_CANDIDATES_DB, constants.TRAIN_FEATURES_DB, constants.TRAIN_FEATURES_CHUNK),
        # (constants.DEV_CANDIDATES_DB, constants.DEV_FEATURES_DB, constants.DEV_FEATURES_CHUNK),
        (constants.TEST_CANDIDATES_DB, constants.TEST_FEATURES_DB, constants.TEST_FEATURES_CHUNK)
    ]
    for (candidate_db_path, feature_db_path, chunk) in iterator:
        start_time = datetime.now()
        _set = candidate_db_path.split("/")[-1].split(".")[1]

        # retry table creation until it succeeds (e.g. if the database is locked)
        done = False
        feature_db = sqlite3.connect(feature_db_path)
        cursor = feature_db.cursor()
        while not done:
            try:
                cursor.execute(sql.create_features_table(COLUMNS))
                feature_db.commit()
                done = True
            except Exception as e:
                helpers.log(e)
        helpers.log(f'Created {_set} features table.')
        # first id in the database; resume from the last processed row, if any
        (start,) = cursor.execute('SELECT MAX(id) FROM features').fetchone()
        start = start if start is not None else 0
        cursor.close()
        feature_db.close()
        helpers.log(f'Starting feature build at {start}.')

        candidate_db = sqlite3.connect(candidate_db_path)
        cursor = candidate_db.cursor()
        # last id in the database
        (stop,) = cursor.execute('SELECT COUNT(*) FROM candidates').fetchone()
        cursor.close()
        candidate_db.close()
        id_range = range(start + 1, stop + 1)
        helpers.log(f'Retrieved {len(id_range)} candidate indices for {_set} set.')

        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([_set] * len(id_range), id_range))
        _batched_set_generator = parallel.chunk(constants.GRAND_CHUNK, _set_generator)
        # enumerate so the grand batches can be numbered in the log below
        for grand_batch_idx, _batch_set in enumerate(_batched_set_generator):
            grand_batch_count = 0
            for batch_count in parallel.execute(_build_candidates, _batch_set, _as='process'):
                grand_batch_count += batch_count
                total_count += batch_count
            helpers.log(f'Processed {_set} batch of features no {grand_batch_idx} with {grand_batch_count} pairs.')
        helpers.log(f'Created {_set} features set with {total_count} pairs in {datetime.now() - start_time}')
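# Sketch of the resume logic above with an in-memory database: MAX(id) on the
# features table gives the last processed row, COUNT(*) on the candidates
# table gives the final row, and the continuation becomes
# range(start + 1, stop + 1). The schemas are toy stand-ins.
import sqlite3

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE features (id INTEGER PRIMARY KEY)')
db.execute('CREATE TABLE candidates (id INTEGER PRIMARY KEY)')
db.executemany('INSERT INTO features VALUES (?)', [(1,), (2,), (3,)])
db.executemany('INSERT INTO candidates VALUES (?)', [(i,) for i in range(1, 11)])

(start,) = db.execute('SELECT MAX(id) FROM features').fetchone()
start = start if start is not None else 0
(stop,) = db.execute('SELECT COUNT(*) FROM candidates').fetchone()
print(list(range(start + 1, stop + 1)))  # [4, 5, ..., 10] -> rows still to process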