def build():
    """Build and persist the index vocabulary maps and the internal-id <-> Wikipedia-id maps."""
    global INDEX
    INDEX = Index(env='default')
    batches = parallel.chunk(CHUNK_SIZE, INDEX.document_int_ids())
    helpers.log(f'Building maps for {INDEX.count()} documents.')

    int2wid = {}
    wid2int = {}
    for batch_maps in parallel.execute(_process_batch, batches):
        batch_int2wid, batch_wid2int = batch_maps
        int2wid.update(batch_int2wid)
        wid2int.update(batch_wid2int)

    with open(TOKEN2ID, 'wb') as file:
        pickle.dump(INDEX.token2id, file)
    with open(ID2TOKEN, 'wb') as file:
        pickle.dump(INDEX.id2token, file)
    with open(ID2DF, 'wb') as file:
        pickle.dump(INDEX.id2df, file)
    with open(ID2TF, 'wb') as file:
        pickle.dump(INDEX.id2tf, file)
    with open(INT2WID, 'wb') as file:
        pickle.dump(int2wid, file)
    with open(WID2INT, 'wb') as file:
        pickle.dump(wid2int, file)

    helpers.log(f'Finished building maps. Mapped {len(int2wid)}/{INDEX.index.document_count()}')
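# A minimal sketch (not part of the original module) of the _process_batch worker used above:
# for one batch of internal document ids it is expected to return an (int2wid, wid2int) pair.
# The INDEX.wid(...) accessor used here is hypothetical.
#
# def _process_batch(batch):
#     int2wid = {int_id: INDEX.wid(int_id) for int_id in batch}
#     wid2int = {wid: int_id for int_id, wid in int2wid.items()}
#     return int2wid, wid2int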
def build(use_less_memory: bool):
    """Build the corpus of TREC document files asynchronously from the HotpotQA raw wiki data.

    Expects the uncompressed HotpotQA raw wiki data available in the ``./data/raw`` folder.
    Folders are processed in order. Resulting documents are collected and sorted according to
    their ids. Each persisted file carries ``CHUNK_SIZE`` documents and is named as
    ``{first_doc_id_in_file}@{last_doc_id_in_file}``. If asked to use less memory, it will
    defer persistence to the child processes.

    :param use_less_memory: Whether to use less memory by not sorting documents and instead
        persisting them under a file with the same name as the folder from which the raw
        data originate.
    :return: None.
    """
    global USE_LESS_MEMORY
    USE_LESS_MEMORY = use_less_memory

    assert os.path.exists(RAW_DATA_DIR), f'Cannot find raw data in {os.path.abspath(RAW_DATA_DIR)}'
    os.makedirs(os.path.abspath(TREC_CORPUS_DIR), exist_ok=True)

    folder_paths = sorted(glob(os.path.join(RAW_DATA_DIR, '*')))
    doc_triples = []

    # create document database
    helpers.log('Creating documents database.')
    db = sqlite3.connect(DOCUMENT_DB)
    cursor: sqlite3.Cursor = db.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS documents (id INTEGER PRIMARY KEY, text TEXT)")
    db.commit()
    cursor.close()
    db.close()

    helpers.log('Extracting TREC documents.')
    if USE_LESS_MEMORY:
        for _ in parallel.execute(_process_raw_data_folder, folder_paths):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')
    else:
        for doc_triples_by_folder in parallel.execute(_process_raw_data_folder, folder_paths):
            doc_triples.extend(doc_triples_by_folder)
        doc_triples = sorted(doc_triples, key=lambda triple: triple[0])
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')

        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tPersisting TREC documents.')
        for _ in parallel.execute(_process_doc_triples, parallel.chunk(100000, doc_triples)):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tPersistence done.')

    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tFinished building TREC corpus.')
    return
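# A hedged sketch (assumption, not the original implementation) of the _process_doc_triples
# worker: persist one chunk of sorted (doc_id, ...) triples as a single TREC file named
# '{first_doc_id_in_file}@{last_doc_id_in_file}', as described in the docstring above.
# The triple layout (id, title, text) and the exact TREC tags are assumptions.
#
# def _process_doc_triples(triples):
#     first_id, last_id = triples[0][0], triples[-1][0]
#     path = os.path.join(TREC_CORPUS_DIR, f'{first_id}@{last_id}')
#     with open(path, 'w') as file:
#         for (doc_id, title, text) in triples:
#             file.write(f'<DOC>\n<DOCNO>{doc_id}</DOCNO>\n<TEXT>\n{title}\n{text}\n</TEXT>\n</DOC>\n')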
def _evaluate_epoch(model: nn.Module, ref: str, data_loader: DataLoader,
                    trec_eval: str, trec_eval_agg: str, save: bool) -> METRICS:
    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ref, measures=pytrec_eval.supported_measures)

    acc = 0
    final_scores = torch.empty((len(data_loader.dataset), 1), dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in enumerate(data_loader):
            (questions, documents, features, targets,
             batch_question_ids, batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)
            targets = targets.to(device=ct.DEVICE, non_blocking=True)
            batch_size = questions.shape[0]

            scores, encodings = model(questions, documents, features)
            acc += torch.sum((torch.round(scores) == targets).to(dtype=torch.float))

            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) * ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in parallel.execute(
            _build_run,
            parallel.chunk(10000, zip(question_ids, document_ids, final_scores.numpy()))):
        epoch_run.update_rankings(batch_run)

    acc = acc / len(data_loader.dataset)
    _, trec_eval_agg = epoch_eval.evaluate(epoch_run, trec_eval, trec_eval_agg, save)

    return epoch_run, acc.item(), \
        trec_eval_agg['map_cut_10'], trec_eval_agg['ndcg_cut_10'], trec_eval_agg['recall_10'], \
        trec_eval_agg['map_cut_100'], trec_eval_agg['ndcg_cut_100'], trec_eval_agg['recall_100'], \
        trec_eval_agg['map_cut_1000'], trec_eval_agg['ndcg_cut_1000'], trec_eval_agg['recall_1000'], \
        trec_eval_agg['P_5']
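# Usage sketch (not in the original source): the METRICS tuple returned by _evaluate_epoch
# follows the order of the return statement above. The loader and file paths in this call
# are placeholders.
#
# (run, accuracy,
#  map_10, ndcg_10, recall_10,
#  map_100, ndcg_100, recall_100,
#  map_1000, ndcg_1000, recall_1000,
#  p_5) = _evaluate_epoch(model, ref, dev_data_loader,
#                         trec_eval_path, trec_eval_agg_path, save=False)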
def load_dataset_batches() -> Tuple[List[List[Question]], int, int]:
    """Load the dataset in batches of ``CHUNK_SIZE`` and calculate lengths."""
    helpers.log(f'Loading dataset in chunks. Data file: {TRAIN_HOTPOT_SET}. Chunk size: {CHUNK_SIZE}.')
    start = datetime.now()

    training_set = Dataset.from_file(TRAIN_HOTPOT_SET)
    batches = parallel.chunk(CHUNK_SIZE, training_set.questions)
    no_batches = len(batches)
    no_queries = len(training_set)

    end = datetime.now()
    helpers.log(f'Finished loading in {end - start}. Batches: {no_batches}. Queries: {no_queries}.')

    return batches, no_batches, no_queries
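# Usage sketch (not in the original source): feeding the batches returned by
# load_dataset_batches to parallel.execute, mirroring the pattern used elsewhere in this
# codebase. The _process_questions worker is hypothetical.
#
# batches, no_batches, no_queries = load_dataset_batches()
# for batch_result in parallel.execute(_process_questions, batches):
#     ...  # aggregate per-batch results here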
def build(skip_relevant: bool = True):
    """Build the candidate document sets for each split and store them in SQLite."""
    global INDEX, COLUMNS, QUESTION_COUNTS
    global SKIP_RELEVANT
    SKIP_RELEVANT = skip_relevant

    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    os.makedirs(ct.CANDIDATES_DIR, exist_ok=True)
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
    train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
    dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)

    iterator: List[Tuple[list, str, str, int]] = [
        # (train_question_set, 'train', ct.TRAIN_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        # (dev_question_set, 'dev', ct.DEV_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        (test_question_set, 'test', ct.TEST_CANDIDATES_DB, ct.CANDIDATES_CHUNK)
    ]
    for (_set, split, candidate_db_path, chunk) in iterator:
        start = datetime.now()

        db = sqlite3.connect(candidate_db_path)
        cursor = db.cursor()
        cursor.execute(sql.create_candidate_table)
        db.commit()
        helpers.log('Created candidates table.')
        QUESTION_COUNTS = cursor.execute(sql.count_question_rows).fetchall()
        QUESTION_COUNTS = {json.loads(_id): _count for (_id, _count) in QUESTION_COUNTS}
        helpers.log(f'Retrieved question counts for {len(QUESTION_COUNTS)} questions.')
        cursor.close()
        db.close()

        helpers.log(f'Creating {split} candidate set with {len(_set)} questions.')
        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([split] * len(_set), _set))
        for batch_count in parallel.execute(_build_candidates, _set_generator):
            total_count += batch_count
        helpers.log(f'Created {split} candidate set with {total_count} questions in {datetime.now() - start}.')
def __init__(self, normalized: bool):
    super().__init__(None)
    self.normalized = normalized

    if os.path.isfile(constants.IBM_MODEL):
        # Load the cached IBM Model 1 translation model.
        with open(constants.IBM_MODEL, 'rb') as file:
            self.ibm1 = pickle.load(file)
    else:
        # Train IBM Model 1 on bitext built from the training questions and cache it for later runs.
        dataset = Dataset.from_file(constants.TRAIN_HOTPOT_SET)
        bitext = []
        batches = parallel.chunk(constants.CHUNK_SIZE, dataset.questions)
        # for partial_bitext in map(_build_bitext, batches):
        for partial_bitext in parallel.execute(_build_bitext, batches):
            bitext.extend(partial_bitext)
        self.ibm1 = nltk.IBMModel1(bitext, 5)

        os.makedirs(constants.TRANSLATION_MODEL_DIR, exist_ok=True)
        with open(constants.IBM_MODEL, 'wb') as file:
            pickle.dump(self.ibm1, file)
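# A hedged sketch (assumption, not the original implementation) of the _build_bitext worker:
# for a batch of questions it is expected to return a list of nltk AlignedSent pairs suitable
# for nltk.IBMModel1. The question.tokens / question.context_tokens attributes are hypothetical.
#
# def _build_bitext(questions):
#     bitext = []
#     for question in questions:
#         bitext.append(nltk.AlignedSent(question.tokens, question.context_tokens))
#     return bitext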
def evaluate_test_set(model_name: str, output_dir: str):
    os.makedirs('./evaluation', exist_ok=True)

    config = models[model_name]
    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder, scorer).to(device=ct.DEVICE)
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)
    _load_checkpoint(model, optimizer, config)

    dataset = QueryDocumentsDataset(ct.TEST_FEATURES_DB)
    data_loader = DataLoader(dataset, ct.BATCH_SIZE, False,
                             collate_fn=QueryDocumentsDataset.collate,
                             num_workers=os.cpu_count(), pin_memory=True)

    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ct.TEST_TREC_REFERENCE, measures=pytrec_eval.supported_measures)

    final_scores = torch.empty((len(data_loader.dataset), 1), dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(data_loader)):
            (questions, documents, features, targets,
             batch_question_ids, batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)
            batch_size = questions.shape[0]

            scores, encodings = model(questions, documents, features)

            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) * ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in parallel.execute(
            _build_run,
            parallel.chunk(10000, zip(question_ids, document_ids, final_scores.numpy()))):
        epoch_run.update_rankings(batch_run)

    trec_eval, trec_eval_agg = epoch_eval.evaluate(epoch_run, save=False)

    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, model_name + '_hotpot.json'), 'w') as file:
        # use DEV_HOTPOT because that corresponds to our test set. the actual hotpot test set is unlabeled.
        json.dump(epoch_run.to_json(ct.TEST_FEATURES_DB, ct.DEV_HOTPOT_SET), file, indent=True)
    print(json.dumps(trec_eval_agg, indent=True))
def build():
    assert constants.TRAIN_FEATURES_CHUNK > 1
    assert constants.DEV_FEATURES_CHUNK > 1

    global INDEX
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    global EXTRACTORS
    EXTRACTORS = []
    if 'entity' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(EntityExtractor(INDEX))
    if 'ibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=False))
    if 'nibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=True))
    if 'bigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=False))
    if 'nbigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=True))
    if 'qword' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(QuestionWordFeatureExtractor())
    if 'doclen' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(DocumentLengthFeatureExtractor())
    helpers.log('Loaded extractors.')

    global COLUMNS
    COLUMNS = copy.copy(constants.FEATURE_BASE_COLUMN_NAMES)
    COLUMNS.extend(feature for extractor in EXTRACTORS for feature in extractor.feature_name)
    COLUMNS.append(constants.FEATURE_TARGET_COLUMN_NAME)
    helpers.log('Loaded column names.')

    os.makedirs(constants.FEATURES_DIR, exist_ok=True)
    iterator: List[Tuple[str, str, int]] = [
        # (constants.TRAIN_CANDIDATES_DB, constants.TRAIN_FEATURES_DB, constants.TRAIN_FEATURES_CHUNK),
        # (constants.DEV_CANDIDATES_DB, constants.DEV_FEATURES_DB, constants.DEV_FEATURES_CHUNK),
        (constants.TEST_CANDIDATES_DB, constants.TEST_FEATURES_DB, constants.TEST_FEATURES_CHUNK)
    ]
    for (candidate_db_path, feature_db_path, chunk) in iterator:
        start_time = datetime.now()
        _set = candidate_db_path.split("/")[-1].split(".")[1]

        done = False
        feature_db = sqlite3.connect(feature_db_path)
        cursor = feature_db.cursor()
        while not done:
            try:
                cursor.execute(sql.create_features_table(COLUMNS))
                feature_db.commit()
                done = True
            except Exception as e:
                helpers.log(e)
        helpers.log(f'Created {_set} features table.')
        (start,) = cursor.execute('SELECT MAX(id) FROM features').fetchone()
        start = start if start is not None else 0  # first id in the database
        cursor.close()
        feature_db.close()
        helpers.log(f'Starting feature build at {start}.')

        candidate_db = sqlite3.connect(candidate_db_path)
        cursor = candidate_db.cursor()
        (stop,) = cursor.execute('SELECT COUNT(*) FROM candidates').fetchone()  # last id in the database
        cursor.close()
        candidate_db.close()
        id_range = range(start + 1, stop + 1)
        helpers.log(f'Retrieved {len(id_range)} candidate indices for {_set} set.')

        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([_set] * len(id_range), id_range))
        _batched_set_generator = parallel.chunk(constants.GRAND_CHUNK, _set_generator)
        for grand_batch_idx, _batch_set in enumerate(_batched_set_generator):
            grand_batch_count = 0
            for batch_count in parallel.execute(_build_candidates, _batch_set, _as='process'):
                grand_batch_count += batch_count
                total_count += batch_count
            helpers.log(f'Processed {_set} batch of features no {grand_batch_idx} with {grand_batch_count} pairs.')
        helpers.log(f'Created {_set} features set with {total_count} pairs in {datetime.now() - start_time}.')
def run_eval(_set: str, config: Config):
    start = datetime.now()
    if _set == 'train':
        feature_db = ct.TRAIN_FEATURES_DB
        ref = ct.TRAIN_TREC_REFERENCE
    elif _set == 'dev':
        feature_db = ct.DEV_FEATURES_DB
        ref = ct.DEV_TREC_REFERENCE
    elif _set == 'test':
        feature_db = ct.TEST_FEATURES_DB
        ref = ct.TEST_TREC_REFERENCE
    else:
        raise ValueError(f'Unknown set {_set}.')

    with open(ct.INT2WID, 'rb') as file:
        global INT2WID
        INT2WID = pickle.load(file)
    with open(ct.WID2TITLE, 'rb') as file:
        global WID2TITLE
        WID2TITLE = pickle.load(file)

    trec_eval = ct.L2R_EVAL.format(config.name, 'test')
    trec_eval_agg = ct.L2R_EVAL_AGG.format(config.name, 'test')

    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder, scorer).to(device=ct.DEVICE)
    # noinspection PyCallingNonCallable
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)
    _ = _load_checkpoint(model, optimizer, config)
    helpers.log(f'Loaded maps, model, and optimizer in {datetime.now() - start}.')

    test_data_set = QueryDocumentsDataset(feature_db)
    test_data_loader = data.DataLoader(test_data_set, ct.BATCH_SIZE, False,
                                       num_workers=os.cpu_count(),
                                       collate_fn=QueryDocumentsDataset.collate)

    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ref, measures=pytrec_eval.supported_measures)

    final_scores = torch.empty((len(test_data_loader.dataset), 1), dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(test_data_loader)):
            (questions, documents, features, targets,
             batch_question_ids, batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)
            batch_size = questions.shape[0]

            scores, encodings = model(questions, documents, features)

            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) * ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in tqdm(parallel.execute(
            _build_run,
            parallel.chunk(10000, zip(question_ids, document_ids, final_scores.numpy())))):
        epoch_run.update_rankings(batch_run)

    trec_eval, trec_eval_agg = epoch_eval.evaluate(epoch_run, trec_eval, trec_eval_agg, False)

    # Exact recall at 10: fraction of questions whose relevant documents all appear in the top 10.
    er_10 = 0
    for stats in trec_eval.values():
        er_10 += stats['recall_10'] == 1.0
    er_10 /= len(trec_eval)

    print(f'ndcg@10:\t\t{trec_eval_agg["ndcg_cut_10"]:.4f}')
    print(f'map@10:\t\t{trec_eval_agg["map_cut_10"]:.4f}')
    print(f'er@10:\t\t{er_10:.4f}')
    print(f'recall@10:\t\t{trec_eval_agg["recall_10"]:.4f}')
    print(f'recall@100:\t\t{trec_eval_agg["recall_100"]:.4f}')
    print(f'recall@1000:\t\t{trec_eval_agg["recall_1000"]:.4f}')