def _build_group_title_map(folder_path: str) -> Dict[str, int]:
    title2wid: Dict[str, int] = {}

    file_paths = sorted(glob(os.path.join(folder_path, '*.bz2')))
    for file_path in file_paths:
        with bz2.BZ2File(file_path) as file:
            for line in file:
                doc = json.loads(line)
                doc_wid, doc_title = int(doc['id']), doc['title']

                if title2wid.get(doc_title, None) is None:
                    title2wid[doc_title] = doc_wid
                else:
                    # Hack for taking care of the double title that points to a proper article and to a
                    # disambiguation article. Assumes the only article of interest is the one that is not a
                    # disambiguation one.
                    helpers.log(f'Title {doc_title} has the WID {title2wid.get(doc_title)}. Current WID: {doc_wid}.')
                    if doc_wid == 2209045:
                        helpers.log(f'Replacing WID {title2wid.get(doc_title)} with WID {doc_wid}.')
                        title2wid[doc_title] = doc_wid

    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tBuilt title maps for folder {folder_path.split("/")[-1]}.')

    return title2wid
def build():
    global INDEX
    INDEX = Index(env='default')
    batches = parallel.chunk(CHUNK_SIZE, INDEX.document_int_ids())

    helpers.log(f'Building maps for {INDEX.count()} documents.')
    int2wid = {}
    wid2int = {}
    for batch_maps in parallel.execute(_process_batch, batches):
        batch_int2wid, batch_wid2int = batch_maps
        int2wid.update(batch_int2wid)
        wid2int.update(batch_wid2int)

    with open(TOKEN2ID, 'wb') as file:
        pickle.dump(INDEX.token2id, file)
    with open(ID2TOKEN, 'wb') as file:
        pickle.dump(INDEX.id2token, file)
    with open(ID2DF, 'wb') as file:
        pickle.dump(INDEX.id2df, file)
    with open(ID2TF, 'wb') as file:
        pickle.dump(INDEX.id2tf, file)
    with open(INT2WID, 'wb') as file:
        pickle.dump(int2wid, file)
    with open(WID2INT, 'wb') as file:
        pickle.dump(wid2int, file)

    helpers.log(f'Finished building maps. Mapped {len(int2wid)}/{INDEX.index.document_count()}.')
def build(use_less_memory: bool):
    """Build the corpus of TREC document files asynchronously from the HotpotQA raw wiki data.

    Expects the uncompressed HotpotQA raw wiki data to be available in the ``./data/raw`` folder. Folders are
    processed in order. Resulting documents are collected and sorted by their ids. Each persisted file carries
    ``CHUNK_SIZE`` documents and is named ``{first_doc_id_in_file}@{last_doc_id_in_file}``.

    If asked to use less memory, persistence is deferred to the child processes.

    :param use_less_memory: Whether to use less memory by not sorting documents and instead persisting them under
        a file with the same name as the folder from which the raw data originate.
    :return: None.
    """
    global USE_LESS_MEMORY
    USE_LESS_MEMORY = use_less_memory

    assert os.path.exists(RAW_DATA_DIR), f'Cannot find raw data in {os.path.abspath(RAW_DATA_DIR)}'
    os.makedirs(os.path.abspath(TREC_CORPUS_DIR), exist_ok=True)

    folder_paths = sorted(glob(os.path.join(RAW_DATA_DIR, '*')))
    doc_triples = []

    # Create the document database.
    helpers.log('Creating documents database.')
    db = sqlite3.connect(DOCUMENT_DB)
    cursor: sqlite3.Cursor = db.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS documents (id INTEGER PRIMARY KEY, text TEXT)")
    db.commit()
    cursor.close()
    db.close()

    helpers.log('Extracting TREC documents.')
    if USE_LESS_MEMORY:
        for _ in parallel.execute(_process_raw_data_folder, folder_paths):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')
    else:
        for doc_triples_by_folder in parallel.execute(_process_raw_data_folder, folder_paths):
            doc_triples.extend(doc_triples_by_folder)
        doc_triples = sorted(doc_triples, key=lambda triple: triple[0])
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tExtraction done.')

        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tPersisting TREC documents.')
        for _ in parallel.execute(_process_doc_triples, parallel.chunk(100000, doc_triples)):
            pass
        logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tPersistence done.')

    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tFinished building TREC corpus.')

    return
def create_retrieval_db(db_name: str) -> None:
    """Create the sqlite database where retrieval results will be persisted."""
    helpers.log(f'Creating retrieval SQL table. Database: {db_name}')
    with sqlite3.connect(db_name) as db:
        cursor = db.cursor()
        cursor.execute(sql.create_table())
        db.commit()

    return
def _process_batch(batch: Tuple[int, List[int]]) -> Tuple[Dict[int, int], Dict[int, int]]:
    no, batch = batch
    wid2int = {}
    int2wid = {}
    for int_doc_id in batch:
        wid = INDEX.get_wid(int_doc_id)
        wid2int[wid] = int_doc_id
        int2wid[int_doc_id] = wid

    helpers.log(f'Finished batch. Mapped {len(wid2int)}.')

    return int2wid, wid2int
def _build_candidates(numbered_batch: Tuple[int, Tuple[str, Dict[str, Any]]]) -> int:
    try:
        start_time = datetime.now()
        batch_index, batch = numbered_batch
        _set, start = batch[0]
        _, stop = batch[-1]
        if _set == 'train':
            candidate_db_path = constants.TRAIN_CANDIDATES_DB
            feature_db_path = constants.TRAIN_FEATURES_DB
        elif _set == 'dev':
            candidate_db_path = constants.DEV_CANDIDATES_DB
            feature_db_path = constants.DEV_FEATURES_DB
        elif _set == 'test':
            candidate_db_path = constants.TEST_CANDIDATES_DB
            feature_db_path = constants.TEST_FEATURES_DB
        else:
            raise ValueError(f'Unknown dataset {_set}.')

        candidate_db = sqlite3.connect(candidate_db_path)
        candidate_cursor = candidate_db.cursor()
        candidate_rows = candidate_cursor.execute(sql.fetch_candidate_batch, (start, stop)).fetchall()
        candidate_cursor.close()
        candidate_db.close()

        batch_count = 0
        rows = []
        feature_db = sqlite3.connect(feature_db_path)
        feature_cursor = feature_db.cursor()
        for candidate_row in candidate_rows:
            (_id, question_id, _type, level, doc_iid, doc_wid, doc_title,
             question_text, doc_text, question_tokens, doc_tokens, tfidf, relevance) = candidate_row

            # Skip candidates whose features were already extracted in a previous run.
            exists = feature_cursor.execute('SELECT id FROM features WHERE id = ?', (_id,)).fetchone()
            if exists is not None:
                continue

            row: List[str] = [_id, question_id, _type, level, doc_iid, doc_wid, doc_title,
                              question_text, doc_text, question_tokens, doc_tokens, tfidf]
            _extract_features(row, EXTRACTORS, json.loads(question_text), json.loads(doc_text))
            row.append(relevance)

            rows.append(row)
            batch_count += 1

        rows_to_db(_set, rows)
        helpers.log(f'Processed batch {batch_index} of {batch_count} pairs in {datetime.now() - start_time}')

        feature_cursor.close()
        feature_db.close()

        return batch_count
    except Exception as e:
        helpers.log(e)
        return 0
def build() -> None:
    helpers.log('Building Trec Eval references.')

    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
    dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
    train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)

    iterator = [('train', train_question_set), ('dev', dev_question_set), ('test', test_question_set)]
    for _set, reference in parallel.execute(_build_reference, iterator):
        helpers.log(f'Created reference {reference} for {_set} set.')
def __init__(self, database: str):
    start = datetime.now()

    connection = sqlite3.connect(database)
    cursor = connection.cursor()
    self.data = cursor.execute(f'SELECT {", ".join(self._features)} FROM features').fetchall()

    helpers.log(f'Initialized {database.split(".")[-3]} dataset in {datetime.now() - start}')
def __init__(self) -> None:
    tree = ElementTree.parse(INDRI_PARAMETERS)
    if INDRI_PARAMETERS.split('/')[-1] == 'indri_stop_stem.xml':
        helpers.log('Loading stopwords.')
        stopwords = set()
        for elem in tree.find('stopper').iter('word'):
            stopwords.add(elem.text)
        self.stopwords = frozenset(stopwords)
    elif INDRI_PARAMETERS.split('/')[-1] == 'index.xml':
        self.stopwords = None
    else:
        raise NotImplementedError(f'Unknown index setting: {INDRI_PARAMETERS.split("/")[-1]}')

    self._punctuation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
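# The tokenize step itself is not part of this excerpt. The sketch below is only an illustration, under the
# assumption that the punctuation table and stopword set built above are applied in the usual way: punctuation is
# replaced with spaces, the text is lower-cased and split on whitespace, and stopwords are dropped when available.
def _example_tokenize(text, punctuation_table, stopwords):
    # Hypothetical helper, not part of the original class.
    tokens = text.translate(punctuation_table).lower().split()
    if stopwords is not None:
        tokens = [token for token in tokens if token not in stopwords]
    return tokens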
def _load_checkpoint(model, optimizer, config: Config):
    best_statistic = 0
    start = datetime.now()
    if os.path.isfile(ct.L2R_TRAIN_PROGRESS.format(config.name)):
        with open(ct.L2R_BEST_MODEL.format(config.name), 'rb') as file:
            checkpoint = torch.load(file, map_location=ct.DEVICE)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        model.epochs_trained = checkpoint['epoch']
        best_statistic = checkpoint['best_statistic']
        helpers.log(f'Loaded checkpoint from {ct.L2R_BEST_MODEL.format(config.name)} in {datetime.now() - start}.')

    return best_statistic
def to_json(self, db, dataset_path) -> List[dict]:
    helpers.log('Creating hotpot data.')
    dataset = Dataset.from_file(dataset_path)
    questions = []

    connection = sqlite3.connect(db)
    cursor = connection.cursor()
    doc_results = cursor.execute("SELECT DISTINCT doc_title, document_text FROM features").fetchall()
    title2text = {json.loads(doc_title): json.loads(doc_text) for (doc_title, doc_text) in doc_results}
    cursor.close()
    connection.close()
    helpers.log('Loaded title2text.')

    for question_id, ranking in tqdm(self.items()):
        context = []
        sorted_by_score = sorted(ranking.items(), key=lambda value: value[1], reverse=True)
        for rank in range(min(10, len(ranking))):
            (title, score) = sorted_by_score[rank]
            doc_text = title2text[title]
            article = [paragraph.split(constants.EOS.strip()) for paragraph in doc_text.split(constants.EOP.strip())]
            article.insert(0, title)
            article.insert(1, score)
            context.append(article)

        full_question = dataset.find_by_id(question_id)
        question = {
            '_id': full_question.id,
            'level': full_question.level,
            'type': full_question.type,
            'question': full_question.question,
            'context': context,
            'answer': full_question.answer,
            'supporting_facts': full_question.supporting_facts,
        }
        questions.append(question)

    return questions
def _process_raw_data_folder(folder_path: str):
    """Load documents from the JSON collections line by line.

    Extract document id, title, and first paragraph from each document in each JSON file. Create TREC documents
    and collect them in a list. If ``USE_LESS_MEMORY`` is set to ``True``, the TREC documents are persisted to disk
    under a file named after the folder from which the files originate. If set to ``False``, the documents are
    returned for further processing in the main thread.

    Store each document string in a database for later reference.

    :param folder_path: The path to the folder where the compressed JSON collection of raw wiki data lies.
    :return: A sorted collection of (document_id, document_title, trec_document_string) triples, or ``None`` when
        ``USE_LESS_MEMORY`` is ``True``.
    """
    doc_count = 0
    doc_pairs: List[Tuple[int, str]] = []
    doc_triples = []

    file_paths = sorted(glob(os.path.join(folder_path, '*.bz2')))
    for file_path in file_paths:
        with bz2.BZ2File(file_path) as file:
            for line in file:
                doc = json.loads(line.decode('utf-8'))
                doc_id, doc_title, doc_str = _extract_doc(doc)
                doc_pairs.append((doc_id, doc_str))
                doc_count += 1
                # doc_triples.append((doc_id, doc_title, _build_trec(doc_id, doc_title, doc_str)))

    folder = folder_path.split("/")[-1]
    helpers.log(f'Extracted documents from folder {folder}.')

    db = sqlite3.connect(DOCUMENT_DB)
    cursor = db.cursor()
    cursor.executemany("INSERT INTO documents (id, text) VALUES (?, ?)", doc_pairs)
    db.commit()
    cursor.close()
    db.close()
    helpers.log(f'Persisted {doc_count} documents to database.')

    if USE_LESS_MEMORY:
        file_name = os.path.join(TREC_CORPUS_DIR, f'{folder}.trectext')
        # _process_doc_triples(doc_triples, file_name)
    else:
        return doc_triples
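# The ``_build_trec`` helper referenced in the commented-out line above is not included in this excerpt. The
# sketch below is a hypothetical stand-in that assumes the conventional TREC text layout (DOCNO and TEXT fields)
# with the title folded into the body; the field layout actually used by the project may differ.
def _build_trec_sketch(doc_id: int, doc_title: str, doc_str: str) -> str:
    return (f'<DOC>\n'
            f'<DOCNO>{doc_id}</DOCNO>\n'
            f'<TEXT>\n{doc_title}\n{doc_str}\n</TEXT>\n'
            f'</DOC>\n')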
def _save_epoch_stats(name: str, epoch: int, train_loss: float,
                      train_stats: Tuple[float, ...], dev_stats: Tuple[float, ...]):
    with open(ct.L2R_TRAIN_PROGRESS.format(name), 'a') as f:
        writer = csv.writer(f)
        writer.writerow([epoch, train_loss, *train_stats, *dev_stats])

    helpers.log(f'[Epoch {epoch:03d}][Train Acc: {train_stats[0]:0.4f}]'
                f'[Train MAP@10: {train_stats[1]:0.4f}][Train NDCG@10: {train_stats[2]:0.4f}]'
                f'[Train Precision@5: {train_stats[10]:0.4f}]'
                f'[Train Loss: {train_loss:0.4f}]'
                f'[Dev Acc: {dev_stats[0]:0.4f}]'
                f'[Dev MAP@10: {dev_stats[1]:0.4f}][Dev NDCG@10: {dev_stats[2]:0.4f}]'
                f'[Dev Recall@10: {dev_stats[3]:0.4f}]'
                f'[Dev MAP@100: {dev_stats[4]:0.4f}][Dev NDCG@100: {dev_stats[5]:0.4f}]'
                f'[Dev Recall@100: {dev_stats[6]:0.4f}]'
                f'[Dev Recall@1000: {dev_stats[9]:0.4f}]')
def load_dataset_batches() -> Tuple[List[List[Question]], int, int]:
    """Load the dataset in batches of ``CHUNK_SIZE`` and calculate lengths."""
    helpers.log(f'Loading dataset in chunks. Data file: {TRAIN_HOTPOT_SET}. Chunk size: {CHUNK_SIZE}.')
    start = datetime.now()

    training_set = Dataset.from_file(TRAIN_HOTPOT_SET)
    batches = parallel.chunk(CHUNK_SIZE, training_set.questions)
    no_batches = len(batches)
    no_queries = len(training_set)

    end = datetime.now()
    helpers.log(f'Finished loading in {end - start}. Batches: {no_batches}. Queries: {no_queries}.')

    return batches, no_batches, no_queries
def _process_question_batch(question_numbered_batch: Tuple[int, Tuple[Question]]) -> int:
    """If the batch was not previously processed, filter a batch and persist it to the SQLite database."""
    (no, questions), retrieved = question_numbered_batch, 0

    already_processed = retrieve.check_already_processed(DB_NAME, question_numbered_batch)
    if len(already_processed) == len(questions):
        helpers.log(f'Batch {no} already processed. Skipping.')
        return 0

    for question in questions:
        if already_processed.get(question.id, False):
            continue
        retrieval = INDEX.unigram_query(question.question, request=5000)
        retrieve.persist_retrieval(DB_NAME, question, retrieval)
        retrieved += 1

    helpers.log(f'Retrieved questions: {retrieved}/{len(questions)}.')

    return retrieved
def build(skip_relevant: bool = True):
    global INDEX, COLUMNS, QUESTION_COUNTS
    global SKIP_RELEVANT
    SKIP_RELEVANT = skip_relevant

    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    os.makedirs(ct.CANDIDATES_DIR, exist_ok=True)
    with open(ct.TRAIN_HOTPOT_SET, 'r') as file:
        question_set = json.load(file)
    train_question_set = question_set[:ct.TRAIN_DEV_SPLIT]
    dev_question_set = question_set[ct.TRAIN_DEV_SPLIT:]
    with open(ct.DEV_HOTPOT_SET, 'r') as file:
        test_question_set = json.load(file)

    iterator: List[Tuple[List[Dict[str, Any]], str, str, int]] = [
        # (train_question_set, 'train', ct.TRAIN_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        # (dev_question_set, 'dev', ct.DEV_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
        (test_question_set, 'test', ct.TEST_CANDIDATES_DB, ct.CANDIDATES_CHUNK),
    ]
    for (_set, split, candidate_db_path, chunk) in iterator:
        start = datetime.now()

        db = sqlite3.connect(candidate_db_path)
        cursor = db.cursor()
        cursor.execute(sql.create_candidate_table)
        db.commit()
        helpers.log('Created candidates table.')

        QUESTION_COUNTS = cursor.execute(sql.count_question_rows).fetchall()
        QUESTION_COUNTS = {json.loads(_id): _count for (_id, _count) in QUESTION_COUNTS}
        helpers.log(f'Retrieved question counts for {len(QUESTION_COUNTS)} questions.')
        cursor.close()
        db.close()

        helpers.log(f'Creating {split} candidate set with {len(_set)} questions.')
        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([split] * len(_set), _set))
        for batch_count in parallel.execute(_build_candidates, _set_generator):
            total_count += batch_count
        helpers.log(f'Created {split} candidate set with {total_count} questions in {datetime.now() - start}.')
def rows_to_db(_set: str, rows: List[Any]):
    if _set == 'train':
        db_path = constants.TRAIN_FEATURES_DB
    elif _set == 'dev':
        db_path = constants.DEV_FEATURES_DB
    elif _set == 'test':
        db_path = constants.TEST_FEATURES_DB
    else:
        raise ValueError(f'Unknown set {_set}.')

    # Retry until the insert succeeds; concurrent writers may lock the SQLite database.
    done = False
    while not done:
        try:
            connection = sqlite3.connect(db_path)
            cursor = connection.cursor()
            cursor.executemany(sql.insert_features(COLUMNS), [tuple(row) for row in rows])
            connection.commit()
            cursor.close()
            connection.close()
            done = True
        except Exception as e:
            helpers.log(e)
def process(command: str) -> Tuple[Dict[str, Dict[str, float]], Dict[str, float]]:
    helpers.log('Loading int2wid and wid2title mappings.')
    global _WID2TITLE, _INT2WID
    with open(WID2TITLE, 'rb') as file:
        _WID2TITLE = pickle.load(file)
    with open(INT2WID, 'rb') as file:
        _INT2WID = pickle.load(file)

    model_type, model_name = command.split('@')
    dataset_id = helpers.training_set_id()
    if model_type == 'term':
        dir_path = os.path.join(TERM_RETRIEVALS_DIR, f'{model_name}.{dataset_id}')
    else:
        raise ValueError(f'Unknown model type: {model_type}')

    reference_path = os.path.join(dir_path, 'reference.json')
    reference_exists = os.path.isfile(reference_path)
    if not reference_exists:
        _create_trec_eval_reference(dir_path)

    run_path = os.path.join(dir_path, 'retrievals.json')
    run_exists = os.path.isfile(run_path)
    if not run_exists:
        run = _create_trec_run(dir_path)
    else:
        with open(run_path, 'r') as file:
            run = json.load(file)

    trec_eval_path = os.path.join(dir_path, 'trec_eval.json')
    trec_eval_agg_path = os.path.join(dir_path, 'trec_eval_agg.json')
    evaluator = Evaluator(reference_path, measures=pytrec_eval.supported_measures)
    trec_eval, trec_eval_agg = evaluator.evaluate(run, trec_eval_path, trec_eval_agg_path)

    return trec_eval, trec_eval_agg
def build():
    assert os.path.exists(RAW_DATA_DIR), f'Cannot find raw data in {os.path.abspath(RAW_DATA_DIR)}'
    os.makedirs(os.path.abspath(INDEX_DIR), exist_ok=True)

    folder_paths = sorted(glob(os.path.join(RAW_DATA_DIR, '*')))
    title2wid = {}
    wid2title = {}

    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tBuilding title maps.')
    for group_title2wid in parallel.execute(_build_group_title_map, folder_paths):
        for (title, wid) in group_title2wid.items():
            if title2wid.get(title, None) is None:
                title2wid[title] = wid
                wid2title[wid] = title
            else:
                # Hack for taking care of the double title that points to a proper article and to a
                # disambiguation article. Assumes the only article of interest is the one that is not a
                # disambiguation one.
                helpers.log(f'Title {title} has the WID {title2wid.get(title)}. Current WID: {wid}.')
                if wid == 2209045:
                    helpers.log(f'Replacing WID {title2wid.get(title)} with WID {wid}.')
                    title2wid[title] = wid
                    wid2title[wid] = title

    with open(WID2TITLE, 'wb') as file:
        pickle.dump(wid2title, file)
    with open(TITLE2WID, 'wb') as file:
        pickle.dump(title2wid, file)
    logging.info(f'[{datetime.now()}]\t[{os.getpid()}]\tBuilt title maps.')

    return
def __init__(self, env: str = 'default', verbose: bool = False, avg_len=False):
    if verbose:
        helpers.log(f'Loading index {INDRI_INDEX_DIR} with {env} query environment.')
    start = datetime.now()

    self.index = pyndri.Index(f'{INDRI_INDEX_DIR}')
    self.token2id, self.id2token, self.id2df = self.index.get_dictionary()
    self.id2tf = self.index.get_term_frequencies()

    if avg_len:
        # Estimate the average document length by iterating over all documents in the index.
        doc_lengths = np.empty(self.index.document_count(), dtype=np.float)
        for (idx, doc_iid) in enumerate(range(self.index.document_base(), self.index.maximum_document())):
            doc_lengths[idx] = self.index.document_length(doc_iid)
        self.avg_doc_len = float(doc_lengths.mean())

    self.tokenizer = Tokenizer()

    if os.path.isfile(TITLE2WID):
        with open(TITLE2WID, 'rb') as file:
            self.title2wid = pickle.load(file)
    if os.path.isfile(WID2TITLE):
        with open(WID2TITLE, 'rb') as file:
            self.wid2title = pickle.load(file)
    try:
        if os.path.isfile(WID2INT):
            with open(WID2INT, 'rb') as file:
                self.wid2int = pickle.load(file)
        if os.path.isfile(INT2WID):
            with open(INT2WID, 'rb') as file:
                self.int2wid = pickle.load(file)
    except FileNotFoundError:
        helpers.log('ID mappings do not exist yet. Not loaded.')

    if env == 'default':
        self.env = pyndri.QueryEnvironment(self.index)
    elif env == 'tfidf':
        self.env = pyndri.TFIDFQueryEnvironment(self.index, k1=1.2, b=0.75)
    elif env == 'prf':
        env = pyndri.QueryEnvironment(self.index)
        self.env = pyndri.PRFQueryEnvironment(env, fb_docs=10, fb_terms=10)
    else:
        raise ValueError(f'Unknown environment configuration {env}')

    stop = datetime.now()
    if verbose:
        helpers.log(f'Loaded index in {stop - start}.')
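# Illustrative usage only, pieced together from calls made elsewhere in this project (``Index('tfidf')``,
# ``unigram_query`` returning (internal_doc_id, score) pairs, and the ``int2wid``/``wid2title`` maps); the query
# string and result count below are arbitrary placeholders.
if __name__ == '__main__':
    index = Index(env='tfidf', verbose=True)
    results = index.unigram_query('example question text', 10)
    for doc_iid, score in results:
        print(index.wid2title[index.int2wid[doc_iid]], score)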
def process():
    """Filter the collection of 5 million documents down to at most the top 5000 per question, according to
    bigram/unigram filtering. Processed in parallel."""
    global_start = datetime.now()

    global INDEX
    INDEX = Index(env='tfidf')

    os.makedirs(DIR_NAME)
    (batches, no_batches, no_queries), total_retrieved = retrieve.load_dataset_batches(), 0
    retrieve.create_retrieval_db(DB_NAME)

    helpers.log(f'Retrieving documents. Workers: {os.cpu_count()}')
    start = datetime.now()
    for batch_retrieval in parallel.execute(_process_question_batch, batches):
        total_retrieved += batch_retrieval
    end = datetime.now()
    helpers.log(f'Finished retrieval in {end - start}. Filtered {total_retrieved}/{no_queries}')

    global_end = datetime.now()
    helpers.log(f'Finished process in {global_end - global_start}.')
def _build_candidates(numbered_batch: Tuple[int, List[Dict[str, Any]]]) -> int:
    start = datetime.now()
    (no, batch), db, cursor = numbered_batch, None, None
    processed_count = 0
    skipped_count = 0
    relevant_count = 0
    for split, question in batch:
        if split == 'train':
            no_candidates = ct.TRAIN_NO_CANDIDATES
            candidate_db_path = ct.TRAIN_CANDIDATES_DB
        elif split == 'dev':
            no_candidates = ct.DEV_NO_CANDIDATES
            candidate_db_path = ct.DEV_CANDIDATES_DB
        elif split == 'test':
            no_candidates = ct.TEST_NO_CANDIDATES
            candidate_db_path = ct.TEST_CANDIDATES_DB
        else:
            raise ValueError(f'Unknown set {split}.')

        _id = question['_id']
        _type = question['type']
        _level = question['level']
        _str = question['question']
        relevant_titles = list(map(lambda item: item[0], question['supporting_facts']))

        if QUESTION_COUNTS.get(_id, 0) == no_candidates:
            skipped_count += 1
            continue

        # Store one row per relevant (supporting) document.
        rows: List[List[str]] = []
        relevant_doc_iids = set(INDEX.wid2int[INDEX.title2wid[title]] for title in relevant_titles)
        if split != 'test':
            for (candidate_idx, doc_iid) in enumerate(relevant_doc_iids):
                row: List[str] = [json.dumps(_id), json.dumps(_type), json.dumps(_level)]
                doc_wid, doc_title = _extract_doc_identifiers(row, INDEX, doc_iid)
                doc_text = _extract_text(row, _str, doc_wid)
                doc_tokens, question_tokens = _extract_tokens(row, INDEX, _str, doc_iid)
                tfidf_score = _extract_tfidf_score(row, INDEX, doc_tokens, question_tokens)
                relevance = _extract_relevance(row, doc_iid, relevant_doc_iids)

                rows.append(row)

        # Store rows for irrelevant documents in tf-idf score order until the candidate set length is reached.
        result_idx = 0
        candidate_idx = ct.RELEVANT_DOCUMENTS
        results = INDEX.unigram_query(_str, no_candidates)
        while candidate_idx < no_candidates:
            (doc_iid, tfidf_score) = results[result_idx]
            row: List[str] = [json.dumps(_id), json.dumps(_type), json.dumps(_level)]

            relevance = _extract_relevance(row, doc_iid, relevant_doc_iids, False)
            if relevance == 1:
                relevant_count += 1
                if not SKIP_RELEVANT:
                    result_idx += 1
                    continue

            doc_wid, doc_title = _extract_doc_identifiers(row, INDEX, doc_iid)
            doc_text = _extract_text(row, _str, doc_wid)
            doc_tokens, question_tokens = _extract_tokens(row, INDEX, _str, doc_iid)
            row.append(json.dumps(tfidf_score))
            row.append(json.dumps(relevance))

            rows.append(row)
            candidate_idx += 1
            result_idx += 1

        if db is None:
            db = sqlite3.connect(candidate_db_path)
            cursor = db.cursor()
        cursor.executemany(sql.insert_candidate, rows)
        db.commit()
        processed_count += 1

    if db is not None:
        cursor.close()
        db.close()

    end = datetime.now()
    helpers.log(f'Processed batch {no} in {end - start}. Processed {processed_count}. Skipped {skipped_count}. '
                f'Relevant documents found {relevant_count}.')

    return len(batch)
def run(config: Config) -> None:
    start = datetime.now()
    os.makedirs(ct.L2R_MODEL_DIR.format(config.name), exist_ok=True)

    with open(ct.INT2WID, 'rb') as file:
        global INT2WID
        INT2WID = pickle.load(file)
    with open(ct.WID2TITLE, 'rb') as file:
        global WID2TITLE
        WID2TITLE = pickle.load(file)

    trec_eval_train = ct.L2R_EVAL.format(config.name, 'train')
    trec_eval_agg_train = ct.L2R_EVAL_AGG.format(config.name, 'train')
    trec_eval_dev = ct.L2R_EVAL.format(config.name, 'dev')
    trec_eval_agg_dev = ct.L2R_EVAL_AGG.format(config.name, 'dev')

    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder, scorer).to(device=ct.DEVICE)
    # noinspection PyCallingNonCallable
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)
    helpers.log(f'Loaded maps, model, and optimizer in {datetime.now() - start}.')

    train_loader, dev_loader = _load_datasets()
    best_recall_100 = _load_checkpoint(model, optimizer, config)
    remaining_epochs = config.epochs - model.epochs_trained

    train_stats = _evaluate_epoch(model, ct.TRAIN_TREC_REFERENCE, train_loader, trec_eval_train,
                                  trec_eval_agg_train, False)
    dev_stats = _evaluate_epoch(model, ct.DEV_TREC_REFERENCE, dev_loader, trec_eval_dev, trec_eval_agg_dev, False)
    # Hack to avoid having to adapt all index accesses below after adding the run to the returned stats.
    train_stats = train_stats[1:]
    dev_stats = dev_stats[1:]
    _save_epoch_stats(config.name, model.epochs_trained, -1, train_stats, dev_stats)

    for epoch in range(remaining_epochs):
        is_best = False
        last_epoch = (model.epochs_trained + 1) == config.epochs

        # Train.
        train_loss = _train_epoch(model, optimizer, train_loader, config)
        model.epochs_trained += 1

        # Evaluate only once every 10 epochs for speed.
        if model.epochs_trained % 10 == 0:
            # Evaluate and save statistics.
            train_stats = _evaluate_epoch(model, ct.TRAIN_TREC_REFERENCE, train_loader, trec_eval_train,
                                          trec_eval_agg_train, last_epoch)
            dev_stats = _evaluate_epoch(model, ct.DEV_TREC_REFERENCE, dev_loader, trec_eval_dev,
                                        trec_eval_agg_dev, last_epoch)
            # Hack to avoid having to adapt all index accesses below after adding the run to the returned stats.
            train_run = train_stats[0]
            train_stats = train_stats[1:]
            dev_run = dev_stats[0]
            dev_stats = dev_stats[1:]
            _save_epoch_stats(config.name, model.epochs_trained, train_loss, train_stats, dev_stats)

            # Save the model, marking it as best if the dev recall@100 improved.
            if dev_stats[6] >= best_recall_100:
                best_recall_100 = dev_stats[6]
                is_best = True
            _save_checkpoint(config.name, model, optimizer, best_recall_100, is_best, train_run, dev_run)

    return
def build():
    assert constants.TRAIN_FEATURES_CHUNK > 1
    assert constants.DEV_FEATURES_CHUNK > 1

    global INDEX
    INDEX = Index('tfidf')
    helpers.log('Loaded index.')

    global EXTRACTORS
    EXTRACTORS = []
    if 'entity' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(EntityExtractor(INDEX))
    if 'ibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=False))
    if 'nibm1' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(IBM1FeatureExtractor(normalized=True))
    if 'bigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=False))
    if 'nbigram' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(BigramOverlapFeatureExtractor(normalized=True))
    if 'qword' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(QuestionWordFeatureExtractor())
    if 'doclen' in constants.FEATURE_EXTRACTORS:
        EXTRACTORS.append(DocumentLengthFeatureExtractor())
    helpers.log('Loaded extractors.')

    global COLUMNS
    COLUMNS = copy.copy(constants.FEATURE_BASE_COLUMN_NAMES)
    COLUMNS.extend(feature for extractor in EXTRACTORS for feature in extractor.feature_name)
    COLUMNS.append(constants.FEATURE_TARGET_COLUMN_NAME)
    helpers.log('Loaded column names.')

    os.makedirs(constants.FEATURES_DIR, exist_ok=True)
    iterator: List[Tuple[str, str, int]] = [
        # (constants.TRAIN_CANDIDATES_DB, constants.TRAIN_FEATURES_DB, constants.TRAIN_FEATURES_CHUNK),
        # (constants.DEV_CANDIDATES_DB, constants.DEV_FEATURES_DB, constants.DEV_FEATURES_CHUNK),
        (constants.TEST_CANDIDATES_DB, constants.TEST_FEATURES_DB, constants.TEST_FEATURES_CHUNK),
    ]
    for (candidate_db_path, feature_db_path, chunk) in iterator:
        start_time = datetime.now()
        _set = candidate_db_path.split("/")[-1].split(".")[1]

        # Retry until the table is created; concurrent access may lock the SQLite database.
        done = False
        feature_db = sqlite3.connect(feature_db_path)
        cursor = feature_db.cursor()
        while not done:
            try:
                cursor.execute(sql.create_features_table(COLUMNS))
                feature_db.commit()
                done = True
            except Exception as e:
                helpers.log(e)
        helpers.log(f'Created {_set} features table.')

        # First id to process: the largest id already in the features database (0 if empty).
        (start,) = cursor.execute('SELECT MAX(id) FROM features').fetchone()
        start = start if start is not None else 0
        cursor.close()
        feature_db.close()
        helpers.log(f'Starting feature build at {start}.')

        # Last id to process: the total number of candidate rows.
        candidate_db = sqlite3.connect(candidate_db_path)
        cursor = candidate_db.cursor()
        (stop,) = cursor.execute('SELECT COUNT(*) FROM candidates').fetchone()
        cursor.close()
        candidate_db.close()

        id_range = range(start + 1, stop + 1)
        helpers.log(f'Retrieved {len(id_range)} candidate indices for {_set} set.')

        total_count = 0
        _set_generator = parallel.chunk(chunk, zip([_set] * len(id_range), id_range))
        _batched_set_generator = parallel.chunk(constants.GRAND_CHUNK, _set_generator)
        for grand_batch_idx, _batch_set in _batched_set_generator:
            grand_batch_count = 0
            for batch_count in parallel.execute(_build_candidates, _batch_set, _as='process'):
                grand_batch_count += batch_count
                total_count += batch_count
            helpers.log(f'Processed {_set} batch of features no {grand_batch_idx} with {grand_batch_count} pairs.')
        helpers.log(f'Created {_set} features set with {total_count} pairs in {datetime.now() - start_time}.')
def run_eval(_set: str, config: Config):
    start = datetime.now()
    if _set == 'train':
        feature_db = ct.TRAIN_FEATURES_DB
        ref = ct.TRAIN_TREC_REFERENCE
    elif _set == 'dev':
        feature_db = ct.DEV_FEATURES_DB
        ref = ct.DEV_TREC_REFERENCE
    elif _set == 'test':
        feature_db = ct.TEST_FEATURES_DB
        ref = ct.TEST_TREC_REFERENCE
    else:
        raise ValueError(f'Unknown set {_set}.')

    with open(ct.INT2WID, 'rb') as file:
        global INT2WID
        INT2WID = pickle.load(file)
    with open(ct.WID2TITLE, 'rb') as file:
        global WID2TITLE
        WID2TITLE = pickle.load(file)

    trec_eval = ct.L2R_EVAL.format(config.name, 'test')
    trec_eval_agg = ct.L2R_EVAL_AGG.format(config.name, 'test')

    query_encoder = config.query_encoder(config.embedding_dim)
    document_encoder = config.document_encoder(config.embedding_dim)
    scorer = config.scorer(**config.scorer_kwargs)
    model = config.ranker(query_encoder, document_encoder, scorer).to(device=ct.DEVICE)
    # noinspection PyCallingNonCallable
    optimizer = config.optimizer(model.parameters(), **config.optimizer_kwargs)
    _ = _load_checkpoint(model, optimizer, config)
    helpers.log(f'Loaded maps, model, and optimizer in {datetime.now() - start}.')

    test_data_set = QueryDocumentsDataset(feature_db)
    test_data_loader = data.DataLoader(test_data_set, ct.BATCH_SIZE, False,
                                       num_workers=os.cpu_count(), collate_fn=QueryDocumentsDataset.collate)

    model.eval()
    epoch_run = Run()
    epoch_eval = Evaluator(ref, measures=pytrec_eval.supported_measures)
    final_scores = torch.empty((len(test_data_loader.dataset), 1), dtype=torch.float)
    question_ids = []
    document_ids = []
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(test_data_loader)):
            (questions, documents, features, targets, batch_question_ids, batch_document_ids) = batch
            questions = questions.to(device=ct.DEVICE, non_blocking=True)
            documents = documents.to(device=ct.DEVICE, non_blocking=True)
            features = features.to(device=ct.DEVICE, non_blocking=True)
            batch_size = questions.shape[0]

            scores, encodings = model(questions, documents, features)
            question_ids.extend(batch_question_ids)
            document_ids.extend(batch_document_ids)
            if batch_size == ct.BATCH_SIZE:
                final_scores[idx * ct.BATCH_SIZE:(idx + 1) * ct.BATCH_SIZE] = scores
            else:
                final_scores[idx * ct.BATCH_SIZE:] = scores

    for batch_run in tqdm(parallel.execute(_build_run,
                                           parallel.chunk(10000, zip(question_ids, document_ids,
                                                                     final_scores.numpy())))):
        epoch_run.update_rankings(batch_run)

    trec_eval, trec_eval_agg = epoch_eval.evaluate(epoch_run, trec_eval, trec_eval_agg, False)

    er_10 = 0
    for stats in trec_eval.values():
        er_10 += stats['recall_10'] == 1.0
    er_10 /= len(trec_eval)

    print(f'ndcg@10:\t\t{trec_eval_agg["ndcg_cut_10"]:.4f}')
    print(f'map@10:\t\t{trec_eval_agg["map_cut_10"]:.4f}')
    print(f'er@10:\t\t{er_10:.4f}')
    print(f'recall@10:\t\t{trec_eval_agg["recall_10"]:.4f}')
    print(f'recall@100:\t\t{trec_eval_agg["recall_100"]:.4f}')
    print(f'recall@1000:\t\t{trec_eval_agg["recall_1000"]:.4f}')