def init(top_k, get_text):
    """Worker initializer: build the shared document DB and TF-IDF ranker as
    process-level globals, and record the retrieval parameters.

    Args:
        top_k: number of documents each retrieval call should return.
        get_text: flag/callable controlling whether document text is fetched.
    """
    global PROCESS_DB, PROCESS_RANKER, TOP_K, GET_TEXT
    PROCESS_DB = DocDB()
    # Ensure the DB connection is closed when this worker process exits.
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker()
    TOP_K = top_k
    GET_TEXT = get_text
def init(db_path, full_doc_db_path, ranker_path):
    """Worker initializer: set up tokenizer, document DBs, ranker and the
    title cache as process-level globals, registering exit-time cleanup.

    Args:
        db_path: path for the paragraph-level DocDB.
        full_doc_db_path: path for the full-document DocDB.
        ranker_path: path to the serialized TF-IDF ranker.
    """
    global PROCESS_TOK, PROCESS_DB, PROCESS_RANKER, PROCESS_FULL_DOC_DB, DOC_TITLES
    PROCESS_TOK = CoreNLPTokenizer()
    # Each resource gets a finalizer so worker shutdown releases it cleanly.
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = DocDB(db_path)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_FULL_DOC_DB = DocDB(full_doc_db_path, full_docs=True)
    Finalize(PROCESS_FULL_DOC_DB, PROCESS_FULL_DOC_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker(ranker_path)
    # Cache all document titles once per worker for later lookups.
    DOC_TITLES = PROCESS_FULL_DOC_DB.get_doc_titles()
def main(dataset_path, k_list_to_check, ranker_path=None, normalize_ranker=False,
         num_workers=1, filter=False, tokenizer='corenlp'):
    """Evaluate top-k retrieval coverage of a TF-IDF ranker on a dataset.

    Retrieval is performed once for max(k_list_to_check) and the results are
    re-scored for each requested k; overall and per-category tables are printed.

    Args:
        dataset_path: path to the dataset to load.
        k_list_to_check: iterable of k values to score.
        ranker_path: optional path to a pre-built TF-IDF ranker.
        normalize_ranker: whether to normalize the ranker's vectors.
        num_workers: number of worker processes used for retrieval.
        filter: if True, drop questions containing no '?'.
            (Shadows the builtin; name kept for interface compatibility.)
        tokenizer: tokenizer name passed to the ranker.
    """
    dataset = load_dataset(dataset_path)
    if filter:
        print("Filtering questions without question mark.")
        old_len = len(dataset)
        dataset = [x for x in dataset if '?' in x['question']]
        # Bug fix: report old - new (the original printed a negative count).
        print(f"Removed {old_len - len(dataset)} questions.")
    ranker = TfidfDocRanker(tfidf_path=ranker_path, normalize_vectors=normalize_ranker,
                            tokenizer=tokenizer)
    gold_dict = build_gold_dict(dataset)
    regular_table = prettytable.PrettyTable(
        ['Top K', 'Hits', 'Perfect Questions', 'At Least One'])
    cat_table_dict = {
        cat: prettytable.PrettyTable(
            ['Top K', 'Hits', 'Perfect Questions', 'At Least One'])
        for cat in CATEGORIES
    }
    max_k = max(k_list_to_check)
    print(f"Retrieving top {max_k} ...")
    start = time.time()
    result_dict = get_top_k(dataset, ranker, max_k, num_workers)
    # Bug fix: time.time() differences are seconds, not milliseconds.
    print(f"Done, took {time.time()-start} seconds.")
    for k in k_list_to_check:
        print(f"Calculating scores for top {k}...")
        start = time.time()
        scores, category_scores = top_k_coverage_score(gold_dict, result_dict, k)
        print(f"Done, took {time.time()-start} seconds.")
        regular_table.add_row([
            k, scores['Hits'], scores['Perfect Questions'], scores['At Least One']
        ])
        for cat in cat_table_dict:
            cat_table_dict[cat].add_row([
                k, category_scores[cat]['Hits'],
                category_scores[cat]['Perfect Questions'],
                category_scores[cat]['At Least One']
            ])
    print("Overall Results:")
    print(regular_table)
    for cat, table in cat_table_dict.items():
        print('\n**********************************************\n')
        print(f"Category: {cat} Results:")
        print(table)
def main():
    """Evaluate TF-IDF scoring on full SQuAD.

    Ranks each question's gold paragraph among its distractors and reports
    mean rank and precision@1. With --per-doc, delegates to
    main_for_document instead.
    """
    parser = argparse.ArgumentParser(description='Evaluate tf-idf scoring on full squad.')
    parser.add_argument('--ranker', action='store_true',
                        help='Whether to use bi-gram hashing or not')
    parser.add_argument('--per-doc', action='store_true')
    parser.add_argument('--num-workers', type=int, default=1)
    args = parser.parse_args()
    ranker = None
    if args.ranker:
        print("Loading ranker...")
        ranker = TfidfDocRanker()
    if args.per_doc:
        return main_for_document(ranker, args.num_workers)
    print("Loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    question_preprocessor = SquadTextLengthPreprocessor(600)
    # Preprocess each question exactly once (the original called preprocess
    # twice per item: once in the condition and once for the result).
    preprocessed = (question_preprocessor.preprocess(x) for x in questions)
    questions = [q for q in preprocessed if q is not None]
    if args.num_workers <= 1:
        if args.ranker:
            init()
        gold_ranks = [get_rank_in_distractors(q) for q in tqdm(questions)]
    else:
        # Worker pool; each worker builds its own ranker via init() if needed.
        workers = ProcessPool(
            args.num_workers,
            initializer=init if args.ranker else None,
            initargs=[]
        )
        gold_ranks = []
        # Single progress bar — the original also wrapped the iterator in a
        # second tqdm, producing two nested/duplicate bars.
        with tqdm(total=len(questions)) as pbar:
            for rank in workers.imap_unordered(get_rank_in_distractors, questions):
                gold_ranks.append(rank)
                pbar.update()
        # Release the pool once all results are in.
        workers.close()
        workers.join()
    mean_rank = np.mean(gold_ranks)
    precision_at_1 = Counter(gold_ranks)[1] / len(gold_ranks)
    print(f"Mean Rank: {mean_rank}")
    print(f"Precision @ 1: {precision_at_1}")
def init():
    """Worker initializer: create a process-local TfidfDocRanker global so
    each multiprocessing worker has its own ranker instance."""
    global PROCESS_RANKER
    PROCESS_RANKER = TfidfDocRanker()
from hotpot.tfidf_retriever.tfidf_doc_ranker import TfidfDocRanker

# Root-logger setup: INFO level with a timestamped console handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
parser.add_argument('--normalize', action='store_true')
args = parser.parse_args()

logger.info('Initializing ranker...')
ranker = TfidfDocRanker(tfidf_path=args.model, normalize_vectors=args.normalize)

# ------------------------------------------------------------------------------
# Drop in to interactive
# ------------------------------------------------------------------------------


def process(query, k=1):
    """Retrieve the k closest documents for *query* and print a ranked table
    of document ids and TF-IDF scores."""
    doc_names, doc_scores = ranker.closest_docs(query, k)
    table = prettytable.PrettyTable(['Rank', 'Doc Id', 'Doc Score'])
    for i in range(len(doc_names)):
        table.add_row([i + 1, doc_names[i], '%.5g' % doc_scores[i]])
    print(table)


# NOTE(review): this string literal continues past the visible chunk.
banner = """
def init(top_k):
    """Worker initializer: open the DrQA document DB and TF-IDF ranker as
    process-level globals and remember how many documents to retrieve.

    Args:
        top_k: number of documents each retrieval call should return.
    """
    global PROCESS_DB, PROCESS_RANKER, TOP_K
    PROCESS_DB = OldDocbDB(DRQA_DOC_DB)
    # Close the DB connection when this worker process shuts down.
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker(DRQA_RANKER, strict=False, tokenizer='simple')
    TOP_K = top_k
# Module-level tokenizer shared by the helper functions below; shut down
# cleanly at interpreter exit.
PROCESS_TOK = CoreNLPTokenizer()
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)


def tokenize_words(text):
    """Tokenize *text* with the shared CoreNLP tokenizer and return its words."""
    global PROCESS_TOK
    return PROCESS_TOK.tokenize(text).words()


def tokenize_sentences(sentences):
    """Tokenize each sentence; empty strings map to empty token lists
    (the tokenizer is not called on '')."""
    global PROCESS_TOK
    return [PROCESS_TOK.tokenize(s).words() if s != '' else [] for s in sentences]


print("Loading TF-IDF...")
tfidf_ranker = TfidfDocRanker()
db = DocDB()
loader = ResourceLoader()
# loader = HotpotQuestions().get_resource_loader()
# Merge body-text and title word counts into a single vocabulary.
word_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_word_counts.txt'))
title_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_title_word_counts.txt'))
word_counts.update(title_counts)
voc = set(word_counts.keys())
print("Loading encoder...")
spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2,
                                 max_num_question_words=None, max_num_context_words=None)
# NOTE(review): this constructor call continues past the visible chunk.
encoder = SentenceEncoderIterativeModel(model_dir_path=args.encoder_model, vocabulary=voc,
                                        spec=spec, loader=loader, use_char_inputs=False,
def modified_main(dataset_path, k_list_to_check, ranker_path=None, normalize_ranker=False,
                  num_workers=1, tokenizer='corenlp', docdb_path=None, out=None):
    """Compare second-paragraph retrieval hits for regular vs. modified queries.

    Retrieval is performed once for max(k_list_to_check) with both the regular
    and the modified (ranked-gold) queries, then re-scored per k. Results are
    printed, or written to *out* when given.

    Args:
        dataset_path: path to the dataset to load.
        k_list_to_check: iterable of k values to score.
        ranker_path: optional path to a pre-built TF-IDF ranker.
        normalize_ranker: whether to normalize the ranker's vectors.
        num_workers: number of worker processes used for retrieval.
        tokenizer: tokenizer name passed to the ranker.
        docdb_path: path for the DocDB used to build modified queries.
        out: optional output file path; printed to stdout when None.
    """
    dataset = load_dataset(dataset_path)
    ranker = TfidfDocRanker(tfidf_path=ranker_path, normalize_vectors=normalize_ranker,
                            tokenizer=tokenizer)
    docdb = DocDB(docdb_path)
    print("Building modified queries...")
    ranked_gold_dict = build_ranked_golds(dataset, docdb=docdb, ranker=ranker)
    regular_table = prettytable.PrettyTable([
        'Top K', 'Second Paragraph Hits', 'Second Paragraph Hits Modified Query'
    ])
    cat_table_dict = {
        cat: prettytable.PrettyTable([
            'Top K', 'Second Paragraph Hits', 'Second Paragraph Hits Modified Query'
        ])
        for cat in CATEGORIES
    }
    max_k = max(k_list_to_check)
    print(f"Retrieving top {max_k} ...")
    start = time.time()
    reg_result_dict, ranked_result_dict = get_ranked_top_k(
        dataset, ranked_gold_dict, ranker, max_k, num_workers)
    # Bug fix: time.time() differences are seconds, not milliseconds.
    print(f"Done, took {time.time()-start} seconds.")
    for k in k_list_to_check:
        print(f"Calculating scores for top {k}...")
        start = time.time()
        reg_scores, reg_category_scores = modified_top_k_coverage_score(
            ranked_gold_dict, reg_result_dict, k)
        mod_scores, mod_category_scores = modified_top_k_coverage_score(
            ranked_gold_dict, ranked_result_dict, k)
        print(f"Done, took {time.time()-start} seconds.")
        regular_table.add_row([
            k, reg_scores['Second Paragraph Hits'],
            mod_scores['Second Paragraph Hits']
        ])
        for cat in cat_table_dict:
            cat_table_dict[cat].add_row([
                k, reg_category_scores[cat]['Second Paragraph Hits'],
                mod_category_scores[cat]['Second Paragraph Hits']
            ])
    # Build the report once, then route it to stdout or the output file.
    output_str = 'Overall Results:\n'
    output_str += str(regular_table) + '\n'
    for cat, table in cat_table_dict.items():
        output_str += '\n**********************************************\n'
        output_str += f"Category: {cat} Results:\n"
        output_str += str(table) + '\n'
    if out is None:
        print(output_str)
    else:
        with open(out, 'w') as f:
            f.write(output_str)