def run_ap_baseline(query_pickle, data_dir):
    ranker = KLRanker(data_dir)
    query = load_from_pickle(query_pickle)

    _start_time = time.time()

    aps = []
    for query_idx in tqdm(query):
        # Prefer the precomputed language model; otherwise derive one by
        # normalizing the raw word counts.
        if 'languagemodel' in query[query_idx]:
            query_lm = query[query_idx]['languagemodel']
        else:
            query_lm = normalize(query[query_idx]['wordcount'], inplace=False)

        # Re-key the answer set from numeric indices to document names.
        answer_set = query[query_idx]['answer']
        name_answer_set = {}
        for key, val in answer_set.items():
            docname = 'T' + str(key).zfill(4)
            name_answer_set[docname] = val

        ret = ranker.rank(query_lm, negquery=None)

        # Average precision for this query; averaged into MAP below.
        ap = average_precision(ret, name_answer_set)
        aps.append(ap)

    _end_time = time.time()

    print("Mean Average Precision: {}".format(sum(aps) / len(aps)))
    print("Time taken: {} seconds".format(_end_time - _start_time))
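# For reference, a minimal sketch of the average_precision helper used above.
# Assumptions, not confirmed by this repo: `ret` is a list of (docname, score)
# pairs already sorted by descending score, and `answer_set` maps the names of
# relevant documents to a positive relevance value.
def _average_precision_sketch(ret, answer_set):
    hits = 0
    precision_sum = 0.0
    for rank, (docname, _score) in enumerate(ret, start=1):
        if docname in answer_set:
            hits += 1
            precision_sum += hits / rank  # precision at each relevant hit
    # AP = mean of precision values at the ranks of the relevant documents.
    return precision_sum / len(answer_set) if answer_set else 0.0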
def test_klranker():
    cwd = os.path.dirname(__file__)
    test_collection_dir = os.path.join(cwd, 'PTV.test')
    test_query_pickle = os.path.join(test_collection_dir, 'PTV.test.query.pickle')
    test_scores_pickle = os.path.join(test_collection_dir, 'PTV.test.scores.pickle')

    test_query = load_from_pickle(test_query_pickle)
    test_scores = load_from_pickle(test_scores_pickle)

    ranker = KLRanker(test_collection_dir)
    for query_idx, test_ret in zip(test_query, test_scores):
        query_lm = normalize(test_query[query_idx]['wordcount'], inplace=False)
        ret = ranker.rank(query_lm)
        assert ret == test_ret
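# A minimal sketch of the normalize helper assumed by the call sites above:
# it turns a word-count dictionary into a unigram language model whose
# probabilities sum to 1. The inplace flag and exact signature are inferred
# from usage, not confirmed against the repo's implementation.
def _normalize_sketch(wordcount, inplace=False):
    total = float(sum(wordcount.values()))
    target = wordcount if inplace else dict(wordcount)
    for word_idx in target:
        target[word_idx] = target[word_idx] / total
    return target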
def test_pickle():
    obj = {1: 2, 3: 4}
    test_pickle_file = 'test.pickle'
    utils.save_to_pickle(test_pickle_file, obj)
    obj2 = utils.load_from_pickle(test_pickle_file)
    assert obj == obj2
    os.remove(test_pickle_file)
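# The pickle helpers exercised by test_pickle are presumably thin wrappers
# around the standard library. A minimal sketch consistent with the
# (filename, obj) argument order used above:
import pickle


def _save_to_pickle_sketch(filename, obj):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)


def _load_from_pickle_sketch(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)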
def run_reformat(data_dir, out_dir, lex_file, query_pickle):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    data_name = os.path.basename(data_dir)

    print("Reading lex...")
    encoded_lex_file = os.path.join(data_dir, data_name + '.lex')
    encoded_lex_dict = reader.readLex(encoded_lex_file)

    print("Reading background...")
    background_file = os.path.join(data_dir, data_name + '.background')
    background = reader.readBackground(background_file, encoded_lex_dict)

    print("Reading doclengs...")
    docleng_file = os.path.join(data_dir, data_name + '.doclength')
    doclengs = reader.readDocLength(docleng_file)

    # Change keys from numeric indices to document names.
    print("Changing docleng keys to document names...")
    namekey_doclengs = {}
    for doc_idx, length in doclengs.items():
        docname = 'T' + str(doc_idx).zfill(4)
        namekey_doclengs[docname] = length

    print("Reading inverted index...")
    index_file = os.path.join(data_dir, data_name + '.index')
    inverted_index = reader.readInvIndex(index_file)

    print("Converting inverted index docnames...")
    named_inverted_index = {}
    for wordID, docs_prob in tqdm(inverted_index.items()):
        named_docs_prob = {}
        for docID, prob in docs_prob.items():
            docname = 'T' + str(docID).zfill(4)
            named_docs_prob[docname] = prob
        named_inverted_index[wordID] = named_docs_prob

    print("Reading document models...")
    documents_lm = {}
    docmodel_dir = os.path.join(data_dir, 'docmodel', '*')
    for docpath in tqdm(glob(docmodel_dir)):
        docname = os.path.basename(docpath)
        documents_lm[docname] = reader.readDocModel(docpath)

    print("Converting document language models to word counts with document lengths...")
    documents_wc = {}
    for docname, lm in tqdm(documents_lm.items()):
        wc = {}
        length = namekey_doclengs[docname]
        for word_idx, word_prob in lm.items():
            wc[word_idx] = round(word_prob * length)
        documents_wc[docname] = {'wordcount': wc}

    print("Saving lex to pickle...")
    lex_dict = reader.readLex(lex_file)
    lex_pickle = os.path.join(out_dir, 'lex.pickle')
    utils.save_to_pickle(lex_pickle, lex_dict)

    print("Saving documents to pickle...")
    document_pickle = os.path.join(out_dir, 'document.pickle')
    utils.save_to_pickle(document_pickle, documents_wc)

    print("Saving indices to pickle...")
    indices = {
        'background': background,
        'doclengs': namekey_doclengs,
        'inverted_index': named_inverted_index,
    }
    index_pickle = os.path.join(out_dir, 'indices.pickle')
    utils.save_to_pickle(index_pickle, indices)

    print("Saving query to pickle...")
    old_query_pickle = os.path.join(data_dir, 'query.pickle')
    old_query = utils.load_from_pickle(old_query_pickle)
    query = {}
    for query_lm, ans_dict, q_idx in old_query:
        query_wc = {}
        # Each query term occurs exactly once, so the query length equals the
        # number of distinct terms and every recovered count must be 1.
        length = len(query_lm.keys())
        for word_idx, word_prob in query_lm.items():
            count = round(word_prob * length)
            assert count == 1, query_lm
            query_wc[word_idx] = count
        query[q_idx] = {
            'answer': ans_dict,
            'wordcount': query_wc,
            'languagemodel': query_lm,
        }
    utils.save_to_pickle(query_pickle, query)
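# Worked example of the conversions run_reformat performs. The docname
# convention and the round() recovery of counts follow directly from the code
# above; the concrete numbers are illustrative only.
#
#   doc index 7 -> 'T' + str(7).zfill(4) == 'T0007'
#
#   lm = {12: 0.5, 34: 0.25, 56: 0.25}, doclength = 4
#   wordcount -> {12: round(0.5 * 4),
#                 34: round(0.25 * 4),
#                 56: round(0.25 * 4)} == {12: 2, 34: 1, 56: 1}
#
# For queries the same recovery is exact because every term occurs once,
# which is what the `assert count == 1` above enforces.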
    return query


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-l', "--lex_pickle",
        type=str,
        default='./iscr/searchengine/data/PTV_onebest_fromMATBN_charSeg/lex.pickle')
    parser.add_argument('-q', '--query_file',
                        type=str,
                        default='./data/query/PTV.utf8.jieba.query')
    parser.add_argument('-a', '--answer_file',
                        type=str,
                        default='./data/query/PTV.ans')
    parser.add_argument('-o', '--out_pickle',
                        type=str,
                        default='./queries/dnn.query.pickle')
    args = parser.parse_args()

    lex_dict = load_from_pickle(args.lex_pickle)
    query_answer = build_query_answer(lex_dict, args.query_file,
                                      args.answer_file, args.out_pickle)
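# Example invocation overriding the defaults above. The script name is
# hypothetical; the paths mirror the argparse defaults in this file.
#
#   python build_query.py \
#       --lex_pickle ./iscr/searchengine/data/PTV_onebest_fromMATBN_charSeg/lex.pickle \
#       --query_file ./data/query/PTV.utf8.jieba.query \
#       --answer_file ./data/query/PTV.ans \
#       --out_pickle ./queries/dnn.query.pickle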