def __init__(self, stops, minsize=3):
    """Create the tokenizer and the UMLS handle and reset all index state.

    Args:
        stops: stop list handed to the analyzer as `stoplist`.
        minsize: minimum token size handed to the analyzer (default 3).
    """
    # external helpers: text analyzer and UMLS lookup object
    self.tokenizer = StandardAnalyzer(stoplist=stops, minsize=minsize)
    self.umls = umls.UMLSLookup()
    # no index is attached at construction time
    self.ix = None
    # lookup tables start out empty ...
    self.synsets = {}
    self.token2cuis = {}
    self.term_dict = {}
    # ... except the concept dictionary, which maps "__NULL__" to id 0
    self.concept_dict = {"__NULL__": 0}
def disambiguate_query(self, ix, term_dict, concept_dict, token2cuis, query, table_name):
    """Encode a query as (term, concept) id pairs, resolving polysemous terms.

    A term linked to a single CUI keeps that CUI. For a term linked to several
    CUIs, each candidate is scored by the number of UMLS edges between it and
    the CUIs found in the query (as reported by `compute_num_edges`); the first
    candidate reaching the highest score wins. When no candidate scores above
    zero the first candidate listed in `token2cuis` is kept.

    Args:
        ix: object exposing `doc2idx`, used to map query tokens to ids.
        term_dict: mapping token id -> term id; tokens outside it are skipped.
        concept_dict: mapping CUI -> concept id.
        token2cuis: mapping token id -> list of candidate CUIs.
        query: raw query text.
        table_name: forwarded unchanged to `compute_num_edges`.

    Returns:
        List of `[term_id, concept_id]` pairs, one per in-vocabulary token.
    """
    lookup = umls.UMLSLookup()
    # tokenize the query, then map tokens to ids
    token_ids = ix.doc2idx(self.tokenize_query(query))

    # count, for every CUI, how many query tokens it is attached to
    cui_counts = {}
    for tok in token_ids:
        if tok not in token2cuis or token2cuis[tok] == ["__NULL__"]:
            continue
        for cui in token2cuis[tok]:
            cui_counts[cui] = cui_counts.get(cui, 0) + 1

    encoded = []
    for tok in token_ids:
        if tok not in term_dict:
            # out-of-vocabulary term
            continue
        senses = token2cuis[tok]
        if len(senses) == 1:
            # monosemous term: nothing to disambiguate
            chosen = senses[0]
        else:
            best_edges = 0
            shortlist = []
            for sense in senses:
                if cui_counts[sense] == 1:
                    # the sense belongs to this token only: compare against the other CUIs
                    others = list(set(cui_counts.keys()).difference({sense}))
                else:
                    # the sense is shared with other tokens: keep it among the targets
                    others = list(cui_counts.keys())
                edges = lookup.compute_num_edges(others, sense, table_name)
                if edges > best_edges:
                    # strictly better connected: restart the shortlist from this sense
                    shortlist = [sense]
                    best_edges = edges
                else:
                    shortlist.append(sense)
            # the head of the shortlist is the first best-connected sense
            chosen = shortlist[0]
        encoded.append([term_dict[tok], concept_dict[chosen]])
    return encoded
def get_syns(self, term2cui, term_dict):
    """Collect, for each vocabulary term, its single-word in-vocabulary synonyms.

    Args:
        term2cui: mapping term -> CUI ('__NULL__' when the term has no concept).
        term_dict: vocabulary; only terms and synonyms found in it are kept.

    Returns:
        Dict mapping every term of `term2cui` that is in `term_dict` to a list
        of lower-cased synonyms (possibly empty). The term itself is never
        listed among its own synonyms.
    """
    lookup = umls.UMLSLookup()
    tokenize = SimpleAnalyzer()
    synonyms = {}
    for term, cui in term2cui.items():
        if term not in term_dict:
            continue
        if cui == '__NULL__':
            # no concept, hence no synonyms
            synonyms[term] = []
            continue
        found = set()
        for syn in lookup.lookup_synonyms(cui, preferred=False):
            surface = syn[0]
            # keep only synonyms that the analyzer turns into exactly one token
            if len(list(tokenize(surface))) != 1:
                continue
            lowered = surface.lower()
            if lowered in term_dict and lowered != term:
                found.add(lowered)
        synonyms[term] = list(found)
    return synonyms
def cui2source(self, term2cui, source='MSH'):
    """Blank out every CUI that has no code in the given source lexicon.

    Args:
        term2cui: mapping term -> CUI ('__NULL__' when the term has no concept).
        source: source abbreviation a CUI must have an entry in (default 'MSH').

    Returns:
        Dict with the same terms as `term2cui`; a term keeps its CUI when
        `lookup_code` reports at least one code from `source`, otherwise it is
        mapped to '__NULL__'.
    """
    lookup = umls.UMLSLookup()
    filtered = {}
    for term, cui in tqdm(term2cui.items()):
        if cui == '__NULL__':
            # nothing to look up for null concepts
            filtered[term] = '__NULL__'
            continue
        # codes of this CUI that come from the requested source
        matches = [
            code
            for code, src, _ in lookup.lookup_code(cui=cui, preferred=False)
            if src == source
        ]
        filtered[term] = cui if matches else '__NULL__'
    return filtered
def main(_):
    """Train the joint CBOW + RCM model and evaluate it for retrieval after each epoch.

    Steps: build (or load) the distributional data, build (or load) the
    (term, cui) pairs, derive synonym pairs from UMLS, train the model, and
    after every epoch store a checkpoint, run semantic search over the
    queries and evaluate the resulting ranking.

    Args:
        _: unused positional argument (ignored).

    Returns:
        False when the query or qrels folder is missing, otherwise None.
    """
    # NOTE(review): '__file__' is a quoted literal, so the path is resolved
    # against the current working directory and this chdir does not move to the
    # script's folder. Left unchanged because switching to the real __file__
    # would change which 'corpus/' tree is used - confirm the intent.
    os.chdir(os.path.dirname(os.path.realpath('__file__')))
    # load options
    opts = Options()
    # set folders
    corpus_root = 'corpus/' + opts.corpus_name
    corpus_folder = corpus_root + '/' + opts.corpus_name
    index_folder = corpus_root + '/index'
    model_folder = corpus_root + '/models/' + opts.model_name
    data_folder = corpus_root + '/data'
    query_folder = corpus_root + '/queries'
    qrels_folder = corpus_root + '/qrels'
    rankings_folder = corpus_root + '/rankings/' + opts.model_name
    # create output folders
    for folder in (data_folder, index_folder, rankings_folder, model_folder):
        if not os.path.exists(folder):
            os.makedirs(folder)
    # queries and qrels are inputs: they cannot be created here
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False
    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()
    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + opts.query_fname)

    # ---- PRE PROCESSING ----
    # pre process distributional data
    if not os.path.exists(data_folder + '/words.json'):
        # compute required data
        words = tf_utils.process_corpus(corpus_folder, data_folder)
        # build dataset to train CBOW + RMC model
        data, cfs, word_dict, reverse_word_dict = tf_utils.build_dataset(
            words, opts.min_cut_freq, data_folder)
        del words  # free memory from unnecessary data
        # the statistics are reported from `cfs`: the previously referenced
        # name `count` was never defined and raised NameError on a fresh run
        print('Most common words (+ UNK)', cfs[:10])
        print('Total number of words (+ UNK) within {}: {}'.format(
            opts.corpus_name, len(data)))
        print('Number of unique words (+ UNK) for {}: {}'.format(
            opts.corpus_name, len(cfs)))
    else:
        # load required data (docs.json and idfs.json are loaded once, further
        # below, for both branches)
        print('load processed data required to train CBOW + RMC model')
        with open(data_folder + '/data.json', 'r') as df:
            data = json.load(df)
        with open(data_folder + '/cfs.json', 'r') as cff:
            cfs = json.load(cff)
        with open(data_folder + '/word_dict.json', 'r') as wdf:
            word_dict = json.load(wdf)
        # compute reverse word dictionary
        reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # pre process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=opts.threshold,
                                         stypes_fname=opts.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)

    # get synonyms for each word within vocabulary given semantic lexicon
    print(
        'get synonyms for each word within vocabulary given semantic lexicon')
    syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
    # flatten the synsets into an array of (word, synonym) pairs
    syns = [
        list(itertools.product([word], synset))
        for word, synset in syns.items()
    ]
    syns = [pairs for pairs in syns if pairs]
    syns = np.array([pair for pairs in syns for pair in pairs])
    print('Total number of synonymy relations within {}: {}'.format(
        opts.corpus_name, syns.shape[0]))

    # load required data to perform retrieval
    print('load required data to perform retrieval')
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    # get docs and docnos from corpus
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # ---- NETWORK TRAINING ----
    with tf.Graph().as_default(), tf.Session() as sess:
        # set graph-level random seed
        tf.set_random_seed(opts.seed)
        # start data index
        tf_globals.initialize()
        # setup the model
        model = JointRCM(len(word_dict), syns, opts)
        # create model saving operation - keeps as many saved models as number of epochs
        saver = tf.train.Saver(max_to_keep=opts.epochs)
        # initialize the variables using global_variables_initializer()
        sess.run(tf.global_variables_initializer())

        print('start training')
        num_batches = len(data) // opts.batch_size
        print('number of batches per epoch: {}'.format(num_batches))
        best_score_per_epoch = []
        for epoch in range(opts.epochs):
            # train CBOW
            print('training epoch {}'.format(epoch + 1))
            for i in tqdm(range(num_batches)):
                batch_inputs, batch_labels = tf_utils.generate_batch(
                    data, opts.batch_size, opts.context_window)
                feed_dict = {
                    model.inputs: batch_inputs,
                    model.labels: batch_labels
                }
                # run cbow train_op
                sess.run(model.cbow_train_op, feed_dict=feed_dict)
                # interleave one RCM step every `minimize_rcm_every` batches
                if (i + 1) % opts.minimize_rcm_every == 0:
                    sess.run(model.rcm_train_op)
            # store trained CBOW
            print('storing model at epoch {}'.format(epoch + 1))
            model_checkpoint_path = os.path.join(
                os.getcwd(), model_folder,
                opts.model_name + str(epoch + 1) + '.ckpt')
            save_path = saver.save(sess, model_checkpoint_path)
            print("model saved in file: {}".format(save_path))

            # ---- DOCUMENT RETRIEVAL ----
            # get embs after training epoch
            word_embs = sess.run(model.word_embs)
            # evaluate CBOW for IR tasks
            print('evaluating at epoch {}'.format(epoch + 1))
            # compute doc embeddings and return list of filtered doc ids
            doc_embs, filt_ids = tf_utils.compute_doc_embs(
                docs, word_dict, word_embs, idfs)
            # set query embs and ids
            q_embs = []
            q_ids = []
            # loop over queries and generate rankings
            for qid, qtext in q.items():
                # prepare queries for semantic matching
                q_proj = tf_utils.prepare_query(qtext[opts.field], word_dict,
                                                word_embs)
                if q_proj is None:
                    print('query {} does not contain known terms'.format(qid))
                else:
                    q_embs.append(q_proj)
                    q_ids.append(qid)
            q_embs = np.array(q_embs)
            # perform search and evaluate model effectiveness
            run_name = opts.model_name + '_' + str(epoch + 1)
            tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                     rankings_folder, run_name, filt_ids)
            scores = tf_utils.evaluate(
                ['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'],
                rankings_folder, run_name, qrels_folder, opts.qrels_fname)
            best_score_per_epoch.append(scores[opts.ref_measure])
        # epochs are reported 1-based
        print('best model (in terms of {}) found at epoch: {}'.format(
            opts.ref_measure,
            np.argsort(best_score_per_epoch)[-1] + 1))
def main():
    """Retrofit word vectors (optionally) and use them for retrieval or re-ranking.

    Loads the processed corpus data and a trained semantic model, optionally
    retrofits the word embeddings with UMLS synonyms, computes document
    embeddings and then either
      * ranks the whole collection (when FLAGS.reranking is false), or
      * re-ranks a baseline run, combining scores with a fixed gamma
        (FLAGS.fixed_gamma) or with a weight selected through k-fold cross
        validation over the queries.

    Returns:
        False when the query or qrels folder is missing, otherwise None.
    """
    # NOTE(review): '__file__' is a quoted literal, so the path is resolved
    # against the current working directory rather than the script location.
    # Left unchanged - confirm the intent before switching to the real __file__.
    os.chdir(os.path.dirname(os.path.realpath('__file__')))
    # set folders
    corpus_root = 'corpus/' + FLAGS.corpus_name
    data_folder = corpus_root + '/data'
    query_folder = corpus_root + '/queries'
    qrels_folder = corpus_root + '/qrels'
    rankings_folder = corpus_root + '/rankings/' + FLAGS.model_name
    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False
    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)
    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print(
        'load processed data required to retrofit word vectors and perform retrieval tasks'
    )
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)

    # ---- SEMANTIC PROCESSING ----
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))

    # ---- RETROFITTING ----
    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        # collection frequencies are only passed on when synonym weighting is on
        cfs = dict(cfs) if FLAGS.syn_weights else None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(
            FLAGS.iterations))
        word_embs = retrofit(word_embs,
                             syns,
                             reverse_word_dict,
                             FLAGS.iterations,
                             alpha=1.0,
                             beta=FLAGS.beta,
                             cfs=cfs)

    # compute doc embeddings (needed by both retrieval and re-ranking)
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)

    if not FLAGS.reranking:
        # ---- RETRIEVAL ----
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict,
                                            word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)
        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        tf_utils.evaluate(['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'],
                          rankings_folder, FLAGS.model_name, qrels_folder,
                          FLAGS.qrels_fname)
        return None

    # ---- RE-RANKING ----
    print('perform re-ranking over top 1000 documents from a baseline run')
    # parse and store qrels
    with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt', 'r') as qrelf:
        qrels = pytrec_eval.parse_qrel(qrelf)
    # initialize evaluator over qrels
    evaluator = pytrec_eval.RelevanceEvaluator(qrels,
                                               {'P'})  # evaluate on Precision
    # parse input run
    print('parse input run')
    with open(FLAGS.run_path, 'r') as runf:
        run = pytrec_eval.parse_run(runf)
    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
    # get query ids and shuffle them (seeded above)
    qids = list(q.keys())
    np.random.shuffle(qids)
    normalizer = SCORE_NORMALIZERS[FLAGS.normalizer]

    if FLAGS.fixed_gamma:
        # perform re-ranking based on a fixed value of gamma
        print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        run_name = FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma)
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(run_name)
        # combine rankings using fixed gamma
        comb_run = tf_utils.compute_combined_run(run, FLAGS.qfield, q, docnos,
                                                 doc_embs, word_dict,
                                                 word_embs, normalizer,
                                                 FLAGS.fixed_gamma)
        # store ranking in combined run
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid,
                             [(score, docno)
                              for docno, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' + run_name +
                             '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder, run_name,
                          qrels_folder, FLAGS.qrels_fname)
        return None

    # learn optimal weight to combine runs
    print("learn optimal weight to combine runs with sweep: {}".format(
        FLAGS.sweep))
    # (averaged score, weight) for every fold
    scores_and_weights = []
    # initialize kfold with FLAGS.num_folds
    kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
    for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
        print('fold n. {}'.format(fold))
        # restrict queries to train and test positions of this fold
        qtrain = {qids[pos]: q[qids[pos]] for pos in train_qids}
        qtest = {qids[pos]: q[qids[pos]] for pos in test_qids}
        # obtain best combination on training queries
        train_score, best_train_weight = max(
            tf_utils.perform_reranking(run, FLAGS.qfield, qtrain, docnos,
                                       doc_embs, word_dict, word_embs,
                                       FLAGS.sweep, normalizer,
                                       FLAGS.ref_measure, evaluator))
        print('fold %d: best_train_weight=%.2f, %s =%.4f' %
              (fold, best_train_weight, FLAGS.ref_measure, train_score))
        # compute combined run with best combination on test queries
        test_crun = tf_utils.compute_combined_run(run, FLAGS.qfield, qtest,
                                                  docnos, doc_embs, word_dict,
                                                  word_embs, normalizer,
                                                  best_train_weight)
        # evaluate test run
        test_res = evaluator.evaluate(test_crun)
        # compute aggregated measure score for test queries
        test_score = pytrec_eval.compute_aggregated_measure(
            FLAGS.ref_measure,
            [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
        # store averaged scores w/ best weights
        scores_and_weights.append(
            (np.mean([train_score, test_score]), best_train_weight))

    # get (best) weight that produces the highest averaged score
    _, best_weight = max(scores_and_weights)
    print('found best weight=%.2f' % (best_weight))
    # The run is labelled with the weight that was actually selected above.
    # The label used to be built from FLAGS.best_weight, which is not the
    # value passed to compute_combined_run.
    run_name = FLAGS.model_name + '_best_weight_' + str(best_weight)
    # initialize combined (output) run
    crun = trec_utils.OnlineTRECRun(run_name)
    # compute combined run based on the selected weight
    comb_run = tf_utils.compute_combined_run(run, FLAGS.qfield, q, docnos,
                                             doc_embs, word_dict, word_embs,
                                             normalizer, best_weight)
    # store ranking in crun
    for qid, doc_ids_and_scores in comb_run.items():
        crun.add_ranking(qid,
                         [(score, doc_id)
                          for doc_id, score in doc_ids_and_scores.items()])
    # close and store run
    crun.close_and_write(out_path=rankings_folder + '/' + run_name + '.txt',
                         overwrite=True)
    print('combined run stored in {}'.format(rankings_folder))
    # evaluate combined run
    print(
        'evaluate run combined w/ {}-fold cross validation and best weight={}'
        .format(FLAGS.num_folds, best_weight))
    tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder, run_name,
                      qrels_folder, FLAGS.qrels_fname)
    return None
def main():
    """Expand queries with three models and evaluate each expansion.

    For every query the top documents of a BoW run are fetched; for each of
    them the top words/concepts are taken from the text doc2vec model, the
    concept doc2vec model and the retrofitted document vectors. Scores of
    repeated candidates are summed (combsum) and the best candidates are
    appended to the query text. The three expanded query sets are then
    searched lexically and evaluated.

    Returns:
        False when the query or qrels folder is missing, otherwise None.
    """
    # NOTE(review): '__file__' is a quoted literal, so the path is resolved
    # against the current working directory rather than the script location.
    # Left unchanged - confirm the intent before switching to the real __file__.
    os.chdir(os.path.dirname(os.path.realpath('__file__')))
    # load options
    opts = Options()
    # set folders
    query_folder = 'corpus/' + opts.corpus_name + '/queries'
    qrels_folder = 'corpus/' + opts.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + opts.corpus_name + '/rankings/' + opts.model_name
    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False
    # load utils functions - set random seed
    utils = Utils(opts.seed)
    # load UMLS lookup functions
    umls_lookup = umls.UMLSLookup()
    # load queries
    print('load {} queries'.format(opts.corpus_name))
    queries = utils.read_queries(query_folder + '/' + opts.qfname)
    # load BoW run
    bow_model = read_ranking(opts.bow_model_path)
    # load models
    print('load models')
    txt_d2v_model = gensim.models.Doc2Vec.load(opts.txt_d2v_model_path)
    concept_d2v_model = gensim.models.Doc2Vec.load(opts.concept_d2v_model_path)
    # NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files
    retro_model = np.load(opts.retro_model_path, allow_pickle=True).item()

    # ---- QUERY EXPANSION ----
    n_top_docs = opts.num_top_docs
    n_top_words_doc = opts.num_top_words_per_doc
    n_top_words_query = opts.num_top_words
    # beta < 0.5 means the retrofitted vectors are read in the concept space
    prioritize_concepts = opts.beta < 0.5
    queries_t = {}
    queries_c = {}
    queries_r = {}
    print('perform query expansion for each model')
    for qid, qtext in tqdm(queries.items()):
        # get the top documents of the BoW run for this query
        print('get top {} docs for query {}'.format(n_top_docs, qid))
        query_top_docs = get_query_top_docs(bow_model, qid, n_top_docs)
        # candidate pools: term/concept -> summed score over the top documents
        pool_t = {}
        pool_c = {}
        pool_r = {}
        for top_doc_id in query_top_docs:
            top_t = top_words_of_doc(txt_d2v_model, top_doc_id,
                                     n_top_words_doc)
            # Reset per document: without this a document missing from a model
            # raised NameError (first document) or silently re-used the
            # candidates of the previous document.
            top_c = []
            top_r = []
            if top_doc_id in concept_d2v_model.docvecs:
                top_c = top_words_of_doc(concept_d2v_model, top_doc_id,
                                         n_top_words_doc)
            if top_doc_id in retro_model:
                top_retro_doc = retro_model[top_doc_id]
                if prioritize_concepts:
                    top_r = top_words_of_vector(concept_d2v_model,
                                                top_retro_doc,
                                                n_top_words_doc)
                else:
                    top_r = top_words_of_vector(txt_d2v_model, top_retro_doc,
                                                n_top_words_doc)
            for ranked, pool in ((top_t, pool_t), (top_c, pool_c),
                                 (top_r, pool_r)):
                # a model contributes only when it returned a full candidate list
                if len(ranked) != n_top_words_doc:
                    continue
                for entry in ranked:
                    term, score = entry[0], entry[1]
                    # combsum
                    if term in pool:
                        pool[term] += score
                    else:
                        pool[term] = score

        # sort ascending by score and keep the tail, i.e. the best candidates
        by_score = operator.itemgetter(1)
        top_term_t = sorted(pool_t.items(), key=by_score)[-n_top_words_query:]
        top_term_c = sorted(pool_c.items(), key=by_score)[-n_top_words_query:]
        top_term_r = sorted(pool_r.items(), key=by_score)[-n_top_words_query:]

        original_query = qtext[opts.qfield]
        words_t = [term for term, _ in top_term_t]
        words_c = _msh_terms_for_cuis(umls_lookup, top_term_c)
        if prioritize_concepts:
            # candidates are CUIs: translate them to terms first
            words_r = _msh_terms_for_cuis(umls_lookup, top_term_r)
        else:
            words_r = [term for term, _ in top_term_r]

        queries_t[qid] = {opts.qfield: ' '.join([original_query] + words_t)}
        queries_c[qid] = {opts.qfield: ' '.join([original_query] + words_c)}
        queries_r[qid] = {opts.qfield: ' '.join([original_query] + words_r)}

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    # set Index instance
    ix = Index()
    for label, expanded, suffix in (
            ('text-based doc2vec', queries_t, '_txt_d2v'),
            ('concept-based doc2vec', queries_c, '_concept_d2v'),
            ('retrofitted doc2vec', queries_r, '_retro_d2v')):
        print('search and evaluate {} query expansion'.format(label))
        # perform lexical search over given query field w/ chosen model
        ix.lexical_search(expanded, opts.qfield, rankings_folder,
                          opts.model_name + suffix)
        # evaluate performed search
        utils.evaluate(['recall.20', 'P_20', 'map'], rankings_folder,
                       opts.model_name + suffix, qrels_folder,
                       opts.qrels_fname)
    return None


def _msh_terms_for_cuis(umls_lookup, scored_cuis):
    """Map (cui, score) pairs to the first preferred MSH term of each CUI."""
    terms = []
    for cui, _ in scored_cuis:
        cui = cui.upper()
        term_variants = [
            term_and_source for term_and_source in
            umls_lookup.lookup_synonyms(cui=cui, preferred=True)
            if term_and_source[1] == 'MSH'
        ]
        if not term_variants:
            # no preferred MSH term: skip the concept instead of failing with IndexError
            print('no preferred MSH term found for {} - skipped'.format(cui))
            continue
        terms.append(term_variants[0][0])  # preferred term
    return terms