def do_cv(unknown_docs_filename='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_alldocs.txt',
          metric='P@10',
          program='RankLib',
          indriscore=False,
          otherscore=False,
          ranker='ListNet',
          rparams=None,
          kscorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_basescores_large_gfix_run.txt',
          scorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_ob-topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_large_gfix_run.txt',
          fixparts=True,
          normscores=False,
          intval=True,
          unknownscoresfilename=None,
          trainallparam=True,
          testallparam=False,
          featurefile=None):
    """Train and cross-validate a LeToR model and save a re-ranked run file.

    The function optionally regenerates the "all topics" feature files, reads
    the fold assignment from ``cv_sets.txt`` inside ``cv_dir`` (or generates
    and writes it), and then, for folds 1 to 9, filters the feature files by
    topic, trains a model on the other folds and scores the held-out fold.
    The collected scores are written with ``l2r.save_reranked``.

    NOTE(review): the Indri score files, the per-fold feature files, the
    ``train_all``/``test_all`` inputs of the fold loop and the output run file
    all use hard-coded absolute paths. The feature-file paths computed in the
    regeneration steps are not the ones read by the fold loop.

    :param unknown_docs_filename: Its last 11 characters are stripped to form
        the base name used in the test feature and run file names.
    :param metric: Metric handed to ``l2r.train_model`` and ``l2r.predict``.
    :param program: 'RankLib' or 'Quickrank'; selects how score files are
        read back. Any other value collects no scores.
    :param indriscore: If truthy, load Indri scores (from fixed paths) as a
        feature and add '_is' to the parameter string.
    :param otherscore: If truthy, load tf-idf and BM25 scores as features and
        add '_os' to the parameter string.
    :param ranker: Key into ``l2r.rankers``.
    :param rparams: Passed as ``params`` to training and prediction.
    :param kscorefile: Score file for the known (training) documents; only
        loaded when ``scorefile`` is truthy.
    :param scorefile: Score file for the unknown (test) documents. When
        truthy, '_sf' is added to the parameter string.
    :param fixparts: If truthy and the CV-set file exists, reuse it; otherwise
        new sets are generated with ``gen_cv_sets`` and written to that file.
    :param normscores: Passed to ``load_indriscores`` for ``kscorefile`` and
        ``scorefile``; adds '_ns' to the parameter string when truthy.
    :param intval: Passed as ``validation`` to ``l2r.train_model``; adds
        '_nov' to the parameter string when falsy.
    :param unknownscoresfilename: Not used by this function.
    :param trainallparam: When equal to ``True``, regenerate the training
        features from 'crossvalidationtopics.xml'.
    :param testallparam: When equal to ``True``, regenerate the test features
        from 'topic6.xml' and 'testdatadocs.txt'.
    :param featurefile: Passed to ``l2r.train_model`` and ``l2r.predict``.
    :return: Path of the run file that was written.
    """
    unknown_base = unknown_docs_filename[:-11]

    # The parameter string encodes the active options in generated file names.
    option_tags = (
        (indriscore, '_is'),
        (otherscore, '_os'),
        (normscores, '_ns'),
        (scorefile, '_sf'),
        (not intval, '_nov'),
    )
    parastr = 'n' + ''.join(tag for enabled, tag in option_tags if enabled)

    basescores = unknownscores = None
    if indriscore:
        basescores = load_indriscores('/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_traincrossvalfinal9basescores.txt')
        unknownscores = load_indriscores('/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_testcrossvalfinal9basescores.txt')

    basetfidfscores = basebm25scores = None
    unknowntfidfscores = unknownbm25scores = None
    if otherscore:
        load_other = parse_results_for_top_N.load_indri_tfidf_scores
        basetfidfscores = load_other(res_filename='qrel_tfidfbase_run.txt')
        basebm25scores = load_other(res_filename='qrel_bm25base_run.txt')
        unknowntfidfscores = load_other(res_filename='/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_testcrossvalfinal9_tfidf_run.txt')
        unknownbm25scores = load_other(res_filename='/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_testcrossvalfinal9_bm25_run.txt')

    kprecscores = precscores = None
    if scorefile:
        kprecscores = load_indriscores(kscorefile, normscores)
        precscores = load_indriscores(scorefile, normscores)

    if trainallparam == True:
        train_topics = l2r.load_topics(topicfile='crossvalidationtopics.xml')
        regenerated_train = cv_dir + os.sep + features_template.format(parastr, 'all')
        known_docs = l2r.load_docs()
        l2r.save_all_features(train_topics, known_docs, regenerated_train,
                              known=True, metadocs=None, scores=basescores,
                              tfidfscores=basetfidfscores,
                              bm25scores=basebm25scores,
                              precscores=kprecscores)

    if testallparam == True:
        test_topics = l2r.load_topics(topicfile='topic6.xml')
        regenerated_test = cv_dir + os.sep + unknown_template.format(unknown_base, parastr, 'all')
        unknown_docs = l2r.load_docs('testdatadocs.txt')
        l2r.save_all_features(test_topics, unknown_docs, regenerated_test,
                              known=False, metadocs=None, scores=unknownscores,
                              tfidfscores=unknowntfidfscores,
                              bm25scores=unknownbm25scores,
                              precscores=precscores)

    # Fold assignment: one integer per line, in the same order as all_qnos.
    cv_file = cv_dir + os.sep + 'cv_sets.txt'
    if fixparts and os.path.exists(cv_file):
        with open(cv_file, 'r') as cvsetfile:
            cv_sets = [int(row.strip()) for row in cvsetfile]
    else:
        cv_sets = gen_cv_sets()
        with open(cv_file, 'w') as cvsetfile:
            for assigned_fold in cv_sets:
                cvsetfile.write('{}\n'.format(assigned_fold))

    # Topic numbers used for cross validation.
    all_qnos = [1, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
                21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 35, 36, 37, 38,
                39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
                54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69,
                70, 71, 72, 73, 74, 75, 76, 77, 79, 80]
    qscores = {}
    pmids = {}

    # NOTE(review): fixed inputs for the fold loop, independent of the
    # feature files regenerated in the steps before it.
    train_all = '/Users/lowellmilliken/Documents/precision_medicine_contd/lmillik-artpm-c576ced69e03/cv_files_featuresmodifiedfinal5/s_n_is_os_known_features_all'
    test_all = '/Users/lowellmilliken/Documents/precision_medicine_contd/lmillik-artpm-c576ced69e03/cv_files_featuresmodifiedfinal5/s_modifiedfeature_n_os_unknown_features_all_1500docs'
    project_root = '/Users/lowellmilliken/Documents/precision_medicine_contd/lmillik-artpm-c576ced69e03/'

    for fold in range(1, 10):
        model_file = model_name.format(parastr, ranker, fold)
        train_filename = project_root + cv_dir + os.sep + features_template.format(parastr, fold)
        test_filename = project_root + cv_dir + os.sep + unknown_template.format(unknown_base, parastr, fold)

        # Position k in cv_sets holds the fold of topic all_qnos[k].
        training_set = [str(all_qnos[pos])
                        for pos, assigned in enumerate(cv_sets)
                        if assigned != fold]
        test_set = [str(all_qnos[pos])
                    for pos, assigned in enumerate(cv_sets)
                    if assigned == fold]

        filter_file(train_all, train_filename, training_set)
        filter_file(test_all, test_filename, test_set)

        l2r.train_model(train_filename, model_file,
                        ranker=l2r.rankers[ranker], metric=metric,
                        program=program, params=rparams, validation=intval,
                        featurefile=featurefile)
        fold_scores = score_filename.format(parastr, ranker, fold)
        l2r.predict(model_file, test_filename, fold_scores, metric=metric,
                    program=program, params=rparams, featurefile=featurefile)

        if program == 'RankLib':
            qscores.update(l2r.load_rankings(fold_scores))
            pmids.update(l2r.load_pmids_from_features(test_filename))
        elif program == 'Quickrank':
            fold_pmids = l2r.load_pmids_from_features(test_filename)
            qscores.update(l2r.load_quickrank_scores(fold_pmids, fold_scores))
            pmids.update(fold_pmids)

    runfilename = '/Users/lowellmilliken/Downloads/trec_eval.9.0/' + unknown_base + 'tvs_L2R_{}_{}_{}_run.txt'.format(ranker, metric, parastr)
    l2r.save_reranked(qscores, pmids, runfilename)
    return runfilename
def do_cv(
        unknown_docs_filename='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_alldocs.txt',
        meta=False,
        splitdrugs=False,
        metric='P@10',
        program='RankLib',
        filtered=False,
        targetproxy=False,
        dist=5,
        journaldisease=False,
        textlen=True,
        indriscore=False,
        otherscore=False,
        ranker='ListNet',
        rparams=None,
        kscorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_basescores_large_gfix_run.txt',
        scorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_ob-topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_large_gfix_run.txt',
        fixparts=True,
        normscores=False,
        phraseterms=False,
        intval=True,
        termfile=None,
        termkeyfile=None,
        nodrugs=False):
    """Create a TREC-format ranked list using LeToR training and cross validation.

    Feature files for all known (qrel) and unknown (retrieved) documents are
    generated, split into ten folds by topic, and for each fold a model is
    trained on the remaining folds and used to score the held-out fold.

    :param unknown_docs_filename: name of the file containing abstracts from
        the current retrieval stage run. Its last 11 characters are stripped
        to form the base name of related files.
    :param meta: Boolean. Use metamap CUIs or not. Requires
        unknown_docs_filename + '.meta' containing CUIs for each abstract.
    :param splitdrugs: split drugs into multiple features?
    :param metric: metric to train on. See RankLib help for options.
    :param program: program to do LeToR with ('RankLib' or 'Quickrank').
    :param filtered: Boolean. Selects the '_filtered' term files and feature
        file names and the 'qrel_docs.txt.meta.filtered5' CUI file.
    :param targetproxy: use proximity to the word 'target' as a feature.
    :param dist: distance threshold for 'target' proximity.
    :param journaldisease: use disease presence in journal name as a feature.
    :param textlen: use abstract length as a feature.
    :param indriscore: use the Indri score as a feature. Requires
        unknown_docs_filename[:-11] + 'basescores_run.txt' and
        unknown_docs_filename[:-11] + 'run.txt'.
    :param otherscore: use tf-idf and bm25 scores as features. Requires
        'qrel_tfidfbase_run.txt' and 'qrel_bm25base_run.txt' as well as
        unknown_docs_filename[:-11] + 'tfidfbase_run.txt' and
        unknown_docs_filename[:-11] + 'bm25base_run.txt'.
    :param ranker: LeToR ranker to use; a key of ``l2r.rankers``.
    :param rparams: LeToR parameters in a dictionary. Parameter name including
        leading '-' is key and parameter value is value.
    :param kscorefile: alternate score file used as a feature; scores for the
        known qrels for training. Only loaded when ``scorefile`` is truthy.
    :param scorefile: alternate score file used as a feature; scores for the
        unknown documents for testing.
    :param fixparts: Boolean. Reuse the stored cross-validation partitions if
        True and the partition file exists.
    :param normscores: Boolean. Passed to ``load_indriscores``; per the
        original notes, scores are then normalized by
        (score - minscore)/(maxscore - minscore). Using '-norm' in rparams is
        preferred.
    :param phraseterms: Boolean. Selects the 'fterms.pickle' /
        'fterms_keys.pickle' term files when no explicit termfile is given.
    :param intval: Boolean. Use RankLib internal validation. True preferred.
    :param termfile: explicit set of CUI terms to use. A pickle file.
    :param termkeyfile: keys for mapping terms in the term file to features.
        A pickle file.
    :param nodrugs: Boolean. If True, do not use drug information as a feature.
    :return: name of the run file that was written.
    """
    unknown_base = unknown_docs_filename[:-11]

    # Encode the active options into the string used in generated file names.
    tags = ['n']
    for enabled, tag in ((meta, '_m'), (splitdrugs, '_sd'),
                         (nodrugs, '_nd'), (filtered, '_f')):
        if enabled:
            tags.append(tag)
    if targetproxy:
        tags.append('_t{}'.format(dist))
    for enabled, tag in ((journaldisease, '_jd'), (textlen, '_tl'),
                         (indriscore, '_is'), (otherscore, '_os'),
                         (normscores, '_ns'), (scorefile, '_sf'),
                         (phraseterms, '_pt'), (not intval, '_nov')):
        if enabled:
            tags.append(tag)
    parastr = ''.join(tags)

    topics = l2r.load_topics(distance=dist)

    filteredstr = '_filtered'
    qrel_meta_path = 'qrel_docs.txt.meta.filtered5' if filtered else 'qrel_docs.txt.meta'

    # Pick the term files unless the caller supplied one explicitly.
    if termfile is not None:
        parastr += '_' + termfile[:-11]
    elif phraseterms:
        termfile = 'fterms.pickle'
        termkeyfile = 'fterms_keys.pickle'
    else:
        term_suffix = filteredstr if filtered else ''
        termfile = 'terms{}.pickle'.format(term_suffix)
        termkeyfile = 'term_keys{}.pickle'.format(term_suffix)

    if not os.path.exists(termfile):
        l2r.save_terms(l2r.load_docs(qrel_meta_path), filtered)

    with open(termfile, 'rb') as term_in:
        terms = pickle.load(term_in)
    with open(termkeyfile, 'rb') as key_in:
        term_keys = pickle.load(key_in)

    basescores = unknownscores = None
    if indriscore:
        basescores = load_indriscores(unknown_base + 'basescores_run.txt', normscores)
        unknownscores = load_indriscores(unknown_base + 'run.txt', normscores)

    basetfidfscores = basebm25scores = None
    unknowntfidfscores = unknownbm25scores = None
    if otherscore:
        basetfidfscores = load_indriscores('qrel_tfidfbase_run.txt', normscores)
        basebm25scores = load_indriscores('qrel_bm25base_run.txt', normscores)
        unknowntfidfscores = load_indriscores(unknown_base + 'tfidfbase_run.txt', normscores)
        unknownbm25scores = load_indriscores(unknown_base + 'bm25base_run.txt', normscores)

    kprecscores = precscores = None
    if scorefile:
        kprecscores = load_indriscores(kscorefile, normscores)
        precscores = load_indriscores(scorefile, normscores)

    train_all = cv_dir + os.sep + features_template.format(parastr, 'all')
    test_all = cv_dir + os.sep + unknown_template.format(unknown_base, parastr, 'all')
    if filtered:
        train_all += filteredstr
        test_all += filteredstr

    # Features for the known (qrel) documents.
    known_docs = l2r.load_docs()
    meta_docs = l2r.load_docs(qrel_meta_path) if meta else None
    l2r.save_all_features(topics, known_docs, train_all, known=True,
                          metadocs=meta_docs, terms=terms,
                          term_keys=term_keys, splitdrugs=splitdrugs,
                          targetproxy=targetproxy,
                          journaldisease=journaldisease, textlen=textlen,
                          scores=basescores, tfidfscores=basetfidfscores,
                          bm25scores=basebm25scores, precscores=kprecscores,
                          nodrugs=nodrugs)

    # Features for the unknown (retrieved) documents.
    unknown_docs = l2r.load_docs(unknown_docs_filename)
    unknown_meta_docs = l2r.load_docs(unknown_docs_filename + '.meta') if meta else None
    l2r.save_all_features(topics, unknown_docs, test_all, known=False,
                          metadocs=unknown_meta_docs, terms=terms,
                          term_keys=term_keys, splitdrugs=splitdrugs,
                          targetproxy=targetproxy,
                          journaldisease=journaldisease, textlen=textlen,
                          scores=unknownscores,
                          tfidfscores=unknowntfidfscores,
                          bm25scores=unknownbm25scores, precscores=precscores,
                          nodrugs=nodrugs)

    # Fold assignment: one integer per line; line q belongs to topic q.
    cv_file = cv_dir + os.sep + 'cv_sets.txt'
    if fixparts and os.path.exists(cv_file):
        with open(cv_file, 'r') as cvsetfile:
            cv_sets = [int(row.strip()) for row in cvsetfile]
    else:
        cv_sets = gen_cv_sets()
        with open(cv_file, 'w') as cvsetfile:
            for assigned_fold in cv_sets:
                cvsetfile.write('{}\n'.format(assigned_fold))

    all_qnos = list(range(1, 31))
    qscores = {}
    pmids = {}

    for fold in range(1, 11):
        model_file = model_name.format(parastr, ranker, fold)
        train_filename = cv_dir + os.sep + features_template.format(parastr, fold)
        test_filename = cv_dir + os.sep + unknown_template.format(unknown_base, parastr, fold)

        # Topics assigned to this fold are held out; the rest are trained on.
        training_set = []
        test_set = []
        for qno in all_qnos:
            bucket = test_set if cv_sets[qno - 1] == fold else training_set
            bucket.append(str(qno))

        filter_file(train_all, train_filename, training_set)
        filter_file(test_all, test_filename, test_set)

        l2r.train_model(train_filename, model_file,
                        ranker=l2r.rankers[ranker], metric=metric,
                        program=program, params=rparams, validation=intval)
        fold_scores = score_filename.format(parastr, ranker, fold)
        l2r.predict(model_file, test_filename, fold_scores, metric=metric,
                    program=program, params=rparams)

        if program == 'RankLib':
            qscores.update(l2r.load_rankings(fold_scores))
            pmids.update(l2r.load_pmids_from_features(test_filename))
        elif program == 'Quickrank':
            fold_pmids = l2r.load_pmids_from_features(test_filename)
            qscores.update(l2r.load_quickrank_scores(fold_pmids, fold_scores))
            pmids.update(fold_pmids)

    runfilename = unknown_base + 'tvs_L2R_{}_{}_{}_run.txt'.format(ranker, metric, parastr)
    l2r.save_reranked(qscores, pmids, runfilename)
    return runfilename
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Print the generated query for every topic in the topics file.

Created on Wed May 1 13:59:42 2019

@author: lowellmilliken
"""

import learning_to_rank
#from gene_drug import save_all
#import gene_drug
import xml_to_params
import term_util

topicfile = 'topics201801.xml'

# Disabled one-off setup steps, kept for reference:
#gene_drug.save_drug_graph('drug_data/relationships/relationships.tsv', 'original', 'pharmgkbDG.pickle')
#save_all()

# load_topics returns a dict of <topic number> -> <topic object>
topics = learning_to_rank.load_topics(topicfile, all_drugs=True)
print(topics)

for topic in topics.values():
    # Build the raw query for the topic, then wrap it with its query number.
    raw_query = xml_to_params.generate_query(topic)
    print(term_util.form_query(topic.qno, raw_query))
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Aug 28 07:16:00 2019 @author: lowellmilliken """ import learning_to_rank import time topics = learning_to_rank.load_topics('topics201801.xml', all_drugs=True) #print(topics) #i=1 docs = learning_to_rank.load_docs('doctest1.txt') with open('featurefile.txt', 'w') as outfile: for topic in topics.values(): #print(topic.qno) # query number = '1' for first in docs file, '2' for second, etc. #print(docs.keys()) #print(docs['1']) #print(docs) #with open('featurefile.txt', 'a') as outfile: #learning_to_rank.gen_features(topic, docs['1'], outfile, 0, len(docs), qrels=None, known=False, splitdrugs=False, textlen=True, precscores=None) # precscores will come back later after we get to the query generation component start_time = time.time() learning_to_rank.gen_features(topic, docs[topic.qno], outfile, 0, len(docs), qrels=None,
def main(meta=True, filename='topics2018.xml', isexact=True, istreat=True,
         titleonly=False, issyn=True, isband=False, isscoreif=False,
         isdemo=False, isfiltdemo=True, isDrug=False, drugsyn=True,
         nodiseaseinC=False, geneinscoreif=False, lociandmut=False,
         expdrugs=False, drug_thres=0, target=False, agej=False,
         diseasej=False, genediseaset=False, genediseasek=False,
         nogene=False, nomut=False, prf=True, prfparams=(2,20,0.5,0.5),
         pmidfile=None, baseline=False, large=True, otherbase=False,
         indribase=False, noexp=False):
    """Generate an Indri parameter xml file from a topics xml file.

    The output file name is derived from ``filename`` and the active options.
    When ``baseline`` is set and a ``pmidfile`` is given, two baseline
    parameter files (tf-idf and okapi) are written instead and the function
    returns early.

    :param meta: use metamap results for synonyms (default: True)
    :param filename: topics file name (default: topics2018.xml)
    :param isexact: use exact phrases (default: True)
    :param istreat: use treatment words (default: True)
    :param titleonly: search in title field only (default: False)
    :param issyn: contain synonyms in synonym tag (default: True)
    :param isband: use band tag (default: False)
    :param isscoreif: use scoreif tag for disease filtering (default: False)
    :param isdemo: use demographic information (default: False)
    :param isfiltdemo: filter and bin demographic information if isdemo is
        True (default: True)
    :param isDrug: use drug expansion (default: False)
    :param drugsyn: contain drugs in synonym tag (default: True)
    :param nodiseaseinC: do not use disease in query (default: False)
    :param geneinscoreif: use scoreif tag for gene filtering (default: False)
    :param lociandmut: use loci and mutation information in query
        (default: False)
    :param expdrugs: use drugs from Mallory et al. 2015 data (default: False)
    :param drug_thres: confidence threshold for expdrugs (default: 0)
    :param target: look for the word target within 5 words of the genes
        (default: False)
    :param agej: look for age in journal field (default: False)
    :param diseasej: look for disease in journal field (default: False)
    :param genediseaset: look for gene and disease in title field
        (default: False)
    :param genediseasek: look for gene and disease in keyword field
        (default: False)
    :param nogene: do not use gene information (default: False)
    :param nomut: do not use mutation terms such as amplification
        (default: False)
    :param prf: use pseudo-relevance feedback (default: True)
    :param prfparams: PRF parameters written as fbDocs, fbTerms, fbMu and
        fbOrigWeight (default: (2,20,0.5,0.5))
    :param pmidfile: file name of pmid file to restrict search
        (default: None)
    :param baseline: create baseline search params using BM25 and tfidf
        (default: False)
    :param large: get 5000 results instead of 1000 (default: True)
    :param otherbase: if pmidfile contains PMIDs that are not from the qrels
        file this should be True (default: False)
    :param indribase: create queries with no expansion or structure
        (default: False)
    :param noexp: passed through to ``generate_query`` and adds 'noexp_' to
        the output file name (default: False)
    :return: None
    """
    # Treatment words: source can't be found in the TREC 2017 PM papers as of
    # 3/8/2018, but it was definitely in one earlier. The papers were revised
    # between now and then and it may have been removed for some reason.
    # treatment_words = 'surgery resistance therapy recurrence treatment targets prognosis malignancy prognostic study survival therapeutical patient outcome'

    #index_name = 'indexes/medline-ja2018-index'
    index_name = '/Users/lowellmilliken/Documents/precision_medicine_contd/indexes/medline-ja2018-index-final2'

    post_base = '</parameters>'
    if prf:
        pre_base = '<parameters><index>{}</index><runID>testRun</runID><trecFormat>true</trecFormat>'.format(index_name)
        pre_base += '<fbDocs>{}</fbDocs><fbTerms>{}</fbTerms><fbMu>{}</fbMu><fbOrigWeight>{}</fbOrigWeight>\n'.format(
            prfparams[0], prfparams[1], prfparams[2], prfparams[3])
    else:
        pre_base = '<parameters><index>{}</index><runID>testRun</runID><trecFormat>true</trecFormat>\n'.format(index_name)
    if large:
        pre_base += '<count>5000</count>\n'

    # Tag describing the pmid restriction; it is appended in two places below.
    pmid_tag = None
    if pmidfile is not None:
        pmid_tag = 'ob-{}_'.format(pmidfile[:-10]) if otherbase else 'basescores_'

    # Assemble the output file name from the active options.
    name_parts = [filename[:-4] + '_']
    if indribase:
        name_parts.append('indribase_')
    else:
        name_parts.append('m_' if meta else 'nm')
        for enabled, tag in ((issyn, 'as_'), (isscoreif, 'sf_'),
                             (isexact, 'ex_'), (istreat, 'tr_')):
            if enabled:
                name_parts.append(tag)
        if not isdemo:
            name_parts.append('nd_')
        elif isfiltdemo:
            name_parts.append('fd_')
        else:
            name_parts.append('d_')
        name_parts.append('t_' if titleonly else 'ft_')
        name_parts.append('nsh_')
        # NOTE(review): drugsyn and expdrugs are treated as sub-options of
        # isDrug here; confirm against the original file's indentation.
        if isDrug:
            name_parts.append('d_')
            if drugsyn:
                name_parts.append('ds_')
            if expdrugs:
                name_parts.append('ed{}_'.format(drug_thres))
        if nodiseaseinC:
            name_parts.append('ndC_')
        if geneinscoreif:
            name_parts.append('gsf_')
        if lociandmut:
            if nomut:
                name_parts.append('lng_' if nogene else 'l_')
            else:
                name_parts.append('lnmng_' if nogene else 'lnm_')
        for enabled, tag in ((target, 't_'), (agej, 'aj_'),
                             (diseasej, 'dj_'), (genediseaset, 'gdt_'),
                             (genediseasek, 'gdk_'), (noexp, 'noexp_')):
            if enabled:
                name_parts.append(tag)
        if prf:
            name_parts.append('prf-{}-{}-{}-{}_'.format(
                prfparams[0], prfparams[1], prfparams[2], prfparams[3]))
        if pmid_tag is not None:
            name_parts.append(pmid_tag)
    if large:
        name_parts.append('large_')
    name_parts.append('gfix_')
    if pmid_tag is not None:
        name_parts.append(pmid_tag)
    name_parts.append('params.xml')
    outfilename = ''.join(name_parts)

    # Parsed topics are cached in a pickle whose name depends on drug options.
    topic_stem = filename[:-4]
    if not isDrug:
        topicfile = '{}.pickle'.format(topic_stem)
    elif expdrugs:
        topicfile = '{}_drugthres-{}.pickle'.format(topic_stem, drug_thres)
    else:
        topicfile = '{}_pgkb.pickle'.format(topic_stem)

    if os.path.exists(topicfile):
        with open(topicfile, 'rb') as cached:
            topics = pickle.load(cached)
    else:
        topics = learning_to_rank.load_topics(filename, drug_thres=drug_thres,
                                              emdrugs=expdrugs)
        with open(topicfile, 'wb') as cache_out:
            pickle.dump(topics, cache_out)

    qpmids = load_pmids(pmidfile) if pmidfile is not None else None

    if baseline and pmidfile is not None:
        basename = pmidfile[:-10]
        with open(basename + '_tfidfbase_params.xml', 'w') as tfidffile, \
                open(basename + '_bm25base_params.xml', 'w') as bm25file:
            tfidffile.write(pre_base + '<baseline>tfidf</baseline>\n')
            bm25file.write(pre_base + '<baseline>okapi</baseline>\n')
            # Held out set topics list
            topicsremain2018 = [2, 8, 16, 28, 33, 34]
            for number in topicsremain2018:
                topic = topics[str(number)]
                fields = [topic.disease,
                          ' '.join(topic.disease_syn),
                          ' '.join(topic.genes),
                          ' '.join(topic.other)]
                query = form_query(str(number), clean(' '.join(fields)), qpmids)
                tfidffile.write(query)
                bm25file.write(query)
            tfidffile.write(post_base)
            bm25file.write(post_base)
        return

    with open(outfilename, 'w') as outFile:
        outFile.write(pre_base)
        root = ET.parse(filename).getroot()
        for element in root:
            number = element.attrib['number']
            topic = topics[number]
            if indribase:
                qstring = clean(topic.base)
            else:
                qstring = generate_query(
                    topic, isexact=isexact, meta=meta, istreat=istreat,
                    isdemo=isdemo, isfiltdemo=isfiltdemo, issyn=issyn,
                    target=target, isDrug=isDrug, drugsyn=drugsyn,
                    isband=isband, isscoreif=isscoreif,
                    nodiseaseinC=nodiseaseinC, lociandmut=lociandmut,
                    nomut=nomut, nogene=nogene, titleonly=titleonly,
                    agej=agej, diseasej=diseasej, genediseaset=genediseaset,
                    genediseasek=genediseasek, geneinscoreif=geneinscoreif,
                    noexp=noexp)
            outFile.write(form_query(number, qstring, qpmids))
        outFile.write(post_base)

    print('output to ' + outfilename)
def do_cv( unknown_docs_filename='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_alldocs.txt', metric='P@10', program='RankLib', indriscore=False, otherscore=False, ranker='ListNet', rparams=None, kscorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_basescores_large_gfix_run.txt', scorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_ob-topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_large_gfix_run.txt', fixparts=True, normscores=False, intval=True, unknownscoresfilename=None, trainallparam=True, testallparam=False): """Creates a ranked list output file in TREC format doing training and cross validation for LeToR. :param unknown_docs_filename: name of file containing abstracts from the current Retrieval stage run :param meta: Boolean. Use metamap CUIs or not. Requires unknown_docs_filename + '.meta' file containing CUIs for each abstract. :param splitdrugs: split drugs into multiple features? :param metric: metric to train on. See RankLib help for options. :param program: Program to do LeToR with. Default RankLib. :param filtered: Filter CUIs to use with meta option. Requires either fterms.pickle or terms_filtered.pickle (for phraseterms) file. :param targetproxy: Use proximity to the work 'target' as a feature. :param dist: distance threshold for 'target' proximity :param journaldisease: Use disease presence in journal name as a feature. :param textlen: Use abtract length as a feature. :param indriscore: Use the indri score as a feature. Requires Indri scores for the qrel documents called unknown_docs_filename[:-11] + basescores_run.txt and a Indri results file called unknown_docs_filename[:-11] + run.txt :param otherscore: Use tf-idf and bm25 scores as a feature. Requires 'qrel_tfidfbase_run.txt' and 'qrel_bm25base_run.txt' as well as unknown_docs_filename[:-11] + 'tfidfbase_run.txt' and unknown_docs_filename[:-11] + 'bm25base_run.txt' :param ranker: LeToR ranker to use. See RankLib help for options. 
:param rparams: LeToR parameters in a dictionary. See RankLib help for options. Parameter name including leading '-' is key and parameter value is value. :param kscorefile: Alternate score file for use as a feature. This should be scores for the known qrels for training. :param scorefile: Alternate score file for use as a feature. This should be scores for the unknown documents for testing. :param fixparts: Boolean. Fixed cross-valiation partitions if True. :param normscores: Boolean. If True, Indri scores are normalized by (score - minscore)/(maxscore - minscore). Using the '-norm' in rparams with a norm type is preferred. See RankLib help. :param phraseterms: Boolean. Use only metamapped CUI terms from original terms that are not unigrams. :param intval: Boolean. Use RankLib internal validation. True preferred. :param termfile: Explicit set of CUI terms to use. A list in a pickle file. :param termkeyfile: Keys for mapping terms in the term file to features. Dict in a pickle file. Key = term. Value = term number (which maps to a feature number). :param nodrugs: Boolean. If True, do not use any drug information as a feature. 
""" unknown_base = unknown_docs_filename[:-11] parastr = 'n' if indriscore: parastr += '_is' if otherscore: parastr += '_os' if normscores: parastr += '_ns' if scorefile: parastr += '_sf' if not intval: parastr += '_nov' topics = l2r.load_topics(topicfile='crossvalidationtopics.xml') meta_docs = None unknown_meta_docs = None if indriscore: basescores = load_indriscores(unknown_base + 'basescores_run.txt', normscores) unknownscores = load_indriscores(unknown_base + 'run.txt', normscores) else: basescores = None unknownscores = None if otherscore: basetfidfscores = parse_results_for_top_N.load_indri_tfidf_scores( res_filename='qrel_tfidfbase_run.txt') basebm25scores = parse_results_for_top_N.load_indri_tfidf_scores( res_filename='qrel_bm25base_run.txt') #unknowntfidfscores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename=unknownscoresfilename+'tfidfbase_run.txt') #unknownbm25scores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename=unknownscoresfilename+'bm25base_run.txt') unknowntfidfscores = None unknownbm25scores = None else: basetfidfscores = None basebm25scores = None unknowntfidfscores = None unknownbm25scores = None if scorefile: kprecscores = load_indriscores(kscorefile, normscores) precscores = load_indriscores(scorefile, normscores) else: kprecscores = None precscores = None if trainallparam == True: train_all = cv_dir + os.sep + features_template.format(parastr, 'all') # if not os.path.exists(train_all) or indriscore: known_docs = l2r.load_docs() l2r.save_all_features(topics, known_docs, train_all, known=True, metadocs=meta_docs, scores=basescores, tfidfscores=basetfidfscores, bm25scores=basebm25scores, precscores=kprecscores) if testallparam == True: # if not os.path.exists(test_all): test_all = cv_dir + os.sep + unknown_template.format( unknown_base, parastr, 'all') unknown_docs = l2r.load_docs(unknown_docs_filename) l2r.save_all_features(topics, unknown_docs, test_all, known=False, metadocs=unknown_meta_docs, 
scores=unknownscores, tfidfscores=unknowntfidfscores, bm25scores=unknownbm25scores, precscores=precscores) cv_file = cv_dir + os.sep + 'cv_sets.txt' if fixparts and os.path.exists(cv_file): cv_sets = [] with open(cv_file, 'r') as cvsetfile: for line in cvsetfile: cv_sets.append(int(line.strip())) else: cv_sets = gen_cv_sets() with open(cv_file, 'w') as cvsetfile: for i in cv_sets: cvsetfile.write('{}\n'.format(i))