예제 #1
0
def do_cv(unknown_docs_filename='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_alldocs.txt',
          metric='P@10', program='RankLib', indriscore=False, otherscore=False, ranker='ListNet', rparams=None,
          kscorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_basescores_large_gfix_run.txt',
          scorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_ob-topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_large_gfix_run.txt',
          fixparts=True, normscores=False, intval=True, unknownscoresfilename=None, trainallparam=True,
          testallparam=False, featurefile=None):
    """Train and cross-validate a LeToR model and write a TREC-format ranked run file.

    :param unknown_docs_filename: Only its name is used here: the last 11 characters are stripped and the
        remainder prefixes the per-fold unknown feature files and the final run file name.
    :param metric: metric handed to ``l2r.train_model`` / ``l2r.predict`` and embedded in the run file name.
        See RankLib help for options.
    :param program: LeToR program. 'RankLib' and 'Quickrank' are the two values for which scores are
        collected; for any other value the folds are still trained but the run file is written empty.
    :param indriscore: Boolean. Load Indri scores (from hard-coded paths) and pass them as a feature.
    :param otherscore: Boolean. Load tf-idf and bm25 scores and pass them as features. The known scores
        come from 'qrel_tfidfbase_run.txt' / 'qrel_bm25base_run.txt'; the unknown ones from hard-coded paths.
    :param ranker: key into ``l2r.rankers`` selecting the LeToR ranker.
    :param rparams: LeToR parameters, forwarded unchanged as ``params`` to training and prediction.
    :param kscorefile: Alternate score file for the known (training) documents. Only read when
        ``scorefile`` is truthy.
    :param scorefile: Alternate score file for the unknown (test) documents. Falsy disables both
        alternate score files.
    :param fixparts: Boolean. If True and 'cv_sets.txt' exists in ``cv_dir``, reuse its partitions;
        otherwise new partitions are generated with ``gen_cv_sets()`` and written to that file.
    :param normscores: Boolean. Forwarded to ``load_indriscores`` for ``kscorefile`` / ``scorefile`` only.
    :param intval: Boolean. Forwarded as ``validation`` to ``l2r.train_model``.
    :param unknownscoresfilename: Currently unused; kept for call compatibility.
    :param trainallparam: Boolean. If True, regenerate the all-topics feature file for the known documents
        from 'crossvalidationtopics.xml'.
    :param testallparam: Boolean. If True, regenerate the all-topics feature file for the unknown documents
        from 'topic6.xml' and 'testdatadocs.txt'.
    :param featurefile: forwarded unchanged to ``l2r.train_model`` and ``l2r.predict``.
    :return: name of the run file that was written.
    """
    unknown_base = unknown_docs_filename[:-11]

    # Tag string that encodes the active options; it is embedded in every generated file name.
    option_tags = (
        (indriscore, '_is'),
        (otherscore, '_os'),
        (normscores, '_ns'),
        (scorefile, '_sf'),
        (not intval, '_nov'),
    )
    parastr = 'n' + ''.join(tag for enabled, tag in option_tags if enabled)

    basescores = unknownscores = None
    if indriscore:
        # with newest query formulation added
        basescores = load_indriscores('/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_traincrossvalfinal9basescores.txt')
        unknownscores = load_indriscores('/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_testcrossvalfinal9basescores.txt')

    basetfidfscores = basebm25scores = None
    unknowntfidfscores = unknownbm25scores = None
    if otherscore:
        basetfidfscores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename='qrel_tfidfbase_run.txt')
        basebm25scores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename='qrel_bm25base_run.txt')

        # with newest query formulation added
        unknowntfidfscores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename='/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_testcrossvalfinal9_tfidf_run.txt')
        unknownbm25scores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename='/Users/lowellmilliken/Downloads/indri-5.14/runquery/output_testcrossvalfinal9_bm25_run.txt')

    kprecscores = precscores = None
    if scorefile:
        kprecscores = load_indriscores(kscorefile, normscores)
        precscores = load_indriscores(scorefile, normscores)

    if trainallparam:
        topics = l2r.load_topics(topicfile='crossvalidationtopics.xml')
        known_features_all = cv_dir + os.sep + features_template.format(parastr, 'all')
        known_docs = l2r.load_docs()
        l2r.save_all_features(topics, known_docs, known_features_all, known=True, metadocs=None,
                              scores=basescores, tfidfscores=basetfidfscores, bm25scores=basebm25scores,
                              precscores=kprecscores)

    if testallparam:
        topics = l2r.load_topics(topicfile='topic6.xml')
        unknown_features_all = cv_dir + os.sep + unknown_template.format(unknown_base, parastr, 'all')
        unknown_docs = l2r.load_docs('testdatadocs.txt')
        l2r.save_all_features(topics, unknown_docs, unknown_features_all, known=False, metadocs=None,
                              scores=unknownscores, tfidfscores=unknowntfidfscores,
                              bm25scores=unknownbm25scores, precscores=precscores)

    # One fold number per line; line k holds the fold of the k-th entry of all_qnos.
    cv_file = cv_dir + os.sep + 'cv_sets.txt'
    if fixparts and os.path.exists(cv_file):
        with open(cv_file, 'r') as cvsetfile:
            cv_sets = [int(line.strip()) for line in cvsetfile]
    else:
        cv_sets = gen_cv_sets()
        with open(cv_file, 'w') as cvsetfile:
            for fold_no in cv_sets:
                cvsetfile.write('{}\n'.format(fold_no))

    all_qnos = [1, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80]
    qscores = {}
    pmids = {}

    # NOTE(review): the per-fold files are always filtered from these fixed, machine-specific feature
    # files, not from the files regenerated above under trainallparam/testallparam. Confirm this is
    # intended before running on another machine.
    train_all = '/Users/lowellmilliken/Documents/precision_medicine_contd/lmillik-artpm-c576ced69e03/cv_files_featuresmodifiedfinal5/s_n_is_os_known_features_all'
    test_all = '/Users/lowellmilliken/Documents/precision_medicine_contd/lmillik-artpm-c576ced69e03/cv_files_featuresmodifiedfinal5/s_modifiedfeature_n_os_unknown_features_all_1500docs'
    project_root = '/Users/lowellmilliken/Documents/precision_medicine_contd/lmillik-artpm-c576ced69e03/'

    for fold in range(1, 10):  # folds 1 through 9
        model_file = model_name.format(parastr, ranker, fold)
        train_filename = project_root + cv_dir + os.sep + features_template.format(parastr, fold)
        test_filename = project_root + cv_dir + os.sep + unknown_template.format(unknown_base, parastr, fold)
        fold_score_file = score_filename.format(parastr, ranker, fold)

        # Topics assigned to this fold are held out for testing; all others are used for training.
        # Indexing (rather than zip) keeps a length mismatch between cv_sets and all_qnos loud.
        training_set = [str(all_qnos[pos]) for pos, assigned in enumerate(cv_sets) if assigned != fold]
        test_set = [str(all_qnos[pos]) for pos, assigned in enumerate(cv_sets) if assigned == fold]

        filter_file(train_all, train_filename, training_set)
        filter_file(test_all, test_filename, test_set)

        l2r.train_model(train_filename, model_file, ranker=l2r.rankers[ranker], metric=metric,
                        program=program, params=rparams, validation=intval, featurefile=featurefile)
        l2r.predict(model_file, test_filename, fold_score_file, metric=metric, program=program,
                    params=rparams, featurefile=featurefile)

        if program == 'RankLib':
            qscores.update(l2r.load_rankings(fold_score_file))
            pmids.update(l2r.load_pmids_from_features(test_filename))
        elif program == 'Quickrank':
            qpmids = l2r.load_pmids_from_features(test_filename)
            qscores.update(l2r.load_quickrank_scores(qpmids, fold_score_file))
            pmids.update(qpmids)

    runfilename = '/Users/lowellmilliken/Downloads/trec_eval.9.0/' + unknown_base + 'tvs_L2R_{}_{}_{}_run.txt'.format(ranker, metric, parastr)
    l2r.save_reranked(qscores, pmids, runfilename)

    return runfilename
예제 #2
0
def do_cv(
        unknown_docs_filename='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_alldocs.txt',
        meta=False,
        splitdrugs=False,
        metric='P@10',
        program='RankLib',
        filtered=False,
        targetproxy=False,
        dist=5,
        journaldisease=False,
        textlen=True,
        indriscore=False,
        otherscore=False,
        ranker='ListNet',
        rparams=None,
        kscorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_basescores_large_gfix_run.txt',
        scorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_ob-topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_large_gfix_run.txt',
        fixparts=True,
        normscores=False,
        phraseterms=False,
        intval=True,
        termfile=None,
        termkeyfile=None,
        nodrugs=False):
    """Run 10-fold LeToR training/prediction and save the re-ranked list as a TREC-format run file.

    :param unknown_docs_filename: file with the abstracts of the current retrieval-stage run. Its name
        minus the last 11 characters is the prefix of the derived score, feature and run file names.
    :param meta: Boolean. Use metamap documents; reads 'qrel_docs.txt.meta' (or the '.filtered5' variant
        when ``filtered``) and ``unknown_docs_filename + '.meta'``.
    :param splitdrugs: forwarded to ``l2r.save_all_features``.
    :param metric: metric to train on. See RankLib help for options.
    :param program: 'RankLib' or 'Quickrank'; decides how the per-fold scores are read back.
    :param filtered: Boolean. Use the filtered term files and the filtered metamap documents.
    :param targetproxy: forwarded to ``l2r.save_all_features``.
    :param dist: distance passed to ``l2r.load_topics``; also part of the option tag when ``targetproxy``.
    :param journaldisease: forwarded to ``l2r.save_all_features``.
    :param textlen: forwarded to ``l2r.save_all_features``.
    :param indriscore: Boolean. Load ``<prefix>basescores_run.txt`` and ``<prefix>run.txt`` as score features.
    :param otherscore: Boolean. Load 'qrel_tfidfbase_run.txt', 'qrel_bm25base_run.txt',
        ``<prefix>tfidfbase_run.txt`` and ``<prefix>bm25base_run.txt`` as score features.
    :param ranker: key into ``l2r.rankers``.
    :param rparams: LeToR parameters, forwarded as ``params``.
    :param kscorefile: alternate score file for the known documents; read only when ``scorefile`` is truthy.
    :param scorefile: alternate score file for the unknown documents; falsy disables both.
    :param fixparts: Boolean. Reuse the partitions stored in 'cv_sets.txt' when that file exists.
    :param normscores: Boolean. Forwarded to every ``load_indriscores`` call.
    :param phraseterms: Boolean. Use 'fterms.pickle' / 'fterms_keys.pickle' as term files.
    :param intval: Boolean. Forwarded as ``validation`` to ``l2r.train_model``.
    :param termfile: explicit pickle file of terms. When None a default is chosen from
        ``phraseterms`` / ``filtered`` and ``termkeyfile`` is overwritten to match.
    :param termkeyfile: pickle file with the term keys belonging to ``termfile``.
    :param nodrugs: forwarded to ``l2r.save_all_features``.
    :return: name of the run file that was written.
    """
    unknown_base = unknown_docs_filename[:-11]

    # Assemble the option tag that is embedded in all generated file names.
    parastr = 'n'
    for enabled, tag in ((meta, '_m'), (splitdrugs, '_sd'), (nodrugs, '_nd'), (filtered, '_f')):
        if enabled:
            parastr += tag
    if targetproxy:
        parastr += '_t{}'.format(dist)
    for enabled, tag in ((journaldisease, '_jd'), (textlen, '_tl'), (indriscore, '_is'),
                         (otherscore, '_os'), (normscores, '_ns'), (scorefile, '_sf'),
                         (phraseterms, '_pt'), (not intval, '_nov')):
        if enabled:
            parastr += tag

    topics = l2r.load_topics(distance=dist)

    # Pick the term files: an explicit term file extends the tag, otherwise fall back to defaults.
    filteredstr = '_filtered'
    if termfile is not None:
        parastr += '_' + termfile[:-11]
    elif phraseterms:
        termfile = 'fterms.pickle'
        termkeyfile = 'fterms_keys.pickle'
    else:
        term_suffix = filteredstr if filtered else ''
        termfile = 'terms{}.pickle'.format(term_suffix)
        termkeyfile = 'term_keys{}.pickle'.format(term_suffix)

    # Generate the term files first when the chosen one is not on disk yet.
    if not os.path.exists(termfile):
        qrel_meta_name = 'qrel_docs.txt.meta.filtered5' if filtered else 'qrel_docs.txt.meta'
        l2r.save_terms(l2r.load_docs(qrel_meta_name), filtered)

    with open(termfile, 'rb') as term_stream:
        terms = pickle.load(term_stream)
    with open(termkeyfile, 'rb') as key_stream:
        term_keys = pickle.load(key_stream)

    # Optional score features; every one stays None unless its switch is on.
    basescores = unknownscores = None
    if indriscore:
        basescores = load_indriscores(unknown_base + 'basescores_run.txt', normscores)
        unknownscores = load_indriscores(unknown_base + 'run.txt', normscores)

    basetfidfscores = basebm25scores = None
    unknown_tfidf_scores = unknown_bm25_scores = None
    if otherscore:
        basetfidfscores = load_indriscores('qrel_tfidfbase_run.txt', normscores)
        basebm25scores = load_indriscores('qrel_bm25base_run.txt', normscores)
        unknown_tfidf_scores = load_indriscores(unknown_base + 'tfidfbase_run.txt', normscores)
        unknown_bm25_scores = load_indriscores(unknown_base + 'bm25base_run.txt', normscores)

    kprecscores = precscores = None
    if scorefile:
        kprecscores = load_indriscores(kscorefile, normscores)
        precscores = load_indriscores(scorefile, normscores)

    train_all = cv_dir + os.sep + features_template.format(parastr, 'all')
    test_all = cv_dir + os.sep + unknown_template.format(unknown_base, parastr, 'all')
    if filtered:
        train_all += filteredstr
        test_all += filteredstr

    # Options shared by the known and the unknown feature extraction.
    feature_options = dict(terms=terms,
                           term_keys=term_keys,
                           splitdrugs=splitdrugs,
                           targetproxy=targetproxy,
                           journaldisease=journaldisease,
                           textlen=textlen,
                           nodrugs=nodrugs)

    # Features for the known documents.
    known_docs = l2r.load_docs()
    meta_docs = None
    if meta:
        meta_docs = l2r.load_docs('qrel_docs.txt.meta.filtered5' if filtered else 'qrel_docs.txt.meta')
    l2r.save_all_features(topics, known_docs, train_all,
                          known=True,
                          metadocs=meta_docs,
                          scores=basescores,
                          tfidfscores=basetfidfscores,
                          bm25scores=basebm25scores,
                          precscores=kprecscores,
                          **feature_options)

    # Features for the unknown documents.
    unknown_docs = l2r.load_docs(unknown_docs_filename)
    unknown_meta_docs = l2r.load_docs(unknown_docs_filename + '.meta') if meta else None
    l2r.save_all_features(topics, unknown_docs, test_all,
                          known=False,
                          metadocs=unknown_meta_docs,
                          scores=unknownscores,
                          tfidfscores=unknown_tfidf_scores,
                          bm25scores=unknown_bm25_scores,
                          precscores=precscores,
                          **feature_options)

    # Fold assignment: entry k-1 holds the fold number of topic k.
    cv_file = cv_dir + os.sep + 'cv_sets.txt'
    if fixparts and os.path.exists(cv_file):
        with open(cv_file, 'r') as cvsetfile:
            cv_sets = [int(row.strip()) for row in cvsetfile]
    else:
        cv_sets = gen_cv_sets()
        with open(cv_file, 'w') as cvsetfile:
            for fold_no in cv_sets:
                cvsetfile.write('{}\n'.format(fold_no))

    qscores = {}
    pmids = {}
    for fold in range(1, 11):
        model_file = model_name.format(parastr, ranker, fold)
        train_filename = cv_dir + os.sep + features_template.format(parastr, fold)
        test_filename = cv_dir + os.sep + unknown_template.format(unknown_base, parastr, fold)
        fold_scores_path = score_filename.format(parastr, ranker, fold)

        # Topics 1..30: those assigned to this fold are tested, the rest are trained on.
        training_set = []
        test_set = []
        for qno in range(1, 31):
            bucket = test_set if cv_sets[qno - 1] == fold else training_set
            bucket.append(str(qno))

        filter_file(train_all, train_filename, training_set)
        filter_file(test_all, test_filename, test_set)

        l2r.train_model(train_filename, model_file,
                        ranker=l2r.rankers[ranker],
                        metric=metric,
                        program=program,
                        params=rparams,
                        validation=intval)
        l2r.predict(model_file, test_filename, fold_scores_path,
                    metric=metric,
                    program=program,
                    params=rparams)

        if program == 'RankLib':
            qscores.update(l2r.load_rankings(fold_scores_path))
            pmids.update(l2r.load_pmids_from_features(test_filename))
        elif program == 'Quickrank':
            qpmids = l2r.load_pmids_from_features(test_filename)
            qscores.update(l2r.load_quickrank_scores(qpmids, fold_scores_path))
            pmids.update(qpmids)

    runfilename = unknown_base + 'tvs_L2R_{}_{}_{}_run.txt'.format(ranker, metric, parastr)
    l2r.save_reranked(qscores, pmids, runfilename)

    return runfilename
예제 #3
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May  1 13:59:42 2019

@author: lowellmilliken
"""

import learning_to_rank
#from gene_drug import save_all
#import gene_drug
import xml_to_params
import term_util
topicfile = 'topics201801.xml'

# load_topics returns a dict of <topic number> -> <topic object>.
topics = learning_to_rank.load_topics(topicfile, all_drugs=True)
print(topics)

# Print the formed query for every loaded topic.
for topic in topics.values():
    qstring = term_util.form_query(topic.qno, xml_to_params.generate_query(topic))
    print(qstring)
예제 #4
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 28 07:16:00 2019

@author: lowellmilliken
"""
import learning_to_rank
import time

topics = learning_to_rank.load_topics('topics201801.xml', all_drugs=True)
#print(topics)
#i=1
docs = learning_to_rank.load_docs('doctest1.txt')
with open('featurefile.txt', 'w') as outfile:
    for topic in topics.values():
        #print(topic.qno)
        # query number = '1' for first in docs file, '2' for second, etc.
        #print(docs.keys())
        #print(docs['1'])
        #print(docs)
        #with open('featurefile.txt', 'a') as outfile:
        #learning_to_rank.gen_features(topic, docs['1'], outfile, 0, len(docs), qrels=None, known=False, splitdrugs=False, textlen=True, precscores=None) # precscores will come back later after we get to the query generation component

        start_time = time.time()
        learning_to_rank.gen_features(topic,
                                      docs[topic.qno],
                                      outfile,
                                      0,
                                      len(docs),
                                      qrels=None,
예제 #5
0
def main(meta=True, filename='topics2018.xml', isexact=True, istreat=True,
         titleonly=False, issyn=True, isband=False, isscoreif=False, isdemo=False, isfiltdemo=True,
         isDrug=False, drugsyn=True, nodiseaseinC=False, geneinscoreif=False, lociandmut=False, expdrugs=False,
         drug_thres=0, target=False, agej=False, diseasej=False, genediseaset=False, genediseasek=False, nogene=False,
         nomut=False, prf=True, prfparams=(2,20,0.5,0.5), pmidfile=None, baseline=False, large=True, otherbase=False,
         indribase=False, noexp=False):
    """Write an Indri parameter xml file built from a topics xml file.

    The output file name is derived from ``filename`` and the active options. With ``baseline`` and a
    ``pmidfile`` the function instead writes a tfidf and a bm25 baseline parameter file and returns.

    :param meta: use metamap results for synonyms (default: True)
    :param filename: topics file name (default: topics2018.xml)
    :param isexact: use exact phrases (default: True)
    :param istreat: use treatment words (default: True)
    :param titleonly: search in title field only (default: False)
    :param issyn: contain synonyms in synonym tag (default: True)
    :param isband: use band tag (default: False)
    :param isscoreif: use scoreif tag for disease filtering (default: False)
    :param isdemo: use demographic information (default: False)
    :param isfiltdemo: filter and bin demographic information if isdemo is True (default: True)
    :param isDrug: use drug expansion (default: False)
    :param drugsyn: contain drugs in synonym tag (default: True)
    :param nodiseaseinC: do not use disease in query (default: False)
    :param geneinscoreif: use scoreif tag for gene filtering (default: False)
    :param lociandmut: use loci and mutation information in query (default: False)
    :param expdrugs: use drugs from Mallory et al. 2015 data (default: False)
    :param drug_thres: confidence threshold for expdrugs (default: 0)
    :param target: look for the word target within 5 words of the genes (default: False)
    :param agej: look for age in journal field (default: False)
    :param diseasej: look for disease in journal field (default: False)
    :param genediseaset: look for gene and disease in title field (default: False)
    :param genediseasek: look for gene and disease in keyword field (default: False)
    :param nogene: do not use gene information (default: False)
    :param nomut: do not use mutation terms such as amplification (default: False)
    :param prf: use pseudo-relevance feedback (default: True)
    :param prfparams: PRF parameters written as fbDocs, fbTerms, fbMu, fbOrigWeight (default: (2,20,0.5,0.5))
    :param pmidfile: file name of pmid file to restrict search (default: None)
    :param baseline: create baseline search params using BM25 and tfidf; needs pmidfile (default: False)
    :param large: add ``<count>5000</count>`` to the parameters (default: True)
    :param otherbase: if pmidfile contains PMIDs that are not from the qrels file this should be True (default: False)
    :param indribase: create queries with no expansion or structure (default: False)
    :param noexp: forwarded to ``generate_query``; adds 'noexp_' to the file name (default: False)
    :return: None
    """
    index_name = '/Users/lowellmilliken/Documents/precision_medicine_contd/indexes/medline-ja2018-index-final2'
    post_base = '</parameters>'

    # Header of the parameter file; the feedback settings sit on the first line when PRF is on.
    pre_base = '<parameters><index>{}</index><runID>testRun</runID><trecFormat>true</trecFormat>'.format(index_name)
    if prf:
        pre_base += '<fbDocs>{}</fbDocs><fbTerms>{}</fbTerms><fbMu>{}</fbMu><fbOrigWeight>{}</fbOrigWeight>\n'.format(
            prfparams[0], prfparams[1], prfparams[2], prfparams[3])
    else:
        pre_base += '\n'
    if large:
        pre_base += '<count>5000</count>\n'

    # The output file name is collected piece by piece and joined at the end.
    name_parts = [filename[:-4] + '_']
    add_part = name_parts.append

    # Tag describing the pmid restriction, if any.
    pmid_tag = None
    if pmidfile is not None:
        pmid_tag = 'ob-{}_'.format(pmidfile[:-10]) if otherbase else 'basescores_'

    if indribase:
        add_part('indribase_')
    else:
        # NOTE(review): 'nm' carries no trailing underscore, unlike every other tag.
        add_part('m_' if meta else 'nm')
        if issyn:
            add_part('as_')
        if isscoreif:
            add_part('sf_')
        if isexact:
            add_part('ex_')
        if istreat:
            add_part('tr_')

        if not isdemo:
            add_part('nd_')
        elif isfiltdemo:
            add_part('fd_')
        else:
            add_part('d_')

        add_part('t_' if titleonly else 'ft_')
        add_part('nsh_')

        if isDrug:
            # NOTE(review): 'd_' is also the unfiltered-demographics tag above.
            add_part('d_')
            if drugsyn:
                add_part('ds_')
                if expdrugs:
                    add_part('ed{}_'.format(drug_thres))

        if nodiseaseinC:
            add_part('ndC_')
        if geneinscoreif:
            add_part('gsf_')

        if lociandmut:
            # 'l', then 'nm' unless nomut, then 'ng' when nogene.
            add_part('l' + ('' if nomut else 'nm') + ('ng' if nogene else '') + '_')

        if target:
            # NOTE(review): 't_' is also the title-only tag above.
            add_part('t_')
        if agej:
            add_part('aj_')
        if diseasej:
            add_part('dj_')
        if genediseaset:
            add_part('gdt_')
        if genediseasek:
            add_part('gdk_')
        if noexp:
            add_part('noexp_')
        if prf:
            add_part('prf-{}-{}-{}-{}_'.format(prfparams[0], prfparams[1], prfparams[2], prfparams[3]))
        if pmid_tag is not None:
            add_part(pmid_tag)

    if large:
        add_part('large_')
    add_part('gfix_')
    # NOTE(review): without indribase the pmid tag appears a second time here.
    if pmid_tag is not None:
        add_part(pmid_tag)
    add_part('params.xml')
    outfilename = ''.join(name_parts)

    # Parsed topics are cached in a pickle whose name depends on the drug options.
    stem = filename[:-4]
    if not isDrug:
        topicfile = '{}.pickle'.format(stem)
    elif expdrugs:
        topicfile = '{}_drugthres-{}.pickle'.format(stem, drug_thres)
    else:
        topicfile = '{}_pgkb.pickle'.format(stem)

    if os.path.exists(topicfile):
        with open(topicfile, 'rb') as cached:
            topics = pickle.load(cached)
    else:
        topics = learning_to_rank.load_topics(filename, drug_thres=drug_thres, emdrugs=expdrugs)
        with open(topicfile, 'wb') as cache_out:
            pickle.dump(topics, cache_out)

    qpmids = load_pmids(pmidfile) if pmidfile is not None else None

    if baseline and pmidfile is not None:
        basename = pmidfile[:-10]
        with open(basename + '_tfidfbase_params.xml', 'w') as tfidffile, open(basename + '_bm25base_params.xml', 'w') as bm25file:
            tfidffile.write(pre_base + '<baseline>tfidf</baseline>\n')
            bm25file.write(pre_base + '<baseline>okapi</baseline>\n')

            # Held out set topics list
            for number in (2, 8, 16, 28, 33, 34):
                held_out = topics[str(number)]
                raw_query = ' '.join([held_out.disease,
                                      ' '.join(held_out.disease_syn),
                                      ' '.join(held_out.genes),
                                      ' '.join(held_out.other)])
                formed = form_query(str(number), clean(raw_query), qpmids)
                tfidffile.write(formed)
                bm25file.write(formed)

            tfidffile.write(post_base)
            bm25file.write(post_base)

        return

    # Keyword options handed to generate_query for every topic.
    query_options = dict(isexact=isexact, meta=meta, istreat=istreat, isdemo=isdemo,
                         isfiltdemo=isfiltdemo, issyn=issyn, target=target, isDrug=isDrug, drugsyn=drugsyn,
                         isband=isband, isscoreif=isscoreif, nodiseaseinC=nodiseaseinC, lociandmut=lociandmut,
                         nomut=nomut, nogene=nogene, titleonly=titleonly, agej=agej, diseasej=diseasej,
                         genediseaset=genediseaset, genediseasek=genediseasek, geneinscoreif=geneinscoreif,
                         noexp=noexp)

    with open(outfilename, 'w') as outFile:
        outFile.write(pre_base)

        # Queries are written in the order the topics appear in the xml file.
        for element in ET.parse(filename).getroot():
            number = element.attrib['number']
            parsed_topic = topics[number]

            if indribase:
                qstring = clean(parsed_topic.base)
            else:
                qstring = generate_query(parsed_topic, **query_options)

            outFile.write(form_query(number, qstring, qpmids))

        outFile.write(post_base)

    print('output to ' + outfilename)
예제 #6
0
def do_cv(
        unknown_docs_filename='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_alldocs.txt',
        metric='P@10',
        program='RankLib',
        indriscore=False,
        otherscore=False,
        ranker='ListNet',
        rparams=None,
        kscorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_basescores_large_gfix_run.txt',
        scorefile='topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-10-0.5-0.8_ob-topics2017_m_as_ex_tr_nd_ft_nsh_prf-2-20-0.5-0.5_large_gfix_large_gfix_run.txt',
        fixparts=True,
        normscores=False,
        intval=True,
        unknownscoresfilename=None,
        trainallparam=True,
        testallparam=False):
    """Creates a ranked list output file in TREC format doing training and cross validation for LeToR.

    :param unknown_docs_filename: name of file containing abstracts from the current Retrieval stage run
    :param meta: Boolean. Use metamap CUIs or not. Requires unknown_docs_filename + '.meta' file containing CUIs for each abstract.
    :param splitdrugs: split drugs into multiple features?
    :param metric: metric to train on. See RankLib help for options.
    :param program: Program to do LeToR with. Default RankLib.
    :param filtered: Filter CUIs to use with meta option. Requires either fterms.pickle or terms_filtered.pickle (for phraseterms) file.
    :param targetproxy: Use proximity to the word 'target' as a feature.
    :param dist: distance threshold for 'target' proximity
    :param journaldisease: Use disease presence in journal name as a feature.
    :param textlen: Use abstract length as a feature.
    :param indriscore: Use the indri score as a feature. Requires Indri scores for the qrel documents called unknown_docs_filename[:-11] + basescores_run.txt and an Indri results file called unknown_docs_filename[:-11] + run.txt
    :param otherscore: Use tf-idf and bm25 scores as a feature. Requires 'qrel_tfidfbase_run.txt' and 'qrel_bm25base_run.txt' as well as unknown_docs_filename[:-11] + 'tfidfbase_run.txt' and unknown_docs_filename[:-11] + 'bm25base_run.txt'
    :param ranker: LeToR ranker to use. See RankLib help for options.
    :param rparams: LeToR parameters in a dictionary. See RankLib help for options. Parameter name including leading '-' is key and parameter value is value.
    :param kscorefile: Alternate score file for use as a feature. This should be scores for the known qrels for training.
    :param scorefile: Alternate score file for use as a feature. This should be scores for the unknown documents for testing.
    :param fixparts: Boolean. Fixed cross-validation partitions if True.
    :param normscores: Boolean. If True, Indri scores are normalized by (score - minscore)/(maxscore - minscore). Using the '-norm' in rparams with a norm type is preferred. See RankLib help.
    :param phraseterms: Boolean. Use only metamapped CUI terms from original terms that are not unigrams.
    :param intval: Boolean. Use RankLib internal validation. True preferred.
    :param termfile: Explicit set of CUI terms to use. A list in a pickle file.
    :param termkeyfile: Keys for mapping terms in the term file to features. Dict in a pickle file. Key = term. Value = term number (which maps to a feature number).
    :param nodrugs: Boolean. If True, do not use any drug information as a feature.
    """
    unknown_base = unknown_docs_filename[:-11]
    parastr = 'n'

    if indriscore:
        parastr += '_is'
    if otherscore:
        parastr += '_os'
    if normscores:
        parastr += '_ns'

    if scorefile:
        parastr += '_sf'
    if not intval:
        parastr += '_nov'

    topics = l2r.load_topics(topicfile='crossvalidationtopics.xml')

    meta_docs = None
    unknown_meta_docs = None

    if indriscore:
        basescores = load_indriscores(unknown_base + 'basescores_run.txt',
                                      normscores)
        unknownscores = load_indriscores(unknown_base + 'run.txt', normscores)
    else:
        basescores = None
        unknownscores = None

    if otherscore:
        basetfidfscores = parse_results_for_top_N.load_indri_tfidf_scores(
            res_filename='qrel_tfidfbase_run.txt')
        basebm25scores = parse_results_for_top_N.load_indri_tfidf_scores(
            res_filename='qrel_bm25base_run.txt')

        #unknowntfidfscores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename=unknownscoresfilename+'tfidfbase_run.txt')
        #unknownbm25scores = parse_results_for_top_N.load_indri_tfidf_scores(res_filename=unknownscoresfilename+'bm25base_run.txt')
        unknowntfidfscores = None
        unknownbm25scores = None
    else:
        basetfidfscores = None
        basebm25scores = None

        unknowntfidfscores = None
        unknownbm25scores = None

    if scorefile:
        kprecscores = load_indriscores(kscorefile, normscores)
        precscores = load_indriscores(scorefile, normscores)
    else:
        kprecscores = None
        precscores = None

    if trainallparam == True:
        train_all = cv_dir + os.sep + features_template.format(parastr, 'all')

        # if not os.path.exists(train_all) or indriscore:
        known_docs = l2r.load_docs()

        l2r.save_all_features(topics,
                              known_docs,
                              train_all,
                              known=True,
                              metadocs=meta_docs,
                              scores=basescores,
                              tfidfscores=basetfidfscores,
                              bm25scores=basebm25scores,
                              precscores=kprecscores)
    if testallparam == True:
        # if not os.path.exists(test_all):
        test_all = cv_dir + os.sep + unknown_template.format(
            unknown_base, parastr, 'all')
        unknown_docs = l2r.load_docs(unknown_docs_filename)
        l2r.save_all_features(topics,
                              unknown_docs,
                              test_all,
                              known=False,
                              metadocs=unknown_meta_docs,
                              scores=unknownscores,
                              tfidfscores=unknowntfidfscores,
                              bm25scores=unknownbm25scores,
                              precscores=precscores)

    cv_file = cv_dir + os.sep + 'cv_sets.txt'
    if fixparts and os.path.exists(cv_file):
        cv_sets = []
        with open(cv_file, 'r') as cvsetfile:
            for line in cvsetfile:
                cv_sets.append(int(line.strip()))
    else:
        cv_sets = gen_cv_sets()
        with open(cv_file, 'w') as cvsetfile:
            for i in cv_sets:
                cvsetfile.write('{}\n'.format(i))