def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on,
                   can use any valid metrics that the trec_eval tool accepts

    Returns:
        metric_values -- dictionary of metric values (out of 100), rounded to two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
        
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure] for query_measures in results.values()]
            )
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
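# A minimal usage sketch for qrel_metrics, with placeholder file names; both
# files are assumed to be in standard TREC format (qrel lines: "qid iter docid rel",
# run lines: "qid Q0 docid rank score tag").
scores = qrel_metrics('qrels.txt', 'run.txt', metrics=('ndcg', 'map', 'recip_rank'))
print(scores)  # {'map': ..., 'ndcg': ..., 'recip_rank': ...}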
Example #2
def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                               pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)
    # Scope hack: run the loop once so that query_measures ends up holding the
    # per-query measure dict of the last query, which is used below to
    # enumerate the available measure names.
    for query_id, query_measures in sorted(results.items()):
        pass

    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        print('Depth of NDCG not available.')
        exit()
    ndcg = mes[metric]

    return ndcg
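# A minimal usage sketch for cal_ndcg, with placeholder file names; k has to be
# one of the ndcg_cut depths reported by trec_eval (by default 5, 10, 15, 20,
# 30, 100, 200, 500 and 1000), otherwise the guard above exits.
ndcg_at_10 = cal_ndcg('qrels.txt', 'run.trec', k=10)
print('ndcg_cut_10 = %.4f' % ndcg_at_10)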
Example #3
def evaluate(eval_path, qrel_path, res_path):

    measures = {"map", "ndcg_cut", "recall", "P"}

    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

    with open(res_path, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    all_metrics = evaluator.evaluate(run)

    metrics = {
        'P_5': 0,
        'P_10': 0,
        'P_20': 0,
        'ndcg_cut_5': 0,
        'ndcg_cut_10': 0,
        'ndcg_cut_20': 0,
        'ndcg_cut_100': 0,
        'map': 0,
        'recall_100': 0
    }

    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries

    with open(eval_path, 'w') as f:
        json.dump(metrics, f)
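# A minimal usage sketch for evaluate, with placeholder paths; the metrics,
# averaged over all queries, are written to eval_path as a JSON object.
evaluate('metrics.json', 'qrels.txt', 'run.res')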
Example #4
    def get_metric(self,
                   qrels: str,
                   trec: str,
                   metric: str = 'ndcg_cut_10',
                   split: dict = None,
                   split_idx: int = -1) -> Dict[str, float]:
        with open(qrels, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)
        with open(trec, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        # partial evaluation
        if split is not None and split_idx >= 0:
            for qid in copy.deepcopy(run):
                if qid not in split[split_idx]:
                    _ = run.pop(qid)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)
        results = evaluator.evaluate(run)
        for query_id, query_measures in sorted(results.items()):
            pass
        mes = {}
        for measure in sorted(query_measures.keys()):
            mes[measure] = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measures[measure]
                    for query_measures in results.values()
                ])
        return mes[metric]
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run')
    parser.add_argument('measure')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, {args.measure})

    results = evaluator.evaluate(run)
    
    def print_line(measure, scope, value):
        # scope is the query id (topic id)
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    avg_DCG = []
    for query_id, query_measures in results.items():
        for measure, value in sorted(query_measures.items()):
            avg_DCG.append(value)
            print_line(measure, query_id, value)
    print(avg_DCG)
    print('avg of nDCG {:f}'.format(mean(avg_DCG)))
Example #6
def setup_evaluator_from_relevance_file(qrel_path,
                                        measures={
                                            "map", "ndcg_cut", "recall", "P"
                                        }):
    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    return pytrec_eval.RelevanceEvaluator(qrel, measures)
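# A minimal usage sketch, with a placeholder qrel path; the returned evaluator
# can be reused to score any number of runs.
evaluator = setup_evaluator_from_relevance_file('qrels.txt')
# results = evaluator.evaluate(run)  # run parsed beforehand with pytrec_eval.parse_run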
Example #7
def pytrec_evaluation(runfile, qrelfile, measures = pytrec_eval.supported_measures):
    """ run trec_eval with "measures" from the Python interface """
    with open(runfile, "r") as ranking:
        run = pytrec_eval.parse_run(ranking)
    with open(qrelfile, "r") as qrel:
        qrel = pytrec_eval.parse_qrel(qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, measures)

    return evaluator.evaluate(run)
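# A sketch of aggregating the per-query dictionaries returned above into one
# score per measure; 'map' and 'ndcg' are only illustrative measure names and
# the file names are placeholders.
per_query = pytrec_evaluation('run.txt', 'qrels.txt', measures={'map', 'ndcg'})
mean_ap = pytrec_eval.compute_aggregated_measure(
    'map', [q['map'] for q in per_query.values()])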
Example #8
def eval_trec_file(run_file, ref_file):
    with open(run_file) as f:
        run = parse_run(f)
    with open(ref_file) as f:
        qrel = pytrec_eval.parse_qrel(f)

    results = eval_trec(run, qrel)
    avg = dict()
    for q in results:
        for k in results[q]:
            if k in avg:
                avg[k] += results[q][k]
            else:
                avg[k] = results[q][k]
    for k in avg:
        avg[k] = avg[k] / len(results)
    return avg
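# eval_trec is not shown in this listing; a plausible sketch, assuming it simply
# wraps pytrec_eval.RelevanceEvaluator with all supported measures:
def eval_trec(run, qrel):
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
    return evaluator.evaluate(run)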
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run', nargs=2)

    # A bit too strict, as it does not allow for parametrized measures,
    # but sufficient for the example.
    parser.add_argument(
        '--measure',
        #choices=pytrec_eval.supported_measures,
        required=True)

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert all(map(os.path.exists, args.run))

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run[0], 'r') as f_run:
        first_run = pytrec_eval.parse_run(f_run)

    with open(args.run[1], 'r') as f_run:
        second_run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {args.measure})

    first_results = evaluator.evaluate(first_run)
    print(first_results.keys())
    second_results = evaluator.evaluate(second_run)

    query_ids = list(set(first_results.keys()) & set(second_results.keys()))

    first_scores = [
        first_results[query_id][args.measure] for query_id in query_ids
    ]
    second_scores = [
        second_results[query_id][args.measure] for query_id in query_ids
    ]

    print(scipy.stats.ttest_rel(first_scores, second_scores))
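# A hypothetical invocation, assuming this script is saved as significance_test.py
# and that qrels.txt, run_a.txt and run_b.txt are TREC-format files:
#     python significance_test.py qrels.txt run_a.txt run_b.txt --measure map
# scipy.stats.ttest_rel then reports the paired t statistic and p-value over the
# per-query scores shared by the two runs.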
Example #10
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure]
                 for query_measures in results.values()]))
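    # Note (a sketch, not part of the original script): the same set of measure
    # names can be obtained without the scope hack, e.g.
    #     measure_names = sorted(next(iter(results.values())).keys())
    # which avoids relying on the loop variable outliving its for-loop.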
Example #11
    def get_metric(self,
                   qrels: str,
                   trec: str,
                   metric: str = 'ndcg_cut_10') -> Dict[str, float]:
        with open(qrels, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)
        with open(trec, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)
        results = evaluator.evaluate(run)
        for query_id, query_measures in sorted(results.items()):
            pass
        mes = {}
        for measure in sorted(query_measures.keys()):
            mes[measure] = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measures[measure]
                    for query_measures in results.values()
                ])
        return mes[metric]
Example #12
def read_collection(collection_path, k=5):
    """Function that for every TREC collection reads queries , create folds for the Kfold cross validation
    ,reads the collection qrels ,save qrels and queries for each fold,reads documents
    on xml format and saves them into csv format""" #HR

    queries = read_queries(collection_path + '/queries')

    folds = build_folds(list(queries.keys()), k=k)

    with open(collection_path + '/qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    for i, fold in enumerate(folds):
        if not os.path.exists(collection_path + '/fold' + str(i)):
            os.makedirs(collection_path + '/fold' + str(i))
        save_qrel(collection_path + '/fold' + str(i) + '/qrels', qrel, fold)

    save_queries_csv(collection_path, queries, folds)

    documents = read_documents(collection_path + '/documents.xml')

    save_documents_csv(collection_path, documents)
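# build_folds is not shown in this listing; a minimal hypothetical sketch,
# assuming it only needs to split the query ids into k roughly equal folds:
def build_folds(query_ids, k=5):
    # Round-robin split of the query ids into k folds.
    return [query_ids[i::k] for i in range(k)]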
Example #13
    def __test(self):
        with open(os.path.join(TREC_EVAL_TEST_DIR, ground_truth_filename)) as \
                f_trec_eval:
            trec_eval_output = parse_trec_eval(f_trec_eval)

        measures = set(
            measure if measure in pytrec_eval.supported_measures else
            prefix_match(measure, pytrec_eval.supported_measures)
            for measure in trec_eval_output['all'].keys())

        with open(os.path.join(TREC_EVAL_TEST_DIR, qrel_filename)) as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(os.path.join(TREC_EVAL_TEST_DIR, run_filename)) as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures, **kwargs)

        results = evaluator.evaluate(run)

        expected_measures = trec_eval_output['all']

        for measure in expected_measures:
            agg_measure_value = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measure_values[measure]
                    for query_measure_values in results.values()
                ])

            ground_truth_agg_measure_value = \
                trec_eval_output['all'][measure]

            self.assertAlmostEqual(agg_measure_value,
                                   ground_truth_agg_measure_value,
                                   places=3,
                                   msg=measure)
Example #14
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))

    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # parse and store qrels
    if FLAGS.qrels_fname:
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt',
                  'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, {'P'})  # evaluate on Precision
    else:
        print("please provide qrels filename")
        return False
    """
	LEXICAL PREPROCESSING
	"""

    # parse input run
    print('parse input run')
    with open(FLAGS.run_path, 'r') as runf:
        run = pytrec_eval.parse_run(runf)
    """
	SEMANTIC PREPROCESSING
	"""

    # load required data
    print(
        'load processed data required to perform re-ranking over lexical model w/ semantic model'
    )
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dictionary
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))
    # compute doc embeddings
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)
    """
	COMPUTE RE-RANKING
	"""

    # set random seed
    np.random.seed(FLAGS.seed)
    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
    # get query ids
    qids = list(q.keys())
    # shuffle query ids
    np.random.shuffle(qids)

    if FLAGS.fixed_gamma:
        # perform re-ranking based on a fixed value of gamma
        print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                        str(FLAGS.fixed_gamma))
        # combine rankings using fixed gamma
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
        # store test ranking in combined run
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid,
                             [(score, docno)
                              for docno, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' +
                             FLAGS.model_name + '_gamma_' +
                             str(FLAGS.fixed_gamma) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_gamma_' +
                          str(FLAGS.fixed_gamma), qrels_folder,
                          FLAGS.qrels_fname)
    else:
        # learn optimal weight to combine runs
        print("learn optimal weight to combine runs with sweep: {}".format(
            FLAGS.sweep))
        # set variable to store scores and weights
        scores_and_weights = []

        # initialize kfold with FLAGS.num_folds
        kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
        for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
            print('fold n. {}'.format(fold))
            # restrict queries to train_qids and test_qids
            qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
            qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
            # obtain best combination on training queries
            train_score, best_train_weight = max(
                tf_utils.perform_reranking(run, FLAGS.qfield, qtrain, docnos,
                                           doc_embs, word_dict, word_embs,
                                           FLAGS.sweep,
                                           SCORE_NORMALIZERS[FLAGS.normalizer],
                                           FLAGS.ref_measure, evaluator))
            print('fold %d: best_train_weight=%.2f, %s =%.4f' %
                  (fold, best_train_weight, FLAGS.ref_measure, train_score))
            # compute combined run with best combination on test queries
            test_crun = tf_utils.compute_combined_run(
                run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                best_train_weight)
            # evaluate test run
            test_res = evaluator.evaluate(test_crun)
            # compute aggregated measure score for test queries
            test_score = pytrec_eval.compute_aggregated_measure(
                FLAGS.ref_measure,
                [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
            # store averaged scores w/ best weights
            scores_and_weights.append(
                (np.mean([train_score, test_score]), best_train_weight))

        # get (best) weight that produces the highest averaged score
        best_score, best_weight = max(scores_and_weights)
        print('found best weight=%.2f' % (best_weight))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_best_weight_' +
                                        str(FLAGS.best_weight))
        # compute combined run based on test weight
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
        # store ranking in crun
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid,
                             [(score, doc_id)
                              for doc_id, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' +
                             FLAGS.model_name + '_best_weight_' +
                             str(FLAGS.best_weight) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print(
            'evaluate run combined w/ {}-fold cross validation and best weight={}'
            .format(FLAGS.num_folds, FLAGS.best_weight))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_best_weight_' +
                          str(FLAGS.best_weight), qrels_folder,
                          FLAGS.qrels_fname)
Example #15
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', nargs="?", type=str)
    args = parser.parse_args()

    config = json.load(open(args.config, 'r'))

    IR_models = [
        mz.models.list_available()[i] for i in config["index_mz_models"]
    ]

    with open(config["collection_path"] + '/test/qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(config["measures"]))

    bm25_res = json.load(
        open(config["collection_path"] + '/test/' + 'BM25.metrics.json', 'r'))

    with open(config["collection_path"] + '/test/' + 'BM25.res', 'r') as f_run:
        bm25_run = pytrec_eval.parse_run(f_run)

    bm25_results = evaluator.evaluate(bm25_run)

    _ = ""

    for key, value in bm25_res.items():
        if key in config["print_measures"]:
            _ += str(value)[:6] + " & "

    print('BM25 & ' + _[:-2] + '\\\\')

    all_res = dict()
    for model_class in IR_models:

        validation_path = config[
            "collection_path"] + '/validation/' + model_class.__name__
        test_path = config["collection_path"] + '/test/' + model_class.__name__

        if os.path.exists(validation_path) and os.path.exists(test_path):
            best_model = ""
            best_metric = 0
            for file in os.listdir(validation_path):
                if '.json' in file:
                    val_res = json.load(open(validation_path + '/' + file,
                                             'r'))
                    if val_res[config["optim_measure"]] > best_metric:
                        best_model = file
                        best_metric = val_res[config["optim_measure"]]

            if best_model != "" and os.path.exists(test_path + '/' +
                                                   best_model):
                test_res = json.load(open(test_path + '/' + best_model, 'r'))
                all_res[model_class.__name__] = [best_model, test_res]

                with open(
                        config["collection_path"] + '/test/' +
                        model_class.__name__ + '/' + best_model[:-12] + 'res',
                        'r') as f_run:
                    run = pytrec_eval.parse_run(f_run)

                results = evaluator.evaluate(run)

                query_ids = list(
                    set(bm25_results.keys()) & set(results.keys()))

                _ = ""

                for key, value in test_res.items():
                    if key in config["print_measures"]:
                        bm25_scores = [
                            bm25_results[query_id][key]
                            for query_id in query_ids
                        ]
                        scores = [
                            results[query_id][key] for query_id in query_ids
                        ]
                        test = scipy.stats.ttest_rel(bm25_scores, scores)
                        _ += str(value)[:6]
                        if test[0] < 0:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{++}}"
                            elif test[1] < 0.05 / len(
                                    config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{+}}"

                        else:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{-\,-}}"
                            elif test[1] < 0.05 / len(
                                    config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{-}}"

                        _ += " & "

                print(model_class.__name__ + ' & ' + _[:-2] + '\\\\')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)

    args = parser.parse_args()

    print(args, flush=True)

    if not os.path.exists(args.results_path + '/validation/' +
                          args.experiment_name):
        os.makedirs(args.results_path + '/validation/' + args.experiment_name)

    if not os.path.exists(args.results_path + '/test/' + args.experiment_name):
        os.makedirs(args.results_path + '/test/' + args.experiment_name)

    if not os.path.exists(args.plot_path + '/validation/'):
        os.makedirs(args.plot_path + '/validation/')

    if not os.path.exists(args.plot_path + '/test/'):
        os.makedirs(args.plot_path + '/test/')

    #Initializing the results plot values for the different models #HR
    validation_plot_values = dict()
    test_plot_values = dict()

    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        validation_plot_values[model_name] = [[], []]
        test_plot_values[model_name] = [[], []]

    #Loading indexed collection #HR

    Collection = wikIR_Collection.Collection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)

    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1

    # Loading validation and test query relevance values #HR
    with open(args.coll_path + 'validation/qrels', 'r') as f_qrel:
        validation_qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.coll_path + 'test/qrels', 'r') as f_qrel:
        test_qrel = pytrec_eval.parse_qrel(f_qrel)

    print('------------------------------start--------------------------',
          flush=True)

    #Evaluating the baseline models without TDV weights and saving the results of validation and test partitions #HR
    utils.eval_baseline_index_wikir(args.coll_path, Collection,
                                    validation_qrel, test_qrel,
                                    validation_plot_values, test_plot_values,
                                    args.results_path, args.experiment_name, 0)
    #Printing ndcg5 values for validation and test partitions #HR
    ndcg5_val = dict()
    ndcg5_test = dict()
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        ndcg5_val[model_name] = validation_plot_values[model_name][1][0][
            'ndcg_cut_5']
        print("ndcg5 validation ",
              model_name,
              " of collection ",
              os.path.basename(args.coll_path),
              " ",
              ndcg5_val[model_name],
              flush=True)
        ndcg5_test[model_name] = test_plot_values[model_name][1][0][
            'ndcg_cut_5']
        print("ndcg5 test ",
              model_name,
              " of collection ",
              os.path.basename(args.coll_path),
              " ",
              ndcg5_test[model_name],
              flush=True)
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))
    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    index_folder = 'corpus/' + FLAGS.corpus_name + '/index'
    # model_folder = 'corpus/' + FLAGS.corpus_name + '/models/' + FLAGS.model_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
        # if not os.path.exists(model_folder):
        # os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)
    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print(
        'load processed data required to retrofit word vectors and perform retrieval tasks'
    )
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)
    """
	SEMANTIC PROCESSING
	"""

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))
    """
	RETROFITTING
	"""

    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        if FLAGS.syn_weights:
            # convert collection frequencies from list to dict
            cfs = dict(cfs)
        else:
            cfs = None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(
            FLAGS.iterations))
        word_embs = retrofit(word_embs,
                             syns,
                             reverse_word_dict,
                             FLAGS.iterations,
                             alpha=1.0,
                             beta=FLAGS.beta,
                             cfs=cfs)

    # compute doc embeddings
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)

    if not FLAGS.reranking:
        """
		RETRIEVAL
		"""
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict,
                                            word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)
        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        scores = tf_utils.evaluate(
            ['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'], rankings_folder,
            FLAGS.model_name, qrels_folder, FLAGS.qrels_fname)

    else:
        """
		RE-RANKING
		"""
        print('perform re-ranking over top 1000 documents from a baseline run')
        # parse and store qrels
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt',
                  'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, {'P'})  # evaluate on Precision

        # parse input run
        print('parse input run')
        with open(FLAGS.run_path, 'r') as runf:
            run = pytrec_eval.parse_run(runf)

        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # get query ids
        qids = list(q.keys())
        # shuffle query ids
        np.random.shuffle(qids)

        if FLAGS.fixed_gamma:
            # perform re-ranking based on a fixed value of gamma
            print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                            str(FLAGS.fixed_gamma))
            # combine rankings using fixed gamma
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
            # store test ranking in combined run
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, docno)
                          for docno, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_gamma_' +
                                 str(FLAGS.fixed_gamma) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_gamma_' +
                              str(FLAGS.fixed_gamma), qrels_folder,
                              FLAGS.qrels_fname)
        else:
            # learn optimal weight to combine runs
            print("learn optimal weight to combine runs with sweep: {}".format(
                FLAGS.sweep))
            # set variable to store scores and weights
            scores_and_weights = []
            # initialize kfold with FLAGS.num_folds
            kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
            for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
                print('fold n. {}'.format(fold))
                # restrict queries to train_qids and test_qids
                qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
                qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
                # obtain best combination on training queries
                train_score, best_train_weight = max(
                    tf_utils.perform_reranking(
                        run, FLAGS.qfield, qtrain, docnos, doc_embs, word_dict,
                        word_embs, FLAGS.sweep,
                        SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.ref_measure,
                        evaluator))
                print(
                    'fold %d: best_train_weight=%.2f, %s =%.4f' %
                    (fold, best_train_weight, FLAGS.ref_measure, train_score))
                # compute combined run with best combination on test queries
                test_crun = tf_utils.compute_combined_run(
                    run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                    word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                    best_train_weight)
                # evaluate test run
                test_res = evaluator.evaluate(test_crun)
                # compute aggregated measure score for test queries
                test_score = pytrec_eval.compute_aggregated_measure(
                    FLAGS.ref_measure, [
                        qscore[FLAGS.ref_measure]
                        for qscore in test_res.values()
                    ])
                # store averaged scores w/ best weights
                scores_and_weights.append(
                    (np.mean([train_score, test_score]), best_train_weight))

            # get (best) weight that produces the highest averaged score
            best_score, best_weight = max(scores_and_weights)
            print('found best weight=%.2f' % (best_weight))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name +
                                            '_best_weight_' +
                                            str(FLAGS.best_weight))
            # compute combined run based on test weight
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
            # store ranking in crun
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, doc_id)
                          for doc_id, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_best_weight_' +
                                 str(FLAGS.best_weight) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print(
                'evaluate run combined w/ {}-fold cross validation and best weight={}'
                .format(FLAGS.num_folds, FLAGS.best_weight))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_best_weight_' +
                              str(FLAGS.best_weight), qrels_folder,
                              FLAGS.qrels_fname)
Example #18
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--index',
                        type=pyndri.utils.existing_directory_path,
                        required=True)

    parser.add_argument('--limit_queries_for_debug',
                        type=pyndri.utils.positive_int,
                        default=None)

    parser.add_argument('--test_set_size', type=float, default=None)

    parser.add_argument('--num_epochs',
                        type=pyndri.utils.positive_int,
                        default=500)

    parser.add_argument('--queries',
                        type=pyndri.utils.existing_file_path,
                        required=True)
    parser.add_argument('--query_relevance',
                        type=pyndri.utils.existing_file_path,
                        required=True)

    parser.add_argument('--trace_output',
                        type=pyndri.utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    args.index = pyndri.Index(args.index)

    try:
        pyndri.utils.configure_logging(args)
    except IOError:
        return -1

    qrel = {}

    env = RetrievalEnv(args.index, max_num_expanded_query_terms=5)

    with open(args.queries, 'r') as f_queries:
        queries = list(pyndri.utils.read_queries(f_queries).items())

    with open(args.query_relevance, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    queries_idx = np.array(list(range(len(queries))))
    np.random.shuffle(queries_idx)

    if args.limit_queries_for_debug:
        queries_idx = queries_idx[:args.limit_queries_for_debug]

    if args.test_set_size and args.test_set_size > 0:
        train_queries_idx, test_queries_idx = \
            sklearn.model_selection.train_test_split(
                queries_idx, test_size=args.test_set_size)

        logging.info('Split query set into train=%s and test=%s.',
                     train_queries_idx.size, test_queries_idx.size)

        def evaluate(agent):
            episode_count = 1
            max_steps = 10

            logging.info('Evaluating %s using %d queries.', agent,
                         len(test_queries_idx))

            ndcgs = []

            for idx, query_idx in enumerate(test_queries_idx):
                reward = 0
                done = False

                for i in range(episode_count):
                    query_id, query_str = queries[query_idx]
                    ob = env._reset(query_str, qrel[query_id])

                    for _ in range(max_steps):
                        action = agent.act(ob,
                                           reward,
                                           done,
                                           deterministic=True)

                        if action is not None:
                            ob, reward, done, _ = env.step(action)

                        if done:
                            break

                    logging.debug('Query %s: %.4f -> %.4f', query_id,
                                  env.original_utility, env.state['utility'])

                    ndcgs.append(env.state['utility'])

                if idx > 0 and (idx + 1) % 10 == 0:
                    logging.info('Finished %d out of %d queries.', idx + 1,
                                 len(test_queries_idx))

            return ndcgs
    else:
        train_queries_idx = queries_idx

        def evaluate(agent):
            return np.nan,

    if args.trace_output:
        f_trace_out = open(args.trace_output, 'w')
    else:
        f_trace_out = None

    agents = [
        NullAgent(),
        RandomAgent(env.action_space),
        TabularQAgent(env.observation_space, env.action_space)
    ]

    ndcg_per_agent = {}

    for agent in agents:
        if agent.can_learn():
            logging.info('Training %s using %d queries.', agent,
                         len(train_queries_idx))

            avg_rewards = []
            test_set_ndcgs = []

            start_time = time.time()

            for epoch_idx in range(args.num_epochs):
                logging.info('Epoch %d.', epoch_idx + 1)

                np.random.shuffle(train_queries_idx)

                avg_reward = 0.0

                for idx, query_idx in enumerate(train_queries_idx):
                    query_id, query_str = queries[query_idx]
                    relevance = qrel[query_id]

                    logging.debug('Learning from %s.', query_id)

                    total_reward = agent.learn(env, query_str, relevance)

                    if total_reward is not None:
                        avg_reward += total_reward

                    if idx > 0 and (idx + 1) % 500 == 0:
                        logging.info('Finished %d out of %d queries.', idx + 1,
                                     len(train_queries_idx))

                avg_reward /= len(train_queries_idx)
                avg_rewards.append(avg_reward)

                epoch_finish_time = time.time()

                epoch_data = {
                    'agent': agent.name,
                    'epoch_idx': epoch_idx,
                    'train_avg_reward': avg_reward,
                    'seconds_since_start': epoch_finish_time - start_time,
                }

                logging.info('Average rewards: %s', avg_rewards)

                if (epoch_idx + 1) % 10 == 0:
                    test_set_ndcg = np.mean(evaluate(agent))
                    test_set_ndcgs.append(test_set_ndcg)

                    logging.info('Test set NDCGs: %s', test_set_ndcgs)

                    epoch_data['test_set_ndcg'] = test_set_ndcg

                if f_trace_out:
                    f_trace_out.write(json.dumps(epoch_data))
                    f_trace_out.write('\n')

                    f_trace_out.flush()

        ndcgs = evaluate(agent)
        logging.info('NDCG: %.4f', np.mean(ndcgs))

        ndcg_per_agent[agent] = np.mean(ndcgs)

    logging.info('%s', ndcg_per_agent)

    if f_trace_out:
        f_trace_out.close()
def main():
    # parsing arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-f', '--folds', nargs="?", type=int, default=5)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)

    args = parser.parse_args()

    print(args, flush=True)

    # Loading indexed collection
    Collection = TrecCollection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)

    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1
    # Loading relevance judgements from collection
    with open(args.coll_path + 'qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    # ????
    id_titl = Collection.vocabulary['titl']

    for i in range(len(Collection.all_indexed_queries)):

        if Collection.all_indexed_queries[i][0] == id_titl and len(
                Collection.all_indexed_queries[i]) > 1:
            del Collection.all_indexed_queries[i][0]

    for i in range(len(Collection.indexed_queries)):
        for j in range(len(Collection.indexed_queries[i])):
            if Collection.indexed_queries[i][j][0] == id_titl and len(
                    Collection.indexed_queries[i][j]) > 1:
                del Collection.indexed_queries[i][j][0]

    print('---------------------start-------------------', flush=True)
    # Getting collection vocabulary size and total number of elements in collection
    coll_vocab_size, coll_tot_nb_elem = utils.evaluate_inverted_index(
        Collection.inverted_index)
    # Creating, for each fold of a given experiment, directories for the results and plot data
    plot_values_folds_list = []
    for fold in range(args.folds):

        plot_values = dict()

        for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
            plot_values[model_name] = [[], []]

        if not os.path.exists(args.results_path + '/fold' + str(fold) + '/' +
                              args.experiment_name):
            os.makedirs(args.results_path + '/fold' + str(fold) + '/' +
                        args.experiment_name)

        if not os.path.exists(args.plot_path + '/fold' + str(fold) + '/'):
            os.makedirs(args.plot_path + '/fold' + str(fold) + '/')
        # Computing metrics for the baseline models on this fold and updating the plot_values dictionary
        utils.eval_baseline_index_trec(args.coll_path, Collection, fold, qrel,
                                       plot_values, args.results_path,
                                       args.experiment_name, 0)
        # appending plot values to the list
        plot_values_folds_list.append(plot_values)
    #Evaluating baseline models without training.

    ndcg5 = dict()
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        ndcg5[model_name] = []
    for fold in range(args.folds):
        for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
            ndcg5[model_name].append(
                plot_values_folds_list[fold][model_name][1][0]['ndcg_cut_5'])
    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        average = sum(ndcg5[model_name]) / args.folds
        maximum = max(ndcg5[model_name])
        print("ndcg5 ",
              model_name,
              " average of folds",
              average,
              " of collection ",
              os.path.basename(args.coll_path),
              flush=True)
        print("ndcg5 ",
              model_name,
              " max of folds",
              maximum,
              " of collection ",
              os.path.basename(args.coll_path),
              flush=True)

    print("-----------------Finished-------------------", flush=True)
Example #20
def read_qrels(qrels):
    if platform.system().lower().startswith("win"):
        return dict()
    with open(qrels, "r") as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    return qrel
def get_qrels_as_dict(qrel_file):
    assert os.path.exists(qrel_file)
    with open(qrel_file, 'r') as f_qrel:
        qrels = pytrec_eval.parse_qrel(f_qrel)
    return qrels
Example #22
def strips(dataset):
    for id_, (query, text) in dataset.items():
        query = query.split(" ")
        text = text.split(" ")
        id_ = id_
        yield id_, query, text


# Read dataset
with open(dataset_path, "r") as dataset_file:
    dataset = eval(dataset_file.read())

# Transform dataset and get relevent words
dataset_class = list(starmap(QueryText, strips(dataset)))

with open(qrels_path, "r") as f:
    qrels = parse_qrel(f)

# Prepare embeddings
model = fastText.load_model("/local/pouyet/py37/models/wiki.en.bin")
for x in dataset_class:
    x.compute_embedding(model)
    x.qrels = qrels[str(x._id)]

with open(dataset_classes_path, "wb") as f:
    pickle.dump(dataset_class, f)

# Build torch dataset
dataset_torch = KeyWordSelectionDataset(querytext_list=dataset_class)
torch.save(dataset_torch, torchdataset_path)
def main():
    # Enabling eager execution of TensorFlow: it is on by default in version 2 but not in version 1 #HR
    tf.enable_eager_execution()
    #parsing arguments #HR
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-w', '--weights_path', nargs="?", type=str)
    parser.add_argument('-f', '--folds', nargs="?", type=int, default=5)
    parser.add_argument('-e', '--nb_epoch', nargs="?", type=int)
    parser.add_argument('-l', '--l1_weight', nargs="?", type=float)
    parser.add_argument('-d',
                        '--dropout_rate',
                        nargs="?",
                        type=float,
                        default=0.0)
    parser.add_argument('--lr', nargs="?", type=float)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)
    parser.add_argument('--IR_model', nargs="?", type=str, default='tf')
    parser.add_argument('-u', '--update_embeddings', action="store_true")

    args = parser.parse_args()

    print(args, flush=True)

    # Loading indexed collection #HR
    Collection = TrecCollection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)

    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1
    # Loading relevance judgements from collection #HR
    with open(args.coll_path + 'qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    # ???? #HR
    id_titl = Collection.vocabulary['titl']

    for i in range(len(Collection.all_indexed_queries)):

        if Collection.all_indexed_queries[i][0] == id_titl and len(
                Collection.all_indexed_queries[i]) > 1:
            print("found it at ", i, " ", Collection.all_indexed_queries[i][0])
            del Collection.all_indexed_queries[i][0]

    for i in range(len(Collection.indexed_queries)):
        for j in range(len(Collection.indexed_queries[i])):
            if Collection.indexed_queries[i][j][0] == id_titl and len(
                    Collection.indexed_queries[i][j]) > 1:
                del Collection.indexed_queries[i][j][0]

    print('---------------------start-------------------', flush=True)
    # Getting collection vocabulary size and total number of elements in collection #HR
    coll_vocab_size, coll_tot_nb_elem = utils.evaluate_inverted_index(
        Collection.inverted_index)
    # Creating, for each fold of a given experiment, directories for the results, weights and plot data #HR

    for fold in range(args.folds):

        plot_values = dict()

        for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
            plot_values[model_name] = [[], []]

        if not os.path.exists(args.results_path + '/fold' + str(fold) + '/' +
                              args.experiment_name):
            os.makedirs(args.results_path + '/fold' + str(fold) + '/' +
                        args.experiment_name)

        if not os.path.exists(args.weights_path + '/fold' + str(fold) + '/' +
                              args.experiment_name):
            os.makedirs(args.weights_path + '/fold' + str(fold) + '/' +
                        args.experiment_name)

        if not os.path.exists(args.plot_path + '/fold' + str(fold) + '/'):
            os.makedirs(args.plot_path + '/fold' + str(fold) + '/')
        # Computing metrics for the baseline models on this fold and updating the plot_values dictionary #HR
        #HR changed eval_baseline_index to eval_baseline_index_trec;
        # the previous version did not work because of different call parameters
        utils.eval_baseline_index_trec(args.coll_path, Collection, fold, qrel,
                                       plot_values, args.results_path,
                                       args.experiment_name, 0)
        # Saving plot_values dict of a particular fold as a pickle #HR
        pickle.dump(
            plot_values,
            open(
                args.plot_path + '/fold' + str(fold) + '/' +
                args.experiment_name, 'wb'))
        # Initialization of the batch size, the loss function, the optimizer and the model to train #HR
        batch_gen_time = []
        batch_size = 32
        y_true = tf.ones(batch_size, )
        loss_function = tf.keras.losses.Hinge()
        optimizer = tf.keras.optimizers.Adam(args.lr)

        if args.IR_model == 'tf':
            model = differentiable_models.diff_simple_TF(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)

        elif args.IR_model == 'tf_idf':
            model = differentiable_models.diff_TF_IDF(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)

        elif args.IR_model == 'DIR':
            model = differentiable_models.diff_DIR(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)

        elif args.IR_model == 'BM25':
            model = differentiable_models.diff_BM25(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)
        #HR added JM model
        elif args.IR_model == 'JM':
            model = differentiable_models.diff_JM(
                Collection.embedding_matrix, dropout_rate=args.dropout_rate)

        # Training the model #HR
        print("Start training for fold ",
              fold,
              " ",
              args.experiment_name,
              flush=True)
        epoch = 0
        prop_elem_index = 1.0
        while epoch < args.nb_epoch and prop_elem_index > 0.05:

            begin = time.time()
            # generation of batches from the trec collection for training #HR
            query_batches, positive_doc_batches, negative_doc_batches = Collection.generate_training_batches(
                fold, batch_size)

            rank_loss = 0.0
            reg_loss = 0.0
            all_non_zero = 0.0

            begin = time.time()

            for i in range(len(query_batches)):
                with tf.GradientTape() as tape:
                    # reshaping queries, pos_documents and neg_documents into a numpy ndarray #HR
                    queries = tf.keras.preprocessing.sequence.pad_sequences(
                        [
                            Collection.all_indexed_queries[j]
                            for j in query_batches[i]
                        ],
                        padding='post')

                    pos_documents = tf.keras.preprocessing.sequence.pad_sequences(
                        [
                            Collection.indexed_docs[j]
                            for j in positive_doc_batches[i]
                        ],
                        padding='post')

                    neg_documents = tf.keras.preprocessing.sequence.pad_sequences(
                        [
                            Collection.indexed_docs[j]
                            for j in negative_doc_batches[i]
                        ],
                        padding='post')
                    # Creating sparse query, pos_document and neg_document indexes #HR
                    q_sparse_index = [[column, j]
                                      for j, raw in enumerate(queries)
                                      for column in raw]
                    pos_d_sparse_index = [[
                        column, j
                    ] for j, raw in enumerate(pos_documents) for column in raw]
                    neg_d_sparse_index = [[
                        column, j
                    ] for j, raw in enumerate(neg_documents) for column in raw]
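                    # e.g. padded queries [[5, 8], [3, 0]] give q_sparse_index == [[5, 0], [8, 0], [3, 1], [0, 1]]:
                    # (token id, position in batch) pairs, with the padding id 0 included.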
                    # Computing relevance scores and dense representations for the positive and negative documents in the batch #HR
                    pos_res, pos_d = model(
                        np.clip(queries, 0, 1).astype(np.float32), queries,
                        q_sparse_index, pos_documents, pos_d_sparse_index)

                    neg_res, neg_d = model(
                        np.clip(queries, 0, 1).astype(np.float32), queries,
                        q_sparse_index, neg_documents, neg_d_sparse_index)
                    # Computing the hinge loss, the regularization loss and the total loss #HR
                    ranking_loss = loss_function(y_true=y_true,
                                                 y_pred=pos_res - neg_res)

                    regularization_loss = tf.norm(pos_d + neg_d, ord=1)

                    rank_loss += ranking_loss.numpy()
                    reg_loss += regularization_loss.numpy()

                    all_non_zero += tf.math.count_nonzero(pos_d +
                                                          neg_d).numpy()

                    loss = (
                        1.0 - args.l1_weight
                    ) * ranking_loss + args.l1_weight * regularization_loss
                    # Calculating gradients #HR
                    if args.update_embeddings:
                        gradients = tape.gradient(loss,
                                                  model.trainable_variables)
                    else:
                        gradients = tape.gradient(
                            loss, model.trainable_variables[1:])
                # Back propagating the gradients #HR
                if args.update_embeddings:
                    optimizer.apply_gradients(
                        zip(gradients, model.trainable_variables))
                else:
                    optimizer.apply_gradients(
                        zip(gradients, model.trainable_variables[1:]))

            # Computing the TDVs after this epoch and saving them #HR
            weights = model.compute_index()

            pickle.dump(
                weights,
                open(
                    args.weights_path + '/fold' + str(fold) + '/' +
                    args.experiment_name + '/epoch_' + str(epoch), 'wb'))

            inverted_index, redefined_idf, redefined_docs_length, redefined_c_freq = utils.utils_compute_info_retrieval(
                Collection, weights, weighted=True)
            #JF
            #             inverted_index,idf,docs_length,c_freq = utils.compute_info_retrieval(Collection,
            #                                                                                  weights,
            #                                                                                  weighted=False)

            # Computing new vocab_size and total number of elements after introducing the TDV #HR
            vocab_size, tot_nb_elem = utils.evaluate_inverted_index(
                inverted_index)

            print(
                str(100 * vocab_size / coll_vocab_size)[0:5] +
                '% of the vocabulary is kept')
            print(str(100 * tot_nb_elem / coll_tot_nb_elem)[0:5] +
                  '% of the index is kept',
                  flush=True)

            prop_elem_index = tot_nb_elem / coll_tot_nb_elem
            #Evaluating the baseline models with their new inverted index and new idf, doc lengths and collection frequencies #HR
            #HR replaced eval_learned_index with the function eval_learned_index_trec;
            # the previous version did not work because of different call parameters
            utils.eval_learned_index_trec(
                args.coll_path,
                Collection,
                args.IR_model,
                model,
                qrel,
                plot_values,
                args.plot_path,
                fold,
                inverted_index,
                weights,
                redefined_idf,
                redefined_docs_length,
                redefined_c_freq,
                #                                        idf,
                #                                        docs_length,
                #                                        c_freq,
                prop_elem_index,
                args.results_path,
                args.experiment_name,
                epoch + 1)
            epoch += 1
        print("finish training for fold ",
              fold,
              " ",
              args.experiment_name,
              flush=True)  #HR

    print("-----------------Finished-------------------", flush=True)  #HR
Example #24
        qid, qtext = line.strip().split('\t')
        qtext = re.sub(r'[^\w\s]', ' ', qtext)
        qtokens = [word for word in qtext.strip().split(' ') if word != '']
        queries[qid] = qtokens

folds = {}
for collection in collections:
    with open(os.path.join('..', 'queries', 'json',
                           collection + '.json')) as f:
        folds[collection] = json.load(f)

with open(el_path) as f:
    qid_entities = json.load(f)

with open(qrel_path, 'r') as f_qrel:
    qrel = pytrec_eval.parse_qrel(f_qrel)

evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut'})

redirects = {}
with open(redirects_path) as f:
    for line in f:
        if not line.startswith('#'):
            subj, pred, obj = line.split(maxsplit=2)
            obj = obj[:obj.rfind('.')].strip()
            redirects[subj] = obj

with open(ir_run_path, "r") as ir_run_file:
    ir_run = pytrec_eval.parse_run(ir_run_file)

model = Word2Vec.load(args.model)
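
A hedged continuation of the setup above, showing how the parsed run would typically be scored against the qrels and aggregated over queries with pytrec_eval; the cut-off of 10 is an assumed choice, not taken from the original example.

# Sketch: evaluate the loaded run with the ndcg_cut evaluator built above.
results = evaluator.evaluate(ir_run)  # {qid: {'ndcg_cut_5': ..., 'ndcg_cut_10': ..., ...}}
ndcg_10 = pytrec_eval.compute_aggregated_measure(
    'ndcg_cut_10',
    [query_measures['ndcg_cut_10'] for query_measures in results.values()])
print('ndcg_cut_10 over %d queries: %.4f' % (len(results), ndcg_10))
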
Example #25
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--coll_path', nargs="?", type=str)
    parser.add_argument('-i', '--indexed_path', nargs="?", type=str)
    parser.add_argument('-p', '--plot_path', nargs="?", type=str)
    parser.add_argument('-r', '--results_path', nargs="?", type=str)
    parser.add_argument('-w', '--weights_path', nargs="?", type=str)
    parser.add_argument('-e', '--nb_epoch', nargs="?", type=int)
    parser.add_argument('-l', '--l1_weight', nargs="?", type=float)
    parser.add_argument('-n', '--experiment_name', nargs="?", type=str)
    parser.add_argument('-u', '--update_embeddings', action="store_true")
    #HR added the choice of a particular differentiable model; there was no such choice
    # in the original file
    parser.add_argument('--IR_model', nargs="?", type=str, default='tf')
    #HR added the option to choose a learning rate and a dropout rate
    parser.add_argument('--lr', nargs="?", type=float)
    parser.add_argument('-d',
                        '--dropout_rate',
                        nargs="?",
                        type=float,
                        default=0.0)
    args = parser.parse_args()

    print(args, flush=True)

    if not os.path.exists(args.results_path + '/validation/' +
                          args.experiment_name):
        os.makedirs(args.results_path + '/validation/' + args.experiment_name)

    if not os.path.exists(args.results_path + '/test/' + args.experiment_name):
        os.makedirs(args.results_path + '/test/' + args.experiment_name)

    if not os.path.exists(args.weights_path + '/' + args.experiment_name):
        os.makedirs(args.weights_path + '/' + args.experiment_name)

    if not os.path.exists(args.plot_path + '/validation/'):
        os.makedirs(args.plot_path + '/validation/')

    if not os.path.exists(args.plot_path + '/test/'):
        os.makedirs(args.plot_path + '/test/')

    #Initializing the results plot values for the different models #HR
    validation_plot_values = dict()
    test_plot_values = dict()

    for model_name in ['tf', 'tf_idf', 'DIR', 'BM25', 'JM']:
        validation_plot_values[model_name] = [[], []]
        test_plot_values[model_name] = [[], []]

    #Loading indexed collection #HR

    Collection = wikIR_Collection.Collection()
    with open(args.indexed_path, 'rb') as f:
        Collection = pickle.load(f)

    Collection.doc_index[-1] = "-1"
    Collection.doc_index["-1"] = -1

    #Loading validation and test query relevance values #HR
    with open(args.coll_path + 'validation/qrels', 'r') as f_qrel:
        validation_qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.coll_path + 'test/qrels', 'r') as f_qrel:
        test_qrel = pytrec_eval.parse_qrel(f_qrel)

    print('------------------------------start--------------------------',
          flush=True)
    #Computing collection vocabulary size and total number of elements #HR
    coll_vocab_size, coll_tot_nb_elem = utils.evaluate_inverted_index(
        Collection.inverted_index)
    #Evaluating the baseline models without TDV weights and saving the results for the validation and test partitions #HR
    #HR replaced eval_baseline_index with the function eval_baseline_index_wikir;
    # the previous version did not work because of different call parameters
    utils.eval_baseline_index_wikir(args.coll_path, Collection,
                                    validation_qrel, test_qrel,
                                    validation_plot_values, test_plot_values,
                                    args.results_path, args.experiment_name, 0)

    pickle.dump(
        validation_plot_values,
        open(args.plot_path + '/validation/' + args.experiment_name, 'wb'))
    pickle.dump(test_plot_values,
                open(args.plot_path + '/test/' + args.experiment_name, 'wb'))

    #Initialization of the batch size, the loss function and the optimizer for the training #HR
    batch_gen_time = []
    batch_size = 64
    y_true = tf.ones(batch_size)
    loss_function = tf.keras.losses.Hinge()
    optimizer = tf.keras.optimizers.Adam(args.lr)
    #Loading the differentiable model used for the training #HR
    #HR added options for different IR models. In the original version only the
    # simple tf model was present
    if args.IR_model == 'tf':
        model = differentiable_models.diff_simple_TF(
            Collection.embedding_matrix, dropout_rate=args.dropout_rate)
    #HR
    elif args.IR_model == 'tf_idf':
        model = differentiable_models.diff_TF_IDF(
            Collection.embedding_matrix, dropout_rate=args.dropout_rate)
    #HR
    elif args.IR_model == 'DIR':
        model = differentiable_models.diff_DIR(Collection.embedding_matrix,
                                               dropout_rate=args.dropout_rate)
    #HR
    elif args.IR_model == 'BM25':
        model = differentiable_models.diff_BM25(Collection.embedding_matrix,
                                                dropout_rate=args.dropout_rate)
    #HR
    elif args.IR_model == 'JM':
        model = differentiable_models.diff_JM(Collection.embedding_matrix,
                                              dropout_rate=args.dropout_rate)

    #Starting the training
    print("Start training ", args.experiment_name, flush=True)
    epoch = 0
    prop_elem_index = 1.0
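    # Train for at most nb_epoch epochs, stopping early once the pruned index drops to 20% of
    # its original number of elements (a looser threshold than the 5% used in the TREC script).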
    while epoch < args.nb_epoch and prop_elem_index > 0.2:

        begin = time.time()
        #Generating batches from the WikIR collection for training #HR
        query_batches, positive_doc_batches, negative_doc_batches = Collection.generate_training_batches(
            batch_size)

        rank_loss = 0.0
        reg_loss = 0.0
        all_non_zero = 0.0

        begin = time.time()

        for i in range(len(query_batches)):
            with tf.GradientTape() as tape:
                # Reshaping queries, pos_documents and neg_documents into padded numpy ndarrays #HR
                queries = tf.keras.preprocessing.sequence.pad_sequences(
                    [
                        Collection.indexed_training_queries[j]
                        for j in query_batches[i]
                    ],
                    padding='post')
                pos_documents = tf.keras.preprocessing.sequence.pad_sequences(
                    [
                        Collection.indexed_docs[j]
                        for j in positive_doc_batches[i]
                    ],
                    padding='post')

                neg_documents = tf.keras.preprocessing.sequence.pad_sequences(
                    [
                        Collection.indexed_docs[j]
                        for j in negative_doc_batches[i]
                    ],
                    padding='post')
                # Creating sparse query, pos_document and neg_document indexes #HR
                q_sparse_index = [[column, j] for j, raw in enumerate(queries)
                                  for column in raw]
                pos_d_sparse_index = [[column, j]
                                      for j, raw in enumerate(pos_documents)
                                      for column in raw]
                neg_d_sparse_index = [[column, j]
                                      for j, raw in enumerate(neg_documents)
                                      for column in raw]
                # Computing relevance scores and dense representations for the positive and negative documents in the batch #HR
                pos_res, pos_d = model(
                    np.clip(queries, 0, 1).astype(np.float32), queries,
                    q_sparse_index, pos_documents, pos_d_sparse_index)

                neg_res, neg_d = model(
                    np.clip(queries, 0, 1).astype(np.float32), queries,
                    q_sparse_index, neg_documents, neg_d_sparse_index)

                #Computing the hinge loss, the regularization loss and the total loss #HR
                ranking_loss = loss_function(y_true=y_true,
                                             y_pred=pos_res - neg_res)
                regularization_loss = tf.norm(pos_d + neg_d, ord=1)

                rank_loss += ranking_loss.numpy()
                reg_loss += regularization_loss.numpy()

                all_non_zero += tf.math.count_nonzero(pos_d + neg_d).numpy()

                loss = (1.0 - args.l1_weight
                        ) * ranking_loss + args.l1_weight * regularization_loss
                #Calculating gradients #HR
                if args.update_embeddings:
                    gradients = tape.gradient(loss, model.trainable_variables)
                else:
                    gradients = tape.gradient(loss,
                                              model.trainable_variables[1:])
            #Back propagating the gradients #HR
            if args.update_embeddings:
                optimizer.apply_gradients(
                    zip(gradients, model.trainable_variables))
            else:
                optimizer.apply_gradients(
                    zip(gradients, model.trainable_variables[1:]))
        #Computing TDVs and saving them #HR
        weights = model.compute_index()

        pickle.dump(
            weights,
            open('weights/' + args.experiment_name + '/epoch_' + str(epoch),
                 'wb'))
        #updating the inverted index and computing the new idf, doc lengths and collection frequencies #HR
        inverted_index, redefined_idf, redefined_docs_length, redefined_c_freq = utils.compute_info_retrieval(
            Collection, weights, weighted=True)

        # Computing new vocab_size and total number of elements after introducing the TDV #HR
        vocab_size, tot_nb_elem = utils.evaluate_inverted_index(inverted_index)

        print(
            str(100 * vocab_size / coll_vocab_size)[0:5] +
            '% of the vocabulary is kept')
        print(str(100 * tot_nb_elem / coll_tot_nb_elem)[0:5] +
              '% of the index is kept',
              flush=True)

        prop_elem_index = tot_nb_elem / coll_tot_nb_elem

        #Evaluating the baseline models with their new inverted index and new idf, doc lengths and collection frequencies
        #HR replaced eval_learned_index with the function eval_learned_index_wikir;
        # the previous version did not work because of different call parameters
        utils.eval_learned_index_wikir(
            args.coll_path, Collection, args.IR_model, model, validation_qrel,
            test_qrel, validation_plot_values, test_plot_values,
            args.plot_path, inverted_index, redefined_idf,
            redefined_docs_length, redefined_c_freq, prop_elem_index,
            args.results_path, args.experiment_name, epoch)
        epoch += 1
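
Both training scripts report the kept vocabulary and index sizes by slicing the string representation of the percentage (str(...)[0:5]), which keeps a variable number of decimals. A hedged alternative with explicit rounding; the counts below are toy stand-ins for the values computed in the scripts.

# Toy counts standing in for vocab_size / coll_vocab_size and tot_nb_elem / coll_tot_nb_elem.
vocab_size, coll_vocab_size = 52431, 118204
tot_nb_elem, coll_tot_nb_elem = 3412876, 9876543

vocab_kept = 100.0 * vocab_size / coll_vocab_size
index_kept = 100.0 * tot_nb_elem / coll_tot_nb_elem
print('%.2f%% of the vocabulary is kept' % vocab_kept)
print('%.2f%% of the index is kept' % index_kept, flush=True)
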
Example #26
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--qrel',
        default=
        '/Users/woffee/www/emse-apiqa/QA2021/data/QA2021_stackoverflow4_qrel.txt'
    )
    parser.add_argument(
        '--run',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/pyltr_pred.txt')

    args = parser.parse_args()

    print("args.qrel:", args.qrel)
    print("args.run", args.run)

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    final_auc, final_accuracy = calc_auc(args.qrel, args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                               pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    total = len(results.items())
    sum_map = 0.0

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            # print_line(measure, query_id, value)
            pass

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    print("==========")
    selected_measures = [
        'map', 'recip_rank', 'P_5', 'P_10', 'P_15', 'P_20', 'recall_5',
        'recall_10', 'recall_15', 'recall_20', 'ndcg'
    ]

    eva_values = {}
    for measure in selected_measures:
        eva_values[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        # print_line( measure, 'all', eva_values[measure])
    for measure in selected_measures:
        print_line(measure, 'all', eva_values[measure])

    print(
        "%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f"
        % (final_auc, final_accuracy, eva_values['map'],
           eva_values['recip_rank'], eva_values['P_5'], eva_values['P_10'],
           eva_values['P_15'], eva_values['P_20'], eva_values['recall_5'],
           eva_values['recall_10'], eva_values['recall_15'],
           eva_values['recall_20'], eva_values['ndcg']))
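
A hedged usage note for the script above (the script and file names are placeholders):

    python eval_run.py --qrel path/to/qrels.txt --run path/to/run.txt

It prints the selected measures aggregated over all queries, followed by a single tab-separated line with the AUC, the accuracy and the same measures, which is convenient for pasting into a results table.
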