Example #1
    def test_compile_to_fbr(self):
        indexref = pt.IndexRef.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 2 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25")
        pipe_f_fbr = firstpass >> pt.FeaturesBatchRetrieve(indexref, features=["WMODEL:DPH", "WMODEL:PL2"])
        pipe_fbr = pt.FeaturesBatchRetrieve(indexref, wmodel="BM25", features=["WMODEL:DPH", "WMODEL:PL2"])
        pipe_raw = firstpass >> (pt.BatchRetrieve(indexref, wmodel="DPH") ** pt.BatchRetrieve(indexref, wmodel="PL2"))
        input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        res1 = (pipe_f_fbr % 2)(input)
        res2 = (pipe_fbr % 2)(input)
        res3 = (pipe_raw % 2)(input)
        compiled = (pipe_raw % 2).compile()
        print(repr(compiled))
        res4 = compiled(input)
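The excerpt stops before any assertions; presumably the test goes on to check that all four pipelines surface the same feature vectors. A minimal sketch of how the method might continue, assuming each result frame carries a docno column and a features column of equal-length numpy arrays (mirroring Example #2 below):

        import numpy as np
        # hypothetical continuation: all four pipelines should surface the same [DPH, PL2] features
        for other in (res2, res3, res4):
            merged = res1.merge(other, on="docno", suffixes=("_a", "_b"))
            self.assertTrue(np.array_equal(np.stack(merged["features_a"].values),
                                           np.stack(merged["features_b"].values)))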
Example #2
    def test_fbr_reranking(self):
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        # this test examines the use of ScoringMatchingWithFat
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 2 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 2
        pipe = firstpass >> pt.FeaturesBatchRetrieve(
            indexref, features=["WMODEL:DPH", "WMODEL:PL2"])
        input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result = pipe.transform(input)
        self.assertTrue("qid" in result.columns)
        self.assertTrue("docno" in result.columns)
        self.assertTrue("score" in result.columns)
        self.assertTrue("features" in result.columns)
        self.assertEqual(2, len(result))
        self.assertEqual(result.iloc[0]["features"].size, 2)

        pipe_simple = firstpass >> (pt.BatchRetrieve(indexref, wmodel="DPH")**
                                    pt.BatchRetrieve(indexref, wmodel="PL2"))
        result2 = pipe_simple.transform(input)
        import numpy as np
        f1 = np.stack(result["features"].values)
        f2 = np.stack(result2["features"].values)
        self.assertTrue(np.array_equal(f1, f2))
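The feature-union operator ** runs each retriever over the incoming candidate set and concatenates their scores into a single features column, which is why pipe_simple is expected to reproduce the FeaturesBatchRetrieve output. A small sketch of inspecting one such vector, assuming the installed PyTerrier version provides the Transformer.search() convenience method:

feature_pipe = (pt.BatchRetrieve(indexref, wmodel="BM25") % 2) >> (
    pt.BatchRetrieve(indexref, wmodel="DPH") ** pt.BatchRetrieve(indexref, wmodel="PL2"))
out = feature_pipe.search("Stability")
# each row carries a numpy array of length 2: [DPH score, PL2 score]
print(out.iloc[0]["features"])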
Example #3
    def test_xgltr_pipeline(self):
        try:
            import xgboost as xgb
        except ImportError:
            self.skipTest("xgboost not installed")

        xgparams = {
            'objective': 'rank:ndcg',
            'learning_rate': 0.1,
            'gamma': 1.0, 'min_child_weight': 0.1,
            'max_depth': 6,
            'verbose': 2,
            'random_state': 42
        }

        topics = pt.io.read_topics(self.here + "/fixtures/vaswani_npl/query_light.trec").head(5)
        qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")

        pipeline = pt.FeaturesBatchRetrieve(self.here + "/fixtures/index/data.properties", ["WMODEL:PL2", "WMODEL:BM25"], controls={"wmodel" : "DPH"}) >> \
            pt.ltr.apply_learned_model(xgb.sklearn.XGBRanker(**xgparams), form="ltr")
        
        pipeline.fit(topics, qrels, topics, qrels)
        pt.Utils.evaluate(
            pipeline.transform(topics),
            qrels
        )
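pt.Utils.evaluate returns measures for a single run; when comparing the learned pipeline against its first-stage retriever, pt.Experiment is the more common entry point. A hedged sketch of how this test could be extended with the same topics and qrels; the baseline retriever here is illustrative, not part of the original test:

        baseline = pt.BatchRetrieve(self.here + "/fixtures/index/data.properties", wmodel="DPH")
        pt.Experiment(
            [baseline, pipeline],
            topics,
            qrels,
            eval_metrics=["map", "ndcg_cut_10"],
            names=["DPH", "DPH >> XGBRanker"])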
Example #4
    def test_fbr_empty(self):
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        retr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:PL2"], wmodel="DPH")
        input = pd.DataFrame([["1", ""]], columns=['qid', 'query'])
        with warnings.catch_warnings(record=True) as w:
            result = retr.transform(input)
            assert "Skipping empty query" in str(w[-1].message)
        self.assertTrue(len(result) == 0)
Example #5
    def test_ltr_pipeline_feature_change(self):
        from sklearn.ensemble import RandomForestClassifier

        topics = pt.io.read_topics(self.here + "/fixtures/vaswani_npl/query_light.trec").head(5)
        qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")

        rf = RandomForestClassifier()

        pipeline = pt.FeaturesBatchRetrieve(self.here + "/fixtures/index/data.properties", ["WMODEL:PL2", "WMODEL:BM25"], controls={"wmodel" : "DPH"}) >> \
            pt.ltr.apply_learned_model(rf)
        
        pipeline.fit(topics, qrels)
        pipeline.transform(topics)

        pipeline2 = pt.FeaturesBatchRetrieve(self.here + "/fixtures/index/data.properties", ["WMODEL:PL2", "WMODEL:BM25", "WMODEL:Dl"], controls={"wmodel" : "DPH"}) >> \
            pt.ltr.apply_learned_model(rf)
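        # rf was fitted above on 2-dimensional feature vectors; pipeline2 now yields
        # 3 features per document, so scikit-learn raises ValueError on the shape mismatch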
        with self.assertRaises(ValueError):
            pipeline2.transform(topics)
Example #6
    def test_fbr_ltr(self):
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        retr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:PL2"], wmodel="DPH")
        topics = pt.io.read_topics(self.here + "/fixtures/vaswani_npl/query-text.trec").head(3)
        qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")
        res = retr.transform(topics)
        res = res.merge(qrels, on=['qid', 'docno'], how='left').fillna(0)
        from sklearn.ensemble import RandomForestClassifier
        import numpy as np
        #print(res.dtypes)
        RandomForestClassifier(n_estimators=10).fit(np.stack(res["features"]), res["label"])
Example #7
    def test_fbr_reranking2(self):
        if not pt.check_version("5.4"):
            self.skipTest("Requires Terrier 5.4")
        # this test examines the use of ScoringMatchingWithFat, using a particular case known to fail with Terrier 5.3
        JIR = pt.Class('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 3 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 3
        pipe1 = firstpass >> pt.FeaturesBatchRetrieve(indexref,
                                                      features=["WMODEL:PL2"])
        pipe2 = firstpass >> pt.BatchRetrieve(indexref, wmodel="PL2")

        input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result0 = firstpass.transform(input)
        result1 = pipe1.transform(input)
        result2 = pipe2.transform(input)

        result1["feature0"] = result1.apply(lambda row: row["features"][0],
                                            axis=1)
        #BM25 score
        result0_map = {row.docno: row.score for row in result0.itertuples()}
        result1S_map = {row.docno: row.score for row in result1.itertuples()}
        #PL2 score
        result1F_map = {
            row.docno: row.feature0
            for row in result1.itertuples()
        }
        result2_map = {row.docno: row.score for row in result2.itertuples()}

        print(result1F_map)
        print(result2_map)

        # check features scores
        # NB: places can go no less than 4, as two documents have similar PL2 scores
        for rank, row in enumerate(result0.itertuples()):
            docno = row.docno
            # check that score is unchanged
            self.assertAlmostEqual(
                result1S_map[docno],
                result0_map[docno],
                msg="input score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
            #  check that feature score is correct
            self.assertAlmostEqual(
                result1F_map[docno],
                result2_map[docno],
                msg="feature score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
Example #8
    def test_ltr_pipeline(self):
        from sklearn.ensemble import RandomForestClassifier

        topics = pt.io.read_topics(self.here + "/fixtures/vaswani_npl/query_light.trec").head(5)
        qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")

        pipeline = pt.FeaturesBatchRetrieve(self.here + "/fixtures/index/data.properties", ["WMODEL:PL2", "WMODEL:BM25"], controls={"wmodel" : "DPH"}) >> \
            pt.ltr.apply_learned_model(RandomForestClassifier())
        
        pipeline.fit(topics, qrels)
        pt.Utils.evaluate(
            pipeline.transform(topics),
            qrels,
        )
Example #9
    def test_fbr(self):
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        retr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:PL2"], wmodel="DPH")
        input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result = retr.transform(input)
        self.assertTrue("qid" in result.columns)
        self.assertTrue("docno" in result.columns)
        self.assertTrue("score" in result.columns)
        self.assertTrue("features" in result.columns)
        self.assertTrue(len(result) > 0)
        self.assertEqual(result.iloc[0]["features"].size, 1)

        retrBasic = pt.BatchRetrieve(indexref)
        if "matching" in retrBasic.controls:
            self.assertNotEqual(retrBasic.controls["matching"], "FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull")
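The final assertion guards against the fat scoring configuration leaking from FeaturesBatchRetrieve into a plain BatchRetrieve built afterwards. A hedged complementary check on the feature retriever itself, assuming it exposes the same controls dictionary used above:

        # hypothetical positive counterpart: the feature retriever should carry the fat matching control
        if "matching" in retr.controls:
            self.assertTrue("FatFeaturedScoringMatching" in retr.controls["matching"])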
Example #10
    def test_fastrank(self):
        import fastrank
        train_request = fastrank.TrainRequest.coordinate_ascent()
        params = train_request.params
        params.init_random = True
        params.normalize = True
        params.seed = 1234567

        topics = pt.io.read_topics(
            self.here + "/fixtures/vaswani_npl/query_light.trec").head(5)
        qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")

        pipeline = pt.FeaturesBatchRetrieve(self.here + "/fixtures/index/data.properties", ["WMODEL:PL2", "WMODEL:BM25"], controls={"wmodel" : "DPH"}) >> \
            pt.ltr.apply_learned_model(train_request, form="fastrank")

        pipeline.fit(topics, qrels, topics, qrels)
        pt.Utils.evaluate(pipeline.transform(topics), qrels)
Example #11
    def _fbr(self, pickler):
        vaswani = pt.datasets.get_dataset("vaswani")
        br = pt.FeaturesBatchRetrieve(vaswani.get_index(),
                                      wmodel="BM25",
                                      features=["WMODEL:DPH"],
                                      controls={"c": 0.75},
                                      num_results=15)
        q = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"])
        res1 = br(q)
        br2 = pickler.loads(pickler.dumps(br))

        self.assertEqual("BM25", br2.controls["wmodel"])
        self.assertEqual(br.controls, br2.controls)
        self.assertEqual(br.properties, br2.properties)
        self.assertEqual(br.metadata, br2.metadata)
        self.assertEqual(br.features, br2.features)

        self.assertEqual(repr(br), repr(br2))
        res2 = br2(q)

        pd.testing.assert_frame_equal(res1, res2)
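_fbr takes the pickling module as a parameter, so the same round-trip checks can presumably be reused for several serializers. A minimal sketch of hypothetical driver tests, assuming the standard pickle module and, optionally, dill:

    def test_fbr_pickle(self):
        import pickle
        self._fbr(pickle)

    def test_fbr_dill(self):
        try:
            import dill
        except ImportError:
            self.skipTest("dill not installed")
        self._fbr(dill)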
Example #12
    def test_xgltr_pipeline(self):
        import xgboost as xgb

        xgparams = {
            'objective': 'rank:ndcg',
            'learning_rate': 0.1,
            'gamma': 1.0,
            'min_child_weight': 0.1,
            'max_depth': 6,
            'verbose': 2,
            'random_state': 42
        }

        topics = pt.Utils.parse_trec_topics_file(
            self.here + "/fixtures/vaswani_npl/query_light.trec").head(5)
        qrels = pt.Utils.parse_qrels(self.here + "/fixtures/vaswani_npl/qrels")

        pipeline = pt.FeaturesBatchRetrieve(self.here + "/fixtures/index/data.properties", ["WMODEL:PL2", "WMODEL:BM25"], controls={"wmodel" : "DPH"}) >> \
            pt.XGBoostLTR_pipeline(xgb.sklearn.XGBRanker(**xgparams))

        pipeline.fit(topics, qrels, topics, qrels)
        pt.Utils.evaluate(pipeline.transform(topics), qrels)
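Example #12 is an older spelling of Example #3: pt.Utils.parse_trec_topics_file, pt.Utils.parse_qrels and pt.XGBoostLTR_pipeline were superseded by pt.io.read_topics, pt.io.read_qrels and pt.ltr.apply_learned_model(..., form="ltr"). A sketch of the equivalent lines in the newer API, mirroring Example #3:

        topics = pt.io.read_topics(self.here + "/fixtures/vaswani_npl/query_light.trec").head(5)
        qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")
        pipeline = pt.FeaturesBatchRetrieve(self.here + "/fixtures/index/data.properties", ["WMODEL:PL2", "WMODEL:BM25"], controls={"wmodel": "DPH"}) >> \
            pt.ltr.apply_learned_model(xgb.sklearn.XGBRanker(**xgparams), form="ltr")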
Example #13
def main(algorithm=LAMBDAMART,
         feat_batch=FEATURES_BATCH_N,
         top_n_train=TOP_N_TRAIN,
         top_n_validation=TOP_N_TRAIN,
         run_id=RUN_ID):

    if not pt.started():
        pt.init(mem=8000)

    ################
    ## INDEX STEP ##
    ################

    dataset = pt.get_dataset("trec-deep-learning-passages")

    def msmarco_generate():
        with pt.io.autoopen(dataset.get_corpus()[0], 'rt') as corpusfile:
            for l in corpusfile:
                docno, passage = l.split("\t")
                yield {'docno': docno, 'text': passage}

    try:
        print("Indexing MSMARCO passage ranking dataset")
        print(
            "If the index has not been constructed yet but the MSMARCO dataset has been downloaded previously, it is recommended to place the collection.tar.gz in the \"/Users/{username}/.pyterrier/corpora/trec-deep-learning-passages\" directory. This ensures PyTerrier uses the local file instead of downloading the corpus from the internet."
        )
        # Single threaded indexing
        # iter_indexer = pt.IterDictIndexer("./passage_index")
        # indexref3 = iter_indexer.index(msmarco_generate(), meta=['docno', 'text'], meta_lengths=[20, 4096])
        print(
            "Performing multi-threaded indexing; if this does not work on your system (most likely on Windows), uncomment the two lines above this print statement and comment out the two lines below it to index on a single thread."
        )
        # Multi threaded indexing, UNIX-based systems only!!!!!
        iter_indexer = pt.IterDictIndexer("./passage_index_8", threads=8)
        indexref4 = iter_indexer.index(msmarco_generate(),
                                       meta=['docno', 'text'],
                                       meta_lengths=[20, 4096])

    except ValueError as err:
        if "Index already exists" in str(err):
            print("Index already exists, loading existing one")
            indexref4 = "./passage_index_8/data.properties"
        else:
            raise

    pt.logging('WARN')
    index = pt.IndexFactory.of(indexref4)
    print(index.getCollectionStatistics().toString())

    ################
    ## DATA PREP  ##
    ################

    # Load topics as df: [qid, query]
    # load qrels as df: [qid, docno, label]
    def load_qrels_file(path):
        df = pd.read_csv(path,
                         sep='\t',
                         names=['qid', 'q0', 'docno', 'label'],
                         dtype={
                             'qid': str,
                             'q0': str,
                             'docno': str,
                             'label': np.int32
                         })
        del df['q0']
        return df

    def load_topics_file(path):
        df = pd.read_csv(path,
                         sep='\t',
                         names=['qid', 'query'],
                         dtype={
                             'qid': str,
                             'query': str
                         })
        exclude = set(string.punctuation)
        # Remove punctuation
        # print(exclude)
        df['query'] = df['query'].apply(
            lambda s: ''.join(ch for ch in s if ch not in exclude))
        # print(df['query'][:6])
        return df

    def filter_train_qrels(train_topics_subset, train_qrels):
        m = train_qrels.qid.isin(train_topics_subset.qid)
        return train_qrels[m]

    print('Loading train/validation topics and qrels')
    print(
        "Looking for the query files in the collections/msmarco-passage/ directory; make sure the query files are located there..."
    )
    train_topics = load_topics_file(
        'collections/msmarco-passage/queries.train.tsv')
    train_qrels = load_qrels_file(
        'collections/msmarco-passage/qrels.train.tsv')
    validation_topics = load_topics_file(
        'collections/msmarco-passage/queries.dev.small.tsv')
    validation_qrels = load_qrels_file(
        'collections/msmarco-passage/qrels.dev.small.tsv')
    test_topics = load_topics_file(
        'collections/msmarco-passage/msmarco-test2019-queries.tsv')

    print('Getting first {} train topics and corresponding qrels'.format(
        top_n_train))
    # TODO: not all queries here have qrels... Maybe filter on first 100 that have qrels?
    if int(top_n_train) > 0:
        train_sub = train_topics[:top_n_train].copy()
        train_qrels_sub = filter_train_qrels(train_sub, train_qrels)
    else:
        train_sub = train_topics
        train_qrels_sub = train_qrels

    print('Getting first {} validation topics and corresponding qrels'.format(
        top_n_validation))
    if int(top_n_validation) > 0:
        validation_sub = validation_topics[:top_n_validation].copy()
        validation_qrels_sub = filter_train_qrels(validation_sub,
                                                  validation_qrels)
    else:
        validation_sub = validation_topics
        validation_qrels_sub = validation_qrels
    # print(train_qrels_sub)

    ##############
    ## TRAINING ##
    ##############

    print('Setting up FeaturesBatchRetrieve')

    pipeline = pt.FeaturesBatchRetrieve(
        index,
        wmodel="BM25",
        features=[
            "SAMPLE", "WMODEL:Tf", "WMODEL:PL2", "WMODEL:TF_IDF",
            "WMODEL:DLH13", "WMODEL:Hiemstra_LM"
        ]) % feat_batch
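    # "SAMPLE" keeps the first-pass BM25 score as a feature alongside the listed
    # WMODEL:* features; "% feat_batch" then truncates each query's candidate set
    # to feat_batch documents before the learned model is trained or applied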

    #### LAMBDAMART
    print('Configuring Ranker...')
    # this configures LightGBM as LambdaMART
    lmart_l = lgb.LGBMRanker(
        task="train",
        # min_data_in_leaf=1,
        # min_sum_hessian_in_leaf=100,
        # max_bin=255,
        num_leaves=7,
        objective="lambdarank",
        metric="ndcg",
        # ndcg_eval_at=[1, 3, 5, 10],
        learning_rate=.1,
        importance_type="gain",
        # num_iterations=10,
        silent=False,
        n_jobs=-1)

    # lmart_x = xgb.sklearn.XGBRanker(objective='rank:ndcg',
    #       learning_rate=0.1,
    #       gamma=1.0,
    #       min_child_weight=0.1,
    #       max_depth=6,
    #       verbose=2,
    #       random_state=42)

    print('''\n
    ########################################
    ###### Training pipeline summary: ######
    ########################################

    Train Topics: {}
    Train Qrels: {}
    Validation topics: {}
    Validation Qrels: {}
    Number of passage samples per query: {}

    ########################################

    '''.format(train_sub.shape[0], train_qrels_sub.shape[0],
               validation_sub.shape[0], validation_qrels_sub.shape[0],
               feat_batch))

    start = time.time()
    print(
        "Model output is not rendered to the terminal until after the run is finished..."
    )
    if algorithm.upper() == LAMBDAMART:
        print('Training LambdaMART pipeline')

        # ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
        # ltr_pipeline.fit(train_sub, train_qrels_sub, validation_topics, validation_qrels)

        ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(lmart_l,
                                                              form="ltr")
        ltr_pipeline.fit_kwargs = {'verbose': 1}
        ltr_pipeline.fit(train_sub, train_qrels_sub, validation_sub,
                         validation_qrels_sub)
        model_name = "LambdaRANK"

    elif algorithm.upper() == RANDOM_FOREST:
        # RANDOM FOREST
        print('Training RandomForest pipeline')
        rf_model = RandomForestRegressor(n_jobs=-1, verbose=10)
        ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(rf_model)
        ltr_pipeline.fit(train_sub, train_qrels_sub, validation_sub,
                         validation_qrels_sub)
        model_name = 'RandomForest'
    else:
        print("ERROR: passed invalid algorithm as parameters")
        sys.exit(1)

    ### End of training ###

    end = time.time()
    print('Training finished, time elapsed:', end - start, 'seconds...')

    ###########################
    ## RERANKING AND OUTPUT  ##
    ###########################

    # Output models to pickle files

    # pipeline_filename = '{}_pipeline_{}_{}_{}.p'.format(model_name, train_sub.shape[0], validation_sub.shape[0], run_id)
    # print('Exporting learned pipline to:', pipeline_filename)
    # pickle.dump(ltr_pipeline, open(pipeline_filename, "wb"))

    model_filename = '{}_model_{}_{}_{}.p'.format(model_name,
                                                  train_sub.shape[0],
                                                  validation_sub.shape[0],
                                                  run_id)
    print('Exporting l2r model to:', model_filename)
    if algorithm.upper() == LAMBDAMART:
        pickle.dump(lmart_l, open(model_filename, "wb"))
    else:
        pickle.dump(rf_model, open(model_filename, "wb"))

    print('Running test evaluation...')

    # Test on small subset
    # res = ltr_pipeline.transform(test_topics[:10].copy())

    # Test on entire testset
    start = time.time()
    res = ltr_pipeline.transform(test_topics)
    end = time.time()
    print('Test evaluation finished, time elapsed:', end - start, 'seconds...')

    print('Writing results...')
    output_file_path = './{}_results_{}.trec'.format(model_name, str(run_id))
    pt.io.write_results(res, output_file_path, format='trec')

    print('SUCCESS: results can be found at:', output_file_path)
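The 2019 test queries have no qrels loaded in this script, so the run is written out for external scoring. A quick in-process sanity check could instead reuse the validation subset loaded earlier; a hedged sketch that could be appended inside main(), assuming pt.Utils.evaluate accepts a metrics list as in the test examples above:

    # optional in-process sanity check on the validation subset loaded earlier
    val_res = ltr_pipeline.transform(validation_sub)
    print(pt.Utils.evaluate(val_res, validation_qrels_sub, metrics=["map", "recip_rank"]))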