Example #1
def _pt_init(args):
    import pyterrier as pt
    if not pt.started():
        pt.init(no_download=True, **args)
    else:
        from warnings import warn
        warn("Avoiding reinit of PyTerrier")
Example #2
 def __init__(self, *args, **kwargs):
     super(TestTRECIndexer, self).__init__(*args, **kwargs)
     if not pt.started():
         pt.init(logging="DEBUG")
     # else:
     #     pt.setup_logging("DEBUG")
     self.here = os.path.dirname(os.path.realpath(__file__))
Example #3
 def __init__(self, *args, **kwargs):
     super(BaseTestCase, self).__init__(*args, **kwargs)
     terrier_version = os.environ.get("TERRIER_VERSION", None)
     if terrier_version is not None:
         print("Testing with Terrier version" + terrier_version)
     if not pt.started():
         pt.init(version=terrier_version)
     self.here = os.path.dirname(os.path.realpath(__file__))
Example #4
 def __init__(self, *args, **kwargs):
     super(BaseTestCase, self).__init__(*args, **kwargs)
     terrier_version = os.environ.get("TERRIER_VERSION", None)
     java_bridge = os.environ.get("JAVA_BRIDGE", "jpype")
     print("Using " + java_bridge + " as java bridge")
     if terrier_version is not None:
         print("Testing with Terrier version " + terrier_version)
     if not pt.started():
         pt.init(version=terrier_version, java_bridge=java_bridge)
     self.here = os.path.dirname(os.path.realpath(__file__))
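A brief usage sketch for this test base class: the two environment variables it reads can be set before launching the suite. The version string below is a hypothetical pin, and "jpype" is simply the default the snippet falls back to:

import os
os.environ["TERRIER_VERSION"] = "5.7"  # hypothetical version pin
os.environ["JAVA_BRIDGE"] = "jpype"    # the snippet's default
# then run the tests as usual, e.g.:  python -m unittest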
Example #5
 def test_monot5_vaswani(self):
     if not pt.started():
         pt.init()
     bm25 = pt.BatchRetrieve(pt.get_dataset('vaswani').get_index(),
                             wmodel='BM25')
     monoT5 = pyterrier_t5.MonoT5ReRanker()
     pipeline = bm25 % 20 >> pt.text.get_text(
         pt.get_dataset('irds:vaswani'), 'text') >> monoT5
     result = pipeline.search('fluid dynamics')
     self.assertEqual(result.iloc[0]['docno'], '11216')
     self.assertAlmostEqual(result.iloc[0]['score'], -2.186261, places=4)
     self.assertEqual(result.iloc[0]['rank'], 0)
     self.assertEqual(result.iloc[1]['docno'], '5299')
     self.assertAlmostEqual(result.iloc[1]['score'], -8.078399, places=4)
     self.assertEqual(result.iloc[1]['rank'], 1)
     self.assertEqual(result.iloc[-1]['docno'], '3442')
     self.assertAlmostEqual(result.iloc[-1]['score'], -12.725513, places=4)
     self.assertEqual(result.iloc[-1]['rank'], 19)
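Beyond a single search() call, the same composed pipeline (a top-20 BM25 cutoff via %, a text fetch, then the monoT5 re-ranker via >>) can also be evaluated against its baseline over the full topic set. A minimal sketch using pt.Experiment; the metric choice is an assumption:

dataset = pt.get_dataset('vaswani')
pt.Experiment(
    [bm25, pipeline],              # BM25 baseline vs. re-ranked pipeline
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=["map", "ndcg"])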
Example #6
 def test_duot5_vaswani(self):
     if not pt.started():
         pt.init()
     bm25 = pt.BatchRetrieve(pt.get_dataset('vaswani').get_index(),
                             wmodel='BM25')
     duoT5 = pyterrier_t5.DuoT5ReRanker()
     pipeline = bm25 % 10 >> pt.text.get_text(
         pt.get_dataset('irds:vaswani'), 'text') >> duoT5
     result = pipeline.search('fluid dynamics')
     self.assertEqual(result.iloc[0]['docno'], '9731')
     self.assertAlmostEqual(result.iloc[0]['score'], 44.621585, places=4)
     self.assertEqual(result.iloc[0]['rank'], 0)
     self.assertEqual(result.iloc[1]['docno'], '7045')
     self.assertAlmostEqual(result.iloc[1]['score'], 27.716750, places=4)
     self.assertEqual(result.iloc[1]['rank'], 1)
     self.assertEqual(result.iloc[-1]['docno'], '4767')
     self.assertAlmostEqual(result.iloc[-1]['score'], -9.916206, places=4)
     self.assertEqual(result.iloc[-1]['rank'], 9)
Example #7
def main():
    if not pt.started():
        pt.init()

    #   Valid Stemmers: "porter", "snowball" and ""
    #   Valid Datasets: "vaswani", "trec-deep-learning-docs"

    # Args
    argv = sys.argv 

    if len(argv) < 5:
        print("Fewer than 4 arguments provided, quitting the program.")
        return 1

    dataset = argv[1]
    index_loc = argv[2]
    stemmer = argv[3]
    only_retr = argv[4] == 'T'

    time_taken, evaluation = conduct_experiment(dataset, index_loc, stemmer, only_retr)
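A hedged invocation sketch for this script; the file name and the argument values are hypothetical (dataset, index location, stemmer, and the only-retrieval flag, in that order):

# python experiment.py vaswani ./index porter F
if __name__ == "__main__":
    sys.exit(main())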
Example #8
 def __init__(self, *args, **kwargs):
     super(TestOperators, self).__init__(*args, **kwargs)
     if not pt.started():
         pt.init()
Example #9
 def setUp(self):
     import pyterrier as pt
     if not pt.started():
         pt.init()
     self.test_dir = tempfile.mkdtemp()
Example #10
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
import sphinx_rtd_theme

# -- Dataset table listing -----------------------------------------------------
import pyterrier as pt
import textwrap
pt.init()
df = pt.list_datasets()


def _wrap(text, width):
    return text
    #return '\\\n_'.join(textwrap.wrap(text, width=width))


def _get_text(row, name, width):
    value = row.get(name)
    if not isinstance(value, list):
        return value
    if len(value) == 1:
        return value
    return '[' + _wrap(', '.join(value), width=width) + ']'
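As a usage sketch, these helpers would be applied per row when rendering the dataset table; the column names "dataset" and "topics" are assumptions about the frame returned by pt.list_datasets(), and the widths are arbitrary:

for _, row in df.iterrows():
    print(_get_text(row, "dataset", 20), _get_text(row, "topics", 40))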
Example #11
def main(algorithm=LAMBDAMART,
         feat_batch=FEATURES_BATCH_N,
         top_n_train=TOP_N_TRAIN,
         top_n_validation=TOP_N_TRAIN,
         run_id=RUN_ID):

    if not pt.started():
        pt.init(mem=8000)

    ################
    ## INDEX STEP ##
    ################

    dataset = pt.get_dataset("trec-deep-learning-passages")

    def msmarco_generate():
        with pt.io.autoopen(dataset.get_corpus()[0], 'rt') as corpusfile:
            for l in corpusfile:
                docno, passage = l.split("\t")
                yield {'docno': docno, 'text': passage}

    try:
        print("Indexing MSMARCO passage ranking dataset")
        print(
            "If the index has not been constructed yet but the MSMARCO dataset has been downloaded previously, it is recommended to place collection.tar.gz in the \"/Users/{username}/.pyterrier/corpora/trec-deep-learning-passages\" directory. This ensures PyTerrier uses the local file instead of downloading the corpus from the internet."
        )
        # Single threaded indexing
        # iter_indexer = pt.IterDictIndexer("./passage_index")
        # indexref3 = iter_indexer.index(msmarco_generate(), meta=['docno', 'text'], meta_lengths=[20, 4096])
        print(
            "Performing multi-threaded indexing. If this fails on your system (likely on Windows), comment out the two lines below this statement and uncomment the two single-threaded lines above."
        )
        # Multi threaded indexing, UNIX-based systems only!!!!!
        iter_indexer = pt.IterDictIndexer("./passage_index_8", threads=8)
        indexref4 = iter_indexer.index(msmarco_generate(),
                                       meta=['docno', 'text'],
                                       meta_lengths=[20, 4096])

    except ValueError as err:
        if "Index already exists" in str(err):
            print("Index already exists, loading existing one")
            indexref4 = "./passage_index_8/data.properties"
        else:
            raise

    pt.logging('WARN')
    index = pt.IndexFactory.of(indexref4)
    print(index.getCollectionStatistics().toString())

    ################
    ## DATA PREP  ##
    ################

    # Load topics as df: [qid, query]
    # load qrels as df: [qid, docno, label]
    def load_qrels_file(path):
        df = pd.read_csv(path,
                         sep='\t',
                         names=['qid', 'q0', 'docno', 'label'],
                         dtype={
                             'qid': str,
                             'q0': str,
                             'docno': str,
                             'label': np.int32
                         })
        del df['q0']
        return df

    def load_topics_file(path):
        df = pd.read_csv(path,
                         sep='\t',
                         names=['qid', 'query'],
                         dtype={
                             'qid': str,
                             'query': str
                         })
        exclude = set(string.punctuation)
        # Remove punctuation
        # print(exclude)
        df['query'] = df['query'].apply(
            lambda s: ''.join(ch for ch in s if ch not in exclude))
        # print(df['query'][:6])
        return df

    def filter_train_qrels(train_topics_subset, train_qrels):
        m = train_qrels.qid.isin(train_topics_subset.qid)
        return train_qrels[m]

    print('Loading train/validation topics and qrels')
    print(
        "Looking for the query files in the following directory: collections/msmarco-passage/, make sure to have the query files located there..."
    )
    train_topics = load_topics_file(
        'collections/msmarco-passage/queries.train.tsv')
    train_qrels = load_qrels_file(
        'collections/msmarco-passage/qrels.train.tsv')
    validation_topics = load_topics_file(
        'collections/msmarco-passage/queries.dev.small.tsv')
    validation_qrels = load_qrels_file(
        'collections/msmarco-passage/qrels.dev.small.tsv')
    test_topics = load_topics_file(
        'collections/msmarco-passage/msmarco-test2019-queries.tsv')

    print('Getting first {} train topics and corresponding qrels'.format(
        top_n_train))
    # TODO: not all queries here have qrels... Maybe filter on first 100 that have qrels?
    if int(top_n_train) > 0:
        train_sub = train_topics[:top_n_train].copy()
        train_qrels_sub = filter_train_qrels(train_sub, train_qrels)
    else:
        train_sub = train_topics
        train_qrels_sub = train_qrels

    print('Getting first {} validation topics and corresponding qrels'.format(
        top_n_validation))
    if int(top_n_validation) > 0:
        validation_sub = validation_topics[:top_n_validation].copy()
        validation_qrels_sub = filter_train_qrels(validation_sub,
                                                  validation_qrels)
    else:
        validation_sub = validation_topics
        validation_qrels_sub = validation_qrels
    # print(train_qrels_sub)

    ##############
    ## TRAINING ##
    ##############

    print('Setting up FeaturesBatchRetriever')

    pipeline = pt.FeaturesBatchRetrieve(
        index,
        wmodel="BM25",
        features=[
            "SAMPLE", "WMODEL:Tf", "WMODEL:PL2", "WMODEL:TF_IDF",
            "WMODEL:DLH13", "WMODEL:Hiemstra_LM"
        ]) % feat_batch
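    # Note: the trailing "% feat_batch" is PyTerrier's rank-cutoff operator:
    # only the top feat_batch candidates per query are passed on, which bounds
    # the number of (query, passage) feature vectors the learner sees.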

    #### LAMBDAMART
    print('Configuring Ranker...')
    # this configures LightGBM as LambdaMART
    lmart_l = lgb.LGBMRanker(
        task="train",
        # min_data_in_leaf=1,
        # min_sum_hessian_in_leaf=100,
        # max_bin=255,
        num_leaves=7,
        objective="lambdarank",
        metric="ndcg",
        # ndcg_eval_at=[1, 3, 5, 10],
        learning_rate=.1,
        importance_type="gain",
        # num_iterations=10,
        silent=False,
        n_jobs=-1)

    # lmart_x = xgb.sklearn.XGBRanker(objective='rank:ndcg',
    #       learning_rate=0.1,
    #       gamma=1.0,
    #       min_child_weight=0.1,
    #       max_depth=6,
    #       verbose=2,
    #       random_state=42)

    print('''\n
    ########################################
    ###### Training pipeline summary: ######
    ########################################

    Train Topics: {}
    Train Qrels: {}
    Validation topics: {}
    Validation Qrels: {}
    Number of passage samples per query: {}

    ########################################

    '''.format(train_sub.shape[0], train_qrels_sub.shape[0],
               validation_sub.shape[0], validation_qrels_sub.shape[0],
               feat_batch))

    start = time.time()
    print(
        "Model output is not rendered to the terminal until after the run is finished..."
    )
    if algorithm.upper() == LAMBDAMART:
        print('Training LambdaMART pipeline')

        # ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
        # ltr_pipeline.fit(train_sub, train_qrels_sub, validation_topics, validation_qrels)

        ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(lmart_l,
                                                              form="ltr")
        ltr_pipeline.fit_kwargs = {'verbose': 1}
        ltr_pipeline.fit(train_sub, train_qrels_sub, validation_sub,
                         validation_qrels_sub)
        model_name = "LambdaRANK"

    elif algorithm.upper() == RANDOM_FOREST:
        # RANDOM FOREST
        print('Training RandomForest pipeline')
        rf_model = RandomForestRegressor(n_jobs=-1, verbose=10)
        ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(rf_model)
        ltr_pipeline.fit(train_sub, train_qrels_sub, validation_sub,
                         validation_qrels_sub)
        model_name = 'RandomForest'
    else:
        print("ERROR: passed invalid algorithm as parameters")
        sys.exit(1)

    ### End of training ###

    end = time.time()
    print('Training finished, time elapsed:', end - start, 'seconds...')

    ###########################
    ## RERANKING AND OUTPUT  ##
    ###########################

    # Output models to pickle files

    # pipeline_filename = '{}_pipeline_{}_{}_{}.p'.format(model_name, train_sub.shape[0], validation_sub.shape[0], run_id)
    # print('Exporting learned pipeline to:', pipeline_filename)
    # pickle.dump(ltr_pipeline, open(pipeline_filename, "wb"))

    model_filename = '{}_model_{}_{}_{}.p'.format(model_name,
                                                  train_sub.shape[0],
                                                  validation_sub.shape[0],
                                                  run_id)
    print('Exporting l2r model to:', model_filename)
    if algorithm.upper() == LAMBDAMART:
        pickle.dump(lmart_l, open(model_filename, "wb"))
    else:
        pickle.dump(rf_model, open(model_filename, "wb"))

    print('Running test evaluation...')

    # Test on small subset
    # res = ltr_pipeline.transform(test_topics[:10].copy())

    # Test on entire testset
    start = time.time()
    res = ltr_pipeline.transform(test_topics)
    end = time.time()
    print('Test evaluation finished, time elapsed:', end - start, 'seconds...')

    print('Writing results...')
    output_file_path = './{}_results_{}.trec'.format(model_name, str(run_id))
    pt.io.write_results(res, output_file_path, format='trec')

    print('SUCCESS: results can be found at:', output_file_path)
Example #12
 def __init__(self):
     if not pt.started():
         pt.init()
Example #13
def _parallel_lambda_ray(function, inputs, jobs):
    from ray.util.multiprocessing import Pool
    with Pool(jobs, lambda args: pt.init(**args), pt.init_args) as pool:
        return pool.map(function, inputs)
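A hedged usage sketch for the helper above, assuming Ray is installed, the mapped function is picklable, and pt.init() has already run in the parent so that pt.init_args is populated:

import pyterrier as pt
if not pt.started():
    pt.init()

def _length(text):
    return len(text)

# fan the inputs out over a Ray-backed pool; every worker re-inits PyTerrier
print(_parallel_lambda_ray(_length, ["fluid dynamics", "bm25"], jobs=2))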
Example #14
 def _transform_ray(self, splits):
     from ray.util.multiprocessing import Pool
     with Pool(self.n_jobs, lambda: pt.init(**pt.init_args)) as pool:
         results = pool.map(lambda topics: self.parent(topics), splits)
         return pd.concat(results)
Example #15
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     if not pt.started():
         pt.init()
Example #16
 def starter(**initargs):
     if not pt.started():
         print("pt booted")
         pt.init(**initargs)
Example #17
import pyterrier as pt
if not pt.started():
    pt.init(version="snapshot")
    
import tensorflow.compat.v1 as tf
import numpy as np
import collections

from pyterrier.transformer import TransformerBase

from run_reranking import model_fn_builder
from input_parser import input_fn_builder
from bert.modeling import BertConfig
from generate_data import PointwiseInstance
from bert import tokenization



def create_instance_pointwise(tokenizer, max_seq_length, qid, docno, query, doc, label):
  query = tokenization.convert_to_unicode(query)
  doc = tokenization.convert_to_unicode(doc)
  passages = get_passages(doc, 150, 50)
  if len(passages) == 0:
    tf.logging.warn("Passage length is 0 in qid {} docno {}".format(qid, docno))

  query = tokenization.convert_to_bert_input(
    text=query,
    max_seq_length=64,
    tokenizer=tokenizer,
    add_cls=True,
    convert_to_id=False