def test_check_stemmer(self):
    it = [
        {'docno': '1', 'url': 'url1',
         'text': 'He ran out of money, so he had to stop playing',
         'title': 'Woes of playing poker'},
        {'docno': '2', 'url': 'url2',
         'text': 'The waves were crashing on the shore; it was a',
         'title': 'Lovely sight'},
        {'docno': '3', 'url': 'url3',
         'text': 'The body may perhaps compensates for the loss',
         'title': 'Best of Viktor Prowoll'},
    ]
    props = {}
    props["termpipelines"] = ""
    indexer = pt.IterDictIndexer(self.test_dir)
    for k, v in props.items():
        indexer.setProperty(k, v)
    indexref = indexer.index(it)
    index = pt.IndexFactory.of(indexref)
    index = pt.cast("org.terrier.structures.IndexOnDisk", index)
    # restore setting after test
    pt.ApplicationSetup.setProperty("termpipelines", "Stopwords,PorterStemmer")
    self.assertEqual("", index.getIndexProperty("termpipelines", "bla"))
def test_sliding(self):
    slider = pt.text.sliding("text", 10, 10, prepend_attr=None)
    indexer = pt.IterDictIndexer(self.test_dir)
    pipeline = slider >> indexer
    dataset = pt.get_dataset("irds:vaswani")
    indexref = pipeline.index(dataset.get_corpus_iter())
    self.assertIsNotNone(indexref)
    index = pt.IndexFactory.of(indexref)
    self.assertTrue(
        index.getCollectionStatistics().getNumberOfDocuments() > len(dataset.get_corpus_iter()))
def test_sliding_title_one(self):
    corpus = [{"docno": "d1", "text": "A B", "title": "this is a title"}]
    slider = pt.text.sliding("text", 2, 1, prepend_attr="title")
    indexer = pt.IterDictIndexer(self.test_dir)
    pipeline = slider >> indexer
    dataset = pt.get_dataset("irds:vaswani")
    indexref = pipeline.index(corpus)
    self.assertIsNotNone(indexref)
    index = pt.IndexFactory.of(indexref)
    # we should get 1 passage in the resulting index
    self.assertEqual(1, index.getCollectionStatistics().getNumberOfDocuments())
def test_add_dup(self):
    def _first(df):
        df2 = df.copy()
        df2["docno"] = df2["docno"] + "bis"
        return pd.concat([df, df2])

    slider = pt.apply.generic(_first)
    indexer = pt.IterDictIndexer(self.test_dir)
    pipeline = slider >> indexer
    dataset = pt.get_dataset("irds:vaswani")
    #print(next(dataset.get_corpus_iter().gen))
    indexref = pipeline.index(dataset.get_corpus_iter())
    self.assertIsNotNone(indexref)
    index = pt.IndexFactory.of(indexref)
    self.assertEqual(
        index.getCollectionStatistics().getNumberOfDocuments(),
        2 * len(dataset.get_corpus_iter()))
def _create_index(self):
    """ Index candidates in case they are not already indexed. """
    index_path_with_cand = self.path_index + "_documents_cand_{}".format(self.sample_data)
    if not os.path.isdir(index_path_with_cand):
        os.makedirs(index_path_with_cand, exist_ok=True)

        def from_list_gen():
            for i, cand in enumerate(self.candidates):
                # docno must be a string for the Terrier meta index
                yield {'docno': str(i), 'text': cand}

        iter_indexer = pt.IterDictIndexer(index_path_with_cand)
        self.indexref = iter_indexer.index(from_list_gen(), meta=['docno', 'text'])
    self.indexref = pt.IndexRef.of(index_path_with_cand)
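A minimal sketch, not part of the original class, of how the candidate index referenced by self.indexref might then be queried; the method name, query, and cutoff are illustrative assumptions:

def _retrieve_candidates(self, query, k=10):
    # Illustrative helper (assumed, not from the source): BM25 retrieval over the
    # candidate index created by _create_index.
    bm25 = pt.BatchRetrieve(self.indexref, wmodel="BM25")
    return bm25.search(query).head(k)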
def test_meta_init(self):
    it = [
        {'docno': '1', 'url': 'url1',
         'text': 'He ran out of money, so he had to stop playing',
         'title': 'Woes of playing poker'},
        {'docno': '2', 'url': 'url2',
         'text': 'The waves were crashing on the shore; it was a',
         'title': 'Lovely sight'},
        {'docno': '3', 'url': 'url3',
         'text': 'The body may perhaps compensates for the loss',
         'title': 'Best of Viktor Prowoll'},
    ]
    props = {}
    props["termpipelines"] = ""
    indexer = pt.IterDictIndexer(
        self.test_dir,
        meta={'docno': 10, 'url': 10, 'text': 100, 'title': 100},
        meta_reverse=['docno', 'url'])
    indexref = indexer.index(it)
    index = pt.IndexFactory.of(indexref)
    self.assertIn("docno", index.getMetaIndex().getKeys())
    self.assertIn("text", index.getMetaIndex().getKeys())
    self.assertIn("docno", index.getMetaIndex().getKeys())
    self.assertIn("url", index.getMetaIndex().getReverseKeys())
def _create_index(self, it, fields, meta, type):
    pd_indexer = pt.IterDictIndexer(self.test_dir, type=type)
    indexref = pd_indexer.index(it, fields, meta)
    self.assertIsNotNone(indexref)
    return indexref
    delimiter : str
        delimiter of csv file that contains the passages
    verbose : bool, default=False
        Whether or not to log progress frequently.

    Returns
    -------
    {'docno': docno, 'text': text}
    """
    csv_file = open(filepath)
    read_csv = csv.reader(csv_file, delimiter=delimiter)
    for i, (docno, text) in enumerate(read_csv):
        if i % 200000 == 0 and verbose:
            print(f'Processing passage {i}')
        yield {'docno': docno, 'text': text}


# uncomment to create index if index is not yet created.
if os.path.exists('index'):
    shutil.rmtree('index')
index_path = os.path.join(os.getcwd(), 'index')
iter_indexer = pt.IterDictIndexer(index_path)
collection_file = os.path.join(os.getcwd(), 'data', 'collection.tsv')
doc_iter = passages_generator(collection_file, '\t', verbose=True)
index_passages = iter_indexer.index(doc_iter)
print("done")
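A minimal sketch, not part of the original script, of loading the index that was just built and issuing a single BM25 query against it; the query string is an illustrative placeholder:

# Illustrative sanity check (assumed, not from the source): query the new index with BM25.
index = pt.IndexFactory.of(index_passages)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
print(bm25.search("example passage query").head(10))  # placeholder query text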
def main(algorithm=LAMBDAMART, feat_batch=FEATURES_BATCH_N, top_n_train=TOP_N_TRAIN,
         top_n_validation=TOP_N_TRAIN, run_id=RUN_ID):
    if not pt.started():
        pt.init(mem=8000)

    ################
    ## INDEX STEP ##
    ################
    dataset = pt.get_dataset("trec-deep-learning-passages")

    def msmarco_generate():
        with pt.io.autoopen(dataset.get_corpus()[0], 'rt') as corpusfile:
            for l in corpusfile:
                docno, passage = l.split("\t")
                yield {'docno': docno, 'text': passage}

    try:
        print("Indexing MSMARCO passage ranking dataset")
        print("If the index has not been constructed yet but the MSMARCO dataset has been "
              "downloaded previously, it is recommended to place the collection.tar.gz in the "
              "\"/Users/{username}/.pyterrier/corpora/trec-deep-learning-passages\" directory. "
              "This ensures that PyTerrier uses the local file instead of downloading the "
              "corpus off the internet.")
        # Single threaded indexing
        # iter_indexer = pt.IterDictIndexer("./passage_index")
        # indexref3 = iter_indexer.index(msmarco_generate(), meta=['docno', 'text'], meta_lengths=[20, 4096])
        print("Performing multi-threaded indexing. If this does not work on your system "
              "(most likely on Windows), uncomment the two lines above this print statement "
              "and comment out the two lines below it so that indexing runs on a single thread.")
        # Multi threaded indexing, UNIX-based systems only!!!!!
        iter_indexer = pt.IterDictIndexer("./passage_index_8", threads=8)
        indexref4 = iter_indexer.index(msmarco_generate(), meta=['docno', 'text'], meta_lengths=[20, 4096])
    except ValueError as err:
        if "Index already exists" in str(err):
            print("Index already exists, loading existing one")
            indexref4 = "./passage_index_8/data.properties"

    pt.logging('WARN')
    index = pt.IndexFactory.of(indexref4)
    print(index.getCollectionStatistics().toString())

    ################
    ## DATA PREP ##
    ################
    # Load topics as df: [qid, query]
    # load qrels as df: [qid, docno, label]
    def load_qrels_file(path):
        df = pd.read_csv(path, sep='\t', names=['qid', 'q0', 'docno', 'label'],
                         dtype={'qid': str, 'q0': str, 'docno': str, 'label': np.int32})
        del df['q0']
        return df

    def load_topics_file(path):
        df = pd.read_csv(path, sep='\t', names=['qid', 'query'],
                         dtype={'qid': str, 'query': str})
        exclude = set(string.punctuation)
        # Remove punctuation
        # print(exclude)
        df['query'] = df['query'].apply(lambda s: ''.join(ch for ch in s if ch not in exclude))
        # print(df['query'][:6])
        return df

    def filter_train_qrels(train_topics_subset, train_qrels):
        m = train_qrels.qid.isin(train_topics_subset.qid)
        return train_qrels[m]

    print('Loading train/validation topics and qrels')
    print("Looking for the query files in the following directory: collections/msmarco-passage/, "
          "make sure the query files are located there...")
    train_topics = load_topics_file('collections/msmarco-passage/queries.train.tsv')
    train_qrels = load_qrels_file('collections/msmarco-passage/qrels.train.tsv')
    validation_topics = load_topics_file('collections/msmarco-passage/queries.dev.small.tsv')
    validation_qrels = load_qrels_file('collections/msmarco-passage/qrels.dev.small.tsv')
    test_topics = load_topics_file('collections/msmarco-passage/msmarco-test2019-queries.tsv')

    print('Getting first {} train topics and corresponding qrels'.format(top_n_train))
    # TODO: not all queries here have qrels... Maybe filter on first 100 that have qrels?
    if int(top_n_train) > 0:
        train_sub = train_topics[:top_n_train].copy()
        train_qrels_sub = filter_train_qrels(train_sub, train_qrels)
    else:
        train_sub = train_topics
        train_qrels_sub = train_qrels

    print('Getting first {} validation topics and corresponding qrels'.format(top_n_validation))
    if int(top_n_validation) > 0:
        validation_sub = validation_topics[:top_n_validation].copy()
        validation_qrels_sub = filter_train_qrels(validation_sub, validation_qrels)
    else:
        validation_sub = validation_topics
        validation_qrels_sub = validation_qrels
    # print(train_qrels_sub)

    ##############
    ## TRAINING ##
    ##############
    print('Setting up FeaturesBatchRetriever')
    pipeline = pt.FeaturesBatchRetrieve(
        index,
        wmodel="BM25",
        features=["SAMPLE", "WMODEL:Tf", "WMODEL:PL2", "WMODEL:TF_IDF",
                  "WMODEL:DLH13", "WMODEL:Hiemstra_LM"]) % feat_batch

    #### LAMBDAMART
    print('Configuring Ranker...')
    # this configures LightGBM as LambdaMART
    lmart_l = lgb.LGBMRanker(
        task="train",
        # min_data_in_leaf=1,
        # min_sum_hessian_in_leaf=100,
        # max_bin=255,
        num_leaves=7,
        objective="lambdarank",
        metric="ndcg",
        # ndcg_eval_at=[1, 3, 5, 10],
        learning_rate=.1,
        importance_type="gain",
        # num_iterations=10,
        silent=False,
        n_jobs=-1)

    # lmart_x = xgb.sklearn.XGBRanker(objective='rank:ndcg',
    #                                 learning_rate=0.1,
    #                                 gamma=1.0,
    #                                 min_child_weight=0.1,
    #                                 max_depth=6,
    #                                 verbose=2,
    #                                 random_state=42)

    print('''\n
    ########################################
    ###### Training pipeline summary: ######
    ########################################
    Train Topics: {}
    Train Qrels: {}
    Validation topics: {}
    Validation Qrels: {}
    Amount of passage samples per query: {}
    ########################################
    '''.format(train_sub.shape[0], train_qrels_sub.shape[0], validation_sub.shape[0],
               validation_qrels_sub.shape[0], FEATURES_BATCH_N))

    start = time.time()
    print("Model output is not rendered to the terminal until after the run is finished...")
    if algorithm.upper() == LAMBDAMART:
        print('Training LambdaMART pipeline')
        # ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
        # ltr_pipeline.fit(train_sub, train_qrels_sub, validation_topics, validation_qrels)
        ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(lmart_l, form="ltr")
        ltr_pipeline.fit_kwargs = {'verbose': 1}
        ltr_pipeline.fit(train_sub, train_qrels_sub, validation_sub, validation_qrels_sub)
        model_name = "LambdaRANK"
    elif algorithm.upper() == RANDOM_FOREST:
        # RANDOM FOREST
        print('Training RandomForest pipeline')
        rf_model = RandomForestRegressor(n_jobs=-1, verbose=10)
        ltr_pipeline = pipeline >> pt.ltr.apply_learned_model(rf_model)
        ltr_pipeline.fit(train_sub, train_qrels_sub, validation_sub, validation_qrels_sub)
        model_name = 'RandomForest'
    else:
        print("ERROR: passed an invalid algorithm as parameter")
        sys.exit(1)

    ### End of training ###
    end = time.time()
    print('Training finished, time elapsed:', end - start, 'seconds...')

    ###########################
    ## RERANKING AND OUTPUT ##
    ###########################
    # Output models to pickle files
    # pipeline_filename = '{}_pipeline_{}_{}_{}.p'.format(model_name, train_sub.shape[0], validation_sub.shape[0], run_id)
    # print('Exporting learned pipeline to:', pipeline_filename)
    # pickle.dump(ltr_pipeline, open(pipeline_filename, "wb"))
    model_filename = '{}_model_{}_{}_{}.p'.format(model_name, train_sub.shape[0],
                                                  validation_sub.shape[0], run_id)
    print('Exporting l2r model to:', model_filename)
    if algorithm.upper() == LAMBDAMART:
        pickle.dump(lmart_l, open(model_filename, "wb"))
    else:
        pickle.dump(rf_model, open(model_filename, "wb"))

    print('Running test evaluation...')
    # Test on small subset
    # res = ltr_pipeline.transform(test_topics[:10].copy())
    # Test on entire testset
    start = time.time()
    res = ltr_pipeline.transform(test_topics)
    end = time.time()
    print('Test evaluation finished, time elapsed:', end - start, 'seconds...')

    print('Writing results...')
    output_file_path = './{}_results_{}.trec'.format(model_name, str(run_id))
    pt.io.write_results(res, output_file_path, format='trec')
    print('SUCCESS: results can be found at:', output_file_path)
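The script presumably invokes main() from an entry point; a minimal sketch of such an invocation, assuming the module-level constants referenced in the signature (LAMBDAMART, FEATURES_BATCH_N, TOP_N_TRAIN, RUN_ID) are defined elsewhere in the file:

if __name__ == '__main__':
    # Illustrative entry point only; the original script's argument handling may differ.
    main()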
def index(self):
    iter_indexer = pt.IterDictIndexer("./index")
    doc_iter = _livivo_doc_iter()
    indexref = iter_indexer.index(doc_iter)
    self.idx = pt.IndexFactory.of(indexref)