def main():
    parser = argparse.ArgumentParser("ACL Anthology document DynamoDB bulk importer")
    parser.add_argument("--index", required=True, type=str, help="Path to ACL Anthology Lucene index")
    parser.add_argument("--table", default="ACL", type=str, help="Dynamo table to insert the raw ACL documents to")
    parser.add_argument("--batch-size", dest="batch", default=MAX_BATCH_SIZE, type=int, help="The size of batch insert to Dynamo")
    parser.add_argument("--threads", default=5, type=int, help="Number of threads for batch inserts")
    parser.add_argument("--report-interval", dest="report_interval", default=500, type=int, help="Output progress interval")
    args = parser.parse_args()

    # TODO: use https://github.com/castorini/pyserini/blob/master/docs/usage-collection.md once AclAnthology support is added
    searcher = SimpleSearcher(args.index)
    progress = 0
    next_report_threshold = args.report_interval
    batches = build_item_batches(searcher, args.batch)
    # Size the pool from --threads; the original passed args.batch here by mistake.
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        futures = {executor.submit(batch_write_dynamo, args.table, batch): batch for batch in batches}
        for future in concurrent.futures.as_completed(futures):
            batch = futures[future]
            try:
                failed_docids = future.result()
                if failed_docids:
                    logger.error("Error writing batches %s" % failed_docids)
            except Exception:
                batch_ids = [item["id"] for item in batch]
                logger.exception("Error writing batches %s" % batch_ids)
            finally:
                progress += len(batch)
                if progress > next_report_threshold:
                    logger.info("Processed %s/%s records" % (progress, searcher.num_docs))
                    next_report_threshold += args.report_interval
def sampling(args):
    # load the positive docs
    qrels = defaultdict(list)
    for line in open(os.path.join(args.msmarco_dir, f"qrels.{args.mode}.tsv"), 'r'):
        qid, _, pid, _ = line.split('\t')
        qrels[qid].append(int(pid))
    qrels = dict(qrels)

    # load the queries
    queries = dict()
    for line in open(os.path.join(args.msmarco_dir, f"queries.{args.mode}.tsv"), 'r'):
        qid, query = line.split('\t')
        query = query.rstrip()
        queries[qid] = query

    searcher = SimpleSearcher(args.index_dir)
    searcher.set_bm25(k1=args.bm25_k1, b=args.bm25_b)

    with open(os.path.join(args.output_dir, f'top_candidates.{args.mode}.tsv'), 'w') as outfile:
        for qid in tqdm(qrels):
            query = queries[qid]
            candidates = searcher.search(query, k=args.topN)
            for i in range(len(candidates)):
                outfile.write(f"{qid}\t{candidates[i].docid}\t{candidates[i].score}\n")
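# sampling() above expects an argparse namespace. A minimal sketch of the
# assumed command-line interface: the flag names are inferred from the
# attribute accesses in the function, and the defaults are illustrative
# assumptions, not the original project's values.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--msmarco_dir', type=str, required=True)
parser.add_argument('--index_dir', type=str, required=True)
parser.add_argument('--output_dir', type=str, required=True)
parser.add_argument('--mode', type=str, default='train', choices=['train', 'dev'])
parser.add_argument('--bm25_k1', type=float, default=0.82)
parser.add_argument('--bm25_b', type=float, default=0.68)
parser.add_argument('--topN', type=int, default=1000)
sampling(parser.parse_args())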
def bm25(qid, query, docs, index_path):
    searcher = SimpleSearcher(index_path)
    hits = searcher.search(query, 1000)
    n = 1
    seen_docids = set()
    # Write the passage-level BM25 run, deduplicating docids.
    with open(f'run-passage-{qid}.txt', 'w') as writer:
        for i in range(len(hits)):
            if hits[i].docid in seen_docids:
                continue
            writer.write(f'{qid} Q0 {hits[i].docid} {n} {hits[i].score:.5f} pyserini\n')
            n += 1
            seen_docids.add(hits[i].docid)
    # Write the document-level base run as supplied by the caller.
    with open(f'run-doc-{qid}.txt', 'w') as writer:
        for doc in docs:
            writer.write(f'{qid} Q0 {doc["docid"]} {doc["rank"]} {doc["score"]} base\n')
    # Fuse the two runs with reciprocal rank fusion.
    os.system(f'python -m pyserini.fusion --method rrf --runs run-passage-{qid}.txt run-doc-{qid}.txt '
              f'--output run-rrf-{qid}.txt --runtag test')
    fused_run = TrecRun(f'run-rrf-{qid}.txt')
    output = []
    for idx, r in fused_run.get_docs_by_topic(qid).iterrows():
        output.append([qid, r["docid"], r["rank"]])
    return output
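# A hedged usage sketch for bm25() above. The shape of `docs` is inferred from
# the write loop (dicts with "docid", "rank", and "score" keys); the qid,
# query, docids, and index path below are illustrative, not from the original.
docs = [
    {"docid": "D1555982", "rank": 1, "score": 11.2},
    {"docid": "D301595", "rank": 2, "score": 10.7},
]
fused = bm25('264014', 'how long is the life cycle of a flea', docs, 'indexes/msmarco-passage')
# Each entry of `fused` is [qid, docid, rank] taken from the RRF-fused run.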
def __init__(self, index):
    self._index = SimpleSearcher(index)
    self._query = None
    self._docids = []
    self._doc_content = []
    self._doc_scores = []
    self._doc_embeddings = []
def __init__(
        self,
        index_dir="/nfs/phd_by_carlos/notebooks/datasets/TREC_CAsT/CAsT_collection_with_meta.index",
        k1=0.82,
        b=0.68,
        **kwargs):
    # Note: k1 and b are accepted but never applied to the searcher in this snippet.
    self.searcher = SimpleSearcher(index_dir)
def main():
    # This assumes the index has already been generated
    searcher = SimpleSearcher('indexes/msmarco-passage')
    searcher.set_qld()
    topics = read_topics('msmarco-test2019-queries.tsv')
    run_all_queries('runs/run.msmarco-test2019-queries-bm25.trec', topics, searcher)
def main():
    # This assumes the index has already been generated
    searcher = SimpleSearcher('indexes/msmarco-passage')
    # searcher.set_bm25(0.82, 0.68)
    searcher.set_rm3(fb_terms=25, fb_docs=50, original_query_weight=0.5)
    topics = read_topics('msmarco-test2019-queries.tsv')
    run_all_queries('runs/run.msmarco-test2019-queries-bm25.trec', topics, searcher)
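# Both main() snippets above call a run_all_queries helper that is not shown.
# A minimal sketch of what it might look like, assuming `topics` is the dict
# returned by pyserini's read_topics ({qid: {'title': query, ...}}) and that
# standard TREC run output is wanted; k=1000 and the 'Anserini' runtag are
# illustrative assumptions, not the original code.
def run_all_queries(file, topics, searcher, k=1000, tag='Anserini'):
    with open(file, 'w') as runfile:
        for qid in topics:
            query = topics[qid]['title']
            hits = searcher.search(query, k=k)
            for rank, hit in enumerate(hits, start=1):
                # TREC run format: qid Q0 docid rank score runtag
                runfile.write(f'{qid} Q0 {hit.docid} {rank} {hit.score:.6f} {tag}\n')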
def __init__(self, ranker, index, topn=10, topw=10, original_q_w=0.5):
    RelevanceFeedback.__init__(self, ranker=ranker, prels=None, anserini=None, index=index, topn=topn)
    self.topw = topw
    self.searcher = SimpleSearcher(index)
    self.ranker = ranker
    self.original_q_w = original_q_w
def initialize(self):
    from pyserini.search import SimpleSearcher

    self.searcher = SimpleSearcher(str(self.index.path))
    modelhandler = Handler()

    @modelhandler()
    def handle(bm25: BM25):
        self.searcher.set_bm25(bm25.k1, bm25.b)

    # Dispatch on the configured model (e.g. a BM25 config) to set searcher parameters.
    modelhandler[self.model]
def retrieve_and_get_topn_relevant_docids(self, q):
    relevant_documents = []
    searcher = SimpleSearcher(self.index)
    if self.ranker == 'bm25':
        searcher.set_bm25()
    elif self.ranker == 'qld':
        searcher.set_qld()
    # Request topn hits explicitly; search() returns only 10 by default,
    # which would make the loop below fail for larger topn.
    hits = searcher.search(q, k=self.topn)
    for i in range(min(self.topn, len(hits))):
        relevant_documents.append(hits[i].docid)
    return relevant_documents
def index(self):
    self._mkdir('./index/')
    self._mkdir('./index/convert/')
    self._mkdir('./index/chunks/')
    self._make_chuncks("./data/livivo/documents/")
    p = Pool()
    p.map(self._convert_chunks, os.listdir("./index/chunks/"))
    p.close()
    shutil.rmtree('./index/chunks')
    JIndexCollection.main(ARGS)
    self.searcher = SimpleSearcher('./index/')
    shutil.rmtree('./index/convert/')
def build_searcher(
        k1=0.9,
        b=0.4,
        index_path="index/lucene-index.wiki_paragraph_drqa.pos+docvectors",
        segmented=False,
        rm3=False,
        chinese=False):
    searcher = SimpleSearcher(index_path)
    searcher.set_bm25(k1, b)
    if chinese:
        searcher.object.setLanguage("zh")
        print("########### we are using the Chinese retriever ##########")
    return searcher
def __init__(self):
    self.searcher = SimpleSearcher(PATH_TO_WIKI_INDEX)
    self.searcher.set_bm25()
    self.searcher.unset_rm3()
    self.processor = SquadV2Processor()
    self.k = 29
    self.mu = 0.5
    self.use_ir_score = True
    self.tokenizer = BertTokenizer.from_pretrained(PATH_TO_DILBERT, do_lower_case=True)
    self.model = DilBert.from_pretrained(PATH_TO_DILBERT)
    self.device = DEVICE_COMP
    self.model.to(torch.device(self.device))
def main(output_path=OUTPUT_PATH, index_path=INDEX_PATH, queries_path=QUERIES_PATH, run=RUN, k=K):
    print('################################################')
    print("##### Performing Passage Ranking using L2R #####")
    print('################################################')
    print("Output will be placed in:", output_path, ", format used will be TREC")

    print('Loading pre-trained model MonoT5...')
    from pygaggle.rerank.transformer import MonoT5
    reranker = MonoT5()

    print('Fetching anserini-like indices from:', index_path)
    # Fetch some passages to rerank from MS MARCO with Pyserini (BM25).
    searcher = SimpleSearcher(index_path)

    print('Loading queries from:', queries_path)
    with open(queries_path, 'r') as f:
        content = f.readlines()
    content = [x.strip().split('\t') for x in content]
    queries = [Query(x[1], x[0]) for x in content]

    print(f'Ranking queries using BM25 (k={k})')
    queries_text = []
    for query in tqdm(queries):
        # Use the k parameter rather than the module-level K constant.
        hits = searcher.search(query.text, k=k)
        texts = hits_to_texts(hits)
        queries_text.append(texts)

    print('Reranking all queries using MonoT5!')
    rankings = []
    for (i, query) in enumerate(tqdm(queries)):
        reranked = reranker.rerank(query, queries_text[i])
        reranked.sort(key=lambda x: x.score, reverse=True)
        rankings.append(reranked)

    print('Outputting to file...')
    if '.tsv' in output_path:
        output_to_tsv(queries, rankings, run, output_path)
    elif '.csv' in output_path:
        output_to_csv(queries, rankings, run, output_path)
    else:
        print('ERROR: invalid output file format provided, please use either .csv or .tsv. Exiting')
        sys.exit(1)

    print('SUCCESS: completed reranking, you may check the output at:', output_path)
    sys.exit(0)
def setUp(self):
    # Download the pre-built CACM index; append a random value to avoid filename clashes.
    r = randint(0, 10000000)
    self.collection_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.tar.gz'
    self.tarball_name = 'lucene-index.cacm-{}.tar.gz'.format(r)
    self.index_dir = 'index{}/'.format(r)
    filename, headers = urlretrieve(self.collection_url, self.tarball_name)
    tarball = tarfile.open(self.tarball_name)
    tarball.extractall(self.index_dir)
    tarball.close()
    self.searcher = SimpleSearcher(f'{self.index_dir}lucene-index.cacm')
def load_ranker(args):
    if args.sparse and args.dense:
        sparse_searcher = SimpleSearcher(args.sparse_index_path)
        sparse_searcher.set_bm25(args.k, args.b)
        sparse_searcher.set_rm3(args.expansion_terms, args.expansion_documents, args.original_query_weight)
        encoder = TCTColBERTQueryEncoder('castorini/tct_colbert-msmarco')
        dense_searcher = SimpleDenseSearcher(args.dense_index_path, encoder)
        # Return the hybrid searcher; the original built it but never returned it.
        return HybridSearcher(dense_searcher, sparse_searcher)
    elif args.sparse:
        sparse_searcher = SimpleSearcher(args.sparse_index_path)
        sparse_searcher.set_bm25(args.k, args.b)
        sparse_searcher.set_rm3(args.expansion_terms, args.expansion_documents, args.original_query_weight)
        return sparse_searcher
    elif args.dense:
        encoder = TCTColBERTQueryEncoder('castorini/tct_colbert-msmarco')
        dense_searcher = SimpleDenseSearcher(args.dense_index_path, encoder)
        return dense_searcher
    else:
        print("Choose a valid ranking function: sparse (BM25), dense (vector), or a combination of the two")
        exit(1)
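# A sketch of the argparse flags load_ranker() above assumes. The names are
# inferred from the attribute accesses in the function; the defaults are
# illustrative assumptions only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--sparse', action='store_true')
parser.add_argument('--dense', action='store_true')
parser.add_argument('--sparse_index_path', type=str)
parser.add_argument('--dense_index_path', type=str)
parser.add_argument('--k', type=float, default=0.9)  # BM25 k1
parser.add_argument('--b', type=float, default=0.4)
parser.add_argument('--expansion_terms', type=int, default=10)
parser.add_argument('--expansion_documents', type=int, default=10)
parser.add_argument('--original_query_weight', type=float, default=0.5)
ranker = load_ranker(parser.parse_args())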
def main(args):
    query = args.query
    index = args.index
    if args.do_tokenize:
        # The Hugging Face model id is 'bert-base-multilingual-uncased'.
        tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
        query = " ".join(tokenizer.tokenize(query))
    logger.info(f'searching for: {query}')
    searcher = SimpleSearcher(index)
    searcher.set_analyzer(JWhiteSpaceAnalyzer())
    hits = searcher.search(query, 1000)
    for i in range(len(hits)):
        print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')
def __init__(
        self,
        index_folder: str,
        n_threads: int = 1,
        top_n: int = 5,
        text_column_name: str = "contents",
        return_scores: bool = False,
        *args,
        **kwargs,
):
    self.searcher = SimpleSearcher(str(expand_path(index_folder)))
    self.n_threads = n_threads
    self.top_n = top_n
    self.text_column_name = text_column_name
    self.return_scores = return_scores
def _run_thread(arguments):
    idz = arguments["id"]
    index = arguments["index"]
    k = arguments["k"]
    data = arguments["data"]

    # TODO: BM25 parameters
    # bm25_a = arguments["bm25_a"]
    # bm25_b = arguments["bm25_b"]
    # searcher.set_bm25(bm25_a, bm25_b)

    from pyserini.search import SimpleSearcher

    searcher = SimpleSearcher(index)

    # Only the first thread shows a progress bar.
    _iter = data
    if idz == 0:
        _iter = tqdm(data)

    provenance = {}
    for x in _iter:
        query_id = x["id"]
        query = x["query"].replace(utils.ENT_END, "").replace(utils.ENT_START, "").strip()
        hits = searcher.search(query, k)

        element = []
        for y in hits:
            try:
                # The docid may itself be a JSON payload; fall back to a plain record if not.
                doc_data = json.loads(str(y.docid).strip())
                doc_data["score"] = y.score
                doc_data["text"] = str(y.raw).strip()
                element.append(doc_data)
            except Exception as e:
                print(e)
                element.append({
                    "score": y.score,
                    "text": str(y.raw).strip(),
                    "title": y.docid,
                })
        provenance[query_id] = element

    return provenance
def index(self):
    data = []
    with jsonlines.open('./data/gesis-search/datasets/dataset.jsonl') as reader:
        for obj in reader:
            title = obj.get('title') or ''
            title = title[0] if type(title) is list else title
            abstract = obj.get('abstract') or ''
            abstract = abstract[0] if type(abstract) is list else abstract
            try:
                data.append({
                    'id': obj.get('id'),
                    'contents': ' '.join([title, abstract])
                })
            except Exception as e:
                print(e)

    try:
        os.mkdir('./convert/')
    except OSError as error:
        print(error)

    with jsonlines.open('./convert/output.jsonl', mode='w') as writer:
        for doc in data:
            writer.write(doc)

    try:
        os.mkdir('./indexes/')
    except OSError as error:
        print(error)

    args = [
        "-collection", "JsonCollection",
        "-generator", "DefaultLuceneDocumentGenerator",
        "-threads", "1",
        "-input", "./convert",
        "-index", "./indexes/gesis",
        "-storePositions", "-storeDocvectors", "-storeRaw"
    ]
    JIndexCollection.main(args)
    self.searcher = SimpleSearcher('indexes/gesis')

    with jsonlines.open('./data/gesis-search/documents/publication.jsonl') as reader:
        for obj in reader:
            self.title_lookup[obj.get('id')] = obj.get('title')
def main():
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher object
        searcher = SimpleSearcher(index_loc)
        # Set the active scorer to BM25
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query
        results = searcher.search('this is a test query', k=3)
        # Check the returned docids against the expected ranking
        expected = ['5578280', '2016011', '7004677']
        docids = [x.docid for x in results]
        if expected != docids:
            raise Exception('Test query results do not match:', expected, '(expected)', docids, '(actual)')

        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception('There is an unexpected number of terms in your index, perhaps something went wrong while downloading and indexing the dataset?')

        topics = get_topics("msmarco-passage-dev-subset")
        if topics == {}:
            raise Exception('Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.')

        first_query = topics[list(topics.keys())[0]]['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception('Found a different first query than expected in the dataset. Did you download the right dataset?')

        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"

        # Tokenizing in pyserini is called analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception('Tokenizer is not working correctly, something is probably wrong in Anserini. Perhaps try to install Anserini again.')

    except Exception as inst:
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
def __init__(self, candidates, num_candidates_samples, path_index, sample_data, anserini_folder, set_rm3=False, seed=42):
    random.seed(seed)
    self.candidates = candidates
    self.num_candidates_samples = num_candidates_samples
    self.path_index = path_index
    if set_rm3:
        self.name = "BM25RM3NS"
    else:
        self.name = "BM25NS"
    self.sample_data = sample_data
    self.anserini_folder = anserini_folder
    self._create_index()
    self.searcher = SimpleSearcher(self.path_index + "anserini_index")
    self.searcher.set_bm25(0.9, 0.4)
    if set_rm3:
        self.searcher.set_rm3()
def write_out(path_index, path_out, query_operation):
    searcher = SimpleSearcher(path_index)
    index_utils = index.IndexReader(path_index)
    f = open(path_out, "a")
    searcher.set_bm25(0.9, 0.4)
    searcher.set_rm3(10, 10, 0.5)
    # Note: set_qld replaces the BM25 similarity set above, so only QLD (mu=400) is in effect.
    searcher.set_qld(400)
    # `number` is assumed to be a module-level list of topic ids.
    for x in range(len(number)):
        hits = searcher.search(query_operation[x], 100)
        # Write out (up to) the top 100 hits in TREC run format.
        for i in range(min(100, len(hits))):
            print(f'{number[x]} Q0 {hits[i].docid:15} {i+1:2} {hits[i].score:.5f} JUNE')
            f.write(f'{number[x]} Q0 {hits[i].docid:15} {i+1:2} {hits[i].score:.5f} JUNE\n')
    f.close()
def __init__(self, name, num_threads, index_dir=None, k1=0.9, b=0.4, use_bigrams=False, stem_bigrams=False):
    super().__init__(name)
    self.num_threads = min(num_threads, int(multiprocessing.cpu_count()))

    # Initialize a ranker per thread.
    self.arguments = []
    for id in tqdm(range(self.num_threads)):
        ranker = SimpleSearcher(index_dir)
        ranker.set_bm25(k1, b)
        self.arguments.append({
            "id": id,
            "ranker": ranker,
            "use_bigrams": use_bigrams,
            "stem_bigrams": stem_bigrams
        })
def __init__(self, k, index_loc='../../anserini/indexes/lucene-wapost.v2.pos+docvectors+raw'):
    self.utils = Utils()
    # Make sure you have produced this Lucene index before.
    self.index_loc = index_loc
    self.searcher = SimpleSearcher(self.index_loc)
    self.k = k  # number of hits to return
    self.searcher.set_bm25(k1=0.9, b=0.4)  # BM25 parameters
    # self.searcher.set_rm3(10, 10, 0.5)  # relevance feedback
    self.batch_hits = {}
    self.topics = get_topics('core18')
    self.query_ids = [str(id) for id in self.topics.keys()]
    self.queries = [topic['title'] for topic in self.topics.values()]
    self.doc_ids = {}
    self.scores = {}
def build_searcher(settings: SearcherSettings) -> SimpleSearcher:
    # Use a local index directory if one exists; otherwise treat the path as a prebuilt index name.
    if path.isdir(settings.index_path):
        searcher = SimpleSearcher(settings.index_path)
    else:
        searcher = SimpleSearcher.from_prebuilt_index(settings.index_path)
    searcher.set_bm25(float(settings.k1), float(settings.b))
    logging.info(
        "Initializing BM25, setting k1={} and b={}".format(settings.k1, settings.b)
    )
    if settings.rm3:
        searcher.set_rm3(
            settings.fb_terms, settings.fb_docs, settings.original_query_weight
        )
        logging.info(
            "Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}".format(
                settings.fb_terms, settings.fb_docs, settings.original_query_weight
            )
        )
    return searcher
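# A minimal usage sketch for build_searcher() above. The SearcherSettings
# fields are inferred from how the function reads them; this dataclass
# definition and the concrete values are illustrative assumptions, not the
# project's actual settings type.
from dataclasses import dataclass

@dataclass
class SearcherSettings:
    index_path: str
    k1: float = 0.9
    b: float = 0.4
    rm3: bool = False
    fb_terms: int = 10
    fb_docs: int = 10
    original_query_weight: float = 0.5

searcher = build_searcher(SearcherSettings(index_path='msmarco-passage', rm3=True))
hits = searcher.search('what is a lobster roll', k=10)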
def extract_documents():
    experiments = ["run.cw12.bm25+rm3", "run.cw12.bm25"]
    # searcher = SimpleSearcher('/data/anserini/lucene-index.gov2.pos+docvectors+rawdocs')
    # searcher = SimpleSearcher.from_prebuilt_index('robust04')
    searcher = SimpleSearcher('/data/anserini/lucene-index.cw12b13.pos+docvectors+rawdocs')
    for experiment in experiments:
        file_address = "../data/cw12/" + experiment + ".txt"
        with open(file_address, "r") as index_file:
            if not os.path.exists("../data/cw12/" + experiment):
                os.makedirs("../data/cw12/" + experiment)
            for line_number, line in enumerate(index_file):
                # The docid is the third whitespace-separated field of a TREC run line.
                idx = line.split(" ")[2]
                write_address = "../data/cw12/" + experiment + "/" + idx + ".txt"
                doc = searcher.doc(idx)
                with open(write_address, "w") as file_to_write:
                    file_to_write.write(doc.raw())
                if line_number % 1000 == 0:
                    print(line_number)
def extract_expanded_documents():
    experiment = "unbiased_expansions"
    searcher = SimpleSearcher('/data/anserini/lucene-index.cw12b13.pos+docvectors+rawdocs')
    # searcher = SimpleSearcher.from_prebuilt_index('robust04')
    lambdas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for my_lambda in lambdas:
        print(my_lambda)
        my_directory = "../data/cw12/" + experiment + "/expanded_landa_" + str(my_lambda)
        file_address = my_directory + ".txt"
        with open(file_address, "r") as index_file:
            if not os.path.exists(my_directory):
                os.makedirs(my_directory)
            for line_number, line in enumerate(index_file):
                # The docid is the third whitespace-separated field of a TREC run line.
                idx = line.split(" ")[2]
                write_address = my_directory + "/" + idx + ".txt"
                doc = searcher.doc(idx)
                with open(write_address, "w") as file_to_write:
                    file_to_write.write(doc.raw())
def __init__(self, index_location, k=1000, wmodel="BM25", **kwargs):
    """
    Construct an AnseriniBatchRetrieve instance.

    Args:
        index_location(str): The location of the Anserini index.
        wmodel(str): Weighting model supported by Anserini. There are three options:
            * `"BM25"` - the BM25 weighting model
            * `"QLD"` - Dirichlet language modelling
            * `"TFIDF"` - Lucene's `ClassicSimilarity <https://lucene.apache.org/core/8_1_0/core/org/apache/lucene/search/similarities/ClassicSimilarity.html>`_.
        k(int): number of results to return. Default is 1000.
    """
    super().__init__(kwargs)
    self.index_location = index_location
    self.k = k
    _init_anserini()
    from pyserini.search import SimpleSearcher

    self.searcher = SimpleSearcher(index_location)
    self.wmodel = wmodel
    self._setsimilarty(wmodel)
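# A usage sketch for the constructor above, assuming the surrounding class is
# the AnseriniBatchRetrieve named in its docstring; the index path here is
# illustrative, not from the original code.
retriever = AnseriniBatchRetrieve('/path/to/anserini-index', k=100, wmodel='QLD')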
def _run_thread(arguments):
    idz = arguments["id"]
    index = arguments["index"]
    k = arguments["k"]
    data = arguments["data"]

    # TODO: BM25 parameters
    # bm25_a = arguments["bm25_a"]
    # bm25_b = arguments["bm25_b"]
    # searcher.set_bm25(bm25_a, bm25_b)

    searcher = SimpleSearcher(index)

    # Only the first thread shows a progress bar.
    _iter = data
    if idz == 0:
        _iter = tqdm(data)

    provenance = {}
    for x in _iter:
        query_id = x["id"]
        query = x["query"].replace(utils.ENT_END, "").replace(utils.ENT_START, "").strip()
        hits = searcher.search(query, k)

        element = []
        for y in hits:
            element.append({
                "score": y.score,
                "text": str(y.raw).strip(),
                "title": y.docid,
            })
        provenance[query_id] = element

    return provenance