def main(queries_file, qrels_file, output_file, write_negative):
    """Generate a feature file from a qrels file.

    For every (query, document) judgment in ``qrels_file``, computes
    features and writes one formatted line to ``output_file``. When
    ``write_negative`` is truthy, each positive judgment is paired with a
    randomly drawn negative document (target label 0).

    Args:
        queries_file: Path to the topics/queries file (read via read_topics).
        qrels_file: Path to a TSV qrels file: qid <tab> _ <tab> docid <tab> target.
        output_file: Path of the feature file to write.
        write_negative: If truthy, also emit a sampled negative example per
            positive one. The evaluation set doesn't need negative examples.
    """
    queries = read_topics(queries_file)
    index_reader = IndexReader('indexes/msmarco-passage')
    document_count = int(index_reader.stats()['documents'])
    # Fix: the qrels handle was previously opened without ever being
    # closed; manage both files with a single `with` so they are released
    # even if feature computation raises.
    with open(qrels_file, 'r') as qrels, \
            open(output_file, 'w') as output_file_handle:
        for line in qrels:
            fields = line.strip().split('\t')
            qid = int(fields[0])
            docid = fields[2]
            target = fields[3]
            query = queries[qid]['title']
            features = compute_features(index_reader, query, docid)
            output_file_handle.write(
                format_qrel_line(target, qid, features, docid))
            # The evaluation set doesn't need negative examples.
            if write_negative:
                negative_docid = str(get_negative_docid(document_count, docid))
                features = compute_features(index_reader, query,
                                            negative_docid)
                output_file_handle.write(
                    format_qrel_line(0, qid, features, negative_docid))
def _compute_idf(index_path):
    """Return a {token: idf} mapping for every term in the index.

    Uses the plain idf formulation idf(t) = log(N / df(t)), where N is the
    number of documents in the index and df(t) the term's document
    frequency.
    """
    from pyserini.index import IndexReader
    reader = IndexReader(index_path)
    num_docs = reader.stats()['documents']
    # Collect vocabulary and document frequencies in one pass.
    vocab = []
    doc_freqs = []
    for term in reader.terms():
        vocab.append(term.term)
        doc_freqs.append(term.df)
    idf_values = np.log(num_docs / np.array(doc_freqs))
    return dict(zip(vocab, idf_values))
def compute_idf(query_terms: List[str], index_reader: IndexReader) -> np.ndarray:
    """BM25-style idf per query term: log((|C| - df(term) + 0.5) / (df(term) + 0.5))."""
    num_docs = index_reader.stats()['documents']
    idfs = []
    for term in query_terms:
        # get_term_counts returns (df, cf); only the document frequency is used.
        df = index_reader.get_term_counts(term, analyzer=None)[0]
        idfs.append(np.log((num_docs - df + 0.5) / (df + 0.5)))
    return np.array(idfs)
def main():
    """Smoke-test a Pyserini installation against the MS MARCO passage index.

    Runs a known query, checks index statistics, topics, and the analyzer,
    and prints "INSTALLATION OK" when everything matches the expected
    values; otherwise prints the failure reason.
    """
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher object
        searcher = SimpleSearcher(index_loc)
        # Set the active scorer to BM25
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query
        results = searcher.search('this is a test query', k=3)
        # For all results print the docid and the score
        expected = ['5578280', '2016011', '7004677']
        docids = [x.docid for x in results]
        if expected != docids:
            # Fix: '(expecteD)' typo in the diagnostic message.
            raise Exception('Test query results do not match expected:',
                            expected, '(expected)', docids, '(actual)')

        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception(
                'There are an unexpected number of terms in your index set, perhaps something went wrong while downloading and indexing the dataset?'
            )

        topics = get_topics("msmarco-passage-dev-subset")
        if topics == {}:
            raise Exception(
                'Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.'
            )

        first_query = topics[list(topics.keys())[0]]['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception(
                'Found a different first query than expected in the dataset. Did you download the right dataset?'
            )

        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"

        # Tokenizing in pyserini is called Analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception(
                'Tokenizer is not working correctly, something is probably wrong in Anserini. Perhaps try to install Anserini again.'
            )
    except Exception as inst:
        # Top-level boundary: report the failure instead of crashing.
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
# Command-line options for BM25 retrieval over the MS MARCO passage index.
# NOTE(review): `parser` is created before this chunk — not visible here.
parser.add_argument('--msmarco_dir', type=str, default="./data")
parser.add_argument('--index_dir', type=str, default="./data/index")
parser.add_argument('--output_dir', type=str, default="./data/bm25_result")
parser.add_argument('--bm25_k1', type=float, default=0.6)
parser.add_argument('--bm25_b', type=float, default=0.8)
parser.add_argument('--threads', type=int, default=4)
parser.add_argument('--sample', type=int, default=0)
args = parser.parse_args()

# Make sure the per-query result directory exists before any writes.
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# One reader (for stats) and one searcher (for retrieval) over the same index.
indexer = IndexReader(args.index_dir)
searcher = SimpleSearcher(args.index_dir)
searcher.set_bm25(k1=args.bm25_k1, b=args.bm25_b)
# k is set to the full collection size, i.e. every document is ranked.
num_candidates = indexer.stats()['documents']


def calculate_bm25(query):
    """Rank the whole collection for one (qid, text) pair and write a
    `<docid>\\t<score>` TSV file named `<qid>.tsv` in the output dir."""
    qid, text = query
    with open(os.path.join(args.output_dir, f"{qid}.tsv"), 'w') as outfile:
        candidates = searcher.search(text, k=num_candidates)
        for i in range(len(candidates)):
            outfile.write(f"{candidates[i].docid}\t{candidates[i].score}\n")


if __name__ == "__main__":
    # load the queries
    queries = dict()
    for line in open(os.path.join(args.msmarco_dir, f"queries.dev.tsv"), 'r'):
        # NOTE(review): `query` keeps its trailing newline here — presumably
        # stripped or tolerated downstream; loop body continues past this chunk.
        qid, query = line.split('\t')
'cf': 1005023 }, { 'term': 'also', 'cf': 991428 }, { 'term': 'mai', 'cf': 955836 }, { 'term': 'most', 'cf': 927327 }, { 'term': 'about', 'cf': 909980 }] total_words = index_reader.stats()['total_terms'] def dirich(freq_term_in_doc, total_words_in_doc, freq_term_in_collection, total_words, mu=1000, log=True): output = 0 if log: output = math.log( (freq_term_in_doc + mu * (freq_term_in_collection / total_words)) / (total_words_in_doc + mu)) else: