Example #1
def main():
    parser = argparse.ArgumentParser("ACL Anthology document DynamoDB bulk importer")
    parser.add_argument("--index", required=True, type=str, help="Path to ACL Anthology Lucene index")
    parser.add_argument("--table", default="ACL", type=str, help="Dynamo table to insert the raw ACL documents to")
    parser.add_argument("--batch-size", dest="batch", default=MAX_BATCH_SIZE, help="The size of batch insert to Dynamo")
    parser.add_argument("--threads", default=5, type=int, help="Number of threads for batch inserts")
    parser.add_argument("--report-interval", dest="report_interval", default=500, type=int, help="Output progress interval")
    args = parser.parse_args()

    # TODO: use https://github.com/castorini/pyserini/blob/master/docs/usage-collection.md once AclAnthology support is added
    searcher = SimpleSearcher(args.index)

    progress = 0
    next_report_threshold = args.report_interval
    batches = build_item_batches(searcher, args.batch)
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        futures = {executor.submit(batch_write_dynamo, args.table, batch): batch for batch in batches}
        for future in concurrent.futures.as_completed(futures):
            batch = futures[future]
            try:
                failed_docids = future.result()
                if failed_docids:
                    logger.error("Error writing documents %s" % failed_docids)
            except Exception:
                batch_ids = [item["id"] for item in batch]
                logger.exception("Error writing batch %s" % batch_ids)
            finally:
                progress += len(batch)
                if progress > next_report_threshold:
                    logger.info("Processed %s/%s records" % (progress, searcher.num_docs))
                    next_report_threshold += args.report_interval
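The two helpers this snippet calls aren't shown. A minimal sketch of what they might look like, assuming each item carries the document id and raw text, and that writes go through boto3's batch_write_item (the helper names and item shape are inferred from the call sites; MAX_BATCH_SIZE would be DynamoDB's 25-item limit):

import boto3

MAX_BATCH_SIZE = 25  # DynamoDB's batch_write_item limit

def build_item_batches(searcher, batch_size):
    # Walk the index by internal Lucene docid and chunk documents into batches.
    batch = []
    for i in range(searcher.num_docs):
        doc = searcher.doc(i)
        batch.append({"id": doc.docid(), "raw": doc.raw()})
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

def batch_write_dynamo(table, batch):
    # One batch_write_item call; returns ids of any unprocessed items (no retry logic here).
    client = boto3.client("dynamodb")
    request = {table: [{"PutRequest": {"Item": {"id": {"S": item["id"]},
                                                "raw": {"S": item["raw"]}}}}
                       for item in batch]}
    response = client.batch_write_item(RequestItems=request)
    unprocessed = response.get("UnprocessedItems", {}).get(table, [])
    return [r["PutRequest"]["Item"]["id"]["S"] for r in unprocessed]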
Example #2
def sampling(args):
    # load the positive doc
    qrels = defaultdict(list)
    for line in open(os.path.join(args.msmarco_dir, f"qrels.{args.mode}.tsv"),
                     'r'):
        qid, _, pid, _ = line.split('\t')
        qrels[qid].append(int(pid))
    qrels = dict(qrels)

    # load the queries
    queries = dict()
    for line in open(
            os.path.join(args.msmarco_dir, f"queries.{args.mode}.tsv"), 'r'):
        qid, query = line.split('\t')
        query = query.rstrip()
        queries[qid] = query

    searcher = SimpleSearcher(args.index_dir)
    searcher.set_bm25(k1=args.bm25_k1, b=args.bm25_b)

    with open(os.path.join(args.output_dir, f'top_candidates.{args.mode}.tsv'),
              'w') as outfile:
        for qid in tqdm(qrels):
            query = queries[qid]
            candidates = searcher.search(query, k=args.topN)
            for i in range(len(candidates)):
                outfile.write(
                    f"{qid}\t{candidates[i].docid}\t{candidates[i].score}\n")
Example #3
def bm25(qid, query, docs, index_path):
    s = SimpleSearcher(index_path)
    hits = s.search(query, 1000)

    n = 1
    seen_docids = {}
    with open(f'run-passage-{qid}.txt', 'w') as writer:
        for i in range(0, len(hits)):
            if hits[i].docid in seen_docids:
                continue
            writer.write(f'{qid} Q0 {hits[i].docid} {n} {hits[i].score:.5f} pyserini\n')
            n = n + 1
            seen_docids[hits[i].docid] = 1

    with open(f'run-doc-{qid}.txt', 'w') as writer:
        for doc in docs:
            writer.write(f'{qid} Q0 {doc["docid"]} {doc["rank"]} {doc["score"]} base\n')

    os.system(f'python -m pyserini.fusion --method rrf --runs run-passage-{qid}.txt run-doc-{qid}.txt ' +
              f'--output run-rrf-{qid}.txt --runtag test')
    fused_run = TrecRun(f'run-rrf-{qid}.txt')

    output = []
    for idx, r in fused_run.get_docs_by_topic(qid).iterrows():
        output.append([qid, r["docid"], r["rank"]])

    return output
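The os.system shell-out can also be done in-process; a sketch using Pyserini's fusion API in place of the subprocess call inside bm25() (available in recent Pyserini versions; exact signatures may differ):

from pyserini.fusion import reciprocal_rank_fusion
from pyserini.trectools import TrecRun

runs = [TrecRun(f'run-passage-{qid}.txt'), TrecRun(f'run-doc-{qid}.txt')]
fused_run = reciprocal_rank_fusion(runs, rrf_k=60, depth=1000, k=1000)
fused_run.save_to_txt(f'run-rrf-{qid}.txt', tag='test')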
Example #4
 def __init__(self, index):
     self._index = SimpleSearcher(index)
     self._query = None
     self._docids = []
     self._doc_content = []
     self._doc_scores = []
     self._doc_embeddings = []
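The constructor above only initializes state; a hypothetical search method that fills those fields might look like this (the method name and the use of hit.raw are assumptions):

 def search(self, query, k=10):
     # Run the query and cache ids, raw contents, and scores on the instance.
     self._query = query
     hits = self._index.search(query, k=k)
     self._docids = [hit.docid for hit in hits]
     self._doc_content = [hit.raw for hit in hits]
     self._doc_scores = [hit.score for hit in hits]
     return hits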
Example #5
 def __init__(
         self,
         index_dir="/nfs/phd_by_carlos/notebooks/datasets/TREC_CAsT/CAsT_collection_with_meta.index",
         k1=0.82,
         b=0.68,
         **kwargs):
     self.searcher = SimpleSearcher(index_dir)
Example #6
def main():
    # This assumes the index has already been generated
    searcher = SimpleSearcher('indexes/msmarco-passage')
    searcher.set_qld()

    topics = read_topics('msmarco-test2019-queries.tsv')

    run_all_queries('runs/run.msmarco-test2019-queries-bm25.trec', topics,
                    searcher)
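read_topics comes from pyserini.search; run_all_queries isn't shown, but the standard Pyserini demo pattern looks roughly like this (the run tag is an assumption):

def run_all_queries(file, topics, searcher):
    with open(file, 'w') as runfile:
        for id in topics:
            query = topics[id]['title']
            hits = searcher.search(query, 1000)
            for i, hit in enumerate(hits):
                runfile.write(f'{id} Q0 {hit.docid} {i + 1} {hit.score:.6f} Pyserini\n')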
Example #7
def main():
    # This assumes the index has already been generated
    searcher = SimpleSearcher('indexes/msmarco-passage')
    # searcher.set_bm25(0.82, 0.68)
    searcher.set_rm3(fb_terms=25, fb_docs=50, original_query_weight=0.5)

    topics = read_topics('msmarco-test2019-queries.tsv')

    run_all_queries('runs/run.msmarco-test2019-queries-bm25.trec', topics,
                    searcher)
Example #8
 def __init__(self, ranker, index, topn=10, topw=10, original_q_w=0.5):
     RelevanceFeedback.__init__(self,
                                ranker=ranker,
                                prels=None,
                                anserini=None,
                                index=index,
                                topn=topn)
     self.topw = topw
     self.searcher = SimpleSearcher(index)
     self.ranker = ranker
     self.original_q_w = original_q_w
Example #9
    def initialize(self):
        from pyserini.search import SimpleSearcher

        self.searcher = SimpleSearcher(str(self.index.path))

        modelhandler = Handler()

        @modelhandler()
        def handle(bm25: BM25):
            self.searcher.set_bm25(bm25.k1, bm25.b)

        modelhandler[self.model]
Example #10
    def retrieve_and_get_topn_relevant_docids(self, q):
        relevant_documents = []
        searcher = SimpleSearcher(self.index)

        if self.ranker == 'bm25':
            searcher.set_bm25()
        elif self.ranker == 'qld':
            searcher.set_qld()
        hits = searcher.search(q, k=self.topn)
        for i in range(0, self.topn):
            relevant_documents.append(hits[i].docid)
        return relevant_documents
Example #11
 def index(self):
     self._mkdir('./index/')
     self._mkdir('./index/convert/')
     self._mkdir('./index/chunks/')
     self._make_chuncks("./data/livivo/documents/")
     p = Pool()
     p.map(self._convert_chunks, os.listdir("./index/chunks/"))
     p.close()
     shutil.rmtree('./index/chunks')
     JIndexCollection.main(ARGS)
     self.searcher = SimpleSearcher('./index/')
     shutil.rmtree('./index/convert/')
Example #12
def build_searcher(
        k1=0.9,
        b=0.4,
        index_path="index/lucene-index.wiki_paragraph_drqa.pos+docvectors",
        segmented=False,
        rm3=False,
        chinese=False):
    searcher = SimpleSearcher(index_path)
    searcher.set_bm25(k1, b)
    if chinese:
        searcher.object.setLanguage("zh")
        print("########### we are usinig Chinese retriever ##########")
    return searcher
Example #13
 def __init__(self):
     self.searcher = SimpleSearcher(PATH_TO_WIKI_INDEX)
     self.searcher.set_bm25()
     self.searcher.unset_rm3()
     self.processor = SquadV2Processor()
     self.k = 29
     self.mu = 0.5
     self.use_ir_score = True
     self.tokenizer = BertTokenizer.from_pretrained(PATH_TO_DILBERT,
                                                    do_lower_case=True)
     self.model = DilBert.from_pretrained(PATH_TO_DILBERT)
     self.device = DEVICE_COMP
     self.model.to(torch.device(self.device))
Example #14
def main(output_path=OUTPUT_PATH,
         index_path=INDEX_PATH,
         queries_path=QUERIES_PATH,
         run=RUN,
         k=K):
    print('################################################')
    print("##### Performing Passage Ranking using L2R #####")
    print('################################################')
    print("Output will be placed in:", output_path,
          ", format used will be TREC")
    print('Loading pre-trained model MonoT5...')
    from pygaggle.rerank.transformer import MonoT5
    reranker = MonoT5()

    print('Fetching anserini-like indices from:', index_path)
    # fetch some passages to rerank from MS MARCO with Pyserini (BM25)
    searcher = SimpleSearcher(index_path)
    print('Loading queries from:', queries_path)
    with open(queries_path, 'r') as f:
        content = f.readlines()
        content = [x.strip().split('\t') for x in content]
        queries = [Query(x[1], x[0]) for x in content]
    print(f'Ranking queries using BM25 (k={k})')
    queries_text = []
    for query in tqdm(queries):
        hits = searcher.search(query.text, k=k)
        texts = hits_to_texts(hits)
        queries_text.append(texts)

    print('Reranking all queries using MonoT5!')
    rankings = []

    for (i, query) in enumerate(tqdm(queries)):
        reranked = reranker.rerank(query, queries_text[i])
        reranked.sort(key=lambda x: x.score, reverse=True)
        rankings.append(reranked)

    print('Outputting to file...')
    if output_path.endswith('.tsv'):
        output_to_tsv(queries, rankings, run, output_path)
    elif output_path.endswith('.csv'):
        output_to_csv(queries, rankings, run, output_path)
    else:
        print(
            'ERROR: invalid output file format provided, please use either .csv or .tsv. Exiting'
        )
        sys.exit(1)
    print('SUCCESS: completed reranking, you may check the output at:',
          output_path)
    sys.exit(0)
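output_to_tsv isn't shown; a plausible shape, assuming pygaggle Text objects whose metadata carries the docid (the column order is an assumption):

def output_to_tsv(queries, rankings, run, output_path):
    # One line per (query, reranked passage), most relevant first.
    with open(output_path, 'w') as f:
        for query, ranking in zip(queries, rankings):
            for rank, text in enumerate(ranking, start=1):
                f.write(f'{query.id}\t{text.metadata["docid"]}\t{rank}\t{text.score}\t{run}\n')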
Example #15
    def setUp(self):
        # Download pre-built CACM index; append a random value to avoid filename clashes.
        r = randint(0, 10000000)
        self.collection_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.tar.gz'
        self.tarball_name = 'lucene-index.cacm-{}.tar.gz'.format(r)
        self.index_dir = 'index{}/'.format(r)

        filename, headers = urlretrieve(self.collection_url, self.tarball_name)

        tarball = tarfile.open(self.tarball_name)
        tarball.extractall(self.index_dir)
        tarball.close()

        self.searcher = SimpleSearcher(f'{self.index_dir}lucene-index.cacm')
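A setUp that downloads and extracts an index usually needs the matching cleanup; a minimal tearDown sketch, assuming os and shutil are imported:

    def tearDown(self):
        # Remove the downloaded tarball and the extracted index directory.
        os.remove(self.tarball_name)
        shutil.rmtree(self.index_dir, ignore_errors=True)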
Example #16
def load_ranker(args):
    if args.sparse and args.dense:
        sparse_searcher = SimpleSearcher(args.sparse_index_path)
        sparse_searcher.set_bm25(args.k, args.b)
        sparse_searcher.set_rm3(args.expansion_terms, args.expansion_documents,
                                args.original_query_weight)
        encoder = TCTColBERTQueryEncoder('castorini/tct_colbert-msmarco')
        dense_searcher = SimpleDenseSearcher(args.dense_index_path, encoder)
        hsearcher = HybridSearcher(dense_searcher, sparse_searcher)
        return hsearcher
    elif args.sparse:
        sparse_searcher = SimpleSearcher(args.sparse_index_path)
        sparse_searcher.set_bm25(args.k, args.b)
        sparse_searcher.set_rm3(args.expansion_terms, args.expansion_documents,
                                args.original_query_weight)
        return sparse_searcher
    elif args.dense:
        encoder = TCTColBERTQueryEncoder('castorini/tct_colbert-msmarco')
        dense_searcher = SimpleDenseSearcher(args.dense_index_path, encoder)
        return dense_searcher
    else:
        print(
            "Choose a valid ranking function: sparse (BM25), dense (vector), or a combination of the two"
        )
        exit(1)
Example #17
def main(args):
    query = args.query
    index = args.index
    if args.do_tokenize:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
        query = " ".join(tokenizer.tokenize(query))

    logger.info(f'searching for: {query}')
    searcher = SimpleSearcher(index)
    searcher.set_analyzer(JWhiteSpaceAnalyzer())
    hits = searcher.search(query, 1000)

    for i, hit in enumerate(hits):
        print(f'{i + 1:2} {hit.docid:4} {hit.score:.5f}')
Example #18
 def __init__(
     self,
     index_folder: str,
     n_threads: int = 1,
     top_n: int = 5,
     text_column_name: str = "contents",
     return_scores: bool = False,
     *args,
     **kwargs,
 ):
     self.searcher = SimpleSearcher(str(expand_path(index_folder)))
     self.n_threads = n_threads
     self.top_n = top_n
     self.text_column_name = text_column_name
     self.return_scores = return_scores
Example #19
def _run_thread(arguments):
    idz = arguments["id"]
    index = arguments["index"]
    k = arguments["k"]
    data = arguments["data"]

    # BM25 parameters #TODO
    # bm25_a = arguments["bm25_a"]
    # bm25_b = arguments["bm25_b"]
    # searcher.set_bm25(bm25_a, bm25_b)

    from pyserini.search import SimpleSearcher

    searcher = SimpleSearcher(index)

    _iter = data
    if idz == 0:
        _iter = tqdm(data)

    provenance = {}
    for x in _iter:
        query_id = x["id"]
        query = (
            x["query"].replace(utils.ENT_END, "").replace(utils.ENT_START, "").strip()
        )

        hits = searcher.search(query, k)

        element = []
        for y in hits:
            try:
                doc_data = json.loads(str(y.docid).strip())
                doc_data["score"] = y.score
                doc_data["text"] = str(y.raw).strip()
                element.append(doc_data)
            except Exception as e:
                print(e)
                element.append(
                    {
                        "score": y.score,
                        "text": str(y.raw).strip(),
                        "title": y.docid,
                    }
                )
        provenance[query_id] = element

    return provenance
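_run_thread expects one work dict per thread; a hypothetical driver that shards the data and merges the per-shard provenance (the function name and sharding scheme are assumptions):

from multiprocessing.pool import ThreadPool

def run_threads(index_path, data, k, num_threads=4):
    # Round-robin shard the queries, one shard per thread.
    shards = [data[i::num_threads] for i in range(num_threads)]
    arguments = [{"id": i, "index": index_path, "k": k, "data": shard}
                 for i, shard in enumerate(shards)]
    with ThreadPool(num_threads) as pool:
        results = pool.map(_run_thread, arguments)
    provenance = {}
    for result in results:
        provenance.update(result)
    return provenance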
Example #20
    def index(self):

        data = []

        with jsonlines.open(
                './data/gesis-search/datasets/dataset.jsonl') as reader:
            for obj in reader:
                title = obj.get('title') or ''
                title = title[0] if type(title) is list else title
                abstract = obj.get('abstract') or ''
                abstract = abstract[0] if type(abstract) is list else abstract
                try:
                    data.append({
                        'id': obj.get('id'),
                        'contents': ' '.join([title, abstract])
                    })
                except Exception as e:
                    print(e)

        try:
            os.mkdir('./convert/')
        except OSError as error:
            print(error)

        with jsonlines.open('./convert/output.jsonl', mode='w') as writer:
            for doc in data:
                writer.write(doc)

        try:
            os.mkdir('./indexes/')
        except OSError as error:
            print(error)

        args = [
            "-collection", "JsonCollection", "-generator",
            "DefaultLuceneDocumentGenerator", "-threads", "1", "-input",
            "./convert", "-index", "./indexes/gesis", "-storePositions",
            "-storeDocvectors", "-storeRaw"
        ]

        JIndexCollection.main(args)
        self.searcher = SimpleSearcher('indexes/gesis')

        with jsonlines.open(
                './data/gesis-search/documents/publication.jsonl') as reader:
            for obj in reader:
                self.title_lookup[obj.get('id')] = obj.get('title')
Example #21
def main():
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher object
        searcher = SimpleSearcher(index_loc)
        # Set the active scorer to BM25
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query
        results = searcher.search('this is a test query', k=3)
        # For all results print the docid and the score
        expected = ['5578280', '2016011', '7004677']
        docids = [x.docid for x in results]
        if expected != docids:
            raise Exception('Test query results do not match expected:',
                            expected, '(expected)', docids, '(actual)')
        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception(
                'There are an unexpected number of terms in your index set, perhaps something went wrong while downloading and indexing the dataset?'
            )
        topics = get_topics("msmarco-passage-dev-subset")
        if topics == {}:
            raise Exception(
                'Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.'
            )
        first_query = topics[list(topics.keys())[0]]['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception(
                'Found a different first query than expected in the dataset. Did you download the right dataset?'
            )
        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"
        # Tokenizing in pyserini is called Analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception(
                'Tokenizer is not working correctly, something is probably wrong in Anserini. Perhaps try to install Anserini again.'
            )
    except Exception as inst:
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
Example #22
        def __init__(self, candidates, num_candidates_samples, path_index, sample_data, anserini_folder, set_rm3=False, seed=42):
            random.seed(seed)
            self.candidates = candidates
            self.num_candidates_samples = num_candidates_samples
            self.path_index = path_index
            if set_rm3:
                self.name = "BM25RM3NS"
            else:
                self.name = "BM25NS"
            self.sample_data = sample_data
            self.anserini_folder = anserini_folder
            self._create_index()

            self.searcher = SimpleSearcher(self.path_index+"anserini_index")
            self.searcher.set_bm25(0.9, 0.4)
            if set_rm3:
                self.searcher.set_rm3()
Example #23
def write_out(path_index, path_out, query_operation):
    # NOTE: `number` is assumed to be a module-level list of topic ids parallel to query_operation.
    searcher = SimpleSearcher(path_index)
    index_utils = index.IndexReader(path_index)

    f = open(path_out, "a")
    searcher.set_bm25(0.9, 0.4)
    searcher.set_rm3(10, 10, 0.5)
    searcher.set_qld(400)  # NOTE: this overrides the BM25 similarity set above; keep only one.
    for x in range(len(number)):
        hits = searcher.search(query_operation[x], 100)

        # Print and write the top hits in TREC run format:
        for i in range(len(hits)):
            print(f'{number[x]} Q0 {hits[i].docid:15} {i + 1:2} {hits[i].score:.5f} JUNE')
            f.write(f'{number[x]} Q0 {hits[i].docid:15} {i + 1:2} {hits[i].score:.5f} JUNE\n')
    f.close()
Example #24
    def __init__(self, name, num_threads, index_dir=None, k1=0.9, b=0.4, use_bigrams=False, stem_bigrams=False):
        super().__init__(name)

        self.num_threads = min(num_threads, int(multiprocessing.cpu_count()))

        # initialize a ranker per thread
        self.arguments = []
        for id in tqdm(range(self.num_threads)):
            ranker = SimpleSearcher(index_dir)
            ranker.set_bm25(k1, b)
            self.arguments.append(
                {
                    "id": id,
                    "ranker": ranker,
                    "use_bigrams": use_bigrams,
                    "stem_bigrams": stem_bigrams
                }
            )
Example #25
 def __init__(
     self,
     k,
     index_loc='../../anserini/indexes/lucene-wapost.v2.pos+docvectors+raw'
 ):
     self.utils = Utils()
     # Make sure you have produced this lucene index before
     self.index_loc = index_loc
     self.searcher = SimpleSearcher(self.index_loc)
     self.k = k  # number of hits to return
     self.searcher.set_bm25(k1=0.9, b=0.4)  # BM25 params
     #searcher.set_rm3(10, 10, 0.5)  # relevance feedback
     self.batch_hits = {}
     self.topics = get_topics('core18')
     self.query_ids = [str(id) for id in self.topics.keys()]
     self.queries = [topic['title'] for topic in self.topics.values()]
     self.doc_ids = {}
     self.scores = {}
Example #26
def build_searcher(settings: SearcherSettings) -> SimpleSearcher:
    if path.isdir(settings.index_path):
        searcher = SimpleSearcher(settings.index_path)
    else:
        searcher = SimpleSearcher.from_prebuilt_index(settings.index_path)
    searcher.set_bm25(float(settings.k1), float(settings.b))
    logging.info(
        "Initializing BM25, setting k1={} and b={}".format(settings.k1, settings.b)
    )
    if settings.rm3:
        searcher.set_rm3(
            settings.fb_terms, settings.fb_docs, settings.original_query_weight
        )
        logging.info(
            "Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}".format(
                settings.fb_terms, settings.fb_docs, settings.original_query_weight
            )
        )
    return searcher
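SearcherSettings isn't defined in the snippet; a minimal dataclass inferred from the fields build_searcher reads (the defaults are assumptions):

from dataclasses import dataclass

@dataclass
class SearcherSettings:
    index_path: str            # local index dir, or a prebuilt-index name
    k1: float = 0.9            # BM25 parameters
    b: float = 0.4
    rm3: bool = False          # enable RM3 pseudo-relevance feedback
    fb_terms: int = 10
    fb_docs: int = 10
    original_query_weight: float = 0.5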
Example #27
def extract_documents():
    experiments = ["run.cw12.bm25+rm3", "run.cw12.bm25"]
    # searcher = SimpleSearcher('/data/anserini/lucene-index.gov2.pos+docvectors+rawdocs')
    # searcher = SimpleSearcher.from_prebuilt_index('robust04')
    searcher = SimpleSearcher(
        '/data/anserini/lucene-index.cw12b13.pos+docvectors+rawdocs')

    for experiment in experiments:
        file_address = "../data/cw12/" + experiment + ".txt"
        with open(file_address, "r") as index_file:
            if not os.path.exists("../data/cw12/" + experiment):
                os.makedirs("../data/cw12/" + experiment)
            for line_number, line in enumerate(index_file):
                # print(line.split(" ")[3])
                idx = line.split(" ")[2]
                write_address = "../data/cw12/" + experiment + "/" + idx + ".txt"
                doc = searcher.doc(idx)
                with open(write_address, "w") as file_to_write:
                    file_to_write.write(doc.raw())
                if line_number % 1000 == 0:
                    print(line_number)
Example #28
def extract_expanded_documents():
    experiment = "unbiased_expansions"
    searcher = SimpleSearcher(
        '/data/anserini/lucene-index.cw12b13.pos+docvectors+rawdocs')

    # searcher = SimpleSearcher.from_prebuilt_index('robust04')
    lambdas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for my_lambda in lambdas:
        print(my_lambda)
        my_directory = "../data/cw12/" + experiment + \
                       "/expanded_landa_" + str(my_lambda)
        file_address = my_directory + ".txt"
        with open(file_address, "r") as index_file:
            if not os.path.exists(my_directory):
                os.makedirs(my_directory)
            for line_number, line in enumerate(index_file):
                # print(line.split(" ")[3])
                idx = line.split(" ")[2]
                write_address = my_directory + "/" + idx + ".txt"
                doc = searcher.doc(idx)
                with open(write_address, "w") as file_to_write:
                    file_to_write.write(doc.raw())
Example #29
    def __init__(self, index_location, k=1000, wmodel="BM25", **kwargs):
        """
            Construct an AnseriniBatchRetrieve instance.

            Args:

                index_location(str): The location of the Anserini index.
                wmodel(str): Weighting models supported by Anserini. There are three options: 
                
                 * `"BM25"` - the BM25 weighting model
                 * `"QLD"`  - Dirichlet language modelling
                 *  `"TFIDF"` - Lucene's `ClassicSimilarity <https://lucene.apache.org/core/8_1_0/core/org/apache/lucene/search/similarities/ClassicSimilarity.html>`_.
                k(int): number of results to return. Default is 1000.
        """
        super().__init__(kwargs)
        self.index_location = index_location
        self.k = k
        _init_anserini()
        from pyserini.search import SimpleSearcher
        self.searcher = SimpleSearcher(index_location)
        self.wmodel = wmodel
        self._setsimilarty(wmodel)
Example #30
def _run_thread(arguments):
    idz = arguments["id"]
    index = arguments["index"]
    k = arguments["k"]
    data = arguments["data"]

    # BM25 parameters #TODO
    # bm25_a = arguments["bm25_a"]
    # bm25_b = arguments["bm25_b"]
    # searcher.set_bm25(bm25_a, bm25_b)

    searcher = SimpleSearcher(index)

    _iter = data
    if idz == 0:
        _iter = tqdm(data)

    provenance = {}
    for x in _iter:
        query_id = x["id"]
        query = (
            x["query"].replace(utils.ENT_END, "").replace(utils.ENT_START, "").strip()
        )

        hits = searcher.search(query, k)

        element = []
        for y in hits:
            element.append(
                {
                    "score": y.score,
                    "text": str(y.raw).strip(),
                    "title": y.docid,
                }
            )
        provenance[query_id] = element

    return provenance