Example #1
    def test_query_environment(self):
        env = pyndri.QueryEnvironment(
            self.index,
            rules=('method:linear,collectionLambda:0.4,documentLambda:0.2',))

        self.assertEqual(
            env.query('ipsum'),
            ((1, -4.911066480756002),))

        self.assertEqual(
            env.query('his'),
            ((2, -4.6518844642777),
             (3, -6.1469416959076195)))

        another_env = pyndri.QueryEnvironment(
            self.index,
            rules=('method:linear,collectionLambda:1.0,documentLambda:0.0',))

        self.assertEqual(
            another_env.query('ipsum'),
            ((1, -6.595780513961311),))

        self.assertEqual(
            another_env.query('his'),
            ((3, -5.902633333401366),
             (2, -5.902633333401366)))
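Note: query() returns (internal_document_id, log_probability) tuples. A minimal sketch (assuming an index at 'index/') of resolving internal ids to external document identifiers with index.document(), as Examples #4 and #7 do:

import pyndri

index = pyndri.Index('index/')  # assumed path
env = pyndri.QueryEnvironment(
    index,
    rules=('method:linear,collectionLambda:0.4,documentLambda:0.2',))

for int_doc_id, log_prob in env.query('ipsum'):
    # index.document() returns (external document id, token id sequence).
    ext_doc_id, _ = index.document(int_doc_id)
    print(ext_doc_id, log_prob)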
Example #2
    def __init__(self, env: str = 'default', verbose: bool = False, avg_len: bool = False):
        if verbose:
            helpers.log(f'Loading index {INDRI_INDEX_DIR} with {env} query environment.')
        start = datetime.now()

        self.index = pyndri.Index(f'{INDRI_INDEX_DIR}')
        self.token2id, self.id2token, self.id2df = self.index.get_dictionary()
        self.id2tf = self.index.get_term_frequencies()

        if avg_len:
            # Average document length over all documents in the index:
            doc_lengths = np.empty(self.index.document_count(), dtype=np.float64)
            for idx, doc_iid in enumerate(
                    range(self.index.document_base(), self.index.maximum_document())):
                doc_lengths[idx] = self.index.document_length(doc_iid)
            self.avg_doc_len = float(doc_lengths.mean())

        self.tokenizer = Tokenizer()

        if os.path.isfile(TITLE2WID):
            with open(TITLE2WID, 'rb') as file:
                self.title2wid = pickle.load(file)

        if os.path.isfile(WID2TITLE):
            with open(WID2TITLE, 'rb') as file:
                self.wid2title = pickle.load(file)
        try:
            if os.path.isfile(WID2INT):
                with open(WID2INT, 'rb') as file:
                    self.wid2int = pickle.load(file)

            if os.path.isfile(INT2WID):
                with open(INT2WID, 'rb') as file:
                    self.int2wid = pickle.load(file)
        except FileNotFoundError:
            helpers.log('ID mappings do not exist yet. Not loaded.')

        if env == 'default':
            self.env = pyndri.QueryEnvironment(self.index)
        elif env == 'tfidf':
            self.env = pyndri.TFIDFQueryEnvironment(self.index, k1=1.2, b=0.75)
        elif env == 'prf':
            env = pyndri.QueryEnvironment(self.index)
            self.env = pyndri.PRFQueryEnvironment(env, fb_docs=10, fb_terms=10)
        else:
            raise ValueError(f'Unknown environment configuration {env}')

        stop = datetime.now()
        if verbose:
            helpers.log(f'Loaded index in {stop - start}.')
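The average-length loop above can also be written without materializing the intermediate array; a minimal sketch using the same pyndri calls (index is the pyndri.Index opened in __init__):

total_length = sum(
    index.document_length(doc_iid)
    for doc_iid in range(index.document_base(), index.maximum_document()))
avg_doc_len = total_length / index.document_count()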
Example #3
    def test_prf_query_environment(self):
        initial_query_env = pyndri.QueryEnvironment(
            self.index,
            rules=('method:linear,collectionLambda:0.4,documentLambda:0.2', ))

        prf_query_env = pyndri.PRFQueryEnvironment(initial_query_env)

        results = prf_query_env.query('consectetur adipiscing')

        self.assertEqual(len(results), 2)
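PRFQueryEnvironment also accepts explicit feedback-depth parameters, as used in Examples #2 and #8; a minimal sketch:

prf_query_env = pyndri.PRFQueryEnvironment(
    initial_query_env, fb_docs=10, fb_terms=10)
results = prf_query_env.query('consectetur adipiscing')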
Example #4
def run_queries(index_path, scorer_module, scorer_class, params, queries=()):
    """
    Parsl app instantiates a scorer, sets the parameters,
    runs the query, returns the result
    """

    module = importlib.import_module(scorer_module)
    class_ = getattr(module, scorer_class)
    scorer_instance = class_()

    # Open the index; assumes this worker has direct access to index_path.
    index = pyndri.Index(index_path)
    term_count = index.total_terms()

    # initial retrieval
    try:
        rule = 'method:dirichlet,mu:%s' % params['mu']
        query_env = pyndri.QueryEnvironment(index, rules=(rule,))
        hits = query_env.query(queries[1], results_requested=1000)
        # hits = index.query(queries[1], rules=(rule,), results_requested=1000)

        results = []
        for doc_id, score in hits:
            docno, tokens = index.document(doc_id)
            doc_vector = Counter(tokens)
            doc_len = float(index.document_length(doc_id))

            new_score = scorer_instance.score(query_vector=queries[2],
                                              document_vector=doc_vector,
                                              doc_length=doc_len,
                                              term_count=term_count,
                                              col_prob=queries[3],
                                              params=params)

            # TODO: rescore
            results.append((queries[0], docno, new_score))
    finally:
        index.close()

    return results
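A hypothetical invocation of run_queries; the module, class, and path names are illustrative, and the shape of queries is inferred from its use above:

results = run_queries(
    index_path='/path/to/index',        # assumed index location
    scorer_module='scorers.dirichlet',  # hypothetical module
    scorer_class='DirichletScorer',     # hypothetical class
    params={'mu': 2500},
    queries=('Q1', 'hello world',
             {'hello': 1, 'world': 1},  # query term vector
             0.001))                    # collection probability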
Example #5
    def test_query_expander(self):
        query_env = pyndri.QueryEnvironment(
            self.index,
            rules=('method:linear,collectionLambda:0.4,documentLambda:0.2', ))

        query_expander = pyndri.QueryExpander(query_env)

        self.assertEqual(
            query_expander.expand('consectetur adipiscing'),
            '#weight( 0.50000000000000000000000000000000 '
            '#combine( consectetur adipiscing ) '
            '0.50000000000000000000000000000000 '
            '#weight(  0.03409090909090908838585676221555 "in"  '
            '0.03409090909090908838585676221555 '
            '"eget"  0.03409090909090908838585676221555 "consectetur"  '
            '0.03409090909090908838585676221555 '
            '"nulla"  0.02272727272727272790353580944611 "amet"  '
            '0.02272727272727272790353580944611 '
            '"fringilla"  0.02272727272727272790353580944611 "porta"  '
            '0.02272727272727272790353580944611 '
            '"tort"  0.02272727272727272790353580944611 "lorem"  '
            '0.02272727272727272790353580944611 '
            '"arcu"  ) ) ')
Example #6
    prefix = '' if len(sys.argv) == 2 else '{}_'.format(sys.argv[2].upper())

    print('export {}NUM={}'.format(prefix, num_documents))
    print('export {}LENGTH_MEDIAN={}'.format(prefix, median))
    print('export {}LENGTH_MODE={}'.format(prefix, mode.mode[0]))
    print('export {}LENGTH_MEAN={}'.format(prefix, mean))
    print('export {}LENGTH_MIN={}'.format(prefix, min_))
    print('export {}LENGTH_MAX={}'.format(prefix, max_))
    print('export {}LENGTH_STD={}'.format(prefix, std))
    print('export {}TOTAL_TERMS={}'.format(prefix, index.total_terms()))
    print('export {}UNIQUE_TERMS={}'.format(prefix, index.unique_terms()))

    with pyndri.open(sys.argv[1]) as index:
        # Constructs a QueryEnvironment that uses a
        # language model with Dirichlet smoothing.
        lm_query_env = pyndri.QueryEnvironment(
            index, rules=('method:dirichlet,mu:5000', ))
        print(
            lm_query_env.query('hello world',
                               results_requested=5,
                               include_snippets=True))

        # Constructs a QueryEnvironment that uses the TF-IDF retrieval model.
        #
        # See "Baseline (non-LM) retrieval"
        # (https://lemurproject.org/doxygen/lemur/html/IndriRunQuery.html)
        tfidf_query_env = pyndri.TFIDFQueryEnvironment(index)
        print(tfidf_query_env.query('hello world'))

        # Constructs a QueryEnvironment that uses the Okapi BM25 retrieval model.
        #
        # See "Baseline (non-LM) retrieval"
        # (https://lemurproject.org/doxygen/lemur/html/IndriRunQuery.html)
        # (completed from the TF-IDF pattern above; assumes pyndri's
        # OkapiQueryEnvironment wrapper)
        bm25_query_env = pyndri.OkapiQueryEnvironment(index)
        print(bm25_query_env.query('hello world'))
Example #7
def search():
    index, dictionary = get_index()

    query_string = request.args.get('q', None)

    smoothing_method = request.args.get('smoothing_method', 'dirichlet')
    smoothing_param = float(request.args.get('smoothing_param', 1000))
    results_requested = int(request.args.get('results_requested', 10))

    documents = []

    if query_string is not None:
        logging.info('Query string: %s', query_string)

        highlighted_token_ids = set()

        if not query_string.startswith('docid:'):
            for token in index.tokenize(pyndri.escape(query_string)):
                if dictionary.has_token(token):
                    highlighted_token_ids.add(
                        dictionary.translate_token(token))

        def _include_document(int_doc_id):
            ext_doc_id, doc_token_ids = index.document(int_doc_id)

            def _format_token(token_id):
                term = dictionary[token_id]

                if token_id in highlighted_token_ids:
                    term = '<strong>{}</strong>'.format(term)

                return term

            doc_tokens = [
                _format_token(token_id) if token_id > 0 else '&lt;unk&gt;'
                for token_id in doc_token_ids
            ]

            documents.append((ext_doc_id, ' '.join(doc_tokens)))

        if query_string.startswith('docid:'):
            ext_document_id = query_string[6:]
            lookup = dict(index.document_ids([ext_document_id]))

            if lookup:
                _include_document(lookup[ext_document_id])
        else:
            query_env = pyndri.QueryEnvironment(index,
                                                rules=(build_smoothing_rule(
                                                    smoothing_method,
                                                    smoothing_param), ))

            results = query_env.query(query_string,
                                      results_requested=results_requested)

            for int_doc_id, _ in results:
                _include_document(int_doc_id)

    return render_template('index.html',
                           query=query_string,
                           results=documents,
                           smoothing_method=smoothing_method,
                           smoothing_param=smoothing_param)
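build_smoothing_rule is referenced but not shown here; a plausible sketch of such a helper, mapping the request parameters onto Indri rule strings (the 'jm' branch is an assumption):

def build_smoothing_rule(smoothing_method, smoothing_param):
    if smoothing_method == 'dirichlet':
        return 'method:dirichlet,mu:{}'.format(smoothing_param)
    elif smoothing_method == 'jm':
        # Jelinek-Mercer (linear interpolation) smoothing.
        return 'method:linear,collectionLambda:{}'.format(smoothing_param)
    raise ValueError('Unknown smoothing method: {}'.format(smoothing_method))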
Example #8
import pyndri
import params
from CrossValidationUtils import run_bash_command
from time import time
# command="~/.local/bin/PyndriQuery --loglevel warning \
# 	--queries mq_queries.txt \
# 	--index "+params.path_to_index+" \
# 	--smoothing_method dirichlet --smoothing_param auto --prf \
# 	test.run"
begin = time()

index = pyndri.Index(params.path_to_index)
query_env = pyndri.QueryEnvironment(
    index, rules=('method:linear,collectionLambda:0.4,documentLambda:0.2', ))
query_env = pyndri.PRFQueryEnvironment(query_env, fb_terms=50, fb_docs=10)
# query_expander = pyndri.QueryExpander(query_env)
results = query_env.query("family tree")
print(results)
# results=query_env.query(results)
# print(results)
# results = index.query('')
print("it took ", time() - begin)
Example #9
def main(args):

    ## command line arguments
    queries = args.queries
    maxdocs = args.maxdocs
    metadata_path = args.metadata_path
    index_root = args.index_path
    reranking_scores = args.reranking_scores
    coord_type = args.coordinates_algorithm
    krovetz_stem = args.krovetz_stem
    stopword_file = args.stopwords
    no_rerank = args.no_rerank
    rerank_weight = args.rerank_weight
    rerank_cutoff = args.rerank_cutoff

    #metadata="metadata.csv_covid-19.kwrds.csv.all-coords.csv"
    #passage_metadata="metadata.csv_covid-19.kwrds.paragraphs.csv.all-coords.csv"
    metadata = "metadata.csv"  #_covid-19.kwrds.csv.old"
    passage_metadata = "metadata.csv_covid-19-empty.kwrds.paragraphs.csv"  #"metadata.csv_covid-19.kwrds.paragraphs.csv"

    # metadata for documents
    metadata_doc = pd.read_csv(os.path.join(metadata_path, metadata),
                               dtype={
                                   "mag_id": str,
                                   "who_covidence_id": str,
                                   "arxiv_id": str
                               },
                               low_memory=False)
    sys.stderr.write("metadata shape: {} \n".format(metadata_doc.shape))

    # if passages are to be retrieved instead of full documents open also metadata for passages.
    metadata_pas = pd.read_csv(os.path.join(metadata_path, passage_metadata),
                               low_memory=False)
    sys.stderr.write("metadata shape: {} \n".format(metadata_pas.shape))

    reranking_scores_df = pd.DataFrame(
        columns=["query_candidate_id", "label", "neg_score", "pos_score"])
    # if exists, reranking-scores file
    if os.path.isfile(reranking_scores):
        reranking_scores_df = pd.read_csv(reranking_scores,
                                          dialect='excel-tab')

    rerank_csv = "rerank-queries_nofilter.tsv"
    of = open(rerank_csv, "w", encoding='utf-8')
    fieldnames = ["question", "question_id", "answer", "answer_id", "label"]
    wr = csv.DictWriter(of, fieldnames=fieldnames, dialect='excel-tab')
    #wr.writeheader()

    # output format for bokeh
    output = []
    documents = []
    passages = []
    #fieldnames=["doc_id","source","author", "url","title",]

    # indri
    #index_doc_path=os.path.join(index_root,'BildumaTRECAbsBodyIndex_ezmarra')#_ round 1')
    #index_doc_path=os.path.join(index_root,'BildumaTRECAbsBodyIndex_2ndround')
    #index_doc_path=os.path.join(index_root,'BildumaTRECAbsIndex_round3_all') #BildumaTRECAbsIndex_round3')
    #index_doc_path=os.path.join(index_root,'BildumaTRECAbsIndex_round3all_exp') #BildumaTRECAbsIndex_round3all_exp')
    #index_doc_path=os.path.join(index_root,'BildumaTRECAbsIndex_round4_Nofiltered')
    index_doc_path = os.path.join(
        index_root, 'BildumaTRECAbsIndex_round5_Notfiltered_bis')

    #index_pas_path=os.path.join(index_root,'BildumaTRECParIndex')

    index_doc = pyndri.Index(index_doc_path)
    #index_pas = pyndri.Index(index_pas_path)

    # Constructs a QueryEnvironment that uses a
    # language model with Dirichlet smoothing.
    lm_query_env = pyndri.QueryEnvironment(index_doc,
                                           rules=('method:dirichlet', ))
    #print(lm_query_env.query('hello world'))
    prf_query_env = pyndri.PRFQueryEnvironment(
        lm_query_env, fb_docs=35, fb_terms=25)  #fb_docs=20, fb_terms=10)#
    #print(prf_query_env.query('hello world'))

    # query tokenizer
    tokenizer = RegexpTokenizer(r'[\w-]+')
    #tokenizer = RegexpTokenizer(r'[^ ]+')

    #stopwords
    with open(stopword_file) as f:
        stopwords = [line.rstrip() for line in f]
    sys.stderr.write("stopwords loaded - {}\n".format(len(stopwords)))

    queries_df = pd.read_csv(queries, dialect='excel-tab')
    for index, row in queries_df.iterrows():
        #querylc = row['query'].lower()
        querylc = row['question'].lower() + " " + row['query'].lower()  #+" "+row['narrative'].lower()
        querylc2 = row['narrative'].lower()

        sys.stderr.write("current query: {} -- {}\n.".format(
            querylc, querylc2))
        tokens = tokenizer.tokenize(querylc)
        tokenized_query = " ".join(tokens)
        #sys.stderr.write("Only tokenized: {} \n".format(tokenized_query))

        tokens2 = tokenizer.tokenize(querylc2)
        tokenized_query2 = " ".join(tokens2)

        # Stemming and stopword removal
        if krovetz_stem:
            tokenized_query = " ".join(
                [pyndri.krovetz_stem(t) for t in tokens if t not in stopwords])
            tokenized_query2 = " ".join([
                pyndri.krovetz_stem(t) for t in tokens2 if t not in stopwords
            ])
            sys.stderr.write(
                "tokenized and stemmed query: {} \n".format(tokenized_query))

        # Construct an Indri #weight query: 80% question+query terms,
        # 20% narrative terms.
        complex_query = "#weight(0.8 #combine(" + tokenized_query + ") 0.2 #combine(" + tokenized_query2 + "))"
        # document level results
        results = prf_query_env.query(complex_query, results_requested=maxdocs)
        #results = prf_query_env.query(tokenized_query, results_requested=maxdocs)
        docs = process_results(results, index_doc, metadata_doc, metadata_pas,
                               reranking_scores_df, row["id"], coord_type,
                               rerank_weight, rerank_cutoff)

        #sys.stderr.write("docs retrieved, {} \n".format(len(docs)))

        for d in docs:
            #wr.writerow({"question":row["query"],"question_id":row["id"],"answer":d["text"],"answer_id":d["doc_id"],"label":0})
            #wr.writerow({"question":row["question"],"question_id":row["id"],"answer":d["text"],"answer_id":d["doc_id"],"label":0})
            wr.writerow({
                "question": row["query"] + " " + row["question"],
                "question_id": row["id"],
                "answer": d["text"],
                "answer_id": d["doc_id"],
                "label": 0
            })

        # passage level results
        #results = prf_query_env.query(tokenized_query, results_requested=maxdocs)
        #pas = process_results(results,index_pas,metadata_doc, metadata_pas, reranking_scores_df, row["id"], coord_type, rerank_weight, passages=True)

        pas_df = pd.DataFrame(docs)

        sort_key = "indri_score" if no_rerank else "ranking_score"
        pas_sorted = pas_df.sort_values(sort_key, ascending=False)

        #sys.stderr.write("passages retrieved, {} \n".format(len(pas)))

        doc_dict = {}
        rank = 1
        for _, p in pas_sorted.iterrows():
            #wr.writerow({"question":row["query"]+" "+row["narrative"],"question_id":row["id"],"answer":p["text"],"answer_id":p["doc_id"],"label":0})

            #sys.stderr.write(" {} - {} {} \n".format(p["doc_id"]))

            # stop once the ranking already contains maxdocs documents
            if rank > maxdocs:
                break

            question_id = row["id"]
            #doc_pas_id = str(p["doc_id"])
            doc_id = str(p["doc_id"])
            #doc_id = doc_pas_id.split("_")[0]

            # already found a more relevant passage of the same document
            if doc_id in doc_dict:
                continue

            doc_dict[doc_id] = 1
            if no_rerank:
                print("{} Q0 {} {} {} {}".format(row['id'], doc_id, rank,
                                                 p["indri_score"],
                                                 "elhuyar_indri"))
            else:
                print("{} Q0 {} {} {} {}".format(row['id'], doc_id, rank,
                                                 p["ranking_score"],
                                                 "elhuyar_rRnk"))
            #sys.stderr.write("{} Q0 {} {} {} {}\n".format(row['id'],doc_id, rank, p["ranking_score"],run_rerank))
            rank += 1

        #query_json={"query_id":row['id'], "task": row['task'], "query":row['query'], "docs":docs,"pas":pas}

        #query_json={"query_id":row['id'], "task": "trec-round-1", "query":row['query'], "pas":pas}
        #output.append(query_json)

    of.close()
Example #10
    def init_query_env(
            self,
            rules=('method:linear,collectionLambda:0.4,documentLambda:0.2', )):
        self.query_env = pyndri.QueryEnvironment(self.index, rules=rules)
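A usage sketch, assuming a hypothetical host object searcher that exposes self.index:

searcher.init_query_env(rules=('method:dirichlet,mu:2500',))
results = searcher.query_env.query('hello world')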