예제 #1
0
    def test_plus_more_cols(self):
        """Check that ``+`` on two transformers sums scores without leaking merge columns.

        Both operands carry an extra ``query`` column. The combined result must
        contain a single (q1, doc1) row scored 5 + 10, and none of the suffixed
        rank/query/score columns enumerated at the end of the test.
        """
        import pyterrier.transformer as ptt
        from pyterrier.model import add_ranks

        column_names = ["qid", "query", "docno", "score"]
        low_scored = add_ranks(
            pd.DataFrame([["q1", "a query", "doc1", 5]], columns=column_names))
        high_scored = add_ranks(
            pd.DataFrame([["q1", "a query", "doc1", 10]], columns=column_names))
        mock1 = pt.Transformer.from_df(low_scored, uniform=True)
        mock2 = pt.Transformer.from_df(high_scored, uniform=True)

        # No input frame is supplied: per the original author's note, both
        # transformers return their fixed results anyway.
        rtr = (mock1 + mock2).transform(None)

        self.assertEqual(1, len(rtr))
        top = rtr.iloc[0]
        self.assertEqual("q1", top["qid"])
        self.assertEqual("doc1", top["docno"])
        self.assertEqual(15, top["score"])

        # Suffixed leftovers that must not appear in the combined frame.
        for bad in ("rank_x", "rank_y", "rank_r",
                    "query_x", "query_y", "query_R",
                    "score_x", "score_y", "score_r"):
            self.assertFalse(bad in rtr.columns,
                             "column %s in returned dataframe" % bad)
예제 #2
0
 def test_rank_two_queries(self):
     """Ranks must restart at FIRST_RANK for each distinct qid."""
     records = [["q1", "doc1", 5], ["q1", "doc2", 4], ["q2", "doc1", 4]]
     ranked = add_ranks(pd.DataFrame(records, columns=["qid", "docno", "score"]))
     self.assertTrue("rank" in ranked.columns)
     # Rows 0 and 1 belong to q1, row 2 is the only result for q2.
     expected_ranks = [FIRST_RANK, FIRST_RANK+1, FIRST_RANK]
     for position, expected in enumerate(expected_ranks):
         self.assertEqual(ranked.iloc[position]["rank"], expected)
예제 #3
0
    def test_mul(self):
        """Check scalar multiplication of a transformer from either side.

        First part: multiplying by 10 (left or right) turns a score of 5 into 50.
        Second part: searching the unscaled transformer puts doc2 (score 10)
        first, while multiplying by -1 puts doc1 first.
        """
        import pyterrier.transformer as ptt
        from pyterrier.model import add_ranks

        single_doc = pd.DataFrame([["q1", "doc1", 5]],
                                  columns=["qid", "docno", "score"])
        mock = pt.Transformer.from_df(single_doc, uniform=True)
        for scaled in (mock * 10, 10 * mock):
            rtr = scaled.transform(None)
            self.assertEqual(1, len(rtr))
            head = rtr.iloc[0]
            self.assertEqual("q1", head["qid"])
            self.assertEqual("doc1", head["docno"])
            self.assertEqual(50, head["score"])

        two_docs = add_ranks(
            pd.DataFrame([["q1", "doc1", 5], ["q1", "doc2", 10]],
                         columns=["qid", "docno", "score"]))
        mock = pt.Transformer.from_df(two_docs, uniform=True)

        # Unscaled: the higher-scored doc2 is expected at the first rank.
        rtr = mock.search("bla", qid="q1")
        self.assertEqual(2, len(rtr))
        head = rtr.iloc[0]
        self.assertEqual("q1", head["qid"])
        self.assertEqual("doc2", head["docno"])
        self.assertEqual(pt.model.FIRST_RANK, head["rank"])

        # Negated: the order flips, so doc1 is expected at the first rank.
        rtr = (-1 * mock).search("bla", qid="q1")
        self.assertEqual(2, len(rtr))
        head = rtr.iloc[0]
        self.assertEqual("q1", head["qid"])
        self.assertEqual("doc1", head["docno"])
        self.assertEqual(pt.model.FIRST_RANK, head["rank"])
예제 #4
0
 def transform(self, topics_and_res):
     """Collapse passage-level results into one score per (qid, docno).

     Each input ``docno`` must have the form ``<docno>%p<passage number>``.
     Passage scores of the same document are combined according to
     ``self.agg``:

     * ``'first'``   -- score of the passage with the smallest passage number
     * ``'max'``     -- highest passage score
     * ``'mean'``    -- arithmetic mean of all passage scores
     * ``'kmaxavg'`` -- mean of the ``self.K`` highest passage scores (mean of
       all of them when the document has at most ``K`` passages)

     :param topics_and_res: DataFrame with at least the columns ``qid``,
         ``query``, ``docno`` and ``score``.
     :return: DataFrame with ``qid``, ``docno``, ``score``, ``query`` passed
         through ``add_ranks``.
     :raises ValueError: if ``self.agg`` is not one of the values above and
         there is at least one document to aggregate.
     """
     # qid -> docno -> passage number -> score. Plain dicts keep insertion
     # order, so queries and documents come out in first-seen order.
     scoredict = defaultdict(lambda: defaultdict(dict))
     for _, row in topics_and_res.iterrows():
         docno, passage = row["docno"].split("%p")
         scoredict[row["qid"]][docno][int(passage)] = row["score"]

     rows = []
     # Iterate the dict itself rather than a separately tracked qid list, so a
     # qid whose rows are not contiguous in the input is emitted only once.
     for qid, docs in scoredict.items():
         for docno, passage_scores in docs.items():
             if self.agg == 'first':
                 score = passage_scores[min(passage_scores)]
             elif self.agg == 'max':
                 score = max(passage_scores.values())
             elif self.agg == 'mean':
                 score = sum(passage_scores.values()) / len(passage_scores)
             elif self.agg == "kmaxavg":
                 values = np.fromiter(passage_scores.values(), dtype=float)
                 K = self.K
                 # np.partition returns the *values* with the K largest in the
                 # last K slots; argpartition would return their indices.
                 top_values = np.partition(values, -K)[-K:] if len(values) > K else values
                 score = top_values.mean()
             else:
                 raise ValueError("Unknown aggregation %r" % (self.agg,))
             rows.append([qid, docno, score])
     rtr = pd.DataFrame(rows, columns=["qid", "docno", "score"])
     # add the queries back
     queries = topics_and_res[["qid", "query"]].dropna(axis=0, subset=["query"]).drop_duplicates()
     rtr = rtr.merge(queries, on=["qid"])
     rtr = add_ranks(rtr)
     return rtr
예제 #5
0
    def transform(self, topics_and_res):
        """Collapse passage-level results into one row per (qid, document).

        Each input ``docno`` must have the form ``<docno>%p<passage id>``. The
        input frame is copied, so the caller's frame is not modified.
        Aggregation is selected by ``self.agg``:

        * ``'max'``     -- keep the row of the highest-scoring passage
        * ``'first'``   -- keep the row of the passage with the smallest id
        * ``'mean'``    -- mean passage score; query columns are merged back
        * ``'kmaxavg'`` -- mean of the K highest passage scores, where K is
          ``self.K`` when that attribute exists and 2 otherwise (the value
          that was previously hard-coded); query columns are merged back

        :param topics_and_res: DataFrame with at least ``qid``, ``docno`` and
            ``score`` columns.
        :return: the aggregated DataFrame passed through ``add_ranks``.
        :raises ValueError: if ``self.agg`` is none of the values above.
        """
        topics_and_res = topics_and_res.copy()
        topics_and_res[["olddocno", "pid"]] = topics_and_res.docno.str.split("%p", expand=True)
        doc_key = ['qid', 'olddocno']

        if self.agg == 'max':
            group_max_idx = topics_and_res.groupby(doc_key)['score'].idxmax()
            rtr = topics_and_res.loc[group_max_idx, :]
            rtr = rtr.drop(columns=['docno', 'pid']).rename(columns={"olddocno" : "docno"})

        elif self.agg == 'first':
            # pid comes out of str.split as text; compare passage ids numerically.
            topics_and_res.pid = topics_and_res.pid.astype(int)
            # NOTE(review): selecting pid == 0 would only be equivalent if every
            # document's first passage is numbered 0 -- idxmin makes no such assumption.
            group_first_idx = topics_and_res.groupby(doc_key)['pid'].idxmin()
            rtr = topics_and_res.loc[group_first_idx, :]
            rtr = rtr.drop(columns=['docno', 'pid']).rename(columns={"olddocno" : "docno"})

        elif self.agg in ('mean', 'kmaxavg'):
            from .model import query_columns
            # Select 'score' before aggregating: averaging the whole frame would
            # also try to average the string columns (docno, pid, query, ...).
            grouped_scores = topics_and_res.groupby(doc_key)['score']
            if self.agg == 'mean':
                scores = grouped_scores.mean()
            else:
                K = getattr(self, "K", 2)
                scores = grouped_scores.apply(lambda ser: ser.nlargest(K).mean())
            rtr = scores.reset_index().rename(columns={'olddocno' : 'docno'})
            #add query columns back
            rtr = rtr.merge(topics_and_res[query_columns(topics_and_res)].drop_duplicates(), on='qid')

        else:
            raise ValueError("Unknown aggregation %r" % (self.agg,))

        rtr = add_ranks(rtr)
        return rtr
예제 #6
0
 def test_rank_one_query(self):
     """Two equally scored documents of one query keep their input order."""
     tied_rows = [["q1", "doc1", 5], ["q1", "doc2", 5]]
     ranked = add_ranks(pd.DataFrame(tied_rows, columns=["qid", "docno", "score"]))
     self.assertTrue("rank" in ranked.columns)
     # The first row must receive the first rank.
     self.assertEqual(ranked.iloc[0]["rank"], FIRST_RANK)
     # Ties are expected to be resolved by keeping the same order
     # (trec_eval instead breaks ties on ascending docno).
     self.assertEqual(ranked.iloc[1]["rank"], FIRST_RANK+1)
예제 #7
0
    def test_rank_one_query_neg(self):
        """With negative scores, the less negative document must rank first.

        The same two rows are ranked in both input orders; in each case doc1
        (score -4) is expected at FIRST_RANK ahead of doc2 (score -5).
        """
        rows = [["q1", "doc1", -4], ["q1", "doc2", -5]]
        for ordering in (rows, rows[::-1]):
            ranked = add_ranks(
                pd.DataFrame(ordering, columns=["qid", "docno", "score"]))
            ranked = ranked.sort_values("rank", ascending=True)
            self.assertTrue("rank" in ranked.columns)
            # After sorting by rank, the first row is the top-ranked document.
            best = ranked.iloc[0]
            self.assertEqual(best["rank"], FIRST_RANK)
            self.assertEqual(best["docno"], "doc1")
예제 #8
0
 def test_rank_one_query_sort(self):
     """With STRICT_SORT enabled, add_ranks must return rows in rank order.

     The input lists doc1 (score 4) before doc2 (score 5); the output is
     expected to have doc2 in the first row with FIRST_RANK, without any
     explicit sort by the test.
     """
     import pyterrier as pt
     sort_status = pt.model.STRICT_SORT
     pt.model.STRICT_SORT = True
     try:
         df = pd.DataFrame([["q1", "doc1", 4], ["q1", "doc2", 5]],
                           columns=["qid", "docno", "score"])
         df = add_ranks(df)
         self.assertTrue("rank" in df.columns)
         # check that first item is rank 1
         self.assertEqual(df.iloc[0]["rank"], FIRST_RANK)
         self.assertEqual(df.iloc[0]["docno"], "doc2")
     finally:
         # Restore the module-level flag even when an assertion fails, so
         # the setting cannot leak into other tests.
         pt.model.STRICT_SORT = sort_status
예제 #9
0
 def transform(self, te):
     """Score the rows of ``te`` with ``self.model`` and rank them.

     A ``DFDataset`` is built over ``te`` using ``self.tokenizer`` and
     ``self.get_doc_fn`` and scored by ``bert4ir_score``. The scores are
     written into ``te["score"]`` in place, i.e. the caller's frame is
     modified.

     :param te: DataFrame of rows to score.
     :return: ``te`` with the ``score`` column set, passed through ``add_ranks``.
     :raises AssertionError: if the number of scores differs from ``len(te)``.
     """
     te_dataset = DFDataset(te,
                            self.tokenizer,
                            split="test",
                            get_doc_fn=self.get_doc_fn)
     # we permit to adjust the batch size to allow better testing
     scores = bert4ir_score(self.model,
                            te_dataset,
                            batch_size=self.test_batch_size)
     # The message must use ``te``: the previously referenced ``tr`` is not
     # defined here, which turned a length mismatch into a NameError.
     assert len(scores) == len(
         te), "Expected %d scores, but got %d" % (len(te), len(scores))
     te["score"] = scores
     return add_ranks(te)
예제 #10
0
    def transform(self, topics_res):
        """Score each row by the negated ``wmdistance`` between query and document text.

        ``self.wv.wmdistance`` yields a distance, so it is multiplied by -1 to
        act as a similarity score. The ``score`` column is written into
        ``topics_res`` in place before the frame is passed to ``add_ranks``.
        """
        def row_distance(row):
            query_tokens = list(tokenize(row["query"]))
            doc_tokens = list(tokenize(row[self.doc_attr]))
            return self.wv.wmdistance(query_tokens, doc_tokens)

        # Scoring may take a while; show a progress bar when verbose.
        if self.verbose:
            tqdm.pandas()
            distances = topics_res.progress_apply(row_distance, axis=1)
        else:
            distances = topics_res.apply(row_distance, axis=1)

        topics_res["score"] = -1 * distances
        return add_ranks(topics_res)
예제 #11
0
    def transform(self, topics):
        """Retrieve results for every query in ``topics`` from all index shards.

        Steps, as visible in this method:

        1. tokenize and pad each query,
        2. run ``StreamInferenceDoc`` to obtain the query embeddings,
        3. search each entry of ``self.cpu_index`` for ``self.num_results``
           neighbours and convert the hits with ``self._calc_scores``,
        4. concatenate the shard results, apply ``add_ranks``, keep rows whose
           rank is below ``self.num_results`` and sort by qid, descending
           score, then docno.

        :param topics: DataFrame with ``qid`` and ``query`` columns.
        :return: DataFrame of ranked results.
        """
        from pyterrier import tqdm

        encoded_queries = []
        query_text_by_qid = {}
        query_texts = topics["query"].to_list()
        query_ids = topics["qid"].to_list()
        for text, qid in zip(query_texts, query_ids):
            token_ids = self.tokenizer.encode(
                text,
                add_special_tokens=True,
                max_length=self.args.max_seq_length,
            )
            # Each entry holds the effective (capped) length and the padded ids.
            effective_len = min(len(token_ids), self.args.max_query_length)
            padded_ids = pad_input_ids(token_ids, self.args.max_query_length)
            encoded_queries.append([effective_len, padded_ids])
            query_text_by_qid[qid] = text

        print("***** inference of %d queries *****" % len(encoded_queries))
        # The second returned value is not used by this method.
        query_embeddings, _ = StreamInferenceDoc(
            self.args,
            self.model,
            GetProcessingFn(self.args, query=True),
            "transform",
            encoded_queries,
            is_query_inference=True)

        print("***** faiss search for %d queries on %d shards *****" %
              (len(encoded_queries), self.segments))
        shard_results = []
        shard_progress = tqdm(self.shard_offsets, unit="shard")
        for shard_no, shard_offset in enumerate(shard_progress):
            hit_scores, hit_neighbours = self.cpu_index[shard_no].search(
                query_embeddings, self.num_results)
            shard_results.append(
                self._calc_scores(topics["qid"].values,
                                  self.passage_embedding2id[shard_no],
                                  hit_neighbours,
                                  hit_scores,
                                  num_results=self.num_results,
                                  offset=shard_offset,
                                  qid2q=query_text_by_qid))

        merged = add_ranks(pd.concat(shard_results))
        within_cutoff = merged[merged["rank"] < self.num_results]
        return within_cutoff.sort_values(by=["qid", "score", "docno"],
                                         ascending=[True, False, True])
예제 #12
0
 def transform(self, queries_and_docs):
     """Re-rank the documents of each query with ``rerank`` under ``torch.no_grad()``.

     Rows are grouped by ``qid``; the query text is taken from the first row
     of each group. Each ``(score, pid, passage)`` triple returned by
     ``rerank`` becomes one output row, with its position in the returned
     ranking stored as ``rank`` before the frame is passed to ``add_ranks``.
     """
     groupby = queries_and_docs.groupby("qid")
     output_rows = []
     with torch.no_grad():
         if self.verbose:
             per_query = tqdm(groupby, total=len(groupby), desc='colbert', unit="q")
         else:
             per_query = groupby
         for qid, group in per_query:
             query = group["query"].values[0]
             ranking = rerank(self.args,
                              query,
                              group["docno"].values,
                              group[self.doc_attr].values,
                              index=None)
             # The passage text in each triple is not needed in the output.
             output_rows.extend(
                 [qid, query, pid, score, position]
                 for position, (score, pid, _passage) in enumerate(ranking))
     result = pd.DataFrame(
         output_rows, columns=["qid", "query", "docno", "score", "rank"])
     return add_ranks(result)
예제 #13
0
    def transform(self, topics_and_res):
        """Re-score the documents of each query via ``self._for_each_query``.

        Rows are grouped by ``qid``. For each group, the query text (from the
        first row) and the ``docno`` / ``self.body_attr`` columns are passed
        to ``_for_each_query``; the i-th returned score is assigned to the
        i-th document of the group.

        :return: DataFrame with ``qid``, ``query``, ``docno``, ``score``
            passed through ``add_ranks``.
        """
        import pandas as pd
        collected = []
        grouper = topics_and_res.groupby("qid")
        from pyterrier import tqdm, started
        assert started()

        per_query = tqdm(grouper, desc="BERTQE", unit="q") if self.verbose else grouper
        for qid, group in per_query:
            query = group["query"].iloc[0]
            scores = self._for_each_query(qid, query,
                                          group[["docno", self.body_attr]])

            # Pair each score with the document at the same position in the group.
            docnos = group["docno"]
            for position, value in enumerate(scores.tolist()):
                collected.append([qid, query, docnos.iloc[position], value])

        result = pd.DataFrame(collected, columns=["qid", "query", "docno", "score"])
        return add_ranks(result)
예제 #14
0
    def transform(self, topics_res):
        """Score each row by the cosine similarity of averaged word vectors.

        The query and the document text (``row[self.doc_attr]``) are tokenized,
        every token is mapped to its vector in ``self.wv`` (``self.oov`` for
        tokens not in ``self.wv``), and the vectors of each side are averaged.
        The score is the cosine between the two averaged vectors. The
        ``score`` column is written into ``topics_res`` in place before the
        frame is passed to ``add_ranks``.
        """
        def mean_vector(tokens):
            # Average of the token vectors, with self.oov standing in for unknown tokens.
            vectors = np.array(
                [self.wv[t] if t in self.wv else self.oov for t in tokens])
            return np.average(vectors, axis=0)

        def lambda_row(row):
            qs_avg = mean_vector(list(tokenize(row["query"])))
            ds_avg = mean_vector(list(tokenize(row[self.doc_attr])))

            # Cosine similarity: the numerator is the query-document dot
            # product (not the query dotted with itself).
            return qs_avg @ ds_avg.T / (norm(qs_avg) * norm(ds_avg))

        # could take a while, add a progress bar if asked to
        if self.verbose:
            tqdm.pandas()
            topics_res["score"] = topics_res.progress_apply(lambda_row, axis=1)
        else:
            topics_res["score"] = topics_res.apply(lambda_row, axis=1)

        return add_ranks(topics_res)
예제 #15
0
 def transform(self, queries_and_docs):
     """Score ``queries_and_docs`` with the CEDR model and rank the results.

     ``train.run_model`` returns a mapping of qid to a mapping of docno to
     score; it is flattened into a frame and merged with the input on
     (qid, docno), after dropping any pre-existing ``score`` column from the
     input. The merged frame is passed through ``add_ranks``.
     """
     from cedr import train
     import pyterrier as pt
     import pandas as pd

     # Make the cedr training module use pyterrier's tqdm.
     train.tqdm = pt.tqdm

     test_run = self._make_cedr_run(queries_and_docs, None)
     dataset = self._make_cedr_dataset(queries_and_docs)
     run_values = train.run_model(self.model, dataset, test_run, desc="CEDR")

     flattened = [
         [qid, docno, doc_scores[docno]]
         for qid, doc_scores in run_values.items()
         for docno in doc_scores
     ]
     run_df = pd.DataFrame(flattened, columns=["qid", "docno", "score"])

     # Avoid two score columns in the merge output.
     if "score" in queries_and_docs.columns:
         queries_and_docs = queries_and_docs.drop(columns="score")

     return add_ranks(run_df.merge(queries_and_docs, on=["qid", "docno"]))
예제 #16
0
 def test_rank_zero_query(self):
     """add_ranks on an empty frame must still produce a ``rank`` column."""
     empty = pd.DataFrame([], columns=["qid", "docno", "score"])
     ranked = add_ranks(empty)
     self.assertTrue("rank" in ranked.columns)