Example No. 1
    def _test_vaswani(self, mhs, TWO_D=False):
        corpus_file = pt.datasets.get_dataset("vaswani").get_corpus()[0]
        Arrays = pt.autoclass("java.util.Arrays")
        corpus_file_list = Arrays.asList(corpus_file)
        trec_properties = {
            "TrecDocTags.doctag": "DOC",
            "TrecDocTags.idtag": "DOCNO",
            "TrecDocTags.skip": "DOCHDR",
            "TrecDocTags.casesensitive": "false",
            "trec.collection.class": "TRECCollection",
        }
        for k, v in trec_properties.items():
            pt.ApplicationSetup.setProperty(k, v)
        corpus = pt.autoclass("org.terrier.indexing.TRECCollection")(
            corpus_file_list,
            pt.autoclass("org.terrier.utility.TagSet").TREC_DOC_TAGS, "", "")

        def _get_text(d):
            terms = []
            while not d.endOfDocument():
                t = d.getNextTerm()
                if t is None:
                    continue
                terms.append(t)
            return " ".join(terms)

        def _corpus_iter():
            while corpus.nextDocument():
                doc = corpus.getDocument()
                text = _get_text(doc)
                docno = doc.getProperty("docno")
                yield docno, text

        mhs.index(_corpus_iter())

        if TWO_D:
            oneDmatrix = mhs.pairwise_sim()
            maxPos = None
            maxSim = -1
            numDocs = len(mhs.docNames)
            for i in range(0, numDocs):
                # For each of the other test documents...
                for j in range(i + 1, numDocs):
                    if oneDmatrix[mhs.getTriangleIndex(i, j)] > maxSim:
                        maxSim = oneDmatrix[mhs.getTriangleIndex(i, j)]
                        maxPos = (i, j)
            print("Most similar pair is %s, with sim %f" %
                  (str(maxPos), maxSim))

        input = pd.DataFrame([["17"]], columns=["docno"])
        rtr = mhs.transform(input)
        print(rtr)
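The generator pattern above, yielding (docno, text) pairs, also works with PyTerrier's own indexers. Below is a minimal sketch, not taken from the test, assuming a started PyTerrier session and a writable ./tmp_index directory; pt.IterDictIndexer expects an iterable of dicts containing a "docno" key and the text field(s) to index.

import pyterrier as pt
if not pt.started():
    pt.init()

def corpus_iter():
    # hypothetical stand-in for _corpus_iter() above, yielding (docno, text) pairs
    yield "d1", "measurement of dielectric constant of liquids"
    yield "d2", "microwave spectroscopy of gases"

# wrap the pairs as dicts and feed them to IterDictIndexer
indexer = pt.IterDictIndexer("./tmp_index")  # the index path is an assumption
indexref = indexer.index(
    ({"docno": docno, "text": text} for docno, text in corpus_iter()),
    fields=["text"])  # index the "text" field; exact keyword arguments may vary by PyTerrier version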
Example No. 2
 def test_num_manual_wmodel(self):
     JIR = pt.autoclass('org.terrier.querying.IndexRef')
     Tf = pt.autoclass("org.terrier.matching.models.Tf")()
     indexref = JIR.of(self.here + "/fixtures/index/data.properties")
     from jnius import JavaException
     try:
         retr = pt.BatchRetrieve(indexref, wmodel=Tf)
         input = pd.DataFrame([["1", "Stability"]],
                              columns=['qid', 'query'])
         result = retr.transform(input)
     except JavaException as ja:
         print(ja.stacktrace)
         raise ja
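Besides an instantiated Java weighting model as above, BatchRetrieve also accepts a plain Python callable as wmodel; the same pattern appears in Examples No. 22 and 25 below. A brief sketch, assuming indexref is already defined as in the test:

import pandas as pd
import pyterrier as pt

# a Python function scoring each posting; equivalent in spirit to the Java Tf model above
Tf_py = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
retr_py = pt.BatchRetrieve(indexref, wmodel=Tf_py)
result = retr_py.transform(pd.DataFrame([["1", "Stability"]], columns=["qid", "query"]))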
Example No. 3
    def test_fbr_reranking(self):
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        # this test examines the use of ScoringMatchingWithFat
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 2 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 2
        pipe = firstpass >> pt.FeaturesBatchRetrieve(
            indexref, features=["WMODEL:DPH", "WMODEL:PL2"])
        input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result = pipe.transform(input)
        self.assertTrue("qid" in result.columns)
        self.assertTrue("docno" in result.columns)
        self.assertTrue("score" in result.columns)
        self.assertTrue("features" in result.columns)
        self.assertEqual(2, len(result))
        self.assertEqual(result.iloc[0]["features"].size, 2)

        pipe_simple = firstpass >> (pt.BatchRetrieve(indexref, wmodel="DPH")**
                                    pt.BatchRetrieve(indexref, wmodel="PL2"))
        result2 = pipe_simple.transform(input)
        import numpy as np
        f1 = np.stack(result["features"].values)
        f2 = np.stack(result2["features"].values)
        self.assertTrue(np.array_equal(f1, f2))
Example No. 4
 def test_num_results(self):
     JIR = pt.autoclass('org.terrier.querying.IndexRef')
     indexref = JIR.of(self.here + "/fixtures/index/data.properties")
     retr = pt.BatchRetrieve(indexref, num_results=10)
     input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
     result = retr.transform(input)
     self.assertEqual(len(result), 10)
Example No. 5
    def get_topics(self, variant=None, tokenise_query=True):
        """
            Returns the topics, as a dataframe, ready for retrieval. 
        """
        ds = self.irds_ref()
        assert ds.has_queries(), f"{self._irds_id} doesn't support get_topics"
        qcls = ds.queries_cls()
        assert variant is None or variant in qcls._fields[1:], f"{self._irds_id} only supports the following topic variants {qcls._fields[1:]}"
        df = pd.DataFrame(ds.queries_iter())

        df.rename(columns={"query_id": "qid"}, inplace=True) # pyterrier uses "qid"

        if variant is not None:
            df.rename(columns={variant: "query"}, inplace=True) # user specified which version of the query they want
            df.drop(df.columns.difference(['qid','query']), axis=1, inplace=True)
        elif len(qcls._fields) == 2:
            # auto-rename single query field to "query" if there's only query_id and that field
            df.rename(columns={qcls._fields[1]: "query"}, inplace=True)
        else:
            print(f'There are multiple query fields available: {qcls._fields[1:]}. To use with pyterrier, provide variant or modify dataframe to add query column.')

        # apply pyterrier tokenisation (otherwise the queries may not play well with batchretrieve)
        if tokenise_query and 'query' in df:
            import pyterrier as pt
            tokeniser = pt.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
            def pt_tokenise(text):
                return ' '.join(tokeniser.getTokens(text))
            df['query'] = df['query'].apply(pt_tokenise)

        return df
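A minimal usage sketch of get_topics. The dataset id "irds:vaswani" is an assumption; any ir_datasets-backed dataset whose queries have a single query field behaves the same way (the field is auto-renamed to "query" and tokenised as above).

import pyterrier as pt
if not pt.started():
    pt.init()

dataset = pt.get_dataset("irds:vaswani")   # ir_datasets-backed dataset (assumed available locally)
topics = dataset.get_topics()              # single query field is auto-renamed to "query"
print(topics.columns.tolist())             # expected: ['qid', 'query']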
Example No. 6
    def test_two_term_query_correct_qid_docid_score(self):
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        retr = pt.BatchRetrieve(indexref)
        input = pd.DataFrame([["1", "Stability"], ["2", "Generator"]],
                             columns=['qid', 'query'])
        result = retr.transform(input)
        exp_result = parse_res_file(
            os.path.dirname(os.path.realpath(__file__)) +
            "/fixtures/two_queries_result")
        for index, row in result.iterrows():
            self.assertEqual(row['qid'], exp_result[index][0])
            self.assertEqual(row['docno'], exp_result[index][1])
            self.assertAlmostEqual(row['score'], exp_result[index][2])

        input = pd.DataFrame([[1, "Stability"], [2, "Generator"]],
                             columns=['qid', 'query'])
        result = retr.transform(input)
        exp_result = parse_res_file(
            os.path.dirname(os.path.realpath(__file__)) +
            "/fixtures/two_queries_result")
        for index, row in result.iterrows():
            self.assertEqual(str(row['qid']), exp_result[index][0])
            self.assertEqual(row['docno'], exp_result[index][1])
            self.assertAlmostEqual(row['score'], exp_result[index][2])
Example No. 7
    def __init__(self, *args, fb_terms=10, fb_docs=3, **kwargs):
        """
        Args:
            index_like: the Terrier index to use
            fb_terms(int): number of terms to add to the query
            fb_docs(int): number of feedback documents to consider
        """
        global terrier_prf_package_loaded

        #if not terrier_prf_package_loaded:
        #    pt.extend_classpath("org.terrier:terrier-prf")
        #    terrier_prf_package_loaded = True
        #rm = pt.ApplicationSetup.getClass("org.terrier.querying.RM3").newInstance()
        import jnius_config
        prf_found = False
        for j in jnius_config.get_classpath():
            if "terrier-prf" in j:
                prf_found = True
                break
        assert prf_found, 'terrier-prf jar not found: you should start Pyterrier with '\
            + 'pt.init(boot_packages=["org.terrier:terrier-prf:0.0.1-SNAPSHOT"])'
        rm = pt.autoclass("org.terrier.querying.AxiomaticQE")()
        self.fb_terms = fb_terms
        self.fb_docs = fb_docs
        kwargs["qeclass"] = rm
        super().__init__(*args, **kwargs)
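A pipeline sketch for the query-expansion transformer above, assuming it is exposed as pt.rewrite.AxiomaticQE, that PyTerrier was started with the terrier-prf boot package, and that index and topics already exist:

# pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])  # needed for terrier-prf
import pyterrier as pt

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
qe = pt.rewrite.AxiomaticQE(index, fb_terms=10, fb_docs=3)
pipeline = bm25 >> qe >> bm25   # retrieve, expand the query from feedback documents, re-retrieve
results = pipeline.transform(topics)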
Example No. 8
 def get_index(self, variant=None):
     import pyterrier as pt
     if self.name == "50pct" and variant is None:
         variant = "ex1"
     thedir = self._get_all_files("index", variant=variant)
     return pt.autoclass("org.terrier.querying.IndexRef").of(
         os.path.join(thedir, "data.properties"))
Example No. 9
 def test_fbr_empty(self):
     JIR = pt.autoclass('org.terrier.querying.IndexRef')
     indexref = JIR.of(self.here + "/fixtures/index/data.properties")
     retr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:PL2"], wmodel="DPH")
     input = pd.DataFrame([["1", ""]], columns=['qid', 'query'])
     with warnings.catch_warnings(record=True) as w:
         result = retr.transform(input)
         assert "Skipping empty query" in str(w[-1].message)
     self.assertTrue(len(result) == 0)
Example No. 10
 def __init__(self,
              index_like,
              fb_terms=10,
              fb_docs=3,
              qeclass="org.terrier.querying.QueryExpansion",
              verbose=0,
              **kwargs):
     super().__init__(**kwargs)
     self.verbose = verbose
     if isinstance(qeclass, str):
         self.qe = pt.autoclass(qeclass)()
     else:
         self.qe = qeclass
     self.indexref = parse_index_like(index_like)
     self.fb_terms = fb_terms
     self.fb_docs = fb_docs
     self.manager = pt.autoclass(
         "org.terrier.querying.ManagerFactory")._from_(self.indexref)
Example No. 11
def get_text(indexlike,
             metadata: Union[str, List[str]] = "body",
             by_query: bool = False,
             verbose: bool = False) -> TransformerBase:
    """
    A utility transformer for obtaining the text of documents (or other document metadata) from Terrier's MetaIndex
    or an IRDSDataset docstore.

    Arguments:
        - indexlike: a Terrier index or IRDSDataset to retrieve the metadata from
        - metadata(list(str) or str): the metadata key(s) to retrieve from the index. Defaults to "body".
        - by_query(bool): whether the dataframe should be processed one query at a time, rather than all at once.
            Defaults to False, which means that all document metadata will be fetched at once.
        - verbose(bool): whether to print a tqdm progress bar. Defaults to false. Has no effect when by_query=False

    Example::

        pipe = pt.BatchRetrieve(index, wmodel="DPH") \
            >> pt.text.get_text(index) \
            >> pt.text.scorer(wmodel="DPH")

    """
    import pyterrier as pt
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    JI = pt.autoclass('org.terrier.structures.Index')

    if isinstance(metadata, str):
        metadata = [metadata]

    if isinstance(indexlike, str) or isinstance(indexlike, JIR):
        index = pt.IndexFactory.of(indexlike)
        add_text_fn = _add_text_terrier_metaindex(index, metadata)
    elif isinstance(indexlike, JI):
        add_text_fn = _add_text_terrier_metaindex(indexlike, metadata)
    elif isinstance(indexlike, IRDSDataset):
        add_text_fn = _add_text_irds_docstore(indexlike, metadata)
    else:
        raise ValueError(
            "indexlike %s of type %s not supported. Pass a string, an IndexRef, an Index, or an IRDSDataset"
            % (str(indexlike), type(indexlike)))

    if by_query:
        return pt.apply.by_query(add_text_fn, verbose=verbose)
    return pt.apply.generic(add_text_fn)
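The docstring example above covers the Terrier MetaIndex case; a complementary sketch for the IRDSDataset branch follows. The dataset id "irds:vaswani" and its "text" document field are assumptions, as are the index and topics names.

import pyterrier as pt

irds = pt.get_dataset("irds:vaswani")          # ir_datasets-backed dataset with a docstore
pipe = pt.BatchRetrieve(index, wmodel="DPH") >> pt.text.get_text(irds, "text")
results = pipe.transform(topics)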
Example No. 12
    def test_threading_selfupgrade(self):
        if not pt.check_version("5.5"):
            self.skipTest("Requires Terrier 5.5")

        topics = pt.get_dataset("vaswani").get_topics().head(10)

        #this test ensures we can upgrade the indexref to be concurrent
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        retr = pt.BatchRetrieve(indexref, threads=5)
        result = retr.transform(topics)
Example No. 13
    def _test_it(self, type):
        import pyterrier as pt
        import pandas as pd
        df1 = pd.DataFrame({
            'docno': ['1', '2', '3'],
            'url': ['url1', 'url2', 'url3'],
            'text': [
                'He ran out of money, so he had to stop playing',
                'The wave were crash on the shore; it was a',
                'The body may perhaps compensates for the loss'
            ]
        })
        pd_indexer1 = pt.DFIndexer(tempfile.mkdtemp(), type=type)
        indexref1 = pd_indexer1.index(df1["text"], df1["docno"])

        df2 = pd.DataFrame({'docno': ['14'], 'text': ['test wave']})

        from jnius import JavaException
        try:

            pd_indexer2 = pt.DFIndexer(tempfile.mkdtemp(), type=type)
            indexref2 = pd_indexer2.index(df2["text"], df2["docno"])

            index1 = pt.IndexFactory.of(indexref1)
            self.assertEqual(
                3,
                index1.getCollectionStatistics().getNumberOfDocuments())

            index2 = pt.IndexFactory.of(indexref2)
            self.assertEqual(
                1,
                index2.getCollectionStatistics().getNumberOfDocuments())

            index_combined = pt.autoclass(
                "org.terrier.python.IndexWithBackground")(index2, index1)
            self.assertEqual(
                3,
                index_combined.getCollectionStatistics().getNumberOfDocuments(
                ))

            self.assertEqual(
                1,
                index_combined.getLexicon()["test"].getFrequency())

            # this is 1 as we used the background index for the background
            # WITHOUT adding the statistics of the local index
            self.assertEqual(
                1,
                index_combined.getLexicon()["wave"].getFrequency())

        except JavaException as ja:
            print(ja.stacktrace)
            raise ja
Example No. 14
 def test_fbr_ltr(self):
     JIR = pt.autoclass('org.terrier.querying.IndexRef')
     indexref = JIR.of(self.here + "/fixtures/index/data.properties")
     retr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:PL2"], wmodel="DPH")
     topics = pt.io.read_topics(self.here + "/fixtures/vaswani_npl/query-text.trec").head(3)
     qrels = pt.io.read_qrels(self.here + "/fixtures/vaswani_npl/qrels")
     res = retr.transform(topics)
     res = res.merge(qrels, on=['qid', 'docno'], how='left').fillna(0)
     from sklearn.ensemble import RandomForestClassifier
     import numpy as np
     #print(res.dtypes)
     RandomForestClassifier(n_estimators=10).fit(np.stack(res["features"]), res["label"])
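A follow-up sketch: the fitted model can be wrapped so that its predictions become the ranking score. The use of pt.ltr.apply_learned_model here is an assumption about how one would deploy the classifier trained above; retr, res, topics, np and RandomForestClassifier come from the test.

# train as above, keeping a reference to the fitted model
rf = RandomForestClassifier(n_estimators=10)
rf.fit(np.stack(res["features"]), res["label"])
# re-rank the feature pipeline's candidates by the model's predictions
ltr_pipe = retr >> pt.ltr.apply_learned_model(rf)
ranked = ltr_pipe.transform(topics)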
Example No. 15
 def __init__(self, *args, fb_terms=10, fb_docs=3, fb_lambda=0.6, **kwargs):
     """
     Args:
         index_like: the Terrier index to use
         fb_terms(int): number of terms to add to the query. Terrier's default setting is 10 expansion terms.
         fb_docs(int): number of feedback documents to consider. Terrier's default setting is 3 feedback documents.
     """
     _check_terrier_prf()
     rm = pt.autoclass("org.terrier.querying.RM3")()
     self.fb_lambda = fb_lambda
     kwargs["qeclass"] = rm
     super().__init__(*args, fb_terms=fb_terms, fb_docs=fb_docs, **kwargs)
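A minimal RM3 pipeline sketch, assuming PyTerrier was started with the terrier-prf boot package (as _check_terrier_prf() requires) and that index and topics exist:

# pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])
import pyterrier as pt

bm25 = pt.BatchRetrieve(index, wmodel="BM25")
rm3_pipe = bm25 >> pt.rewrite.RM3(index, fb_terms=10, fb_docs=3, fb_lambda=0.6) >> bm25
results = rm3_pipe.transform(topics)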
Example No. 16
    def transform(self, topics_and_res):
        results = []
        from .model import query_columns, push_queries
        queries = topics_and_res[query_columns(topics_and_res, qid=True)] \
            .dropna(axis=0, subset=query_columns(topics_and_res, qid=False)) \
            .drop_duplicates()

        # instantiate the DependenceModelPreProcess, using a custom proximity model if one was provided
        sdm = DependenceModelPreProcess() if self.prox_model is None else DependenceModelPreProcess(self.prox_model)

        query_iter = queries.itertuples()
        if self.verbose:
            query_iter = tqdm(query_iter, desc=self.name, total=queries.shape[0], unit="q")
        for row in query_iter:
            qid = row.qid
            query = row.query
            # parse the query into a MatchingQueryTerms (MQT) object
            rq = pt.autoclass("org.terrier.querying.Request")()
            rq.setQueryID(qid)
            rq.setOriginalQuery(query)
            TerrierQLParser.process(None, rq)
            TerrierQLToMatchingQueryTerms.process(None, rq)
            if self.remove_stopwords:
                self.ApplyTermPipeline_stopsonly.process(None, rq)

            # rewrite the query
            sdm.expandQuery(rq.getMatchingQueryTerms(), rq)
            new_query = ""

            # put the query back into a matchop QL form that Terrier can parse later
            for me in rq.getMatchingQueryTerms():
                term = me.getKey().toString()
                w = me.getValue().getWeight()
                prefix = ""
                if w != 1.0 or me.getValue().termModels.size() > 0:
                    prefix = "#combine"
                    if w != 1:
                        prefix += ":0=" + str(w)
                    if me.getValue().termModels.size() == 1:
                        prefix += ":wmodel=" + me.getValue().termModels[0].getClass().getName()
                    term = prefix + "(" + term + ")"
                new_query += term + " "
            new_query = new_query[:-1]
            results.append([qid, new_query])
        new_queries = pd.DataFrame(results, columns=["qid", "query"])
        # restore any other columns, e.g. put back docs if we are re-ranking
        return new_queries.merge(push_queries(topics_and_res, inplace=True),
                                 on="qid")
Example No. 17
    def test_wmodel_dunders(self):

        wmodel = pt.autoclass("org.terrier.matching.models.BM25")()
        wmodel.__reduce__()
        wmodel.__getstate__()
        rtr = wmodel.__reduce__()
        pt.cast("org.terrier.matching.models.BM25", rtr[0](*rtr[1]))
        import pickle
        #import dill as pickle
        #check the byte array is picklable
        print(rtr[1][0])
        pickle.dumps(rtr[1][0])
        pickle.dumps(wmodel)
Example No. 18
 def __init__(self,
              verbose=0,
              remove_stopwords=True,
              prox_model=None,
              **kwargs):
     super().__init__(**kwargs)
     self.verbose = verbose
     self.prox_model = prox_model
     self.remove_stopwords = remove_stopwords
     from . import check_version
     assert check_version("5.3")
     self.ApplyTermPipeline_stopsonly = pt.autoclass(
         "org.terrier.querying.ApplyTermPipeline")("Stopwords")
Example No. 19
 def __init__(self,
              index_like,
              fb_terms=10,
              fb_docs=3,
              qeclass="org.terrier.querying.QueryExpansion",
              verbose=0,
              properties={},
              **kwargs):
     super().__init__(**kwargs)
     self.verbose = verbose
     if isinstance(qeclass, str):
         self.qe = pt.autoclass(qeclass)()
     else:
         self.qe = qeclass
     self.indexref = _parse_index_like(index_like)
     for k, v in properties.items():
         pt.ApplicationSetup.setProperty(k, str(v))
     self.applytp = pt.autoclass("org.terrier.querying.ApplyTermPipeline")()
     self.fb_terms = fb_terms
     self.fb_docs = fb_docs
     self.manager = pt.autoclass(
         "org.terrier.querying.ManagerFactory")._from_(self.indexref)
Example No. 20
    def test_fbr_reranking2(self):
        if not pt.check_version("5.4"):
            self.skipTest("Requires Terrier 5.4")
        # this test examines the use of ScoringMatchingWithFat, using a particular case known to fail with Terrier 5.3
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 3 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 3
        pipe1 = firstpass >> pt.FeaturesBatchRetrieve(indexref,
                                                      features=["WMODEL:PL2"])
        pipe2 = firstpass >> pt.BatchRetrieve(indexref, wmodel="PL2")

        input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result0 = firstpass.transform(input)
        result1 = pipe1.transform(input)
        result2 = pipe2.transform(input)

        result1["feature0"] = result1.apply(lambda row: row["features"][0],
                                            axis=1)
        #BM25 score
        result0_map = {row.docno: row.score for row in result0.itertuples()}
        result1S_map = {row.docno: row.score for row in result1.itertuples()}
        #PL2 score
        result1F_map = {
            row.docno: row.feature0
            for row in result1.itertuples()
        }
        result2_map = {row.docno: row.score for row in result2.itertuples()}

        print(result1F_map)
        print(result2_map)

        # check features scores
        # NB: places can go no less than 4, as two documents have similar PL2 scores
        for rank, row in enumerate(result0.itertuples()):
            docno = row.docno
            # check that score is unchanged
            self.assertAlmostEqual(
                result1S_map[docno],
                result0_map[docno],
                msg="input score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
            #  check that feature score is correct
            self.assertAlmostEqual(
                result1F_map[docno],
                result2_map[docno],
                msg="feature score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
Example No. 21
    def test_fbr(self):
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        retr = pt.FeaturesBatchRetrieve(indexref, ["WMODEL:PL2"], wmodel="DPH")
        input = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result = retr.transform(input)
        self.assertTrue("qid" in result.columns)
        self.assertTrue("docno" in result.columns)
        self.assertTrue("score" in result.columns)
        self.assertTrue("features" in result.columns)
        self.assertTrue(len(result) > 0)
        self.assertEqual(result.iloc[0]["features"].size, 1)

        retrBasic = pt.BatchRetrieve(indexref)
        if "matching" in retrBasic.controls:
            self.assertNotEqual(retrBasic.controls["matching"], "FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull")
Example No. 22
    def test_threading_manualref(self):

        if not pt.check_version("5.5"):
            self.skipTest("Requires Terrier 5.5")

        topics = pt.get_dataset("vaswani").get_topics().head(10)

        #this test ensures that we operate when the indexref is specified to be concurrent
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of("concurrent:" + self.here +
                          "/fixtures/index/data.properties")
        retr = pt.BatchRetrieve(indexref, threads=5)
        result = retr.transform(topics)

        #check that use of a callback model works under threading
        Tf = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
        retr = pt.BatchRetrieve(indexref, threads=5, wmodel=Tf)
        result = retr.transform(topics)
Example No. 23
    def transform(self, input):
        threshold = 0.9
        docid_provided = "docid" in input.columns
        docno_provided = "docno" in input.columns
        assert docid_provided or docno_provided
        import pyterrier as pt
        from pyterrier import autoclass
        index = pt.IndexFactory.of(self.indexref)

        for k, v in pt.BatchRetrieve.default_properties.items():
            pt.ApplicationSetup.setProperty(k, v)

        ManagerFactory = autoclass("org.terrier.querying.ManagerFactory")
        manager = ManagerFactory._from_(self.indexref)

        rtr = []
        for row in tqdm(input.itertuples(), total=len(input)):
            if docid_provided:
                docid = row.docid
                if not docno_provided:
                    docno = index.getMetaIndex().getItem("docno", docid)
            else:
                docno = row.docno
                docid = index.getMetaIndex().getDocument("docno", docno)
                if docid == -1:
                    raise KeyError("Could not convert docno %s to a docid" %
                                   docno)

            q = self.get_query(index, docid)
            srq = manager.newSearchRequest(docno, q)
            srq.setControl("wmodel", "Tf")
            manager.runSearchRequest(srq)
            results = srq.getResults()
            for r in results:
                score = r.getScore() / self.numHashes
                if score > self.sim_threshold and r.getDocid() != docid:
                    rtr.append([
                        docno, docid,
                        r.getMetadata("docno"),
                        r.getDocid(), score
                    ])
        return pd.DataFrame(
            rtr, columns=["docno_x", "docid_x", "docno_y", "docid_y", "score"])
Example No. 24
    def __init__(self, *args, fb_terms=10, fb_docs=3, **kwargs):
        global terrier_prf_package_loaded

        #if not terrier_prf_package_loaded:
        #    pt.extend_classpath("org.terrier:terrier-prf")
        #    terrier_prf_package_loaded = True
        #rm = pt.ApplicationSetup.getClass("org.terrier.querying.RM3").newInstance()
        import jnius_config
        prf_found = False
        for j in jnius_config.get_classpath():
            if "terrier-prf" in j:
                prf_found = True
                break
        assert prf_found, 'terrier-prf jar not found: you should start Pyterrier with '\
            + 'pt.init(boot_packages=["org.terrier:terrier-prf:0.0.1-SNAPSHOT"])'
        rm = pt.autoclass("org.terrier.querying.RM3")()
        self.fb_terms = fb_terms
        self.fb_docs = fb_docs
        kwargs["qeclass"] = rm
        super().__init__(*args, **kwargs)
Example No. 25
    def test_callable_wmodel_dunders(self):
        testPosting = pt.autoclass(
            "org.terrier.structures.postings.BasicPostingImpl")(0, 1)

        from pyterrier.batchretrieve import _function2wmodel
        lambdafn = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
        callback, wmodel = _function2wmodel(lambdafn)

        from pyterrier.bootstrap import javabytebuffer2array
        byterep = javabytebuffer2array(wmodel.scoringClass.serializeFn())
        import dill as pickle
        from dill import extend
        #see https://github.com/SeldonIO/alibi/issues/447#issuecomment-881552005
        extend(use_dill=False)
        fn = pickle.loads(byterep)
        self.assertEqual(
            lambdafn(1, testPosting, None, None),
            fn(1, testPosting, None, None),
        )

        wmodel.__getstate__()
        rtr = wmodel.__reduce__()

        #check the byte array is picklable
        pickle.dumps(rtr[1][0])
        #check object is picklable
        pickle.dumps(wmodel)
        #check can be unpickled too
        wmodel2 = pickle.loads(pickle.dumps(wmodel))

        score1 = wmodel.score(testPosting)
        score2 = wmodel2.score(testPosting)
        self.assertEqual(score1, score2)

        #check newly unpickled can still be pickled
        pickle.dumps(wmodel2)
        wmodel3 = pickle.loads(pickle.dumps(wmodel2))
        score3 = wmodel3.score(testPosting)
        self.assertEqual(score1, score3)
Example No. 26
    def _test2_manual(self, type):
        import pyterrier as pt
        #pt.logging("INFO")
        import pandas as pd
        df1 = pd.DataFrame({
            'docno': ['1048'],
            'body': [
                'h  f  noise radiators in ground flashes of tropical lightning  a '
                +
                'detailed analysis of h  f  noise sources in tropical ground flashes '
                +
                'v  l  f  phase characteristics deduced from atmospheric waveforms'
            ]
        })
        pd_indexer1 = pt.DFIndexer(tempfile.mkdtemp(), type=type)
        indexref1 = pd_indexer1.index(df1["body"], df1["docno"])
        index1 = pt.IndexFactory.of(indexref1)

        has_direct1 = index1.hasIndexStructure("direct")

        indexref_big = pt.get_dataset("vaswani").get_index()
        index_big = pt.IndexFactory.of(indexref_big)

        from pyterrier import autoclass
        stopwords = autoclass("org.terrier.terms.Stopwords")(None)
        stemmer = autoclass("org.terrier.terms.PorterStemmer")(None)

        q = "MATHEMATICAL ANALYSIS AND DESIGN DETAILS OF WAVEGUIDE FED MICROWAVE RADIATIONS"
        self.assertEqual("1048",
                         index_big.getMetaIndex().getItem("docno", 1047))
        contents_big = TestBackground.get_contents(1047, index_big)

        def _check_index(index_small):
            if has_direct1:
                contents1 = TestBackground.get_contents(0, index_small)
                self.assertEqual(contents1, contents_big)

            inv1 = index_small.getInvertedIndex()
            print(inv1.getClass().getName())
            lex1 = index_small.getLexicon()
            for t in contents_big:
                pointer = lex1[t]
                print(pointer.toString())
                p = inv1.getPostings(pointer)
                print(p.getClass().getName())
                rtr = p.next()
                self.assertEqual(0, rtr)
                self.assertEqual(
                    p.getDocumentLength(),
                    index_big.getDocumentIndex().getDocumentLength(1047))
                self.assertEqual(contents_big[t], p.getFrequency())
                self.assertEqual(p.next(), p.EOL)

            from jnius import JavaException
            try:
                br1 = pt.BatchRetrieve(index_small, wmodel="Tf")
                brall = pt.BatchRetrieve(index_big, wmodel="Tf")
                with_doc = pd.DataFrame(
                    [["q1", q, "1048", 1047]],
                    columns=["qid", "query", "docno", "docid"])
                rtr1 = br1.transform(q)
            except JavaException as ja:
                print(ja.stacktrace)
                raise ja
            rtrall = brall(with_doc)
            self.assertTrue(
                np.array_equal(rtr1["score"].values, rtrall["score"].values))

        _check_index(index1)
        _check_index(
            pt.autoclass("org.terrier.python.IndexWithBackground")(index1,
                                                                   index_big))
Example No. 27
 def get_index(self):
     import pyterrier as pt
     thedir = self._get_all_files("index")
     return pt.autoclass("org.terrier.querying.IndexRef").of(os.path.join(thedir, "data.properties"))
Example No. 28
import pyterrier as pt
from jnius import cast
import pandas as pd
from .batchretrieve import _parse_index_like
from .transformer import TransformerBase, Symbol
from . import tqdm
from warnings import warn
from typing import List

TerrierQLParser = pt.autoclass("org.terrier.querying.TerrierQLParser")()
TerrierQLToMatchingQueryTerms = pt.autoclass(
    "org.terrier.querying.TerrierQLToMatchingQueryTerms")()
QueryResultSet = pt.autoclass("org.terrier.matching.QueryResultSet")
DependenceModelPreProcess = pt.autoclass(
    "org.terrier.querying.DependenceModelPreProcess")

_terrier_prf_package_loaded = False
_terrier_prf_message = 'terrier-prf jar not found: you should start PyTerrier with '\
    + 'pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])'


def _check_terrier_prf():
    import jnius_config
    global _terrier_prf_package_loaded
    if _terrier_prf_package_loaded:
        return

    for j in jnius_config.get_classpath():
        if "terrier-prf" in j:
            _terrier_prf_package_loaded = True
            break
    assert _terrier_prf_package_loaded, _terrier_prf_message
Example No. 29
 def test_br_pickle_straightwmodel(self):
     self._br(pickle,
              wmodel=pt.autoclass("org.terrier.matching.models.BM25")())