Example #1
class DRMM(Reranker):
    """Jiafeng Guo, Yixing Fan, Qingyao Ai, and W. Bruce Croft. 2016. A Deep Relevance Matching Model for Ad-hoc Retrieval. In CIKM'16."""

    module_name = "DRMM"

    config_spec = [
        ConfigOption("nbins", 29, "number of bins in matching histogram"),
        ConfigOption("nodes", 5, "hidden layer dimension for feed forward matching network"),
        ConfigOption("histType", "LCH", "histogram type: CH, NH, or LCH"),
        ConfigOption("gateType", "IDF", "term gate type: TV or IDF"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DRMM_class(self.extractor, self.config)

        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]

        return self.model(pos_sentence, query_sentence, query_idf).view(-1)
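The histType option above selects among the three histogram variants from the DRMM paper: CH (raw count), NH (count normalized by the total), and LCH (log of count, the paper's best variant). A minimal numpy sketch of the mapping, assuming term-term cosine similarities in [-1, 1] bucketed into nbins equal-width bins (the paper additionally reserves a dedicated bin for exact matches):

import numpy as np

def matching_histogram(similarities, nbins=29, hist_type="LCH"):
    # bucket cosine similarities in [-1, 1] into nbins equal-width bins
    counts, _ = np.histogram(similarities, bins=nbins, range=(-1.0, 1.0))
    counts = counts.astype(np.float32)
    if hist_type == "CH":    # raw counts
        return counts
    if hist_type == "NH":    # counts normalized by the total count
        return counts / max(counts.sum(), 1.0)
    if hist_type == "LCH":   # log of counts; log1p guards the empty bins
        return np.log1p(counts)
    raise ValueError(f"unknown histType: {hist_type}")

# one histogram per query term, over its similarities to all document terms
hist = matching_histogram(np.array([0.95, 0.4, -0.2, 0.4]))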
Example #2
class DirichletQL(AnseriniSearcherMixIn, Searcher):
    """ Anserini QL with Dirichlet smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "DirichletQL"
    config_spec = [
        ConfigOption("mu", 1000, "smoothing parameter", value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """
        Runs Dirichlet QL search. Takes a query from the topic files, and fires it against the index
        Args:
            topicsfn: Path to a topics file
            output_path: Path where the results of the search (i.e the run file) should be stored

        Returns: Path to the run file where the results of the search are stored

        """
        mustr = list2str(config["mu"], delimiter=" ")
        hits = config["hits"]
        anserini_param_str = f"-qld -qld.mu {mustr} -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path
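The ``"0.4..1,0.2"`` form mentioned in the docstring is a range shorthand (start..end plus a step) that is expanded into a list before the grid search. A rough sketch of the semantics; the actual parser lives in capreolus's config handling and may differ in details:

def expand_range(spec):
    """Expand a "start..end,step" string such as "0.4..1,0.2" into [0.4, 0.6, 0.8, 1.0]."""
    range_part, step = spec.split(",")
    start, end = (float(x) for x in range_part.split(".."))
    values, current = [], start
    while current <= end + 1e-9:  # tolerate float rounding at the endpoint
        values.append(round(current, 10))
        current += float(step)
    return values

assert expand_range("0.4..1,0.2") == [0.4, 0.6, 0.8, 1.0]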
Example #3
class BM25Grid(Searcher, AnseriniSearcherMixIn):
    """ Deprecated. BM25 with a grid search for k1 and b. Search is from 0.1 to bmax/k1max in 0.1 increments """

    module_name = "BM25Grid"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption(
            "k1max", 1.0,
            "maximum k1 value to include in grid search (starting at 0.1)"),
        ConfigOption(
            "bmax", 1.0,
            "maximum b value to include in grid search (starting at 0.1)"),
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        bs = np.around(np.arange(0.1, config["bmax"] + 0.1, 0.1), 1)
        k1s = np.around(np.arange(0.1, config["k1max"] + 0.1, 0.1), 1)
        bstr = " ".join(str(x) for x in bs)
        k1str = " ".join(str(x) for x in k1s)
        hits = config["hits"]
        anserini_param_str = f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str,
                                       output_path, config["fields"])

        return output_path
Example #4
class TFParade(Reranker):
    """
    TensorFlow implementation of PARADE.

    PARADE: Passage Representation Aggregation for Document Reranking.
    Canjia Li, Andrew Yates, Sean MacAvaney, Ben He, and Yingfei Sun. arXiv 2020.
    https://arxiv.org/pdf/2008.09093.pdf
    """

    module_name = "parade"

    dependencies = [
        Dependency(key="extractor",
                   module="extractor",
                   name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained", "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, or electra-base-msmarco"
        ),
        ConfigOption("aggregation", "transformer"),
    ]

    def build_model(self):
        self.model = TFParade_Class(self.extractor, self.config)
        return self.model
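With aggregation="transformer", PARADE combines the per-passage [CLS] representations by running a small transformer encoder over them and scoring from a learned document-level token. A hedged PyTorch sketch of that idea (the class above is TensorFlow, and its details differ):

import torch
import torch.nn as nn

class TransformerAggregation(nn.Module):
    """Sketch of PARADE-style aggregation over passage [CLS] vectors."""

    def __init__(self, hidden=768, layers=2, heads=12):
        super().__init__()
        self.doc_token = nn.Parameter(torch.randn(1, 1, hidden))  # learned document token
        layer = nn.TransformerEncoderLayer(d_model=hidden, nhead=heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=layers)
        self.score = nn.Linear(hidden, 1)

    def forward(self, passage_cls):  # (batch, num_passages, hidden)
        doc = self.doc_token.expand(passage_cls.size(0), -1, -1)
        out = self.encoder(torch.cat([doc, passage_cls], dim=1))
        return self.score(out[:, 0]).squeeze(-1)  # score read off the document token

scores = TransformerAggregation()(torch.randn(2, 4, 768))  # two documents, four passages each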
Example #5
class CDSSM(Reranker):
    """Yelong Shen, Xiaodong He, Jianfeng Gao, Li Deng, and Grégoire Mesnil. 2014. A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval. In CIKM'14."""

    module_name = "CDSSM"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("nkernel", 3, "kernel dimension in conv"),
        ConfigOption("nfilter", 1, "number of filters in conv"),
        ConfigOption("nhiddens", 30, "hidden layer dimension for ffw layer"),
        ConfigOption("windowsize", 3, "number of query/document words to concatenate before conv"),
        ConfigOption("dropoutrate", 0, "dropout rate for conv"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = CDSSM_class(self.extractor, self.config)

        return self.model

    def score(self, d):
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [self.model(pos_sentence, query_sentence).view(-1), self.model(neg_sentence, query_sentence).view(-1)]

    def test(self, data):
        query_sentence, pos_sentence = data["query"], data["posdoc"]

        return self.model(pos_sentence, query_sentence).view(-1)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)
Example #6
class DRMMTKS(Reranker):
    """Jiafeng Guo, Yixing Fan, Qingyao Ai, and W. Bruce Croft. 2016. A Deep Relevance Matching Model for Ad-hoc Retrieval. In CIKM'16."""

    # reference: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/models/drmmtks.py
    module_name = "DRMMTKS"

    config_spec = [
        ConfigOption("topk", 10, "number of bins in matching histogram"),
        ConfigOption("gateType", "IDF", "term gate type: TV or IDF"),
        ConfigOption("freezeemb", True, "term gate type: TV or IDF"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DRMMTKS_class(self.extractor, self.config)

        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]

        return self.model(pos_sentence, query_sentence, query_idf).view(-1)
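Unlike DRMM's histograms, DRMMTKS feeds the matching network the topk strongest similarity signals for each query term. In PyTorch terms, roughly:

import torch

simmat = torch.rand(8, 5, 200)  # (batch, query terms, document terms) cosine similarities
topk_signals, _ = torch.topk(simmat, k=10, dim=-1)  # keep the 10 strongest matches per query term
# topk_signals: (8, 5, 10), used in place of DRMM's matching histograms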
Example #7
class BM25(AnseriniSearcherMixIn, Searcher):
    """ Anserini BM25. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "BM25"
    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """
        Runs BM25 search. Takes a query from the topic files, and fires it against the index
        Args:
            topicsfn: Path to a topics file
            output_path: Path where the results of the search (i.e the run file) should be stored

        Returns: Path to the run file where the results of the search are stored

        """
        bstr, k1str = list2str(config["b"], delimiter=" "), list2str(config["k1"], delimiter=" ")
        hits = config["hits"]
        anserini_param_str = f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path
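A hedged usage sketch of the grid-search lists mentioned in the docstring, assuming the Searcher.create factory is exposed at the top level and using placeholder paths:

from capreolus import Searcher

# both parameters given as lists; every (k1, b) combination is scored in one Anserini call
searcher = Searcher.create("BM25", config={"k1": "0.7,0.9,1.1", "b": "0.4..1,0.2", "hits": 1000})
run_path = searcher.query_from_file("topics.robust04.txt", "runs/bm25_grid")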
Example #8
class TFBERTMaxP(Reranker):
    """
    TensorFlow implementation of BERT-MaxP.

    Deeper Text Understanding for IR with Contextual Neural Language Modeling. Zhuyun Dai and Jamie Callan. SIGIR 2019.
    https://arxiv.org/pdf/1905.09217.pdf
    """

    module_name = "TFBERTMaxP"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="bertpassage"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained",
            "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base-msmarco, or HuggingFace supported models",
        ),
        ConfigOption("aggregation", "max"),
        ConfigOption("hidden_dropout_prob", 0.1, "The dropout probability of BERT-like model's hidden layers."),
    ]

    def build_model(self):
        self.model = TFBERTMaxP_Class(self.extractor, self.config)
        return self.model
Example #9
class SampledRobust04(Robust04Yang19, SampleMixin):
    module_name = "sampled_rob04.title"
    file_fn = DATA_PATH / module_name
    config_spec = [
        ConfigOption("rate", 1.0,
                     "sampling rate: fraction number between 0 to 1"),
        ConfigOption("mode", "deep",
                     "sampling mode: sample along queries or documents"),
    ]
Example #10
class SampledGOV2(GOV2Benchmark, SampleMixin):
    module_name = "sampled_gov2.title"
    file_fn = DATA_PATH / module_name
    config_spec = [
        ConfigOption("rate", 1.0,
                     "sampling rate: fraction number between 0 to 1"),
        ConfigOption("mode", "deep",
                     "sampling mode: sample along queries or documents"),
    ]
Example #11
class QLJM(AnseriniSearcherMixIn, Searcher):
    """ Anserini QL with Jelinek-Mercer smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "QLJM"
    config_spec = [ConfigOption("lam", 0.1, value_type="floatlist"), ConfigOption("hits", 1000, "number of results to return")]

    def _query_from_file(self, topicsfn, output_path, config):
        anserini_param_str = "-qljm -qljm.lambda {0} -hits {1}".format(list2str(config["lam"], delimiter=" "), config["hits"])

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path
Example #12
class INL2(AnseriniSearcherMixIn, Searcher):
    """ Anserini I(n)L2 scoring model. This searcher does not support list parameters. """

    module_name = "INL2"
    config_spec = [
        ConfigOption("c", 0.1),  # array input of this parameter is not support by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        anserini_param_str = "-inl2 -inl2.c {0} -hits {1}".format(config["c"], config["hits"])
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)
        return output_path
Example #13
class KNRM(Reranker):
    """Chenyan Xiong, Zhuyun Dai, Jamie Callan, Zhiyuan Liu, and Russell Power. 2017. End-to-End Neural Ad-hoc Ranking with Kernel Pooling. In SIGIR'17."""

    module_name = "KNRM"

    config_spec = [
        ConfigOption("gradkernels", True, "backprop through mus and sigmas"),
        ConfigOption(
            "scoretanh", False,
            "use a tanh on the prediction as in paper (True) or do not use a nonlinearity (False)"
        ),
        ConfigOption(
            "singlefc", True,
            "use single fully connected layer as in paper (True) or 2 fully connected layers (False)"
        ),
        ConfigOption(
            "finetune", False,
            "fine tune the embedding layer"),  # TODO check save when True
    ]

    def add_summary(self, summary_writer, niter):
        super(KNRM, self).add_summary(summary_writer, niter)
        if self.config["singlefc"]:
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.matshow(self.model.combine[0].weight.data.cpu())
            summary_writer.add_figure("combine_steps weight", fig, niter)

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = KNRM_class(self.extractor, self.config)

        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]

        return self.model(pos_sentence, query_sentence, query_idf).view(-1)
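For reference, the kernel pooling that KNRM_class implements: each (mu, sigma) pair defines a Gaussian kernel over the query-document similarity matrix; kernel activations are summed over document terms and log-summed over query terms to produce one feature per kernel. A minimal sketch, not the repository's implementation:

import torch

def kernel_pool(simmat, mus, sigmas):
    # simmat: (batch, query terms, doc terms); mus/sigmas: (num kernels,)
    sim = simmat.unsqueeze(-1)                         # (B, Q, D, 1)
    k = torch.exp(-0.5 * ((sim - mus) / sigmas) ** 2)  # Gaussian kernel activations
    soft_tf = k.sum(dim=2)                             # sum over document terms -> (B, Q, K)
    return torch.log1p(soft_tf).sum(dim=1)             # log, then sum over query terms -> (B, K)

mus = torch.linspace(-0.9, 0.9, 10)   # kernel centers; gradkernels=True makes these learnable
sigmas = torch.full((10,), 0.1)
features = kernel_pool(torch.rand(2, 4, 50) * 2 - 1, mus, sigmas)  # (2, 10); an FC layer then scores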
Example #14
class AxiomaticSemanticMatching(Searcher, AnseriniSearcherMixIn):
    """ Anserini BM25 with Axiomatic query expansion. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "axiomatic"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption("k1",
                     0.9,
                     "controls term saturation",
                     value_type="floatlist"),
        ConfigOption("b",
                     0.4,
                     "controls document length normalization",
                     value_type="floatlist"),
        ConfigOption("r", 20, value_type="intlist"),
        ConfigOption("n", 30, value_type="intlist"),
        ConfigOption("beta", 0.4, value_type="floatlist"),
        ConfigOption("top", 20, value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        hits = str(config["hits"])

        anserini_param_str = "-axiom -axiom.deterministic -axiom.r {0} -axiom.n {1} -axiom.beta {2} -axiom.top {3}".format(
            *[list2str(config[k], " ") for k in ["r", "n", "beta", "top"]])
        anserini_param_str += " -bm25 -bm25.k1 {0} -bm25.b {1} ".format(
            *[list2str(config[k], " ") for k in ["k1", "b"]])
        anserini_param_str += f" -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str,
                                       output_path, config["fields"])

        return output_path
Example #15
class CEDRKNRM(Reranker):
    """
    PyTorch implementation of CEDR-KNRM.
    Equivalent to BERT-KNRM when cls=None.

    CEDR: Contextualized Embeddings for Document Ranking
    Sean MacAvaney, Andrew Yates, Arman Cohan, and Nazli Goharian. SIGIR 2019.
    https://arxiv.org/pdf/1904.07094
    """

    module_name = "CEDRKNRM"

    dependencies = [
        Dependency(key="extractor",
                   module="extractor",
                   name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained",
            "electra-base",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base, or electra-base-msmarco",
        ),
        ConfigOption("mus",
                     [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
                     "mus",
                     value_type="floatlist"),
        ConfigOption("sigma", 0.1, "sigma"),
        ConfigOption("gradkernels", True, "tune mus and sigmas"),
        ConfigOption(
            "hidden_dropout_prob", 0.1,
            "The dropout probability of BERT-like model's hidden layers."),
        ConfigOption("simmat_layers",
                     "0..12,1",
                     "Layer outputs to include in similarity matrix",
                     value_type="intlist"),
        ConfigOption(
            "combine_hidden", 1024,
            "Hidden size to use with combination FC layer (0 to disable)"),
        ConfigOption("cls", "avg", "Handling of CLS token: avg, max, or None"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = CEDRKNRM_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_mask"],
                       d["pos_seg"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_mask"],
                       d["neg_seg"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_mask"],
                          d["pos_seg"]).view(-1)
Example #16
class DSSM(Reranker):
    """Po-Sen Huang, Xiaodong He, Jianfeng Gao, Li Deng, Alex Acero, and Larry Heck. 2013. Learning deep structured semantic models for web search using clickthrough data. In CIKM'13."""

    module_name = "DSSM"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="bagofwords"),
        Dependency(key="trainer", module="trainer", name="pytorch", default_config_overrides={"lr": 0.0001}),
    ]
    config_spec = [
        ConfigOption(
            "nhiddens",
            "56",
            "list of hidden layer sizes (eg '56 128'), where the i'th value indicates the output size of the i'th layer",
        )
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DSSM_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)
Example #17
    class DynamicIRDCollection(IRDCollection):
        module_name = collection_dataset
        ird_dataset_name = collection_dataset
        config_spec = [ConfigOption("fields", ["body"], "fields to index", value_type="strlist")]
        collection_type = "JsonCollection"

        def doc_as_json(self, doc):
            content = " ".join((getattr(doc, field) for field in self.config["fields"]))
            return json.dumps({"id": doc.doc_id, "contents": content})
Example #18
class DeepTileBar(Reranker):
    """Zhiwen Tang and Grace Hui Yang. 2019. DeepTileBars: Visualizing Term Distribution for Neural Information Retrieval. In AAAI'19."""

    module_name = "DeepTileBar"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="deeptiles"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]

    config_spec = [
        ConfigOption("passagelen", 30),
        ConfigOption("numberfilter", 3),
        ConfigOption("lstmhiddendim", 3),
        ConfigOption("linearhiddendim1", 32),
        ConfigOption("linearhiddendim2", 16),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            config = copy.copy(dict(self.config))
            config["batch"] = self.trainer.config["batch"]
            self.model = DeepTileBar_class(self.extractor, config)

        return self.model

    def score(self, d):
        # concatenate each query's tile matrices along the batch dimension
        pos_tile_matrix = torch.cat([d["posdoc"][i] for i in range(len(d["qid"]))])
        neg_tile_matrix = torch.cat([d["negdoc"][i] for i in range(len(d["qid"]))])
        return self.model(pos_tile_matrix, neg_tile_matrix)

    def test(self, d):
        qids = d["qid"]
        pos_sentence = d["posdoc"]
        pos_tile_matrix = torch.cat(
            [pos_sentence[i] for i in range(len(qids))])

        return self.model.test_forward(pos_tile_matrix)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)
Example #19
class HINT(Reranker):
    """Yixing Fan, Jiafeng Guo, Yanyan Lan, Jun Xu, Chengxiang Zhai, and Xueqi Cheng. 2018. Modeling Diverse Relevance Patterns in Ad-hoc Retrieval. In SIGIR'18."""

    module_name = "HINT"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("spatialGRU", 2),
        ConfigOption("LSTMdim", 6),
        ConfigOption("kmax", 10)
    ]

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return self.model(query_sentence, query_idf, pos_sentence,
                          neg_sentence)

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]

        return self.model.test_forward(query_sentence, query_idf, pos_sentence)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)

    def build_model(self):
        if not hasattr(self, "model"):
            config = dict(self.config)
            config.update(self.extractor.config)
            config["batch"] = self.trainer.config["batch"]
            self.model = HiNT_main(self.extractor, config)

        return self.model
Example #20
class CodeSearchNet(Collection):
    """CodeSearchNet Corpus. [1]

    [1] Hamel Husain, Ho-Hsiang Wu, Tiferet Gazit, Miltiadis Allamanis, and Marc Brockschmidt. 2019. CodeSearchNet Challenge: Evaluating the State of Semantic Code Search. arXiv 2019.
    """

    module_name = "codesearchnet"
    url = "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2"
    collection_type = "TrecCollection"  # TODO: any other supported type?
    generator_type = "DefaultLuceneDocumentGenerator"
    config_spec = [ConfigOption("lang", "ruby", "CSN language dataset to use")]

    def download_if_missing(self):
        cachedir = self.get_cache_path()
        document_dir = cachedir / "documents"
        coll_filename = document_dir / ("csn-" + self.config["lang"] +
                                        "-collection.txt")

        if coll_filename.exists():
            return document_dir.as_posix()

        zipfile = self.config["lang"] + ".zip"
        lang_url = f"{self.url}/{zipfile}"
        tmp_dir = cachedir / "tmp"
        zip_path = tmp_dir / zipfile

        if zip_path.exists():
            logger.info(
                f"{zipfile} already exists under directory {tmp_dir}, skipping download"
            )
        else:
            tmp_dir.mkdir(exist_ok=True, parents=True)
            download_file(lang_url, zip_path)

        document_dir.mkdir(exist_ok=True, parents=True)
        with ZipFile(zip_path, "r") as zipobj:
            zipobj.extractall(tmp_dir)

        pkl_path = tmp_dir / (self.config["lang"] +
                              "_dedupe_definitions_v2.pkl")
        self._pkl2trec(pkl_path, coll_filename)
        return document_dir.as_posix()

    def _pkl2trec(self, pkl_path, trec_path):
        lang = self.config["lang"]
        with open(pkl_path, "rb") as f:
            codes = pickle.load(f)

        with open(trec_path, "w", encoding="utf-8") as fout:
            for i, code in tqdm(enumerate(codes),
                                desc=f"Preparing the {lang} collection file"):
                docno = f"{lang}-FUNCTION-{i}"
                doc = remove_newline(" ".join(code["function_tokens"]))
                fout.write(document_to_trectxt(docno, doc))
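document_to_trectxt wraps each function as a TREC-text document; the output for one record looks roughly like this (exact whitespace is up to the helper):

# sketch of one emitted document
sample = """<DOC>
<DOCNO>ruby-FUNCTION-0</DOCNO>
<TEXT>
def hello ( name ) puts name end
</TEXT>
</DOC>
"""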
Example #21
class Birch(Reranker):
    module_name = "birch"

    config_spec = [
        ConfigOption("topk", 3, "top k scores to use"),
        ConfigOption(
            "hidden", 0,
            "size of hidden layer or 0 to take the weighted sum of the topk"),
        ConfigOption("finetune", False, "fine-tune the BERT model"),
        ConfigOption(
            "pretrained", "msmarco_mb",
            "pretrained Birch model to load: mb, msmarco_mb, or car_mb"),
    ]
    dependencies = [
        Dependency(
            key="extractor",
            module="extractor",
            name="bertpassage",
            default_config_overrides={
                "tokenizer": {
                    "pretrained": "bert-large-uncased"
                }
            },
        ),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]

    def build_model(self):
        self.model = Birch_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_seg"],
                       d["pos_mask"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_seg"],
                       d["neg_mask"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_seg"],
                          d["pos_mask"]).view(-1)
Example #22
class BM25PostProcess(BM25, PostprocessMixin):
    module_name = "BM25Postprocess"

    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results expected from the core searcher"),
        ConfigOption("topn", 1000, "number of results expected after the filtering (if any)"),
        ConfigOption("dedup", False),
    ]

    def query_from_file(self, topicsfn, output_path, docs_to_remove=None):
        output_path = super().query_from_file(topicsfn, output_path)  # will call _query_from_file() from BM25

        if docs_to_remove:
            output_path = self.filter(output_path, docs_to_remove=docs_to_remove, topn=self.config["topn"])
        if self.config["dedup"]:
            output_path = self.dedup(output_path, topn=self.config["topn"])

        return output_path
Example #23
    class DynamicIRDBenchmark(IRDBenchmark):
        module_name = ",".join(dataset)
        config_spec = [ConfigOption("query_type", "title")]

        @property
        def query_type(self):
            return self.config["query_type"]

        @property
        def queries(self):
            return self.topics[self.query_type]
Example #24
class PTParade(Reranker):
    """
    PyTorch implementation of PARADE.

    PARADE: Passage Representation Aggregation for Document Reranking.
    Canjia Li, Andrew Yates, Sean MacAvaney, Ben He, and Yingfei Sun. arXiv 2020.
    https://arxiv.org/pdf/2008.09093.pdf
    """

    module_name = "ptparade"

    dependencies = [
        Dependency(key="extractor",
                   module="extractor",
                   name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained", "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, or electra-base-msmarco"
        ),
        ConfigOption("aggregation", "transformer"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = PTParade_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_mask"],
                       d["pos_seg"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_mask"],
                       d["neg_seg"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_mask"],
                          d["pos_seg"]).view(-1)
Example #25
class TFKNRM(Reranker):
    """TensorFlow implementation of KNRM.

    Chenyan Xiong, Zhuyun Dai, Jamie Callan, Zhiyuan Liu, and Russell Power. 2017. End-to-End Neural Ad-hoc Ranking with Kernel Pooling. In SIGIR'17.
    """

    module_name = "TFKNRM"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption("gradkernels", True, "backprop through mus and sigmas"),
        ConfigOption("finetune", False, "fine tune the embedding layer"),  # TODO check save when True
    ]

    def build_model(self):
        self.model = TFKNRM_Class(self.extractor, self.config)

        return self.model
Example #26
class BM25PRF(AnseriniSearcherMixIn, Searcher):
    """ Anserini BM25 PRF. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "BM25PRF"
    config_spec = [
        ConfigOption("k1", [0.65, 0.70, 0.75], "controls term saturation", value_type="floatlist"),
        ConfigOption("b", [0.60, 0.7], "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [65, 70, 95, 100], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10, 15], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("newTermWeight", [0.2, 0.25], value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        hits = str(config["hits"])

        anserini_param_str = (
            "-bm25prf "
            + " ".join(f"-bm25prf.{k} {list2str(config[k], ' ')}" for k in ["fbTerms", "fbDocs", "newTermWeight", "k1", "b"])
            + " -bm25 "
            + " ".join(f"-bm25.{k} {list2str(config[k], ' ')}" for k in ["k1", "b"])
            + f" -hits {hits}"
        )
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path
Example #27
class BM25RM3(AnseriniSearcherMixIn, Searcher):
    """ Anserini BM25 with RM3 expansion. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "BM25RM3"
    config_spec = [
        ConfigOption("k1", "0.9", "controls term saturation", value_type="floatlist"),
        ConfigOption("b", "0.4", "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [5, 25], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("originalQueryWeight", [0.5], "the weight of unexpended query", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        hits = str(config["hits"])

        anserini_param_str = (
            "-rm3 "
            + " ".join(f"-rm3.{k} {list2str(config[k], ' ')}" for k in ["fbTerms", "fbDocs", "originalQueryWeight"])
            + " -bm25 "
            + " ".join(f"-bm25.{k} {list2str(config[k], ' ')}" for k in ["k1", "b"])
            + f" -hits {hits}"
        )
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path
Example #28
class AnseriniTokenizer(Tokenizer):
    module_name = "anserini"
    config_spec = [
        ConfigOption("keepstops", True, "keep stopwords if True"),
        ConfigOption("stemmer", "none", "stemmer: porter, krovetz, or none"),
    ]

    def build(self):
        self._tokenize = self._get_tokenize_fn()

    def _get_tokenize_fn(self):
        from jnius import autoclass

        stemmer, keepstops = self.config["stemmer"], self.config["keepstops"]
        if stemmer is None:
            stemmer = "none"

        emptyjchar = autoclass("org.apache.lucene.analysis.CharArraySet").EMPTY_SET
        Analyzer = autoclass("io.anserini.analysis.DefaultEnglishAnalyzer")
        if keepstops:
            # pass an empty stopword set so that stopwords are kept
            analyzer = Analyzer.newStemmingInstance(stemmer, emptyjchar)
        else:
            analyzer = Analyzer.newStemmingInstance(stemmer)
        tokenizefn = autoclass("io.anserini.analysis.AnalyzerUtils").analyze

        def _tokenize(sentence):
            return tokenizefn(analyzer, sentence).toArray()

        return _tokenize

    def tokenize(self, sentences):
        if not sentences:  # covers both "" and []
            return []

        if isinstance(sentences, str):
            return self._tokenize(sentences)

        return [self._tokenize(s) for s in sentences]
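A hedged usage sketch; this requires a JVM with the Anserini jar on the pyjnius classpath, and assumes the Tokenizer.create factory is exposed at the top level (build() may already be invoked by the framework on creation):

from capreolus import Tokenizer

tokenizer = Tokenizer.create("anserini", config={"keepstops": False, "stemmer": "porter"})
tokenizer.build()
print(tokenizer.tokenize("The running dogs"))  # e.g. ['run', 'dog'] after stopping/stemming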
Example #29
class F2Log(Searcher, AnseriniSearcherMixIn):
    """
    F2Log scoring model. This searcher does not support list parameters.
    """

    module_name = "F2Log"
    dependencies = [Dependency(key="index", module="index", name="anserini")]

    config_spec = [
        ConfigOption(
            "s", 0.5
        ),  # array input of this parameter is not supported by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        anserini_param_str = "-f2log -f2log.s {0} -hits {1}".format(
            config["s"], config["hits"])

        self._anserini_query_from_file(topicsfn, anserini_param_str,
                                       output_path, config["fields"])

        return output_path
Example #30
class TutorialTask(Task):
    module_name = "tutorial"
    config_spec = [
        ConfigOption("optimize", "map",
                     "metric to maximize on the validation set")
    ]
    dependencies = [
        Dependency(key="benchmark",
                   module="benchmark",
                   name="nf",
                   provide_this=True,
                   provide_children=["collection"]),
        Dependency(key="searcher1", module="searcher", name="BM25RM3"),
        Dependency(key="searcher2", module="searcher", name="SDM"),
    ]

    commands = ["run"] + Task.help_commands
    default_command = "run"

    def run(self):
        output_dir = self.get_results_path()

        # read the title queries from the chosen benchmark's topic file
        results1 = self.searcher1.query_from_file(
            self.benchmark.get_topics_file(), output_dir / "searcher1")
        results2 = self.searcher2.query_from_file(
            self.benchmark.get_topics_file(), output_dir / "searcher2")
        searcher_results = [results1, results2]

        # using the benchmark's folds, which each contain train/validation/test queries,
        # choose the best run in `output_dir` for the fold based on the validation queries
        # and return metrics calculated on the test queries
        best_results = evaluator.search_best_run(
            searcher_results,
            self.benchmark,
            primary_metric=self.config["optimize"],
            metrics=evaluator.DEFAULT_METRICS)

        for fold, path in best_results["path"].items():
            shortpath = "..." + path[-40:]
            logger.info("fold=%s best run: %s", fold, shortpath)

        logger.info("cross-validated results when optimizing for '%s':",
                    self.config["optimize"])
        for metric, score in sorted(best_results["score"].items()):
            logger.info("%15s: %0.4f", metric, score)

        return best_results
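A hedged sketch of driving this task programmatically, assuming the Task.create factory; capreolus tasks are usually launched from the CLI, and the exact factory signature may differ:

from capreolus import Task

task = Task.create("tutorial", config={"optimize": "ndcg_cut_10"})  # "ndcg_cut_10" assumes trec_eval-style metric names
best_results = task.run()
for metric, score in sorted(best_results["score"].items()):
    print(metric, score)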