class DRMM(Reranker):
    """Jiafeng Guo, Yixing Fan, Qingyao Ai, and W. Bruce Croft. 2016. A Deep Relevance Matching Model for Ad-hoc Retrieval. In CIKM'16."""

    module_name = "DRMM"
    config_spec = [
        ConfigOption("nbins", 29, "number of bins in matching histogram"),
        ConfigOption("nodes", 5, "hidden layer dimension for feed forward matching network"),
        ConfigOption("histType", "LCH", "histogram type: CH, NH, or LCH"),
        ConfigOption("gateType", "IDF", "term gate type: TV or IDF"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DRMM_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)

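# A minimal sketch of how DRMM's matching histogram could be computed for one query
# term, assuming pre-normalized embeddings; the `matching_histogram` helper and its
# array layout are illustrative and not part of DRMM_class. "CH" counts cosine
# similarities per bin, "NH" normalizes the counts, and "LCH" takes the log of the counts.
import numpy as np


def matching_histogram(query_vec, doc_vecs, nbins=29, hist_type="LCH"):
    # cosine similarities between one query term and every document term,
    # assuming both sides are already L2-normalized
    sims = doc_vecs @ query_vec
    # bucket similarities in [-1, 1] into `nbins` bins
    counts, _ = np.histogram(sims, bins=nbins, range=(-1.0, 1.0))
    counts = counts.astype(np.float32)
    if hist_type == "CH":
        return counts
    if hist_type == "NH":
        return counts / max(counts.sum(), 1.0)
    # LCH: log of the counts (log1p avoids log(0) for empty bins)
    return np.log1p(counts)
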
class DirichletQL(AnseriniSearcherMixIn, Searcher):
    """
    Anserini QL with Dirichlet smoothing.

    This searcher's parameters can also be specified as lists indicating parameters to grid search
    (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``).
    """

    module_name = "DirichletQL"
    config_spec = [
        ConfigOption("mu", 1000, "smoothing parameter", value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """
        Runs Dirichlet QL search. Takes the queries from the topics file and fires them against the index.

        Args:
            topicsfn: Path to a topics file
            output_path: Path where the results of the search (i.e., the run file) should be stored

        Returns: Path to the run file where the results of the search are stored
        """
        mustr = list2str(config["mu"], delimiter=" ")
        hits = config["hits"]
        anserini_param_str = f"-qld -qld.mu {mustr} -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path

class BM25Grid(Searcher, AnseriniSearcherMixIn):
    """
    Deprecated. BM25 with a grid search over k1 and b. Each grid runs from 0.1 to k1max/bmax in 0.1 increments.
    """

    module_name = "BM25Grid"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption("k1max", 1.0, "maximum k1 value to include in grid search (starting at 0.1)"),
        ConfigOption("bmax", 1.0, "maximum b value to include in grid search (starting at 0.1)"),
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        bs = np.around(np.arange(0.1, config["bmax"] + 0.1, 0.1), 1)
        k1s = np.around(np.arange(0.1, config["k1max"] + 0.1, 0.1), 1)
        bstr = " ".join(str(x) for x in bs)
        k1str = " ".join(str(x) for x in k1s)
        hits = config["hits"]
        anserini_param_str = f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path, config["fields"])

        return output_path

class TFParade(Reranker):
    """
    TensorFlow implementation of PARADE.

    PARADE: Passage Representation Aggregation for Document Reranking.
    Canjia Li, Andrew Yates, Sean MacAvaney, Ben He, and Yingfei Sun. arXiv 2020.
    https://arxiv.org/pdf/2008.09093.pdf
    """

    module_name = "parade"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained", "bert-base-uncased", "Pretrained model: bert-base-uncased, bert-base-msmarco, or electra-base-msmarco"
        ),
        ConfigOption("aggregation", "transformer"),
    ]

    def build_model(self):
        self.model = TFParade_Class(self.extractor, self.config)
        return self.model

class CDSSM(Reranker):
    """Yelong Shen, Xiaodong He, Jianfeng Gao, Li Deng, and Grégoire Mesnil. 2014. A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval. In CIKM'14."""

    module_name = "CDSSM"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("nkernel", 3, "kernel dimension in conv"),
        ConfigOption("nfilter", 1, "number of filters in conv"),
        ConfigOption("nhiddens", 30, "hidden layer dimension for ffw layer"),
        ConfigOption("windowsize", 3, "number of query/document words to concatenate before conv"),
        ConfigOption("dropoutrate", 0, "dropout rate for conv"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = CDSSM_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [self.model(pos_sentence, query_sentence).view(-1), self.model(neg_sentence, query_sentence).view(-1)]

    def test(self, data):
        query_sentence, pos_sentence = data["query"], data["posdoc"]
        return self.model(pos_sentence, query_sentence).view(-1)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)

class DRMMTKS(Reranker):
    """Jiafeng Guo, Yixing Fan, Qingyao Ai, and W. Bruce Croft. 2016. A Deep Relevance Matching Model for Ad-hoc Retrieval. In CIKM'16."""

    # reference: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/models/drmmtks.py
    module_name = "DRMMTKS"
    config_spec = [
        ConfigOption("topk", 10, "number of top-k matching signals to keep per query term"),
        ConfigOption("gateType", "IDF", "term gate type: TV or IDF"),
        ConfigOption("freezeemb", True, "freeze the embedding layer during training"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DRMMTKS_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)

class BM25(AnseriniSearcherMixIn, Searcher):
    """
    Anserini BM25.

    This searcher's parameters can also be specified as lists indicating parameters to grid search
    (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``).
    """

    module_name = "BM25"
    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """
        Runs BM25 search. Takes the queries from the topics file and fires them against the index.

        Args:
            topicsfn: Path to a topics file
            output_path: Path where the results of the search (i.e., the run file) should be stored

        Returns: Path to the run file where the results of the search are stored
        """
        bstr, k1str = list2str(config["b"], delimiter=" "), list2str(config["k1"], delimiter=" ")
        hits = config["hits"]
        anserini_param_str = f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path

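# A minimal sketch of the Anserini parameter string produced when list-valued
# parameters are used for grid search; the `list2str` behavior shown here (joining
# values with a space) is an assumption based on how it is called above. Anserini's
# SearchCollection then produces one run per (b, k1) combination.
b_values = [0.4, 0.6, 0.8]
k1_values = [0.9]
bstr = " ".join(str(v) for v in b_values)
k1str = " ".join(str(v) for v in k1_values)
assert f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits 1000" == "-bm25 -bm25.b 0.4 0.6 0.8 -bm25.k1 0.9 -hits 1000"
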
class TFBERTMaxP(Reranker):
    """
    TensorFlow implementation of BERT-MaxP.

    Deeper Text Understanding for IR with Contextual Neural Language Modeling.
    Zhuyun Dai and Jamie Callan. SIGIR 2019.
    https://arxiv.org/pdf/1905.09217.pdf
    """

    module_name = "TFBERTMaxP"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="bertpassage"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained",
            "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base-msmarco, or HuggingFace supported models",
        ),
        ConfigOption("aggregation", "max"),
        ConfigOption("hidden_dropout_prob", 0.1, "The dropout probability of BERT-like model's hidden layers."),
    ]

    def build_model(self):
        self.model = TFBERTMaxP_Class(self.extractor, self.config)
        return self.model

class SampledRobust04(Robust04Yang19, SampleMixin):
    module_name = "sampled_rob04.title"
    file_fn = DATA_PATH / module_name
    config_spec = [
        ConfigOption("rate", 1.0, "sampling rate: fraction between 0 and 1"),
        ConfigOption("mode", "deep", "sampling mode: sample along queries or documents"),
    ]

class SampledGOV2(GOV2Benchmark, SampleMixin):
    module_name = "sampled_gov2.title"
    file_fn = DATA_PATH / module_name
    config_spec = [
        ConfigOption("rate", 1.0, "sampling rate: fraction between 0 and 1"),
        ConfigOption("mode", "deep", "sampling mode: sample along queries or documents"),
    ]

class QLJM(AnseriniSearcherMixIn, Searcher):
    """
    Anserini QL with Jelinek-Mercer smoothing.

    This searcher's parameters can also be specified as lists indicating parameters to grid search
    (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``).
    """

    module_name = "QLJM"
    config_spec = [
        ConfigOption("lam", 0.1, "smoothing parameter lambda", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        anserini_param_str = "-qljm -qljm.lambda {0} -hits {1}".format(list2str(config["lam"], delimiter=" "), config["hits"])
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path

class INL2(AnseriniSearcherMixIn, Searcher):
    """
    Anserini I(n)L2 scoring model. This searcher does not support list parameters.
    """

    module_name = "INL2"
    config_spec = [
        ConfigOption("c", 0.1),  # array input for this parameter is not supported by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        anserini_param_str = "-inl2 -inl2.c {0} -hits {1}".format(config["c"], config["hits"])
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path

class KNRM(Reranker):
    """Chenyan Xiong, Zhuyun Dai, Jamie Callan, Zhiyuan Liu, and Russell Power. 2017. End-to-End Neural Ad-hoc Ranking with Kernel Pooling. In SIGIR'17."""

    module_name = "KNRM"
    config_spec = [
        ConfigOption("gradkernels", True, "backprop through mus and sigmas"),
        ConfigOption("scoretanh", False, "use a tanh on the prediction as in paper (True) or do not use a nonlinearity (False)"),
        ConfigOption("singlefc", True, "use single fully connected layer as in paper (True) or 2 fully connected layers (False)"),
        ConfigOption("finetune", False, "fine tune the embedding layer"),  # TODO check save when True
    ]

    def add_summary(self, summary_writer, niter):
        super(KNRM, self).add_summary(summary_writer, niter)
        if self.config["singlefc"]:
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.matshow(self.model.combine[0].weight.data.cpu())
            summary_writer.add_figure("combine_steps weight", fig, niter)

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = KNRM_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)

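# A minimal sketch of KNRM-style RBF kernel pooling over a query-document cosine
# similarity matrix; the mu/sigma values and the `kernel_pool` helper are illustrative
# and are not taken from KNRM_class itself.
import torch


def kernel_pool(simmat, mus, sigma=0.1):
    # simmat: (num_query_terms, num_doc_terms) cosine similarities
    # one RBF kernel per mu, soft-counting similarities near that mu
    kernels = torch.exp(-0.5 * ((simmat.unsqueeze(-1) - torch.tensor(mus)) / sigma) ** 2)
    # sum over document terms, then log and sum over query terms -> one feature per kernel
    per_query = torch.log(kernels.sum(dim=1).clamp(min=1e-10))
    return per_query.sum(dim=0)


example_mus = [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
features = kernel_pool(torch.rand(4, 20) * 2 - 1, example_mus)  # shape: (len(example_mus),)
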
class AxiomaticSemanticMatching(Searcher, AnseriniSearcherMixIn):
    """
    Anserini BM25 with Axiomatic query expansion.

    This searcher's parameters can also be specified as lists indicating parameters to grid search
    (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``).
    """

    module_name = "axiomatic"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("r", 20, value_type="intlist"),
        ConfigOption("n", 30, value_type="intlist"),
        ConfigOption("beta", 0.4, value_type="floatlist"),
        ConfigOption("top", 20, value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        hits = str(config["hits"])

        anserini_param_str = "-axiom -axiom.deterministic -axiom.r {0} -axiom.n {1} -axiom.beta {2} -axiom.top {3}".format(
            *[list2str(config[k], " ") for k in ["r", "n", "beta", "top"]]
        )
        anserini_param_str += " -bm25 -bm25.k1 {0} -bm25.b {1} ".format(*[list2str(config[k], " ") for k in ["k1", "b"]])
        anserini_param_str += f" -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path, config["fields"])

        return output_path

class CEDRKNRM(Reranker):
    """
    PyTorch implementation of CEDR-KNRM. Equivalent to BERT-KNRM when cls=None.

    CEDR: Contextualized Embeddings for Document Ranking
    Sean MacAvaney, Andrew Yates, Arman Cohan, and Nazli Goharian. SIGIR 2019.
    https://arxiv.org/pdf/1904.07094
    """

    module_name = "CEDRKNRM"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained",
            "electra-base",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base, or electra-base-msmarco",
        ),
        ConfigOption("mus", [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9], "mus", value_type="floatlist"),
        ConfigOption("sigma", 0.1, "sigma"),
        ConfigOption("gradkernels", True, "tune mus and sigmas"),
        ConfigOption("hidden_dropout_prob", 0.1, "The dropout probability of BERT-like model's hidden layers."),
        ConfigOption("simmat_layers", "0..12,1", "Layer outputs to include in similarity matrix", value_type="intlist"),
        ConfigOption("combine_hidden", 1024, "Hidden size to use with combination FC layer (0 to disable)"),
        ConfigOption("cls", "avg", "Handling of CLS token: avg, max, or None"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = CEDRKNRM_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_mask"], d["pos_seg"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_mask"], d["neg_seg"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_mask"], d["pos_seg"]).view(-1)

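# A minimal sketch of how a range-style list value such as simmat_layers="0..12,1"
# could expand into explicit layer indices (start..stop with a step, inclusive of the
# stop value, mirroring the "0.4..1,0.2" grid syntax documented on the searchers above);
# the `expand_range` helper is illustrative, not the repository's actual parser.
def expand_range(spec):
    bounds, step = spec.split(",")
    start, stop = bounds.split("..")
    return list(range(int(start), int(stop) + 1, int(step)))


assert expand_range("0..12,1") == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
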
class DSSM(Reranker):
    """Po-Sen Huang, Xiaodong He, Jianfeng Gao, Li Deng, Alex Acero, and Larry Heck. 2013. Learning deep structured semantic models for web search using clickthrough data. In CIKM'13."""

    module_name = "DSSM"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="bagofwords"),
        Dependency(key="trainer", module="trainer", name="pytorch", default_config_overrides={"lr": 0.0001}),
    ]
    config_spec = [
        ConfigOption(
            "nhiddens",
            "56",
            "list of hidden layer sizes (e.g., '56 128'), where the i'th value indicates the output size of the i'th layer",
        )
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DSSM_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)

class DynamicIRDCollection(IRDCollection):
    module_name = collection_dataset
    ird_dataset_name = collection_dataset
    config_spec = [ConfigOption("fields", ["body"], "fields to index", value_type="strlist")]
    collection_type = "JsonCollection"

    def doc_as_json(self, doc):
        content = " ".join((getattr(doc, field) for field in self.config["fields"]))
        return json.dumps({"id": doc.doc_id, "contents": content})

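# A minimal sketch of the JSON that doc_as_json produces for Anserini's JsonCollection;
# the document id and field text shown are illustrative.
# With fields=["body"], a document with doc_id "d1" and body "hello world" becomes:
#
#   {"id": "d1", "contents": "hello world"}
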
class DeepTileBar(Reranker):
    """Zhiwen Tang and Grace Hui Yang. 2019. DeepTileBars: Visualizing Term Distribution for Neural Information Retrieval. In AAAI'19."""

    module_name = "DeepTileBar"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="deeptiles"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("passagelen", 30),
        ConfigOption("numberfilter", 3),
        ConfigOption("lstmhiddendim", 3),
        ConfigOption("linearhiddendim1", 32),
        ConfigOption("linearhiddendim2", 16),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            config = copy.copy(dict(self.config))
            config["batch"] = self.trainer.config["batch"]
            self.model = DeepTileBar_class(self.extractor, config)
        return self.model

    def score(self, d):
        pos_tile_matrix = torch.cat([d["posdoc"][i] for i in range(len(d["qid"]))])  # 32 x
        neg_tile_matrix = torch.cat([d["negdoc"][i] for i in range(len(d["qid"]))])
        return self.model(pos_tile_matrix, neg_tile_matrix)

    def test(self, d):
        qids = d["qid"]
        pos_sentence = d["posdoc"]
        pos_tile_matrix = torch.cat([pos_sentence[i] for i in range(len(qids))])
        return self.model.test_forward(pos_tile_matrix)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)

class HINT(Reranker):
    """Yixing Fan, Jiafeng Guo, Yanyan Lan, Jun Xu, Chengxiang Zhai, and Xueqi Cheng. 2018. Modeling Diverse Relevance Patterns in Ad-hoc Retrieval. In SIGIR'18."""

    module_name = "HINT"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("spatialGRU", 2),
        ConfigOption("LSTMdim", 6),
        ConfigOption("kmax", 10),
    ]

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return self.model(query_sentence, query_idf, pos_sentence, neg_sentence)

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model.test_forward(query_sentence, query_idf, pos_sentence)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)

    def build_model(self):
        if not hasattr(self, "model"):
            config = dict(self.config)
            config.update(self.extractor.config)
            config["batch"] = self.trainer.config["batch"]
            self.model = HiNT_main(self.extractor, config)
        return self.model

class CodeSearchNet(Collection):
    """CodeSearchNet Corpus. [1]

    [1] Hamel Husain, Ho-Hsiang Wu, Tiferet Gazit, Miltiadis Allamanis, and Marc Brockschmidt. 2019.
    CodeSearchNet Challenge: Evaluating the State of Semantic Code Search. arXiv 2019.
    """

    module_name = "codesearchnet"
    url = "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2"
    collection_type = "TrecCollection"  # TODO: any other supported type?
    generator_type = "DefaultLuceneDocumentGenerator"
    config_spec = [ConfigOption("lang", "ruby", "CSN language dataset to use")]

    def download_if_missing(self):
        cachedir = self.get_cache_path()
        document_dir = cachedir / "documents"
        coll_filename = document_dir / ("csn-" + self.config["lang"] + "-collection.txt")

        if coll_filename.exists():
            return document_dir.as_posix()

        zipfile = self.config["lang"] + ".zip"
        lang_url = f"{self.url}/{zipfile}"
        tmp_dir = cachedir / "tmp"
        zip_path = tmp_dir / zipfile

        if zip_path.exists():
            logger.info(f"{zipfile} already exists in directory {tmp_dir}, skipping download")
        else:
            tmp_dir.mkdir(exist_ok=True, parents=True)
            download_file(lang_url, zip_path)

        document_dir.mkdir(exist_ok=True, parents=True)  # tmp
        with ZipFile(zip_path, "r") as zipobj:
            zipobj.extractall(tmp_dir)

        pkl_path = tmp_dir / (self.config["lang"] + "_dedupe_definitions_v2.pkl")
        self._pkl2trec(pkl_path, coll_filename)
        return document_dir.as_posix()

    def _pkl2trec(self, pkl_path, trec_path):
        lang = self.config["lang"]
        with open(pkl_path, "rb") as f:
            codes = pickle.load(f)

        with open(trec_path, "w", encoding="utf-8") as fout:
            for i, code in tqdm(enumerate(codes), desc=f"Preparing the {lang} collection file"):
                docno = f"{lang}-FUNCTION-{i}"
                doc = remove_newline(" ".join(code["function_tokens"]))
                fout.write(document_to_trectxt(docno, doc))

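# A minimal sketch of the TREC-format text that document_to_trectxt presumably emits for
# each function (standard <DOC>/<DOCNO>/<TEXT> markup consumed by Anserini's TrecCollection);
# the exact whitespace and the example docno/body are assumptions.
def example_trectxt(docno, doc):
    return f"<DOC>\n<DOCNO>{docno}</DOCNO>\n<TEXT>\n{doc}\n</TEXT>\n</DOC>\n"


print(example_trectxt("ruby-FUNCTION-0", "def add ( a , b ) a + b end"))
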
class Birch(Reranker):
    module_name = "birch"

    config_spec = [
        ConfigOption("topk", 3, "top k scores to use"),
        ConfigOption("hidden", 0, "size of hidden layer or 0 to take the weighted sum of the topk"),
        ConfigOption("finetune", False, "fine-tune the BERT model"),
        ConfigOption("pretrained", "msmarco_mb", "pretrained Birch model to load: mb, msmarco_mb, or car_mb"),
    ]
    dependencies = [
        Dependency(
            key="extractor",
            module="extractor",
            name="bertpassage",
            default_config_overrides={"tokenizer": {"pretrained": "bert-large-uncased"}},
        ),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]

    def build_model(self):
        self.model = Birch_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_seg"], d["pos_mask"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_seg"], d["neg_mask"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_seg"], d["pos_mask"]).view(-1)

class BM25PostProcess(BM25, PostprocessMixin):
    module_name = "BM25Postprocess"

    config_spec = [
        ConfigOption("k1", 0.9, "controls term saturation", value_type="floatlist"),
        ConfigOption("b", 0.4, "controls document length normalization", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results expected from the core searcher"),
        ConfigOption("topn", 1000, "number of results expected after the filtering (if any)"),
        ConfigOption("dedup", False),
    ]

    def query_from_file(self, topicsfn, output_path, docs_to_remove=None):
        output_path = super().query_from_file(topicsfn, output_path)  # will call _query_from_file() from BM25

        if docs_to_remove:
            output_path = self.filter(output_path, docs_to_remove=docs_to_remove, topn=self.config["topn"])
        if self.config["dedup"]:
            output_path = self.dedup(output_path, topn=self.config["topn"])

        return output_path

class DynamicIRDBenchmark(IRDBenchmark):
    module_name = ",".join(dataset)
    config_spec = [ConfigOption("query_type", "title")]

    @property
    def query_type(self):
        return self.config["query_type"]

    @property
    def queries(self):
        return self.topics[self.query_type]

class PTParade(Reranker):
    """
    PyTorch implementation of PARADE.

    PARADE: Passage Representation Aggregation for Document Reranking.
    Canjia Li, Andrew Yates, Sean MacAvaney, Ben He, and Yingfei Sun. arXiv 2020.
    https://arxiv.org/pdf/2008.09093.pdf
    """

    module_name = "ptparade"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained", "bert-base-uncased", "Pretrained model: bert-base-uncased, bert-base-msmarco, or electra-base-msmarco"
        ),
        ConfigOption("aggregation", "transformer"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = PTParade_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_mask"], d["pos_seg"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_mask"], d["neg_seg"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_mask"], d["pos_seg"]).view(-1)

class TFKNRM(Reranker):
    """TensorFlow implementation of KNRM.

    Chenyan Xiong, Zhuyun Dai, Jamie Callan, Zhiyuan Liu, and Russell Power. 2017. End-to-End Neural Ad-hoc Ranking with Kernel Pooling. In SIGIR'17.
    """

    module_name = "TFKNRM"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption("gradkernels", True, "backprop through mus and sigmas"),
        ConfigOption("finetune", False, "fine tune the embedding layer"),  # TODO check save when True
    ]

    def build_model(self):
        self.model = TFKNRM_Class(self.extractor, self.config)
        return self.model

class BM25PRF(AnseriniSearcherMixIn, Searcher):
    """
    Anserini BM25 PRF.

    This searcher's parameters can also be specified as lists indicating parameters to grid search
    (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``).
    """

    module_name = "BM25PRF"
    config_spec = [
        ConfigOption("k1", [0.65, 0.70, 0.75], "controls term saturation", value_type="floatlist"),
        ConfigOption("b", [0.60, 0.7], "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [65, 70, 95, 100], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10, 15], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("newTermWeight", [0.2, 0.25], value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        hits = str(config["hits"])

        anserini_param_str = (
            "-bm25prf "
            + " ".join(f"-bm25prf.{k} {list2str(config[k], ' ')}" for k in ["fbTerms", "fbDocs", "newTermWeight", "k1", "b"])
            + " -bm25 "
            + " ".join(f"-bm25.{k} {list2str(config[k], ' ')}" for k in ["k1", "b"])
            + f" -hits {hits}"
        )
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path

class BM25RM3(AnseriniSearcherMixIn, Searcher):
    """
    Anserini BM25 with RM3 expansion.

    This searcher's parameters can also be specified as lists indicating parameters to grid search
    (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``).
    """

    module_name = "BM25RM3"

    config_spec = [
        ConfigOption("k1", "0.9", "controls term saturation", value_type="floatlist"),
        ConfigOption("b", "0.4", "controls document length normalization", value_type="floatlist"),
        ConfigOption("fbTerms", [5, 25], "number of generated terms from feedback", value_type="intlist"),
        ConfigOption("fbDocs", [5, 10], "number of documents used for feedback", value_type="intlist"),
        ConfigOption("originalQueryWeight", [0.5], "the weight of the unexpanded query", value_type="floatlist"),
        ConfigOption("hits", 1000, "number of results to return"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        hits = str(config["hits"])

        anserini_param_str = (
            "-rm3 "
            + " ".join(f"-rm3.{k} {list2str(config[k], ' ')}" for k in ["fbTerms", "fbDocs", "originalQueryWeight"])
            + " -bm25 "
            + " ".join(f"-bm25.{k} {list2str(config[k], ' ')}" for k in ["k1", "b"])
            + f" -hits {hits}"
        )
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path)

        return output_path

class AnseriniTokenizer(Tokenizer):
    module_name = "anserini"
    config_spec = [
        ConfigOption("keepstops", True, "keep stopwords if True"),
        ConfigOption("stemmer", "none", "stemmer: porter, krovetz, or none"),
    ]

    def build(self):
        self._tokenize = self._get_tokenize_fn()

    def _get_tokenize_fn(self):
        from jnius import autoclass

        stemmer, keepstops = self.config["stemmer"], self.config["keepstops"]
        if stemmer is None:
            stemmer = "none"

        emptyjchar = autoclass("org.apache.lucene.analysis.CharArraySet").EMPTY_SET
        Analyzer = autoclass("io.anserini.analysis.DefaultEnglishAnalyzer")
        analyzer = Analyzer.newStemmingInstance(stemmer, emptyjchar) if keepstops else Analyzer.newStemmingInstance(stemmer)
        tokenizefn = autoclass("io.anserini.analysis.AnalyzerUtils").analyze

        def _tokenize(sentence):
            return tokenizefn(analyzer, sentence).toArray()

        return _tokenize

    def tokenize(self, sentences):
        if not sentences:  # either "" or []
            return []

        if isinstance(sentences, str):
            return self._tokenize(sentences)

        return [self._tokenize(s) for s in sentences]

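# A minimal usage sketch, assuming `tokenizer` is an already-built AnseriniTokenizer
# instance (construction goes through the framework's module system and is omitted here).
# With keepstops=False and stemmer="porter", calls would look roughly like:
#
#   tokenizer.tokenize("The Quick Brown Foxes")    # -> ["quick", "brown", "fox"] (illustrative output)
#   tokenizer.tokenize(["first query", "second"])  # -> one token list per input string
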
class F2Log(Searcher, AnseriniSearcherMixIn):
    """
    F2Log scoring model. This searcher does not support list parameters.
    """

    module_name = "F2Log"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption("s", 0.5),  # array input for this parameter is not supported by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        anserini_param_str = "-f2log -f2log.s {0} -hits {1}".format(config["s"], config["hits"])
        self._anserini_query_from_file(topicsfn, anserini_param_str, output_path, config["fields"])

        return output_path

class TutorialTask(Task):
    module_name = "tutorial"
    config_spec = [ConfigOption("optimize", "map", "metric to maximize on the validation set")]
    dependencies = [
        Dependency(key="benchmark", module="benchmark", name="nf", provide_this=True, provide_children=["collection"]),
        Dependency(key="searcher1", module="searcher", name="BM25RM3"),
        Dependency(key="searcher2", module="searcher", name="SDM"),
    ]

    commands = ["run"] + Task.help_commands
    default_command = "run"

    def run(self):
        output_dir = self.get_results_path()

        # read the title queries from the chosen benchmark's topic file
        results1 = self.searcher1.query_from_file(self.benchmark.get_topics_file(), output_dir / "searcher1")
        results2 = self.searcher2.query_from_file(self.benchmark.get_topics_file(), output_dir / "searcher2")
        searcher_results = [results1, results2]

        # using the benchmark's folds, which each contain train/validation/test queries,
        # choose the best run in `output_dir` for the fold based on the validation queries
        # and return metrics calculated on the test queries
        best_results = evaluator.search_best_run(
            searcher_results, self.benchmark, primary_metric=self.config["optimize"], metrics=evaluator.DEFAULT_METRICS
        )

        for fold, path in best_results["path"].items():
            shortpath = "..." + path[-40:]
            logger.info("fold=%s best run: %s", fold, shortpath)

        logger.info("cross-validated results when optimizing for '%s':", self.config["optimize"])
        for metric, score in sorted(best_results["score"].items()):
            logger.info("%15s: %0.4f", metric, score)

        return best_results