Пример #1
0
    # Identifier for this component; presumably the key it is registered and
    # looked up under — TODO confirm against the registration code.
    name = "anserini"

    def __init__(self, index, stemmer="none", keepstops=False, use_cache=True):
        """Record the analyzer settings and build the underlying analyzer.

        Args:
            index: Forwarded unchanged to the parent class constructor.
            stemmer: Stemmer name handed to the Anserini analyzer.
            keepstops: When true, ``create`` builds the analyzer with an
                empty ``CharArraySet`` as its second argument.
            use_cache: When true, ``initialize_cache`` is called with the
                parameter dict before the analyzer is created.
        """
        super().__init__(index)

        settings = dict(stemmer=stemmer, keepstops=keepstops)
        self.params = settings
        if use_cache:
            self.initialize_cache(settings)

        self.create()

    def create(self):
        """Instantiate the Anserini analyzer and bind the tokenize helper.

        Reads ``stemmer`` and ``keepstops`` from ``self.params``, then sets
        ``self.analyzer`` (an ``EnglishStemmingAnalyzer``) and
        ``self._tokenize`` (``AnalyzerUtils.tokenize``).
        """
        stemmer = self.params["stemmer"]
        keepstops = self.params["keepstops"]

        # With keepstops set, Lucene's empty CharArraySet is passed as the
        # second constructor argument; otherwise only the stemmer is passed.
        ctor_args = (stemmer,)
        if keepstops:
            ctor_args += (autoclass("org.apache.lucene.analysis.CharArraySet").EMPTY_SET,)

        analyzer_cls = autoclass("io.anserini.analysis.EnglishStemmingAnalyzer")
        self.analyzer = analyzer_cls(*ctor_args)

        self._tokenize = autoclass("io.anserini.analysis.AnalyzerUtils").tokenize

    def tokenize(self, s):
        """Tokenize ``s`` with the configured analyzer.

        Args:
            s: Text to tokenize.

        Returns:
            An empty list when ``s`` is falsy; otherwise the result of
            calling ``toArray()`` on what ``self._tokenize`` returns.
        """
        if s:
            tokens = self._tokenize(self.analyzer, s)
            return tokens.toArray()
        return []


# NOTE(review): presumably imports the "tokenizer" component modules so their
# classes get registered — confirm against import_component_modules.
import_component_modules("tokenizer")
Пример #2
0
        for name, cls in Index.ALL.items():
            if name in index_path:
                return cls

        return None

    @staticmethod
    def config():
        """Placeholder that always raises; subclasses must supply ``config``.

        Raises:
            NotImplementedError: Always.
        """
        message = "config method must be provided by subclass"
        raise NotImplementedError(message)

    @classmethod
    def register(cls, subcls):
        """Delegate to ``register_component_module(cls, subcls)``.

        Args:
            subcls: Passed as the second argument to the helper; presumably
                the subclass being registered — TODO confirm.

        Returns:
            Whatever ``register_component_module`` returns.
        """
        return register_component_module(cls, subcls)

    def exists(self):
        """Report whether the ``done`` marker is present under ``index_path``.

        Returns:
            True when the path ``<self.index_path>/done`` exists, else False.
        """
        done_marker = os.path.join(self.index_path, "done")
        return os.path.exists(done_marker)

    def _build_index(self, config):
        """Placeholder that always raises ``NotImplementedError``.

        ``create`` calls this with its ``config`` argument when the index
        does not exist yet; presumably subclasses override it to do the
        actual build — TODO confirm.
        """
        raise NotImplementedError

    def create(self, config):
        """Build the index once and drop a ``done`` marker afterwards.

        Does nothing when ``self.exists()`` is already true. Otherwise calls
        ``self._build_index(config)`` and then writes the line ``done`` to
        ``<self.index_path>/done``.

        Args:
            config: Passed unchanged to ``_build_index``.
        """
        if not self.exists():
            self._build_index(config)

            # The marker is written only after the build call returns, so an
            # exception raised by the build leaves no marker behind.
            marker_path = os.path.join(self.index_path, "done")
            with open(marker_path, "wt") as marker:
                print("done", file=marker)


# NOTE(review): presumably imports the "index" component modules so their
# classes get registered — confirm against import_component_modules.
import_component_modules("index")
Пример #3
0
        expected_metrics = {m for m in valid_metrics if not m.endswith("_cut") and m != "P"} | {
            m + "_" + str(cutoff) for cutoff in cut_points for m in valid_metrics if m.endswith("_cut") or m == "P"
        }

        if metric in ["ndcg", "ndcg_cut"]:
            mkey = "ndcg_cut_20"
        elif metric in expected_metrics:
            mkey = metric
        else:
            raise RuntimeError("requested metric %s is not one of the supported metrics: %s" % (metric, sorted(expected_metrics)))
        avg_metric = lambda run_metrics: np.mean([qid[mkey] for qid in run_metrics.values()])

        assert len(set(dev["qrels"].keys()).intersection(test["qrels"].keys())) == 0
        dev_eval = pytrec_eval.RelevanceEvaluator(dev["qrels"], valid_metrics)
        best_metric, best_alpha = -np.inf, None
        for alpha in np.arange(0, 1.001, 0.05):
            run_metrics = dev_eval.evaluate(
                Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), alpha)
            )
            mavgp = avg_metric(run_metrics)
            if mavgp > best_metric:
                best_metric = mavgp
                best_alpha = alpha

        test_run = Searcher.interpolate_runs(test["reranker"], test["searcher"], test["qrels"].keys(), best_alpha)
        dev_run = Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), best_alpha)
        return (best_alpha, test_run, dev_run)


# NOTE(review): presumably imports the "searcher" component modules so their
# classes get registered — confirm against import_component_modules.
import_component_modules("searcher")
Пример #4
0
                    posdocid = random.choice(reldocs[qid])
                    negdocid = random.choice(negdocs[qid])

                    features = self.get_features({"qid": qid, "posdocid": posdocid, "negdocid": negdocid})
                    if features is None:
                        logger.warning("got none features: qid=%s posid=%s negid=%s", qid, posdocid, negdocid)
                        continue

                    for k, v in features.items():
                        batch[k].append(v)

                    if len(batch["qid"]) == self.pipeline_config["batch"]:
                        yield self.prepare_batch(batch)
                        batch = defaultdict(list)

        return genf()

    def prepare_batch(self, batch):
        """Return ``batch`` unchanged.

        Presumably a hook that subclasses override to post-process a batch
        before it is yielded — TODO confirm.
        """
        return batch

    @staticmethod
    def config():
        """Placeholder that always raises; subclasses must supply ``config``.

        Raises:
            NotImplementedError: Always.
        """
        message = "config method must be provided by subclass"
        raise NotImplementedError(message)

    @classmethod
    def register(cls, subcls):
        """Delegate to ``register_component_module(cls, subcls)``.

        Args:
            subcls: Passed as the second argument to the helper; presumably
                the subclass being registered — TODO confirm.

        Returns:
            Whatever ``register_component_module`` returns.
        """
        return register_component_module(cls, subcls)


# NOTE(review): presumably imports the "benchmark" component modules so their
# classes get registered — confirm against import_component_modules.
import_component_modules("benchmark")