Пример #1
0
def estime(ba, memoire: Chest):
    """Estimate maximal parameters over the learning base (BA).

    Computes a tf-idf-like raw frequency for every powerset component of
    every element of *ba*, memoised in *memoire*, then normalises the
    accumulated counts.

    :param ba: iterable of sequences (the learning base).
    :param memoire: Chest-backed mapping used to memoise frequencies;
        updated in place.
    :return: None.
    """
    from functools import partial

    # Total number of powerset components over the whole base — the
    # normalisation denominator (was computed but unused before).
    len_ba = sum(pow(2, len(x)) for x in ba)
    with Pool(processes=5) as p:
        for element in ba:
            # A lambda cannot be pickled by multiprocessing.Pool.map;
            # functools.partial(powerset, element) is picklable and
            # calls powerset(element, i) exactly as the lambda did.
            for subset_group in p.map(partial(powerset, element),
                                      range(len(element))):
                for subset in subset_group:
                    # Key on the individual subset: the original keyed on
                    # the whole group while computing the value from the
                    # subset, skipping/overwriting most entries.
                    if not memoire.get(subset):
                        memoire[subset] = frequence_brute(subset, element, True)
    # Normalise by the total powerset size; dividing by len(memoire)
    # (count of distinct keys) is not a frequency. TODO confirm against
    # the intended tf-idf definition.
    for key in memoire:
        memoire[key] /= len_ba
Пример #2
0
def generate_regex(seq: OptimString, memo: Chest = None):
    """Yield candidate patterns derived from *seq*, breadth-first.

    Each dequeued state that is not all-wildcards and not already known
    in *memo* is yielded; states are then expanded by adding a '.' point
    and shifting it across the remaining positions.

    :param seq: initial OptimString to expand.
    :param memo: optional Chest of already-seen patterns to skip.
    :return: generator of OptimString states.
    """
    queue = deque()
    queue.appendleft(seq)

    while queue:
        current = queue.pop()
        if not all(ch == '.' for ch in str(current)):
            # Guard the default: the original dereferenced memo
            # unconditionally, raising AttributeError when memo is None.
            if memo is None or not memo.get(current):
                yield current
        if not isinstance(current.data_pointe[-1], Point):
            current = current.add_point()
            queue.appendleft(current)
            # Shift the newly added point over every remaining position,
            # enqueuing each intermediate state.
            for _ in range(
                    current.control.get(current.get_point)[-1] + 1,
                    len(current.data)):
                current = current.deplace_point()
                queue.appendleft(current)
Пример #3
0
class ChestCacheTransformer(TransformerBase):
    """Cache the results of the constituent (inner) transformer on disk.

    Results are cached per qid, so any change in query formulation will
    not be reflected in cached results. Caching is keyed on the md5 of
    the pipeline's repr(), so the cache is unique per pipeline config.
    """

    def __init__(self, inner, **kwargs):
        super().__init__(**kwargs)
        self.inner = inner
        self.disable = False
        if CACHE_DIR is None:
            init()

        # we take the md5 of the __repr__ of the pipeline to make a unique
        # identifier for the pipeline; all different pipelines should return
        # unique __repr__() values, as these are intended to be unambiguous
        trepr = repr(self.inner)
        if "object at 0x" in trepr:
            # A default object repr means the identifier is not stable
            # across runs, so caching would be meaningless.
            warn(
                "Cannot cache pipeline %s as a component has not overridden __repr__"
                % trepr)
            self.disable = True

        uid = hashlib.md5(bytes(trepr, "utf-8")).hexdigest()
        destdir = path.join(CACHE_DIR, uid)
        os.makedirs(destdir, exist_ok=True)
        definition_file = path.join(destdir, DEFINITION_FILE)
        if not path.exists(definition_file):
            # Record the pipeline definition alongside the cache, for
            # human inspection of what the directory caches.
            with open(definition_file, "w") as f:
                f.write(trepr)
        # DataFrames are stored via pandas' pickle; everything else
        # (e.g. Chest's .keys index) via plain pickle.
        self.chest = Chest(
            path=destdir,
            dump=lambda data, filename: pd.DataFrame.to_pickle(data, filename)
            if isinstance(data, pd.DataFrame) else pickle.dump(
                data, filename, protocol=1),
            load=lambda filehandle: pickle.load(filehandle)
            if ".keys" in filehandle.name else pd.read_pickle(filehandle))
        self.hits = 0
        self.requests = 0

    def stats(self):
        """Return the cache hit rate, or 0 when nothing was requested."""
        return self.hits / self.requests if self.requests > 0 else 0

    # dont double cache - we cannot cache ourselves
    def __invert__(self):
        return self

    def __repr__(self):
        return "Cache(" + self.inner.__repr__() + ")"

    def __str__(self):
        return "Cache(" + str(self.inner) + ")"

    @property
    def NOCACHE(self):
        """Escape hatch: the uncached inner transformer."""
        return self.inner

    def transform(self, input_res):
        """Transform *input_res*, serving per-qid cached results.

        :raises ValueError: if the input contains documents (re-ranking
            inputs cannot be cached).
        """
        if self.disable:
            return self.inner.transform(input_res)
        if "docid" in input_res.columns or "docno" in input_res.columns:
            raise ValueError(
                "Caching currently only supports input dataframes with queries as inputs and cannot be used for re-rankers"
            )
        return self._transform_qid(input_res)

    def _transform_qid(self, input_res):
        """Serve each qid from the cache; batch the misses to the inner
        transformer, store their results, and return everything merged.
        """
        rtr = []
        todo = []

        for index, row in input_res.iterrows():
            qid = row["qid"]
            self.requests += 1
            try:
                df = self.chest.get(qid, None)
            except Exception:
                # occasionally we have file not founds —
                # drop the stale entry (if still present) and recompute
                try:
                    del self.chest[qid]
                except KeyError:
                    pass
                df = None
            if df is None:
                todo.append(row.to_frame().T)
            else:
                self.hits += 1
                rtr.append(df)
        if len(todo) > 0:
            # fixed typo: was `tood_df`
            todo_df = pd.concat(todo)
            todo_res = self.inner.transform(todo_df)
            for indx, row in todo_df.iterrows():
                qid = row["qid"]
                this_query_res = todo_res[todo_res["qid"] == qid]
                self.chest[qid] = this_query_res
                rtr.append(this_query_res)
        self.chest.flush()
        return pd.concat(rtr)
Пример #4
0
class ChestCacheTransformer(TransformerBase):
    """
        A transformer that caches the results of the constituent (inner) transformer.
        This is instantiated using the `~` operator on any transformer.

        Caching is unique based on the configuration of the pipeline, as read by executing
        retr() on the pipeline. Caching lookup is based on the qid, so any change in query
        _formulation_ will not be reflected in a cache's results.

        Example Usage::

            dataset = pt.get_dataset("trec-robust-2004")
            # use for first pass and 2nd pass
            BM25 = pt.BatchRetrieve(index, wmodel="BM25")

            # used for query expansion
            RM3 = pt.rewrite.RM3(index)
            pt.Experiment([
                    ~BM25,
                    (~BM25) >> RM3 >> BM25
                ],
                dataset.get_topics(),
                dataset.get_qrels(),
                eval_metrics=["map"]
            )

        In the above example, we use the `~` operator on the first pass retrieval using BM25, but not on the 2nd pass retrieval,
        as the query formulation will differ during the second pass.

        Caching is not supported for re-ranking transformers.
    """
    def __init__(self, inner, **kwargs):
        super().__init__(**kwargs)
        self.inner = inner
        self.disable = False
        if CACHE_DIR is None:
            init()

        # we take the md5 of the __repr__ of the pipeline to make a unique
        # identifier for the pipeline; all different pipelines should return
        # unique __repr__() values, as these are intended to be unambiguous
        trepr = repr(self.inner)
        if "object at 0x" in trepr:
            # A default object repr is not stable across runs, so the
            # cache identifier would be meaningless — disable caching.
            warn(
                "Cannot cache pipeline %s as a component has not overridden __repr__"
                % trepr)
            self.disable = True

        uid = hashlib.md5(bytes(trepr, "utf-8")).hexdigest()
        destdir = path.join(CACHE_DIR, uid)
        os.makedirs(destdir, exist_ok=True)
        definition_file = path.join(destdir, DEFINITION_FILE)
        if not path.exists(definition_file):
            # Record the pipeline definition alongside the cache, for
            # human inspection of what the directory caches.
            with open(definition_file, "w") as f:
                f.write(trepr)
        from chest import Chest
        # DataFrames are stored via pandas' pickle; everything else
        # (e.g. Chest's .keys index) via plain pickle.
        self.chest = Chest(
            path=destdir,
            dump=lambda data, filename: pd.DataFrame.to_pickle(data, filename)
            if isinstance(data, pd.DataFrame) else pickle.dump(
                data, filename, protocol=1),
            load=lambda filehandle: pickle.load(filehandle)
            if ".keys" in filehandle.name else pd.read_pickle(filehandle))
        self.hits = 0
        self.requests = 0

    def stats(self):
        """Return the cache hit rate, or 0 when nothing was requested."""
        return self.hits / self.requests if self.requests > 0 else 0

    # dont double cache - we cannot cache ourselves
    def __invert__(self):
        return self

    def __repr__(self):
        return "Cache(" + self.inner.__repr__() + ")"

    def __str__(self):
        return "Cache(" + str(self.inner) + ")"

    @property
    def NOCACHE(self):
        """Escape hatch: the uncached inner transformer."""
        return self.inner

    def transform(self, input_res):
        """Transform *input_res*, serving per-qid cached results.

        :raises ValueError: if the input contains documents (re-ranking
            inputs cannot be cached).
        """
        if self.disable:
            return self.inner.transform(input_res)
        if "docid" in input_res.columns or "docno" in input_res.columns:
            raise ValueError(
                "Caching of %s for re-ranking is not supported. Caching currently only supports input dataframes with queries as inputs and cannot be used for re-rankers."
                % self.inner.__repr__())
        return self._transform_qid(input_res)

    def _transform_qid(self, input_res):
        """Serve each qid from the cache; batch the misses to the inner
        transformer, store their results, and return everything merged.
        """
        rtr = []
        todo = []

        # We cannot remove this iterrows() without knowing how to take named tuples into a dataframe
        for index, row in input_res.iterrows():
            qid = str(row["qid"])
            self.requests += 1
            try:
                df = self.chest.get(qid, None)
            except Exception:
                # occasionally we have file not founds —
                # drop the stale entry (if still present) and recompute
                try:
                    del self.chest[qid]
                except KeyError:
                    pass
                df = None
            if df is None:
                todo.append(row.to_frame().T)
            else:
                self.hits += 1
                rtr.append(df)
        if len(todo) > 0:
            todo_df = pd.concat(todo)
            todo_res = self.inner.transform(todo_df)
            for row in todo_df.itertuples():
                qid = row.qid
                this_query_res = todo_res[todo_res["qid"] == qid]
                # Store under str(qid) to match the str() lookup above —
                # storing the raw qid made non-string qids miss forever.
                self.chest[str(qid)] = this_query_res
                rtr.append(this_query_res)
        self.chest.flush()
        return pd.concat(rtr)