def test_parse(self):
    """Check that ``Identifier.parse`` round-trips a textual identifier."""
    raw = "Spacy(lang=en_core_web_sm, neuralcoref=False, columns=['text'])"
    parsed = Identifier.parse(raw)

    # Rendering the parsed identifier must reproduce the input verbatim.
    self.assertEqual(str(parsed), raw)

    # Every keyword in the textual form must show up as a parameter name.
    expected_names = {"lang", "neuralcoref", "columns"}
    self.assertEqual(set(parsed.parameters.keys()), expected_names)
예제 #2
0
    def retrieve(
        self_or_cls,
        batch: Batch,
        columns: Union[List[str], List[List[str]]],
        proc_fns: Union[str, Callable, List[Union[str, Callable]]] = None,
        identifier: Union[str, Identifier] = None,
        reapply: bool = False,
        **kwargs,
    ) -> Optional[Union[Dict[tuple, List], List[List], Batch, List[Batch]]]:
        """Retrieve information from the cache, or recompute it on the fly.

        Args:
            batch: a batch of data; cached values are looked up in it by key
            columns: what to retrieve cached information for. Either a single
                column name, a list of column names (treated as ONE group), or
                a list of lists of column names (one lookup per inner list)
            proc_fns: a callable, the name of an attribute of `self_or_cls`,
                or a list/tuple mixing both. Applied to the decoded values
                after retrieval; ignored when `reapply=True`
            identifier: identifier of the cached operation to retrieve. A
                string is parsed with `Identifier.parse`. When omitted, the
                cache key that starts with the class name (when called on the
                class) or with `str(self.identifier)` (when called on an
                instance) and has the shortest remainder is used
            reapply: whether to recompute the cached operation at retrieval
            **kwargs: forwarded to the constructor when `reapply=True` and
                this is called on the class rather than an instance

        Returns:
            With `reapply=False` and no `proc_fns`:
              * a single column (a str, or a one-element list of str): list
                of decoded values
              * a list of several column names: dict with the single key
                `tuple(columns)` mapping to the list of decoded values
              * a list of lists: dict with one entry per inner list, keyed by
                the column name for one-element lists and by a tuple otherwise
            With a single `proc_fns`, the same structure with the function
            applied to each list of decoded values. With a list of `proc_fns`,
            a list holding one such result per function.
            With `reapply=True`, the same three shapes, holding whatever
            `apply` returns instead of decoded cache values.

        Raises:
            ValueError: no identifier was given and no cache key matches.
            KeyError: the cache has no entry for the requested columns.
        """
        if reapply:
            if proc_fns:
                print("Warning: proc_fns has no effect when reapply=True.")

            # Run the operation on the fly
            # TODO(karan): does this work for ops that require process_dataset
            single_column = isinstance(columns, str) or (
                isinstance(columns[0], str) and len(columns) == 1
            )

            # Instantiate at most once, even when several column groups are
            # requested, instead of building a fresh operation per group.
            operation = (
                self_or_cls(**kwargs)
                if isinstance(self_or_cls, type)
                else self_or_cls
            )

            if single_column:
                if isinstance(columns, str):
                    columns = [columns]
                return operation.apply(batch=batch, columns=columns)

            if isinstance(columns[0], str):
                return {
                    tuple(columns): operation.apply(batch=batch, columns=columns)
                }

            return {
                (tuple(cols_) if len(cols_) > 1 else cols_[0]): operation.apply(
                    batch=batch, columns=cols_
                )
                for cols_ in columns
            }

        if not identifier:
            # Infer the most relevant key to retrieve
            if isinstance(self_or_cls, type):
                # cls
                target_ident_key = self_or_cls.__name__
            else:
                # self
                target_ident_key = str(self_or_cls.identifier)

            # Distinct cached operations, ignoring which columns they were
            # applied to. A dict keeps insertion order, so ties between
            # equally good matches go to the key seen first in the batch.
            candidates = dict.fromkeys(
                Identifier.parse(ident_key).without("columns")
                for ident_key in batch.keys()
            )

            # Pick the key that best matches the cls name or instance
            # identifier: it must start with the target, and the shorter the
            # remainder after that prefix the better.
            best_match, best_distance = None, float("inf")
            for ident in candidates:
                ident_key = str(ident)
                if not ident_key.startswith(target_ident_key):
                    continue
                distance = len(ident_key) - len(target_ident_key)
                if distance < best_distance:
                    best_match, best_distance = ident, distance

            # Still no identifier
            if best_match is None:
                raise ValueError(
                    f"Retrieval failed: couldn't find a key called "
                    f"{target_ident_key} in cache.")

            identifier = best_match
        elif isinstance(identifier, str):
            # The lookups below call `identifier(columns=...)`, which a plain
            # string cannot do; parse it the same way cache keys are parsed.
            identifier = Identifier.parse(identifier)

        def _lookup(cols):
            """Decode every cached value stored for `cols`."""
            return [
                self_or_cls.decode(val)
                for val in batch[identifier(columns=cols)]
            ]

        try:
            if isinstance(columns, str) or (
                isinstance(columns[0], str) and len(columns) == 1
            ):
                if isinstance(columns, str):
                    columns = [columns]
                # Retrieving single piece of information for a single column
                retrieval = _lookup(columns)
            elif isinstance(columns[0], str):
                # Retrieving single piece of information for `columns` list
                retrieval = {tuple(columns): _lookup(columns)}
            else:
                # Retrieving multiple pieces of information
                retrieval = {
                    (tuple(cols_) if len(cols_) > 1 else cols_[0]): _lookup(cols_)
                    for cols_ in columns
                }
        except KeyError as err:
            # Chain the original error so the missing key stays visible.
            raise KeyError(
                "Could not retrieve information for all columns. "
                "If you're trying to retrieve information for multiple columns, "
                "use columns=[[col_1], [col_2], ..] "
                "instead of columns=[col_1, col_2, ..].") from err

        # Check if the retrieved information needs to be processed
        if not proc_fns:
            return retrieval

        # Resolve the str proc_fns to callable(s)
        if isinstance(proc_fns, str):
            proc_fns = getattr(self_or_cls, proc_fns)
        elif isinstance(proc_fns, (list, tuple)):
            proc_fns = [
                proc_fn if callable(proc_fn) else getattr(self_or_cls, proc_fn)
                for proc_fn in proc_fns
            ]

        # Process and return the retrieved information
        if callable(proc_fns):
            if isinstance(retrieval, list):
                return proc_fns(retrieval)
            return {k: proc_fns(v) for k, v in retrieval.items()}

        if isinstance(retrieval, list):
            # NOTE(review): with several proc_fns and a flat retrieval, each
            # function is applied per element, whereas every other path
            # applies it to the whole list of values. Kept as-is because
            # callers may rely on it -- confirm whether this is intended.
            return [[proc_fn(v) for v in retrieval] for proc_fn in proc_fns]

        return [
            {k: proc_fn(v) for k, v in retrieval.items()}
            for proc_fn in proc_fns
        ]