def test_parse(self):
    """Check that parsing an identifier string round-trips and exposes its parameters."""
    raw = "Spacy(lang=en_core_web_sm, neuralcoref=False, columns=['text'])"
    parsed = Identifier.parse(raw)

    # Converting the parsed identifier back to a string must reproduce the input.
    self.assertEqual(str(parsed), raw)

    # Every keyword in the parenthesised section becomes a parameter name.
    parameter_names = set(parsed.parameters.keys())
    self.assertEqual(parameter_names, {"lang", "neuralcoref", "columns"})
def retrieve(
    self_or_cls,
    batch: Batch,
    columns: Union[List[str], List[List[str]]],
    proc_fns: Union[str, Callable, List[Union[str, Callable]]] = None,
    identifier: Union[str, Identifier] = None,
    reapply: bool = False,
    **kwargs,
) -> Optional[Union[Dict[tuple, List], List[List], Batch, List[Batch]]]:
    """Retrieve information from the cache, or recompute it on the fly.

    Args:
        batch: a batch of data. On the cache path its keys are parsed as
            identifiers and its values are decoded with
            ``self_or_cls.decode``.
        columns: a single column name, a list of column names (treated as
            one group of columns), or a list of column groups.
        proc_fns: a callable, the name of an attribute of ``self_or_cls``,
            or a list of either, applied to the retrieved data. Has no
            effect when ``reapply=True`` (a warning is printed).
        identifier: identifier to retrieve. A string is parsed with
            ``Identifier.parse``. When not given, it is inferred from the
            keys of ``batch``: among keys (ignoring their ``columns``
            parameter) that start with the class name (when called on the
            class) or with ``str(self.identifier)`` (when called on an
            instance), the one with the shortest remainder is used.
        reapply: whether to recompute the operation via ``apply`` instead
            of reading from the cache.
        **kwargs: used to construct the operation when this is called on
            the class with ``reapply=True``; otherwise unused.

    Returns:
        For a single column (a string or a one-element list of strings),
        the list of decoded values (or the result of ``apply`` when
        ``reapply=True``). For one group of several columns, a dict with a
        single entry keyed by ``tuple(columns)``. For a list of groups, a
        dict keyed by ``tuple(group)``, or by the column name for
        one-element groups. When ``proc_fns`` resolves to a single
        callable, the same shape with the callable applied; otherwise a
        list with one result per processing function.

    Raises:
        ValueError: if no identifier is given and no key in ``batch``
            matches the class name / instance identifier.
        KeyError: if the cache lookup fails for any requested columns.
    """

    def _group_key(cols_):
        # One-element groups are keyed by the bare column name.
        return tuple(cols_) if len(cols_) > 1 else cols_[0]

    def _collect(fetch):
        # Apply `fetch` according to the shape of `columns`; shared by the
        # cache path and the reapply path.
        if isinstance(columns, str):
            return fetch([columns])
        if isinstance(columns[0], str):
            if len(columns) == 1:
                # Single piece of information for a single column
                return fetch(columns)
            # Single piece of information for one group of columns
            return {tuple(columns): fetch(columns)}
        # Multiple pieces of information, one per group of columns
        return {_group_key(cols_): fetch(cols_) for cols_ in columns}

    def _recompute(cols_):
        # kwargs are only used when an operation has to be constructed,
        # i.e. when `retrieve` was called on the class.
        op = self_or_cls(**kwargs) if isinstance(self_or_cls, type) else self_or_cls
        return op.apply(batch=batch, columns=cols_)

    def _from_cache(cols_):
        return [
            self_or_cls.decode(val) for val in batch[identifier(columns=cols_)]
        ]

    if reapply:
        if proc_fns:
            print("Warning: proc_fns has no effect when reapply=True.")
        # Run the operation on the fly
        # TODO(karan): does this work for ops that require process_dataset
        return _collect(_recompute)

    if not identifier:
        # Infer the most relevant key to retrieve
        if isinstance(self_or_cls, type):
            target_ident_key = self_or_cls.__name__
        else:
            target_ident_key = str(self_or_cls.identifier)

        # Distinct identifiers in the batch with `columns` stripped,
        # in first-seen order.
        candidates = dict.fromkeys(
            Identifier.parse(ident_key).without("columns")
            for ident_key in batch.keys()
        )

        def _distance(ident):
            return len(str(ident).replace(target_ident_key, ""))

        # Pick the key that best matches the cls name or instance
        # identifier; `min` keeps the first candidate on ties.
        identifier = min(
            (
                ident
                for ident in candidates
                if str(ident).startswith(target_ident_key)
            ),
            key=_distance,
            default=None,
        )

        # Still no identifier
        if not identifier:
            raise ValueError(
                f"Retrieval failed: couldn't find a key called "
                f"{target_ident_key} in cache.")
    elif isinstance(identifier, str):
        # The identifier is called below to attach `columns`, which a plain
        # string cannot do; parse it into an Identifier first.
        identifier = Identifier.parse(identifier)

    try:
        retrieval = _collect(_from_cache)
    except KeyError as err:
        # Chain the original error so the actual missing key stays visible.
        raise KeyError(
            "Could not retrieve information for all columns. "
            "If you're trying to retrieve information for multiple columns, "
            "use columns=[[col_1], [col_2], ..] "
            "instead of columns=[col_1, col_2, ..].") from err

    # Check if the retrieved information needs to be processed
    if not proc_fns:
        return retrieval

    # Resolve the str proc_fns to callable(s)
    if isinstance(proc_fns, str):
        proc_fns = getattr(self_or_cls, proc_fns)
    elif isinstance(proc_fns, list):
        proc_fns = [
            proc_fn if callable(proc_fn) else getattr(self_or_cls, proc_fn)
            for proc_fn in proc_fns
        ]

    # Process and return the retrieved information
    if callable(proc_fns):
        if isinstance(retrieval, list):
            return proc_fns(retrieval)
        return {k: proc_fns(v) for k, v in retrieval.items()}

    # NOTE(review): with several proc_fns and a list retrieval, each function
    # is applied per element, whereas a single callable receives the whole
    # list -- confirm this asymmetry is intended.
    if isinstance(retrieval, list):
        return [[proc_fn(v) for v in retrieval] for proc_fn in proc_fns]
    return [
        {k: proc_fn(v) for k, v in retrieval.items()} for proc_fn in proc_fns
    ]