def __init__(
    self,
    from_cache: bool = False,
    cache_directory: Optional[Path] = None,
    append_to_cache: bool = False,
):
    """
    Args:
        from_cache (bool, optional): Decide whether to read from cache
            if a cache file exists. By default (``False``), the reader
            will only read from the original file and use the cache file
            path only for output. If ``True``, the reader will try to
            read a datapack from the first line of the caching file.
        cache_directory (Path, optional): The path of the caching file.
            If :attr:`cache_file_path` is ``None`` and
            :attr:`self._cache_directory` is not ``None``, use the result
            of :meth:`_get_cache_location_for_file_path`. If both
            :attr:`cache_file_path` and :attr:`self._cache_directory`
            are ``None``, will not read from or write to a caching file.
        append_to_cache (bool, optional): Decide whether to append write
            if the cache file already exists. By default (``False``), we
            will overwrite the existing caching file. If ``True``, we
            will append the datapack to the end of the caching file.
    """
    self.from_cache = from_cache
    self._cache_directory = cache_directory
    self.component_name = get_full_module_name(self)
    self.append_to_cache = append_to_cache
def __init__(
    self,
    from_cache: bool = False,
    cache_directory: Optional[Path] = None,
    append_to_cache: bool = False,
):
    """
    Args:
        from_cache (bool, optional): Decide whether to read from cache
            if a cache file exists. By default (``False``), the reader
            will only read from the original file and use the cache file
            path only for output. If ``True``, the reader will try to
            read a datapack from the first line of the caching file.
        cache_directory (Path, optional): The base directory to place the
            caching files. Each collection is contained in one cached
            file under this directory. The cached location for each
            collection is computed by :meth:`_cache_key_function`. Note:
            A collection is the data returned by :meth:`_collect`.
        append_to_cache (bool, optional): Decide whether to append write
            if the cache file already exists. By default (``False``), we
            will overwrite the existing caching file. If ``True``, we
            will append the datapack to the end of the caching file.
    """
    self.from_cache = from_cache
    self._cache_directory = cache_directory
    self.component_name = get_full_module_name(self)
    self.append_to_cache = append_to_cache
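# Example (illustrative): a minimal, self-contained sketch of how the three
# caching flags above interact. This is not the library's reader:
# `CachedReaderSketch`, `_cache_path`, and `read` are hypothetical names, and
# the one-cache-file-per-input layout is an assumption made for the demo.
from pathlib import Path
from typing import Iterator, Optional


class CachedReaderSketch:
    def __init__(
        self,
        from_cache: bool = False,
        cache_directory: Optional[Path] = None,
        append_to_cache: bool = False,
    ):
        self.from_cache = from_cache
        self._cache_directory = cache_directory
        self.append_to_cache = append_to_cache

    def _cache_path(self, file_path: str) -> Optional[Path]:
        # One cache file per input file, placed under the cache directory.
        if self._cache_directory is None:
            return None
        return self._cache_directory / (Path(file_path).name + ".cache")

    def read(self, file_path: str) -> Iterator[str]:
        cache = self._cache_path(file_path)
        if self.from_cache and cache is not None and cache.exists():
            # Read the previously cached content (one record per line here).
            yield from cache.read_text().splitlines()
            return
        # Otherwise read the original file, and write the cache if configured:
        # overwrite by default, append when `append_to_cache` is True.
        content = Path(file_path).read_text()
        if cache is not None:
            mode = "a" if self.append_to_cache else "w"
            with cache.open(mode, encoding="utf-8") as f:
                f.write(content + "\n")
        yield content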
def set_state_func(instance, state):  # pylint: disable=protected-access
    """
    An internal function used during de-serialization. `instance` is an
    instance of Entry or a MultiEntry; this function populates their internal
    states from `state`.

    Args:
        instance: The Entry or MultiEntry object to populate.
        state: The state dictionary produced during serialization.
    """
    # During de-serialization, convert the list back to a numpy array.
    if "_embedding" in state:
        state["_embedding"] = np.array(state["_embedding"])
    else:
        state["_embedding"] = np.empty(0)

    # NOTE: the __pack will be set via set_pack from the Pack side.
    cls_name = get_full_module_name(instance)
    for k, v in state.items():
        key = cls_name + "_" + k
        # Cache, per (class, attribute), whether the attribute is an
        # FList/FDict that needs its parent re-attached.
        if _f_struct_keys.get(key, False):
            v._set_parent(instance)
        else:
            if isinstance(v, (FList, FDict)):
                v._set_parent(instance)
                _f_struct_keys[key] = True
            else:
                _f_struct_keys[key] = False

    instance.__dict__.update(state)
def get_state_func(instance):  # pylint: disable=protected-access
    r"""In serialization, the reference to the pack is not set, and it will be
    set by the container. This also implies that it is not advised to serialize
    an entry on its own: without the ``Container`` as the context, there is
    little semantics remaining in an entry and unexpected errors could occur.
    """
    state = instance.__dict__.copy()
    # During serialization, convert the numpy array to a list.
    emb = list(instance._embedding.tolist())
    if len(emb) == 0:
        state.pop("_embedding")
    else:
        state["_embedding"] = emb

    cls_name = get_full_module_name(instance)
    for k, v in state.items():
        key = cls_name + "_" + k
        # Cache, per (class, attribute), whether the attribute holds an
        # Entry reference that must be stored as a pointer.
        if key in _pointer_keys:
            if _pointer_keys[key]:
                state[k] = v.as_pointer(instance)
        else:
            if isinstance(v, Entry):
                state[k] = v.as_pointer(instance)
                _pointer_keys[key] = True
            else:
                _pointer_keys[key] = False

    state.pop("_Entry__pack")
    return state
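# Example (illustrative): `get_state_func` and `set_state_func` together
# follow the usual __getstate__/__setstate__ pattern of storing the numpy
# `_embedding` as a plain list and rebuilding the array on load. The sketch
# below shows that pattern in isolation; `EmbeddedItem` is a hypothetical
# class, not part of the library.
import pickle

import numpy as np


class EmbeddedItem:
    def __init__(self, embedding=None):
        self._embedding = (
            np.empty(0) if embedding is None else np.asarray(embedding)
        )

    def __getstate__(self):
        state = self.__dict__.copy()
        # Serialize the numpy array as a plain list; drop it when empty.
        emb = self._embedding.tolist()
        if len(emb) == 0:
            state.pop("_embedding")
        else:
            state["_embedding"] = emb
        return state

    def __setstate__(self, state):
        # Convert the list back into a numpy array (or an empty array).
        if "_embedding" in state:
            state["_embedding"] = np.array(state["_embedding"])
        else:
            state["_embedding"] = np.empty(0)
        self.__dict__.update(state)


item = EmbeddedItem([0.1, 0.2, 0.3])
restored = pickle.loads(pickle.dumps(item))
assert np.allclose(restored._embedding, [0.1, 0.2, 0.3])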
def relink_pointer(self):
    """
    This function is normally called after deserialization. It can be called
    when the pack reference of this entry is ready (i.e. after `set_pack`).
    The purpose is to convert the `Pointer` objects into actual entries.
    """
    cls_name = get_full_module_name(self)
    for k, v in self.__dict__.items():
        key = cls_name + "_" + k
        # Use the same per-(class, attribute) cache to decide which
        # attributes hold pointers that need to be resolved.
        if key in _pointer_keys:
            if _pointer_keys[key]:
                setattr(self, k, self._resolve_pointer(v))
        else:
            if isinstance(v, BasePointer):
                _pointer_keys[key] = True
                setattr(self, k, self._resolve_pointer(v))
            else:
                _pointer_keys[key] = False
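# Example (illustrative): `get_state_func` stores cross-entry references as
# pointers (`as_pointer`) and `relink_pointer` resolves them back into real
# entries once the pack is attached. The sketch below shows that indirection
# in isolation; `Item`, `to_state`, `from_state`, and `relink` are
# hypothetical names, not the library's API.
from dataclasses import dataclass
from typing import Dict, Optional, Union


@dataclass
class SketchPointer:
    # Id-based stand-in for a direct object reference.
    tid: int


class Item:
    def __init__(self, tid: int, parent: Optional["Item"] = None):
        self.tid = tid
        # Holds a real Item, or a SketchPointer right after deserialization.
        self.parent: Union["Item", SketchPointer, None] = parent

    def to_state(self) -> dict:
        # Serialize the cross-reference as a pointer, not the object itself.
        state = {"tid": self.tid, "parent": None}
        if isinstance(self.parent, Item):
            state["parent"] = SketchPointer(self.parent.tid)
        return state

    @classmethod
    def from_state(cls, state: dict) -> "Item":
        item = cls(state["tid"])
        item.parent = state["parent"]  # still a pointer at this point
        return item

    def relink(self, container: Dict[int, "Item"]) -> None:
        # Convert pointers back into actual items once the container
        # (the "pack") is available.
        if isinstance(self.parent, SketchPointer):
            self.parent = container[self.parent.tid]


# Round trip: serialize `b`, rebuild it, then relink against the container.
a, b = Item(1), Item(2)
b.parent = a
restored = Item.from_state(b.to_state())
container = {1: a, 2: restored}
restored.relink(container)
assert restored.parent is a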
@property
def name(self):
    return get_full_module_name(self)
def test_reuse_processor(self):
    # Create a basic pipeline of multi packs that contain two packs
    # (by copying).
    nlp = (
        Pipeline()
        .set_reader(SentenceReader())
        .add(MultiPackBoxer())
        .add(MultiPackCopier())
    )

    # Create one shared instance of this processor.
    dummy = DummyPackProcessor()
    nlp.add(
        dummy,
        config={"test": "dummy1"},
        selector=NameMatchSelector(),
        selector_config={"select_name": "default"},
    )

    # This will not add the component successfully because the processor is
    # initialized.
    with self.assertRaises(ProcessorConfigError):
        nlp.add(dummy, config={"test": "dummy2"})

    # This will add the component, with a different selector.
    nlp.add(
        dummy,
        selector=NameMatchSelector(),
        selector_config={"select_name": "copy"},
    )

    nlp.initialize()

    # Check that the two processors have the same name.
    self.assertEqual(
        nlp.components[2].name, get_full_module_name(DummyPackProcessor)
    )
    self.assertEqual(
        nlp.components[3].name, get_full_module_name(DummyPackProcessor)
    )

    # Check that the two processors are also the same instance.
    self.assertEqual(nlp.components[2], nlp.components[3])

    # Check that the initialization is only done once, here the count
    # will only be 1.
    self.assertEqual(nlp.components[2].initialize_count, 1)
    self.assertEqual(nlp.components[3].initialize_count, 1)

    # Check that the configuration is not changed by the second insertion.
    self.assertEqual(nlp.components[3].configs.test, "dummy1")

    # Run it once to make sure it can run.
    dataset_path = os.path.join(data_samples_root, "random_texts", "0.txt")
    nlp.run(dataset_path)

    # Check that initialization will be False after `run`, because it
    # calls the `finish` function of all components.
    self.assertFalse(nlp.components[2].is_initialized)
    self.assertFalse(nlp.components[3].is_initialized)

    # Check that we are able to re-initialize the pipeline.
    nlp.initialize()  # initialize the first time.
    nlp.initialize()  # re-initialize.

    # Check the name again after re-initialization.
    self.assertEqual(
        nlp.components[2].name, get_full_module_name(DummyPackProcessor)
    )
    self.assertEqual(
        nlp.components[3].name, get_full_module_name(DummyPackProcessor)
    )

    # Obtain the results from the multipack.
    mp: MultiPack = nlp.process(dataset_path)
    pack: DataPack = mp.get_pack("default")
    pack_copy: DataPack = mp.get_pack("copy")

    # Check that both packs are processed by the DummyPackProcessor once,
    # because we use different selectors.
    self.assertEqual(pack.get_single(NewType).value, "[PACK]")
    self.assertEqual(pack_copy.get_single(NewType).value, "[PACK]")
def __init__(self):
    self.component_name = get_full_module_name(self)
    self.selector = DummySelector()