Example #1
    def __init__(self,
                 from_cache: bool = False,
                 cache_directory: Optional[Path] = None,
                 append_to_cache: bool = False):
        """
        Args:
            from_cache (bool, optional): Decide whether to read from cache
                if cache file exists. By default (``True``), the reader will
                try to read an datapack from the first line of the caching file.
                If ``False``, the reader will only read from the original file
                and use the cache file path only for output.
            cache_directory (str, optional): The path of the caching file. If
                :attr:`cache_file_path` is ``None`` and
                :attr:`self._cache_directory` is not ``None``, use the result
                of :meth:`_get_cache_location_for_file_path`. If both
                :attr:`cache_file_path` and :attr:`self._cache_directory`
                are ``None``, will not read from or write to a caching file.
            append_to_cache (bool, optional): Decide whether to append write
                if cache file already exists.  By default (``False``), we
                will overwrite the existing caching file. If ``True``, we will
                cache the datapack append to end of the caching file.
    """

        self.from_cache = from_cache
        self._cache_directory = cache_directory
        self.component_name = get_full_module_name(self)
        self.append_to_cache = append_to_cache
Example #2
    def __init__(self,
                 from_cache: bool = False,
                 cache_directory: Optional[Path] = None,
                 append_to_cache: bool = False):
        """
        Args:
            from_cache (bool, optional): Decides whether to read from the
                cache if a cache file exists. If ``True``, the reader will
                try to read a datapack from the first line of the caching
                file. By default (``False``), the reader will only read from
                the original file and use the cache file path only for output.
            cache_directory (Path, optional): The base directory where the
                caching files are placed. Each collection is contained in one
                cached file under this directory. The cached location for each
                collection is computed by :meth:`_cache_key_function`. Note:
                a collection is the data returned by :meth:`_collect`.
            append_to_cache (bool, optional): Decides whether to append to
                the cache file if it already exists. By default (``False``),
                we will overwrite the existing caching file. If ``True``, we
                will append the cached datapack to the end of the caching
                file.
        """
        self.from_cache = from_cache
        self._cache_directory = cache_directory
        self.component_name = get_full_module_name(self)
        self.append_to_cache = append_to_cache
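
The two constructors above differ only in how ``cache_directory`` is documented. Below is a minimal, self-contained usage sketch of the same three caching options; ``CachedReader`` is a stand-in class written for illustration and is not part of the original source.

from pathlib import Path
from typing import Optional


class CachedReader:
    # Illustrative stand-in with the same constructor signature as above.
    def __init__(self,
                 from_cache: bool = False,
                 cache_directory: Optional[Path] = None,
                 append_to_cache: bool = False):
        self.from_cache = from_cache
        self._cache_directory = cache_directory
        self.append_to_cache = append_to_cache


# Read from the cache when it exists, keep cached files under ./cache, and
# overwrite (rather than append to) an existing cache file.
reader = CachedReader(
    from_cache=True,
    cache_directory=Path("./cache"),
    append_to_cache=False,
)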
Example #3
def set_state_func(instance, state):
    # pylint: disable=protected-access
    """
    An internal used function. `instance` is an instance of Entry or a
    MultiEntry. This function will populate the internal states for them.

    Args:
        instance:
        state:

    Returns:

    """
    # During de-serialization, convert the list back to a numpy array.
    if "_embedding" in state:
        state["_embedding"] = np.array(state["_embedding"])
    else:
        state["_embedding"] = np.empty(0)

    # NOTE: the __pack will be set via set_pack from the Pack side.
    cls_name = get_full_module_name(instance)
    # _f_struct_keys caches, per "<class>_<field>" key, whether the field is
    # an FList/FDict whose parent reference needs to be restored.
    for k, v in state.items():
        key = cls_name + "_" + k
        if _f_struct_keys.get(key, False):
            v._set_parent(instance)
        else:
            if isinstance(v, (FList, FDict)):
                v._set_parent(instance)
                _f_struct_keys[key] = True
            else:
                _f_struct_keys[key] = False

    instance.__dict__.update(state)
Example #4
def get_state_func(instance):
    # pylint: disable=protected-access
    r"""In serialization, the reference to pack is not set, and
    it will be set by the container.

    This also implies that it is not advised to serialize an entry on its
    own, without the ``Container`` as the context, there is little semantics
    remained in an entry and unexpected errors could occur.
    """
    state = instance.__dict__.copy()
    # During serialization, convert the numpy array to a list.
    emb = instance._embedding.tolist()
    if len(emb) == 0:
        state.pop("_embedding")
    else:
        state["_embedding"] = emb

    cls_name = get_full_module_name(instance)
    # _pointer_keys caches, per "<class>_<field>" key, whether the field holds
    # an Entry that must be stored as a pointer during serialization.
    for k, v in state.items():
        key = cls_name + "_" + k
        if key in _pointer_keys:
            if _pointer_keys[key]:
                state[k] = v.as_pointer(instance)
        else:
            if isinstance(v, Entry):
                state[k] = v.as_pointer(instance)
                _pointer_keys[key] = True
            else:
                _pointer_keys[key] = False

    state.pop("_Entry__pack")
    return state
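
``get_state_func`` and ``set_state_func`` mirror each other: the getter turns the numpy ``_embedding`` into a plain list (or drops it when empty) and replaces ``Entry`` fields with pointers, while the setter reverses the conversion. Below is a self-contained sketch of just the embedding round-trip; the ``Item`` class and its field are illustrative only and not part of the original source.

import pickle

import numpy as np


class Item:
    # Illustrative class reproducing the embedding round-trip shown above.
    def __init__(self):
        self._embedding = np.zeros(4)

    def __getstate__(self):
        state = self.__dict__.copy()
        emb = self._embedding.tolist()
        if len(emb) == 0:
            state.pop("_embedding")    # drop empty embeddings entirely
        else:
            state["_embedding"] = emb  # plain lists serialize portably
        return state

    def __setstate__(self, state):
        # Convert the list back to a numpy array (empty array if absent).
        state["_embedding"] = np.array(state.get("_embedding", []))
        self.__dict__.update(state)


restored = pickle.loads(pickle.dumps(Item()))
assert isinstance(restored._embedding, np.ndarray)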
Example #5
    def relink_pointer(self):
        """
        This function is normally called after deserialization. It can be
        called once the pack reference of this entry is ready (i.e. after
        `set_pack`). The purpose is to convert the `Pointer` objects back
        into actual entries.
        """
        cls_name = get_full_module_name(self)
        for k, v in self.__dict__.items():
            key = cls_name + "_" + k
            if key in _pointer_keys:
                if _pointer_keys[key]:
                    setattr(self, k, self._resolve_pointer(v))
            else:
                if isinstance(v, BasePointer):
                    _pointer_keys[key] = True
                    setattr(self, k, self._resolve_pointer(v))
                else:
                    _pointer_keys[key] = False
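
The docstring above implies an ordering constraint after deserialization: the pack reference has to be in place before pointers can be resolved. A hypothetical call sequence, where ``raw_bytes`` is previously pickled entry data and ``pack`` is the container it belongs to (both names illustrative, not from the original source), might look like:

import pickle

entry = pickle.loads(raw_bytes)  # fields restored via set_state_func; Entry fields are still Pointer objects
entry.set_pack(pack)             # make the pack reference available first (per the docstring)
entry.relink_pointer()           # now each stored Pointer is resolved back into an actual entry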
Example #6
    def name(self):
        return get_full_module_name(self)
Example #7
    def test_reuse_processor(self):
        # Create a basic pipeline of multi packs that each have two packs (by copying).
        nlp = (
            Pipeline()
            .set_reader(SentenceReader())
            .add(MultiPackBoxer())
            .add(MultiPackCopier())
        )

        # Create one shared instance of this processor.
        dummy = DummyPackProcessor()
        nlp.add(
            dummy,
            config={"test": "dummy1"},
            selector=NameMatchSelector(),
            selector_config={"select_name": "default"},
        )

        # This will not add the component successfully because the processor is
        # initialized.
        with self.assertRaises(ProcessorConfigError):
            nlp.add(dummy, config={"test": "dummy2"})

        # This will add the component, with a different selector
        nlp.add(
            dummy,
            selector=NameMatchSelector(),
            selector_config={"select_name": "copy"},
        )
        nlp.initialize()

        # Check that the two processors have the same name.
        self.assertEqual(
            nlp.components[2].name, get_full_module_name(DummyPackProcessor)
        )
        self.assertEqual(
            nlp.components[3].name, get_full_module_name(DummyPackProcessor)
        )

        # Check that the two processors are also the same instance.
        self.assertEqual(nlp.components[2], nlp.components[3])

        # Check that initialization is only done once; the count here will
        #  only be 1.
        self.assertEqual(nlp.components[2].initialize_count, 1)
        self.assertEqual(nlp.components[3].initialize_count, 1)

        # Check that the configuration is not changed by the second insertion.
        self.assertEqual(nlp.components[3].configs.test, "dummy1")

        # Run it once to make sure it can run.
        dataset_path = os.path.join(data_samples_root, "random_texts", "0.txt")
        nlp.run(dataset_path)

        # Check that `is_initialized` will be False after `run`, because it
        #  calls the `finish` function of all components.
        self.assertFalse(nlp.components[2].is_initialized)
        self.assertFalse(nlp.components[3].is_initialized)

        # Check that we are able to re-initialize the pipeline.
        nlp.initialize()  # initialize the first time.
        nlp.initialize()  # re-initialize.

        # Check the name again after re-initialization.
        self.assertEqual(
            nlp.components[2].name, get_full_module_name(DummyPackProcessor)
        )
        self.assertEqual(
            nlp.components[3].name, get_full_module_name(DummyPackProcessor)
        )

        # Obtain the results from the multipack.
        mp: MultiPack = nlp.process(dataset_path)
        pack: DataPack = mp.get_pack("default")
        pack_copy: DataPack = mp.get_pack("copy")

        # Check that both packs are processed by the DummyProcessor once,
        #  because we use a different selector for each.
        self.assertEqual(pack.get_single(NewType).value, "[PACK]")
        self.assertEqual(pack_copy.get_single(NewType).value, "[PACK]")
Example #8
    def __init__(self):
        self.component_name = get_full_module_name(self)
        self.selector = DummySelector()