예제 #1
0
 def _process_with_component(
         self, selector: Selector, component: PipelineComponent,
         raw_job: ProcessJob):
     """
     Run a single pipeline component over every pack that the selector
     yields from the job's pack.

     The component type is checked in a fixed order (`Caster`,
     `BaseBatchProcessor`, `Evaluator`, `BaseProcessor`) and the first
     match decides which action is taken. A component matching none of
     them performs no action, but the remaining entries of the pack are
     still added.

     Args:
         selector (Selector): Its `select` method is called with
           `raw_job.pack` to obtain the packs to work on.
         component (PipelineComponent): The component to be executed.
         raw_job (ProcessJob): The job being processed. Its pack is
           replaced when the component is a `Caster`.

     Raises:
         ProcessExecutionException: If a `ValueError` is raised while
           the component is handling a pack. The original error is
           chained as the cause.
     """
     for selected_pack in selector.select(raw_job.pack):
         try:
             # Step 1: let the component act on the selected pack.
             if isinstance(component, Caster):
                 # The job continues with the casted pack from here on.
                 casted_pack = component.cast(selected_pack)
                 raw_job.alter_pack(casted_pack)
             elif isinstance(component, BaseBatchProcessor):
                 selected_pack.set_control_component(component.name)
                 component.process(selected_pack)
             elif isinstance(component, Evaluator):
                 selected_pack.set_control_component(component.name)
                 gold_for_job = self._predict_to_gold[raw_job.id]
                 component.consume_next(selected_pack, gold_for_job)
             elif isinstance(component, BaseProcessor):
                 # Expected to be a BasePackProcessor: every other
                 # processor is handled as a streaming processor.
                 selected_pack.set_control_component(component.name)
                 component.process(selected_pack)

             # Step 2: whatever happened above, make sure the entries
             # end up in the index.
             selected_pack.add_all_remaining_entries()
         except ValueError as err:
             message = f'Exception occurred when running {component.name}'
             raise ProcessExecutionException(message) from err
예제 #2
0
    def add(self, component: PipelineComponent,
            config: Optional[Union[Config, Dict[str, Any]]] = None,
            selector: Optional[Selector] = None):
        """
        Append a component to the pipeline, together with its
        configuration and selector.

        All checks and calls that may fail are performed before any of the
        pipeline's bookkeeping containers are modified, so a failed call
        does not leave a stale name-to-index entry or misaligned
        component / config / selector lists behind.

        Args:
            component (PipelineComponent): The component to be appended.
              Readers are rejected here.
            config (Union[Config, Dict[str, Any]]): Passed to the
              component's `make_configs`. Default None.
            selector (Selector): The selector stored for this component.
              Default None, in which case a new `DummySelector` is used.

        Raises:
            ProcessFlowException: If `component` is a `BaseReader`.
        """
        # Reject readers first, before anything is recorded for them.
        if isinstance(component, BaseReader):
            raise ProcessFlowException(
                "Reader need to be set via set_reader()")

        # Run the calls that may raise before mutating pipeline state.
        component.assign_manager(self._proc_mgr, self._pack_manager)
        resolved_config = component.make_configs(config)

        # The position this component is about to occupy in the pipeline.
        position = len(self.components)
        self._processors_index[component.name] = position

        if isinstance(component, Evaluator):
            # This will ask the job to keep a copy of the gold standard.
            self.evaluator_indices.append(position)

        self._components.append(component)
        self.processor_configs.append(resolved_config)

        if selector is None:
            self._selectors.append(DummySelector())
        else:
            self._selectors.append(selector)
예제 #3
0
    def add(
        self,
        component: PipelineComponent,
        config: Optional[Union[Config, Dict[str, Any]]] = None,
        selector: Optional[Selector] = None,
    ) -> "Pipeline":
        """
        Adds a pipeline component to the pipeline. The pipeline components
        will form a chain based on the insertion order. The customized
        `config` and `selector` (:class:`~forte.data.selector.Selector`)
        will be associated with this particular component. If the `config`
        or the `selector` is not provided, the default ones will be used.

        Here, note that the same component instance can be added multiple
        times to the pipeline. In such cases, the instance will only be
        setup at the first insertion (i.e. its `initialize` function will
        only be called once). The subsequent insertion of the same component
        instance will not change the behavior nor the states of the instance.
        Thus, a different `config` cannot be provided (should be `None`) when
        added the second time, otherwise a `ProcessorConfigError` will be
        thrown. In the case where one wants them to behave differently, a
        different instance should be used.

        All validation and the config creation happen before the pipeline's
        internal lists are modified, so a rejected call leaves the pipeline
        unchanged.

        Args:
            component (PipelineComponent): The component to be inserted next
              to the pipeline.
            config (Union[Config, Dict[str, Any]]): The custom configuration
              to be used for the added component. Default None, which means
              the `default_configs()` of the component will be used.
            selector (Selector): The selector used to pick the corresponding
              data pack to be consumed by the component. Default None, which
              means the whole pack will be used.

        Returns:
            The pipeline itself, which enables one to chain the creation of
            the pipeline, i.e., you can do:

            .. code-block:: python

                Pipeline().set_reader(your_reader()).add(
                    your_processor()).add(another_processor())

        Raises:
            ProcessFlowException: If `component` is a `BaseReader`.
            ProcessorConfigError: If the same component instance has already
              been added and a non-`None` `config` is provided.
        """
        if isinstance(component, BaseReader):
            raise ProcessFlowException(
                "Reader need to be set via set_reader()")

        first_insertion = component not in self.__component_set

        # Reject an invalid re-insertion before recording anything, so that
        # no dangling evaluator index is left behind.
        if not first_insertion and config is not None:
            raise ProcessorConfigError(
                f"The same instance of a component named {component.name} "
                f" has already been added to"
                f" the pipeline, we do not accept a different configuration"
                f" for it. If you would like to use a differently"
                f" configured component, please create another instance."
                f" If you intend to re-use the component instance, please"
                f" do not provide the `config` (or provide a `None`).")

        # Only the first insertion gets a real config. For a re-used
        # instance we store a `None` value just to make the config list
        # match the component list, but this config should not be used.
        # The config is built before any list is touched, so a failure
        # here cannot leave the lists misaligned.
        component_config = (
            component.make_configs(config) if first_insertion else None)

        if isinstance(component, Evaluator):
            # This will ask the job to keep a copy of the gold standard.
            self.evaluator_indices.append(len(self.components))

        self._components.append(component)
        if first_insertion:
            self.__component_set.add(component)
        self.component_configs.append(component_config)

        if selector is None:
            self._selectors.append(self.__default_selector)
        else:
            self._selectors.append(selector)

        return self