Example #1
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        _validation: Config = self.configs.validation

        # Verify the service is running
        response = self._requests.get(self.configs.url)
        if response.status_code != 200 or response.json()["status"] != "OK":
            raise ProcessorConfigError(
                f"{response.status_code} {response.reason}: Please double "
                "check your endpoint URL configuration and make sure that the "
                f"remote service at {self.configs.url} is a valid pipeline "
                "service that is up and running.")
        result = response.json()
        service_name: str = result["service_name"]
        input_format: str = result["input_format"]

        if _validation.do_init_type_check:
            # Validate service name and input format
            if service_name != _validation.expected_name:
                raise ProcessorConfigError(
                    "Validation failed: The expected service name "
                    f"('{_validation.expected_name}') does not match the "
                    "actual name returned by the remote service "
                    f"('{service_name}'). Please double check your endpoint "
                    f"URL {self.configs.url} or consider updating the configs "
                    "of RemoteProcessor so that 'validation.expected_name' "
                    f"equals '{service_name}'.")
            if input_format != _validation.input_format:
                raise ProcessorConfigError(
                    "Validation failed: The expected input format "
                    f"('{_validation.input_format}') does not match the "
                    "actual input format returned by the remote service "
                    f"('{input_format}'). Please double check your endpoint "
                    f"URL {self.configs.url} or consider updating the configs "
                    "of RemoteProcessor so that 'validation.input_format' "
                    f"equals '{input_format}'.")
Example #2
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self.index = ElasticSearchIndexer(self.configs.indexer.hparams)
        if self.configs.query_result_project_id < 0:
            raise ProcessorConfigError("Query Result Project is not set.")

        if not os.path.exists(self.configs.stave_db_path):
            raise ProcessorConfigError(
                f"Cannot find Stave DB at: {self.configs.stave_db_path}")
Example #3
 def initialize(self, config: Union[Dict, Config]):
     super().initialize(config)
     if self.config.entry_type is None:
         raise ProcessorConfigError(
             "The `entry_type` configuration must be "
             "provided and cannot be None.")
     if self.config.attribute is None:
         raise ProcessorConfigError("The `attribute` configuration must be "
                                    "provided and cannot be None.")
Example #4
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        if not configs.stave_db_path:
            raise ProcessorConfigError(
                'The database path to stave is not specified.')

        if not configs.datapack_table:
            raise ProcessorConfigError(
                'The table name that stores the data pack is not specified.')
Example #5
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        if not configs.stave_db_path:
            raise ProcessorConfigError(
                'The database path to stave is not specified.')

        if not configs.datapack_table:
            raise ProcessorConfigError(
                'The table name that stores the data pack is not specified.')

        if not configs.target_project_name:
            logging.info(
                "No project specified, will attempt to read all projects.")
Example #6
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        if ("pos" in configs.processors or "depparse" in configs.processors
                or "depparse" in configs.processors):
            if "tokenize" not in self.configs.processors:
                raise ProcessorConfigError("tokenize is necessary in "
                                           "configs.processors for "
                                           "pos, depparse or srl")
        cuda_devices = itertools.cycle(configs["cuda_devices"])
        if configs.tag_formalism not in MODEL2URL:
            raise ProcessorConfigError("Incorrect value for tag_formalism")
        if configs.tag_formalism == "stanford":
            self.predictor = {
                "stanford":
                Predictor.from_path(configs["stanford_url"],
                                    cuda_device=next(cuda_devices))
            }
        if "srl" in configs.processors:
            self.predictor = {
                "stanford":
                Predictor.from_path(configs["stanford_url"],
                                    cuda_device=next(cuda_devices)),
                "srl":
                Predictor.from_path(configs["srl_url"],
                                    cuda_device=next(cuda_devices)),
            }

        if configs.overwrite_entries:
            logger.warning("`overwrite_entries` is set to True, this means "
                           "that the entries of the same type as produced by "
                           "this processor will be overwritten if found.")
            if configs.allow_parallel_entries:
                logger.warning("Both `overwrite_entries` (whether to overwrite"
                               " the entries of the same type as produced by "
                               "this processor) and "
                               "`allow_parallel_entries` (whether to allow "
                               "similar new entries when they already exist) "
                               "are True, all existing conflicting entries "
                               "will be deleted.")
        else:
            if not configs.allow_parallel_entries:
                logger.warning("Both `overwrite_entries` (whether to overwrite"
                               " the entries of the same type as produced by "
                               "this processor) and "
                               "`allow_parallel_entries` (whether to allow "
                               "similar new entries when they already exist) "
                               "are False, processor will only run if there "
                               "are no existing conflicting entries.")
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        if configs.tag_formalism not in MODEL2URL:
            raise ProcessorConfigError('Incorrect value for tag_formalism')
        model_url = MODEL2URL[configs.tag_formalism]
        self.predictor: Predictor = Predictor.from_path(model_url)

        if configs.overwrite_entries:
            logger.warning("`overwrite_entries` is set to True, this means "
                           "that the entries of the same type as produced by "
                           "this processor will be overwritten if found.")
            if configs.allow_parallel_entries:
                logger.warning(
                    'Both `overwrite_entries` (whether to overwrite '
                    'the entries of the same type as produced by '
                    'this processor) and '
                    '`allow_parallel_entries` (whether to allow '
                    'similar new entries when they already exist) '
                    'are True, all existing conflicting entries '
                    'will be deleted.')
        else:
            if not configs.allow_parallel_entries:
                logger.warning(
                    'Both `overwrite_entries` (whether to overwrite '
                    'the entries of the same type as produced by '
                    'this processor) and '
                    '`allow_parallel_entries` (whether to allow '
                    'similar new entries when they already exist) '
                    'are False, processor will only run if there '
                    'are no existing conflicting entries.')
Example #8
 def initialize(self, config: Config):
     super().initialize(config)
     if self.configs.context_type is None:
         raise ProcessorConfigError(
             f"The 'context_type' config of {self.__class__.__name__} "
             f"cannot be None."
         )
Example #9
    def initialize(self, resource: Resources, configs: HParams):
        self.processors = configs.processors
        if self.processors is None or self.processors == "":
            self.processors = self.default_configs()['processors']

        if configs.output_format not in MODEL2URL:
            raise ProcessorConfigError('Incorrect value for output_format')
        model_url = MODEL2URL[configs.output_format]
        self.predictor = Predictor.from_path(model_url)

        self.overwrite_entries = configs.overwrite_entries
        self.allow_parallel_entries = configs.allow_parallel_entries
        if self.overwrite_entries:
            logger.warning("`overwrite_entries` is set to True, this means "
                           "that the entries of the same type as produced by "
                           "this processor will be overwritten if found.")
            if self.allow_parallel_entries:
                logger.warning(
                    'Both `overwrite_entries` (whether to overwrite '
                    'the entries of the same type as produced by '
                    'this processor) and '
                    '`allow_parallel_entries` (whether to allow '
                    'similar new entries when they already exist) '
                    'are True, all existing conflicting entries '
                    'will be deleted.')
        else:
            if not self.allow_parallel_entries:
                logger.warning(
                    'Both `overwrite_entries` (whether to overwrite '
                    'the entries of the same type as produced by '
                    'this processor) and '
                    '`allow_parallel_entries` (whether to allow '
                    'similar new entries when they already exist) '
                    'are False, processor will only run if there '
                    'are no existing conflicting entries.')
Example #10
    def initialize(self, resources: Resources, configs: Config):
        # Populate the _request. The self._request_ready flag helps avoid
        # parsing the feature scheme multiple times during `initialize`.
        if not self._request_ready:
            for key, value in configs.items():
                if key == "feature_scheme":
                    self._request["schemes"] = parse_feature_extractors(
                        configs.feature_scheme)
                else:
                    self._request[key] = value
            self._request_ready = True

        batcher_config = configs.batcher
        # Assign context type from here to make sure batcher is using the
        # same context type as predictor.
        batcher_context = configs["batcher"].get("context_type", None)
        if (batcher_context is None
                or batcher_context == self._request["context_type"]):
            batcher_config.context_type = self._request["context_type"]
        else:
            raise ProcessorConfigError(
                "The 'context_type' configuration value should be the same "
                "for the processor and the batcher, now for the processor the "
                f"value is {self._request['context_type']} and for the "
                f"batcher the value is {batcher_context}. It is also fine if "
                f"this value for batch config is left empty.")
        self.do_eval = configs.do_eval

        # This needs to be called later since batcher config needs to be loaded.
        super().initialize(resources, configs)
        for tag, scheme in self._request["schemes"].items():
            # Add input feature to the batcher.
            if scheme["type"] == extractor_utils.DATA_INPUT:
                self.batcher.add_feature_scheme(tag, scheme)  # type: ignore
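The check above means the batcher either inherits the processor's context_type or must match it exactly. A minimal sketch of a consistent config follows; the type name is a placeholder, and the feature scheme contents are elided.

    configs = {
        "context_type": "ft.onto.base_ontology.Sentence",  # hypothetical type
        "feature_scheme": {},  # filled with real feature extractor specs
        "do_eval": False,
        "batcher": {
            # May be left unset; if set, it must equal the processor's value.
            "context_type": "ft.onto.base_ontology.Sentence",
        },
    }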
Example #11
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        if "pos" in configs.processors or "lemma" in configs.processors:
            if "tokenize" not in configs.processors:
                raise ProcessorConfigError("tokenize is necessary in "
                                           "configs.processors for "
                                           "pos or lemma")
            else:
                if "sentence" not in configs.processors:
                    raise ProcessorConfigError("sentence is necessary in "
                                               "configs.processors for "
                                               "tokenize or pos or lemma")

        self.processors = configs.processors
        self.lang_model = configs.lang
        self.set_up()
Example #12
 def initialize(self, resources, configs):
     super().initialize(resources, configs)
     if "successor" in configs["test"] and "test" not in configs["test"]:
         raise ProcessorConfigError(
             '"test" is necessary as the first '
             'step for "successor" in config '
             "for test case purpose."
         )
     self.initialize_count += 1
Example #13
    def initialize(self, resources: Resources, configs: Optional[Config]):
        super().initialize(resources, configs)

        assert configs is not None
        try:
            self.batcher.initialize(configs.batcher)
        except AttributeError as e:
            raise ProcessorConfigError(
                e, "Error in handling batcher config, please check the "
                "config to see if you have the key 'batcher'.")
Example #14
    def initialize(self, resources: Resources, configs: Optional[Config]):
        super().initialize(resources, configs)

        assert configs is not None
        try:
            self.batcher.initialize(configs.batcher)
        except AttributeError as e:
            raise ProcessorConfigError(
                "Error in handling batcher config, please check the "
                "config of the batcher to see they are correct.") from e
Example #15
    def initialize(self, resources: Resources, configs: Config):
        # pylint: disable=attribute-defined-outside-init
        super().initialize(resources, configs)

        if not configs.stave_db_path:
            raise ProcessorConfigError(
                'The database path to stave is not specified.')

        self.conn = sqlite3.connect(configs.stave_db_path)
        self.data_packs: Dict[int, DataPack] = load_all_datapacks(
            self.conn, configs.datapack_table, configs.pack_content_col)
Example #16
    def initialize(self, config: Union[Dict, Config]):
        # pylint: disable=attribute-defined-outside-init
        super().initialize(config)

        if self.config.attribute is None:
            raise ProcessorConfigError(
                "'attribute' is required in this extractor.")
        if self.config.index_annotation is None:
            raise ProcessorConfigError(
                "'index_annotation' is required in this extractor.")
        if self.config.entry_type is None:
            raise ProcessorConfigError(
                "'entry_type' is required in this extractor.")
        else:
            self._entry_class: Type[Link] = get_class(self.config.entry_type)

            if not issubclass(self._entry_class, Link):
                raise ProcessorConfigError("`entry_class` to this extractor "
                                           "must be a Link tpe.")

            self._parent_class: Type[Annotation] = self._entry_class.ParentType
            if not issubclass(self._parent_class, Annotation):
                raise ProcessorConfigError(
                    f"The parent class of the provided {self.config.entry_type}"
                    " must be an Annotation.")

            self._child_class: Type[Annotation] = self._entry_class.ChildType
            if not issubclass(self._child_class, Annotation):
                raise ProcessorConfigError(
                    f"The child class of the provided {self.config.entry_type}"
                    " must be an Annotation.")
Example #17
    def _process(self, input_pack: DataPack):
        """Perform the HuggingFace NER pipeline on the input data pack.

        Args:
            input_pack: Input pack to fill.
        """
        if not self.configs.entry_type:
            raise ProcessorConfigError("Please specify an input entry type!")

        output_entry = get_class(self.configs.output_entry_type)

        for entry_specified in input_pack.get(self.configs.entry_type):
            result = self.classifier(entry_specified.text)

            if self.configs.tagging_scheme == "bio-merge":  # Merge BIO tagging
                result_types, result_indices = self._merge_bio_tokens(result)

            elif self.configs.tagging_scheme == "no-merge":
                result_indices = []
                result_types = []
                for token in result:
                    start, end = token["start"], token["end"]
                    result_types.append(token["entity"])
                    result_indices.append((start, end))
            else:
                raise ProcessorConfigError(
                    f"The tagging_scheme strategy {self.configs.tagging_scheme} "
                    "was not defined. Please check your input config.")

            for entity_type, (start, end) in zip(result_types, result_indices):
                entity = output_entry(
                    pack=input_pack,
                    begin=entry_specified.span.begin + int(start),
                    end=entry_specified.span.begin + int(end),
                )
                setattr(entity, self.configs.attribute_name, entity_type)
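The _merge_bio_tokens helper is not shown in this example. Below is one plausible sketch, assuming each token dict from the classifier carries "entity", "start", and "end" keys in BIO notation; it is an illustration, not the actual implementation.

    def _merge_bio_tokens(self, tokens):
        # Sketch: collapse consecutive B-/I- tokens of the same type into
        # one (start, end) span. Tokens without a B-/I- prefix start a new
        # span of their own.
        result_types, result_indices = [], []
        for token in tokens:
            entity = token["entity"]
            tag, entity_type = (
                entity.split("-", 1) if "-" in entity else ("B", entity))
            if tag == "B" or not result_types or result_types[-1] != entity_type:
                result_types.append(entity_type)
                result_indices.append((token["start"], token["end"]))
            else:
                # "I-" continuation of the current span: extend its end.
                result_indices[-1] = (result_indices[-1][0], token["end"])
        return result_types, result_indices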
Example #18
    def _process_existing_entries(self, input_pack):
        tokens_exist = any(True for _ in input_pack.get(Token))
        dependencies_exist = any(True for _ in input_pack.get(Dependency))

        if tokens_exist or dependencies_exist:
            if not self.configs.overwrite_entries:
                if not self.configs.allow_parallel_entries:
                    raise ProcessorConfigError(
                        "Found existing entries, either `overwrite_entries` or "
                        "`allow_parallel_entries` should be True")
            else:
                # delete existing tokens and dependencies
                for entry_type in (Token, Dependency):
                    for entry in input_pack.get(entry_type):
                        input_pack.delete_entry(entry)
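In other words, when Token or Dependency entries already exist, the two flags yield three outcomes; a summary of the branches above:

    # overwrite_entries=True: existing Tokens/Dependencies are deleted first.
    # overwrite_entries=False, allow_parallel_entries=True: new entries are
    #     added alongside the existing ones.
    # overwrite_entries=False, allow_parallel_entries=False: raises
    #     ProcessorConfigError.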
Example #19
    def make_configs(
        cls,
        configs: Optional[Union[Config, Dict[str, Any]]],
    ) -> Config:
        """
        Create the configuration by merging the
        provided config with the `default_configs`.

        The following config conventions are expected:
          - The top level key can be a special `@config_path`.

          - `@config_path` should point to a file system path, which will
             be a YAML file containing configurations.

          - Other key values in the configs will be considered as parameters.

        Args:
            configs: The input config to be merged with the default config.

        Returns:
            The merged configuration.
        """
        merged_configs: Dict = {}

        if configs is not None:
            if isinstance(configs, Config):
                configs = configs.todict()

            if configs.get("@config_path", None) is not None:
                with open(configs.pop("@config_path"), encoding="utf-8") as f:
                    filebased_configs = yaml.safe_load(f)
            else:
                filebased_configs = {}

            merged_configs.update(filebased_configs)

            merged_configs.update(configs)

        try:
            final_configs = Config(merged_configs,
                                   cls._default_configs().todict())
        except ValueError as e:
            raise ProcessorConfigError(
                f"Configuration error for the processor "
                f"{get_full_module_name(cls)}.") from e

        return final_configs
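For instance, a caller could combine a YAML file with inline overrides. The class name, path, and parameter below are placeholders for illustration only.

    final = MyProcessor.make_configs({         # MyProcessor is hypothetical
        "@config_path": "conf/processor.yml",  # placeholder YAML path
        "batch_size": 32,                      # hypothetical inline override
    })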
Example #20
 def expected_types_and_attributes(self):
     r"""Method to add expected types and attributes for the input of
     `RemoteProcessor`. This should be the `expected_types_and_attributes`
     of the first processor in the remote pipeline.
     """
     if self._expectation is None:
         response = self._requests.get(f"{self.configs.url}/expectation")
         if (response.status_code != 200
                 or response.json()["status"] != "OK"):
             raise ProcessorConfigError(
                 f"{response.status_code} {response.reason}: "
                 "Failed to fetch expected types and attributes from the "
                 "remote service. Please make sure that the remote service "
                 f"at {self.configs.url} is a valid pipeline service that "
                 "is up and running.")
         self._expectation = response.json()["expectation"]
     return self._expectation
Example #21
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        cuda_devices = itertools.cycle(configs['cuda_devices'])
        if configs.tag_formalism not in MODEL2URL:
            raise ProcessorConfigError('Incorrect value for tag_formalism')
        if configs.tag_formalism == 'stanford':
            self.predictor = {
                'stanford':
                Predictor.from_path(configs['stanford_url'],
                                    cuda_device=next(cuda_devices))
            }
        if 'srl' in configs.processors:
            self.predictor = {
                'stanford':
                Predictor.from_path(configs['stanford_url'],
                                    cuda_device=next(cuda_devices)),
                'srl':
                Predictor.from_path(configs['srl_url'],
                                    cuda_device=next(cuda_devices))
            }

        if configs.overwrite_entries:
            logger.warning("`overwrite_entries` is set to True, this means "
                           "that the entries of the same type as produced by "
                           "this processor will be overwritten if found.")
            if configs.allow_parallel_entries:
                logger.warning('Both `overwrite_entries` (whether to overwrite'
                               ' the entries of the same type as produced by '
                               'this processor) and '
                               '`allow_parallel_entries` (whether to allow '
                               'similar new entries when they already exist) '
                               'are True, all existing conflicting entries '
                               'will be deleted.')
        else:
            if not configs.allow_parallel_entries:
                logger.warning('Both `overwrite_entries` (whether to overwrite'
                               ' the entries of the same type as produced by '
                               'this processor) and '
                               '`allow_parallel_entries` (whether to allow '
                               'similar new entries when they already exist) '
                               'are False, processor will only run if there '
                               'are no existing conflicting entries.')
Example #22
    def make_configs(
            cls, configs: Optional[Union[Config, Dict[str, Any]]]) -> Config:
        """
        Create the component configuration for this class, by merging the
        provided config with the ``default_config``.

        The following config conventions are expected:
          - The top level key can be a special `config_path`.
          - `config_path` should point to a file system path, which will
             be a YAML file containing configurations.
          - Other key values in the configs will be considered as parameters.

        Args:
            configs: The input config to be merged with the default config.

        Returns:
            The merged configuration.
        """
        merged_configs: Dict = {}

        if configs is not None:
            if isinstance(configs, Config):
                configs = configs.todict()

            if "config_path" in configs and not configs["config_path"] is None:
                filebased_configs = yaml.safe_load(
                    open(configs.pop("config_path")))
            else:
                filebased_configs = {}

            merged_configs.update(filebased_configs)

            merged_configs.update(configs)

        try:
            final_configs = Config(merged_configs, cls.default_configs())
        except ValueError as e:
            raise ProcessorConfigError(
                f'Configuration error for the processor '
                f'{get_full_module_name(cls)}.') from e

        return final_configs
Example #23
    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type record of `RemoteProcessor`. The records
        are queried from the remote service. The types and attributes are
        populated from all the components in the remote pipeline.

        Args:
            record_meta: the field in the datapack for the type record that
                needs to be filled in for consistency checking.
        """
        if self._records is None:
            response = self._requests.get(f"{self.configs.url}/records")
            if (response.status_code != 200
                    or response.json()["status"] != "OK"):
                raise ProcessorConfigError(
                    f"{response.status_code} {response.reason}: "
                    "Failed to fetch records from the remote service. Please "
                    f"make sure that the remote service at {self.configs.url} "
                    "is a valid pipeline service that is up and running.")
            self._records = response.json()["records"]
        record_meta.update(self._records)
Example #24
 def initialize(self, resources: Resources, configs: Config):
     super().initialize(resources, configs)
     if (
         "pos" in configs.processors
         or "lemma" in configs.processors
         or "depparse" in configs.processors
     ):
         if "tokenize" not in configs.processors:
             raise ProcessorConfigError(
                 "tokenize is necessary in "
                 "configs.processors for "
                 "pos or lemma or depparse"
             )
     self.set_up()
     self.nlp = stanza.Pipeline(
         lang=self.configs.lang,
         dir=self.configs.dir,
         use_gpu=self.configs.use_gpu,
         processors=self.configs.processors,
     )
Example #25
    def initialize(self, resources: Resources, configs: Config):
        # pylint: disable=attribute-defined-outside-init,consider-using-with
        super().initialize(resources, configs)

        if self.configs.output_dir is None:
            raise ProcessorConfigError(
                "`output_dir` is not specified for the writer.")

        pack_paths = os.path.join(self.configs.output_dir, self.pack_idx)
        ensure_dir(pack_paths)
        self.pack_idx_out = open(pack_paths, "w", encoding="utf-8")

        multi_index = os.path.join(self.configs.output_dir, self.multi_idx)
        ensure_dir(multi_index)
        self.multi_idx_out = open(multi_index, "w", encoding="utf-8")

        if self.configs.serialize_method == "jsonpickle":
            self._suffix = ".json.gz" if self.configs.zip_pack else ".json"
        else:
            self._suffix = ".pickle.gz" if self.configs.zip_pack else ".pickle"
Example #26
    def init_from_config(self, configs: List):
        r"""Initialized the pipeline (ontology and processors) from the
        given configurations.

        Args:
            configs: The configs used to initialize the pipeline.
        """

        is_first: bool = True
        for component_config in configs:
            component = create_class_with_kwargs(
                class_name=component_config['type'],
                class_args=component_config.get('kwargs', {}),
            )

            if is_first:
                if not isinstance(component, BaseReader):
                    raise ProcessorConfigError(
                        "The first component of a pipeline must be a reader.")
                self.set_reader(component, component_config.get('configs', {}))
                is_first = False
            else:
                # Can be processor, caster, or evaluator
                self.add(component, component_config.get('configs', {}))
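The expected shape of configs is thus a list whose first entry builds the reader. A hypothetical sketch, with placeholder class names and parameters:

    configs = [
        {
            "type": "forte.data.readers.StringReader",  # the reader comes first
            "kwargs": {},
            "configs": {},
        },
        {
            "type": "my_package.MyProcessor",  # hypothetical processor class
            "configs": {"some_param": 1},      # hypothetical component config
        },
    ]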
Example #27
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        # Validate multi_pack project config:
        #   A `multi_pack` project must have `multi_ontology` set.
        if self.configs.project_type != "single_pack" and (
                self.configs.project_type != "multi_pack"
                or self.configs.multi_ontology is None):
            raise ProcessorConfigError("Invalid project type configuration.")

        # Generate default configurations
        self.configs.project_configs = Config(
            hparams=self.configs.project_configs,
            default_hparams=self._default_project_configs(),
        )
        self.configs.multi_ontology = (
            self.configs.multi_ontology or Config({}, {}))
        self.configs.project_path = os.path.abspath(
            self.configs.project_path or self.configs.project_name)

        self._viewer = StaveViewer(
            project_path=self.configs.project_path,
            host=self.configs.host,
            port=self.configs.port,
            thread_daemon=self.configs.server_thread_daemon,
        )

        # Write metadata to the project folder
        self._project_writer = StaveProjectWriter(
            project_path=self.configs.project_path,
            project_name=self.configs.project_name,
            project_type=self.configs.project_type,
            ontology=self.resources.get("onto_specs_dict"),
            project_configs=self.configs.project_configs.todict(),
            multi_ontology=self.configs.multi_ontology.todict(),
        )
Example #28
    def add(
        self,
        component: PipelineComponent,
        config: Optional[Union[Config, Dict[str, Any]]] = None,
        selector: Optional[Selector] = None,
    ) -> "Pipeline":
        """
        Adds a pipeline component to the pipeline. The pipeline components
        will form a chain based on the insertion order. The customized
        `config` and `selector` (:class:`~forte.data.selector.Selector`)
        will be associated with this particular component. If the `config`
        or the `selector` is not provided, the default ones will be used.

        Here, note that the same component instance can be added multiple
        times to the pipeline. In such cases, the instance will only be
        set up at the first insertion (i.e. its `initialize` function will
        only be called once). Subsequent insertions of the same component
        instance will not change the behavior nor the states of the instance.
        Thus, a different `config` cannot be provided (it should be `None`)
        when the component is added a second time, otherwise a
        `ProcessorConfigError` will be thrown. If one wants the instances to
        behave differently, a different instance should be used.

        Args:
            component (PipelineComponent): The component to be inserted next
              to the pipeline.
            config (Union[Config, Dict[str, Any]]): The custom configuration
              to be used for the added component. Default None, which means
              the `default_configs()` of the component will be used.
            selector (Selector): The selector used to pick the corresponding
              data pack to be consumed by the component. Default None, which
              means the whole pack will be used.

        Returns:
            The pipeline itself, which enables one to chain the creation of
            the pipeline, i.e., you can do:

            .. code-block:: python

                Pipeline().set_reader(your_reader()).add(
                    your_processor()).add(another_processor())
        """
        if isinstance(component, BaseReader):
            raise ProcessFlowException(
                "Reader needs to be set via set_reader()")

        if isinstance(component, Evaluator):
            # This will ask the job to keep a copy of the gold standard.
            self.evaluator_indices.append(len(self.components))

        if component not in self.__component_set:
            # The case where the component has not been added before.
            self._components.append(component)
            self.__component_set.add(component)
            self.component_configs.append(component.make_configs(config))
        else:
            if config is None:
                self._components.append(component)
                # We insert a `None` value here just to make the config list
                # to match the component list, but this config should not be
                # used.
                self.component_configs.append(None)
            else:
                raise ProcessorConfigError(
                    f"The same instance of a component named {component.name} "
                    "has already been added to the pipeline; we do not accept "
                    "a different configuration for it. If you would like to "
                    "use a differently configured component, please create "
                    "another instance. If you intend to re-use the component "
                    "instance, please do not provide the `config` (or provide "
                    "a `None`).")

        if selector is None:
            self._selectors.append(self.__default_selector)
        else:
            self._selectors.append(selector)

        return self
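As the docstring explains, re-adding the same instance is only allowed without a new config. A short usage sketch, with hypothetical reader and processor classes:

    proc = MyProcessor()                          # hypothetical component
    pipeline = Pipeline().set_reader(MyReader())  # hypothetical reader
    pipeline.add(proc, config={"some_param": 1})  # first insertion: config allowed
    pipeline.add(proc)                            # re-use: config must be None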
Example #29
    def _default_project_configs(self):
        # pylint: disable=line-too-long
        """
        Create default project configuration based on ontology.
        This is translated from JavaScript function `createDefaultConfig` in
        https://github.com/asyml/stave/blob
        /d82383de3d74bf09c0d30f33d8a902595f5aff80/src/app/pages/Projects.tsx
        #L140

        Returns:
            configs: A dictionary with the default config for the project.

        """
        # pylint: enable=line-too-long

        if not (self.resources.contains("onto_specs_dict")
                and self.resources.contains("merged_entry_tree")):
            raise ProcessorConfigError(
                "onto_specs_dict/merged_entry_tree is not set in resources.")
        ontology = self.resources.get("onto_specs_dict")
        entry_tree = self.resources.get("merged_entry_tree")

        configs: Dict[str, Any] = {
            "legendConfigs": {},
            "scopeConfigs": {},
            "layoutConfigs": {
                "center-middle": "default-nlp",
                "left": "default-meta",
                "right": "default-attribute",
                "center-bottom": "disable",
            },
            "remoteConfigs": {
                "pipelineUrl": "",
                "doValidation": False,
                "expectedName": "",
                "inputFormat": "string",
                "expectedRecords": {},
            },
        }

        # Create legend configs
        legend_configs: Dict[str, Any] = {}
        entry_name_set: Set[str] = set()
        for entry in ontology["definitions"]:
            entry_name = entry["entry_name"]
            entry_name_set.add(entry_name)
            legend_configs[entry_name] = {
                "is_selected": False,
                "is_shown": True,
            }
            if "attributes" in entry and len(entry["attributes"]) > 0:
                attributes_configs = {}
                for attribute in entry["attributes"]:
                    if attribute["type"] == "str":
                        attributes_configs[attribute["name"]] = False
                legend_configs[entry_name]["attributes"] = attributes_configs
        configs["legendConfigs"] = legend_configs

        # Find all subclass of `forte.data.ontology.top.Annotation` and
        # update `scopeConfigs` accordingly.
        queue = collections.deque(
            [search(entry_tree.root, "forte.data.ontology.top.Annotation")])
        while queue:
            size = len(queue)
            for _ in range(size):
                node = queue.pop()
                if node.name in entry_name_set:
                    configs["scopeConfigs"][node.name] = False
                for entry in node.children:
                    queue.appendleft(entry)
        return configs