def initialize(self, resources: Resources, configs: Config):
    """Initialize the processor and check the remote service.

    A GET request is sent to ``self.configs.url``; the reply must have
    HTTP status 200 and a JSON body whose ``"status"`` is ``"OK"``. When
    ``validation.do_init_type_check`` is truthy, the ``service_name`` and
    ``input_format`` reported by the service are compared against
    ``validation.expected_name`` and ``validation.input_format``.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Forwarded to the parent ``initialize``.

    Raises:
        ProcessorConfigError: If the service reply is not as expected, or
            a validated field does not match the configuration.
    """
    super().initialize(resources, configs)
    _validation: Config = self.configs.validation

    # Verify the service is running. `json()` is only consulted when the
    # HTTP status is already 200.
    response = self._requests.get(self.configs.url)
    service_is_up = (
        response.status_code == 200 and response.json()["status"] == "OK"
    )
    if not service_is_up:
        raise ProcessorConfigError(
            f"{response.status_code} {response.reason}: Please double "
            "check your endpoint URL configuration and make sure that the "
            f"remote service at {self.configs.url} is a valid pipeline "
            "service that is up and running.")

    service_name: str = response.json()["service_name"]
    input_format: str = response.json()["input_format"]

    # Nothing more to do when the caller opted out of validation.
    if not _validation.do_init_type_check:
        return

    # Validate service name
    if service_name != _validation.expected_name:
        raise ProcessorConfigError(
            "Validation fail: The expected service name "
            f"('{_validation.expected_name}') does not match the "
            "actual name returned by remote service "
            f"('{service_name}'). Please double check your endpoint "
            f"URL {self.configs.url} or consider updating the configs "
            "of RemoteProcessor so that 'validation.expected_name' "
            f"equals to '{service_name}'.")

    # Validate input format
    if input_format != _validation.input_format:
        raise ProcessorConfigError(
            "Validation fail: The expected input format "
            f"('{_validation.input_format}') does not match the "
            "actual input format returned by remote service "
            f"('{input_format}'). Please double check your endpoint "
            f"URL {self.configs.url} or consider updating the configs "
            "of RemoteProcessor so that 'validation.input_format' "
            f"equals to '{input_format}'.")
def initialize(self, resources: Resources, configs: Config):
    """Create the Elasticsearch indexer and validate Stave settings.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Forwarded to the parent ``initialize``.

    Raises:
        ProcessorConfigError: If ``query_result_project_id`` is negative
            or the path in ``stave_db_path`` does not exist.
    """
    super().initialize(resources, configs)

    indexer_hparams = self.configs.indexer.hparams
    self.index = ElasticSearchIndexer(indexer_hparams)

    project_is_set = not self.configs.query_result_project_id < 0
    if not project_is_set:
        raise ProcessorConfigError("Query Result Project is not set.")

    db_found = os.path.exists(self.configs.stave_db_path)
    if not db_found:
        raise ProcessorConfigError(
            f"Cannot find Stave DB at: {self.configs.stave_db_path}")
def initialize(self, config: Union[Dict, Config]):
    """Initialize and require the ``entry_type`` and ``attribute`` configs.

    Args:
        config: Forwarded to the parent ``initialize``.

    Raises:
        ProcessorConfigError: If ``entry_type`` or ``attribute`` is None
            (checked in that order).
    """
    super().initialize(config)

    # Both keys share the same error template; check them in the
    # original order so the first missing one is reported.
    for required_key in ("entry_type", "attribute"):
        if getattr(self.config, required_key) is None:
            raise ProcessorConfigError(
                f"The `{required_key}` configuration must be "
                "provided and cannot be None.")
def initialize(self, resources: Resources, configs: Config):
    """Initialize and require the Stave database settings.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Must provide non-empty ``stave_db_path`` and
            ``datapack_table`` values.

    Raises:
        ProcessorConfigError: If ``stave_db_path`` or ``datapack_table``
            is empty or unset.
    """
    super().initialize(resources, configs)

    if not configs.stave_db_path:
        raise ProcessorConfigError(
            'The database path to stave is not specified.')

    if not configs.datapack_table:
        # The message used to read "is not stored", which described the
        # wrong problem: the config value is missing, not the table.
        raise ProcessorConfigError(
            'The table name that stores the data pack is not specified.')
def initialize(self, resources: Resources, configs: Config):
    """Initialize and require the Stave database settings.

    ``target_project_name`` is optional; when it is empty an info message
    is logged instead of raising.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Must provide non-empty ``stave_db_path`` and
            ``datapack_table`` values.

    Raises:
        ProcessorConfigError: If ``stave_db_path`` or ``datapack_table``
            is empty or unset.
    """
    super().initialize(resources, configs)

    if not configs.stave_db_path:
        raise ProcessorConfigError(
            'The database path to stave is not specified.')

    if not configs.datapack_table:
        # The message used to read "is not stored", which described the
        # wrong problem: the config value is missing, not the table.
        raise ProcessorConfigError(
            'The table name that stores the data pack is not specified.')

    if not configs.target_project_name:
        logging.info(
            "No project specified, will attempt to read all projects.")
def initialize(self, resources: Resources, configs: Config):
    """Validate the processor list, load predictors and warn on overwrite.

    The Stanford predictor is loaded when ``tag_formalism`` is
    ``"stanford"`` or when ``"srl"`` is among the processors; the SRL
    predictor is loaded only in the latter case. Devices are taken
    round-robin from ``configs["cuda_devices"]``.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Processor configuration.

    Raises:
        ProcessorConfigError: If ``pos``, ``depparse`` or ``srl`` is
            requested without ``tokenize``, or if ``tag_formalism`` is
            not a key of ``MODEL2URL``.
    """
    super().initialize(resources, configs)

    # The original condition tested "depparse" twice; the error text shows
    # the third requirement is "srl".
    needs_tokens = (
        "pos" in configs.processors
        or "depparse" in configs.processors
        or "srl" in configs.processors
    )
    if needs_tokens and "tokenize" not in self.configs.processors:
        raise ProcessorConfigError("tokenize is necessary in "
                                   "configs.processors for "
                                   "pos, depparse or srl")

    cuda_devices = itertools.cycle(configs["cuda_devices"])

    if configs.tag_formalism not in MODEL2URL:
        raise ProcessorConfigError("Incorrect value for tag_formalism")

    # Load each model once. Previously the Stanford model was loaded,
    # then discarded and loaded again whenever "srl" was enabled.
    use_srl = "srl" in configs.processors
    predictors = {}
    if configs.tag_formalism == "stanford" or use_srl:
        predictors["stanford"] = Predictor.from_path(
            configs["stanford_url"], cuda_device=next(cuda_devices))
    if use_srl:
        predictors["srl"] = Predictor.from_path(
            configs["srl_url"], cuda_device=next(cuda_devices))
    if predictors:
        # As before, `self.predictor` is left untouched when no model is
        # required by the configuration.
        self.predictor = predictors

    if configs.overwrite_entries:
        logger.warning("`overwrite_entries` is set to True, this means "
                       "that the entries of the same type as produced by "
                       "this processor will be overwritten if found.")
        if configs.allow_parallel_entries:
            logger.warning("Both `overwrite_entries` (whether to overwrite"
                           " the entries of the same type as produced by "
                           "this processor) and "
                           "`allow_parallel_entries` (whether to allow "
                           "similar new entries when they already exist) "
                           "are True, all existing conflicting entries "
                           "will be deleted.")
    elif not configs.allow_parallel_entries:
        logger.warning("Both `overwrite_entries` (whether to overwrite"
                       " the entries of the same type as produced by "
                       "this processor) and "
                       "`allow_parallel_entries` (whether to allow "
                       "similar new entries when they already exist) "
                       "are False, processor will only run if there "
                       "are no existing conflicting entries.")
def initialize(self, resources: Resources, configs: Config):
    """Load the predictor for ``tag_formalism`` and warn about overwrites.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Processor configuration.

    Raises:
        ProcessorConfigError: If ``tag_formalism`` is not a key of
            ``MODEL2URL``.
    """
    super().initialize(resources, configs)

    if configs.tag_formalism not in MODEL2URL:
        raise ProcessorConfigError('Incorrect value for tag_formalism')

    self.predictor: Predictor = Predictor.from_path(
        MODEL2URL[configs.tag_formalism])

    # Shared lead-in of the two "both flags" warnings below.
    both_flags = (
        "Both `overwrite_entries` (whether to overwrite the entries of "
        "the same type as produced by this processor) and "
        "`allow_parallel_entries` (whether to allow similar new entries "
        "when they already exist) "
    )

    if configs.overwrite_entries:
        logger.warning(
            "`overwrite_entries` is set to True, this means that the "
            "entries of the same type as produced by this processor will "
            "be overwritten if found.")
        if configs.allow_parallel_entries:
            logger.warning(
                both_flags
                + "are True, all existing conflicting entries will be "
                  "deleted.")
    elif not configs.allow_parallel_entries:
        logger.warning(
            both_flags
            + "are False, processor will only run if there are no "
              "existing conflicting entries.")
def initialize(self, config: Config):
    """Initialize and require a ``context_type`` configuration.

    Args:
        config: Forwarded to the parent ``initialize``.

    Raises:
        ProcessorConfigError: If ``self.configs.context_type`` is None.
    """
    super().initialize(config)

    context_is_missing = self.configs.context_type is None
    if context_is_missing:
        raise ProcessorConfigError(
            f"The 'context_type' config of {self.__class__.__name__} "
            f"cannot be None."
        )
def initialize(self, resource: Resources, configs: HParams):
    """Store settings, load the predictor and warn about overwrites.

    ``processors`` falls back to ``self.default_configs()['processors']``
    when it is None or an empty string.

    Args:
        resource: Not used by this method.
        configs: Supplies ``processors``, ``output_format``,
            ``overwrite_entries`` and ``allow_parallel_entries``.

    Raises:
        ProcessorConfigError: If ``output_format`` is not a key of
            ``MODEL2URL``.
    """
    self.processors = configs.processors
    if self.processors is None or self.processors == "":
        self.processors = self.default_configs()['processors']

    if configs.output_format not in MODEL2URL:
        raise ProcessorConfigError('Incorrect value for output_format')
    self.predictor = Predictor.from_path(MODEL2URL[configs.output_format])

    self.overwrite_entries = configs.overwrite_entries
    self.allow_parallel_entries = configs.allow_parallel_entries

    # Shared lead-in of the two "both flags" warnings below.
    both_flags = (
        "Both `overwrite_entries` (whether to overwrite the entries of "
        "the same type as produced by this processor) and "
        "`allow_parallel_entries` (whether to allow similar new entries "
        "when they already exist) "
    )

    if self.overwrite_entries:
        logger.warning(
            "`overwrite_entries` is set to True, this means that the "
            "entries of the same type as produced by this processor will "
            "be overwritten if found.")
        if self.allow_parallel_entries:
            logger.warning(
                both_flags
                + "are True, all existing conflicting entries will be "
                  "deleted.")
    elif not self.allow_parallel_entries:
        logger.warning(
            both_flags
            + "are False, processor will only run if there are no "
              "existing conflicting entries.")
def initialize(self, resources: Resources, configs: Config):
    """Build the request, align the batcher context and register schemes.

    On the first call the entries of ``configs`` are copied into
    ``self._request``, with ``feature_scheme`` parsed into
    ``self._request["schemes"]``. The batcher's ``context_type`` is then
    set to the processor's, the parent ``initialize`` is invoked, and
    every scheme of type ``extractor_utils.DATA_INPUT`` is added to the
    batcher.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Processor configuration, including the ``batcher`` part.

    Raises:
        ProcessorConfigError: If the batcher config names a
            ``context_type`` different from the processor's.
    """
    # `_request_ready` guards against parsing the feature scheme again
    # when `initialize` runs more than once.
    if not self._request_ready:
        for key, value in configs.items():
            if key != "feature_scheme":
                self._request[key] = value
                continue
            self._request["schemes"] = parse_feature_extractors(
                configs.feature_scheme)
        self._request_ready = True

    batcher_config = configs.batcher

    # The batcher must use the same context type as the predictor, so the
    # value is assigned from here unless the batcher disagrees.
    batcher_context = configs["batcher"].get("context_type", None)
    if (batcher_context is not None
            and batcher_context != self._request["context_type"]):
        raise ProcessorConfigError(
            "The 'context_type' configuration value should be the same "
            "for the processor and the batcher, now for the processor the "
            f"value is {self._request['context_type']} and for the "
            f"batcher the value is {batcher_context}. It is also fine if "
            f"this value for batch config is left empty.")
    batcher_config.context_type = self._request["context_type"]

    self.do_eval = configs.do_eval

    # Must come after the batcher config has been completed above.
    super().initialize(resources, configs)

    # Add input features to the batcher.
    for tag, scheme in self._request["schemes"].items():
        if scheme["type"] != extractor_utils.DATA_INPUT:
            continue
        self.batcher.add_feature_scheme(tag, scheme)  # type: ignore
def initialize(self, resources: Resources, configs: Config):
    """Validate processor dependencies, store settings and call ``set_up``.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Supplies ``processors`` and ``lang``.

    Raises:
        ProcessorConfigError: If ``pos`` or ``lemma`` is requested without
            ``tokenize``, or with ``tokenize`` but without ``sentence``.
    """
    super().initialize(resources, configs)

    # NOTE(review): the flattened source leaves the `else` nesting
    # ambiguous; this follows the reading where the `sentence` check
    # applies once `tokenize` is confirmed for pos/lemma — confirm.
    wants_pos_or_lemma = (
        "pos" in configs.processors or "lemma" in configs.processors
    )
    if wants_pos_or_lemma:
        if "tokenize" not in configs.processors:
            raise ProcessorConfigError(
                "tokenize is necessary in configs.processors for "
                "pos or lemma")
        if "sentence" not in configs.processors:
            raise ProcessorConfigError(
                "sentence is necessary in configs.processors for "
                "tokenize or pos or lemma")

    self.processors = configs.processors
    self.lang_model = configs.lang
    self.set_up()
def initialize(self, resources, configs):
    """Initialize, check the ``test`` config and count the call.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Must provide a ``"test"`` entry supporting ``in``.

    Raises:
        ProcessorConfigError: If ``configs["test"]`` contains
            ``"successor"`` but not ``"test"``.
    """
    super().initialize(resources, configs)

    steps = configs["test"]
    if "successor" in steps and "test" not in steps:
        raise ProcessorConfigError(
            '"test" is necessary as the first step for "successor" in '
            'config for test case purpose.'
        )

    # Tracks how many times this instance has been initialized.
    self.initialize_count += 1
def initialize(self, resources: Resources, configs: Optional[Config]):
    """Initialize the processor and its batcher.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Must not be None and must expose a ``batcher`` entry,
            which is passed to ``self.batcher.initialize``.

    Raises:
        ProcessorConfigError: If an ``AttributeError`` is raised while
            reading or applying the batcher config. The original error is
            attached as ``__cause__``.
    """
    super().initialize(resources, configs)

    assert configs is not None
    try:
        self.batcher.initialize(configs.batcher)
    except AttributeError as e:
        # Previously the exception object was passed as the first message
        # argument and the text was garbled; chain it explicitly instead
        # so the traceback shows the real cause.
        raise ProcessorConfigError(
            "Error in handling batcher config, please check the "
            "config to see if you have the key 'batcher'.") from e
def initialize(self, resources: Resources, configs: Optional[Config]):
    """Initialize the processor and hand ``configs.batcher`` to the batcher.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Must not be None; its ``batcher`` entry configures
            ``self.batcher``.

    Raises:
        ProcessorConfigError: If an ``AttributeError`` occurs while
            reading or applying the batcher config (chained as the cause).
    """
    super().initialize(resources, configs)
    assert configs is not None

    failure_message = (
        "Error in handling batcher config, please check the "
        "config of the batcher to see they are correct."
    )
    try:
        # Attribute lookup stays inside the `try`: a missing `batcher`
        # key is reported the same way as a failure in the batcher.
        self.batcher.initialize(configs.batcher)
    except AttributeError as err:
        raise ProcessorConfigError(failure_message) from err
def initialize(self, resources: Resources, configs: Config):
    # pylint: disable=attribute-defined-outside-init
    """Open the Stave SQLite database and load all data packs.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Supplies ``stave_db_path``, ``datapack_table`` and
            ``pack_content_col``.

    Raises:
        ProcessorConfigError: If ``stave_db_path`` is empty or unset.
    """
    super().initialize(resources, configs)

    db_path = configs.stave_db_path
    if not db_path:
        raise ProcessorConfigError(
            'The database path to stave is not specified.')

    # The connection is kept on the instance; this method does not
    # close it.
    self.conn = sqlite3.connect(db_path)
    self.data_packs: Dict[int, DataPack] = load_all_datapacks(
        self.conn,
        configs.datapack_table,
        configs.pack_content_col,
    )
def initialize(self, config: Union[Dict, Config]):
    # pylint: disable=attribute-defined-outside-init
    """Initialize the extractor and resolve the link entry classes.

    After the required keys are checked, ``entry_type`` is resolved with
    ``get_class`` and stored as ``self._entry_class``; its ``ParentType``
    and ``ChildType`` are stored as ``self._parent_class`` and
    ``self._child_class``.

    Args:
        config: Forwarded to the parent ``initialize``.

    Raises:
        ProcessorConfigError: If ``attribute``, ``index_annotation`` or
            ``entry_type`` is None, if the resolved entry class is not a
            ``Link`` subclass, or if its parent/child type is not an
            ``Annotation`` subclass.
    """
    super().initialize(config)

    if self.config.attribute is None:
        raise ProcessorConfigError(
            "'attribute' is required in this extractor.")
    if self.config.index_annotation is None:
        raise ProcessorConfigError(
            "'index_annotation' is required in this extractor.")
    if self.config.entry_type is None:
        raise ProcessorConfigError(
            "'entry_type' is required in this extractor.")

    self._entry_class: Type[Link] = get_class(self.config.entry_type)
    if not issubclass(self._entry_class, Link):
        # Message previously ended in the typo "Link tpe."
        raise ProcessorConfigError("`entry_class` to this extractor "
                                   "must be a Link type.")

    self._parent_class: Type[Annotation] = self._entry_class.ParentType
    if not issubclass(self._parent_class, Annotation):
        raise ProcessorConfigError(
            f"The parent class of the provided {self.config.entry_type}"
            " must be an Annotation.")

    self._child_class: Type[Annotation] = self._entry_class.ChildType
    if not issubclass(self._child_class, Annotation):
        raise ProcessorConfigError(
            f"The child class of the provided {self.config.entry_type}"
            " must be an Annotation.")
def _process(self, input_pack: DataPack):
    """Perform HuggingFace NER Pipeline on the input data pack.

    For every entry of type ``configs.entry_type`` in ``input_pack``, the
    entry text is passed to ``self.classifier``. Each predicted span
    becomes a new entry of type ``configs.output_entry_type``, offset by
    the begin of the source entry, with the predicted label stored in the
    attribute named by ``configs.attribute_name``.

    Args:
        input_pack: Input pack to fill.

    Raises:
        ProcessorConfigError: If ``entry_type`` is empty, or if
            ``tagging_scheme`` is neither ``"bio-merge"`` nor
            ``"no-merge"`` (detected when the first entry is processed).
    """
    if not self.configs.entry_type:
        raise ProcessorConfigError("Please specify an input entry type!")

    output_entry = get_class(self.configs.output_entry_type)

    for entry_specified in input_pack.get(self.configs.entry_type):
        result = self.classifier(entry_specified.text)

        if self.configs.tagging_scheme == "bio-merge":
            # Merge BIO tagging
            result_types, result_indices = self._merge_bio_tokens(result)
        elif self.configs.tagging_scheme == "no-merge":
            result_indices = []
            result_types = []
            for token in result:
                result_types.append(token["entity"])
                result_indices.append((token["start"], token["end"]))
        else:
            # A space was missing between the two fragments, which glued
            # the scheme name onto "was".
            raise ProcessorConfigError(
                f"The tagging_scheme strategy {self.configs.tagging_scheme} "
                f"was not defined. Please check your input config.")

        # Classifier offsets are relative to the entry text; shift them by
        # the entry's begin to get pack-level positions.
        offset = entry_specified.span.begin
        # `label` instead of `type`, to avoid shadowing the builtin.
        for label, (start, end) in zip(result_types, result_indices):
            entity = output_entry(
                pack=input_pack,
                begin=offset + int(start),
                end=offset + int(end),
            )
            setattr(entity, self.configs.attribute_name, label)
def _process_existing_entries(self, input_pack):
    """Handle ``Token``/``Dependency`` entries already present in the pack.

    When such entries exist: with ``overwrite_entries`` set they are
    deleted; otherwise ``allow_parallel_entries`` must be set.

    Args:
        input_pack: The pack to inspect and possibly clean up.

    Raises:
        ProcessorConfigError: If entries exist and neither
            ``overwrite_entries`` nor ``allow_parallel_entries`` is set.
    """
    tokens_exist = any(True for _ in input_pack.get(Token))
    dependencies_exist = any(True for _ in input_pack.get(Dependency))

    if not (tokens_exist or dependencies_exist):
        return

    if self.configs.overwrite_entries:
        # Delete existing tokens and dependencies. The entries are
        # collected into a list first so the pack is not modified while
        # its own iterator is still being consumed.
        for entry_type in (Token, Dependency):
            for entry in list(input_pack.get(entry_type)):
                input_pack.delete_entry(entry)
    elif not self.configs.allow_parallel_entries:
        raise ProcessorConfigError(
            "Found existing entries, either `overwrite_entries` or "
            "`allow_parallel_entries` should be True")
def make_configs(
    cls,
    configs: Optional[Union[Config, Dict[str, Any]]],
) -> Config:
    """
    Create the configuration by merging the provided config with the
    `default_configs`.

    The following config conventions are expected:
      - The top level key can be a special `@config_path`.
      - `@config_path` should point to a file system path, which will
        be a YAML file containing configurations.
      - Other key values in the configs will be considered as parameters.

    Values given directly in `configs` take precedence over values read
    from the YAML file. The caller's `configs` object is not modified.

    Args:
        configs: The input config to be merged with the default config.

    Returns:
        The merged configuration.

    Raises:
        ProcessorConfigError: If building the `Config` raises a
            `ValueError`.
    """
    merged_configs: Dict = {}

    if configs is not None:
        if isinstance(configs, Config):
            configs = configs.todict()
        # Work on a shallow copy: popping `@config_path` below must not
        # remove the key from a dict the caller may reuse.
        configs = dict(configs)

        filebased_configs: Dict = {}
        if configs.get("@config_path", None) is not None:
            with open(configs.pop("@config_path"), encoding="utf-8") as f:
                # `safe_load` returns None for an empty document; treat
                # that as "no file-based settings".
                filebased_configs = yaml.safe_load(f) or {}

        merged_configs.update(filebased_configs)
        merged_configs.update(configs)

    try:
        final_configs = Config(merged_configs,
                               cls._default_configs().todict())
    except ValueError as e:
        raise ProcessorConfigError(
            f"Configuration error for the processor "
            f"{get_full_module_name(cls)}.") from e

    return final_configs
def expected_types_and_attributes(self):
    r"""Return the expected input types and attributes of `RemoteProcessor`.

    The value is fetched from ``{configs.url}/expectation`` on first use
    and cached in ``self._expectation``. It should be the
    `expected_types_and_attributes` of the first processor in the remote
    pipeline.

    Raises:
        ProcessorConfigError: If the reply is not HTTP 200 with a JSON
            ``"status"`` of ``"OK"``.
    """
    # Serve the cached value when it has already been fetched.
    if self._expectation is not None:
        return self._expectation

    response = self._requests.get(f"{self.configs.url}/expectation")
    fetched_ok = (
        response.status_code == 200 and response.json()["status"] == "OK"
    )
    if not fetched_ok:
        raise ProcessorConfigError(
            f"{response.status_code} {response.reason}: "
            "Fail to fetch expected types and attributes from remote "
            "service. Please make sure that the remote service at "
            f"{self.configs.url} is a valid pipeline service that is "
            "up and running.")

    self._expectation = response.json()["expectation"]
    return self._expectation
def initialize(self, resources: Resources, configs: Config):
    """Load the configured predictors and warn about overwrite settings.

    The Stanford predictor is loaded when ``tag_formalism`` is
    ``'stanford'`` or when ``'srl'`` is among the processors; the SRL
    predictor is loaded only in the latter case. Devices are taken
    round-robin from ``configs['cuda_devices']``.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Processor configuration.

    Raises:
        ProcessorConfigError: If ``tag_formalism`` is not a key of
            ``MODEL2URL``.
    """
    super().initialize(resources, configs)

    cuda_devices = itertools.cycle(configs['cuda_devices'])

    if configs.tag_formalism not in MODEL2URL:
        raise ProcessorConfigError('Incorrect value for tag_formalism')

    # Load each model once. Previously the Stanford model was loaded,
    # then discarded and loaded again whenever 'srl' was enabled.
    use_srl = 'srl' in configs.processors
    predictors = {}
    if configs.tag_formalism == 'stanford' or use_srl:
        predictors['stanford'] = Predictor.from_path(
            configs['stanford_url'], cuda_device=next(cuda_devices))
    if use_srl:
        predictors['srl'] = Predictor.from_path(
            configs['srl_url'], cuda_device=next(cuda_devices))
    if predictors:
        # As before, `self.predictor` is left untouched when no model is
        # required by the configuration.
        self.predictor = predictors

    if configs.overwrite_entries:
        logger.warning("`overwrite_entries` is set to True, this means "
                       "that the entries of the same type as produced by "
                       "this processor will be overwritten if found.")
        if configs.allow_parallel_entries:
            logger.warning('Both `overwrite_entries` (whether to overwrite'
                           ' the entries of the same type as produced by '
                           'this processor) and '
                           '`allow_parallel_entries` (whether to allow '
                           'similar new entries when they already exist) '
                           'are True, all existing conflicting entries '
                           'will be deleted.')
    elif not configs.allow_parallel_entries:
        logger.warning('Both `overwrite_entries` (whether to overwrite'
                       ' the entries of the same type as produced by '
                       'this processor) and '
                       '`allow_parallel_entries` (whether to allow '
                       'similar new entries when they already exist) '
                       'are False, processor will only run if there '
                       'are no existing conflicting entries.')
def make_configs(
        cls, configs: Optional[Union[Config, Dict[str, Any]]]) -> Config:
    """
    Create the component configuration for this class, by merging the
    provided config with the ``default_config``.

    The following config conventions are expected:
      - The top level key can be a special `config_path`.
      - `config_path` should point to a file system path, which will
        be a YAML file containing configurations.
      - Other key values in the configs will be considered as parameters.

    Values given directly in `configs` take precedence over values read
    from the YAML file. The caller's `configs` object is not modified.

    Args:
        configs: The input config to be merged with the default config.

    Returns:
        The merged configuration.

    Raises:
        ProcessorConfigError: If building the `Config` raises a
            `ValueError`.
    """
    merged_configs: Dict = {}

    if configs is not None:
        if isinstance(configs, Config):
            configs = configs.todict()
        # Work on a shallow copy: popping `config_path` below must not
        # remove the key from a dict the caller may reuse.
        configs = dict(configs)

        filebased_configs: Dict = {}
        if "config_path" in configs and configs["config_path"] is not None:
            # The file used to be opened without ever being closed.
            with open(configs.pop("config_path"), encoding="utf-8") as f:
                # `safe_load` returns None for an empty document; treat
                # that as "no file-based settings".
                filebased_configs = yaml.safe_load(f) or {}

        merged_configs.update(filebased_configs)
        merged_configs.update(configs)

    try:
        final_configs = Config(merged_configs, cls.default_configs())
    except ValueError as e:
        raise ProcessorConfigError(
            f'Configuration error for the processor '
            f'{get_full_module_name(cls)}.') from e

    return final_configs
def record(self, record_meta: Dict[str, Set[str]]):
    r"""Add the output type records of `RemoteProcessor` to `record_meta`.

    The records are fetched from ``{configs.url}/records`` on first use
    and cached in ``self._records``; they are then merged into
    `record_meta` with ``dict.update``.

    Args:
        record_meta: the field in the datapack for type record that need
            to fill in for consistency checking.

    Raises:
        ProcessorConfigError: If the reply is not HTTP 200 with a JSON
            ``"status"`` of ``"OK"``.
    """
    needs_fetch = self._records is None
    if needs_fetch:
        response = self._requests.get(f"{self.configs.url}/records")
        fetched_ok = (
            response.status_code == 200
            and response.json()["status"] == "OK"
        )
        if not fetched_ok:
            raise ProcessorConfigError(
                f"{response.status_code} {response.reason}: "
                "Fail to fetch records from remote service. Please make "
                f"sure that the remote service at {self.configs.url} is "
                "a valid pipeline service that is up and running.")
        self._records = response.json()["records"]

    record_meta.update(self._records)
def initialize(self, resources: Resources, configs: Config):
    """Validate processor dependencies and build the stanza pipeline.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Supplies ``processors``; ``lang``, ``dir`` and
            ``use_gpu`` are read from ``self.configs``.

    Raises:
        ProcessorConfigError: If ``pos``, ``lemma`` or ``depparse`` is
            requested without ``tokenize``.
    """
    super().initialize(resources, configs)

    token_consumers = ("pos", "lemma", "depparse")
    requires_tokenize = any(
        name in configs.processors for name in token_consumers
    )
    if requires_tokenize and "tokenize" not in configs.processors:
        raise ProcessorConfigError(
            "tokenize is necessary in configs.processors for "
            "pos or lemma or depparse"
        )

    self.set_up()

    pipeline_options = dict(
        lang=self.configs.lang,
        dir=self.configs.dir,
        use_gpu=self.configs.use_gpu,
        processors=self.configs.processors,
    )
    self.nlp = stanza.Pipeline(**pipeline_options)
def initialize(self, resources: Resources, configs: Config):
    # pylint: disable=attribute-defined-outside-init,consider-using-with
    """Open the index files under ``output_dir`` and pick a file suffix.

    Two text files are opened for writing and kept on the instance as
    ``pack_idx_out`` and ``multi_idx_out``; this method does not close
    them. ``self._suffix`` depends on ``serialize_method`` and
    ``zip_pack``.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Forwarded to the parent ``initialize``.

    Raises:
        ProcessorConfigError: If ``output_dir`` is None.
    """
    super().initialize(resources, configs)

    if self.configs.output_dir is None:
        raise ProcessorConfigError(
            "`output_dir` is not specified for the writer.")

    pack_index_path = os.path.join(self.configs.output_dir, self.pack_idx)
    ensure_dir(pack_index_path)
    self.pack_idx_out = open(pack_index_path, "w", encoding="utf-8")

    multi_index_path = os.path.join(
        self.configs.output_dir, self.multi_idx)
    ensure_dir(multi_index_path)
    self.multi_idx_out = open(multi_index_path, "w", encoding="utf-8")

    # Anything other than "jsonpickle" is written with the pickle suffix.
    if self.configs.serialize_method == "jsonpickle":
        plain_suffix = ".json"
    else:
        plain_suffix = ".pickle"
    self._suffix = (
        plain_suffix + ".gz" if self.configs.zip_pack else plain_suffix
    )
def init_from_config(self, configs: List):
    r"""Populate the pipeline from a list of component configurations.

    Each item must have a ``'type'`` key and may have ``'kwargs'`` (used
    to construct the component) and ``'configs'`` (passed along when the
    component is registered). The first item is registered through
    ``set_reader``; every later one through ``add``.

    Args:
        configs: The configs used to initialize the pipeline.

    Raises:
        ProcessorConfigError: If the first component is not a
            ``BaseReader``.
    """
    for position, component_config in enumerate(configs):
        component = create_class_with_kwargs(
            class_name=component_config['type'],
            class_args=component_config.get('kwargs', {}),
        )
        component_settings = component_config.get('configs', {})

        if position > 0:
            # Can be processor, caster, or evaluator
            self.add(component, component_settings)
            continue

        if not isinstance(component, BaseReader):
            raise ProcessorConfigError(
                "The first component of a pipeline must be a reader.")
        self.set_reader(component, component_settings)
def initialize(self, resources: Resources, configs: Config):
    """Validate the project type, fill defaults and create Stave helpers.

    After validation, ``project_configs`` is rebuilt on top of
    ``self._default_project_configs()``, ``multi_ontology`` defaults to an
    empty ``Config`` and ``project_path`` is made absolute (falling back
    to ``project_name``). A ``StaveViewer`` and a ``StaveProjectWriter``
    are then stored on the instance.

    Args:
        resources: Forwarded to the parent ``initialize``.
        configs: Forwarded to the parent ``initialize``.

    Raises:
        ProcessorConfigError: If ``project_type`` is neither
            ``"single_pack"`` nor a ``"multi_pack"`` with
            ``multi_ontology`` set.
    """
    super().initialize(resources, configs)

    # A `multi_pack` project must have `multi_ontology` set.
    project_is_valid = self.configs.project_type == "single_pack" or (
        self.configs.project_type == "multi_pack"
        and self.configs.multi_ontology is not None
    )
    if not project_is_valid:
        raise ProcessorConfigError("Invalid project type configuration.")

    cfg = self.configs

    # Generate default configurations
    cfg.project_configs = Config(
        hparams=cfg.project_configs,
        default_hparams=self._default_project_configs(),
    )
    cfg.multi_ontology = cfg.multi_ontology or Config({}, {})
    cfg.project_path = os.path.abspath(
        cfg.project_path or cfg.project_name)

    self._viewer = StaveViewer(
        project_path=cfg.project_path,
        host=cfg.host,
        port=cfg.port,
        thread_daemon=cfg.server_thread_daemon,
    )

    # Write meta data to project folder
    self._project_writer = StaveProjectWriter(
        project_path=cfg.project_path,
        project_name=cfg.project_name,
        project_type=cfg.project_type,
        ontology=self.resources.get("onto_specs_dict"),
        project_configs=cfg.project_configs.todict(),
        multi_ontology=cfg.multi_ontology.todict(),
    )
def add(
    self,
    component: PipelineComponent,
    config: Optional[Union[Config, Dict[str, Any]]] = None,
    selector: Optional[Selector] = None,
) -> "Pipeline":
    """
    Adds a pipeline component to the pipeline. The pipeline components
    will form a chain based on the insertion order. The customized
    `config` and `selector` (:class:`~forte.data.selector.Selector`)
    will be associated with this particular component. If the `config`
    or the `selector` is not provided, the default ones will be used.

    Here, note that the same component instance can be added multiple
    times to the pipeline. In such cases, the instance will only be
    set up at the first insertion (i.e. its `initialize` function will
    only be called once). The subsequent insertion of the same component
    instance will not change the behavior nor the states of the instance.
    Thus, a different `config` cannot be provided (should be `None`) when
    added the second time, otherwise a `ProcessorConfigError` will be
    thrown. In the case where one wants them to behave differently, a
    different instance should be used.

    All validation happens before any pipeline state is changed, so a
    failed call leaves the pipeline as it was.

    Args:
        component (PipelineComponent): The component to be inserted
            next to the pipeline.
        config (Union[Config, Dict[str, Any]]): The custom configuration
            to be used for the added component. Default None, which
            means the `default_configs()` of the component will be used.
        selector (Selector): The selector used to pick the
            corresponding data pack to be consumed by the component.
            Default None, which means the whole pack will be used.

    Returns:
        The pipeline itself, which enables one to chain the creation of
        the pipeline, i.e., you can do:

        .. code-block:: python

            Pipeline().set_reader(your_reader()).add(
                your_processor()).add(another_processor())

    Raises:
        ProcessFlowException: If `component` is a reader.
        ProcessorConfigError: If the same instance is added again with a
            non-None `config`.
    """
    if isinstance(component, BaseReader):
        raise ProcessFlowException(
            "Reader need to be set via set_reader()")

    # Resolve the configuration first. Both the duplicate check and
    # `make_configs` can raise, and previously they ran after
    # `evaluator_indices` / `_components` had already been modified,
    # leaving the parallel lists out of step.
    is_new_component = component not in self.__component_set
    if is_new_component:
        component_config = component.make_configs(config)
    elif config is None:
        # A `None` placeholder keeps the config list aligned with the
        # component list; this config should not be used.
        component_config = None
    else:
        raise ProcessorConfigError(
            f"The same instance of a component named {component.name}"
            f" has already been added to"
            f" the pipeline, we do not accept a different configuration"
            f" for it. If you would like to use a differently"
            f" configured component, please create another instance."
            f" If you intend to re-use the component instance, please"
            f" do not provide the `config` (or provide a `None`).")

    if isinstance(component, Evaluator):
        # This will ask the job to keep a copy of the gold standard.
        self.evaluator_indices.append(len(self.components))

    self._components.append(component)
    if is_new_component:
        self.__component_set.add(component)
    self.component_configs.append(component_config)

    if selector is None:
        self._selectors.append(self.__default_selector)
    else:
        self._selectors.append(selector)

    return self
def _default_project_configs(self):
    # pylint: disable=line-too-long
    """
    Build the default project configuration from the ontology stored in
    the resources.

    This is translated from JavaScript function `createDefaultConfig` in
    https://github.com/asyml/stave/blob/d82383de3d74bf09c0d30f33d8a902595f5aff80/src/app/pages/Projects.tsx#L140

    Returns:
        configs: A dictionary with the default config for project. It has
        the keys `legendConfigs`, `scopeConfigs`, `layoutConfigs` and
        `remoteConfigs`.

    Raises:
        ProcessorConfigError: If `onto_specs_dict` or `merged_entry_tree`
            is missing from the resources.
    """
    # pylint: enable=line-too-long
    if (not self.resources.contains("onto_specs_dict")
            or not self.resources.contains("merged_entry_tree")):
        raise ProcessorConfigError(
            "onto_specs_dict/merged_entry_tree is not set in resources.")

    ontology = self.resources.get("onto_specs_dict")
    entry_tree = self.resources.get("merged_entry_tree")

    # Legend configs: one record per ontology definition. String-typed
    # attributes are listed (switched off) whenever the definition has a
    # non-empty attribute list.
    legend_configs: Dict[str, Any] = {}
    entry_name_set: Set[str] = set()
    for definition in ontology["definitions"]:
        entry_name = definition["entry_name"]
        entry_name_set.add(entry_name)

        legend: Dict[str, Any] = {"is_selected": False, "is_shown": True}
        if "attributes" in definition and len(definition["attributes"]) > 0:
            legend["attributes"] = {
                attribute["name"]: False
                for attribute in definition["attributes"]
                if attribute["type"] == "str"
            }
        legend_configs[entry_name] = legend

    # Scope configs: visit the tree breadth-first starting from the node
    # found for `forte.data.ontology.top.Annotation`, and register every
    # visited node that is also defined in the ontology.
    scope_configs: Dict[str, Any] = {}
    pending = collections.deque(
        [search(entry_tree.root, "forte.data.ontology.top.Annotation")])
    while pending:
        node = pending.pop()
        if node.name in entry_name_set:
            scope_configs[node.name] = False
        pending.extendleft(node.children)

    configs: Dict[str, Any] = {
        "legendConfigs": legend_configs,
        "scopeConfigs": scope_configs,
        "layoutConfigs": {
            "center-middle": "default-nlp",
            "left": "default-meta",
            "right": "default-attribute",
            "center-bottom": "disable",
        },
        "remoteConfigs": {
            "pipelineUrl": "",
            "doValidation": False,
            "expectedName": "",
            "inputFormat": "string",
            "expectedRecords": {},
        },
    }
    return configs