def _parse_configs(self, configs):
    parsed_configs = self.default_configs()
    parsed_configs["batch_size"] = configs.batch_size
    parsed_configs["scope"] = get_class(configs.scope)
    parsed_configs["do_eval"] = configs.do_eval
    parsed_configs["feature_scheme"] = {}

    for tag, scheme in configs.feature_scheme.items():
        parsed_configs["feature_scheme"][tag] = {}
        if scheme["type"] == "data_input":
            parsed_configs["feature_scheme"][tag][
                "type"] = TrainPreprocessor.DATA_INPUT
        elif scheme["type"] == "data_output":
            parsed_configs["feature_scheme"][tag][
                "type"] = TrainPreprocessor.DATA_OUTPUT

        extractor = get_class(scheme["extractor"]["class_name"])()
        extractor.initialize(config=scheme["extractor"]["config"])
        if "vocab_path" in scheme["extractor"]:
            with open(scheme["extractor"]["vocab_path"], "rb") as vocab_file:
                extractor.vocab = pickle.load(vocab_file)
        parsed_configs["feature_scheme"][tag]["extractor"] = extractor

        if "converter" not in scheme:
            parsed_configs["feature_scheme"][tag]["converter"] = Converter({})
        else:
            parsed_configs["feature_scheme"][tag]["converter"] = scheme[
                "converter"]

    return Config(parsed_configs, default_hparams=self.default_configs())
def __init__(self, config: Optional[Union[Dict, Config]] = None):
    super().__init__()
    self._config = Config(hparams=config,
                          default_hparams=self.default_configs())
    self._meta_data: Dict[int, str] = {}

    index_type = self._config.index_type
    device = self._config.device
    dim = self._config.dim

    if device.lower().startswith("gpu"):
        if isinstance(index_type, str) and not index_type.startswith("Gpu"):
            index_type = "Gpu" + index_type

        index_class = utils.get_class(index_type, module_paths=["faiss"])
        gpu_resource = faiss.StandardGpuResources()
        gpu_id = int(device[3:])
        # GPU ids are zero-based, so a machine with N GPUs supports
        # ids 0 .. N-1.
        if gpu_id >= faiss.get_num_gpus():
            gpu_id = 0
            logging.warning(
                "Cannot create the index on device %s. "
                "Total number of GPUs on this machine is "
                "%s. Using gpu0 for the index.",
                self._config.device, faiss.get_num_gpus())
        config_class_name = \
            self.INDEX_TYPE_TO_CONFIG.get(index_class.__name__)
        config = utils.get_class(config_class_name,  # type: ignore
                                 module_paths=["faiss"])()
        config.device = gpu_id
        self._index = index_class(gpu_resource, dim, config)
    else:
        index_class = utils.get_class(index_type, module_paths=["faiss"])
        self._index = index_class(dim)
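# A minimal config sketch for the constructor above. The keys mirror what
# __init__ reads ("index_type", "dim", "device"); the values are illustrative.
# "IndexFlatL2" is a standard faiss index class; with device "gpu0" the code
# prefixes it to "GpuIndexFlatL2" and builds the index on GPU 0, falling back
# to gpu0 with a warning when the requested id is out of range.
indexer_config = {
    "index_type": "IndexFlatL2",
    "dim": 128,            # embedding dimensionality
    "device": "cpu",       # or e.g. "gpu0"
}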
def parse_feature_extractors(scheme_configs: Config) -> Dict[str, Any]:
    feature_requests: Dict[str, Any] = {}

    for tag, scheme_config in scheme_configs.items():
        assert (
            "extractor" in scheme_config
        ), "Field not found for data request scheme: `extractor`"
        assert (
            "type" in scheme_config
        ), "Field not found for data request scheme: `type`"
        assert scheme_config["type"] in [
            "data_input",
            "data_output",
        ], "Type field must be either data_input or data_output."

        feature_requests[tag] = {}

        if scheme_config["type"] == "data_input":
            feature_requests[tag]["type"] = DATA_INPUT
        elif scheme_config["type"] == "data_output":
            feature_requests[tag]["type"] = DATA_OUTPUT

        extractor_class = get_class(scheme_config["extractor"]["class_name"])
        extractor: BaseExtractor = extractor_class()
        if not isinstance(extractor, BaseExtractor):
            raise RuntimeError(
                "Invalid extractor: ", scheme_config["extractor"]
            )
        extractor.initialize(config=scheme_config["extractor"]["config"])

        # Load vocab from disk if provided.
        if "vocab_path" in scheme_config["extractor"]:
            with open(
                scheme_config["extractor"]["vocab_path"], "rb"
            ) as vocab_file:
                extractor.vocab = pickle.load(vocab_file)

        feature_requests[tag]["extractor"] = extractor

        if "converter" not in scheme_config:
            # Create a default converter if none is given.
            feature_requests[tag]["converter"] = Converter({})
        else:
            converter_class = get_class(
                scheme_config["converter"]["class_name"]
            )
            converter: Converter = converter_class()
            if not isinstance(converter, Converter):
                raise RuntimeError(
                    "Invalid converter: ", scheme_config["converter"]
                )
            feature_requests[tag]["converter"] = converter

    return feature_requests
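# An illustrative `scheme_configs` input for parse_feature_extractors above.
# The tag name and the extractor class path are placeholders that show the
# expected shape, not a verified configuration.
example_scheme_configs = {
    "text_tag": {
        "type": "data_input",
        "extractor": {
            "class_name": "forte.data.extractors.AttributeExtractor",
            "config": {
                "entry_type": "ft.onto.base_ontology.Token",
                "attribute": "text",
            },
        },
        # No "converter" given, so a default Converter({}) is created.
    },
}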
def initialize(self, config: Union[Dict, Config]):
    # pylint: disable=attribute-defined-outside-init
    super().initialize(config)
    if self.config.attribute is None:
        raise ProcessorConfigError(
            "'attribute' is required in this extractor.")
    if self.config.index_annotation is None:
        raise ProcessorConfigError(
            "'index_annotation' is required in this extractor.")
    if self.config.entry_type is None:
        raise ProcessorConfigError(
            "'entry_type' is required in this extractor.")
    else:
        self._entry_class: Type[Link] = get_class(self.config.entry_type)
        if not issubclass(self._entry_class, Link):
            raise ProcessorConfigError("`entry_type` to this extractor "
                                       "must be a Link type.")
        self._parent_class: Type[Annotation] = self._entry_class.ParentType
        if not issubclass(self._parent_class, Annotation):
            raise ProcessorConfigError(
                f"The parent class of the provided {self.config.entry_type}"
                " must be an Annotation.")
        self._child_class: Type[Annotation] = self._entry_class.ChildType
        if not issubclass(self._child_class, Annotation):
            raise ProcessorConfigError(
                f"The child class of the provided {self.config.entry_type}"
                " must be an Annotation.")
def create_class_with_kwargs(class_name: str, class_args: Dict,
                             h_params: Optional[Dict] = None):
    cls = get_class(class_name)
    if not class_args:
        class_args = {}
    obj = cls(**class_args)

    p_params: Dict = {}

    if h_params is not None and \
            "config_path" in h_params and \
            h_params["config_path"] is not None:
        with open(h_params["config_path"]) as config_file:
            filebased_hparams = yaml.safe_load(config_file)
    else:
        filebased_hparams = {}
    p_params.update(filebased_hparams)

    if h_params is not None:
        p_params.update(h_params.get("overwrite_configs", {}))
    default_processor_hparams = cls.default_hparams()

    processor_hparams = HParams(p_params, default_processor_hparams)
    return obj, processor_hparams
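# A sketch of calling create_class_with_kwargs above. StringReader is a real
# Forte reader used here only for illustration; "config_path" and
# "overwrite_configs" are the two keys the function reads from h_params.
reader, reader_hparams = create_class_with_kwargs(
    class_name="forte.data.readers.StringReader",
    class_args={},
    h_params={"config_path": None, "overwrite_configs": {}},
)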
def init_from_config(self, configs: Dict):
    """
    Parse the configuration sections from the input config into a list of
    [processor, config] pairs, then initialize the pipeline with the
    configurations.
    """
    if "Reader" not in configs or configs["Reader"] is None:
        raise KeyError("No reader in the configuration.")

    reader_config = configs["Reader"]

    reader, reader_hparams = create_class_with_kwargs(
        class_name=reader_config["type"],
        class_args=reader_config.get("kwargs", {}),
        h_params=reader_config.get("hparams", {}))

    self.set_reader(reader, reader_hparams)

    # HParams cannot create HParams from the inner dict of a list.
    if "Processors" in configs and configs["Processors"] is not None:
        for processor_configs in configs["Processors"]:
            p, processor_hparams = create_class_with_kwargs(
                class_name=processor_configs["type"],
                class_args=processor_configs.get("kwargs", {}),
                h_params=processor_configs.get("hparams", {}))

            selector_hparams = processor_hparams.selector
            selector_class = get_class(selector_hparams['type'])
            selector_kwargs = selector_hparams["kwargs"]
            selector = selector_class(**selector_kwargs)

            self.add_processor(p, processor_hparams, selector)

    self.initialize()
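# An illustrative `configs` dict for init_from_config above, showing the
# expected nesting: a "Reader" section plus a "Processors" list whose items
# each carry "type", optional "kwargs", and optional "hparams". The processor
# class path is a hypothetical placeholder.
example_configs = {
    "Reader": {
        "type": "forte.data.readers.StringReader",
        "kwargs": {},
    },
    "Processors": [
        {
            "type": "my_project.processors.MyProcessor",  # hypothetical
            "kwargs": {},
            "hparams": {"overwrite_configs": {}},
        },
    ],
}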
def initialize(self, config: Config):
    super().initialize(config)
    if config["context_type"] is None:
        raise AttributeError("'context_type' cannot be None.")
    if config["batch_size"] is None:
        raise AttributeError("'batch_size' cannot be None.")

    if isinstance(config["context_type"], str):
        self._context_type = get_class(config["context_type"])
    else:
        self._context_type = config["context_type"]

    if not issubclass(self._context_type, Annotation):
        raise ValidationError(
            f"The provided context type {self._context_type} "
            f"is not an Annotation type."
        )

    self.batch_size = config["batch_size"]
    self.instance_pool.clear()
    self.feature_pool.clear()
    self.pool_size = 0
    self.batch_is_full = False
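# An illustrative config for the batcher initialize above. "context_type"
# may be a class path string (resolved via get_class) or the class object
# itself; either way it must resolve to an Annotation subclass.
batcher_config = {
    "context_type": "ft.onto.base_ontology.Sentence",
    "batch_size": 32,
}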
def _process(self, input_pack: DataPack):
    for type_name in self.configs.removal_types:
        type_cls = get_class(type_name)
        # Note: we cannot delete during iteration, which may cause
        # unexpected behavior in the iterator, so materialize the
        # entries into a list first.
        for t in list(input_pack.get(type_cls)):
            input_pack.delete_entry(t)
def init_from_config(self, configs: Dict):
    """
    Initialize the pipeline with the configurations.

    Args:
        configs: The configurations used to create the pipeline.

    Returns:

    """
    if "Reader" not in configs or configs["Reader"] is None:
        raise KeyError("No reader in the configuration.")

    reader_config = configs["Reader"]

    reader, reader_hparams = create_class_with_kwargs(
        class_name=reader_config["type"],
        class_args=reader_config.get("kwargs", {}),
        h_params=reader_config.get("hparams", {}))

    self.set_reader(reader, reader_hparams)

    if "Processors" in configs and configs["Processors"] is not None:
        for processor_configs in configs["Processors"]:
            p_class = get_class(processor_configs["type"])
            if processor_configs.get("kwargs"):
                processor_kwargs = processor_configs["kwargs"]
            else:
                processor_kwargs = {}
            p = p_class(**processor_kwargs)

            hparams: Dict = {}

            if processor_configs.get("hparams"):
                # Extract the hparams section and build hparams.
                processor_hparams = processor_configs["hparams"]

                if processor_hparams.get("config_path"):
                    with open(processor_hparams["config_path"]) as f:
                        filebased_hparams = yaml.safe_load(f)
                else:
                    filebased_hparams = {}
                hparams.update(filebased_hparams)

                if processor_hparams.get("overwrite_configs"):
                    overwrite_hparams = processor_hparams[
                        "overwrite_configs"]
                else:
                    overwrite_hparams = {}
                hparams.update(overwrite_hparams)

            default_processor_hparams = p_class.default_hparams()
            processor_hparams = HParams(hparams,
                                        default_processor_hparams)
            self.add_processor(p, processor_hparams)

    self.initialize()
def initialize(self, config: Union[Dict, Config]):
    # pylint: disable=attribute-defined-outside-init
    super().initialize(config=config)
    if self.config.attribute is None:
        raise AttributeError("attribute is required "
                             "in BioSeqTaggingExtractor.")
    if not self.config.tagging_unit:
        raise AttributeError("tagging_unit is required in "
                             "BioSeqTaggingExtractor.")
    self.attribute: str = self.config.attribute
    self.tagging_unit: Type[Annotation] = get_class(
        self.config.tagging_unit)
    self.is_bert: bool = self.config.is_bert
def initialize(self, config: Union[Dict, Config]):
    """
    Initialize the extractor based on the provided configuration.

    Args:
        config: The configuration of the extractor, it can be a `Dict` or
            :class:`~forte.common.configuration.Config`. See
            :meth:`default_configs` for available options and
            default values.
    """
    # pylint: disable=attribute-defined-outside-init
    super().initialize(config=config)
    if self.config.attribute is None:
        raise AttributeError("attribute is required "
                             "in BioSeqTaggingExtractor.")
    if not self.config.tagging_unit:
        raise AttributeError("tagging_unit is required in "
                             "BioSeqTaggingExtractor.")
    self._attribute: str = self.config.attribute
    self._tagging_unit: Type[Annotation] = get_class(
        self.config.tagging_unit)
    self._entry_type: Type[Annotation] = get_class(self.config.entry_type)
def initialize(self, resources: Resources, configs: Config):
    # pylint: disable=attribute-defined-outside-init,unused-argument
    r"""Initialize the evaluator with `resources` and `configs`.
    This method is called by the pipeline during the initialization.

    Args:
        resources (Resources): An object of class
            :class:`~forte.common.Resources` that holds references to
            objects that can be shared throughout the pipeline.
        configs (Config): A configuration to initialize the
            evaluator. This evaluator is expected to hold the
            following (key, value) pairs:

            - `"entry_type"` (str): The entry to be evaluated.
            - `"tagging_unit"` (str): The tagging unit that the
              evaluation is performed on,
              e.g. `"ft.onto.base_ontology.Sentence"`.
            - `"attribute"` (str): The attribute of the entry to be
              evaluated.
    """
    super().initialize(resources, configs)
    self.entry_type = get_class(configs.entry_type)
    self.tagging_unit = get_class(configs.tagging_unit)
    self.attribute = configs.attribute
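# A config matching the (key, value) pairs documented in the docstring above.
# The ontology paths are standard ft.onto types, used here as plausible
# example values rather than required ones.
evaluator_config = {
    "entry_type": "ft.onto.base_ontology.EntityMention",
    "tagging_unit": "ft.onto.base_ontology.Sentence",
    "attribute": "ner_type",
}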
def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)
    if not self.configs.entry_type:
        raise ProcessorConfigError("Please specify an entity mention type!")
    self.entry_type = get_class(self.configs.entry_type)
    if not issubclass(self.entry_type, Annotation):
        raise AttributeError(
            f"The entry type to delete [{self.entry_type}] "
            f"is not a sub-class of "
            f"'forte.data.ontology.top.Annotation' class."
        )
def init_from_config(self, configs: Dict):
    """
    Initialize the pipeline with the configurations.

    Args:
        configs: The configurations used to create the pipeline.

    Returns:

    """
    # HParams cannot create HParams from the inner dict of a list.
    if "Processors" in configs and configs["Processors"] is not None:
        for processor_configs in configs["Processors"]:
            p_class = get_class(processor_configs["type"])
            if processor_configs.get("kwargs"):
                processor_kwargs = processor_configs["kwargs"]
            else:
                processor_kwargs = {}
            p = p_class(**processor_kwargs)

            hparams: Dict = {}

            if processor_configs.get("hparams"):
                # Extract the hparams section and build hparams.
                processor_hparams = processor_configs["hparams"]

                if processor_hparams.get("config_path"):
                    with open(processor_hparams["config_path"]) as f:
                        filebased_hparams = yaml.safe_load(f)
                else:
                    filebased_hparams = {}
                hparams.update(filebased_hparams)

                if processor_hparams.get("overwrite_configs"):
                    overwrite_hparams = processor_hparams[
                        "overwrite_configs"]
                else:
                    overwrite_hparams = {}
                hparams.update(overwrite_hparams)

            default_processor_hparams = p_class.default_hparams()
            processor_hparams = HParams(hparams,
                                        default_processor_hparams)
            self.add_processor(p, processor_hparams)

    self.initialize()
def initialize(self, config: Union[Dict, Config]):
    # pylint: disable=attribute-defined-outside-init
    self.config = Config(config, self.default_configs())

    if self.config.entry_type is None:
        raise AttributeError("`entry_type` needs to be specified in "
                             "the configuration of an extractor.")

    self._entry_type = get_class(self.config.entry_type)

    if self.config.vocab_method != "custom":
        self._vocab = Vocabulary(
            method=self.config.vocab_method,
            use_pad=self.config.need_pad,
            use_unk=self.config.vocab_use_unk,
            pad_value=self.config.pad_value,
            unk_value=self.config.unk_value,
        )
    else:
        self._vocab = None
    self._vocab_method = self.config.vocab_method
def _process(self, input_pack: DataPack):
    """Perform the HuggingFace NER pipeline on the input data pack.

    Args:
        input_pack: Input pack to fill.

    Returns:

    """
    if not self.configs.entry_type:
        raise ProcessorConfigError("Please specify an input entry type!")
    output_entry = get_class(self.configs.output_entry_type)

    for entry_specified in input_pack.get(self.configs.entry_type):
        result = self.classifier(entry_specified.text)

        if self.configs.tagging_scheme == "bio-merge":
            # Merge BIO tagging.
            result_types, result_indices = self._merge_bio_tokens(result)
        elif self.configs.tagging_scheme == "no-merge":
            result_indices = []
            result_types = []
            for token in result:
                start, end = token["start"], token["end"]
                result_types.append(token["entity"])
                result_indices.append((start, end))
        else:
            raise ProcessorConfigError(
                f"The tagging_scheme strategy "
                f"{self.configs.tagging_scheme} was not defined. "
                f"Please check your input config.")

        for entity_type, (start, end) in zip(result_types, result_indices):
            entity = output_entry(
                pack=input_pack,
                begin=entry_specified.span.begin + int(start),
                end=entry_specified.span.begin + int(end),
            )
            setattr(entity, self.configs.attribute_name, entity_type)
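# A sketch of the `result` returned per entry by a HuggingFace
# token-classification pipeline, using the standard transformers output keys
# ("entity", "score", "word", "start", "end"). Under "no-merge" each token
# below becomes its own entity; "bio-merge" is expected to fuse the B-/I-
# pair into a single "PER" span covering (0, 12) via _merge_bio_tokens.
result = [
    {"entity": "B-PER", "score": 0.998, "word": "Barack", "start": 0, "end": 6},
    {"entity": "I-PER", "score": 0.997, "word": "Obama", "start": 7, "end": 12},
]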
def get(  # type: ignore
    self,
    entry_type: Union[str, Type[EntryType]],
    components: Optional[Union[str, List[str]]] = None,
    include_sub_type=True,
) -> Iterator[EntryType]:
    """Get entries of `entry_type` from this multi pack.

    Example:

        .. code-block:: python

            for relation in pack.get(
                CrossDocEntityRelation, components="relation_creator"
            ):
                print(relation.get_parent())

        In the above code snippet, we get entries of type
        ``CrossDocEntityRelation`` which were generated by a component
        named ``relation_creator``.

    Args:
        entry_type (type): The type of the entries requested.
        components (str or list, optional): The component generating the
            entries requested. If `None`, all valid entries generated by
            any component will be returned.
        include_sub_type (bool): whether to return the sub types of the
            queried `entry_type`. True by default.

    Returns:
        An iterator of the entries matching the arguments, following
        the order of entries (first sort by entry comparison, then by
        insertion).
    """
    entry_type_: Type[EntryType]
    if isinstance(entry_type, str):
        entry_type_ = get_class(entry_type)
        if not issubclass(entry_type_, Entry):
            raise AttributeError(
                f"The specified entry type [{entry_type}] "
                f"does not correspond to a "
                f"'forte.data.ontology.core.Entry' class"
            )
    else:
        entry_type_ = entry_type

    entry_iter: Iterator[Entry]
    if not include_sub_type:
        entry_iter = self.get_entries_of(entry_type_)
    elif issubclass(entry_type_, MultiPackLink):
        entry_iter = self.links
    elif issubclass(entry_type_, MultiPackGroup):
        entry_iter = self.groups
    elif issubclass(entry_type_, MultiPackGeneric):
        entry_iter = self.generics
    else:
        raise ValueError(
            f"The entry type: {entry_type_} is not supported by MultiPack."
        )

    all_types: Set[Type]
    if include_sub_type:
        all_types = self._expand_to_sub_types(entry_type_)

    if components is not None:
        if isinstance(components, str):
            components = [components]

    for e in entry_iter:
        # Check the type match when sub types are also requested.
        if include_sub_type and type(e) not in all_types:
            continue
        # Check the component.
        if components is not None:
            if not self.is_created_by(e, components):
                continue
        yield e  # type: ignore
def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)
    for entry_type, entry_attributes in self.configs.requests.items():
        entry_class = get_class(entry_type)
        self.fields[entry_class] = entry_attributes
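# An illustrative `requests` section for the initialize above: each key is
# an entry type path (resolved with get_class) mapping to the attributes of
# interest for that type. The exact paths and attributes are assumptions
# chosen for illustration.
example_configs = {
    "requests": {
        "ft.onto.base_ontology.Sentence": ["speaker"],
        "ft.onto.base_ontology.Token": ["pos"],
    },
}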
def initialize(self, resources: Resources, configs: HParams):
    super().initialize(resources, configs)
    cls = utils.get_class(self.config.indexer.name,
                          module_paths=["forte.indexers"])
    self.indexer = cls(hparams=self.config.indexer.hparams)
def _parse_request(self, request: Dict):
    """
    This method has two responsibilities:
    1. Parse the given data request and store it internally.
    2. Validate that the given data request is valid.
    """
    parsed_request: Dict[str, Any] = {}

    assert "scope" in request, "Field not found for data request: `scope`"
    assert (
        "feature_scheme" in request
    ), "Field not found for data request: `feature_scheme`"

    parsed_request["scope"] = get_class(request["scope"])
    parsed_request["schemes"] = {}

    # Used to check the dependencies between different extractors.
    scheme_group: Dict[str, Dict] = {"dependent": {}, "dependee": {}}

    for tag, scheme in request["feature_scheme"].items():
        assert (
            "extractor" in scheme
        ), "Field not found for data request scheme: `extractor`"
        parsed_request["schemes"][tag] = {}

        assert (
            "type" in scheme
        ), "Field not found for data request scheme: `type`"
        assert scheme["type"] in [
            "data_input",
            "data_output",
        ], "Type field must be either data_input or data_output."
        if scheme["type"] == "data_input":
            parsed_request["schemes"][tag][
                "type"
            ] = TrainPreprocessor.DATA_INPUT
        if scheme["type"] == "data_output":
            parsed_request["schemes"][tag][
                "type"
            ] = TrainPreprocessor.DATA_OUTPUT

        extractor_class = scheme["extractor"]["class_name"]
        # Instantiate once, then validate the instance.
        extractor: BaseExtractor = get_class(extractor_class)()
        if not isinstance(extractor, BaseExtractor):
            raise RuntimeError("Invalid extractor: ", scheme["extractor"])
        extractor.initialize(config=scheme["extractor"]["config"])
        parsed_request["schemes"][tag]["extractor"] = extractor

        # Track the dependencies between extractors.
        if hasattr(extractor, "based_on"):
            if extractor.entry_type not in scheme_group["dependent"]:
                scheme_group["dependent"][extractor.entry_type] = set()
            scheme_group["dependent"][extractor.entry_type].add(extractor)
        else:
            if extractor.entry_type not in scheme_group["dependee"]:
                scheme_group["dependee"][extractor.entry_type] = set()
            scheme_group["dependee"][extractor.entry_type].add(extractor)

        # Create a default converter if there is no given converter.
        if "converter" not in scheme:
            converter: Converter = Converter({})
            parsed_request["schemes"][tag]["converter"] = converter

    # Check dependencies: every `based_on` entry must be produced by some
    # other extractor in the request.
    for _, dependent_extractors in scheme_group["dependent"].items():
        for dependent_extractor in dependent_extractors:
            based_on: Entry = dependent_extractor.based_on
            if based_on not in scheme_group["dependee"]:
                raise ValueError(
                    "Extractor {} needs the entry {} to do extraction "
                    "processing but it is not extracted by any other "
                    "extractors given in request".format(
                        dependent_extractor.tag, based_on
                    )
                )

    self._request = parsed_request
    self._request_ready = True
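# A sketch of a request accepted by _parse_request above, following its
# assertions: a top-level "scope" and "feature_scheme", with per-tag "type"
# and "extractor" sections. Class paths and config values are illustrative,
# not a verified configuration.
example_request = {
    "scope": "ft.onto.base_ontology.Sentence",
    "feature_scheme": {
        "text_tag": {
            "type": "data_input",
            "extractor": {
                "class_name": "forte.data.extractors.AttributeExtractor",
                "config": {
                    "entry_type": "ft.onto.base_ontology.Token",
                    "attribute": "text",
                },
            },
        },
        "ner_tag": {
            "type": "data_output",
            "extractor": {
                "class_name":
                    "forte.data.extractors.BioSeqTaggingExtractor",
                "config": {
                    "entry_type": "ft.onto.base_ontology.EntityMention",
                    "attribute": "ner_type",
                    "tagging_unit": "ft.onto.base_ontology.Token",
                },
            },
        },
    },
}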
def _process(self, input_pack: DataPack):
    for type_name in self.configs.removal_types:
        type_cls = get_class(type_name)
        # Materialize the entries before deleting: removing entries while
        # iterating over the live iterator can cause unexpected behavior.
        for t in list(input_pack.get(type_cls)):
            input_pack.delete_entry(t)