def _default_configs(cls) -> Config:
    # pylint: disable=protected-access
    merged = Config(cls.default_configs(), {}, allow_new_hparam=True)
    for base in cls.__bases__:
        if hasattr(base, "_default_configs"):
            merged = Config(
                merged,
                base._default_configs().todict(),  # type: ignore
                allow_new_hparam=True,
            )
            break
    return merged
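# A minimal sketch (hypothetical classes, not from the source) of the merge
# behavior above: the subclass's own defaults win, while keys only present on
# the base are kept because `allow_new_hparam=True` accepts keys absent from
# the first config. This assumes both classes inherit from the configurable
# base that defines `_default_configs` and `default_configs`.
#
#     class BaseProc(Configurable):
#         @classmethod
#         def default_configs(cls):
#             return {"batch_size": 10, "device": "cpu"}
#
#     class MyProc(BaseProc):
#         @classmethod
#         def default_configs(cls):
#             return {"batch_size": 32}
#
#     # MyProc._default_configs() -> {"batch_size": 32, "device": "cpu"}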
def initialize(self, resources: Resources, configs: Config):
    # Populate the _request. The self._request_ready flag helps avoid
    # parsing the feature scheme multiple times during `initialize`.
    if not self._request_ready:
        for key, value in configs.items():
            if key == "feature_scheme":
                self._request["schemes"] = parse_feature_extractors(
                    configs.feature_scheme)
            else:
                self._request[key] = value
        self._request_ready = True

    batcher_config = configs.batcher
    # Assign the context type here to make sure the batcher uses the
    # same context type as the predictor.
    batcher_context = configs["batcher"].get("context_type", None)
    if (batcher_context is None
            or batcher_context == self._request["context_type"]):
        batcher_config.context_type = self._request["context_type"]
    else:
        raise ProcessorConfigError(
            "The 'context_type' configuration value should be the same "
            "for the processor and the batcher, now for the processor the "
            f"value is {self._request['context_type']} and for the "
            f"batcher the value is {batcher_context}. It is also fine if "
            "this value is left empty in the batcher config.")

    self.do_eval = configs.do_eval

    # This needs to be called later since the batcher config needs to be
    # loaded first.
    super().initialize(resources, configs)

    for tag, scheme in self._request["schemes"].items():
        # Add input features to the batcher.
        if scheme["type"] == extractor_utils.DATA_INPUT:
            self.batcher.add_feature_scheme(tag, scheme)  # type: ignore
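# An illustrative (not from the source) predictor configuration that
# satisfies the consistency check above: the batcher either repeats the
# processor's `context_type` or leaves it unset so it is inherited.
example_predictor_config = {
    "context_type": "ft.onto.base_ontology.Sentence",  # example entry type
    "feature_scheme": {},  # scheme definitions, see parse_feature_extractors
    "do_eval": False,
    "batcher": {"batch_size": 16},  # no `context_type`: inherited above
}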
def __init__(self):
    self.resources: Resources = Resources()
    self.configs: Config = Config({}, {})
    # Determine whether to check type consistency between components.
    self._check_type_consistency: bool = False
    # The flag indicating whether the component is initialized.
    self.__is_initialized: bool = False
def __init__(self, config: Optional[Union[Dict, Config]] = None):
    super().__init__()
    self._config = Config(hparams=config,
                          default_hparams=self.default_configs())
    self._meta_data: Dict[int, str] = {}

    index_type = self._config.index_type
    device = self._config.device
    dim = self._config.dim

    if device.lower().startswith("gpu"):
        if isinstance(index_type, str) and not index_type.startswith("Gpu"):
            index_type = "Gpu" + index_type
        index_class = utils.get_class(index_type, module_paths=["faiss"])

        gpu_resource = faiss.StandardGpuResources()
        gpu_id = int(device[3:])
        # GPU ids are zero-based, so a valid id must be strictly less
        # than the total number of GPUs.
        if gpu_id >= faiss.get_num_gpus():
            gpu_id = 0
            logging.warning(
                "Cannot create the index on device %s. "
                "Total number of GPUs on this machine is "
                "%s. Using gpu0 for the index.",
                self._config.device, faiss.get_num_gpus())
        config_class_name = \
            self.INDEX_TYPE_TO_CONFIG.get(index_class.__name__)
        config = utils.get_class(config_class_name,  # type: ignore
                                 module_paths=["faiss"])()
        config.device = gpu_id
        self._index = index_class(gpu_resource, dim, config)
    else:
        index_class = utils.get_class(index_type, module_paths=["faiss"])
        self._index = index_class(dim)
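# Hypothetical usage of the constructor above, assuming it belongs to the
# embedding-based indexer class (called `EmbeddingBasedIndexer` here only
# for illustration). The keys mirror exactly what `__init__` reads; the
# CPU branch builds a plain faiss index of the requested dimensionality.
#
#     indexer = EmbeddingBasedIndexer(config={
#         "index_type": "IndexFlatIP",  # resolved from the faiss module
#         "dim": 768,
#         "device": "cpu",  # or "gpu0" to place the index on GPU 0
#     })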
def initialize(self, resources: Resources, configs: Config):
    self.resources = resources
    self.config = Config(configs, self.default_configs())

    # TODO: At the time of writing, texar provides no way to set the
    # encoder in `texar.torch.modules.classifiers.BertClassifier`.
    # Ideally we should not be changing a private variable.
    # pylint: disable=protected-access
    BERTClassifier._ENCODER_CLASS = BERTEncoder
    # pylint: enable=protected-access

    cache_dir = os.path.join(os.path.dirname(__file__),
                             self.config.model_dir)
    self.device = torch.device('cuda:0') \
        if torch.cuda.is_available() else torch.device('cpu')
    self.model = BERTClassifier(
        pretrained_model_name=self.config.pretrained_model_name,
        cache_dir=cache_dir,
        hparams=self.config).to(self.device)
    self.tokenizer = BERTTokenizer(
        pretrained_model_name=self.config.pretrained_model_name,
        cache_dir=cache_dir,
        hparams=None)
def main(input_path: str, output_path: str, max_packs: int = -1):
    pl = Pipeline[DataPack]()
    pl.set_reader(Mimic3DischargeNoteReader(),
                  config={'max_num_notes': max_packs})
    pl.add(NLTKSentenceSegmenter())

    with open("bio_ner_config.yml", "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)

    pl.add(BERTTokenizer(), config=config.BERTTokenizer)
    pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)
    pl.add(ElasticSearchPackIndexProcessor())
    pl.add(
        PackIdJsonPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
            'drop_record': True,
            'zip_pack': True,
        })
    pl.initialize()

    for idx, pack in enumerate(pl.process_dataset(input_path)):
        if (idx + 1) % 50 == 0:
            print(f"{time.strftime('%m-%d %H:%M')}: "
                  f"Processed {idx + 1} packs")
def initialize(self, config: Optional[Union[Config, Dict]] = None):
    # pylint: disable=attribute-defined-outside-init,unused-argument
    self._config = Config(config, default_hparams=self.default_configs())
    self._user_request = self._config.request
    self._validate_config()
    self._parse_request(self._user_request)
    self._build_vocab()
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = Config(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            # Language code for the language to build the pipeline.
            'lang': lang,
            'use_gpu': False,
        },
        StandfordNLPProcessor.default_configs())
    pl.add(component=StandfordNLPProcessor(models_path), config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'),
                  "has relation",
                  colored(link.rel_type, 'green'),
                  "of parent",
                  colored(parent.text, 'cyan'))
        print("\n----------------------\n")
def _query_tweets(self, query: str):
    """
    This function searches tweets using Tweepy.

    Args:
        query: user's input query for the Twitter API search.

    Returns:
        List of tweets.
    """
    with open(self.configs.credential_file, "r") as f:
        credentials = yaml.safe_load(f)
    credentials = Config(credentials, default_hparams=None)

    auth = tw.OAuthHandler(credentials.consumer_key,
                           credentials.consumer_secret)
    auth.set_access_token(credentials.access_token,
                          credentials.access_token_secret)
    api = tw.API(auth, wait_on_rate_limit=True)

    # Collect tweets.
    tweets = tw.Cursor(
        api.search,
        q=query,
        lang=self.configs.lang,
        since=self.configs.date_since,
        result_type=self.configs.result_type,
        tweet_mode="extended",
    ).items(self.configs.num_tweets_returned)
    return tweets
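# A sketch of the credential file read above; the four keys match the
# attributes accessed on `credentials`. The file name and values are
# placeholders, not from the source.
#
#     # credentials.yml
#     consumer_key: "<consumer-key>"
#     consumer_secret: "<consumer-secret>"
#     access_token: "<access-token>"
#     access_token_secret: "<access-token-secret>"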
def _parse_configs(self, configs):
    parsed_configs = self.default_configs()
    parsed_configs["batch_size"] = configs.batch_size
    parsed_configs["scope"] = get_class(configs.scope)
    parsed_configs["do_eval"] = configs.do_eval
    parsed_configs["feature_scheme"] = {}

    for tag, scheme in configs.feature_scheme.items():
        parsed_configs["feature_scheme"][tag] = {}
        if scheme["type"] == "data_input":
            parsed_configs["feature_scheme"][tag]["type"] = \
                TrainPreprocessor.DATA_INPUT
        elif scheme["type"] == "data_output":
            parsed_configs["feature_scheme"][tag]["type"] = \
                TrainPreprocessor.DATA_OUTPUT

        extractor = get_class(scheme["extractor"]["class_name"])()
        extractor.initialize(config=scheme["extractor"]["config"])
        # Load a pre-built vocabulary from disk if provided.
        if "vocab_path" in scheme["extractor"]:
            with open(scheme["extractor"]["vocab_path"], "rb") as vocab_file:
                extractor.vocab = pickle.load(vocab_file)
        parsed_configs["feature_scheme"][tag]["extractor"] = extractor

        if "converter" not in scheme:
            parsed_configs["feature_scheme"][tag]["converter"] = \
                Converter({})
        else:
            parsed_configs["feature_scheme"][tag]["converter"] = \
                scheme["converter"]

    return Config(parsed_configs, default_hparams=self.default_configs())
def __init__(self, reader, reader_config, indexer_config=None):
    self.reader = reader
    self.reader_config = reader_config
    self.config = indexer_config if indexer_config is not None \
        else self.default_config()
    self.config = Config(self.config, default_hparams=None)
    self.create_pipeline()
def __init__(self):
    super().__init__()
    self.current_batch: Dict = {}
    self.data_pack_pool: List[PackType] = []
    self.current_batch_sources: List[int] = []
    self._cross_pack: bool = True
    self.configs: Config = Config({}, {})
def initialize(self, resources: Resources, configs: Config):
    self.resources = resources
    self.config = Config(configs, self.default_configs())
    self.device = torch.device('cuda:0') \
        if torch.cuda.is_available() else torch.device('cpu')
    self.model = AutoModelForSequenceClassification.from_pretrained(
        self.config.model_name).to(self.device)
    self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
def initialize(self, config: Optional[Union[Config, Dict]] = None):
    self._config = Config(
        config,
        default_hparams=self.default_configs(),
        allow_new_hparam=True,
    )
    self._user_request = self._config.request
    self._validate_config()
    self._parse_request(self._user_request)
    self._build_vocab()
def parse_feature_extractors(scheme_configs: Config) -> Dict[str, Any]:
    feature_requests: Dict[str, Any] = {}

    for tag, scheme_config in scheme_configs.items():
        assert (
            "extractor" in scheme_config
        ), "Field not found for data request scheme: `extractor`"
        assert (
            "type" in scheme_config
        ), "Field not found for data request scheme: `type`"
        assert scheme_config["type"] in [
            "data_input",
            "data_output",
        ], "Type field must be either data_input or data_output."

        feature_requests[tag] = {}

        if scheme_config["type"] == "data_input":
            feature_requests[tag]["type"] = DATA_INPUT
        elif scheme_config["type"] == "data_output":
            feature_requests[tag]["type"] = DATA_OUTPUT

        extractor_class = get_class(scheme_config["extractor"]["class_name"])
        extractor: BaseExtractor = extractor_class()
        if not isinstance(extractor, BaseExtractor):
            raise RuntimeError(
                "Invalid extractor: ", scheme_config["extractor"]
            )
        extractor.initialize(config=scheme_config["extractor"]["config"])

        # Load the vocabulary from disk if provided.
        if "vocab_path" in scheme_config["extractor"]:
            with open(
                scheme_config["extractor"]["vocab_path"], "rb"
            ) as vocab_file:
                extractor.vocab = pickle.load(vocab_file)

        feature_requests[tag]["extractor"] = extractor

        if "converter" not in scheme_config:
            # Create a default converter if none is given.
            feature_requests[tag]["converter"] = Converter({})
        else:
            converter_class = get_class(
                scheme_config["converter"]["class_name"]
            )
            converter: Converter = converter_class()
            if not isinstance(converter, Converter):
                raise RuntimeError(
                    "Invalid converter: ", scheme_config["converter"]
                )
            feature_requests[tag]["converter"] = converter

    return feature_requests
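# An illustrative `scheme_configs` value accepted by the parser above. The
# keys mirror exactly what the function reads; the extractor class path and
# its config are examples only, not taken from the source.
example_scheme_configs = {
    "text_tag": {
        "type": "data_input",
        "extractor": {
            "class_name": "forte.data.extractors.AttributeExtractor",
            "config": {"entry_type": "ft.onto.base_ontology.Token",
                       "attribute": "text"},
            # Optional: "vocab_path" pointing to a pickled vocabulary.
        },
        # "converter" omitted: a default Converter({}) is created.
    },
}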
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())

    with open("config.yml", "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)
    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second)."
    )

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")

        # First method to get entries in a sentence.
        tokens = [
            (token.text, token.pos) for token in pack.get(Token, sentence)
        ]
        entities = [
            (entity.text, entity.ner_type)
            for entity in pack.get(EntityMention, sentence)
        ]
        print(colored("Tokens:", "red"), tokens, "\n")
        print(colored("EntityMentions:", "red"), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", "red"))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()
            child: PredicateArgument = link.get_child()
            print(
                f' - "{child.text}" is role {link.arg_type} of '
                f'predicate "{parent.text}"'
            )
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", "green"))
def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)
    self.resources = resources
    self.config = Config(configs, self.default_configs())

    if not self.config.pretrained_model_name:
        raise ValueError("Please specify a pre-trained BERT model.")

    self.tokenizer = BERTTokenizer(
        pretrained_model_name=self.config.pretrained_model_name,
        cache_dir=None,
        hparams=None,
    )
def initialize(self, config: Union[Dict, Config]):
    self.config = Config(config, self.default_configs())

    if self.config.vocab_method != "custom":
        self._vocab = Vocabulary(
            method=self.config.vocab_method,
            use_pad=self.config.need_pad,
            use_unk=self.config.vocab_use_unk,
            pad_value=self.config.pad_value,
            unk_value=self.config.unk_value,
        )
    else:
        self._vocab = None
    self._vocab_method = self.config.vocab_method
def __init__(self, config: Union[Dict, Config]):
    self.config = Config(config, self.default_configs())

    if self.config.entry_type is None:
        raise AttributeError("entry_type needs to be specified in "
                             "the configuration of an extractor.")

    if self.config.vocab_method != "raw":
        self.vocab: Optional[Vocabulary] = \
            Vocabulary(method=self.config.vocab_method,
                       need_pad=self.config.need_pad,
                       use_unk=self.config.vocab_use_unk)
    else:
        self.vocab = None
def main(args):
    """
    Build a pipeline to process the MS MARCO dataset with
    MSMarcoPassageReader and build an ElasticSearch index.
    """
    config_file = args.config_file
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)

    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(MSMarcoPassageReader())
    pipeline.add(ElasticSearchTextIndexProcessor(),
                 config=config.create_index)
    pipeline.run(args.data_dir)
def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)

    # Validate the multi_pack project config:
    # a `multi_pack` project must have `multi_ontology` set.
    if self.configs.project_type != "single_pack" and (
            self.configs.project_type != "multi_pack"
            or self.configs.multi_ontology is None):
        raise ProcessorConfigError("Invalid project type configuration.")

    # Generate default configurations.
    self.configs.project_configs = Config(
        hparams=self.configs.project_configs,
        default_hparams=self._default_project_configs(),
    )
    self.configs.multi_ontology = \
        self.configs.multi_ontology or Config({}, {})
    self.configs.project_path = os.path.abspath(
        self.configs.project_path or self.configs.project_name)

    self._viewer = StaveViewer(
        project_path=self.configs.project_path,
        host=self.configs.host,
        port=self.configs.port,
        thread_daemon=self.configs.server_thread_daemon,
    )

    # Write metadata to the project folder.
    self._project_writer = StaveProjectWriter(
        project_path=self.configs.project_path,
        project_name=self.configs.project_name,
        project_type=self.configs.project_type,
        ontology=self.resources.get("onto_specs_dict"),
        project_configs=self.configs.project_configs.todict(),
        multi_ontology=self.configs.multi_ontology.todict(),
    )
def make_configs(
        cls,
        configs: Optional[Union[Config, Dict[str, Any]]],
) -> Config:
    """
    Create the configuration by merging the provided config with
    the `default_configs`.

    The following config conventions are expected:
      - The top level key can be a special `@config_path`.
      - `@config_path` should point to a file system path, which
        will be a YAML file containing configurations.
      - Other key values in the configs will be considered as parameters.

    Args:
        configs: The input config to be merged with the default config.

    Returns:
        The merged configuration.
    """
    merged_configs: Dict = {}

    if configs is not None:
        if isinstance(configs, Config):
            configs = configs.todict()

        if configs.get("@config_path", None) is not None:
            with open(configs.pop("@config_path"), encoding="utf-8") as f:
                filebased_configs = yaml.safe_load(f)
        else:
            filebased_configs = {}

        merged_configs.update(filebased_configs)
        merged_configs.update(configs)

    try:
        final_configs = Config(merged_configs,
                               cls._default_configs().todict())
    except ValueError as e:
        raise ProcessorConfigError(
            f"Configuration error for the processor "
            f"{get_full_module_name(cls)}.") from e

    return final_configs
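# Illustrative call following the convention documented above: values from
# the YAML file referenced by `@config_path` are loaded first, then the
# remaining explicit keys override them. `MyProcessor` and the file name
# are placeholders, not from the source.
#
#     configs = MyProcessor.make_configs({
#         "@config_path": "processor_config.yml",
#         "batch_size": 32,  # overrides any value from the YAML file
#     })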
def main(dataset_dir: str):
    with open("config.yml", "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", "red"), pack.pack_name)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", "red"), sent_text, "\n")

            # First method to get entries in a sentence.
            tokens = [
                (token.text, token.pos)
                for token in pack.get(Token, sentence)
            ]
            entities = [
                (entity.text, entity.ner_type)
                for entity in pack.get(EntityMention, sentence)
            ]
            print(colored("Tokens:", "red"), tokens, "\n")
            print(colored("EntityMentions:", "red"), entities, "\n")

            # Second method to get entries in a sentence.
            print(colored("Semantic role labels:", "red"))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(
                    f' - "{child.text}" is role {link.arg_type} of '
                    f'predicate "{parent.text}"'
                )
                entities = [
                    entity.text
                    for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", "green"))
def __init__(self,
             pack_iterator: Iterator[DataPack],
             request: Dict,
             config: Optional[Union[Config, Dict]] = None):
    self._config: Config = \
        Config(config, default_hparams=self.default_configs())
    self._validate_config()

    self._pack_iterator: Iterator[DataPack] = pack_iterator
    self._cached_packs: List[DataPack] = []

    self._user_request: Dict = request
    self._request: Dict = {}
    self._request_ready: bool = False
    self._vocab_ready: bool = False

    self._parse_request(self._user_request)
    self._build_vocab()
def initialize(self, config: Union[Dict, Config]):
    # pylint: disable=attribute-defined-outside-init
    self.config = Config(config, self.default_configs())

    if self.config.entry_type is None:
        raise AttributeError("`entry_type` needs to be specified in "
                             "the configuration of an extractor.")
    self._entry_type = get_class(self.config.entry_type)

    if self.config.vocab_method != "custom":
        self._vocab = Vocabulary(
            method=self.config.vocab_method,
            use_pad=self.config.need_pad,
            use_unk=self.config.vocab_use_unk,
            pad_value=self.config.pad_value,
            unk_value=self.config.unk_value,
        )
    else:
        self._vocab = None
    self._vocab_method = self.config.vocab_method
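# An illustrative config for the `initialize` above, limited to the keys
# the method actually reads. Values are examples only; `entry_type` can be
# any importable entry class path.
example_extractor_config = {
    "entry_type": "ft.onto.base_ontology.Token",
    "vocab_method": "indexing",  # anything but "custom" builds a Vocabulary
    "need_pad": True,
    "vocab_use_unk": True,
    "pad_value": 0,
    "unk_value": 1,
}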
def make_configs(
        cls,
        configs: Optional[Union[Config, Dict[str, Any]]]) -> Config:
    """
    Create the component configuration for this class, by merging the
    provided config with the ``default_config``.

    The following config conventions are expected:
      - The top level key can be a special `config_path`.
      - `config_path` should point to a file system path, which will
        be a YAML file containing configurations.
      - Other key values in the configs will be considered as parameters.

    Args:
        configs: The input config to be merged with the default config.

    Returns:
        The merged configuration.
    """
    merged_configs: Dict = {}

    if configs is not None:
        if isinstance(configs, Config):
            configs = configs.todict()

        if "config_path" in configs and configs["config_path"] is not None:
            with open(configs.pop("config_path")) as f:
                filebased_configs = yaml.safe_load(f)
        else:
            filebased_configs = {}

        merged_configs.update(filebased_configs)
        merged_configs.update(configs)

    try:
        final_configs = Config(merged_configs, cls.default_configs())
    except ValueError as e:
        raise ProcessorConfigError(
            f'Configuration error for the processor '
            f'{get_full_module_name(cls)}.') from e

    return final_configs
def load_pretrained_config(
        self,
        pretrained_model_name: Optional[str] = None,
        cache_dir: Optional[str] = None,
        hparams=None,
):
    r"""Load paths and configurations of the pre-trained model.

    Args:
        pretrained_model_name (optional): A str with the name of a
            pre-trained model to load. If `None`, will use the model
            name in :attr:`hparams`.
        cache_dir (optional): The path to a folder in which the
            fine-tuned model is present.
        hparams (dict or HParams, optional): Hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure
            and default values.
    """
    self.pretrained_model_name = (hparams["pretrained_model_name"]
                                  if pretrained_model_name is None
                                  else pretrained_model_name)

    rel_dir = hparams["model_dir"] if cache_dir is None else cache_dir
    self.cache_dir = os.path.join(os.path.dirname(__file__), rel_dir)

    if self.pretrained_model_name is None or self.cache_dir is None:
        raise ValueError("Pre-trained model name and directory should "
                         "be defined in the fine-tuned BERT model.")

    self.pretrained_model_dir = os.path.join(self.cache_dir,
                                             self.pretrained_model_name)

    pretrained_model_hparams = self._transform_config(
        self.pretrained_model_name, self.pretrained_model_dir)

    super_params = self.default_hparams()
    if "prefix" not in super_params:
        super_params["prefix"] = "_encoder.encoder."

    self._hparams = Config(pretrained_model_hparams, super_params)
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())

    config = Config(
        {
            "processors": "tokenize,pos,lemma,depparse",
            # Language code for the language to build the pipeline.
            "lang": lang,
            "use_gpu": False,
        },
        StandfordNLPProcessor.default_configs(),
    )
    pl.add(component=StandfordNLPProcessor(), config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", "red"), tokens, "\n")

        print(colored("Dependency Relations:", "red"))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(
                colored(child.text, "cyan"),
                "has relation",
                colored(link.rel_type, "green"),  # type: ignore
                "of parent",
                colored(parent.text, "cyan"),
            )
        print("\n----------------------\n")
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs: str, output_path: str):
    # Load redirects.
    print_progress('Loading redirects', '\n')
    logging.info("Loading redirects")
    redirect_pickle = os.path.join(output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)
    print_progress('\nLoading redirects', '\n')
    logging.info("Done loading.")

    # The datasets are read in two steps.
    raw_pack_dir = os.path.join(output_path, 'nif_raw')

    # First, we create the NIF reader that reads the NIF files in order.
    nif_pl = Pipeline[DataPack]()
    nif_pl.resource.update(redirects=redirect_map)
    nif_pl.set_reader(DBpediaWikiReader(), config=Config(
        {
            'redirect_path': redirects,
            'nif_page_structure': nif_page_structure,
            'nif_text_links': nif_text_links,
        },
        DBpediaWikiReader.default_configs()
    ))
    nif_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': raw_pack_dir,
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))
    nif_pl.initialize()

    logging.info('Start running the DBpedia text pipeline.')
    print_progress('Start running the DBpedia text pipeline.', '\n')
    nif_pl.run(nif_context)

    # Second, we add info boxes to the packs with NIF.
    ib_pl = Pipeline[DataPack]()
    ib_pl.resource.update(redirects=redirect_map)
    ib_pl.set_reader(DBpediaInfoBoxReader(), config=Config(
        {
            'pack_index': os.path.join(raw_pack_dir, 'article.idx'),
            'pack_dir': raw_pack_dir,
            'mapping_literals': mapping_literals,
            'mapping_objects': mapping_objects,
            'reading_log': os.path.join(output_path, 'infobox.log'),
        },
        DBpediaInfoBoxReader.default_configs()
    ))
    ib_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': os.path.join(output_path, 'nif_info_box'),
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))

    # Now we run the info box pipeline.
    ib_pl.run(info_boxs)
def __init__(self) -> None:
    super().__init__()
    self.config = Config(None, self.default_configs())