def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = HParams(
        {
            "input_pack_name": "query",
            "output_pack_name": "output"
        },
        MultiPackSentenceReader.default_hparams())
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)

    config = HParams(
        {
            "model": {
                "name": "bert-base-uncased"
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        },
        None)
    nlp.add_processor(BertBasedQueryCreator(), config=config)
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        query_pack = m_pack.get_pack("query")
        self.assertEqual(len(query_pack.generics), 1)
        self.assertIsInstance(query_pack.generics[0], Query)
        query = query_pack.generics[0].value
        self.assertEqual(query.shape, (1, 768))

def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = HParams(
        {
            "input_pack_name": "input",
            "output_pack_name": "output"
        },
        MultiPackSentenceReader.default_hparams())
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)

    translator_config = HParams(
        {
            "src_language": "de",
            "target_language": "en",
            "in_pack_name": "input",
            "out_pack_name": "result"
        },
        None)
    nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
    nlp.initialize()

    english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        self.assertEqual(set(m_pack._pack_names),
                         set(["input", "output", "result"]))
        self.assertEqual(
            m_pack.get_pack("result").text, english_results[idx] + "\n")

def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    ner_predictor = CoNLLNERPredictor()
    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f" - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))

def test_document_and_passage_mode(self, doc_mode):
    resources: Resources = Resources()
    config: HParams = HParams({"doc_mode": doc_mode}, default_hparams=None)
    self.reader.initialize(resources, config)

    data_packs: List[DataPack] = \
        [data_pack for data_pack in self.reader.iter(self.data_dir, 'dev')]

    # get all queries and all documents
    queries: List[Query] = []
    documents: Dict[str, Document] = dict()
    for data_pack in data_packs:
        query_entries = list(data_pack.get_entries_by_type(Query))
        doc_entries = list(data_pack.get_entries_by_type(Document))

        self.assertTrue(len(query_entries) + len(doc_entries) == 1)

        if len(query_entries) > 0:
            query_entry: Query = query_entries[0]
            queries.append(query_entry)
        else:
            doc_entry: Document = doc_entries[0]
            documents[data_pack.meta.doc_id] = doc_entry

    # match text of documents relevant to the queries to the actual text
    for i, query in enumerate(queries):
        expected_query = self.expected_queries[i]
        expected_ids = self.expected_doc_ids[doc_mode][i]
        self.assertEqual(query.query, expected_query)
        self.assertCountEqual(query.doc_ids["relevant_docs"], expected_ids)
        for doc_id in expected_ids:
            expected_text = self.get_expected_text(doc_id, doc_mode)
            self.assertEqual(documents[doc_id].text, expected_text)

def create_class_with_kwargs(class_name: str, class_args: Dict,
                             h_params: Optional[Dict] = None):
    cls = get_class(class_name)
    if not class_args:
        class_args = {}
    obj = cls(**class_args)

    if h_params is None:
        h_params = {}
    p_params: Dict = {}

    if "config_path" in h_params and h_params["config_path"] is not None:
        filebased_hparams = yaml.safe_load(open(h_params["config_path"]))
    else:
        filebased_hparams = {}
    p_params.update(filebased_hparams)

    p_params.update(h_params.get("overwrite_configs", {}))
    default_processor_hparams = cls.default_hparams()

    processor_hparams = HParams(p_params, default_processor_hparams)
    return obj, processor_hparams

def create_class_with_kwargs(class_name: str, class_args: Dict,
                             h_params: Optional[Dict] = None):
    r"""Create a class instance with the given arguments.

    Args:
        class_name (str): Full name of the class to instantiate.
        class_args (Dict): Keyword arguments passed to the class constructor.
        h_params (Dict): Hyperparameters for the processor.

    Returns:
        The class instance and the hyperparameters for the processor.
    """
    cls = get_class(class_name)
    if not class_args:
        class_args = {}
    obj = cls(**class_args)

    if h_params is None:
        h_params = {}
    p_params: Dict = {}

    if "config_path" in h_params and h_params["config_path"] is not None:
        filebased_hparams = yaml.safe_load(open(h_params["config_path"]))
    else:
        filebased_hparams = {}
    p_params.update(filebased_hparams)

    p_params.update(h_params.get("overwrite_configs", {}))
    default_processor_hparams = cls.default_configs()

    processor_hparams = HParams(p_params, default_processor_hparams)
    return obj, processor_hparams

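# Illustration only (not from the original source): one way the helper above
# might be invoked. The class path, YAML file, and config values are
# hypothetical placeholders; values under "overwrite_configs" take precedence
# over anything loaded from "config_path", which in turn overrides the
# class's default configs.
obj, processor_hparams = create_class_with_kwargs(
    class_name="my_package.MyProcessor",            # hypothetical class path
    class_args={},
    h_params={
        "config_path": "configs/my_processor.yml",  # hypothetical YAML file
        "overwrite_configs": {"batch_size": 8},     # hypothetical override
    })
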
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            # Language code for the language to build the Pipeline
            'lang': lang,
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'),
                  "has relation",
                  colored(link.rel_type, 'green'),
                  "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")

def __init__(self, hparams: Optional[Union[Dict, HParams]] = None):
    super().__init__()
    self._hparams = HParams(hparams=hparams,
                            default_hparams=self.default_hparams())
    self._meta_data: Dict[int, str] = {}

    index_type = self._hparams.index_type
    device = self._hparams.device
    dim = self._hparams.dim

    if device.lower().startswith("gpu"):
        if isinstance(index_type, str) and not index_type.startswith("Gpu"):
            index_type = "Gpu" + index_type

        index_class = utils.get_class(index_type, module_paths=["faiss"])
        gpu_resource = faiss.StandardGpuResources()
        gpu_id = int(device[3:])
        if faiss.get_num_gpus() < gpu_id:
            gpu_id = 0
            logging.warning("Cannot create the index on device %s. "
                            "Total number of GPUs on this machine is "
                            "%s. Using gpu0 for the index.",
                            self._hparams.device, faiss.get_num_gpus())
        config_class_name = \
            self.INDEX_TYPE_TO_CONFIG.get(index_class.__name__)
        config = utils.get_class(config_class_name,
                                 module_paths=["faiss"])()
        config.device = gpu_id
        self._index = index_class(gpu_resource, dim, config)

    else:
        index_class = utils.get_class(index_type, module_paths=["faiss"])
        self._index = index_class(dim)

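# Illustration only (not from the original source): constructing the indexer
# defined above with an explicit hparams dict, assuming default_hparams()
# exposes the "index_type", "dim", and "device" keys read in __init__ and
# that the class above is in scope. With a non-"gpu" device, the CPU branch
# builds faiss.IndexFlatIP(dim); a GPU setup would mirror the
# {"index_type": "GpuIndexFlatIP", "device": "gpu0"} usage shown further
# below in these snippets.
indexer = EmbeddingBasedIndexer(hparams={
    "index_type": "IndexFlatIP",  # plain CPU FAISS index
    "dim": 768,
    "device": "cpu",
})
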
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())

    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))

def __init__(self, cross_pack: bool = True):
    super().__init__(cross_pack)

    self.batch_is_full = False

    default_config = HParams(None, self.default_configs())
    self.input_pack_name = default_config.input_pack_name
    self.batch_size = default_config.batch_size
    self.initialize(default_config)

def set_reader(self, reader: BaseReader,
               config: Optional[Union[HParams, Dict[str, Any]]] = None):
    self._reader = reader

    if config is None:
        config = reader.default_configs()
    config = HParams(config, reader.default_configs())

    self._reader_config = config

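# Illustration only (not from the original source): the HParams merge that
# set_reader() above relies on. A partial user dict is layered over the
# reader's defaults, so unspecified keys keep their default values. The keys
# mirror the MultiPackSentenceReader configs used in the other snippets.
from texar.torch import HParams

defaults = {"input_pack_name": "input", "output_pack_name": "output"}
merged = HParams({"input_pack_name": "query"}, defaults)
assert merged.input_pack_name == "query"    # overridden by the user dict
assert merged.output_pack_name == "output"  # taken from the defaults
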
def __init__(self, cross_pack: bool = True):
    super().__init__(cross_pack)

    # self.instance_num_in_current_batch = 0
    self.batch_is_full = False
    self.last_batch = False

    default_config = HParams(None, self.default_hparams())
    self.input_pack_name = default_config.input_pack_name
    self.batch_size = default_config.batch_size
    self.initialize(default_config)

def setUp(self) -> None:
    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())
    dummy = DummyRelationExtractor()
    config = HParams({"batcher": {"batch_size": 5}},
                     dummy.default_hparams())
    self.nlp.add_processor(dummy, config=config)
    self.nlp.initialize()

    self.data_path = \
        "forte/processors/base/tests/data_samples/ontonotes/00/"

def setUp(self):
    self.stanford_nlp = Pipeline()
    self.stanford_nlp.set_reader(StringReader())
    models_path = os.getcwd()
    config = HParams(
        {
            "processors": "tokenize",
            # Language code for the language to build the Pipeline
            "lang": "en",
            "use_gpu": False
        },
        StandfordNLPProcessor.default_hparams())
    self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                    config=config)
    self.stanford_nlp.initialize()

def test_processor(self, batch_size):
    config = HParams({"batcher": {"batch_size": batch_size}},
                     self.dummy.default_hparams())
    self.nlp.add_processor(NLTKSentenceSegmenter())
    self.nlp.add_processor(self.dummy, config=config)
    self.nlp.initialize()

    sentences = [
        "This tool is called Forte. The goal of this project to "
        "help you build NLP pipelines. NLP has never been made "
        "this easy before."
    ]
    pack = self.nlp.process(sentences)
    sent_len = len(list(pack.get(Sentence)))
    self.assertEqual(
        self.dummy.counter,
        (sent_len // batch_size + (sent_len % batch_size > 0)))

def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = HParams({"input_pack_name": "input",
                             "output_pack_name": "output"},
                            MultiPackSentenceReader.default_hparams())
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        self.assertEqual(m_pack._pack_names, ["input", "output"])
        self.assertEqual(m_pack.get_pack("input").text, texts[idx] + "\n")

def add_processor(self, processor: BaseProcessor,
                  config: Optional[Union[HParams, Dict[str, Any]]] = None,
                  selector: Optional[Selector] = None):
    self._processors_index[processor.component_name] = len(self.processors)
    self._processors.append(processor)

    if config is None:
        config = processor.default_configs()
    config = HParams(config, processor.default_configs())
    self.processor_configs.append(config)

    if selector is None:
        self._selectors.append(DummySelector())
    else:
        self._selectors.append(selector)

def main(dataset_dir: str):
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))

def main():
    import sys
    ner_dir, srl_dir = sys.argv[  # pylint: disable=unbalanced-tuple-unpacking
        1:3]

    output_config = HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    )

    eng_text = "The plain green Norway spruce is displayed in the gallery's " \
               "foyer. Wentworth worked as an assistant to sculptor Henry " \
               "Moore in the late 1960s. His reputation as a sculptor grew " \
               "in the 1980s."

    fr_text = "Van Gogh grandit au sein d'une famille de " \
              "l'ancienne bourgeoisie."

    stanford_nlp_example1('en', eng_text, output_config)
    stanford_nlp_example1('fr', fr_text, output_config)

    string_processor_example(ner_dir, srl_dir)

def initialize(self, config: HParams):
    config_ = HParams(config, self.default_hparams())
    self.batch_size = config_.batch_size
    # self.instance_num_in_current_batch = 0
    self.batch_is_full = False

def __init__(self, cross_pack=True):
    super().__init__(cross_pack)
    # self.instance_num_in_current_batch = 0
    self.batch_is_full = False
    default_config = HParams(None, self.default_hparams())
    self.batch_size = default_config.batch_size

def initialize(self, config: HParams):
    config_ = HParams(config, self.default_hparams())
    self.batch_size = config_.batch_size
    self.batch_is_full = False

def __init__(self, cross_pack=True):
    super().__init__(cross_pack)
    self.batch_is_full = False
    self.last_batch = False
    default_config = HParams(None, self.default_hparams())
    self.batch_size = default_config.batch_size

from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(OntonotesReader())
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the result to the current directory
# and uses the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()
nlp.run(data_path)

def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(reader=MultiPackTerminalReader(),
                              config=config.reader)

    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.translator)
    query_pipeline.add_processor(processor=BertBasedQueryCreator(),
                                 config=config.query_creator)
    query_pipeline.add_processor(processor=SearchProcessor(),
                                 config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(),
        config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))

def __init__(self) -> None:
    super().__init__()
    self.config = HParams(None, self.default_configs())

def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print(f"Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print(f"Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)

        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx+1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #     processor=CoNLLNERPredictor(), config=config.NER,
    #     selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(),
        config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))

import yaml

from termcolor import colored
from texar.torch import HParams

from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from forte.processors import CoNLLNERPredictor, SRLPredictor
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from ft.onto.base_ontology import Token, Sentence, PredicateLink, \
    PredicateMention, PredicateArgument, EntityMention

config = yaml.safe_load(open("config.yml", "r"))
config = HParams(config, default_hparams=None)


def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)
    pl.initialize()

    text = (

def create_hparams(hparams_string=None, verbose=False):
    """Create model hyperparameters. Parse non-default values from the given string."""

    hparams = {
        ################################
        # Experiment Parameters        #
        ################################
        "epochs": 50000,
        "iters_per_checkpoint": 500,
        "seed": 1234,
        "dynamic_loss_scaling": True,
        "fp16_run": False,
        "distributed_run": False,
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "cudnn_enabled": True,
        "cudnn_benchmark": False,
        "ignore_layers": ['speaker_embedding.weight'],

        ################################
        # Data Parameters              #
        ################################
        "training_files": 'filelists/tts_dataset/info_tsukuyomi.txt',
        "validation_files": 'filelists/tts_dataset/info_tsukuyomi.txt',
        "text_cleaners": ['basic_cleaners'],
        "p_arpabet": 1.0,
        "cmudict_path": "data/cmu_dictionary",

        ################################
        # Audio Parameters             #
        ################################
        # changed because librosa outputs waveforms in the (-1, 1) range
        "max_wav_value": 1.0,
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
        "f0_min": 80,
        "f0_max": 880,
        "harm_thresh": 0.25,

        ################################
        # Model Parameters             #
        ################################
        "n_symbols": len(symbols),
        "symbols_embedding_dim": 512,

        # Encoder parameters
        "encoder_kernel_size": 5,
        "encoder_n_convolutions": 3,
        "encoder_embedding_dim": 512,

        # Decoder parameters
        "n_frames_per_step": 1,  # currently only 1 is supported
        "decoder_rnn_dim": 1024,
        "prenet_dim": 256,
        "prenet_f0_n_layers": 1,
        "prenet_f0_dim": 1,
        "prenet_f0_kernel_size": 1,
        "prenet_rms_dim": 0,
        "prenet_rms_kernel_size": 1,
        "max_decoder_steps": 1000,
        "gate_threshold": 0.5,
        "p_attention_dropout": 0.1,
        "p_decoder_dropout": 0.1,
        "p_teacher_forcing": 1.0,

        # Attention parameters
        "attention_rnn_dim": 1024,
        "attention_dim": 128,

        # Location Layer parameters
        "attention_location_n_filters": 32,
        "attention_location_kernel_size": 31,

        # Mel-post processing network parameters
        "postnet_embedding_dim": 512,
        "postnet_kernel_size": 5,
        "postnet_n_convolutions": 5,

        # Speaker embedding
        "n_speakers": 123,
        "speaker_embedding_dim": 128,

        # Reference encoder
        "with_gst": True,
        "ref_enc_filters": [32, 32, 64, 64, 128, 128],
        "ref_enc_size": [3, 3],
        "ref_enc_strides": [2, 2],
        "ref_enc_pad": [1, 1],
        "ref_enc_gru_size": 128,

        # Style Token Layer
        "token_embedding_size": 256,
        "token_num": 10,
        "num_heads": 8,

        ################################
        # Optimization Hyperparameters #
        ################################
        "use_saved_learning_rate": False,
        "learning_rate": 1e-3,
        "learning_rate_min": 1e-5,
        "learning_rate_anneal": 50000,
        "weight_decay": 1e-6,
        "grad_clip_thresh": 1.0,
        "batch_size": 32,
        "mask_padding": True,  # set model's padded outputs to padded values
    }

    """
    if hparams_string:
        tf.compat.v1.logging.info(
            'Parsing command line hparams: %s', hparams_string)
        hparams.parse(hparams_string)

    if verbose:
        tf.compat.v1.logging.info('Final parsed hparams: %s',
                                  hparams.values())
    """

    hparams = HParams(hparams, hparams)

    return hparams

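# Illustration only (not from the original source): texar.torch's HParams
# supports attribute-style access to the values defined above. Passing the
# same dict as both the values and the defaults simply validates it against
# itself, so every key keeps the value written in create_hparams().
hparams = create_hparams()
print(hparams.batch_size)       # 32
print(hparams.n_mel_channels)   # 80
print(hparams.ref_enc_filters)  # [32, 32, 64, 64, 128, 128]
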