def main():
    # Load the per-component YAML config files.
    config_data = yaml.safe_load(open("config_data.yml", "r"))
    config_model = yaml.safe_load(open("config_model.yml", "r"))
    config_evaluator = yaml.safe_load(open("config_evaluator.yml", "r"))
    config_preprocess = yaml.safe_load(open("config_preprocessor.yml", "r"))

    # Collect all the configs into a single Config object.
    config = Config({}, default_hparams=None)
    config.add_hparam("config_data", config_data)
    config.add_hparam("config_model", config_model)
    config.add_hparam("preprocessor", config_preprocess)
    config.add_hparam("reader", {})
    config.add_hparam("evaluator", config_evaluator)

    reader = CoNLL03Reader()

    # Keep the vocabulary processor as a simple counter.
    vocab_processor = CoNLL03VocabularyProcessor()

    ner_trainer = CoNLLNERTrainer()
    ner_predictor = CoNLLNERPredictor()
    ner_evaluator = CoNLLNEREvaluator()

    train_pipe = TrainPipeline(
        train_reader=reader,
        trainer=ner_trainer,
        dev_reader=reader,
        configs=config,
        preprocessors=[vocab_processor],
        predictor=ner_predictor,
        evaluator=ner_evaluator,
    )
    train_pipe.run()
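# A minimal sketch of invoking the script above; it assumes the four YAML
# config files sit in the current working directory.
if __name__ == "__main__":
    main()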
def setUp(self):
    self.config = {
        "max_char_length": 45,
        "train_path": "data_samples/train_pipeline_test",
        "val_path": "data_samples/train_pipeline_test",
        "num_epochs": 1,
        "batch_size_tokens": 5,
        "learning_rate": 0.01,
        "momentum": 0.9,
        "nesterov": True,
    }

    text_extractor: AttributeExtractor = AttributeExtractor(
        config={
            "entry_type": Token,
            "vocab_method": "indexing",
            "attribute": "text",
        }
    )

    char_extractor: CharExtractor = CharExtractor(
        config={
            "entry_type": Token,
            "vocab_method": "indexing",
            "max_char_length": self.config["max_char_length"],
        }
    )

    # Add output part in request based on different task type
    ner_extractor: BioSeqTaggingExtractor = BioSeqTaggingExtractor(
        config={
            "entry_type": EntityMention,
            "attribute": "ner_type",
            "tagging_unit": Token,
            "vocab_method": "indexing",
        }
    )

    self.tp_request = {
        "scope": Sentence,
        "schemes": {
            "text_tag": {
                "type": TrainPreprocessor.DATA_INPUT,
                "extractor": text_extractor,
            },
            "char_tag": {
                "type": TrainPreprocessor.DATA_INPUT,
                "extractor": char_extractor,
            },
            "ner_tag": {
                "type": TrainPreprocessor.DATA_OUTPUT,
                "extractor": ner_extractor,
            },
        },
    }

    self.reader = CoNLL03Reader()
    self.evaluator = CoNLLNEREvaluator()

    self.tp_config = {
        "dataset": {"batch_size": self.config["batch_size_tokens"]}
    }

    train_pl: Pipeline = Pipeline()
    train_pl.set_reader(self.reader)
    train_pl.initialize()
    pack_iterator: Iterator[PackType] = train_pl.process_dataset(
        self.config["train_path"]
    )

    self.train_preprocessor = TrainPreprocessor(
        pack_iterator=pack_iterator,
        request=self.tp_request,
        config=self.tp_config,
    )
def setUp(self):
    root_path = os.path.abspath(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir
        )
    )
    self.config = {
        "max_char_length": 45,
        "train_path": os.path.join(
            root_path, "data_samples/train_pipeline_test"
        ),
        "val_path": os.path.join(
            root_path, "data_samples/train_pipeline_test"
        ),
        "num_epochs": 1,
        "batch_size_tokens": 5,
        "learning_rate": 0.01,
        "momentum": 0.9,
        "nesterov": True,
    }

    text_extractor = (
        "forte.data.extractors.attribute_extractor.AttributeExtractor"
    )
    text_extractor_config = {
        "entry_type": "ft.onto.base_ontology.Token",
        "vocab_method": "indexing",
        "attribute": "text",
    }

    char_extractor = "forte.data.extractors.char_extractor.CharExtractor"
    char_extractor_config = {
        "entry_type": "ft.onto.base_ontology.Token",
        "vocab_method": "indexing",
        "max_char_length": self.config["max_char_length"],
    }

    # Add output part in request based on different task type
    ner_extractor = "forte.data.extractors.seqtagging_extractor.BioSeqTaggingExtractor"  # pylint: disable=line-too-long
    ner_extractor_config = {
        "entry_type": "ft.onto.base_ontology.EntityMention",
        "attribute": "ner_type",
        "tagging_unit": "ft.onto.base_ontology.Token",
        "vocab_method": "indexing",
    }

    self.tp_request = {
        "scope": "ft.onto.base_ontology.Sentence",
        "feature_scheme": {
            "text_tag": {
                "type": "data_input",
                "extractor": {
                    "class_name": text_extractor,
                    "config": text_extractor_config,
                },
            },
            "char_tag": {
                "type": "data_input",
                "extractor": {
                    "class_name": char_extractor,
                    "config": char_extractor_config,
                },
            },
            "ner_tag": {
                "type": "data_output",
                "extractor": {
                    "class_name": ner_extractor,
                    "config": ner_extractor_config,
                },
            },
        },
    }

    self.tp_config = {
        "request": self.tp_request,
        "dataset": {"batch_size": self.config["batch_size_tokens"]},
    }

    self.reader = CoNLL03Reader()
    self.evaluator = CoNLLNEREvaluator()

    train_pl: Pipeline = Pipeline()
    train_pl.set_reader(self.reader)
    train_pl.initialize()
    pack_iterator: Iterator[PackType] = train_pl.process_dataset(
        self.config["train_path"]
    )

    self.train_preprocessor = TrainPreprocessor(pack_iterator=pack_iterator)
    self.train_preprocessor.initialize(config=self.tp_config)
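# A minimal sketch of a test consuming the preprocessor built in setUp above.
# `get_train_batch_iterator()` follows Forte's TrainPreprocessor API, and the
# per-tag "data" key mirrors Forte's tagging example; both are assumptions to
# verify against the installed Forte version.
def test_iterate_one_batch(self):
    for batch in self.train_preprocessor.get_train_batch_iterator():
        word_ids = batch["text_tag"]["data"]  # padded token-id tensor
        char_ids = batch["char_tag"]["data"]  # padded char-id tensor
        ner_ids = batch["ner_tag"]["data"]    # padded BIO tag-id tensor
        # Input and output tensors should agree on the batch dimension.
        self.assertEqual(word_ids.shape[0], ner_ids.shape[0])
        break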
def test_Predictor(self):
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(CoNLL03Reader())
    pipeline.initialize()

    # Build the vocabularies by scanning the dataset once per extractor.
    text_extractor = AttributeExtractor(
        {
            "need_pad": True,
            "entry_type": Token,
            "attribute": "text",
        }
    )
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            text_extractor.update_vocab(pack, instance)

    ner_extractor = BioSeqTaggingExtractor(
        {
            "entry_type": EntityMention,
            "need_pad": True,
            "attribute": "ner_type",
            "tagging_unit": Token,
        }
    )
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ner_extractor.update_vocab(pack, instance)

    expected_ners = [
        ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)
    ]

    model = DummyModel()

    predictor_pipeline = Pipeline[DataPack]()
    predictor_pipeline.set_reader(CoNLL03Reader())

    predictor_config = {
        "scope": Sentence,
        "batch_size": 2,
        "feature_scheme": {
            "text_tag": {
                "extractor": text_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_INPUT,
            },
            "ner_tag": {
                "extractor": ner_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_OUTPUT,
            },
        },
    }

    predictor = NERPredictor()
    predictor.load(model)
    predictor_pipeline.add(predictor, predictor_config)
    predictor_pipeline.add(CoNLLNEREvaluator())
    predictor_pipeline.initialize()

    for pack in predictor_pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ners = [e.ner_type for e in pack.get(EntityMention, instance)]
            self.assertListEqual(ners, expected_ners)
def test_Predictor(self):
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(CoNLL03Reader())
    pipeline.initialize()

    text_extractor_name = (
        "forte.data.extractors.attribute_extractor.AttributeExtractor"
    )
    text_extractor_config = {
        "need_pad": True,
        "entry_type": "ft.onto.base_ontology.Token",
        "attribute": "text",
    }

    ner_extractor_name = "forte.data.extractors.seqtagging_extractor.BioSeqTaggingExtractor"  # pylint: disable=line-too-long
    ner_extractor_config = {
        "entry_type": "ft.onto.base_ontology.EntityMention",
        "need_pad": True,
        "attribute": "ner_type",
        "tagging_unit": "ft.onto.base_ontology.Token",
    }

    model = DummyModel()

    predictor_pipeline = Pipeline[DataPack]()
    predictor_pipeline.set_reader(CoNLL03Reader())

    predictor_config = {
        "scope": "ft.onto.base_ontology.Sentence",
        "batch_size": 2,
        "feature_scheme": {
            "text_tag": {
                "type": "data_input",
                "extractor": {
                    "class_name": text_extractor_name,
                    "config": text_extractor_config,
                },
            },
            "ner_tag": {
                "type": "data_output",
                "extractor": {
                    "class_name": ner_extractor_name,
                    "config": ner_extractor_config,
                },
            },
        },
        "do_eval": True,
    }

    evaluator_config = {
        "entry_type": "ft.onto.base_ontology.EntityMention",
        "attribute": "ner_type",
        "tagging_unit": "ft.onto.base_ontology.Token",
    }

    predictor = NERPredictor()
    predictor.load(model)
    predictor_pipeline.add(predictor, predictor_config)
    predictor_pipeline.add(CoNLLNEREvaluator(), evaluator_config)
    predictor_pipeline.initialize()

    # The predictor instantiates the extractors from the config above, so
    # retrieve them after initialization and build their vocabularies.
    text_extractor = predictor.configs.feature_scheme.text_tag.extractor
    ner_extractor = predictor.configs.feature_scheme.ner_tag.extractor

    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            text_extractor.update_vocab(pack, instance)
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ner_extractor.update_vocab(pack, instance)

    expected_ners = [
        ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)
    ]

    for pack in predictor_pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ners = [e.ner_type for e in pack.get(EntityMention, instance)]
            self.assertListEqual(ners, expected_ners)
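# A minimal sketch of how the dotted "class_name" strings used above resolve
# to extractor instances. `forte.utils.get_class` imports a class from its
# fully qualified name; constructing it with a config dict mirrors the
# object-based test above, but treat the exact constructor signature as an
# assumption for your Forte version.
from forte.utils import get_class
from ft.onto.base_ontology import Token

extractor_cls = get_class(
    "forte.data.extractors.attribute_extractor.AttributeExtractor"
)
# Instantiate with the same style of config dict the tests pass directly.
text_extractor = extractor_cls({"entry_type": Token, "attribute": "text"})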