Example #1
# Imports reconstructed for the Forte NLP toolkit; exact module paths may
# vary across Forte versions. CoNLLNERTrainer, CoNLLNERPredictor,
# CoNLLNEREvaluator, and CoNLL03VocabularyProcessor ship with Forte's NER
# example code and are assumed importable from there.
import yaml

from forte.common.configuration import Config
from forte.data.readers.conll03_reader import CoNLL03Reader
from forte.train_pipeline import TrainPipeline


def main():
    # Load each component's configuration; context managers make sure the
    # file handles are closed promptly.
    with open("config_data.yml", "r") as f:
        config_data = yaml.safe_load(f)
    with open("config_model.yml", "r") as f:
        config_model = yaml.safe_load(f)
    with open("config_evaluator.yml", "r") as f:
        config_evaluator = yaml.safe_load(f)
    with open("config_preprocessor.yml", "r") as f:
        config_preprocess = yaml.safe_load(f)

    # Aggregate all component configs into a single Config object
    config = Config({}, default_hparams=None)
    config.add_hparam("config_data", config_data)
    config.add_hparam("config_model", config_model)
    config.add_hparam("preprocessor", config_preprocess)
    config.add_hparam("reader", {})
    config.add_hparam("evaluator", config_evaluator)

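    # Reader that parses CoNLL 2003-format files into Forte data packs.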
    reader = CoNLL03Reader()

    # Keep the vocabulary processor as a simple counter
    vocab_processor = CoNLL03VocabularyProcessor()

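    # Task components: trainer, predictor, and evaluator for CoNLL NER.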
    ner_trainer = CoNLLNERTrainer()
    ner_predictor = CoNLLNERPredictor()
    ner_evaluator = CoNLLNEREvaluator()

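    # Assemble the training pipeline; the same reader instance serves both
    # the train and dev splits here.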
    train_pipe = TrainPipeline(
        train_reader=reader,
        trainer=ner_trainer,
        dev_reader=reader,
        configs=config,
        preprocessors=[vocab_processor],
        predictor=ner_predictor,
        evaluator=ner_evaluator,
    )
    train_pipe.run()
Example #2
    # From a unittest.TestCase. Assumes the usual Forte imports: Pipeline,
    # TrainPreprocessor, CoNLL03Reader, CoNLLNEREvaluator, the extractors
    # under forte.data.extractors, Iterator/PackType for typing, and the
    # ontology types Token, Sentence, and EntityMention from
    # ft.onto.base_ontology.
    def setUp(self):
        self.config = {
            "max_char_length": 45,
            "train_path": "data_samples/train_pipeline_test",
            "val_path": "data_samples/train_pipeline_test",
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True
        }

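        # Indexes each Token's surface text into a vocabulary.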
        text_extractor: AttributeExtractor = \
            AttributeExtractor(config={"entry_type": Token,
                                       "vocab_method": "indexing",
                                       "attribute": "text"})

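        # Extracts character-level features per token, bounded by
        # max_char_length.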
        char_extractor: CharExtractor = \
            CharExtractor(
                config={"entry_type": Token,
                        "vocab_method": "indexing",
                        "max_char_length": self.config["max_char_length"]})

        # The output side of the request depends on the task; for NER it is
        # a BIO sequence-tagging extractor over EntityMention annotations.
        ner_extractor: BioSeqTaggingExtractor = \
            BioSeqTaggingExtractor(config={"entry_type": EntityMention,
                                           "attribute": "ner_type",
                                           "tagging_unit": Token,
                                           "vocab_method": "indexing"})

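        # The request maps named features to extractors and marks each as
        # model input or output.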
        self.tp_request = {
            "scope": Sentence,
            "schemes": {
                "text_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": text_extractor
                },
                "char_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": char_extractor
                },
                "ner_tag": {
                    "type": TrainPreprocessor.DATA_OUTPUT,
                    "extractor": ner_extractor
                }
            }
        }

        self.reader = CoNLL03Reader()

        self.evaluator = CoNLLNEREvaluator()

        self.tp_config = {
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            }
        }

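        # A plain reading pipeline supplies the pack iterator that feeds the
        # TrainPreprocessor.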
        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = \
            train_pl.process_dataset(self.config["train_path"])

        self.train_preprocessor = \
            TrainPreprocessor(pack_iterator=pack_iterator,
                              request=self.tp_request,
                              config=self.tp_config)
Example #3
    # From a unittest.TestCase; assumes the same Forte imports as the
    # previous example, plus os for path handling.
    def setUp(self):
        root_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                         os.pardir))

        self.config = {
            "max_char_length": 45,
            "train_path": os.path.join(
                root_path, "data_samples/train_pipeline_test"),
            "val_path": os.path.join(
                root_path, "data_samples/train_pipeline_test"),
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True,
        }

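        # Extractors are referenced by fully qualified class path here, which
        # keeps the whole request serializable.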
        text_extractor = (
            "forte.data.extractors.attribute_extractor.AttributeExtractor")
        text_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "attribute": "text",
        }

        char_extractor = "forte.data.extractors.char_extractor.CharExtractor"
        char_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "max_char_length": self.config["max_char_length"],
        }

        # The output side of the request depends on the task; for NER it is
        # the BIO sequence-tagging extractor, referenced by class path.
        ner_extractor = "forte.data.extractors.seqtagging_extractor.BioSeqTaggingExtractor"  # pylint: disable=line-too-long
        ner_extractor_config = {
            "entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute": "ner_type",
            "tagging_unit": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
        }

        self.tp_request = {
            "scope": "ft.onto.base_ontology.Sentence",
            "feature_scheme": {
                "text_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": text_extractor,
                        "config": text_extractor_config,
                    },
                },
                "char_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": char_extractor,
                        "config": char_extractor_config,
                    },
                },
                "ner_tag": {
                    "type": "data_output",
                    "extractor": {
                        "class_name": ner_extractor,
                        "config": ner_extractor_config,
                    },
                },
            },
        }

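        # Unlike the previous example, the request travels inside the
        # preprocessor config and is applied at initialize() time.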
        self.tp_config = {
            "request": self.tp_request,
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            },
        }

        self.reader = CoNLL03Reader()

        self.evaluator = CoNLLNEREvaluator()

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = train_pl.process_dataset(
            self.config["train_path"])

        self.train_preprocessor = TrainPreprocessor(
            pack_iterator=pack_iterator)
        self.train_preprocessor.initialize(config=self.tp_config)
Example #4
    # From a unittest.TestCase. DummyModel, NERPredictor, and FAKEOUTPUT are
    # test-local helpers assumed to be defined elsewhere in the module; the
    # dummy model presumably emits the constant id FAKEOUTPUT for every unit.
    def test_Predictor(self):
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(CoNLL03Reader())
        pipeline.initialize()

        text_extractor = AttributeExtractor({
            "need_pad": True,
            "entry_type": Token,
            "attribute": "text",
        })
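        # First pass over the dataset: populate the text extractor's
        # vocabulary.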
        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                text_extractor.update_vocab(pack, instance)

        ner_extractor = BioSeqTaggingExtractor({
            "entry_type": EntityMention,
            "need_pad": True,
            "attribute": "ner_type",
            "tagging_unit": Token,
        })
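        # Second pass: populate the BIO tag vocabulary for the NER extractor.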
        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ner_extractor.update_vocab(pack, instance)

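        # With a constant model output, every tagging unit should decode to
        # the same label.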
        expected_ners = [
            ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)]

        model = DummyModel()

        predictor_pipeline = Pipeline[DataPack]()
        predictor_pipeline.set_reader(CoNLL03Reader())

        predictor_config = {
            "scope": Sentence,
            "batch_size": 2,
            "feature_scheme": {
                "text_tag": {
                    "extractor": text_extractor,
                    "converter": Converter(),
                    "type": TrainPreprocessor.DATA_INPUT
                },
                "ner_tag": {
                    "extractor": ner_extractor,
                    "converter": Converter(),
                    "type": TrainPreprocessor.DATA_OUTPUT
                },
            },
        }

        # dummy = DummyRelationExtractor()
        # config = {"batcher": {"batch_size": 5}}

        predictor = NERPredictor()
        predictor.load(model)
        predictor_pipeline.add(predictor, predictor_config)
        # predictor_pipeline.add(dummy, config)

        predictor_pipeline.add(CoNLLNEREvaluator())

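        # Run the full predictor pipeline and check the predicted NER types.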
        predictor_pipeline.initialize()
        for pack in predictor_pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ners = [e.ner_type for e in
                        list(pack.get(EntityMention, instance))]
                self.assertListEqual(ners, expected_ners)
Example #5
    # Variant of the previous test: extractors are specified by class path
    # inside the predictor config, and the evaluator gets an explicit config.
    # DummyModel, NERPredictor, and FAKEOUTPUT are again test-local helpers.
    def test_Predictor(self):
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(CoNLL03Reader())
        pipeline.initialize()

        text_extractor_name = (
            "forte.data.extractors.attribute_extractor.AttributeExtractor")
        text_extractor_config = {
            "need_pad": True,
            "entry_type": "ft.onto.base_ontology.Token",
            "attribute": "text",
        }

        ner_extractor_name = "forte.data.extractors.seqtagging_extractor.BioSeqTaggingExtractor"  # pylint: disable=line-too-long
        ner_extractor_config = {
            "entry_type": "ft.onto.base_ontology.EntityMention",
            "need_pad": True,
            "attribute": "ner_type",
            "tagging_unit": "ft.onto.base_ontology.Token",
        }

        model = DummyModel()

        predictor_pipeline = Pipeline[DataPack]()
        predictor_pipeline.set_reader(CoNLL03Reader())

        predictor_config = {
            "scope": "ft.onto.base_ontology.Sentence",
            "batch_size": 2,
            "feature_scheme": {
                "text_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": text_extractor_name,
                        "config": text_extractor_config,
                    },
                },
                "ner_tag": {
                    "type": "data_output",
                    "extractor": {
                        "class_name": ner_extractor_name,
                        "config": ner_extractor_config,
                    },
                },
            },
            "do_eval": True,
        }
        # dummy = DummyRelationExtractor()
        # config = {"batcher": {"batch_size": 5}}
        evaluator_config = {
            "entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute": "ner_type",
            "tagging_unit": "ft.onto.base_ontology.Token",
        }
        predictor = NERPredictor()
        predictor.load(model)
        predictor_pipeline.add(predictor, predictor_config)
        # predictor_pipeline.add(dummy, config)
        predictor_pipeline.add(CoNLLNEREvaluator(), evaluator_config)
        predictor_pipeline.initialize()

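        # After initialize(), the predictor owns the instantiated extractors;
        # fetch them to build the vocabularies.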
        text_extractor = predictor.configs.feature_scheme.text_tag.extractor
        ner_extractor = predictor.configs.feature_scheme.ner_tag.extractor

        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                text_extractor.update_vocab(pack, instance)

        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ner_extractor.update_vocab(pack, instance)
        expected_ners = [
            ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)
        ]

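        # End-to-end check: every predicted NER type should match the
        # constant dummy output.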
        for pack in predictor_pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ners = [
                    e.ner_type for e in list(pack.get(EntityMention, instance))
                ]
                self.assertListEqual(ners, expected_ners)