Example #1
    def initialize(self):
        # Check if initialize has already been called before
        if self._initialized:
            return

        self._tp_config: Dict = self.create_tp_config()
        self._pack_iterator: Iterator[DataPack] = self.create_pack_iterator()
        self._tp = TrainPreprocessor(pack_iterator=self._pack_iterator)
        self._tp.initialize(config=self._tp_config)
        self._initialized = True
Example #2
    def initialize(self):
        # Check if initialize has already been called before
        if self._initialized:
            return

        logging.info("Initializing the trainer...")
        self._pack_iterator: Iterator[DataPack] = self.create_pack_iterator()
        self._tp = TrainPreprocessor(pack_iterator=self._pack_iterator)
        self._tp.initialize(config=self.create_tp_config())
        self._initialized = True
        logging.info("Done Initializing.")
    def setUp(self):
        self.config = {
            "max_char_length": 45,
            "train_path": "data_samples/train_pipeline_test",
            "val_path": "data_samples/train_pipeline_test",
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True
        }

        text_extractor: AttributeExtractor = \
            AttributeExtractor(config={"entry_type": Token,
                                       "vocab_method": "indexing",
                                       "attribute": "text"})

        char_extractor: CharExtractor = \
            CharExtractor(
                config={"entry_type": Token,
                        "vocab_method": "indexing",
                        "max_char_length": self.config["max_char_length"]})

        # The output part of the request depends on the task type; here we
        # extract NER labels as BIO sequence tags.
        ner_extractor: BioSeqTaggingExtractor = \
            BioSeqTaggingExtractor(config={"entry_type": EntityMention,
                                           "attribute": "ner_type",
                                           "tagging_unit": Token,
                                           "vocab_method": "indexing"})

        self.tp_request = {
            "scope": Sentence,
            "schemes": {
                "text_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": text_extractor
                },
                "char_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": char_extractor
                },
                "ner_tag": {
                    "type": TrainPreprocessor.DATA_OUTPUT,
                    "extractor": ner_extractor
                }
            }
        }

        self.reader = CoNLL03Reader()

        self.evaluator = CoNLLNEREvaluator()

        self.tp_config = {
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            }
        }

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = \
            train_pl.process_dataset(self.config["train_path"])

        self.train_preprocessor = \
            TrainPreprocessor(pack_iterator=pack_iterator,
                              request=self.tp_request,
                              config=self.tp_config)
Example #4
class TrainPreprocessorTest(unittest.TestCase):
    def setUp(self):
        self.config = {
            "max_char_length": 45,
            "train_path": "data_samples/train_pipeline_test",
            "val_path": "data_samples/train_pipeline_test",
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True
        }

        text_extractor: AttributeExtractor = \
            AttributeExtractor(config={"entry_type": Token,
                                       "vocab_method": "indexing",
                                       "attribute": "text"})

        char_extractor: CharExtractor = \
            CharExtractor(
                config={"entry_type": Token,
                        "vocab_method": "indexing",
                        "max_char_length": self.config["max_char_length"]})

        # The output part of the request depends on the task type; here we
        # extract NER labels as BIO sequence tags.
        ner_extractor: BioSeqTaggingExtractor = \
            BioSeqTaggingExtractor(config={"entry_type": EntityMention,
                                           "attribute": "ner_type",
                                           "tagging_unit": Token,
                                           "vocab_method": "indexing"})

        self.tp_request = {
            "scope": Sentence,
            "schemes": {
                "text_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": text_extractor
                },
                "char_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": char_extractor
                },
                "ner_tag": {
                    "type": TrainPreprocessor.DATA_OUTPUT,
                    "extractor": ner_extractor
                }
            }
        }

        self.reader = CoNLL03Reader()

        self.evaluator = CoNLLNEREvaluator()

        self.tp_config = {
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            }
        }

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = \
            train_pl.process_dataset(self.config["train_path"])

        self.train_preprocessor = \
            TrainPreprocessor(pack_iterator=pack_iterator,
                              request=self.tp_request,
                              config=self.tp_config)

    def test_parse_request(self):
        self.assertTrue(self.train_preprocessor.request is not None)
        self.assertTrue("scope" in self.train_preprocessor.request)
        self.assertTrue("schemes" in self.train_preprocessor.request)

        self.assertEqual(len(self.train_preprocessor.request["schemes"]), 3)
        self.assertTrue(
            "text_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "char_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "ner_tag" in self.train_preprocessor.request["schemes"])

        for tag, scheme in \
                self.train_preprocessor.request["schemes"].items():
            self.assertTrue("extractor" in scheme)
            self.assertTrue("converter" in scheme)
            self.assertTrue(
                issubclass(type(scheme["extractor"]), BaseExtractor))
            self.assertTrue(isinstance(scheme["converter"], Converter))

    def test_build_vocab(self):
        schemes: Dict[str, Any] = \
            self.train_preprocessor.request["schemes"]

        text_extractor: AttributeExtractor = schemes["text_tag"]["extractor"]
        vocab: Vocabulary = text_extractor.vocab
        self.assertTrue(vocab.has_element("EU"))
        self.assertTrue(vocab.has_element("Peter"))

        char_extractor: CharExtractor = schemes["char_tag"]["extractor"]
        vocab: Vocabulary = char_extractor.vocab
        self.assertTrue(vocab.has_element("a"))
        self.assertTrue(vocab.has_element("b"))
        self.assertTrue(vocab.has_element("."))

        ner_extractor: BioSeqTaggingExtractor = schemes["ner_tag"]["extractor"]
        vocab: Vocabulary = ner_extractor.vocab
        self.assertTrue(vocab.has_element(("PER", "B")))
        self.assertTrue(vocab.has_element((None, "O")))
        self.assertTrue(vocab.has_element(("MISC", "I")))

    def test_build_dataset_iterator(self):
        train_iterator = \
            self.train_preprocessor._build_dataset_iterator()

        batches = list(train_iterator)

        self.assertEqual(len(batches), 2)
        self.assertEqual(batches[0].batch_size, 5)
        self.assertEqual(batches[1].batch_size, 2)

        for batch in batches:
            self.assertTrue(hasattr(batch, "text_tag"))
            self.assertTrue(hasattr(batch, "char_tag"))
            self.assertTrue(hasattr(batch, "ner_tag"))

            for tag, batch_t in batch.items():
                self.assertTrue("data" in batch_t)
                self.assertEqual(type(batch_t["data"]), torch.Tensor)
                self.assertTrue("masks" in batch_t)
                if tag == "text_tag" or tag == "ner_tag":
                    self.assertEqual(len(batch_t["masks"]), 1)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                else:
                    self.assertEqual(len(batch_t["masks"]), 2)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                    self.assertEqual(type(batch_t["masks"][1]), torch.Tensor)
Example #5
    def setUp(self):
        root_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                         os.pardir))

        self.config = {
            "max_char_length": 45,
            "train_path": os.path.join(
                root_path, "data_samples/train_pipeline_test"),
            "val_path": os.path.join(
                root_path, "data_samples/train_pipeline_test"),
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True,
        }

        text_extractor = (
            "forte.data.extractors.attribute_extractor.AttributeExtractor")
        text_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "attribute": "text",
        }

        char_extractor = "forte.data.extractors.char_extractor.CharExtractor"
        char_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "max_char_length": self.config["max_char_length"],
        }

        # The output part of the request depends on the task type; here we
        # extract NER labels as BIO sequence tags.
        ner_extractor = "forte.data.extractors.seqtagging_extractor.BioSeqTaggingExtractor"  # pylint: disable=line-too-long
        ner_extractor_config = {
            "entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute": "ner_type",
            "tagging_unit": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
        }

        self.tp_request = {
            "scope": "ft.onto.base_ontology.Sentence",
            "feature_scheme": {
                "text_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": text_extractor,
                        "config": text_extractor_config,
                    },
                },
                "char_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": char_extractor,
                        "config": char_extractor_config,
                    },
                },
                "ner_tag": {
                    "type": "data_output",
                    "extractor": {
                        "class_name": ner_extractor,
                        "config": ner_extractor_config,
                    },
                },
            },
        }

        self.tp_config = {
            "request": self.tp_request,
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            },
        }

        self.reader = CoNLL03Reader()

        self.evaluator = CoNLLNEREvaluator()

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = train_pl.process_dataset(
            self.config["train_path"])

        self.train_preprocessor = TrainPreprocessor(
            pack_iterator=pack_iterator)
        self.train_preprocessor.initialize(config=self.tp_config)
Example #6
class TrainPreprocessorTest(unittest.TestCase):
    def setUp(self):
        root_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                         os.pardir))

        self.config = {
            "max_char_length": 45,
            "train_path": os.path.join(
                root_path, "data_samples/train_pipeline_test"),
            "val_path": os.path.join(
                root_path, "data_samples/train_pipeline_test"),
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True,
        }

        text_extractor = (
            "forte.data.extractors.attribute_extractor.AttributeExtractor")
        text_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "attribute": "text",
        }

        char_extractor = "forte.data.extractors.char_extractor.CharExtractor"
        char_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "max_char_length": self.config["max_char_length"],
        }

        # The output part of the request depends on the task type; here we
        # extract NER labels as BIO sequence tags.
        ner_extractor = "forte.data.extractors.seqtagging_extractor.BioSeqTaggingExtractor"  # pylint: disable=line-too-long
        ner_extractor_config = {
            "entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute": "ner_type",
            "tagging_unit": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
        }

        self.tp_request = {
            "scope": "ft.onto.base_ontology.Sentence",
            "feature_scheme": {
                "text_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": text_extractor,
                        "config": text_extractor_config,
                    },
                },
                "char_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": char_extractor,
                        "config": char_extractor_config,
                    },
                },
                "ner_tag": {
                    "type": "data_output",
                    "extractor": {
                        "class_name": ner_extractor,
                        "config": ner_extractor_config,
                    },
                },
            },
        }

        self.tp_config = {
            "request": self.tp_request,
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            },
        }

        self.reader = CoNLL03Reader()

        self.evaluator = CoNLLNEREvaluator()

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = train_pl.process_dataset(
            self.config["train_path"])

        self.train_preprocessor = TrainPreprocessor(
            pack_iterator=pack_iterator)
        self.train_preprocessor.initialize(config=self.tp_config)

    def test_parse_request(self):
        self.assertTrue(self.train_preprocessor.request is not None)
        self.assertTrue("scope" in self.train_preprocessor.request)
        self.assertTrue("schemes" in self.train_preprocessor.request)

        self.assertEqual(len(self.train_preprocessor.request["schemes"]), 3)
        self.assertTrue(
            "text_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "char_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "ner_tag" in self.train_preprocessor.request["schemes"])

        for tag, scheme in self.train_preprocessor.request["schemes"].items():
            self.assertTrue("extractor" in scheme)
            self.assertTrue("converter" in scheme)
            self.assertTrue(
                issubclass(type(scheme["extractor"]), BaseExtractor))
            self.assertTrue(isinstance(scheme["converter"], Converter))

    def test_build_vocab(self):
        schemes: Dict[str, Any] = self.train_preprocessor.request["schemes"]

        text_extractor: AttributeExtractor = schemes["text_tag"]["extractor"]
        vocab: Vocabulary = text_extractor.vocab
        self.assertTrue(vocab.has_element("EU"))
        self.assertTrue(vocab.has_element("Peter"))

        char_extractor: CharExtractor = schemes["char_tag"]["extractor"]
        vocab: Vocabulary = char_extractor.vocab
        self.assertTrue(vocab.has_element("a"))
        self.assertTrue(vocab.has_element("b"))
        self.assertTrue(vocab.has_element("."))

        ner_extractor: BioSeqTaggingExtractor = schemes["ner_tag"]["extractor"]
        vocab: Vocabulary = ner_extractor.vocab
        self.assertTrue(vocab.has_element(("PER", "B")))
        self.assertTrue(vocab.has_element((None, "O")))
        self.assertTrue(vocab.has_element(("MISC", "I")))

    def test_build_dataset_iterator(self):
        train_iterator = self.train_preprocessor._build_dataset_iterator()

        batches = list(train_iterator)

        self.assertEqual(len(batches), 2)
        self.assertEqual(batches[0].batch_size, 5)
        self.assertEqual(batches[1].batch_size, 2)

        for batch in batches:
            self.assertTrue(hasattr(batch, "text_tag"))
            self.assertTrue(hasattr(batch, "char_tag"))
            self.assertTrue(hasattr(batch, "ner_tag"))

            for tag, batch_t in batch.items():
                self.assertTrue("data" in batch_t)
                self.assertEqual(type(batch_t["data"]), torch.Tensor)
                self.assertTrue("masks" in batch_t)
                if tag == "text_tag" or tag == "ner_tag":
                    self.assertEqual(len(batch_t["masks"]), 1)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                else:
                    self.assertEqual(len(batch_t["masks"]), 2)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                    self.assertEqual(type(batch_t["masks"][1]), torch.Tensor)
Example #7
class BaseTrainer:
    r"""
    `BaseTrainer` is the main entry point for the Forte training framework.
    Users should inherit this class and override the methods defined below.
    Internally, it makes use of
    :class:`~forte.train_preprocessor.TrainPreprocessor` to do the actual
    training; please refer to the documentation of that class for details.

    Below is an example of how to use this class. A fully documented example
    is also provided in

    .. code-block:: python

        class TaggingTrainer(BaseTrainer):
            def create_tp_request(self) -> Dict:
                # Generate request
                text_extractor: AttributeExtractor = \
                    AttributeExtractor(config={"entry_type": Token,
                                               "vocab_method": "indexing",
                                               "attribute": "text"})

                char_extractor: CharExtractor = \
                    CharExtractor(config={"entry_type": Token,
                                          "vocab_method": "indexing",
                                          "max_char_length": 45})

                output_extractor: BaseExtractor = \
                    BioSeqTaggingExtractor(config={"entry_type": EntityMention,
                                                   "attribute": "ner_type",
                                                   "based_on": Token,
                                                   "vocab_method": "indexing"})

                tp_request: Dict = {
                    "scope": Sentence,
                    "schemes": {
                        "text_tag": {
                            "type": TrainPreprocessor.DATA_INPUT,
                            "extractor": text_extractor
                        },
                        "char_tag": {
                            "type": TrainPreprocessor.DATA_INPUT,
                            "extractor": char_extractor
                        },
                        "output_tag": {
                            "type": TrainPreprocessor.DATA_OUTPUT,
                            "extractor": output_extractor
                        }
                    }
                }

                return tp_request

            def create_tp_config(self) -> Dict:
                tp_config: Dict = {
                    "dataset": {
                        "batch_size": 512
                    }
                }

                return tp_config

            def create_pack_iterator(self) -> Iterator[DataPack]:
                reader = CoNLL03Reader()
                train_pl: Pipeline = Pipeline()
                train_pl.set_reader(reader)
                train_pl.initialize()
                pack_iterator: Iterator[DataPack] = \
                    train_pl.process_dataset(self.config_data.train_path)

                return pack_iterator

            def train(self):
                schemes: Dict = self.train_preprocessor.request["schemes"]
                text_extractor: BaseExtractor = \
                    schemes["text_tag"]["extractor"]
                char_extractor: BaseExtractor = \
                    schemes["char_tag"]["extractor"]
                output_extractor: BaseExtractor = \
                    schemes["output_tag"]["extractor"]

                model: BiRecurrentConvCRF = \
                    BiRecurrentConvCRF(word_vocab=text_extractor.get_dict(),
                                       char_vocab_size=char_extractor.size(),
                                       tag_vocab_size=output_extractor.size(),
                                       config_model=self.config_model)
                model.to(self.device)

                optim: Optimizer = SGD(model.parameters(), lr=0.01)

                tp = self.train_preprocessor

                # Train for 10 epochs
                for epoch in range(10):
                    # Get iterator of preprocessed batch of train data
                    batch_iter: Iterator[Batch] = tp.get_train_batch_iterator()

                    for batch in tqdm(batch_iter):
                        word = batch["text_tag"]["data"]
                        char = batch["char_tag"]["data"]
                        output = batch["output_tag"]["data"]
                        word_masks = batch["text_tag"]["masks"][0]

                        optim.zero_grad()

                        loss = model(word, char, output, mask=word_masks)

                        loss.backward()
                        optim.step()
    """
    def __init__(self):
        self._tp_request: Dict = {}
        self._tp_config: Dict = {}
        self._pack_iterator: Optional[Iterator[DataPack]] = None
        self._tp: Optional[TrainPreprocessor] = None
        self._initialized: bool = False

    def initialize(self):
        # Check if initialize has already been called before
        if self._initialized:
            return

        self._tp_config: Dict = self.create_tp_config()
        self._pack_iterator: Iterator[DataPack] = self.create_pack_iterator()
        self._tp = TrainPreprocessor(pack_iterator=self._pack_iterator)
        self._tp.initialize(config=self._tp_config)
        self._initialized = True

    @property
    def train_preprocessor(self) -> Optional[TrainPreprocessor]:
        r"""The instance of type
        :class:`~forte.train_preprocessor.TrainPreprocessor`. The Trainer will
        internally create an instance of this class to do the actual training.
        """
        if not self._initialized:
            raise ValueError("initialize should be called to "
                             "build train preprocessor.")
        return self._tp

    def run(self):
        r"""The main entry for starting a training process."""
        self.initialize()
        self.train()

    @abstractmethod
    def create_tp_config(self) -> Dict:
        r"""Users should overwrite this method to provide a concrete train
        preprocessor config. An example config is given in the example above.
        Please refer to :meth:`default_configs` in class
        :class:`~forte.train_preprocessor.TrainPreprocessor` for detailed
        specification of each options in the config.
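
        A minimal sketch (the batch size is illustrative and taken from the
        class-level example above):

        .. code-block:: python

            def create_tp_config(self) -> Dict:
                return {
                    "dataset": {
                        "batch_size": 512
                    }
                }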
        """
        raise NotImplementedError

    @abstractmethod
    def create_pack_iterator(self) -> Iterator[DataPack]:
        r"""Users should overwrite this method to provide an iterator of
        :class:`~forte.data.data_pack.DataPack`. This iterator will be used to
        produce each input data pack consumed for training. Typically, users
        can create a reader of type
        :class:`~forte.data.readers.base_reader.BaseReader`. The reader can be
        wrapped as an iterator of data pack via forte pipeline system. Please
        refer to the above example for how to create this.
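
        A minimal sketch mirroring the class-level example (the reader and
        the train path are user-specific choices):

        .. code-block:: python

            def create_pack_iterator(self) -> Iterator[DataPack]:
                train_pl = Pipeline()
                train_pl.set_reader(CoNLL03Reader())
                train_pl.initialize()
                return train_pl.process_dataset(self.config_data.train_path)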
        """
        raise NotImplementedError

    @abstractmethod
    def train(self):
        r"""Users should overwrite this method to provide the detail logic of
        doing the training (forward and backward processing). Users can use the
        :meth:`get_train_batch_iterator` in class
        :class:`~forte.train_preprocessor.TrainPreprocessor` to get an iterator
        of pre-processed batch of data. Please refer to that method for details.
        An example is also provided above.
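
        A minimal skeleton (``num_epochs`` and the forward/backward details
        are task-specific):

        .. code-block:: python

            def train(self):
                tp = self.train_preprocessor
                for epoch in range(num_epochs):
                    for batch in tp.get_train_batch_iterator():
                        ...  # forward pass, loss, backward, optimizer step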
        """
        raise NotImplementedError

    def save(self, *args: Any, **kwargs: Any):
        r"""Save the training states to disk for the usage of later
        predicting phase. The default training states is the request inside
        TrainPreprocessor. Please refer to :meth:`request` in class
        :class:`~forte.train_preprocessor.TrainPreprocessor` for details.
        Typically users do not need to overwrite this method as default saved
        training state is enough for predicting usage. But users can also
        overwrite this method to achieve special purpose.
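
        A minimal usage sketch (``TaggingTrainer`` is the user-defined
        subclass from the class docstring; the file name is illustrative):

        .. code-block:: python

            trainer = TaggingTrainer()
            trainer.run()
            trainer.save("train_state.pkl")

            # Later, in the prediction phase, restore the pickled request:
            with open("train_state.pkl", "rb") as f:
                request = pickle.load(f)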
        """

        # Check the argument type. The default behavior only supports a
        # single str argument, which is interpreted as a disk file path.
        if not args or not isinstance(args[0], str):
            raise ValueError(
                "Do not support input args: {} and kwargs: {}".format(
                    args, kwargs))

        file_path = args[0]

        if not isinstance(self.train_preprocessor, TrainPreprocessor):
            raise ValueError("Invalid TrainPreprocessor type: {}".format(
                self.train_preprocessor))

        request: Dict = self.train_preprocessor.request

        with open(file_path, "wb") as f:
            pickle.dump(request, f)