def initialize(self):
    # Check if initialize has already been called before
    if self._initialized:
        return

    logging.info("Initializing the trainer...")

    self._pack_iterator: Iterator[DataPack] = self.create_pack_iterator()
    self._tp = TrainPreprocessor(pack_iterator=self._pack_iterator)
    self._tp.initialize(config=self.create_tp_config())
    self._initialized = True

    logging.info("Done Initializing.")
class TrainPreprocessorTest(unittest.TestCase):
    def setUp(self):
        self.config = {
            "max_char_length": 45,
            "train_path": "data_samples/train_pipeline_test",
            "val_path": "data_samples/train_pipeline_test",
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True
        }

        text_extractor: AttributeExtractor = \
            AttributeExtractor(config={"entry_type": Token,
                                       "vocab_method": "indexing",
                                       "attribute": "text"})

        char_extractor: CharExtractor = \
            CharExtractor(
                config={"entry_type": Token,
                        "vocab_method": "indexing",
                        "max_char_length": self.config["max_char_length"]})

        # Add the output part of the request based on the task type
        ner_extractor: BioSeqTaggingExtractor = \
            BioSeqTaggingExtractor(config={"entry_type": EntityMention,
                                           "attribute": "ner_type",
                                           "tagging_unit": Token,
                                           "vocab_method": "indexing"})

        self.tp_request = {
            "scope": Sentence,
            "schemes": {
                "text_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": text_extractor
                },
                "char_tag": {
                    "type": TrainPreprocessor.DATA_INPUT,
                    "extractor": char_extractor
                },
                "ner_tag": {
                    "type": TrainPreprocessor.DATA_OUTPUT,
                    "extractor": ner_extractor
                }
            }
        }

        self.reader = CoNLL03Reader()
        self.evaluator = CoNLLNEREvaluator()

        self.tp_config = {
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            }
        }

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = \
            train_pl.process_dataset(self.config["train_path"])

        self.train_preprocessor = \
            TrainPreprocessor(pack_iterator=pack_iterator,
                              request=self.tp_request,
                              config=self.tp_config)

    def test_parse_request(self):
        self.assertTrue(self.train_preprocessor.request is not None)
        self.assertTrue("scope" in self.train_preprocessor.request)
        self.assertTrue("schemes" in self.train_preprocessor.request)
        self.assertEqual(len(self.train_preprocessor.request["schemes"]), 3)
        self.assertTrue(
            "text_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "char_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "ner_tag" in self.train_preprocessor.request["schemes"])

        for tag, scheme in \
                self.train_preprocessor.request["schemes"].items():
            self.assertTrue("extractor" in scheme)
            self.assertTrue("converter" in scheme)
            self.assertTrue(
                issubclass(type(scheme["extractor"]), BaseExtractor))
            self.assertTrue(isinstance(scheme["converter"], Converter))

    def test_build_vocab(self):
        schemes: Dict[str, Any] = \
            self.train_preprocessor.request["schemes"]

        text_extractor: AttributeExtractor = schemes["text_tag"]["extractor"]
        vocab: Vocabulary = text_extractor.vocab
        self.assertTrue(vocab.has_element("EU"))
        self.assertTrue(vocab.has_element("Peter"))

        char_extractor: CharExtractor = schemes["char_tag"]["extractor"]
        vocab: Vocabulary = char_extractor.vocab
        self.assertTrue(vocab.has_element("a"))
        self.assertTrue(vocab.has_element("b"))
        self.assertTrue(vocab.has_element("."))

        ner_extractor: BioSeqTaggingExtractor = schemes["ner_tag"]["extractor"]
        vocab: Vocabulary = ner_extractor.vocab
        self.assertTrue(vocab.has_element(("PER", "B")))
        self.assertTrue(vocab.has_element((None, "O")))
        self.assertTrue(vocab.has_element(("MISC", "I")))

    def test_build_dataset_iterator(self):
        train_iterator = \
            self.train_preprocessor._build_dataset_iterator()

        batchs = []
        for batch in train_iterator:
            batchs.append(batch)

        self.assertEqual(len(batchs), 2)
        self.assertEqual(batchs[0].batch_size, 5)
        self.assertEqual(batchs[1].batch_size, 2)

        for batch in batchs:
            self.assertTrue(hasattr(batch, "text_tag"))
            self.assertTrue(hasattr(batch, "char_tag"))
            self.assertTrue(hasattr(batch, "ner_tag"))

            for tag, batch_t in batch.items():
                self.assertTrue("data" in batch_t)
                self.assertEqual(type(batch_t["data"]), torch.Tensor)
                self.assertTrue("masks" in batch_t)
                if tag == "text_tag" or tag == "ner_tag":
                    self.assertEqual(len(batch_t["masks"]), 1)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                else:
                    self.assertEqual(len(batch_t["masks"]), 2)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                    self.assertEqual(type(batch_t["masks"][1]), torch.Tensor)
class TrainPreprocessorTest(unittest.TestCase):
    def setUp(self):
        root_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir, os.pardir))

        self.config = {
            "max_char_length": 45,
            "train_path": os.path.join(root_path,
                                       "data_samples/train_pipeline_test"),
            "val_path": os.path.join(root_path,
                                     "data_samples/train_pipeline_test"),
            "num_epochs": 1,
            "batch_size_tokens": 5,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "nesterov": True,
        }

        text_extractor = (
            "forte.data.extractors.attribute_extractor.AttributeExtractor")
        text_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "attribute": "text",
        }

        char_extractor = "forte.data.extractors.char_extractor.CharExtractor"
        char_extractor_config = {
            "entry_type": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
            "max_char_length": self.config["max_char_length"],
        }

        # Add the output part of the request based on the task type
        ner_extractor = "forte.data.extractors.seqtagging_extractor.BioSeqTaggingExtractor"  # pylint: disable=line-too-long
        ner_extractor_config = {
            "entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute": "ner_type",
            "tagging_unit": "ft.onto.base_ontology.Token",
            "vocab_method": "indexing",
        }

        self.tp_request = {
            "scope": "ft.onto.base_ontology.Sentence",
            "feature_scheme": {
                "text_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": text_extractor,
                        "config": text_extractor_config,
                    },
                },
                "char_tag": {
                    "type": "data_input",
                    "extractor": {
                        "class_name": char_extractor,
                        "config": char_extractor_config,
                    },
                },
                "ner_tag": {
                    "type": "data_output",
                    "extractor": {
                        "class_name": ner_extractor,
                        "config": ner_extractor_config,
                    },
                },
            },
        }

        self.tp_config = {
            "request": self.tp_request,
            "dataset": {
                "batch_size": self.config["batch_size_tokens"]
            },
        }

        self.reader = CoNLL03Reader()
        self.evaluator = CoNLLNEREvaluator()

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(self.reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = train_pl.process_dataset(
            self.config["train_path"])

        self.train_preprocessor = TrainPreprocessor(
            pack_iterator=pack_iterator)
        self.train_preprocessor.initialize(config=self.tp_config)

    def test_parse_request(self):
        self.assertTrue(self.train_preprocessor.request is not None)
        self.assertTrue("scope" in self.train_preprocessor.request)
        self.assertTrue("schemes" in self.train_preprocessor.request)
        self.assertEqual(len(self.train_preprocessor.request["schemes"]), 3)
        self.assertTrue(
            "text_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "char_tag" in self.train_preprocessor.request["schemes"])
        self.assertTrue(
            "ner_tag" in self.train_preprocessor.request["schemes"])

        for tag, scheme in self.train_preprocessor.request["schemes"].items():
            self.assertTrue("extractor" in scheme)
            self.assertTrue("converter" in scheme)
            self.assertTrue(
                issubclass(type(scheme["extractor"]), BaseExtractor))
            self.assertTrue(isinstance(scheme["converter"], Converter))

    def test_build_vocab(self):
        schemes: Dict[str, Any] = self.train_preprocessor.request["schemes"]

        text_extractor: AttributeExtractor = schemes["text_tag"]["extractor"]
        vocab: Vocabulary = text_extractor.vocab
        self.assertTrue(vocab.has_element("EU"))
        self.assertTrue(vocab.has_element("Peter"))

        char_extractor: CharExtractor = schemes["char_tag"]["extractor"]
        vocab: Vocabulary = char_extractor.vocab
        self.assertTrue(vocab.has_element("a"))
        self.assertTrue(vocab.has_element("b"))
        self.assertTrue(vocab.has_element("."))

        ner_extractor: BioSeqTaggingExtractor = schemes["ner_tag"]["extractor"]
        vocab: Vocabulary = ner_extractor.vocab
        self.assertTrue(vocab.has_element(("PER", "B")))
        self.assertTrue(vocab.has_element((None, "O")))
        self.assertTrue(vocab.has_element(("MISC", "I")))

    def test_build_dataset_iterator(self):
        train_iterator = self.train_preprocessor._build_dataset_iterator()

        batchs = []
        for batch in train_iterator:
            batchs.append(batch)

        self.assertEqual(len(batchs), 2)
        self.assertEqual(batchs[0].batch_size, 5)
        self.assertEqual(batchs[1].batch_size, 2)

        for batch in batchs:
            self.assertTrue(hasattr(batch, "text_tag"))
            self.assertTrue(hasattr(batch, "char_tag"))
            self.assertTrue(hasattr(batch, "ner_tag"))

            for tag, batch_t in batch.items():
                self.assertTrue("data" in batch_t)
                self.assertEqual(type(batch_t["data"]), torch.Tensor)
                self.assertTrue("masks" in batch_t)
                if tag == "text_tag" or tag == "ner_tag":
                    self.assertEqual(len(batch_t["masks"]), 1)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                else:
                    self.assertEqual(len(batch_t["masks"]), 2)
                    self.assertEqual(type(batch_t["masks"][0]), torch.Tensor)
                    self.assertEqual(type(batch_t["masks"][1]), torch.Tensor)
from abc import abstractmethod
import pickle
from typing import Any, Dict, Iterator, Optional

from forte.data.data_pack import DataPack
from forte.train_preprocessor import TrainPreprocessor


class BaseTrainer:
    r"""
    `BaseTrainer` is the main entry point for using the Forte training
    framework. Users should inherit this class and overwrite multiple methods
    defined in it. Internally, it makes use of
    :class:`~forte.train_preprocessor.TrainPreprocessor` to do the actual
    training. Please refer to the documentation of that class for details.

    Below is an example showing how to use this class. A fully documented
    version is also provided in
    :class:`~forte.examples.tagging.tagging_trainer.TaggingTrainer`.

    .. code-block:: python

        class TaggingTrainer(BaseTrainer):
            def create_tp_request(self) -> Dict:
                # Generate request
                text_extractor: AttributeExtractor = \
                    AttributeExtractor(config={"entry_type": Token,
                                               "vocab_method": "indexing",
                                               "attribute": "text"})

                char_extractor: CharExtractor = \
                    CharExtractor(config={"entry_type": Token,
                                          "vocab_method": "indexing",
                                          "max_char_length": 45})

                output_extractor: BaseExtractor = \
                    BioSeqTaggingExtractor(
                        config={"entry_type": EntityMention,
                                "attribute": "ner_type",
                                "based_on": Token,
                                "vocab_method": "indexing"})

                tp_request: Dict = {
                    "scope": Sentence,
                    "schemes": {
                        "text_tag": {
                            "type": TrainPreprocessor.DATA_INPUT,
                            "extractor": text_extractor
                        },
                        "char_tag": {
                            "type": TrainPreprocessor.DATA_INPUT,
                            "extractor": char_extractor
                        },
                        "output_tag": {
                            "type": TrainPreprocessor.DATA_OUTPUT,
                            "extractor": output_extractor
                        }
                    }
                }

                return tp_request

            def create_tp_config(self) -> Dict:
                tp_config: Dict = {
                    "dataset": {
                        "batch_size": 512
                    }
                }

                return tp_config

            def create_pack_iterator(self) -> Iterator[DataPack]:
                reader = CoNLL03Reader()
                train_pl: Pipeline = Pipeline()
                train_pl.set_reader(reader)
                train_pl.initialize()
                pack_iterator: Iterator[DataPack] = \
                    train_pl.process_dataset(self.config_data.train_path)

                return pack_iterator

            def train(self):
                schemes: Dict = self.train_preprocessor.request["schemes"]
                text_extractor: BaseExtractor = \
                    schemes["text_tag"]["extractor"]
                char_extractor: BaseExtractor = \
                    schemes["char_tag"]["extractor"]
                output_extractor: BaseExtractor = \
                    schemes["output_tag"]["extractor"]

                model: BiRecurrentConvCRF = \
                    BiRecurrentConvCRF(word_vocab=text_extractor.get_dict(),
                                       char_vocab_size=char_extractor.size(),
                                       tag_vocab_size=output_extractor.size(),
                                       config_model=self.config_model)
                model.to(self.device)

                optim: Optimizer = SGD(model.parameters())

                tp = self.train_preprocessor

                # Train for 10 epochs
                for epoch in range(10):
                    # Get an iterator over preprocessed batches of train data
                    batch_iter: Iterator[Batch] = \
                        tp.get_train_batch_iterator()

                    for batch in tqdm(batch_iter):
                        word = batch["text_tag"]["data"]
                        char = batch["char_tag"]["data"]
                        output = batch["output_tag"]["data"]
                        word_masks = batch["text_tag"]["masks"][0]

                        optim.zero_grad()
                        loss = model(word, char, output, mask=word_masks)
                        loss.backward()
                        optim.step()
    """

    def __init__(self):
        self._tp_request: Dict = {}
        self._tp_config: Dict = {}
        self._pack_iterator: Optional[Iterator[DataPack]] = None
        self._tp: Optional[TrainPreprocessor] = None
        self._initialized: bool = False

    def initialize(self):
        # Check if initialize has already been called before
        if self._initialized:
            return

        self._tp_config: Dict = self.create_tp_config()
        self._pack_iterator: Iterator[DataPack] = self.create_pack_iterator()
        self._tp = TrainPreprocessor(pack_iterator=self._pack_iterator)
        self._tp.initialize(config=self._tp_config)
        self._initialized = True

    @property
    def train_preprocessor(self) -> Optional[TrainPreprocessor]:
        r"""The instance of type
        :class:`~forte.train_preprocessor.TrainPreprocessor`. The trainer
        internally creates an instance of this class to do the actual
        training.
        """
        if not self._initialized:
            raise ValueError("initialize should be called to "
                             "build train preprocessor.")
        return self._tp

    def run(self):
        r"""The main entry point for starting a training process."""
        self.initialize()
        self.train()

    @abstractmethod
    def create_tp_config(self) -> Dict:
        r"""Users should overwrite this method to provide a concrete train
        preprocessor config. An example config is given in the example above.
        Please refer to :meth:`default_configs` in class
        :class:`~forte.train_preprocessor.TrainPreprocessor` for the detailed
        specification of each option in the config.
        """
        raise NotImplementedError

    @abstractmethod
    def create_pack_iterator(self) -> Iterator[DataPack]:
        r"""Users should overwrite this method to provide an iterator of
        :class:`~forte.data.data_pack.DataPack`. This iterator will be used
        to produce each input data pack consumed for training. Typically,
        users can create a reader of type
        :class:`~forte.data.readers.base_reader.BaseReader`. The reader can
        be wrapped into an iterator of data packs via the Forte pipeline
        system. Please refer to the example above for how to create this.
        """
        raise NotImplementedError

    @abstractmethod
    def train(self):
        r"""Users should overwrite this method to provide the detailed logic
        of the training itself (forward and backward processing). Users can
        use :meth:`get_train_batch_iterator` in class
        :class:`~forte.train_preprocessor.TrainPreprocessor` to get an
        iterator over pre-processed batches of data. Please refer to that
        method for details. An example is also provided above.
        """
        raise NotImplementedError

    def save(self, *args: Any, **kwargs: Any):
        r"""Save the training state to disk for use in a later predicting
        phase. The default training state is the request inside
        TrainPreprocessor. Please refer to :meth:`request` in class
        :class:`~forte.train_preprocessor.TrainPreprocessor` for details.

        Typically, users do not need to overwrite this method, as the default
        saved training state is enough for prediction. Users can still
        overwrite it for special purposes.
        """
        # Check arg type. The default behavior only supports str as args[0],
        # which is treated as a disk file path.
        if not isinstance(args[0], str):
            raise ValueError(
                "Do not support input args: {} and kwargs: {}".format(
                    args, kwargs))

        file_path = args[0]

        if not isinstance(self.train_preprocessor, TrainPreprocessor):
            raise ValueError("Invalid TrainPreprocessor type: {}".format(
                self.train_preprocessor))

        request: Dict = self.train_preprocessor.request

        with open(file_path, "wb") as f:
            pickle.dump(request, f)
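# The snippet below is a minimal, hedged sketch of the other half of the
# ``BaseTrainer.save`` round trip above: how a later predicting phase might
# restore the pickled request. The file name "train_state.pkl" and the
# variable names are illustrative assumptions, not part of the Forte API;
# only the pickle round trip mirrors what ``save`` writes.
import pickle
from typing import Dict

with open("train_state.pkl", "rb") as f:
    # The restored dict holds the feature schemes (extractors, converters,
    # vocabularies) that were built during training.
    request: Dict = pickle.load(f)

# How the restored request is consumed from here depends on the user's own
# predicting pipeline; that part is intentionally left out of this sketch.
print(request.keys())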