def build_data_loaders(
    train_data: List[Instance],
    dev_data: List[Instance],
) -> Tuple[DataLoader, DataLoader]:
    train_loader = SimpleDataLoader(train_data, 8, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, 8, shuffle=False)
    return train_loader, dev_loader
def __init__(self) -> None:
    self.config: Config = Config().parse_args(known_only=True)

    bert_token_indexers = PretrainedTransformerIndexer(model_name=self.config.model_name)
    bert_tokenizer = PretrainedTransformerTokenizer(model_name=self.config.model_name)
    reader = TextClassificationJsonReader(
        token_indexers={"tokens": bert_token_indexers},
        tokenizer=bert_tokenizer,
    )
    train_instances = list(reader.read(self.config.train_file))
    dev_instances = list(reader.read(self.config.dev_file))
    test_instances = list(reader.read(self.config.test_file))

    self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

    # 2. init the data loader
    self.train_data_loader = SimpleDataLoader(train_instances, self.config.batch_size, shuffle=True)
    self.dev_data_loader = SimpleDataLoader(dev_instances, self.config.batch_size, shuffle=False)
    self.train_data_loader.index_with(self.vocab)
    self.dev_data_loader.index_with(self.vocab)

    # 3. init the model
    self.model = self.init_model()
    self.trainer = self.init_trainer()
def build_data_loaders(
    train_data: List[Instance],
    dev_data: List[Instance],
    batch_size: int,
) -> Tuple[DataLoader, DataLoader]:
    """
    Creates data loaders that load data in batches of size `batch_size`
    for training and validation.
    Adapted from https://guide.allennlp.org/training-and-prediction
    """
    train_loader = SimpleDataLoader(train_data, batch_size, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, batch_size, shuffle=False)
    return train_loader, dev_loader
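# A minimal usage sketch for the builder above. The names `train_instances`,
# `dev_instances`, and `vocab` are hypothetical placeholders assumed to come from
# a DatasetReader and Vocabulary built earlier; only `build_data_loaders` and the
# SimpleDataLoader API are taken from the example itself.
train_loader, dev_loader = build_data_loaders(train_instances, dev_instances, batch_size=16)

# SimpleDataLoader needs a vocabulary before it can tensorize instances.
train_loader.index_with(vocab)
dev_loader.index_with(vocab)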
def test_from_params_in_trainer(self):
    # This is more of an integration test, making sure that a bunch of pieces fit together
    # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
    params = Params({
        "num_epochs": 5,
        "learning_rate_scheduler": {
            "type": "slanted_triangular",
            "gradual_unfreezing": True,
            "discriminative_fine_tuning": True,
            "decay_factor": 0.5,
        },
    })
    # The method called in the logic below only checks the length of this list, not its
    # contents, so this should be safe.
    instances = [1] * 40
    optim = self._get_optimizer()
    trainer = Trainer.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        data_loader=SimpleDataLoader(instances, batch_size=10),
    )
    assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

    # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
    # that num_steps_per_epoch is computed and passed correctly. This logic happens inside of
    # `Trainer.from_partial_objects`.
    assert trainer._learning_rate_scheduler.num_epochs == 5
    assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

    # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
    # really want to. Not sure why you would ever want to in this case; this is just testing
    # the functionality.
    params = Params({
        "num_epochs": 5,
        "learning_rate_scheduler": {
            "type": "slanted_triangular",
            "num_epochs": 3,
            "gradual_unfreezing": True,
            "discriminative_fine_tuning": True,
            "decay_factor": 0.5,
        },
    })
    trainer = Trainer.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        data_loader=SimpleDataLoader(instances, batch_size=10),
    )
    assert trainer._learning_rate_scheduler.num_epochs == 3
def __init__(
    self,
    model: Model,
    train_data_path: DatasetReaderInput,
    train_dataset_reader: DatasetReader,
    *,
    test_dataset_reader: Optional[DatasetReader] = None,
    train_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    test_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    params_to_freeze: List[str] = None,
    cuda_device: int = -1,
    lissa_batch_size: int = 8,
    damping: float = 3e-3,
    num_samples: int = 1,
    recursion_depth: Union[float, int] = 0.25,
    scale: float = 1e4,
) -> None:
    super().__init__(
        model=model,
        train_data_path=train_data_path,
        train_dataset_reader=train_dataset_reader,
        test_dataset_reader=test_dataset_reader,
        train_data_loader=train_data_loader,
        test_data_loader=test_data_loader,
        params_to_freeze=params_to_freeze,
        cuda_device=cuda_device,
    )

    self._lissa_dataloader = SimpleDataLoader(
        list(self._train_loader.iter_instances()),
        lissa_batch_size,
        shuffle=True,
        vocab=self.vocab,
    )
    self._lissa_dataloader.set_target_device(self.device)
    if isinstance(recursion_depth, float) and recursion_depth > 0.0:
        self._lissa_dataloader.batches_per_epoch = int(
            len(self._lissa_dataloader) * recursion_depth
        )
    elif isinstance(recursion_depth, int) and recursion_depth > 0:
        self._lissa_dataloader.batches_per_epoch = recursion_depth
    else:
        raise ValueError("'recursion_depth' should be a positive int or float")

    self._damping = damping
    self._num_samples = num_samples
    self._recursion_depth = recursion_depth
    self._scale = scale
def test_can_optimise_model_with_dense_and_sparse_params(self):
    optimizer_params = Params({"type": "dense_sparse_adam"})
    parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(model_parameters=parameters, params=optimizer_params)
    for instance in self.instances:
        instance.index_fields(self.vocab)
    GradientDescentTrainer(self.model, optimizer, SimpleDataLoader(self.instances, 2)).train()
def test_console_log_callback(self):
    total_instances = 1000
    batch_size = 25

    reader = FakeDatasetReader(total_instances, batch_size)
    data_loader = SimpleDataLoader.from_dataset_reader(reader, "fake_path", batch_size=batch_size)
    instances = list(data_loader.iter_instances())
    vocab = Vocabulary.from_instances(instances)
    data_loader.index_with(vocab)
    model = FakeModel(vocab)
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)
    trainer = GradientDescentTrainer(
        model,
        optimizer,
        data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        callbacks=[
            ConsoleLoggerCallback.from_params(
                Params({"should_log_inputs": True}),
                serialization_dir=self.TEST_DIR,
            )
        ],
    )
    trainer.train()
def test_trainer_can_log_batch_inputs(self):
    total_instances = 1000
    batch_size = 25

    reader = FakeDatasetReader(total_instances, batch_size)
    data_loader = SimpleDataLoader.from_dataset_reader(reader, "fake_path", batch_size=batch_size)
    instances = list(data_loader.iter_instances())
    vocab = Vocabulary.from_instances(instances)
    data_loader.index_with(vocab)
    model = FakeModel(vocab)
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)
    trainer = GradientDescentTrainer(
        model,
        optimizer,
        data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        callbacks=[
            TensorBoardCallback(
                serialization_dir=self.TEST_DIR,
                distribution_interval=2,
            )
        ],
    )
    trainer.train()
def test_get_inverse_hvp_lissa():
    vs = [torch.tensor([1.0, 1.0])]

    # create a fake model
    vocab = Vocabulary()
    params = torch.tensor([1, 2]).float()
    model = DummyBilinearModelForTestingIF(vocab, params)
    used_params = list(model.parameters())

    # create a fake instance: just a matrix
    A = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    fake_instance = Instance({"tensors": TensorField(A)})

    # wrap the fake instance into a data loader
    lissa_data_loader = SimpleDataLoader([fake_instance], batch_size=1, batches_per_epoch=1)

    inverse_hvp = get_inverse_hvp_lissa(
        vs=vs,
        model=model,
        used_params=used_params,
        lissa_data_loader=lissa_data_loader,
        damping=0.0,
        num_samples=1,
        scale=1.0,
    )

    # I tried to increase the recursion depth to actually approximate the inverse
    # Hessian-vector product, but I suspect that, due to the extremely small number of
    # data points, the algorithm doesn't work well on this toy example.
    ans = torch.tensor([-1.5, -4.5])
    assert torch.equal(inverse_hvp, ans)
def test_batch_of_entirely_empty_lists_works(self):
    instances = [self.empty_instance, self.empty_instance]

    model = DummyModel(self.vocab)
    model.eval()
    loader = SimpleDataLoader(instances, 2, vocab=self.vocab)
    batch = next(iter(loader))
    model.forward(**batch)
def build_data_loaders(
    config,
    train_data: List[Instance],
    dev_data: List[Instance],
    test_data: List[Instance],
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    train_loader = SimpleDataLoader(train_data, config.batch_size_for_train, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, config.batch_size_for_eval, shuffle=False)
    test_loader = SimpleDataLoader(test_data, config.batch_size_for_eval, shuffle=False)
    return train_loader, dev_loader, test_loader
def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()

    # Now finally we can iterate through batches.
    loader = SimpleDataLoader(instances, 3)
    loader.index_with(vocab)
    for i, batch in enumerate(loader):
        lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings["activations"][2], lm_embeddings["mask"]
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        assert lengths.tolist() == expected_lengths

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            assert numpy.allclose(
                top_layer_embeddings[k, : lengths[k], :].data.numpy(),
                expected_top_layer[k],
                atol=1.0e-6,
            )
def test_trainer_saves_models_at_specified_interval(self):
    data_loader = SimpleDataLoader(self.instances, 4)
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        checkpointer=Checkpointer(
            serialization_dir=self.TEST_DIR,
            model_save_interval=0.0001,
            num_serialized_models_to_keep=10,
        ),
    )
    trainer.train()

    # Now check the serialized files for models saved during the epoch.
    prefix = "model_state_epoch_*"
    file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
    epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1) for fname in file_names]
    # We should have checkpoints at the end of each epoch and during each, e.g.
    # [0.timestamp, 0, 1.timestamp, 1]
    assert len(epochs) == 4
    assert epochs[3] == "1"
    assert "." in epochs[0]

    # Now make certain we can restore from timestamped checkpoint.
    # To do so, remove the checkpoint from the end of epoch 1&2, so
    # that we are forced to restore from the timestamped checkpoints.
    for k in range(2):
        os.remove(os.path.join(self.TEST_DIR, "model_state_epoch_{}.th".format(k)))
        os.remove(os.path.join(self.TEST_DIR, "training_state_epoch_{}.th".format(k)))
    os.remove(os.path.join(self.TEST_DIR, "best.th"))

    restore_trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        checkpointer=Checkpointer(
            serialization_dir=self.TEST_DIR, model_save_interval=0.0001
        ),
    )
    epoch = restore_trainer._restore_checkpoint()
    assert epoch == 2
    # One batch per epoch.
    assert restore_trainer._batch_num_total == 2
def benchmark_xlmr_mdl():
    from allennlp.data import DataLoader
    from allennlp.training.util import evaluate

    xlmr = load_xlmr_coref_model()

    instances = xlmr.dataset_reader.load_dataset(testset)
    data_loader = SimpleDataLoader(instances, 1)
    data_loader.index_with(xlmr.model.vocab)

    start = time.time()

    metrics = evaluate(xlmr.model, data_loader)

    print('**XLM-R model**')
    print_speed_performance(start, num_sentences, num_tokens)
    print('Precision : ', metrics['coref_precision'])
    print('Recall : ', metrics['coref_recall'])
    print('F1 : ', metrics['coref_f1'])
    print('Mention Recall : ', metrics['mention_recall'])
# Note: in the original test this class is defined inside a test method, so `self`
# here refers to the enclosing test case (which provides `self.instances`).
class SlowDataLoader:
    data_loader = SimpleDataLoader(self.instances, batch_size=2)

    def __iter__(self):
        time.sleep(2.5)
        return iter(self.data_loader)

    def __len__(self):
        return len(self.data_loader)

    def set_target_device(self, _):
        pass
def test_sanity_check_callback(self):
    model_with_bias = FakeModelForTestingNormalizationBiasVerification(use_bias=True)
    inst = Instance({"x": TensorField(torch.rand(3, 1, 4))})
    data_loader = SimpleDataLoader([inst, inst], 2)
    trainer = GradientDescentTrainer(
        model_with_bias,
        self.optimizer,
        data_loader,
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
        callbacks=[SanityChecksCallback(serialization_dir=self.TEST_DIR)],
    )
    with pytest.raises(SanityCheckError):
        trainer.train()
def test_trainer_can_log_learning_rates_tensorboard(self):
    data_loader = SimpleDataLoader(self.instances, 4)
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        callbacks=[
            TensorBoardCallback(
                serialization_dir=self.TEST_DIR,
                summary_interval=2,
                should_log_learning_rate=True,
            )
        ],
    )
    trainer.train()
def test_regularization(self):
    penalty = self.model.get_regularization_penalty()
    assert penalty is None

    data_loader = SimpleDataLoader(self.instances, batch_size=32)
    trainer = GradientDescentTrainer(self.model, None, data_loader)  # optimizer,

    # You get a RuntimeError if you call `model.forward` twice on the same inputs.
    # The data and config are such that the whole dataset is one batch.
    training_batch = next(iter(data_loader))
    validation_batch = next(iter(data_loader))

    training_loss = trainer.batch_outputs(training_batch, for_training=True)["loss"].item()
    validation_loss = trainer.batch_outputs(validation_batch, for_training=False)["loss"].item()

    # Training loss should have the regularization penalty, but validation loss should not.
    numpy.testing.assert_almost_equal(training_loss, validation_loss)
def test_trainer_respects_epoch_size_smaller_than_total(self):
    batches_per_epoch = 1
    num_epochs = 2
    data_loader_smaller_epoch = SimpleDataLoader(
        self.instances,
        2,
        batches_per_epoch=batches_per_epoch,
    )
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_smaller_epoch,
        validation_data_loader=self.validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=self.TEST_DIR,
    )
    assert trainer._batch_num_total == 0
    metrics = trainer.train()
    epoch = metrics["epoch"]
    assert epoch == num_epochs - 1
    assert trainer._batch_num_total == num_epochs * batches_per_epoch
def test_sanity_check_default(self):
    model_with_bias = FakeModelForTestingNormalizationBiasVerification(use_bias=True)
    inst = Instance({"x": TensorField(torch.rand(3, 1, 4))})
    data_loader = SimpleDataLoader([inst, inst], 2)
    trainer = GradientDescentTrainer.from_partial_objects(
        model_with_bias,
        serialization_dir=self.TEST_DIR,
        data_loader=data_loader,
        num_epochs=1,
    )
    with pytest.raises(SanityCheckError):
        trainer.train()

    trainer = GradientDescentTrainer.from_partial_objects(
        model_with_bias,
        serialization_dir=self.TEST_DIR,
        data_loader=data_loader,
        num_epochs=1,
        run_sanity_checks=False,
    )

    # Check is not run, so no failure.
    trainer.train()
class SimpleInfluence(InfluenceInterpreter):
    """
    Registered as an `InfluenceInterpreter` with name "simple-influence".

    This goes through every example in the train set to calculate the influence score. It uses
    [LiSSA (Linear time Stochastic Second-Order Algorithm)](https://api.semanticscholar.org/CorpusID:10569090)
    to approximate the inverse of the Hessian used for the influence score calculation.

    # Parameters

    lissa_batch_size : `int`, optional (default = `8`)
        The batch size to use for LiSSA. According to
        [Koh, P.W., & Liang, P. (2017)](https://api.semanticscholar.org/CorpusID:13193974),
        it is better to use batched samples for approximation for better stability.

    damping : `float`, optional (default = `3e-3`)
        This is a hyperparameter for LiSSA: a damping term added in case the approximated
        Hessian (during LiSSA) has negative eigenvalues.

    num_samples : `int`, optional (default = `1`)
        This is a hyperparameter for LiSSA that determines how many rounds of the recursion
        process we would like to run for the approximation.

    recursion_depth : `Union[float, int]`, optional (default = `0.25`)
        This is a hyperparameter for LiSSA that determines the recursion depth we would like
        to go through. If a `float`, it means X% of the training examples. If an `int`, it
        means recurse for X times.

    scale : `float`, optional, (default = `1e4`)
        This is a hyperparameter for LiSSA to tune such that the Taylor expansion converges.
        It is applied to scale down the loss during LiSSA to ensure that `H <= I`, where `H`
        is the Hessian and `I` is the identity matrix. See footnote 2 of
        [Koh, P.W., & Liang, P. (2017)](https://api.semanticscholar.org/CorpusID:13193974).

    !!! Note
        We choose the same default values for the LiSSA hyperparameters as
        [Han, Xiaochuang et al. (2020)](https://api.semanticscholar.org/CorpusID:218628619).
    """

    def __init__(
        self,
        model: Model,
        train_data_path: DatasetReaderInput,
        train_dataset_reader: DatasetReader,
        *,
        test_dataset_reader: Optional[DatasetReader] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
        params_to_freeze: List[str] = None,
        cuda_device: int = -1,
        lissa_batch_size: int = 8,
        damping: float = 3e-3,
        num_samples: int = 1,
        recursion_depth: Union[float, int] = 0.25,
        scale: float = 1e4,
    ) -> None:
        super().__init__(
            model=model,
            train_data_path=train_data_path,
            train_dataset_reader=train_dataset_reader,
            test_dataset_reader=test_dataset_reader,
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            params_to_freeze=params_to_freeze,
            cuda_device=cuda_device,
        )

        self._lissa_dataloader = SimpleDataLoader(
            list(self._train_loader.iter_instances()),
            lissa_batch_size,
            shuffle=True,
            vocab=self.vocab,
        )
        self._lissa_dataloader.set_target_device(self.device)
        if isinstance(recursion_depth, float) and recursion_depth > 0.0:
            self._lissa_dataloader.batches_per_epoch = int(
                len(self._lissa_dataloader) * recursion_depth
            )
        elif isinstance(recursion_depth, int) and recursion_depth > 0:
            self._lissa_dataloader.batches_per_epoch = recursion_depth
        else:
            raise ValueError("'recursion_depth' should be a positive int or float")

        self._damping = damping
        self._num_samples = num_samples
        self._recursion_depth = recursion_depth
        self._scale = scale

    @overrides
    def _calculate_influence_scores(
        self, test_instance: Instance, test_loss: float, test_grads: Sequence[torch.Tensor]
    ) -> List[float]:
        # Approximate the inverse of Hessian-Vector Product through LiSSA
        inv_hvp = get_inverse_hvp_lissa(
            test_grads,
            self.model,
            self.used_params,
            self._lissa_dataloader,
            self._damping,
            self._num_samples,
            self._scale,
        )
        return [
            # dL_test * d theta as in 2.2 of [https://arxiv.org/pdf/2005.06676.pdf]
            # TODO (epwalsh): should we divide `x.grads` by `self._scale`?
            torch.dot(inv_hvp, _flatten_tensors(x.grads)).item()
            for x in Tqdm.tqdm(self.train_instances, desc="scoring train instances")
        ]
def run_training_loop():
    # The top of this function, which builds `dataset_reader` and the helper functions
    # (`read_data`, `build_vocab`, `build_model`, `build_data_loaders`, `build_trainer`),
    # is defined in the earlier setup from the AllenNLP guide and is elided here.
    train_data, dev_data = read_data(dataset_reader)

    vocab = build_vocab(train_data + dev_data)
    model = build_model(vocab)

    train_loader, dev_loader = build_data_loaders(train_data, dev_data)
    train_loader.index_with(vocab)
    dev_loader.index_with(vocab)

    # You obviously won't want to create a temporary file for your training
    # results, but for execution in binder for this guide, we need to do this.
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = build_trainer(model, serialization_dir, train_loader, dev_loader)
        trainer.train()

    return model, dataset_reader


# We've copied the training loop from an earlier example, with updated model
# code, above in the Setup section. We run the training loop to get a trained
# model.
model, dataset_reader = run_training_loop()

# Now we can evaluate the model on a new dataset.
test_data = list(dataset_reader.read("quick_start/data/movie_review/test.tsv"))
data_loader = SimpleDataLoader(test_data, 8)
data_loader.index_with(model.vocab)

results = evaluate(model, data_loader)
print(results)
class TaggerTrainer:
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)

        bert_token_indexers = PretrainedTransformerIndexer(model_name=self.config.model_name)
        reader = SequenceTaggingDatasetReader(token_indexers={"tokens": bert_token_indexers})
        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances, self.config.batch_size, shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances, self.config.batch_size, shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()

    def init_crf_model(self) -> Model:
        """Init the CRF tagger model."""
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={'tokens': bert_text_field_embedder}
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        tagger.to(device=self.config.device)
        return tagger

    def init_model(self) -> Model:
        """Build the model.

        Returns:
            Model: the final model
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={'tokens': bert_text_field_embedder}
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        tagger.to(device=self.config.device)
        return tagger

    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        optimizer = AdamOptimizer(parameters, lr=self.config.lr)  # type: ignore
        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer

    def train(self):
        self.trainer.train()
                                              vocab=vocab)  # closes a call whose opening is elided in this snippet
    tgt_char_encoder = TokenCharactersEncoder(
        embedding=tgt_char_embedding,
        encoder=GruSeq2VecEncoder(input_size=args.emb_dim, hidden_size=args.hid_dim),
    )
    src_embedders = BasicTextFieldEmbedder({
        "tokens": src_embedding,
        "character_tokens": src_char_encoder,
    })
    tgt_embedders = BasicTextFieldEmbedder({
        "tokens": tgt_embedding,
        "character_tokens": tgt_char_encoder,
    })

    train_loader = SimpleDataLoader.from_dataset_reader(
        reader=dataset_reader,
        data_path=args.train_file,
        batch_size=args.bs,
        shuffle=True,
    )
    train_loader.index_with(vocab)
    val_loader = SimpleDataLoader.from_dataset_reader(
        reader=dataset_reader,
        data_path=args.valid_file,
        batch_size=args.bs,
    )
    val_loader.index_with(vocab)

    model = create_seq2seqmodel(
        vocab,
        src_embedders=src_embedders,
        tgt_embedders=tgt_embedders,
        hidden_dim=args.hid_dim,
        max_decoding_steps=args.maxlen,
        device=device,
    )

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"The model has {count_parameters(model)} parameters.")

    save_dir = None
    if args.save:
def main(
    serialization_directory: str,
    device: int,
    data: str,
    prefix: str,
    domain: str = None,
):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device : int, default = -1
        The device to run the evaluation on.
    data : str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix : str, default = ""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain : str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(
        os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device
    )
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    dataset = list(dataset_reader.read(evaluation_data_path))

    with torch.autograd.no_grad():
        loader = SimpleDataLoader(dataset, 32)
        model_predictions: List[List[str]] = []
        for batch in Tqdm.tqdm(loader):
            batch = move_to_device(batch, device)
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(dataset, model_predictions):
            fields = instance.fields
            verb_index = fields["metadata"]["verb_index"]
            gold_tags = fields["metadata"]["gold_tags"]
            sentence = fields["metadata"]["words"]

            write_to_conll_eval_file(
                prediction_file, gold_file, verb_index, sentence, prediction, gold_tags
            )
        prediction_file.close()
        gold_file.close()
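# A hypothetical invocation of main() above; the directory and data paths are placeholders.
if __name__ == "__main__":
    main(
        serialization_directory="models/srl_model",  # must contain config.json and model.tar.gz
        device=-1,                                   # -1 runs the evaluation on CPU
        data="data/ontonotes/dev.conll",             # falls back to validation_data_path if empty
        prefix="dev",
        domain=None,                                 # or e.g. "bc" to filter to one OntoNotes domain
    )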
class TaggerTrainer:
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)

        bert_token_indexers = PretrainedTransformerIndexer(model_name=self.config.model_name)
        bert_tokenizer = PretrainedTransformerTokenizer(model_name=self.config.model_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": bert_token_indexers},
            tokenizer=bert_tokenizer,
        )
        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances, self.config.batch_size, shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances, self.config.batch_size, shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()

    def init_model(self) -> Model:
        """Build the model.

        Returns:
            Model: the final model
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = BasicClassifier(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={'tokens': bert_text_field_embedder}
            ),
            seq2vec_encoder=ClsPooler(embedding_dim=bert_text_field_embedder.get_output_dim()),
        )
        tagger.to(device=self.config.device)
        return tagger

    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        group_parameter_group = [
            (['_text_field_embedder.*'], {'lr': self.config.lr}),
            (['_classification_layer.*'], {'lr': self.config.classifier_lr}),
        ]
        optimizer = AdamOptimizer(
            parameters,
            parameter_groups=group_parameter_group,
            lr=self.config.lr,
        )  # type: ignore
        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer

    def train(self):
        self.trainer.train()