Exemplo n.º 1
0
def build_data_loaders(
    train_data: List[Instance],
    dev_data: List[Instance],
    batch_size: int = 8,
) -> Tuple[DataLoader, DataLoader]:
    """Wrap the training and validation instances in batched data loaders.

    Args:
        train_data: Instances for training; loaded with ``shuffle=True``.
        dev_data: Instances for validation; loaded with ``shuffle=False``.
        batch_size: Number of instances per batch, used for both loaders.
            Defaults to 8, the value that was previously hard-coded, so
            existing two-argument calls behave exactly as before.

    Returns:
        A ``(train_loader, dev_loader)`` tuple.
    """
    train_loader = SimpleDataLoader(train_data, batch_size, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, batch_size, shuffle=False)
    return train_loader, dev_loader
Exemplo n.º 2
0
    def __init__(self) -> None:
        """Parse the configuration and assemble data, vocabulary, model and trainer.

        The train, dev and test files named in the configuration are read
        with a transformer-based text classification reader. The vocabulary
        is built from the training instances only, and both the training and
        the validation loader are indexed with it.
        """
        self.config: Config = Config().parse_args(known_only=True)
        transformer_name = self.config.model_name

        # 1. build the dataset reader and read every split
        indexer = PretrainedTransformerIndexer(model_name=transformer_name)
        tokenizer = PretrainedTransformerTokenizer(model_name=transformer_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": indexer},
            tokenizer=tokenizer,
        )

        # NOTE(review): the test split is read here but not used anywhere in
        # this constructor.
        train_instances, dev_instances, test_instances = (
            list(reader.read(path))
            for path in (
                self.config.train_file,
                self.config.dev_file,
                self.config.test_file,
            )
        )

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. build the data loaders and index them with the vocabulary
        self.train_data_loader = SimpleDataLoader(
            train_instances, self.config.batch_size, shuffle=True)
        self.dev_data_loader = SimpleDataLoader(
            dev_instances, self.config.batch_size, shuffle=False)
        for loader in (self.train_data_loader, self.dev_data_loader):
            loader.index_with(self.vocab)

        # 3. build the model and the trainer
        self.model = self.init_model()
        self.trainer = self.init_trainer()
Exemplo n.º 3
0
def build_data_loaders(train_data: List[Instance], dev_data: List[Instance],
                       batch_size: int) -> Tuple[DataLoader, DataLoader]:
    """
    Build the training and validation data loaders, each serving batches of
    ``batch_size`` instances. The training loader is created with
    ``shuffle=True``, the validation loader with ``shuffle=False``.
    Adapted from https://guide.allennlp.org/training-and-prediction
    """
    return (
        SimpleDataLoader(train_data, batch_size, shuffle=True),
        SimpleDataLoader(dev_data, batch_size, shuffle=False),
    )
    def test_from_params_in_trainer(self):
        """Check that ``Trainer.from_params`` passes epoch settings to the scheduler."""
        # This is closer to an integration test than a unit test: it checks that several
        # pieces fit together. It lives here because the slanted triangular scheduler is
        # the one for which this wiring matters most.
        optim = self._get_optimizer()
        # Only the length of this list is examined by the logic under test, never its
        # contents, so placeholder values are sufficient.
        instances = [1] * 40

        def trainer_with_scheduler(**scheduler_overrides):
            # Build a brand-new configuration for every trainer, exactly as if the
            # whole dictionary had been written out again.
            scheduler_config = {
                "type": "slanted_triangular",
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            }
            scheduler_config.update(scheduler_overrides)
            return Trainer.from_params(
                model=self.model,
                optimizer=Lazy(lambda **kwargs: optim),
                serialization_dir=self.TEST_DIR,
                params=Params({
                    "num_epochs": 5,
                    "learning_rate_scheduler": scheduler_config,
                }),
                data_loader=SimpleDataLoader(instances, batch_size=10),
            )

        trainer = trainer_with_scheduler()
        scheduler = trainer._learning_rate_scheduler
        assert isinstance(scheduler, SlantedTriangular)

        # The point of this test: num_epochs must reach the scheduler, and
        # num_steps_per_epoch must be computed (40 instances / batches of 10) and passed
        # along too. This logic happens inside of `Trainer.from_partial_objects`.
        assert scheduler.num_epochs == 5
        assert scheduler.num_steps_per_epoch == 4

        # Finally, confirm that num_epochs given directly to the scheduler overrides the
        # trainer-level value. Probably never needed in practice; this only exercises the
        # functionality.
        trainer = trainer_with_scheduler(num_epochs=3)
        assert trainer._learning_rate_scheduler.num_epochs == 3
def build_data_loaders(
        config, train_data: List[Instance], dev_data: List[Instance],
        test_data: List[Instance]
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    """Create the train, dev and test data loaders.

    The training loader uses ``config.batch_size_for_train`` and
    ``shuffle=True``; the dev and test loaders both use
    ``config.batch_size_for_eval`` and ``shuffle=False``.

    Returns:
        A ``(train_loader, dev_loader, test_loader)`` tuple.
    """
    train_loader = SimpleDataLoader(
        train_data, config.batch_size_for_train, shuffle=True)

    # Both evaluation splits share the same batch size and ordering.
    eval_batch_size = config.batch_size_for_eval
    dev_loader, test_loader = (
        SimpleDataLoader(split, eval_batch_size, shuffle=False)
        for split in (dev_data, test_data)
    )

    return train_loader, dev_loader, test_loader
Exemplo n.º 6
0
 def test_batch_of_entirely_empty_lists_works(self):
     """Run the dummy model's forward pass on a batch made only of empty instances."""
     empty_pair = [self.empty_instance, self.empty_instance]
     dummy = DummyModel(self.vocab)
     dummy.eval()
     # A batch size of 2 puts both instances into the first batch.
     first_batch = next(iter(SimpleDataLoader(empty_pair, 2, vocab=self.vocab)))
     dummy.forward(**first_batch)
Exemplo n.º 7
0
 def test_can_optimise_model_with_dense_and_sparse_params(self):
     """Train with the ``dense_sparse_adam`` optimizer and expect no errors."""
     # Only parameters that require gradients are handed to the optimizer.
     trainable = [
         [name, param]
         for name, param in self.model.named_parameters()
         if param.requires_grad
     ]
     optimizer = Optimizer.from_params(
         model_parameters=trainable,
         params=Params({"type": "dense_sparse_adam"}),
     )
     # Index the instances by hand; the loader below is not given a vocabulary.
     for inst in self.instances:
         inst.index_fields(self.vocab)
     loader = SimpleDataLoader(self.instances, 2)
     trainer = GradientDescentTrainer(self.model, optimizer, loader)
     trainer.train()
Exemplo n.º 8
0
def test_get_inverse_hvp_lissa():
    """Compare ``get_inverse_hvp_lissa`` with a fixed expected value on a toy model."""
    vs = [torch.tensor([1.0, 1.0])]

    # Toy model: a bilinear dummy with two parameters and an empty vocabulary.
    weights = torch.tensor([1, 2]).float()
    model = DummyBilinearModelForTestingIF(Vocabulary(), weights)
    used_params = list(model.parameters())

    # The single fake instance carries nothing but a 2x2 matrix.
    matrix = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    instance = Instance({"tensors": TensorField(matrix)})

    # One instance, one batch, one batch per epoch.
    loader = SimpleDataLoader([instance], batch_size=1, batches_per_epoch=1)

    result = get_inverse_hvp_lissa(
        vs=vs,
        model=model,
        used_params=used_params,
        lissa_data_loader=loader,
        damping=0.0,
        num_samples=1,
        scale=1.0,
    )

    # Raising the recursion depth to really approximate the inverse Hessian vector
    # product was tried, but with so few data points the algorithm presumably does
    # not behave well on this toy example, so a fixed value is checked instead.
    expected = torch.tensor([-1.5, -4.5])
    assert torch.equal(result, expected)
Exemplo n.º 9
0
    def test_trainer_saves_models_at_specified_interval(self):
        """Check interval checkpoints are written during training and can be restored."""
        loader = SimpleDataLoader(self.instances, 4)
        # The tiny save interval is meant to trigger a checkpoint inside every epoch.
        frequent_checkpointer = Checkpointer(
            serialization_dir=self.TEST_DIR,
            model_save_interval=0.0001,
            num_serialized_models_to_keep=10,
        )
        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            checkpointer=frequent_checkpointer,
        )
        trainer.train()

        # Collect the epoch labels of every model checkpoint written during training.
        saved_models = sorted(
            glob.glob(os.path.join(self.TEST_DIR, "model_state_epoch_*")))
        label_pattern = re.compile(r"_([0-9\.\-]+)\.th")
        epochs = [label_pattern.search(path).group(1) for path in saved_models]

        # Expected: one mid-epoch and one end-of-epoch checkpoint per epoch, e.g.
        # [0.timestamp, 0, 1.timestamp, 1]
        assert len(epochs) == 4
        assert epochs[3] == "1"
        assert "." in epochs[0]

        # Delete the end-of-epoch checkpoints of both epochs (and the best model) so
        # that restoring has to fall back on the timestamped checkpoints.
        end_of_epoch_templates = (
            "model_state_epoch_{}.th",
            "training_state_epoch_{}.th",
        )
        for k in range(2):
            for template in end_of_epoch_templates:
                os.remove(os.path.join(self.TEST_DIR, template.format(k)))
        os.remove(os.path.join(self.TEST_DIR, "best.th"))

        restore_trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            self.data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                      model_save_interval=0.0001),
        )
        restored_epoch = restore_trainer._restore_checkpoint()
        assert restored_epoch == 2
        # One batch per epoch.
        assert restore_trainer._batch_num_total == 2
Exemplo n.º 10
0
        class SlowDataLoader:
            """Data-loader stand-in that pauses before every pass over the data.

            It wraps a ``SimpleDataLoader`` and exposes only ``__iter__``,
            ``__len__`` and ``set_target_device``.
            """

            # `self` is not defined inside this class body, so it is presumably the
            # `self` of the enclosing method, read once when the class is created.
            data_loader = SimpleDataLoader(self.instances, batch_size=2)

            def __iter__(self):
                """Sleep for 2.5 seconds, then iterate over the wrapped loader."""
                time.sleep(2.5)
                return iter(self.data_loader)

            def __len__(self):
                """Return the length reported by the wrapped loader."""
                return len(self.data_loader)

            def set_target_device(self, _):
                """Accept a target device and ignore it."""
                pass
Exemplo n.º 11
0
    def __init__(
        self,
        model: Model,
        train_data_path: DatasetReaderInput,
        train_dataset_reader: DatasetReader,
        *,
        test_dataset_reader: Optional[DatasetReader] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: List[str] = None,
        cuda_device: int = -1,
        lissa_batch_size: int = 8,
        damping: float = 3e-3,
        num_samples: int = 1,
        recursion_depth: Union[float, int] = 0.25,
        scale: float = 1e4,
    ) -> None:
        """Initialise the parent class and build the LiSSA data loader.

        The model, data and device arguments are forwarded unchanged to the
        parent constructor. A second, shuffled ``SimpleDataLoader`` over the
        training instances is then created with ``lissa_batch_size`` and
        moved to ``self.device``.

        ``recursion_depth`` sets that loader's ``batches_per_epoch``: a
        positive float is taken as a fraction of the loader's length
        (truncated to an int), a positive int as an absolute number of
        batches.

        Raises:
            ValueError: if ``recursion_depth`` is neither a positive float
                nor a positive int.
        """
        super().__init__(
            model=model,
            train_data_path=train_data_path,
            train_dataset_reader=train_dataset_reader,
            test_dataset_reader=test_dataset_reader,
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            params_to_freeze=params_to_freeze,
            cuda_device=cuda_device,
        )

        training_instances = list(self._train_loader.iter_instances())
        self._lissa_dataloader = SimpleDataLoader(
            training_instances,
            lissa_batch_size,
            shuffle=True,
            vocab=self.vocab,
        )
        lissa_loader = self._lissa_dataloader
        lissa_loader.set_target_device(self.device)

        # Translate `recursion_depth` into a number of batches per epoch.
        if isinstance(recursion_depth, float) and recursion_depth > 0.0:
            depth_in_batches = int(len(lissa_loader) * recursion_depth)
        elif isinstance(recursion_depth, int) and recursion_depth > 0:
            depth_in_batches = recursion_depth
        else:
            raise ValueError(
                "'recursion_depth' should be a positive int or float")
        lissa_loader.batches_per_epoch = depth_in_batches

        self._damping = damping
        self._num_samples = num_samples
        self._recursion_depth = recursion_depth
        self._scale = scale
Exemplo n.º 12
0
 def test_sanity_check_callback(self):
     """Expect ``SanityCheckError`` when training with ``SanityChecksCallback`` attached."""
     biased_model = FakeModelForTestingNormalizationBiasVerification(
         use_bias=True)
     instance = Instance({"x": TensorField(torch.rand(3, 1, 4))})
     # The same instance twice, served as a single batch of two.
     loader = SimpleDataLoader([instance, instance], 2)
     sanity_callback = SanityChecksCallback(serialization_dir=self.TEST_DIR)
     trainer = GradientDescentTrainer(
         biased_model,
         self.optimizer,
         loader,
         num_epochs=1,
         serialization_dir=self.TEST_DIR,
         callbacks=[sanity_callback],
     )
     with pytest.raises(SanityCheckError):
         trainer.train()
Exemplo n.º 13
0
    def test_elmo_bilm(self):
        """Compare the bi-LM's top-layer output with stored reference embeddings."""
        batch_size = 3

        # Raw data plus the model under test.
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        indexer = ELMoTokenCharactersIndexer()

        def to_instance(sentence):
            # One whitespace-split sentence becomes one instance with a single TextField.
            tokens = [Token(word) for word in sentence.split()]
            return Instance({"elmo": TextField(tokens, {"character_ids": indexer})})

        # Walk the sentences batch by batch so that instance order matches batch order.
        instances = [
            to_instance(sentence)
            for group in zip(*sentences)
            for sentence in group
        ]

        loader = SimpleDataLoader(instances, batch_size)
        loader.index_with(Vocabulary())
        for batch_index, batch in enumerate(loader):
            bilm_output = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                bilm_output["activations"][2], bilm_output["mask"])

            # The mask row sums must equal the token counts of the source sentences.
            lengths = mask.data.numpy().sum(axis=1)
            expected_lengths = [
                len(sentences[k][batch_index].split())
                for k in range(batch_size)
            ]
            assert lengths.tolist() == expected_lengths

            # Compare the unpadded part of each row with its reference embedding.
            expected_top_layer = [
                expected_lm_embeddings[k][batch_index]
                for k in range(batch_size)
            ]
            for k, expected in enumerate(expected_top_layer):
                actual = top_layer_embeddings[k, :lengths[k], :].data.numpy()
                assert numpy.allclose(actual, expected, atol=1.0e-6)
Exemplo n.º 14
0
    def test_trainer_can_log_learning_rates_tensorboard(self):
        """Train for two epochs with learning-rate logging enabled; no error expected."""
        loader = SimpleDataLoader(self.instances, 4)
        lr_logging_callback = TensorBoardCallback(
            serialization_dir=self.TEST_DIR,
            summary_interval=2,
            should_log_learning_rate=True,
        )
        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            callbacks=[lr_logging_callback],
        )

        trainer.train()
Exemplo n.º 15
0
    def test_regularization(self):
        """With no regularization penalty, training and validation loss must agree."""
        assert self.model.get_regularization_penalty() is None

        loader = SimpleDataLoader(self.instances, batch_size=32)
        # No optimizer is supplied (None); only `batch_outputs` is exercised below.
        trainer = GradientDescentTrainer(self.model, None, loader)

        # Calling `model.forward` twice on the same inputs raises a RuntimeError, so
        # each pass gets its own batch. The data and config are such that the whole
        # dataset is one batch.
        training_batch, validation_batch = (
            next(iter(loader)) for _ in range(2)
        )

        training_outputs = trainer.batch_outputs(training_batch, for_training=True)
        training_loss = training_outputs["loss"].item()
        validation_outputs = trainer.batch_outputs(validation_batch, for_training=False)
        validation_loss = validation_outputs["loss"].item()

        # Only the training loss could include a regularization penalty; the model has
        # none (asserted above), so the two losses are expected to match.
        numpy.testing.assert_almost_equal(training_loss, validation_loss)
Exemplo n.º 16
0
def benchmark_xlmr_mdl():
    """Evaluate the XLM-R coreference model on the test set and print the results.

    Reads ``testset``, ``num_sentences`` and ``num_tokens`` from the
    enclosing module. The timer starts after the model and the data are
    loaded, so the reported speed covers the evaluation call only.
    """

    # Import the loader class that is actually instantiated below; the
    # previous local import pulled in `DataLoader`, which was never used here.
    from allennlp.data.data_loaders import SimpleDataLoader
    from allennlp.training.util import evaluate

    xlmr = load_xlmr_coref_model()

    instances = xlmr.dataset_reader.load_dataset(testset)
    # Batch size 1: one instance per batch.
    data_loader = SimpleDataLoader(instances, 1)
    data_loader.index_with(xlmr.model.vocab)

    start = time.time()

    metrics = evaluate(xlmr.model, data_loader)

    print('**XLM-R model**')
    print_speed_performance(start, num_sentences, num_tokens)
    print('Precision : ', metrics['coref_precision'])
    print('Recall : ', metrics['coref_recall'])
    print('F1 : ', metrics['coref_f1'])
    print('Mention Recall : ', metrics['mention_recall'])
Exemplo n.º 17
0
 def test_trainer_respects_epoch_size_smaller_tnan_total(self):
     """Check the total batch count when ``batches_per_epoch`` limits each epoch to one batch."""
     num_epochs, batches_per_epoch = 2, 1
     short_epoch_loader = SimpleDataLoader(
         self.instances, 2, batches_per_epoch=batches_per_epoch)
     trainer = GradientDescentTrainer(
         self.model,
         self.optimizer,
         short_epoch_loader,
         validation_data_loader=self.validation_data_loader,
         num_epochs=num_epochs,
         serialization_dir=self.TEST_DIR,
     )

     # Nothing has been processed before training starts.
     assert trainer._batch_num_total == 0

     final_metrics = trainer.train()

     # Epochs are numbered from zero, so the last one is num_epochs - 1.
     assert final_metrics["epoch"] == num_epochs - 1
     assert trainer._batch_num_total == num_epochs * batches_per_epoch
Exemplo n.º 18
0
    def test_sanity_check_default(self):
        """Sanity checks fail training by default and are skipped when switched off."""
        model_with_bias = FakeModelForTestingNormalizationBiasVerification(use_bias=True)
        instance = Instance({"x": TensorField(torch.rand(3, 1, 4))})
        data_loader = SimpleDataLoader([instance, instance], 2)

        def make_trainer(**extra_options):
            # Both trainers share every setting except the extra options.
            return GradientDescentTrainer.from_partial_objects(
                model_with_bias,
                serialization_dir=self.TEST_DIR,
                data_loader=data_loader,
                num_epochs=1,
                **extra_options,
            )

        # Default settings: the sanity check runs and training fails.
        checked_trainer = make_trainer()
        with pytest.raises(SanityCheckError):
            checked_trainer.train()

        # Check is not run, so no failure.
        unchecked_trainer = make_trainer(run_sanity_checks=False)
        unchecked_trainer.train()
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    Run a trained model over evaluation data and write its predicted tags and
    the gold tags to two CoNLL-format evaluation files inside
    `serialization_directory` (`<prefix>_predictions.txt` and `<prefix>_gold.txt`).

    serialization_directory : str, required.
        The directory containing the serialized weights (`model.tar.gz`) and
        the experiment's `config.json`. The output files are written here too.
    device: int, required.
        The device to run the evaluation on.
    data: str, required.
        The data to evaluate on. If empty or None, the validation data path
        from the original experiment's config is used.
    prefix: str, required.
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
        The domain is also prepended to `prefix`.
    """
    config = Params.from_file(
        os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory,
                                        "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory,
                                        prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory,
                                  prefix + "_gold.txt")

    # Open both output files in a `with` block so they are closed even when
    # reading, prediction or writing raises. They are still opened before the
    # data is read, so an unwritable directory fails fast as before.
    with open(prediction_file_path, "w+") as prediction_file, \
            open(gold_file_path, "w+") as gold_file:
        # Load the evaluation data and index it.
        print("reading evaluation data from {}".format(evaluation_data_path))
        dataset = list(dataset_reader.read(evaluation_data_path))

        with torch.autograd.no_grad():
            loader = SimpleDataLoader(dataset, 32)
            # The loader was created without a vocabulary, so index the
            # instances with the model's vocabulary before batching them.
            loader.index_with(model.vocab)
            model_predictions: List[List[str]] = []
            for batch in Tqdm.tqdm(loader):
                batch = move_to_device(batch, device)
                result = model(**batch)
                predictions = model.decode(result)
                model_predictions.extend(predictions["tags"])

        # The loader is not shuffled, so predictions line up with `dataset`.
        for instance, prediction in zip(dataset, model_predictions):
            metadata = instance.fields["metadata"]
            write_to_conll_eval_file(prediction_file, gold_file,
                                     metadata["verb_index"],
                                     metadata["words"], prediction,
                                     metadata["gold_tags"])
Exemplo n.º 20
0
    train_data, dev_data = read_data(dataset_reader)

    vocab = build_vocab(train_data + dev_data)
    model = build_model(vocab)

    train_loader, dev_loader = build_data_loaders(train_data, dev_data)
    train_loader.index_with(vocab)
    dev_loader.index_with(vocab)

    # You obviously won't want to create a temporary file for your training
    # results, but for execution in binder for this guide, we need to do this.
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = build_trainer(model, serialization_dir, train_loader, dev_loader)
        trainer.train()

    return model, dataset_reader


# The training loop is copied from an earlier example, with updated model
# code, in the Setup section above. Running it here yields a trained model
# together with the dataset reader that was used to read its data.
model, dataset_reader = run_training_loop()

# Evaluate the trained model on a new dataset: read the test file with the
# same dataset reader, batch it (8 instances per batch) and index it with the
# model's vocabulary.
test_data = list(dataset_reader.read("quick_start/data/movie_review/test.tsv"))
data_loader = SimpleDataLoader(test_data, 8)
data_loader.index_with(model.vocab)

# `evaluate` runs the model over the loader; its result is printed as-is.
results = evaluate(model, data_loader)
print(results)