Example #1
    def test_from_params_in_trainer(self):
        # This is more of an integration test, making sure that a bunch of pieces fit together
        # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
        params = Params(
            {
                "num_epochs": 5,
                "learning_rate_scheduler": {
                    "type": "slanted_triangular",
                    "gradual_unfreezing": True,
                    "discriminative_fine_tuning": True,
                    "decay_factor": 0.5,
                },
            }
        )
        # The method called in the logic below only checks the length of this list, not its
        # contents, so this should be safe.
        instances = AllennlpDataset([1] * 40)
        optim = self._get_optimizer()
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=DataLoader(instances, batch_size=10),
        )
        assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

        # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
        # that num_steps_per_epoch is computed and passed correctly.  This logic happens inside of
        # `Trainer.from_partial_objects`.
        assert trainer._learning_rate_scheduler.num_epochs == 5
        assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

        # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
        # really want to.  Not sure why you would ever want to in this case; this is just testing
        # the functionality.
        params = Params(
            {
                "num_epochs": 5,
                "learning_rate_scheduler": {
                    "type": "slanted_triangular",
                    "num_epochs": 3,
                    "gradual_unfreezing": True,
                    "discriminative_fine_tuning": True,
                    "decay_factor": 0.5,
                },
            }
        )
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=DataLoader(instances, batch_size=10),
        )
        assert trainer._learning_rate_scheduler.num_epochs == 3
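
The test above relies on a `_get_optimizer` helper that is not shown. A minimal sketch of what such a helper might look like (hypothetical; it assumes `Optimizer` is imported from `allennlp.training.optimizers` and builds one parameter group per top-level module, so that `gradual_unfreezing` and `discriminative_fine_tuning` have layer groups to work with):

    def _get_optimizer(self, lr: float = 1.0):
        # Hypothetical helper: one parameter group per top-level module of the model.
        optimizer_params = Params({"type": "sgd", "lr": lr})
        optimizer_params["parameter_groups"] = [
            [[f"^{module}"], {}] for module in self.model._modules
        ]
        return Optimizer.from_params(
            model_parameters=self.model.named_parameters(), params=optimizer_params
        )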
Example #2
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if 'dataset_reader' in params:
        reader = DatasetReader.from_params(params.pop('dataset_reader'))
    else:
        raise RuntimeError('`dataset_reader` section is required')

    all_instances = []
    if 'train_data_path' in params:
        print('Reading the training data...')
        train_data = reader.read(params.pop('train_data_path'))
        all_instances.extend(train_data)
    else:
        raise RuntimeError('`train_data_path` section is required')

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    iterator = None
    if 'model' not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # Index the datasets with the vocabulary before wrapping them in data loaders.
        loader_params = params.pop("data_loader")
        train_data.index_with(vocab)
        train_data_loader = DataLoader.from_params(dataset=train_data,
                                                   params=loader_params.duplicate())
        dev_data_loader = None
        if validation_data is not None:
            validation_data.index_with(vocab)
            dev_data_loader = DataLoader.from_params(dataset=validation_data,
                                                     params=loader_params)

        # set up a temporary, empty directory for serialization
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(
                model=model,
                serialization_dir=serialization_dir,
                data_loader=train_data_loader,
                validation_data_loader=dev_data_loader,
                params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model
    }
Example #3
def create_trainer_for_finding_lr(
    pipeline: Pipeline,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    pipeline
        The pipeline with the model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    if hasattr(training_data, "index_with"):
        training_data.index_with(pipeline.backbone.vocab)

    trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer()))

    training_data_loader = create_dataloader(training_data,
                                             trainer_config.batch_size,
                                             trainer_config.data_bucketing)

    return Trainer.from_params(
        model=pipeline._model,
        data_loader=training_data_loader,
        params=trainer_params,
        serialization_dir=None,
    )
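
Example #3 (and Example #5 below) call a `create_dataloader` helper that is not shown. A rough sketch under the assumption that it wraps the AllenNLP 1.x `DataLoader` and `BucketBatchSampler`; the real biome-text implementation may differ:

from typing import Optional

# Assumed AllenNLP 1.x imports; the type names in the signature follow the examples above.
from allennlp.data import DataLoader
from allennlp.data.samplers import BucketBatchSampler


def create_dataloader(
    dataset: InstancesDataset,
    batch_size: int,
    data_bucketing: bool = False,
    batches_per_epoch: Optional[int] = None,
) -> DataLoader:
    """Build a data loader, optionally bucketing instances of similar length together."""
    if data_bucketing:
        return DataLoader(
            dataset,
            batch_sampler=BucketBatchSampler(dataset, batch_size=batch_size),
            batches_per_epoch=batches_per_epoch,
        )
    return DataLoader(dataset, batch_size=batch_size,
                      batches_per_epoch=batches_per_epoch)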
Example #4
 def test_reduce_on_plateau_and_metric_agree(self):
     # pylint: disable=protected-access
     for metric in ["+acc", "-loss"]:
         trainer_params = Params({
             "validation_metric": metric,
             "learning_rate_scheduler": {
                 "type": "reduce_on_plateau"
             },
             "optimizer": {
                 "type": "adam",
                 "lr": 0.01
             }
         })
         trainer = Trainer.from_params(model=self.model,
                                       serialization_dir=self.TEST_DIR,
                                       iterator=self.iterator,
                                       train_data=self.instances,
                                       validation_data=self.instances,
                                       params=trainer_params)
         if metric[0] == "+":
             correct_mode = "max"
             assert trainer._learning_rate_scheduler.lr_scheduler.mode == correct_mode
         else:
             correct_mode = "min"
             assert trainer._learning_rate_scheduler.lr_scheduler.mode == correct_mode
Example #5
def create_trainer_for_finding_lr(
    model: PipelineModel,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Returns an AllenNLP Trainer used for the learning rate scan.

    Parameters
    ----------
    model
        The underlying model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    trainer_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer()))

    training_data_loader = create_dataloader(training_data,
                                             trainer_config.batch_size,
                                             trainer_config.data_bucketing)

    return cast(
        "GradientDescentTrainer",
        Trainer.from_params(
            model=model,
            data_loader=training_data_loader,
            params=trainer_params,
            serialization_dir=None,
        ),
    )
Example #6
def get_trainer_from_config(
        config: Params,
        train_instances: List[Instance],
        val_instances: List[Instance],
        device: int,
        serialization_dir: Optional[str] = None) -> Trainer:
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    model_params = config.pop("model")
    vocab_dir = config.pop("vocab_dir", None)
    if vocab_dir is None:
        vocab = Vocabulary.from_instances(train_instances)
    else:
        vocab = Vocabulary.from_files(vocab_dir)
    model = Model.from_params(model_params, vocab=vocab)
    iterator = DataIterator.from_params(config.pop("iterator"))
    trainer_params["num_serialized_models_to_keep"] = 1
    iterator.index_with(vocab)
    trainer = Trainer.from_params(model=model,
                                  iterator=iterator,
                                  train_data=train_instances,
                                  validation_data=val_instances,
                                  serialization_dir=serialization_dir,
                                  params=trainer_params)
    return trainer
Example #7
    def setup_method(self):
        super().setup_method()
        params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "data_loader": {
                "batch_size": 2
            },
            "trainer": {
                "cuda_device": -1,
                "num_epochs": 2,
                "optimizer": "adam"
            },
        })
        all_data_loaders = data_loaders_from_params(params)
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            instances=(instance for data_loader in all_data_loaders.values()
                       for instance in data_loader.iter_instances()),
        )
        model = Model.from_params(vocab=vocab, params=params.pop("model"))

        data_loader = all_data_loaders["train"]
        data_loader.index_with(vocab)

        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR,
                                         "test_search_learning_rate")

        self.trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=data_loader,
            params=trainer_params,
            validation_data=None,
            validation_iterator=None,
        )
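
The serialization directory name above ("test_search_learning_rate") suggests this setup feeds a learning-rate search. A hypothetical test body, assuming `search_learning_rate` is imported from `allennlp.commands.find_learning_rate` (the assertions are illustrative, not taken from the original test):

    def test_search_learning_rate_runs(self):
        learning_rates, losses = search_learning_rate(self.trainer, num_batches=100)
        # One recorded loss per learning rate tried.
        assert len(learning_rates) == len(losses)
        assert len(losses) > 0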
Example #8
def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset +
                                          test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))
    # copy config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # imported here to ensure the reproducibility of the experiment
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model,
                                test_dataset,
                                iterator,
                                cuda_device=trainer_params.pop(
                                    "cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
Example #9
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params['dataset_reader'])
     self.iterator = DataIterator.from_params(params['iterator'])
     self.trainer = Trainer.from_params(self.model, self.TEST_DIR,
                                        self.iterator, self.dataset, None,
                                        params.get('trainer'))
Example #10
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     self.iterator = DataIterator.from_params(params["iterator"])
     self.trainer = Trainer.from_params(self.model, self.TEST_DIR,
                                        self.iterator, self.dataset, None,
                                        params.get("trainer"))
Example #11
    def _setup(self):
        """Setup the trainer components and local resources"""
        prepare_environment(
            Params({} if self._trainer_config.random_seed is None else {
                "random_seed": self._trainer_config.random_seed,
                "numpy_seed": self._trainer_config.random_seed,
                "pytorch_seed": self._trainer_config.random_seed,
            }))
        os.makedirs(self._output_dir, exist_ok=True)

        # We don't need to load pretrained weights from saved models
        if self._pipeline.config.features.word:
            self._pipeline.config.features.word.weights_file = None

        serialization_params = sanitize(self._allennlp_configuration())
        with open(os.path.join(self._output_dir, CONFIG_NAME),
                  "w") as param_file:
            json.dump(serialization_params, param_file, indent=4)

        self._pipeline.save_vocabulary(
            os.path.join(self._output_dir, "vocabulary"))

        for dataset in [self._training, self._validation, self._test]:
            if dataset and hasattr(dataset, "index_with"):
                dataset.index_with(self._pipeline.backbone.vocab)

        trainer_params = Params(
            helpers.sanitize_for_params(
                self._trainer_config.to_allennlp_trainer()))

        pipeline_model = self._pipeline._model

        training_data_loader = create_dataloader(
            self._training,
            self._trainer_config.batch_size,
            self._trainer_config.data_bucketing,
            self._trainer_config.batches_per_epoch,
        )

        validation_data_loader = (create_dataloader(
            self._validation,
            self._trainer_config.batch_size,
            self._trainer_config.data_bucketing,
        ) if self._validation else None)

        self._trainer = Trainer.from_params(
            model=pipeline_model,
            serialization_dir=self._output_dir,
            data_loader=training_data_loader,
            validation_data_loader=validation_data_loader,
            params=trainer_params,
            epoch_callbacks=self._epoch_callbacks,
        )
Example #12
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if "dataset_reader" in params:
        reader = DatasetReader.from_params(params.pop("dataset_reader"))
    else:
        raise RuntimeError("`dataset_reader` section is required")

    loader_params = params.pop("data_loader")
    train_data_loader = DataLoader.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataLoader.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print("Showing the first 10 instances:")
        for i, inst in enumerate(train_data_loader.iter_instances()):
            if i >= 10:
                break
            print(inst)
        return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # set up a temporary, empty directory for serialization
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }
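
A hypothetical invocation of `run_config` above in "dataset" mode (no "model" section), assuming a registered `sequence_tagging` dataset reader; the file paths are placeholders:

import json

preview_config = json.dumps({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "/path/to/train.tsv",
    "validation_data_path": "/path/to/dev.tsv",
    "data_loader": {"batch_size": 8},
})
# Builds the vocabulary and prints the first 10 training instances.
run_config(preview_config)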
Example #13
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     self.data_loader = DataLoader.from_params(dataset=self.instances,
                                               params=params["data_loader"])
     self.trainer = Trainer.from_params(
         model=self.model,
         data_loader=self.data_loader,
         serialization_dir=self.TEST_DIR,
         params=params.get("trainer"),
     )
Example #14
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params['dataset_reader'])
     self.iterator = DataIterator.from_params(params['iterator'])
     self.trainer = Trainer.from_params(
             self.model,
             self.TEST_DIR,
             self.iterator,
             self.dataset,
             None,
             params.get('trainer')
     )
Example #15
def get_trainer_from_config(config: Params,
                            train_instances: List[Instance],
                            val_instances: List[Instance],
                            vocab: Optional[Vocabulary] = None,
                            device: Optional[int] = -1) -> Trainer:
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    model_params = config.pop("model")
    vocab = vocab or Vocabulary.from_instances(train_instances)
    model = Model.from_params(model_params, vocab=vocab)
    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(vocab)
    trainer = Trainer.from_params(
        model=model,
        iterator=iterator,
        train_data=train_instances,
        validation_data=val_instances,
        serialization_dir=None,
        params=trainer_params)
    return trainer
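
A hypothetical way to drive `get_trainer_from_config` above, assuming the same legacy (pre-1.0) AllenNLP API the function itself uses; the config and data paths are placeholders:

config = Params.from_file("experiment.jsonnet")
reader = DatasetReader.from_params(config.pop("dataset_reader"))
train_instances = reader.read("train.tsv")
val_instances = reader.read("dev.tsv")

trainer = get_trainer_from_config(config, train_instances, val_instances, device=-1)
metrics = trainer.train()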
Example #16
 def test_mode_specified_in_reduce_on_plateau(self):
     # pylint: disable=protected-access
     for mode, metric in [("min", "-custom"), ("max", "+custom")]:
         trainer_params = Params({
             "validation_metric": metric,
             "learning_rate_scheduler": {
                 "type": "reduce_on_plateau",
                 "mode": mode
             },
             "optimizer": {
                 "type": "adam",
                 "lr": 0.01
             }
         })
         trainer = Trainer.from_params(model=self.model,
                                       serialization_dir=self.TEST_DIR,
                                       iterator=self.iterator,
                                       train_data=self.instances,
                                       validation_data=self.instances,
                                       params=trainer_params)
         assert trainer._learning_rate_scheduler.lr_scheduler.mode == mode
Example #17
 def test_mode_doesnt_agree_with_metric(self):
     # pylint: disable=protected-access
     for mode, metric in [("max", "-custom"), ("min", "+custom")]:
         trainer_params = Params({
             "validation_metric": metric,
             "learning_rate_scheduler": {
                 "type": "reduce_on_plateau",
                 "mode": mode
             },
             "optimizer": {
                 "type": "adam",
                 "lr": 0.01
             }
         })
         with self.assertLogs(logger="allennlp.training.util",
                              level="WARNING"):
             # we warn when the metric and the mode don't agree
             trainer = Trainer.from_params(model=self.model,
                                           serialization_dir=self.TEST_DIR,
                                           iterator=self.iterator,
                                           train_data=self.instances,
                                           validation_data=self.instances,
                                           params=trainer_params)
         assert trainer._learning_rate_scheduler.lr_scheduler.mode == mode
Example #18
def train_model(data_path,
                params,
                serialization_dir,
                cuda_device=-1,
                use_validation_data=True):

    os.makedirs(serialization_dir, exist_ok=True)

    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    ds_params = params.pop('dataset_reader', {})
    data_params = ds_params.pop('data', {})
    dataset_reader = CMVReader.from_params(ds_params)

    logger.info('Reading training data...')

    train_data = dataset_reader.read('train', **data_params)

    #train_data_response_only_for_vocab = dataset_reader.read('train', response_only=True)
    #all_datasets = [train_data_response_only_for_vocab]
    all_datasets = [train_data]
    datasets_in_vocab = ['train']  #_response_only_for_vocab']

    if use_validation_data:
        logger.info('Reading validation data...')
        data_params['weakpoints_only'] = False
        validation_data = dataset_reader.read('val', **data_params)
        all_datasets.append(validation_data)
        datasets_in_vocab.append('val')
    else:
        validation_data = None

    logger.info('Creating a vocabulary using %s data.',
                ', '.join(datasets_in_vocab))
    vocab_params = params.pop('vocabulary', {})
    dataset = None
    if 'directory_path' not in vocab_params:
        dataset = Batch([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ])

    vocab = Vocabulary.from_params(vocab_params, dataset)
    vocab.save_to_files(os.path.join(serialization_dir, 'vocabulary'))

    iterator = DataIterator.from_params(params.pop('iterator'))

    cmv_predictor_params = params.pop('cmv_predictor')
    predictor_pretrained_params = cmv_predictor_params.pop(
        'predictor_pretrained_params', None)
    cmv_predictor = Model.from_params(params=cmv_predictor_params, vocab=vocab)
    model_state = torch.load(predictor_pretrained_params['filename'],
                             map_location=util.device_mapping(
                                 predictor_pretrained_params['cuda_device']))
    cmv_predictor.load_state_dict(model_state)

    if params.pop('shared_embedder', False):
        print('using shared embedder')
        document_embedder = HierarchicalDocumentEmbedder(
            vocab, cmv_predictor._response_embedder,
            cmv_predictor._response_word_attention,
            cmv_predictor._response_encoder)
    else:
        document_embedder = Model.from_params(
            params=params.pop('document_embedder'), vocab=vocab)
    cmv_extractor = Model.from_params(params=params.pop('cmv_extractor'))
    cmv_discriminator = Model.from_params(
        params=params.pop('cmv_discriminator'))

    cmv_actor_critic_params = params.pop('cmv_actor_critic', None)
    cmv_actor_critic = None
    if cmv_actor_critic_params is not None:
        cmv_actor_critic = Model.from_params(params=cmv_actor_critic_params)

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer", None)
    if trainer_params is not None:
        if cuda_device is not None:
            trainer_params["cuda_device"] = cuda_device
        trainer = Trainer.from_params(cmv_predictor, serialization_dir,
                                      iterator, train_data, validation_data,
                                      trainer_params)

    compress_response = params.pop('compress_response', False)

    generator_iterator = DataIterator.from_params(
        params.pop('generator_iterator'))
    cmv_actor_critic_trainer_params = params.pop('actor_critic_trainer', None)
    if cmv_actor_critic_trainer_params is not None:
        cmv_actor_critic_pretrainer = CMVActorCriticTrainer(
            document_embedder, cmv_predictor, cmv_extractor, cmv_actor_critic,
            cmv_actor_critic_trainer_params.pop('train_predictor', False),
            cmv_actor_critic_trainer_params.pop('train_fake_predictor', False),
            compress_response)

        cmv_actor_critic_serialization_dir = os.path.join(
            serialization_dir, 'actor_critic')

        cmv_actor_critic_trainer = Trainer.from_params(
            cmv_actor_critic_pretrainer, cmv_actor_critic_serialization_dir,
            generator_iterator, train_data, validation_data,
            cmv_actor_critic_trainer_params)
    else:
        ac_pretrained_params = params.pop('pretrained_actor_critic', None)
        if ac_pretrained_params is not None:
            cmv_actor_critic_pretrainer = CMVActorCriticTrainer(
                document_embedder, cmv_predictor, cmv_extractor, None)
            model_state = torch.load(ac_pretrained_params['filename'],
                                     map_location=util.device_mapping(
                                         ac_pretrained_params['cuda_device']))
            cmv_actor_critic_pretrainer.load_state_dict(model_state)
            document_embedder = cmv_actor_critic_pretrainer._document_embedder
            cmv_predictor = cmv_actor_critic_pretrainer._cmv_predictor
            cmv_extractor = cmv_actor_critic_pretrainer._cmv_extractor

    generator = CMVGeneratorTrainer(
        document_embedder,
        cmv_predictor,
        cmv_extractor,
        cmv_discriminator,
        cmv_actor_critic,
        update_extractor=True,  #cmv_actor_critic_trainer_params is None,
        update_gold_extractor=False,  #True,
        compress_response=compress_response)  #False)

    discriminator = CMVDiscriminatorTrainer(document_embedder, cmv_predictor,
                                            cmv_extractor, cmv_discriminator,
                                            compress_response)

    generator_serialization_dir = os.path.join(serialization_dir, 'generator')
    os.makedirs(generator_serialization_dir, exist_ok=True)
    generator_trainer = GANTrainer.from_params(generator,
                                               generator_serialization_dir,
                                               generator_iterator, train_data,
                                               validation_data,
                                               params.pop('generator_trainer'))

    discriminator_serialization_dir = os.path.join(serialization_dir,
                                                   'discriminator')
    os.makedirs(discriminator_serialization_dir, exist_ok=True)
    discriminator_trainer = GANTrainer.from_params(
        discriminator, discriminator_serialization_dir, iterator, train_data,
        validation_data, params.pop('discriminator_trainer'))

    #first train predictor for N steps
    if trainer_params is not None:
        trainer._num_epochs = 5  #hacky
        trainer.train()

    #TODO? then train actor critic for M steps
    #if we are using separate predictors, use the full CMV to train the extractor based on maximizing persuasiveness prediction
    if cmv_actor_critic_trainer_params is not None:
        cmv_actor_critic_trainer.train()

    #then alternate training between discriminator and generator for E epochs
    generator_trainer._num_epochs = 1  #hacky
    discriminator_trainer._num_epochs = 1  #hacky
    gan_epochs = params.pop("gan_epochs")
    for i in range(gan_epochs):
        discriminator_trainer.train()
        generator_trainer.train()
        discriminator_trainer._num_epochs += 1  #very hacky
        generator_trainer._num_epochs += 1  #also hacky

        #if cmv_actor_critic_trainer_params is not None:
        #    cmv_actor_critic_trainer._num_epochs += 1
        #    cmv_actor_critic_trainer.train()

    # Now tar up results
    archive_model(serialization_dir)
    archive_model(generator_serialization_dir)
    archive_model(discriminator_serialization_dir)

    return generator
Example #19
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point for running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any PyTorch or
    numpy code that affects your experiment's reproducibility before you import
    and use this function; these libraries rely on random seeds, which can be
    set here via the JSON specification file. Note that this function performs
    training and will also evaluate the trained model on the development and
    test sets if they are provided in the parameter JSON.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop(
                                     "sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(
                                     ds_params.pop('token_indexers', {})),
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
Example #20
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        If True, increase the learning rate linearly; otherwise exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)


    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError("currently find-learning-rate only works with the default Trainer")
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
Example #21
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        If True, increase the learning rate linearly; otherwise exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss by a
        multiple of the stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
Example #22
if 'vocabulary' in params:
    vocab_params = params['vocabulary']
    # list.extend() returns None, so concatenate the instance lists instead.
    vocab = Vocabulary.from_params(
        params=vocab_params, instances=train_instances + valid_instances)
else:
    vocab = Vocabulary.from_instances(train_instances + valid_instances)
dep_model = Model.from_params(vocab=vocab, params=params['model'])
print(dep_model)
iterator = DataIterator.from_params(params.pop("iterator"))
iterator.index_with(vocab)
train_dataset = Batch(train_instances)
train_dataset.index_instances(vocab)

valid_dataset = Batch(valid_instances)
valid_dataset.index_instances(vocab)

# dep_trainer = Trainer(dep_model, dep_file_path, dep_iterator, dep_train_data, dep_valid_data)
trainer_params = params.pop("trainer")

trainer = Trainer.from_params(model=dep_model,
                              serialization_dir='',
                              iterator=iterator,
                              train_data=train_dataset,
                              validation_data=valid_dataset,
                              params=trainer_params,
                              validation_iterator=iterator)

metrics = trainer.train()

archive_model('data/output')
Example #23
def train_model(params: Union[Params, Dict[str, Any]], cuda_device: int,
                serialization_dir: str, filtering: str) -> Model:
    """
    This function can be used as an entry point for running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running any PyTorch or
    numpy code that affects your experiment's reproducibility before you import
    and use this function; these libraries rely on random seeds, which can be
    set here via the JSON specification file. Note that this function performs
    training and will also evaluate the trained model on the development and
    test sets if they are provided in the parameter JSON.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    try:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout, True)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr, True)  # type: ignore
    except TypeError:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    read_settings = ds_params.pop('read_settings', {})
    dataset_reader = FEVERReader.from_params(ds_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(
        train_data_path,
        include_metadata=True,
        replace_with_gold=read_settings.pop('replace_gold', False),
        pad_with_nearest=read_settings.pop('pad_with_nearest', 0))

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path,
                                              include_metadata=True)
    else:
        validation_data = None

    vocab_params = params.pop("vocabulary", {})
    dataset = None
    print(dict(vocab_params), 'directory_path' not in vocab_params)
    assert ('directory_path' in vocab_params)
    vocab = Vocabulary.from_params(vocab_params, dataset)
    print(vocab)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
Example #24
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results.
    start_lr : `float`
        Learning rate at which to start the search.
    end_lr : `float`
        Learning rate up to which the search is done.
    num_batches : `int`
        Number of mini-batches to run the learning rate finder for.
    linear_steps : `bool`
        If True, increase the learning rate linearly; otherwise exponentially.
    stopping_factor : `float`
        Stop the search when the current loss exceeds the best recorded loss by a
        multiple of the stopping factor. If `None`, the search proceeds until `end_lr`.
    force : `bool`
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    create_serialization_dir(params,
                             serialization_dir,
                             recover=False,
                             force=force)

    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)
    distributed_params = params.params.get("distributed")
    # See https://github.com/allenai/allennlp/issues/3658
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for key, dataset in all_datasets.items()
                   for instance in dataset
                   if key in datasets_for_vocab_creation),
    )

    train_data = all_datasets["train"]
    train_data.index_with(vocab)
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = DataLoader.from_params(dataset=train_data,
                                         params=params.pop("data_loader"))

    trainer_params = params.pop("trainer")

    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "gradient_descent")
    if trainer_choice != "gradient_descent":
        raise ConfigurationError(
            "currently find-learning-rate only works with the GradientDescentTrainer"
        )
    trainer: GradientDescentTrainer = Trainer.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info("Finished learning rate search.")
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, "lr-losses.png"))
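
A hypothetical driver for `find_learning_rate_model` above, mirroring what the `allennlp find-lr` command does; the config path and output directory are placeholders:

params = Params.from_file("experiment.jsonnet")
find_learning_rate_model(
    params,
    serialization_dir="/tmp/find_lr_output",
    start_lr=1e-5,
    end_lr=10.0,
    num_batches=100,
    force=True,  # wipe the output directory if it already exists
)
# The search writes its plot to /tmp/find_lr_output/lr-losses.png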
Example #25
def train_model(data_path,
                params,
                serialization_dir,
                cuda_device=-1,
                use_validation_data=True):

    os.makedirs(serialization_dir, exist_ok=True)

    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    ds_params = params.pop('dataset_reader', {})
    data_params = ds_params.pop('data', {})
    dataset_reader = CMVReader.from_params(ds_params)
    '''
    dataset_reader = CMVReader(data_path,
                               tokenizer=Tokenizer.from_params(ds_params.pop('tokenizer', {})),
                               token_indexers=TokenIndexer.from_params(ds_params.pop('token_indexers', {})))
    '''

    logger.info('Reading training data...')

    train_data = dataset_reader.read('train', **data_params)

    #train_data_response_only_for_vocab = dataset_reader.read('train', response_only=True)
    #train_data_op_only_for_vocab = dataset_reader.read('train', op_only=True)
    #all_datasets = [train_data_response_only_for_vocab, train_data_op_only_for_vocab]
    all_datasets = [train_data]
    datasets_in_vocab = ['train']  #_response_only_for_vocab']

    if use_validation_data:
        logger.info('Reading validation data...')
        validation_data = dataset_reader.read('val', **data_params)
        all_datasets.append(validation_data)
        datasets_in_vocab.append('val')
    else:
        validation_data = None

    logger.info('Creating a vocabulary using %s data.',
                ', '.join(datasets_in_vocab))
    vocab_params = params.pop('vocabulary', {})
    dataset = None
    if 'directory_path' not in vocab_params:
        dataset = Batch([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ])

    vocab = Vocabulary.from_params(vocab_params, dataset)
    vocab.save_to_files(os.path.join(serialization_dir, 'vocabulary'))

    model = Model.from_params(params=params.pop('model'), vocab=vocab)
    iterator = DataIterator.from_params(params.pop('iterator'))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model