def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if 'dataset_reader' in params:
        reader = DatasetReader.from_params(params.pop('dataset_reader'))
    else:
        raise RuntimeError('`dataset_reader` section is required')

    all_instances = []
    if 'train_data_path' in params:
        print('Reading the training data...')
        train_data = reader.read(params.pop('train_data_path'))
        all_instances.extend(train_data)
    else:
        raise RuntimeError('`train_data_path` section is required')

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    iterator = None  # kept in the returned dict for backward compatibility; never populated here
    if 'model' not in params:
        # 'dataset' mode: just preview the first 10 instances
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # Index the datasets with the vocabulary *before* building the loaders,
        # and index the validation data too (the original only indexed train).
        train_data.index_with(vocab)
        if validation_data is not None:
            validation_data.index_with(vocab)

        # `from_params` consumes the params it is given, so each loader needs
        # its own copy.
        loader_params = params.pop("data_loader")
        train_data_loader = DataLoader.from_params(dataset=train_data,
                                                   params=loader_params.duplicate())
        dev_data_loader = None
        if validation_data is not None:
            dev_data_loader = DataLoader.from_params(dataset=validation_data,
                                                     params=loader_params.duplicate())

        # Set up a temporary, empty directory for serialization.
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(
                model=model,
                serialization_dir=serialization_dir,
                data_loader=train_data_loader,
                validation_data_loader=dev_data_loader,
                params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model
    }
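# A minimal sketch of driving `run_config` in its dataset-preview mode (no
# "model" section), assuming the AllenNLP imports used above are in scope; the
# reader type and data path below are hypothetical placeholders.
preview_config = json.dumps({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "/path/to/train.tsv",
})
components = run_config(preview_config)
print(components["vocab"])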
def __init__(self, config: Params, vocab: Vocabulary):
    super().__init__()
    # embed_size, hidden_size, vocab, dropout_rate = 0.2
    self.source_embedder = NMTEmbedder(vocab.get_vocab_size("char_src"), config.duplicate())
    self.target_embedder = NMTEmbedder(vocab.get_vocab_size("char_trg"), config.duplicate())
    self.hidden_size = config.pop("hidden_size")
    self.dropout_rate = config.pop("dropout_rate")
    self.vocab = vocab
    self.target_vocab_size = self.vocab.get_vocab_size("token_trg")
    self.device = config.pop("device")

    # Bidirectional encoder over source character embeddings.
    self.encoder = nn.LSTM(
        input_size=self.source_embedder.char_emb_size,
        hidden_size=self.hidden_size,
        bidirectional=True,
        bias=True,
        batch_first=True,
    )
    # Unidirectional decoder cell; its input is an embedding concatenated with
    # the previous combined output (this assumes source and target embedders
    # share the same `char_emb_size`).
    self.decoder = nn.LSTMCell(
        input_size=self.source_embedder.char_emb_size + self.hidden_size,
        hidden_size=self.hidden_size,
        bias=True,
    )
    # Project the bidirectional encoder's final states down to the decoder size.
    self.h_projection = nn.Linear(in_features=self.hidden_size * 2,
                                  out_features=self.hidden_size, bias=False)
    self.c_projection = nn.Linear(in_features=self.hidden_size * 2,
                                  out_features=self.hidden_size, bias=False)
    self.att_projection = nn.Linear(in_features=self.hidden_size * 2,
                                    out_features=self.hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(
        in_features=self.hidden_size * 3, out_features=self.hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(
        in_features=self.hidden_size,
        out_features=self.target_vocab_size,
        bias=False,
    )
    self.dropout = nn.Dropout(p=self.dropout_rate)
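# A hypothetical construction of the module above. `NMTEmbedder` and the class
# owning this `__init__` are defined elsewhere in this codebase; the
# "embed_size" key is an assumption taken from the commented-out line, and all
# sizes are illustrative.
config = Params({
    "embed_size": 256,
    "hidden_size": 512,
    "dropout_rate": 0.2,
    "device": "cpu",
})
# model = CharNMTModel(config, vocab)  # hypothetical class name, prebuilt Vocabulary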
class TestTrainerUtil(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.snli_file = str(self.FIXTURES_ROOT / "data" / "snli.jsonl")
        self.params = Params(
            {"dataset_reader": {"type": "snli"}, "train_data_path": self.snli_file}
        )
        self.cache_directory = str(self.FIXTURES_ROOT / "data_cache")

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.cache_directory)

    def test_datasets_from_params_uses_caching_correctly_in_simplest_case(self):
        # We'll rely on the dataset reader tests to be sure of the functionality of this caching;
        # we're just checking here that things get hooked up correctly to the right spots.
        cache_prefix = "prefix"
        _ = util.datasets_from_params(self.params.duplicate(), self.cache_directory, cache_prefix)

        expected_cache_file = (
            f"{self.cache_directory}/{cache_prefix}/{flatten_filename(self.snli_file)}"
        )
        expected_param_file = f"{self.cache_directory}/{cache_prefix}/params.json"
        assert os.path.exists(expected_cache_file)
        assert os.path.exists(expected_param_file)
        with open(expected_param_file, "r") as param_file:
            saved_params = json.load(param_file)
        assert saved_params == self.params.pop("dataset_reader").as_dict(quiet=True)

    def test_datasets_from_params_uses_caching_correctly_with_hashed_params(self):
        # We'll rely on the dataset reader tests to be sure of the functionality of this caching;
        # we're just checking here that things get hooked up correctly to the right spots.
        _ = util.datasets_from_params(self.params, self.cache_directory)

        cache_prefix = util._dataset_reader_param_hash(Params({"type": "snli"}))
        expected_cache_file = (
            f"{self.cache_directory}/{cache_prefix}/{flatten_filename(self.snli_file)}"
        )
        assert os.path.exists(expected_cache_file)
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training.
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        if params.get("vocabulary", Params({})).get("type", "") != "from_files":
            vocab = training_util.make_vocab_from_params(params.duplicate(), serialization_dir)
            params["vocabulary"] = {
                "type": "from_files",
                "directory": os.path.join(serialization_dir, "vocabulary"),
                "padding_token": vocab._padding_token,
                "oov_token": vocab._oov_token,
            }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                include_package,
                batch_weight_key,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir)
        model = Model.load(params, serialization_dir)
        return model
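# A small usage sketch for the function above, assuming an experiment config on
# disk; both paths are placeholders.
params = Params.from_file("/path/to/experiment.jsonnet")
best_model = train_model(
    params,
    serialization_dir="/path/to/output",
    force=True,
)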
def train_model(
    params: Params,
    serialization_dir: Union[str, PathLike],
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Trains the model specified in the given [`Params`](../common/params.md#params) object, using
    the data and training parameters also specified in that object, and saves the results in
    `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    recover : `bool`, optional (default=`False`)
        If `True`, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see `Model.from_archive`.
    force : `bool`, optional (default=`False`)
        If `True`, we will overwrite the serialization directory if it already exists.
    node_rank : `int`, optional
        Rank of the current node in distributed training.
    include_package : `List[str]`, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            include_package=include_package,
            dry_run=dry_run,
            file_friendly_logging=file_friendly_logging,
        )
        if not dry_run:
            archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data loaders in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        if recover:
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir, print_statistics=dry_run
            )
        params["vocabulary"] = {
            "type": "from_files",
            "directory": vocab_dir,
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                include_package,
                dry_run,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
                file_friendly_logging,
            ),
            nprocs=num_procs,
        )

        if dry_run:
            return None
        else:
            archive_model(serialization_dir)
            model = Model.load(params, serialization_dir)
            return model
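# The distributed branch above is driven entirely by a top-level "distributed"
# section in the experiment config. A sketch of that section, showing every key
# the code pops (device ids are illustrative; address and port show the defaults):
#
#     "distributed": {
#         "cuda_devices": [0, 1],
#         "num_nodes": 1,
#         "master_address": "127.0.0.1",
#         "master_port": 29500,
#     }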
def test_train_number_of_steps(self):
    number_of_epochs = 2
    last_num_steps_per_epoch: Optional[int] = None

    @LearningRateScheduler.register("mock")
    class MockLRScheduler(ExponentialLearningRateScheduler):
        def __init__(self, optimizer: torch.optim.Optimizer, num_steps_per_epoch: int):
            super().__init__(optimizer)
            nonlocal last_num_steps_per_epoch
            last_num_steps_per_epoch = num_steps_per_epoch

    batch_callback_counter = 0

    @BatchCallback.register("counter")
    class CounterBatchCallback(BatchCallback):
        def __call__(
            self,
            trainer: GradientDescentTrainer,
            batch_inputs: List[List[TensorDict]],
            batch_outputs: List[Dict[str, Any]],
            epoch: int,
            batch_number: int,
            is_training: bool,
            is_master: bool,
        ) -> None:
            nonlocal batch_callback_counter
            if is_training:
                batch_callback_counter += 1

    params = Params(
        {
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            },
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "test_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "evaluate_on_test": True,
            "data_loader": {"batch_size": 2},
            "trainer": {
                "num_epochs": number_of_epochs,
                "optimizer": "adam",
                "learning_rate_scheduler": {"type": "mock"},
                "batch_callbacks": ["counter"],
            },
        }
    )
    train_model(
        params.duplicate(), serialization_dir=os.path.join(self.TEST_DIR, "train_normal")
    )
    assert batch_callback_counter == last_num_steps_per_epoch * number_of_epochs
    batch_callback_counter = 0
    normal_steps_per_epoch = last_num_steps_per_epoch
    original_batch_size = params["data_loader"]["batch_size"]

    params["data_loader"]["batch_size"] = 1
    train_model(
        params.duplicate(), serialization_dir=os.path.join(self.TEST_DIR, "train_with_bs1")
    )
    assert batch_callback_counter == last_num_steps_per_epoch * number_of_epochs
    batch_callback_counter = 0
    assert normal_steps_per_epoch == math.ceil(last_num_steps_per_epoch / original_batch_size)

    params["data_loader"]["batch_size"] = original_batch_size
    params["trainer"]["num_gradient_accumulation_steps"] = 3
    train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "train_with_ga"))
    assert batch_callback_counter == last_num_steps_per_epoch * number_of_epochs
    batch_callback_counter = 0
    assert math.ceil(normal_steps_per_epoch / 3) == last_num_steps_per_epoch
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    node_rank: int = 0,
    include_package: List[str] = None,
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.
    node_rank : ``int``, optional
        Rank of the current node in distributed training.
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            recover=recover,
            cache_directory=cache_directory,
            cache_prefix=cache_prefix,
            include_package=include_package,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["WORLD_SIZE"] = str(world_size)

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        # Creating `Vocabulary` objects from workers could be problematic since the data iterators
        # in each worker will yield only `rank` specific instances. Hence it is safe to construct
        # the vocabulary and write it to disk before initializing the distributed context. The
        # workers will load the vocabulary from the path specified.
        make_vocab_from_params(params.duplicate(), serialization_dir)
        params["vocabulary"] = {
            "directory_path": os.path.join(serialization_dir, "vocabulary"),
            "extend": False,  # vocab extension would have been done above
        }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                recover,
                cache_directory,
                cache_prefix,
                include_package,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        model = Model.load(params, serialization_dir)
        return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model : ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params,  # pylint: disable=no-member
                                           serialization_dir,
                                           recover,
                                           cache_directory,
                                           cache_prefix)
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator)
    else:
        # Workaround to obtain the evaluation parts.
        pieces = TrainerPieces.from_params(params.duplicate(),  # pylint: disable=no-member
                                           serialization_dir,
                                           recover,
                                           cache_directory,
                                           cache_prefix)
        trainer = TrainerBase.from_params(params, serialization_dir, recover)

    # Both branches define `pieces`, so the evaluation parts can be pulled out here.
    evaluation_iterator = pieces.validation_iterator or pieces.iterator
    evaluation_dataset = pieces.test_dataset

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            trainer.model,
            evaluation_dataset,
            evaluation_iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key="")
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif evaluation_dataset:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
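# To exercise the evaluation branch above, the experiment config needs a test
# set plus the opt-in flag; a sketch (the path is a placeholder):
#
#     "test_data_path": "/path/to/test.tsv",
#     "evaluate_on_test": true,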
class ArchivalTest(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.params = Params(
            {
                "model": {
                    "type": "simple_tagger",
                    "text_field_embedder": {
                        "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                    },
                    "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
                "validation_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
                "data_loader": {"batch_size": 2},
                "trainer": {"num_epochs": 2, "optimizer": "adam", "cuda_device": -1},
            }
        )

    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = self.params.duplicate()
        params_dict_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "archive_test"
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        assert_models_equal(model, model2)

        assert isinstance(
            archive.dataset_reader,
            type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
        )
        # validation_dataset_reader is not in the config, so fall back to dataset_reader
        assert isinstance(
            archive.validation_dataset_reader,
            type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
        )

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_dict_copy

    def test_archive_model_uses_archive_path(self):
        serialization_dir = self.TEST_DIR / "serialization"
        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)
        # Use a new path.
        archive_model(
            serialization_dir=serialization_dir, archive_path=serialization_dir / "new_path.tar.gz"
        )
        archive = load_archive(serialization_dir / "new_path.tar.gz")
        assert archive

    def test_loading_serialization_directory(self):
        # copy params, since they'll get consumed during training
        params_dict_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "serialization"
        model = train_model(self.params, serialization_dir=serialization_dir)

        # load from the serialization directory itself
        archive = load_archive(serialization_dir)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_dict_copy

    def test_can_load_from_archive_model(self):
        serialization_dir = self.FIXTURES_ROOT / "basic_classifier" / "from_archive_serialization"
        archive_path = serialization_dir / "model.tar.gz"
        model = load_archive(archive_path).model

        # We want to be sure that we don't just not crash, but also be sure that we loaded the
        # right weights for the model. We'll do that by making sure that we didn't just load the
        # model that's in the `archive_path` of the config file, which is this one.
        base_model_path = self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz"
        base_model = load_archive(base_model_path).model
        base_model_params = dict(base_model.named_parameters())
        for name, parameters in model.named_parameters():
            if parameters.size() == base_model_params[name].size():
                assert not (parameters == base_model_params[name]).all()
            else:
                # In this case, the parameters are definitely different, no need for the above
                # check.
                pass

    def test_include_in_archive(self):
        self.params["include_in_archive"] = ["metrics_epoch_*.json"]

        serialization_dir = self.TEST_DIR / "serialization"
        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Assert that the additional targets were archived
        with tempfile.TemporaryDirectory() as tempdir:
            with tarfile.open(serialization_dir / "model.tar.gz", "r:gz") as archive:
                archive.extractall(tempdir)
            assert os.path.isfile(os.path.join(tempdir, "metrics_epoch_0.json"))
            assert os.path.isfile(os.path.join(tempdir, "metrics_epoch_1.json"))
            assert not os.path.isfile(os.path.join(tempdir, "metrics.json"))

    def test_invalid_include_in_archive(self):
        self.params["include_in_archive"] = [CONFIG_NAME]

        serialization_dir = self.TEST_DIR / "serialization"

        with pytest.raises(ConfigurationError) as exc:
            train_model(self.params, serialization_dir=serialization_dir)
        assert "are saved names and cannot be used" in str(exc.value)
def train_single(config: Params,
                 instances: List[Instance],
                 partially_labeled_instances: List[Instance],
                 reader: DatasetReader,
                 index: int,
                 cuda_device: int) -> Tuple[Dict[str, Any], Model]:
    instances = deepcopy(instances)
    partially_labeled_instances = deepcopy(partially_labeled_instances)
    config = deepcopy(config)

    # Hold out the instance at `index` for testing; shuffle the rest into a
    # 90/10 train/validation split.
    test_instance = instances[index]
    train_val_instances = instances[:index] + instances[index + 1:]
    random.shuffle(train_val_instances)
    num_train_instances = int(0.9 * len(train_val_instances))
    train_instances = train_val_instances[:num_train_instances]
    val_instances = train_val_instances[num_train_instances:]

    trainer = get_trainer_from_config(config.duplicate(),
                                      train_instances=train_instances,
                                      val_instances=val_instances,
                                      device=cuda_device)
    trainer.train()

    metric = trainer._validation_metric
    should_decrease = trainer._metric_tracker._should_decrease
    patience = trainer._metric_tracker._patience
    bad_epochs = -1
    num_epochs = 5

    model = trainer.model
    metrics = get_metrics(trainer)
    original_model = ModelData(metric=metrics[metric], weights=model.state_dict())
    best_model_data = ModelData(metric=metrics[metric], model=model)

    for _ in range(num_epochs):
        best_model = best_model_data.model
        # Re-label the partially labeled data with the current best model and
        # retrain on it.
        train_instances = get_training_data(partially_labeled_instances, best_model, reader)
        trainer = get_trainer_from_config(config.duplicate(),
                                          train_instances=train_instances,
                                          val_instances=val_instances,
                                          vocab=best_model.vocab,
                                          device=cuda_device)
        trainer.train()

        # Interpolate the new parameters with the original ones (0.1 new, 0.9 original).
        with torch.no_grad():
            for name, value in trainer.model.named_parameters():
                if name in original_model.weights and value.requires_grad:
                    value.mul_(0.1).add_(0.9 * original_model.weights[name])

        # Early stopping on the validation metric, respecting its direction.
        metrics = get_metrics(trainer)
        improved = (metrics[metric] < best_model_data.metric if should_decrease
                    else metrics[metric] > best_model_data.metric)
        if improved:
            best_model_data = ModelData(metric=metrics[metric], model=trainer.model)
            bad_epochs = 0
        else:
            bad_epochs += 1
        if bad_epochs == patience:
            break

    model = best_model_data.model
    model.eval()
    prediction = sanitize(model.forward_on_instance(test_instance))
    return prediction, model
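# A hedged driver sketch for the leave-one-out loop above; `config`, `reader`,
# `instances`, and `partially_labeled_instances` come from the surrounding
# training script, and the device id is illustrative.
predictions = []
for index in range(len(instances)):
    prediction, _ = train_single(config, instances, partially_labeled_instances,
                                 reader, index, cuda_device=0)
    predictions.append(prediction)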