def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    all_datasets = training_util.datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return TrainerPieces(model, iterator, train_data, validation_data, test_data,
                         validation_iterator, trainer_params)
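# A minimal usage sketch for the factory above, assuming from_params is exposed
# as a staticmethod of the TrainerPieces named tuple (as in AllenNLP). The
# config path and serialization directory are hypothetical; the tuple is
# unpacked positionally to match the return statement.
params = Params.from_file("experiments/my_experiment.json")  # hypothetical path
pieces = TrainerPieces.from_params(params, serialization_dir="output/run1")
(model, iterator, train_data, validation_data,
 test_data, validation_iterator, trainer_params) = pieces
trainer = Trainer.from_params(model, "output/run1", iterator,
                              train_data, validation_data, trainer_params)
metrics = trainer.train()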
def train(model_path, train_path, val_path, seed, vocabulary_path=None, config_path=None):
    assert os.path.isdir(model_path), "Model directory does not exist"
    set_seed(seed)

    config_path = config_path or os.path.join(model_path, "config.json")
    assert os.path.isfile(config_path), "Config file does not exist"
    params = Params.from_file(config_path)

    vocabulary_path = vocabulary_path or os.path.join(model_path, "vocabulary")
    assert os.path.exists(vocabulary_path), "Vocabulary is not ready, do not forget to run preprocess.py first"
    vocabulary = Vocabulary.from_files(vocabulary_path)

    reader_params = params.duplicate().pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    train_dataset = reader.read(train_path)
    val_dataset = reader.read(val_path) if val_path else None

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)

    trainer = Trainer.from_params(model, model_path, iterator,
                                  train_dataset, val_dataset, params.pop('trainer'))
    trainer.train()
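# A hedged example of calling train() above, assuming the layout the function
# falls back to: a config.json and a vocabulary/ directory inside model_path
# (the vocabulary produced by a prior preprocess.py run). All paths are
# hypothetical.
train(model_path="models/headline_generator",
      train_path="data/train.jsonl",
      val_path="data/val.jsonl",
      seed=42)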
def predict_json(self, _: JsonDict, cuda_device: int = -1) -> JsonDict:
    parameter_filename = 'allennlp/seq2seq.json'
    serialization_dir = 'retrained'
    subprocess.check_call(['mkdir', '-p', serialization_dir])
    params = Params.from_file(parameter_filename)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(self._model.vocab)

    parameters = [[n, p] for n, p in self._model.named_parameters() if p.requires_grad]
    trainer_params = params.pop('trainer')
    optimizer = Optimizer.from_params(parameters, trainer_params.pop("optimizer"))

    all_datasets = datasets_from_params(params)
    train_data = all_datasets['train']

    trainer = SimpleTrainer(self._model, optimizer, train_data, iterator)
    interpreter = Interpreter(self._model, self._dataset_reader, trainer)
    while True:
        try:
            interpreter.cmdloop()
        except Exception as e:
            print(e)
            traceback.print_exc()
            print('Restarting interpreter cmdloop.')
def train_model(train_fp: Path, dev_fp: Path, model_fp: Path,
                vocab_data_fps: Optional[List[Path]] = None) -> Tuple[Model, Params]:
    '''
    :param train_fp: The training dataset file path
    :param dev_fp: The development dataset file path
    :param model_fp: The JSON file that describes the model
    :param vocab_data_fps: An optional list of additional dataset files that
                           will be used to create the model's vocab
    :returns: A tuple containing the trained model and an object that
              describes the model.
    '''
    set_random_env()
    model_params = Params.from_file(model_fp)
    emotion_dataset_reader = DatasetReader.from_params(
        model_params.pop('dataset_reader'))

    # Data
    train_dataset = emotion_dataset_reader.read(cached_path(str(train_fp)))
    dev_dataset = emotion_dataset_reader.read(cached_path(str(dev_fp)))
    vocab_datasets = [train_dataset, dev_dataset]
    if vocab_data_fps:
        for vocab_data_fp in vocab_data_fps:
            vocab_datasets.append(
                emotion_dataset_reader.read(cached_path(str(vocab_data_fp))))
    vocab_data = []
    for vocab_dataset in vocab_datasets:
        vocab_data.extend(vocab_dataset)
    vocab = Vocabulary.from_instances(vocab_data)

    emotion_model = Model.from_params(vocab=vocab, params=model_params.pop('model'))
    data_iter = DataIterator.from_params(model_params.pop('iterator'))
    data_iter.index_with(vocab)

    # Trainer
    with tempfile.TemporaryDirectory() as serial_dir:
        trainer_params = model_params.pop('trainer')
        trainer = Trainer.from_params(model=emotion_model,
                                      serialization_dir=serial_dir,
                                      iterator=data_iter,
                                      train_data=train_dataset,
                                      validation_data=dev_dataset,
                                      params=trainer_params)
        _ = trainer.train()

        temp_config_fp = str(Path(serial_dir, CONFIG_NAME).resolve())
        Params.from_file(model_fp).to_file(temp_config_fp)
        vocab.save_to_files(Path(serial_dir, "vocabulary").resolve())
        archive_model(serial_dir, files_to_archive=model_params.files_to_archive)
        model_archive = load_archive(serial_dir, cuda_device=0)
        return model_archive.model, model_archive.config
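# A usage sketch for train_model() above; the paths are hypothetical. Note
# that load_archive(serial_dir, cuda_device=0) inside the function assumes a
# GPU is available.
emotion_model, emotion_config = train_model(
    Path("data/train.json"),
    Path("data/dev.json"),
    Path("configs/emotion_model.json"))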
def _test_model(self, file_name):
    params = self.params[file_name].duplicate()
    reader_params = params.duplicate().pop("reader", default=Params({}))
    if reader_params["type"] == "cnn_dailymail":
        reader_params["cnn_tokenized_dir"] = TEST_STORIES_DIR
        dataset_file = TEST_URLS_FILE
    elif reader_params["type"] == "ria":
        dataset_file = RIA_EXAMPLE_FILE
    else:
        assert False

    reader = DatasetReader.from_params(reader_params)
    tokenizer = reader._tokenizer
    dataset = reader.read(dataset_file)

    vocabulary_params = params.pop("vocabulary", default=Params({}))
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, None, iterator,
                                  dataset, None, params.pop('trainer'))
    trainer.train()

    model.eval()
    predictor = Seq2SeqPredictor(model, reader)
    for article, reference_sents in reader.parse_set(dataset_file):
        ref_words = [token.text for token in tokenizer.tokenize(reference_sents)]
        decoded_words = predictor.predict(article)["predicted_tokens"]
        self.assertGreaterEqual(len(decoded_words), len(ref_words))
        unk_count = 0
        while DEFAULT_OOV_TOKEN in decoded_words:
            unk_index = decoded_words.index(DEFAULT_OOV_TOKEN)
            decoded_words.pop(unk_index)
            unk_count += 1
            if unk_index < len(ref_words):
                ref_words.pop(unk_index)
        self.assertLess(unk_count, 5)
        self.assertListEqual(decoded_words[:len(ref_words)], ref_words)
def train(self, train_file_name: str, train_params: Params,
          serialization_dir: str = None, valid_file_name: str = None):
    assert os.path.exists(train_file_name)
    assert not valid_file_name or os.path.exists(valid_file_name)

    train_dataset = self.reader.read(train_file_name)
    valid_dataset = self.reader.read(valid_file_name) if valid_file_name else None

    iterator = DataIterator.from_params(train_params.pop('iterator'))
    iterator.index_with(self.vocab)

    trainer = Trainer.from_params(self.model, serialization_dir, iterator,
                                  train_dataset, valid_dataset,
                                  train_params.pop('trainer'))
    train_params.assert_empty("Trainer")
    return trainer.train()
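# A sketch of driving the train() method above, assuming a wrapper object
# (here called `tagger`, hypothetical) that already carries self.reader,
# self.vocab and self.model. The params hold exactly the two keys the method
# pops, so the final assert_empty check passes.
train_params = Params({
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 10, "optimizer": "adam"},
})
metrics = tagger.train("data/train.txt", train_params,
                       serialization_dir="runs/exp1",
                       valid_file_name="data/valid.txt")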
def train_model(params, serialization_dir, file_friendly_logging=False,
                recover=False, model="bidaf"):
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    """
    print("Starting training models...")
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    print("Loaded all of the datasets.")
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    print("Creating vocabulary...")
    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    if model == "self":
        model = BiDAFSelfAttention.from_params(vocab, params.pop("model"))
    else:
        model = BidirectionalAttentionFlow.from_params(vocab, params.pop("model"))
    print("Initialized a BiDAF model.")
    # This is for debugging.
    print(model)
    print(serialization_dir)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    print("Created the iterator.")

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    print("Initializing a trainer...")
    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
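# A hedged invocation of the BiDAF train_model() variant above, showing the
# `model` switch; the config path is hypothetical.
params = Params.from_file("configs/bidaf_self_attention.json")
model = train_model(params, "output/bidaf_self",
                    file_friendly_logging=True,
                    model="self")  # any other value builds the plain BiDAF model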
def train_model(params: Params,
                serialization_dir: str,
                results_fn: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Tuple[Model, Dict[str, Any]]:
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None
    held_out_iterator_params = params.pop("held_out_iterator", None)
    if held_out_iterator_params:
        held_out_iterator = DataIterator.from_params(held_out_iterator_params)
        held_out_iterator.index_with(vocab)
    else:
        held_out_iterator = None

    train_data = all_datasets['train']
    held_out_train_data = all_datasets.get('held_out_train')
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        held_out_train_data=held_out_train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator,
        held_out_iterator=held_out_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(results_dir, results_fn), metrics, log=True)

    return best_model, metrics
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
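# For reference, a minimal config exercising the keys this train_model() pops:
# dataset_reader and the data paths (consumed by datasets_from_params), then
# vocabulary, model, iterator, and trainer. The reader/model sub-configs are
# illustrative, not a tested experiment.
params = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/dev.tsv",
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {"tokens": {"type": "embedding", "embedding_dim": 50}},
        "encoder": {"type": "lstm", "input_size": 50, "hidden_size": 100},
    },
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 40, "optimizer": "adam"},
})
model = train_model(params, serialization_dir="output/tagger")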
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is
    ignored, if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine-tune, typically loaded from a saved archive that is the result of running
        the ``train`` command.
    params : ``Params``
        A parameter object specifying the fine-tuning experiment, including the training data path
        and, optionally, the validation data path.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. "
                       "Vocabulary from the saved model will be extended with current data.")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = model.vocab
    vocab.extend_from_instances(vocabulary_params,
                                (instance for key, dataset in all_datasets.items()
                                 for instance in dataset
                                 if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params, serialization_dir: str, selector: str, num_ensemble_models: Optional[int], file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model_params = params.pop('model') if selector == 'qbc': assert num_ensemble_models is not None models_list = [Model.from_params(vocab=vocab, params=model_params.duplicate()) for i in range(num_ensemble_models)] ensemble_model = CorefEnsemble(models_list) model = ensemble_model.submodels[0] else: model = Model.from_params(vocab=vocab, params=model_params) ensemble_model = None # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None held_out_iterator_params = params.pop("held_out_iterator", None) if held_out_iterator_params: held_out_iterator = DataIterator.from_params(held_out_iterator_params) held_out_iterator.index_with(vocab) else: held_out_iterator = None train_data = all_datasets['train'] held_out_train_data = all_datasets.get('held_out_train') validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if 
any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_choice = trainer_params.pop("type") trainer = ALCorefTrainer.by_name(trainer_choice).from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, held_out_train_data=held_out_train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator, held_out_iterator=held_out_iterator, ensemble_model=ensemble_model) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics, query_info = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) best_model = None logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0], batch_weight_key="", ) for key, value in test_metrics.items(): metrics["test_" + key] = value return best_model, metrics, query_info
def train_seq_models_reuse_iterator(base_filename, output_dir_base, gpu):
    processed_filenames = {}
    base_filename_prefix = base_filename[:base_filename.rfind('.')]
    filename_expression = (base_filename_prefix + '*' +
                           base_filename[base_filename.rfind('.'):])
    if os.path.isfile(base_filename):
        params_to_pull_iterator_from = Params.from_file(base_filename, "")
        params_for_copying = Params.from_file(base_filename, "")
    else:
        filename_to_pull = get_config_filenames_matching(filename_expression)[0]
        params_to_pull_iterator_from = Params.from_file(filename_to_pull, "")
        params_for_copying = Params.from_file(filename_to_pull, "")

    all_datasets = datasets_from_params(params_to_pull_iterator_from)
    datasets_for_vocab_creation = set(
        params_to_pull_iterator_from.pop("datasets_for_vocab_creation", all_datasets))
    vocab = Vocabulary.from_params(
        params_to_pull_iterator_from.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))

    iterator = DataIterator.from_params(params_to_pull_iterator_from.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params_to_pull_iterator_from.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    params_to_copy = ["validation_iterator", "vocabulary", "iterator",
                      "dataset_reader", "datasets_for_vocab_creation",
                      "train_data_path", "validation_data_path"]
    copied_param_vals = [(param_name, params_for_copying.pop(param_name, None))
                         for param_name in params_to_copy]

    while True:
        config_filenames = get_config_filenames_matching(filename_expression)
        cur_config_filename = None
        for config_filename in config_filenames:
            if config_filename not in processed_filenames:
                processed_filenames[config_filename] = 0
                cur_config_filename = config_filename
                break
        if cur_config_filename is None:
            break

        edit_config_file_to_have_gpu(cur_config_filename, gpu)
        progressing_ok = True
        try:
            params = Params.from_file(cur_config_filename, "")
        except Exception:
            print("Could not properly read params from " + cur_config_filename + "; skipping.")
            progressing_ok = False

        if progressing_ok:
            try:
                for param_tup in copied_param_vals:
                    modify_param(params, param_tup[0], param_tup[1])
            except Exception:
                print("Something went wrong while modifying params in " + cur_config_filename)
                progressing_ok = False

        if progressing_ok:
            print("Starting to train model from " + cur_config_filename)

        if progressing_ok:
            try:
                cur_config_filename = cur_config_filename[:cur_config_filename.rfind('.')]
                last_letters_to_take = len(cur_config_filename) - len(base_filename_prefix)
                if last_letters_to_take > 0:
                    tag_to_append_to_dir = cur_config_filename[(-1 * last_letters_to_take):]
                else:
                    tag_to_append_to_dir = ''
                serialization_dir = output_dir_base + tag_to_append_to_dir
            except Exception:
                progressing_ok = False
                print("Could not properly assemble a serialization directory")

        if progressing_ok:
            try:
                train_model_given_params_and_iterators(params, serialization_dir,
                                                       iterator, validation_iterator,
                                                       vocab, all_datasets, params_to_copy)
            except Exception:
                progressing_ok = False
                print("Training model failed for some reason; skipping to next model.")

    print("Done processing all config files.")
def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    # all_datasets = datasets_from_params(params)
    corpus = Corpus.from_params(params.pop('corpus'))

    # datasets_for_vocab_creation = set(params.pop(
    #     "datasets_for_vocab_creation", all_datasets))
    # for dataset in datasets_for_vocab_creation:
    #     if dataset not in all_datasets:
    #         raise ConfigurationError(
    #             f"invalid 'dataset_for_vocab_creation' {dataset}")
    # logger.info("From dataset instances, %s will be considered for vocabulary creation.",
    #             ", ".join(datasets_for_vocab_creation))

    seed = params.pop_int("seed", 5678)

    vocab_params = params.pop("vocabulary", {})
    vocab_type = vocab_params.get("type", "default")
    if vocab_type == 'default' and os.path.exists(
            os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(
            os.path.join(serialization_dir, "vocabulary"))
    elif vocab_type == 'empty':
        vocab = Vocabulary()
    else:
        seed_environment(seed)
        vocab = Vocabulary.from_params(vocab_params, corpus.train)

    # Need to reset the seed. Otherwise loading existing vocab and creating
    # vocab from scratch will lead to different behavior.
    seed_environment(seed)

    # contextualizer_params = params.pop('contextualizer')
    # contextualizer = Seq2SeqDecoder.from_params(
    #     vocab=vocab, params=contextualizer_params)
    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab()

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    # train_data = all_datasets['train']
    # validation_data = all_datasets.get('validation')
    # test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    batch_weight_key = params.pop('batch_weight_key', '')

    return TrainerPieces(model, iterator, corpus, validation_iterator,
                         batch_weight_key, trainer_params)
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
# files = ['./Final Task/Test/SemEval2017-task3-English-test-input.xml']
files = ['./Final Task/dev/SemEval2016-Task3-CQA-QL-dev.xml']
attn = "_cos_hyper"
calculate_map = True
write_file = False
Wfile_name = "out.txt"

import_submodules(library)
model_config = "config/%s_eval.jsonnet" % model_name
overrides = json.dumps({"trainer": {"cuda_device": cuda_device}})
params = Params.from_file(model_config, overrides)
model_file = 'checkpoint/%s%s/' % (model_name, attn)

iterator = DataIterator.from_params(params.pop("iterator"))

torch.manual_seed(0)
numpy.random.seed(0)

if write_file:
    wf = Write_outfile(Wfile_name)

print("Loading vocabulary")
vocab = Vocabulary.from_files(model_file + 'vocabulary')
print('Initializing model')
model = Model.from_params(vocab=vocab, params=params.pop('model'))
print("Loading Model file from %s" % (model_file + 'best.th'))
with open(model_file + 'best.th', 'rb') as f:
    model.load_state_dict(torch.load(f, encoding='utf-8'))
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the
        middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
def test_registry_has_builtin_iterators(self):
    assert DataIterator.by_name('adaptive').__name__ == 'AdaptiveIterator'
    assert DataIterator.by_name('basic').__name__ == 'BasicIterator'
    assert DataIterator.by_name('bucket').__name__ == 'BucketIterator'
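# The registry this test checks is extensible. A minimal sketch of adding a
# custom iterator under a new name, assuming the AllenNLP 0.x
# DataIterator.register decorator and the BasicIterator base class.
@DataIterator.register("my_custom")
class MyCustomIterator(BasicIterator):
    """Behaves exactly like BasicIterator; registered for illustration only."""

assert DataIterator.by_name("my_custom").__name__ == "MyCustomIterator"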
     for instance in dataset
     if key in datasets_for_vocab_creation)
)

model = Model.from_params(vocab=vocab, params=params.pop('model'))

# Initializing the model can have side effect of expanding the vocabulary
vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

selector = Selector.from_params(vocab, params.pop('selector'))

iterator = DataIterator.from_params(params.pop("iterator"))
iterator.index_with(vocab)
validation_iterator_params = params.pop("validation_iterator", None)
if validation_iterator_params:
    validation_iterator = DataIterator.from_params(validation_iterator_params)
    validation_iterator.index_with(vocab)
else:
    validation_iterator = None

train_data = all_datasets['train']
validation_data = all_datasets.get('validation')
test_data = all_datasets.get('test')

trainer_params = params.pop("trainer")
trainer = Trainer.from_params(model,
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is
    ignored, if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine-tune, typically loaded from a saved archive that is the result of running
        the ``train`` command.
    params : ``Params``
        A parameter object specifying the fine-tuning experiment, including the training data path
        and, optionally, the validation data path.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    if params.pop('vocabulary', None):
        logger.warning("You passed parameters for the vocabulary in your configuration file, but "
                       "we are ignoring them, using instead the vocabulary from the saved model.")

    vocab = model.vocab
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    all_datasets = datasets_from_params(params)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def __init__(self,
             params: Params,
             serialization_dir: str,
             recover: bool = False,
             cache_directory: Optional[str] = None,
             cache_prefix: Optional[str] = None):
    self.original_params = params.duplicate()
    self.main_serialization_dir = serialization_dir
    if recover or cache_directory or cache_prefix:
        raise NotImplementedError(
            f'Currently do not support `recover` {recover}, '
            f'`cache_directory` {cache_directory}, or '
            f'`cache_prefix` {cache_prefix}')

    task_order = params.get('trainer').pop('task_order')
    main_task = params.get('trainer').pop('main_task')
    self.task_order = task_order
    if main_task != task_order[-1]:
        raise ConfigurationError(
            f'main task {main_task} with `trainer` has'
            ' to be equal to the last task in the '
            f'`task_order` {task_order}')
    logger.warning(f"Main task {main_task}")
    logger.warning("Training tasks each epoch in the following order:")
    for task_index, task in enumerate(task_order):
        logger.warning(f'{task_index}: {task}')

    # Get shared iterator
    shared_values = params.pop('shared_values')
    iterator = DataIterator.from_params(shared_values.pop("iterator"))
    self.iterator = iterator

    # Get dataset information
    # task name: dataset split name: data
    all_task_data: Dict[str, Dict[str, List[Instance]]] = {}
    self.task_params = {}
    for task in task_order:
        logger.warning(f'Loading dataset for {task}')
        task_data = {}
        task_params = params.get(task)
        self.task_params[task] = task_params.duplicate()
        dataset_reader = DatasetReader.from_params(task_params.pop('dataset_reader'))
        task_data['train'] = dataset_reader.read(task_params.pop('train_data_path'))
        task_data['validation'] = dataset_reader.read(task_params.pop('validation_data_path'))
        task_data['test'] = dataset_reader.read(task_params.pop('test_data_path'))
        all_task_data[task] = task_data

    # Create the vocab
    logger.warning('Creating Vocab from all task data')
    all_instances = [data for task_data in all_task_data.values()
                     for data in task_data.values()]
    all_instances = list(itertools.chain.from_iterable(all_instances))
    vocab = Vocabulary.from_instances(all_instances)
    iterator.index_with(vocab)
    logger.warning('Iterator indexed')

    # Shared model parameters
    text_embedder = None
    if 'text_field_embedder' in shared_values:
        logger.warning('Creating shared text embedder')
        text_embedder_params = shared_values.pop('text_field_embedder')
        text_embedder = TextFieldEmbedder.from_params(params=text_embedder_params,
                                                      vocab=vocab)

    shared_encoder = None
    if 'shared_encoder' in shared_values:
        logger.warning('Creating shared Sequence Encoder')
        shared_encoder_params = shared_values.pop('shared_encoder')
        shared_encoder = Seq2SeqEncoder.from_params(params=shared_encoder_params)

    # Creating task specific models
    task_models: Dict[str, SharedCrfTagger] = {}
    for task in task_order:
        logger.warning(f'Creating shared model for task {task}')
        task_params = params.get(task)
        task_model_params = task_params.pop('model')
        task_text_embedder = None
        if text_embedder is not None:
            task_text_embedder = text_embedder
        if task_text_embedder is not None and "text_field_embedder" in task_model_params:
            raise ConfigurationError('Cannot have a shared text field '
                                     'embedder and a task specific one')
        if shared_encoder is not None and "shared_encoder" in task_model_params:
            raise ConfigurationError('Cannot have a shared encoder in shared_values '
                                     'and a task specific shared encoder')
        if "text_field_embedder" in task_model_params:
            task_text_embedder_params = task_model_params.pop('text_field_embedder')
            task_text_embedder = TextFieldEmbedder.from_params(
                params=task_text_embedder_params, vocab=vocab)
        if task_model_params.pop('type') != 'shared_crf_tagger':
            raise ConfigurationError('The SharedCRF tagger model is the '
                                     f'only supported model. Error task {task}')
        task_models[task] = SharedCrfTagger.from_params(
            vocab=vocab,
            text_field_embedder=task_text_embedder,
            shared_encoder=shared_encoder,
            params=task_model_params)

    # Task specific trainers
    task_trainers: Dict[str, Trainer] = {}
    self.task_serialization_dir = {}
    for task in task_order:
        logger.warning(f'Creating {task} trainer')
        task_serialization_dir = str(Path(serialization_dir, task))
        self.task_serialization_dir[task] = task_serialization_dir
        logger.warning(f'Task {task} serialization directory: {task_serialization_dir}')
        task_trainer_params = params.get(task).pop('trainer')
        task_train_data = all_task_data[task]['train']
        task_validation_data = all_task_data[task]['validation']
        task_model = task_models[task]
        task_trainers[task] = Trainer.from_params(
            params=task_trainer_params,
            model=task_model,
            serialization_dir=task_serialization_dir,
            iterator=iterator,
            validation_data=task_validation_data,
            train_data=task_train_data)

    # Getting task specific evaluation data
    self.task_cuda_evaluation = {}
    self.auxiliary_task_validation_data = {}
    self.all_task_test_data = {}
    for task in task_order:
        # If the task config does not say otherwise, evaluation on CUDA device 0 is assumed.
        self.task_cuda_evaluation[task] = 0
        if 'evaluate' in params.get(task):
            if 'cuda_device' in params.get(task).get('evaluate'):
                is_cuda = params.get(task).pop('evaluate')['cuda_device']
                self.task_cuda_evaluation[task] = is_cuda
        if task != main_task:
            self.auxiliary_task_validation_data[task] = all_task_data[task]['validation']
        self.all_task_test_data[task] = all_task_data[task]['test']

    # Remove all of the tasks from the params
    for task in task_order:
        params.pop(task)
    params.pop('trainer')
    params.assert_empty('MultiTaskTrainer')
    self.task_trainers = task_trainers
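# The config shape this multi-task __init__ consumes, inferred from the keys
# it pops; illustrative only, not a tested config:
#
# {
#   "trainer": {"task_order": ["ner", "pos"], "main_task": "pos"},  # main task last
#   "shared_values": {
#     "iterator": {...},                # required, shared across tasks
#     "text_field_embedder": {...},     # optional, shared text embedder
#     "shared_encoder": {...}           # optional, shared Seq2SeqEncoder
#   },
#   "ner": {
#     "dataset_reader": {...},
#     "train_data_path": "...", "validation_data_path": "...", "test_data_path": "...",
#     "model": {"type": "shared_crf_tagger", ...},  # the only supported type
#     "trainer": {...},
#     "evaluate": {"cuda_device": -1}   # optional; defaults to CUDA device 0
#   },
#   "pos": { ... same shape as "ner" ... }
# }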
def train_model(params: Params, serialization_dir: str) -> Model: """ This function can be used as an entry point to running models in AllenNLP directly from a JSON specification using a :class:`Driver`. Note that if you care about reproducibility, you should avoid running any code that uses PyTorch or numpy random number generation before you import and use this function, since these libraries rely on random seeds that are set here via the JSON specification file. Note that this function performs training and will also evaluate the trained model on development and test sets if provided in the parameter JSON. Parameters ---------- params: Params, required. A parameter object specifying an AllenNLP Experiment. serialization_dir: str, required The directory in which to save results and logs. """ prepare_environment(params) os.makedirs(serialization_dir, exist_ok=True) sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout) # type: ignore
sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr) # type: ignore
handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log")) handler.setLevel(logging.INFO) handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')) logging.getLogger().addHandler(handler) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file: json.dump(serialization_params, param_file, indent=4)
# Now we begin assembling the required parts for the Trainer.
dataset_reader = DatasetReader.from_params(params.pop('dataset_reader')) train_data_path = params.pop('train_data_path') logger.info("Reading training data from %s", train_data_path) train_data = dataset_reader.read(train_data_path) all_datasets: List[Dataset] = [train_data] datasets_in_vocab = ["train"] validation_data_path = params.pop('validation_data_path', None) if validation_data_path is not None: logger.info("Reading validation data from %s", validation_data_path) validation_data = dataset_reader.read(validation_data_path) all_datasets.append(validation_data) datasets_in_vocab.append("validation") else: validation_data = None test_data_path = params.pop("test_data_path", None) if test_data_path is not None: logger.info("Reading test data from %s", test_data_path) test_data = dataset_reader.read(test_data_path) all_datasets.append(test_data) datasets_in_vocab.append("test") else: test_data = None logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab)) vocab = Vocabulary.from_params(params.pop("vocabulary", {}), Dataset([instance for dataset in all_datasets for instance in dataset.instances])) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab, params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) train_data.index_instances(vocab) if validation_data: validation_data.index_instances(vocab) trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop("evaluate_on_test", False) params.assert_empty('base train command') trainer.train()
# Now tar up results
archive_model(serialization_dir) if test_data and evaluate_on_test: test_data.index_instances(vocab) evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device) # pylint: disable=protected-access
elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") return model
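# Minimal usage sketch for the train_model above. The config path and the
# serialization directory are illustrative placeholders, not from the source.
from allennlp.common.params import Params

params = Params.from_file("experiment.json")
model = train_model(params, serialization_dir="runs/exp1")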
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.get('trainer').get('cuda_device', -1)) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets, all_datasets_aux, all_datasets_aux2 = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) datasets_for_vocab_creation_aux = set(params.pop("auxiliary_datasets_for_vocab_creation", all_datasets_aux)) datasets_for_vocab_creation_aux2 = set(params.pop("auxiliary_datasets_for_vocab_creation_2", all_datasets_aux2)) mixing_ratio = params.pop_float("mixing_ratio") mixing_ratio2 = params.pop_float("mixing_ratio2") cutoff_epoch = params.pop("cutoff_epoch", -1) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab_instances_aux = [ instance for key, dataset in all_datasets_aux.items() for instance in dataset if key in datasets_for_vocab_creation_aux ] vocab_instances_aux.extend([ instance for key, dataset in all_datasets_aux2.items() for instance in dataset if key in datasets_for_vocab_creation_aux2 ]) vocab = VocabularyMultitask.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation), instances_aux=vocab_instances_aux ) model = Model.from_params(vocab=vocab, params=params.pop('model')) # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) iterator_aux = DataIterator.from_params(params.pop("iterator_aux")) iterator_aux.index_with(vocab) iterator_aux2 = DataIterator.from_params(params.pop("iterator_aux2")) iterator_aux2.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None # TODO: if validation in multi-task need to add validation iterator as above train_data = all_datasets.get('train') 
validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') train_data_aux = all_datasets_aux.get('train_aux') validation_data_aux = all_datasets_aux.get('validation_aux') test_data_aux = all_datasets_aux.get('test_aux') train_data_aux2 = all_datasets_aux2.get('train_aux') validation_data_aux2 = all_datasets_aux2.get('validation_aux') test_data_aux2 = all_datasets_aux2.get('test_aux') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = MultiTaskTrainer2.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, iterator_aux=iterator_aux, iterator_aux2=iterator_aux2, train_data=train_data, train_data_aux=train_data_aux, train_data_aux2=train_data_aux2, mixing_ratio=mixing_ratio, mixing_ratio2=mixing_ratio2, cutoff_epoch=cutoff_epoch, validation_data_aux=validation_data_aux, validation_data_aux2=validation_data_aux2, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) evaluate_aux_on_test = params.pop_bool("evaluate_aux_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") if test_data_aux and evaluate_aux_on_test:
# for instance in test_data_aux: instance.index_fields(vocab)
# for instance in test_data_aux2: instance.index_fields(vocab)
test_metrics_aux = evaluate(best_model, test_data_aux, iterator_aux, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
test_metrics_aux2 = evaluate(best_model, test_data_aux2, iterator_aux2, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
for key, value in test_metrics_aux.items(): metrics["test_aux_" + key] = value for key, value in test_metrics_aux2.items(): metrics["test_aux2_" + key] = value elif test_data_aux: logger.info("To evaluate on the auxiliary test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
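# Illustrative sketch of one common reading of a mixing ratio in multi-task
# training: for each main-task batch, interleave an auxiliary batch with
# probability `mixing_ratio` (for ratios <= 1). All names here are
# hypothetical; the actual scheduling lives inside MultiTaskTrainer2 and is
# not shown in the source.
import random
from itertools import cycle

def interleave_batches(main_batches, aux_batches, mixing_ratio):
    aux_iter = cycle(aux_batches)  # reuse auxiliary batches as needed
    for batch in main_batches:
        yield "main", batch
        if random.random() < mixing_ratio:
            yield "aux", next(aux_iter)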
def modified_train_model(serialization_dir, training_config_filename, cuda_device=-1, file_friendly_logging: bool = False) -> Model: """ Function not currently in use. This is from back when I was trying to keep each successive addition to the model's training in the same serialization directory. Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- serialization_dir : ``str`` The directory in which to save results and logs. training_config_filename : ``str`` The training configuration file from which the model and parameters are loaded. cuda_device : ``int``, optional (default=-1) The device on which to load the model. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ model, params, prev_optimizer_params, cur_optimizer_params = \ load_model_from_serialization_dir(serialization_dir, training_config_filename, cuda_device=cuda_device) prepare_environment(params) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) params.pop('model') iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) list_of_cur_optimizer_param_keys = list(cur_optimizer_params.as_flat_dict().keys()) list_of_prev_optimizer_param_keys = list(prev_optimizer_params.as_flat_dict().keys()) optimizer_params_match = set(list_of_cur_optimizer_param_keys) == set(list_of_prev_optimizer_param_keys) if not optimizer_params_match:
# a list of each p is what will be passed to the optimizer constructor while constructing Trainer --
# adjust if necessary (i.e., if we changed optimizers)
model_params = [[n, p] for n, p in model.named_parameters() if p.requires_grad] assert "parameter_groups" not in list_of_cur_optimizer_param_keys, \ "Current way of dealing with optimizer change doesn't take parameter groups into account" assert "parameter_groups" not in list_of_prev_optimizer_param_keys, \ "Current way of dealing with optimizer change doesn't take parameter groups into account" for param_tup in model_params:
# modify the second element of param_tup in-place (it's a dict) to match the keys specified in
# cur_optimizer_params
param_dict = param_tup[1] keys_to_del = [] keys_already_in_dict = [] try: for key in param_dict.keys(): if key not in list_of_cur_optimizer_param_keys: keys_to_del.append(key) else: keys_already_in_dict.append(key) for key in keys_to_del: del param_dict[key] for key_to_have in list_of_cur_optimizer_param_keys: if key_to_have != "type" and key_to_have not in keys_already_in_dict: param_dict[key_to_have] = cur_optimizer_params.get( key_to_have) except Exception: pass trainer = Trainer.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) return best_model
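# The "load best weights" pattern used above, isolated for reuse. A sketch:
# the best.th layout follows the serialization convention in this file, and
# map_location="cpu" is an added safety assumption, not from the source.
import os
import torch

def load_best_weights(model, serialization_dir):
    state = torch.load(os.path.join(serialization_dir, "best.th"),
                       map_location="cpu")
    model.load_state_dict(state)
    return model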
def train_model(params: Params, serialization_dir: str, cuda_device: int, train_data_path: str, validation_data_path: str, test_data_path: str, file_friendly_logging: bool = False) -> Model: """ This function can be used as an entry point to running models in AllenNLP directly from a JSON specification using a :class:`Driver`. Note that if you care about reproducibility, you should avoid running any code that uses PyTorch or numpy random number generation before you import and use this function, since these libraries rely on random seeds that are set here via the JSON specification file. Note that this function performs training and will also evaluate the trained model on development and test sets if provided in the parameter JSON. Parameters ---------- params: Params, required. A parameter object specifying an AllenNLP Experiment. serialization_dir: str, required The directory in which to save results and logs. """ prepare_environment(params) os.makedirs(serialization_dir, exist_ok=True) sys.stdout = TeeLogger( os.path.join(serialization_dir, "stdout.log"), # type: ignore
sys.stdout, file_friendly_logging) sys.stderr = TeeLogger( os.path.join(serialization_dir, "stderr.log"), # type: ignore
sys.stderr, file_friendly_logging) handler = logging.FileHandler( os.path.join(serialization_dir, "python_logging.log")) handler.setLevel(logging.INFO) handler.setFormatter( logging.Formatter( '%(asctime)s - %(levelname)s - %(name)s - %(message)s')) logging.getLogger().addHandler(handler) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file: json.dump(serialization_params, param_file, indent=4)
# all_datasets = datasets_from_params(params)
all_datasets = datasets_from_args(params, train_data_path, validation_data_path, test_data_path) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab, params.pop('model')) if cuda_device >= 0: model = model.cuda(cuda_device)
# iterator = DataIterator.from_params(params.pop("iterator"))
# iterator.index_with(vocab)
train_iterator = DataIterator.from_params(params.pop("train_iterator")) val_iterator = DataIterator.from_params(params.pop("val_iterator")) train_iterator.index_with(vocab) val_iterator.index_with(vocab) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, train_iterator, val_iterator, cuda_device, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False)
# params.assert_empty('base train command')
metrics = trainer.train()
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, val_iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
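# Quick post-hoc check of a finished run, assuming the metrics.json layout
# written above. The directory and metric key are illustrative.
import json
import os

with open(os.path.join("runs/exp1", "metrics.json")) as f:
    metrics = json.load(f)
print(metrics.get("test_accuracy"))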
def fine_tune_model(model: Model, params: Params, serialization_dir: str, file_friendly_logging: bool = False) -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- model : ``Model`` A model to fine tune, such as one loaded from a saved archive. params : ``Params`` A parameter object specifying an AllenNLP Experiment, including the training and validation data paths. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. """ prepare_environment(params) os.makedirs(serialization_dir) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive." ) if params.pop('vocabulary', None): logger.warning( "You passed parameters for the vocabulary in your configuration file, but " "we are ignoring them, using instead the vocabulary from the saved model." ) vocab = model.vocab vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) all_datasets = datasets_from_params(params) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access
for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
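# Hedged usage sketch for fine_tune_model: fine-tuning typically starts from
# a trained archive. Paths are placeholders; load_archive is AllenNLP's
# standard helper for unpacking a model.tar.gz produced by archive_model.
from allennlp.common.params import Params
from allennlp.models.archival import load_archive

archive = load_archive("runs/exp1/model.tar.gz")
params = Params.from_file("fine_tune.json")
model = fine_tune_model(archive.model, params, serialization_dir="runs/exp1_ft")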
def fine_tune_model(model: Model, params: Params, serialization_dir: str, extend_vocab: bool = False, file_friendly_logging: bool = False, batch_weight_key: str = "") -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- model : ``Model`` A model to fine tune, such as one loaded from a saved archive. params : ``Params`` A parameter object specifying an AllenNLP Experiment, including the training and validation data paths. serialization_dir : ``str`` The directory in which to save results and logs. extend_vocab: ``bool``, optional (default=False) If ``True``, we use the new instances to extend your vocabulary. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. batch_weight_key : ``str``, optional (default="") If non-empty, name of metric used to weight the loss on a per-batch basis. """ prepare_environment(params) if os.path.exists(serialization_dir) and os.listdir(serialization_dir): raise ConfigurationError( f"Serialization directory ({serialization_dir}) " f"already exists and is not empty.") os.makedirs(serialization_dir, exist_ok=True) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive." ) vocabulary_params = params.pop('vocabulary', {}) if vocabulary_params.get('directory_path', None): logger.warning( "You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored.") all_datasets = datasets_from_params(params) vocab = model.vocab if extend_vocab: datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab.extend_from_instances( vocabulary_params, (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_type = trainer_params.pop("type", "default") if trainer_type == "default": trainer = Trainer.from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator) else: raise ConfigurationError( "currently fine-tune only works with the default Trainer") evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Evaluate
if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access
batch_weight_key=batch_weight_key) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
def from_params( cls, params: Params, serialization_dir: str, recover: bool = False, model: Model = None, embedding_sources_mapping: Dict[str, str] = None, extend_vocab: bool = False, ) -> "MetaTrainerPieces": all_datasets = training_util.datasets_from_params(params) vocabulary_params = params.pop("vocabulary", {}) if model: if params.pop("model", None): logger.warning( "You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the loaded model parameters." ) # TODO(mattg): This should be updated now that directory_path no longer exists. if vocabulary_params.get("directory_path", None): logger.warning( "You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored because we already " "have a model with a vocabulary.") vocab = model.vocab else: vocab = None vocabulary_path = os.path.join(serialization_dir, "vocabulary") if not vocab or extend_vocab: vocab = MetaTrainerPieces.create_or_extend_vocab( datasets=all_datasets, params=params, recover=recover, vocab=vocab, vocabulary_params=vocabulary_params, vocabulary_path=vocabulary_path, ) if not model: model = Model.from_params(vocab=vocab, params=params.pop("model")) # If vocab extension is ON for training, embedding extension should also be # done. If vocab and embeddings are already in sync, it would be a no-op. model.extend_embedder_vocab(embedding_sources_mapping) # Initializing the model can have side effect of expanding the vocabulary # Save the vocab only in the master. In the degenerate non-distributed # case, we're trivially the master. if is_master(): vocab.save_to_files(vocabulary_path) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(model.vocab) else: validation_iterator = None train_datas = all_datasets["train"] validation_datas = all_datasets.get("validation") test_datas = all_datasets.get("test") trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) log_frozen_and_tunable_parameter_names(model) return cls( model=model, iterator=iterator, train_datasets=train_datas, validation_datasets=validation_datas, test_datasets=test_datas, validation_iterator=validation_iterator, params=trainer_params, )
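# What a "pieces" bundle like MetaTrainerPieces typically is: a plain named
# container the train command unpacks. This sketch mirrors the keyword
# arguments of the return call above; the actual class definition is not
# shown in the source, so field types are left loose.
from typing import Any, NamedTuple

class MetaTrainerPieces(NamedTuple):
    model: Any
    iterator: Any
    train_datasets: Any
    validation_datasets: Any
    test_datasets: Any
    validation_iterator: Any
    params: Any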
def from_partial_objects( cls, model: Model, serialization_dir: str, iterator: DataIterator, train_data: Iterable[Instance], validation_iterator: DataIterator = None, validation_data: Iterable[Instance] = None, local_rank: int = 0, patience: int = None, validation_metric: str = "-loss", shuffle: bool = True, num_epochs: int = 20, cuda_device: int = -1, grad_norm: float = None, grad_clipping: float = None, model_save_interval: float = None, summary_interval: int = 100, histogram_interval: int = None, should_log_parameter_statistics: bool = True, should_log_learning_rate: bool = False, log_batch_size_period: int = None, distributed: bool = None, world_size: int = 1, num_gradient_accumulation_steps: int = 1, no_grad: List[str] = None, optimizer: Lazy[Optimizer] = None, learning_rate_scheduler: Lazy[LearningRateScheduler] = None, momentum_scheduler: Lazy[MomentumScheduler] = None, moving_average: Lazy[MovingAverage] = None, checkpointer: Lazy[Checkpointer] = None, ) -> "Trainer": """ This method exists so that we can have a documented method to construct this class using `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this method. The reason we can't just use `__init__` with `FromParams` here is because there are sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to have the parameters from the `Model` before it's constructed, and the `Schedulers` need to have the `Optimizer`. Because of this, the typical way we construct things `FromParams` doesn't work, so we use `Lazy` to allow for constructing the objects sequentially. If you're not using `FromParams`, you can just construct these arguments in the right order yourself in your code and call the constructor directly. """ check_for_gpu(cuda_device) if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. 
model = model.cuda(cuda_device) if no_grad: for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad): parameter.requires_grad_(False) common_util.log_frozen_and_tunable_parameter_names(model) parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] optimizer_ = optimizer.construct(model_parameters=parameters) if not optimizer_: optimizer_ = Optimizer.default(parameters) batches_per_epoch = iterator.get_num_batches(train_data) if batches_per_epoch == 1: # get_num_batches returns 1 when it can't determine the answer batches_per_epoch = None moving_average_ = moving_average.construct(parameters=parameters) learning_rate_scheduler_ = learning_rate_scheduler.construct( optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch) momentum_scheduler_ = momentum_scheduler.construct( optimizer=optimizer_) checkpointer_ = checkpointer.construct() or Checkpointer( serialization_dir) return cls( model, optimizer_, iterator, train_data, validation_data, patience=patience, validation_metric=validation_metric, validation_iterator=validation_iterator, shuffle=shuffle, num_epochs=num_epochs, serialization_dir=serialization_dir, cuda_device=cuda_device, grad_norm=grad_norm, grad_clipping=grad_clipping, learning_rate_scheduler=learning_rate_scheduler_, momentum_scheduler=momentum_scheduler_, checkpointer=checkpointer_, model_save_interval=model_save_interval, summary_interval=summary_interval, histogram_interval=histogram_interval, should_log_parameter_statistics=should_log_parameter_statistics, should_log_learning_rate=should_log_learning_rate, log_batch_size_period=log_batch_size_period, moving_average=moving_average_, distributed=distributed, local_rank=local_rank, world_size=world_size, num_gradient_accumulation_steps=num_gradient_accumulation_steps, )
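# A stripped-down illustration of the Lazy pattern the docstring above
# describes: defer construction until late-bound arguments exist. This is
# illustrative, not AllenNLP's actual implementation.
from typing import Callable, Generic, TypeVar

T = TypeVar("T")

class Lazy(Generic[T]):
    def __init__(self, constructor: Callable[..., T]) -> None:
        self._constructor = constructor

    def construct(self, **kwargs) -> T:
        # Supply the arguments that only exist now, e.g. model parameters.
        return self._constructor(**kwargs)

# e.g. optimizer = Lazy(lambda model_parameters: SGD(model_parameters, lr=0.1))
#      optimizer_ = optimizer.construct(model_parameters=parameters)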
def train_model(params: Params, serialization_dir: str) -> Model: """ This function can be used as an entry point to running models in AllenNLP directly from a JSON specification using a :class:`Driver`. Note that if you care about reproducibility, you should avoid running any code that uses PyTorch or numpy random number generation before you import and use this function, since these libraries rely on random seeds that are set here via the JSON specification file. Note that this function performs training and will also evaluate the trained model on development and test sets if provided in the parameter JSON. Parameters ---------- params: Params, required. A parameter object specifying an AllenNLP Experiment. serialization_dir: str, required The directory in which to save results and logs. """ prepare_environment(params) os.makedirs(serialization_dir, exist_ok=True) sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout) # type: ignore
sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr) # type: ignore
handler = logging.FileHandler( os.path.join(serialization_dir, "python_logging.log")) handler.setLevel(logging.INFO) handler.setFormatter( logging.Formatter( '%(asctime)s - %(levelname)s - %(name)s - %(message)s')) logging.getLogger().addHandler(handler) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file: json.dump(serialization_params, param_file, indent=4)
# Now we begin assembling the required parts for the Trainer.
# 1. Primary training data.
dataset_reader = DatasetReader.from_params(params.pop('dataset_reader')) train_data_path = params.pop('train_data_path') logger.info("Reading training data from %s", train_data_path) train_data = dataset_reader.read(train_data_path)
# 2. Auxiliary training data.
dataset_reader_aux = DatasetReader.from_params( params.pop('dataset_reader_aux')) train_data_path_aux = params.pop('train_data_path_aux') logger.info("Reading auxiliary training data from %s", train_data_path_aux) train_data_aux = dataset_reader_aux.read(train_data_path_aux)
# If only using a fraction of the auxiliary data.
aux_sample_fraction = params.pop("aux_sample_fraction", 1.0) if aux_sample_fraction < 1.0: sample_size = int(aux_sample_fraction * len(train_data_aux.instances)) train_data_aux = Dataset( random.sample(train_data_aux.instances, sample_size))
# Balance the two datasets by inflating the size of the smaller dataset to the size of the larger dataset.
train_size = len(train_data.instances) aux_train_size = len(train_data_aux.instances) mixing_ratio = params.pop("mixing_ratio")
# mixing_ratio = float(train_size)/aux_train_size
if train_size > aux_train_size: # case for PB scaffold.
difference = train_size - aux_train_size aux_sample = [ random.choice(train_data_aux.instances) for _ in range(difference) ] train_data_aux = Dataset(train_data_aux.instances + aux_sample) logger.info( "Inflating auxiliary train data from {} to {} samples".format( aux_train_size, len(train_data_aux.instances)))
# else: # case for FN scaffold.
# difference = aux_train_size - train_size
# train_sample = [random.choice(train_data.instances) for _ in range(difference)]
# train_data = Dataset(train_data.instances + train_sample)
# logger.info("Inflating train data from {} to {} samples".format(train_size, len(train_data.instances)))
all_datasets: Dict[str, Dataset] = {"train": train_data} all_datasets_aux: Dict[str, Dataset] = {"train_aux": train_data_aux}
# 3. Primary validation data.
validation_data_path = params.pop('validation_data_path', None) if validation_data_path is not None: logger.info("Reading validation data from %s", validation_data_path) validation_data = dataset_reader.read(validation_data_path) all_datasets["validation"] = validation_data else: validation_data = None
# 4. Auxiliary validation data.
validation_data_path_aux = params.pop('validation_data_path_aux', None) if validation_data_path_aux is not None: logger.info("Reading auxiliary validation data from %s", validation_data_path_aux) validation_data_aux = dataset_reader_aux.read(validation_data_path_aux) all_datasets_aux["validation_aux"] = validation_data_aux else: validation_data_aux = None
# 5. Primary test data
test_data_path = params.pop("test_data_path", None) if test_data_path is not None: logger.info("Reading test data from %s", test_data_path) test_data = dataset_reader.read(test_data_path) all_datasets["test"] = test_data else: test_data = None
# 6. Auxiliary test data
test_data_path_aux = params.pop("test_data_path_aux", None) if test_data_path_aux is not None: logger.info("Reading auxiliary test data from %s", test_data_path_aux) test_data_aux = dataset_reader_aux.read(test_data_path_aux) all_datasets_aux["test_aux"] = test_data_aux else: test_data_aux = None datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) datasets_for_vocab_creation_aux = set( params.pop("auxillary_datasets_for_vocab_creation", all_datasets_aux)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "Creating a vocabulary using %s data. Auxiliary also included.", ", ".join(datasets_for_vocab_creation)) dataset_primary = Dataset([ instance for key, dataset in all_datasets.items() for instance in dataset.instances if key in datasets_for_vocab_creation ]) dataset_aux = Dataset([ instance for key, dataset in all_datasets_aux.items() for instance in dataset.instances if key in datasets_for_vocab_creation_aux ]) vocab = Vocabulary.from_params(params.pop("vocabulary", {}), dataset_primary, dataset_aux=dataset_aux) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab, params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator_aux = DataIterator.from_params(params.pop("iterator_aux")) train_data.index_instances(vocab) train_data_aux.index_instances(vocab) if validation_data: validation_data.index_instances(vocab) if validation_data_aux: validation_data_aux.index_instances(vocab) cutoff_epoch = params.pop("cutoff_epoch", -1) trainer_params = params.pop("trainer") trainer = MultiTaskTrainer.from_params( model=model, serialization_dir=serialization_dir, iterator=iterator, iterator_aux=iterator_aux, train_dataset=train_data, train_dataset_aux=train_data_aux, mixing_ratio=mixing_ratio, cutoff_epoch=cutoff_epoch, validation_dataset=validation_data, validation_dataset_aux=validation_data_aux, params=trainer_params, files_to_archive=params.files_to_archive) evaluate_on_test = params.pop("evaluate_on_test", False) params.assert_empty('base train command') trainer.train()
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_data.index_instances(vocab) evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device) # pylint: disable=protected-access
elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") if test_data_aux and evaluate_on_test: test_data_aux.index_instances(vocab) evaluate(model, test_data_aux, iterator_aux, cuda_device=trainer._cuda_device) # pylint: disable=protected-access
elif test_data_aux: logger.info( "To evaluate on the auxiliary test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") return model
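# The dataset-balancing trick above, isolated: oversample the smaller dataset
# with replacement until it matches the larger one. Sketch with plain lists;
# the source operates on Dataset.instances.
import random
from typing import List

def inflate(instances: List, target_size: int) -> List:
    extra = [random.choice(instances) for _ in range(target_size - len(instances))]
    return instances + extra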
def test_registry_has_builtin_iterators(self): assert DataIterator.by_name(u'basic').__name__ == u'BasicIterator' assert DataIterator.by_name(u'bucket').__name__ == u'BucketIterator'
def test_registry_has_builtin_iterators(self): assert DataIterator.by_name("basic").__name__ == "BasicIterator" assert DataIterator.by_name("bucket").__name__ == "BucketIterator"
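# How an iterator ends up in this registry (sketch): Registrable subclasses
# register themselves under a name, which is what by_name() resolves. The
# class and registration name below are illustrative, not built-ins.
from allennlp.data.iterators import DataIterator

@DataIterator.register("my_iterator")
class MyIterator(DataIterator):
    ...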
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.params.get('trainer').get('cuda_device', -1)) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab=vocab, params=params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise
# Now tar up results
archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return best_model
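# Recovery usage sketch for the train_model above: point at the existing
# serialization directory and its saved config (paths illustrative).
from allennlp.common.params import Params

params = Params.from_file("runs/exp1/config.json")
model = train_model(params, serialization_dir="runs/exp1", recover=True)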
def from_params( cls, params: Params, serialization_dir: str, recover: bool = False, cache_directory: str = None, cache_prefix: str = None, ) -> "TrainerPieces": all_datasets = training_util.meta_dataset_from_params(params, cache_directory, cache_prefix) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation), ) if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")): vocab_params = params.pop("vocabulary", {}) vocab = Vocabulary.from_files( os.path.join(serialization_dir, "vocabulary"), vocab_params.get("padding_token", None), vocab_params.get("oov_token", None), ) else: instance_train = ( instance for key, dataset in all_datasets.items() if key == 'train' for subdata in dataset for instance in subdata ) instance_valid_test = ( instance for key, dataset in all_datasets.items() if key != 'train' for instance in dataset ) instances = chain(instance_train, instance_valid_test) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), # Using a generator comprehension here is important # because, being lazy, it allows us to not iterate over the # dataset when directory_path is specified. # ( # instance # for key, dataset in all_datasets.items() # if (key in datasets_for_vocab_creation) # for instance in dataset # ), instances ) model = Model.from_params(vocab=vocab, params=params.pop("model")) # If vocab extension is ON for training, embedding extension should also be # done. If vocab and embeddings are already in sync, it would be a no-op. model.extend_embedder_vocab() # Initializing the model can have side effect of expanding the vocabulary # Save the vocab only in the master if not is_distributed() or is_master(): vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) # print('[info] iterator in meta_pieces is:{}'.format(params.pop("iterator"))) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(model.vocab) else: validation_iterator = None train_data = all_datasets["train"] validation_data = all_datasets.get("validation") test_data = all_datasets.get("test") trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names( model ) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) return cls( model, iterator, train_data, validation_data, test_data, validation_iterator, trainer_params, )
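# The instance-chaining above, isolated: under this meta-dataset layout the
# 'train' entry is a list of sub-datasets while the other splits are flat,
# so the two generators are chained lazily. Toy data for illustration.
from itertools import chain

all_datasets = {"train": [["a", "b"], ["c"]], "validation": ["d"]}
train = (x for key, ds in all_datasets.items() if key == "train"
         for sub in ds for x in sub)
rest = (x for key, ds in all_datasets.items() if key != "train" for x in ds)
print(list(chain(train, rest)))  # ['a', 'b', 'c', 'd']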