def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder': # type: ignore # pylint: disable=arguments-differ params.add_file_to_archive('options_file') params.add_file_to_archive('weight_file') options_file = params.pop('options_file') weight_file = params.pop('weight_file') requires_grad = params.pop('requires_grad', False) do_layer_norm = params.pop_bool('do_layer_norm', False) dropout = params.pop_float("dropout", 0.5) namespace_to_cache = params.pop("namespace_to_cache", None) if namespace_to_cache is not None: vocab_to_cache = list( vocab.get_token_to_index_vocabulary(namespace_to_cache).keys()) else: vocab_to_cache = None projection_dim = params.pop_int("projection_dim", None) params.assert_empty(cls.__name__) return cls(options_file=options_file, weight_file=weight_file, do_layer_norm=do_layer_norm, dropout=dropout, requires_grad=requires_grad, projection_dim=projection_dim, vocab_to_cache=vocab_to_cache)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder': # type: ignore # pylint: disable=arguments-differ embedding_params: Params = params.pop("embedding") # Embedding.from_params() uses "tokens" as the default namespace, but we need to change # that to be "token_characters" by default. embedding_params.setdefault("vocab_namespace", "token_characters") embedding = Embedding.from_params(vocab, embedding_params) encoder_params: Params = params.pop("encoder") encoder = Seq2VecEncoder.from_params(encoder_params) dropout = params.pop_float("dropout", 0.0) params.assert_empty(cls.__name__) return cls(embedding, encoder, dropout)
def from_params(self, params: Params) -> PytorchSeq2VecWrapper: if not params.pop('batch_first', True): raise ConfigurationError("Our encoder semantics assumes batch is always first!") if self._module_class in self.PYTORCH_MODELS: params['batch_first'] = True module = self._module_class(**params.as_dict()) return PytorchSeq2VecWrapper(module)
def from_params(cls, params: Params): input_dim = params.pop_int('input_dim') num_layers = params.pop_int('num_layers') hidden_dims = params.pop('hidden_dims') activations = params.pop('activations') dropout = params.pop('dropout', 0.0) if isinstance(activations, list): activations = [Activation.by_name(name)() for name in activations] else: activations = Activation.by_name(activations)() params.assert_empty(cls.__name__) return cls(input_dim=input_dim, num_layers=num_layers, hidden_dims=hidden_dims, activations=activations, dropout=dropout)
def from_params( cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder': # type: ignore # pylint: disable=arguments-differ,bad-super-call # The original `from_params` for this class was designed in a way that didn't agree # with the constructor. The constructor wants a 'token_embedders' parameter that is a # `Dict[str, TokenEmbedder]`, but the original `from_params` implementation expected those # key-value pairs to be top-level in the params object. # # This breaks our 'configuration wizard' and configuration checks. Hence, going forward, # the params need a 'token_embedders' key so that they line up with what the constructor wants. # For now, the old behavior is still supported, but produces a DeprecationWarning. embedder_to_indexer_map = params.pop("embedder_to_indexer_map", None) if embedder_to_indexer_map is not None: embedder_to_indexer_map = embedder_to_indexer_map.as_dict( quiet=True) allow_unmatched_keys = params.pop_bool("allow_unmatched_keys", False) token_embedder_params = params.pop('token_embedders', None) if token_embedder_params is not None: # New way: explicitly specified, so use it. token_embedders = { name: TokenEmbedder.from_params(subparams, vocab=vocab) for name, subparams in token_embedder_params.items() } else: # Warn that the original behavior is deprecated warnings.warn( DeprecationWarning( "the token embedders for BasicTextFieldEmbedder should now " "be specified as a dict under the 'token_embedders' key, " "not as top-level key-value pairs")) token_embedders = {} keys = list(params.keys()) for key in keys: embedder_params = params.pop(key) token_embedders[key] = TokenEmbedder.from_params( vocab=vocab, params=embedder_params) params.assert_empty(cls.__name__) return cls(token_embedders, embedder_to_indexer_map, allow_unmatched_keys)
def from_params(cls, params: Params) -> 'Elmo': # Add files to archive params.add_file_to_archive('options_file') params.add_file_to_archive('weight_file') options_file = params.pop('options_file') weight_file = params.pop('weight_file') requires_grad = params.pop('requires_grad', False) num_output_representations = params.pop('num_output_representations') do_layer_norm = params.pop_bool('do_layer_norm', False) dropout = params.pop_float('dropout', 0.5) params.assert_empty(cls.__name__) return cls(options_file=options_file, weight_file=weight_file, num_output_representations=num_output_representations, requires_grad=requires_grad, do_layer_norm=do_layer_norm, dropout=dropout)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BidafEnsemble': # type: ignore # pylint: disable=arguments-differ if vocab: raise ConfigurationError("vocab should be None") submodels = [] paths = params.pop("submodels") for path in paths: submodels.append(load_archive(path).model) return cls(submodels=submodels)
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]: """ Load all the datasets specified by the config. """ dataset_reader = DatasetReader.from_params(params.pop('dataset_reader')) validation_dataset_reader_params = params.pop("validation_dataset_reader", None) validation_and_test_dataset_reader: DatasetReader = dataset_reader if validation_dataset_reader_params is not None: logger.info( "Using a separate dataset reader to load validation and test data." ) validation_and_test_dataset_reader = DatasetReader.from_params( validation_dataset_reader_params) train_data_path = params.pop('train_data_path') logger.info("Reading training data from %s", train_data_path) train_data = dataset_reader.read(train_data_path) datasets: Dict[str, Iterable[Instance]] = {"train": train_data} validation_data_path = params.pop('validation_data_path', None) if validation_data_path is not None: logger.info("Reading validation data from %s", validation_data_path) validation_data = validation_and_test_dataset_reader.read( validation_data_path) datasets["validation"] = validation_data test_data_path = params.pop("test_data_path", None) if test_data_path is not None: logger.info("Reading test data from %s", test_data_path) test_data = validation_and_test_dataset_reader.read(test_data_path) datasets["test"] = test_data return datasets
def from_params(cls, vocab: Vocabulary, params: Params) -> 'BiattentiveClassificationNetwork': # type: ignore # pylint: disable=arguments-differ embedder_params = params.pop("text_field_embedder") text_field_embedder = TextFieldEmbedder.from_params(vocab=vocab, params=embedder_params) embedding_dropout = params.pop("embedding_dropout") pre_encode_feedforward = FeedForward.from_params(params.pop("pre_encode_feedforward")) encoder = Seq2SeqEncoder.from_params(params.pop("encoder")) integrator = Seq2SeqEncoder.from_params(params.pop("integrator")) integrator_dropout = params.pop("integrator_dropout") output_layer_params = params.pop("output_layer") if "activations" in output_layer_params: output_layer = FeedForward.from_params(output_layer_params) else: output_layer = Maxout.from_params(output_layer_params) elmo = params.pop("elmo", None) if elmo is not None: elmo = Elmo.from_params(elmo) use_input_elmo = params.pop_bool("use_input_elmo", False) use_integrator_output_elmo = params.pop_bool("use_integrator_output_elmo", False) initializer = InitializerApplicator.from_params(params.pop('initializer', [])) regularizer = RegularizerApplicator.from_params(params.pop('regularizer', [])) params.assert_empty(cls.__name__) return cls(vocab=vocab, text_field_embedder=text_field_embedder, embedding_dropout=embedding_dropout, pre_encode_feedforward=pre_encode_feedforward, encoder=encoder, integrator=integrator, integrator_dropout=integrator_dropout, output_layer=output_layer, elmo=elmo, use_input_elmo=use_input_elmo, use_integrator_output_elmo=use_integrator_output_elmo, initializer=initializer, regularizer=regularizer)
def from_params(cls, model_parameters: List, params: Params): # type: ignore # pylint: disable=arguments-differ if isinstance(params, str): optimizer = params params = Params({}) else: optimizer = params.pop_choice("type", Optimizer.list_available()) # make the parameter groups if need groups = params.pop("parameter_groups", None) if groups: # The input to the optimizer is list of dict. # Each dict contains a "parameter group" and groups specific options, # e.g., {'params': [list of parameters], 'lr': 1e-3, ...} # Any config option not specified in the additional options (e.g. # for the default group) is inherited from the top level config. # see: http://pytorch.org/docs/0.3.0/optim.html?#per-parameter-options # # groups contains something like: #"parameter_groups": [ # [["regex1", "regex2"], {"lr": 1e-3}, # ["regex3"], {"lr": 1e-4}] #] #(note that the allennlp config files require double quotes ", and will # fail (sometimes silently) with single quotes '). # This is typed as as Any since the dict values other then # the params key are passed to the Optimizer constructor and # can be any type it accepts. # In addition to any parameters that match group specific regex, # we also need a group for the remaining "default" group. # Those will be included in the last entry of parameter_groups. parameter_groups: Any = [{'params': []} for _ in range(len(groups) + 1)] # add the group specific kwargs for k in range(len(groups)): # pylint: disable=consider-using-enumerate parameter_groups[k].update(groups[k][1].as_dict()) regex_use_counts: Dict[str, int] = {} parameter_group_names: List[set] = [set() for _ in range(len(groups) + 1)] for name, param in model_parameters: # Determine the group for this parameter. group_index = None for k, group_regexes in enumerate(groups): for regex in group_regexes[0]: if regex not in regex_use_counts: regex_use_counts[regex] = 0 if re.search(regex, name): if group_index is not None and group_index != k: raise ValueError("{} was specified in two separate parameter groups".format(name)) group_index = k regex_use_counts[regex] += 1 if group_index is not None: parameter_groups[group_index]['params'].append(param) parameter_group_names[group_index].add(name) else: # the default group parameter_groups[-1]['params'].append(param) parameter_group_names[-1].add(name) # log the parameter groups logger.info("Done constructing parameter groups.") for k in range(len(groups) + 1): group_options = {key: val for key, val in parameter_groups[k].items() if key != 'params'} logger.info("Group %s: %s, %s", k, list(parameter_group_names[k]), group_options) # check for unused regex for regex, count in regex_use_counts.items(): if count == 0: logger.warning("When constructing parameter groups, " " %s not match any parameter name", regex) else: parameter_groups = [param for name, param in model_parameters] # Log the number of parameters to optimize num_parameters = 0 for parameter_group in parameter_groups: if isinstance(parameter_group, dict): num_parameters += sum(parameter.numel() for parameter in parameter_group["params"]) else: num_parameters += parameter_group.numel() logger.info("Number of trainable parameters: %s", num_parameters) return Optimizer.by_name(optimizer)(parameter_groups, **params.as_dict()) # type: ignore
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) check_for_gpu(params.get('trainer').get('cuda_device', -1)) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError( f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( "From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) model = Model.from_params(vocab=vocab, params=params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params( validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params, validation_iterator=validation_iterator) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return best_model
def fine_tune_model(model: Model, params: Params, serialization_dir: str, extend_vocab: bool = False, file_friendly_logging: bool = False) -> Model: """ Fine tunes the given model, using a set of parameters that is largely identical to those used for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored, if it is present (as we are already given a ``Model`` here). The main difference between the logic done here and the logic done in ``train_model`` is that here we do not worry about vocabulary construction or creating the model object. Everything else is the same. Parameters ---------- archive : ``Archive`` A saved model archive that is the result of running the ``train`` command. train_data_path : ``str`` Path to the training data to use for fine-tuning. serialization_dir : ``str`` The directory in which to save results and logs. validation_data_path : ``str``, optional Path to the validation data to use while fine-tuning. extend_vocab: ``bool``, optional (default=False) If ``True``, we use the new instances to extend your vocabulary. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. """ prepare_environment(params) if os.path.exists(serialization_dir) and os.listdir(serialization_dir): raise ConfigurationError(f"Serialization directory ({serialization_dir}) " f"already exists and is not empty.") os.makedirs(serialization_dir, exist_ok=True) prepare_global_logging(serialization_dir, file_friendly_logging) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: json.dump(serialization_params, param_file, indent=4) if params.pop('model', None): logger.warning("You passed parameters for the model in your configuration file, but we " "are ignoring them, using instead the model parameters in the archive.") vocabulary_params = params.pop('vocabulary', {}) if vocabulary_params.get('directory_path', None): logger.warning("You passed `directory_path` in parameters for the vocabulary in " "your configuration file, but it will be ignored. ") all_datasets = datasets_from_params(params) vocab = model.vocab if extend_vocab: datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation)) vocab.extend_from_instances(vocabulary_params, (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation)) vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(model.vocab) train_data = all_datasets['train'] validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, validation_data, trainer_params) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Fine-tuning interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) if test_data and evaluate_on_test: test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0]) # pylint: disable=protected-access for key, value in test_metrics.items(): metrics["test_" + key] = value elif test_data: logger.info("To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") metrics_json = json.dumps(metrics, indent=2) with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file: metrics_file.write(metrics_json) logger.info("Metrics: %s", metrics_json) return model
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding': # type: ignore """ We need the vocabulary here to know how many items we need to embed, and we look for a ``vocab_namespace`` key in the parameter dictionary to know which vocabulary to use. If you know beforehand exactly how many embeddings you need, or aren't using a vocabulary mapping for the things getting embedded here, then you can pass in the ``num_embeddings`` key directly, and the vocabulary will be ignored. In the configuration file, a file containing pretrained embeddings can be specified using the parameter ``"pretrained_file"``. It can be the path to a local file or an URL of a (cached) remote file. Two formats are supported: * hdf5 file - containing an embedding matrix in the form of a torch.Tensor; * text file - an utf-8 encoded text file with space separated fields:: [word] [dim 1] [dim 2] ... The text file can eventually be compressed with gzip, bz2, lzma or zip. You can even select a single file inside an archive containing multiple files using the URI:: "(archive_uri)#file_path_inside_the_archive" where ``archive_uri`` can be a file system path or a URL. For example:: "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt" """ # pylint: disable=arguments-differ num_embeddings = params.pop_int('num_embeddings', None) vocab_namespace = params.pop("vocab_namespace", "tokens") if num_embeddings is None: num_embeddings = vocab.get_vocab_size(vocab_namespace) embedding_dim = params.pop_int('embedding_dim') pretrained_file = params.pop("pretrained_file", None) projection_dim = params.pop_int("projection_dim", None) trainable = params.pop_bool("trainable", True) padding_index = params.pop_int('padding_index', None) max_norm = params.pop_float('max_norm', None) norm_type = params.pop_float('norm_type', 2.) scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False) sparse = params.pop_bool('sparse', False) params.assert_empty(cls.__name__) if pretrained_file: # If we're loading a saved model, we don't want to actually read a pre-trained # embedding file - the embeddings will just be in our saved weights, and we might not # have the original embedding file anymore, anyway. weight = _read_pretrained_embeddings_file(pretrained_file, embedding_dim, vocab, vocab_namespace) else: weight = None return cls(num_embeddings=num_embeddings, embedding_dim=embedding_dim, projection_dim=projection_dim, weight=weight, padding_index=padding_index, trainable=trainable, max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq, sparse=sparse)
def from_params(cls, params: Params, size: int) -> 'C': # type: ignore name = params.pop('name') return cls(size=size, name=name)