def test_known_configs(self):
    configs = os.listdir(self.PROJECT_ROOT / "training_config")

    # Our configs use environment variable substitution, and the _jsonnet parser
    # will fail if we don't pass it correct environment variables.
    forced_variables = [
            # constituency parser
            'PTB_TRAIN_PATH', 'PTB_DEV_PATH', 'PTB_TEST_PATH',

            # srl_elmo_5.5B
            'SRL_TRAIN_DATA_PATH', 'SRL_VALIDATION_DATA_PATH',

            # coref
            'COREF_TRAIN_DATA_PATH', 'COREF_DEV_DATA_PATH', 'COREF_TEST_DATA_PATH',

            # ner
            'NER_TRAIN_DATA_PATH', 'NER_TEST_A_PATH', 'NER_TEST_B_PATH'
    ]

    for var in forced_variables:
        os.environ[var] = os.environ.get(var) or str(self.TEST_DIR)

    for config in configs:
        try:
            Params.from_file(self.PROJECT_ROOT / "training_config" / config)
        except Exception as e:
            raise AssertionError(f"unable to load params for {config}, because {e}")

    for var in forced_variables:
        if os.environ[var] == str(self.TEST_DIR):
            del os.environ[var]
def test_fine_tune_nograd_regex(self):
    original_model = load_archive(self.model_archive).model
    name_parameters_original = dict(original_model.named_parameters())
    regex_lists = [[],
                   [".*attend_feedforward.*", ".*token_embedder.*"],
                   [".*compare_feedforward.*"]]
    for regex_list in regex_lists:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = regex_list
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = fine_tune_model(model=original_model,
                                      params=params,
                                      serialization_dir=self.serialization_dir)
        # If a parameter name matches one of the regexes, it should have
        # requires_grad set to False; otherwise it should keep the same
        # requires_grad as in the originally loaded model.
        for name, parameter in tuned_model.named_parameters():
            if any(re.search(regex, name) for regex in regex_list):
                assert not parameter.requires_grad
            else:
                assert parameter.requires_grad == name_parameters_original[name].requires_grad
    # If all parameters have requires_grad=False, fine-tuning should raise an error.
    with pytest.raises(Exception):
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = ["*"]
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = fine_tune_model(model=original_model,
                                      params=params,
                                      serialization_dir=self.serialization_dir)
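# The freezing behaviour verified above, written out as a standalone sketch.
# This is an illustration of the technique, not the library's implementation:
# any parameter whose name matches one of the ``no_grad`` regexes has its
# gradient turned off before fine-tuning begins.
import re

def freeze_by_regex(model, no_grad_regexes):
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad = False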
def test_regexes_with_backslashes(self):
    bad_regex = self.TEST_DIR / 'bad_regex.jsonnet'
    good_regex = self.TEST_DIR / 'good_regex.jsonnet'

    with open(bad_regex, 'w') as f:
        f.write(r'{"myRegex": "a\.b"}')

    with open(good_regex, 'w') as f:
        f.write(r'{"myRegex": "a\\.b"}')

    with pytest.raises(RuntimeError):
        Params.from_file(bad_regex)

    params = Params.from_file(good_regex)
    regex = params['myRegex']

    assert re.match(regex, "a.b")
    assert not re.match(regex, "a-b")

    # Check roundtripping
    good_regex2 = self.TEST_DIR / 'good_regex2.jsonnet'
    with open(good_regex2, 'w') as f:
        f.write(json.dumps(params.as_dict()))
    params2 = Params.from_file(good_regex2)

    assert params.as_dict() == params2.as_dict()
def test_mismatching_dimensions_throws_configuration_error(self):
    params = Params.from_file(self.param_file)
    # Make the phrase layer wrong - it should be 150 to match
    # the embedding + binary feature dimensions.
    params["model"]["encoder"]["input_size"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))
def test_elmo_num_repr_set_flags_mismatch_throws_configuration_error(self):
    # pylint: disable=line-too-long
    params = Params.from_file(self.FIXTURES_ROOT / 'biattentive_classification_network' / 'elmo_experiment.json')
    # Elmo is specified in the model with num_output_representations=2, but
    # only one flag is set to true.
    tmp_params = deepcopy(params)
    tmp_params["model"]["use_input_elmo"] = False
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

    tmp_params = deepcopy(params)
    tmp_params["model"]["use_input_elmo"] = True
    tmp_params["model"]["use_integrator_output_elmo"] = False
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

    # Set num_output_representations to 1, and set both flags to True.
    tmp_params = deepcopy(params)
    tmp_params["model"]["elmo"]["num_output_representations"] = 1
    tmp_params["model"]["use_input_elmo"] = True
    tmp_params["model"]["use_integrator_output_elmo"] = True
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))
def test_mismatching_dimensions_throws_configuration_error(self):
    params = Params.from_file(self.param_file)
    # Make the encoder wrong - it should be 2 to match
    # the embedding dimension from the text_field_embedder.
    params["model"]["encoder"]["input_size"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.pop("model"))
def test_load_from_file(self):
    filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
    params = Params.from_file(filename)

    assert "dataset_reader" in params
    assert "trainer" in params

    model_params = params.pop("model")
    assert model_params.pop("type") == "bidaf"
def test_fine_tune_does_not_expand_vocab_by_default(self):
    params = Params.from_file(self.config_file)
    # snli2 has a new token in it
    params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

    model = load_archive(self.model_archive).model

    # By default, no vocab expansion.
    fine_tune_model(model, params, self.serialization_dir)
def test_elmo_but_no_set_flags_throws_configuration_error(self):
    # pylint: disable=line-too-long
    params = Params.from_file(self.FIXTURES_ROOT / 'biattentive_classification_network' / 'elmo_experiment.json')
    # Elmo is specified in the model, but both flags are set to false.
    params["model"]["use_input_elmo"] = False
    params["model"]["use_integrator_output_elmo"] = False
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=params.get("model"))
def test_env_var_substitution(self):
    substitutor = self.TEST_DIR / 'substitutor.jsonnet'
    key = 'TEST_ENV_VAR_SUBSTITUTION'

    assert os.environ.get(key) is None

    with open(substitutor, 'w') as f:
        f.write(f'{{"path": std.extVar("{key}")}}')

    # raises without the environment variable set
    with pytest.raises(RuntimeError):
        Params.from_file(substitutor)

    os.environ[key] = "PERFECT"

    params = Params.from_file(substitutor)
    assert params['path'] == "PERFECT"

    del os.environ[key]
def setUp(self):
    super().setUp()
    param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
    self.set_up_model(param_file,
                      self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    params = Params.from_file(param_file)
    self.reader = DatasetReader.from_params(params['dataset_reader'])
    self.iterator = DataIterator.from_params(params['iterator'])
    self.trainer = Trainer.from_params(self.model,
                                       self.TEST_DIR,
                                       self.iterator,
                                       self.dataset,
                                       None,
                                       params.get('trainer'))
def make_vocab_from_args(args: argparse.Namespace):
    """
    Just converts from an ``argparse.Namespace`` object to params.
    """
    parameter_path = args.param_path
    overrides = args.overrides
    serialization_dir = args.serialization_dir

    params = Params.from_file(parameter_path, overrides)

    make_vocab_from_params(params, serialization_dir)
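# A minimal sketch of driving ``make_vocab_from_args`` directly. In the real
# CLI the Namespace comes from an argparse subcommand parser; the attribute
# names below are exactly the three the function reads, but the paths are
# hypothetical.
import argparse

args = argparse.Namespace(param_path='experiment.jsonnet',
                          overrides='',
                          serialization_dir='/tmp/vocab_output')
make_vocab_from_args(args)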
def test_fine_tune_runtime_errors_with_vocab_expansion(self):
    params = Params.from_file(self.config_file)
    params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')

    model = load_archive(self.model_archive).model

    # If we do vocab expansion, we get a runtime error because of the embedding.
    with pytest.raises(RuntimeError):
        fine_tune_model(model, params, self.serialization_dir, extend_vocab=True)
def test_overrides(self):
    filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
    overrides = ('{ "train_data_path": "FOO", "model": { "type": "BAR" },'
                 '"model.text_field_embedder.tokens.type": "BAZ" }')
    params = Params.from_file(filename, overrides)

    assert "dataset_reader" in params
    assert "trainer" in params
    assert params["train_data_path"] == "FOO"

    model_params = params.pop("model")
    assert model_params.pop("type") == "BAR"
    assert model_params["text_field_embedder"]["tokens"]["type"] == "BAZ"
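# A short illustration of the two override spellings exercised above, using
# only ``Params.from_file`` with a throwaway config: nested JSON and
# flattened dotted keys may be mixed, and override values take precedence
# over the file's values. The keys here are illustrative.
import json
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump({"trainer": {"num_epochs": 40}, "train_data_path": "original"}, f)
    config_path = f.name

params = Params.from_file(config_path,
                          '{"trainer.num_epochs": 2, "train_data_path": "patched"}')
assert params["train_data_path"] == "patched"
assert params["trainer"]["num_epochs"] == 2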
def test_jsonnet_features(self):
    config_file = self.TEST_DIR / 'config.jsonnet'
    with open(config_file, 'w') as f:
        f.write("""{
            // This example is copied straight from the jsonnet docs
            person1: {
                name: "Alice",
                welcome: "Hello " + self.name + "!",
            },
            person2: self.person1 { name: "Bob" },
        }""")

    params = Params.from_file(config_file)

    alice = params.pop("person1")
    bob = params.pop("person2")

    assert alice.as_dict() == {"name": "Alice", "welcome": "Hello Alice!"}
    assert bob.as_dict() == {"name": "Bob", "welcome": "Hello Bob!"}

    params.assert_empty("TestParams")
def test_no_elmo_but_set_flags_throws_configuration_error(self):
    params = Params.from_file(self.param_file)
    # There is no elmo specified in self.param_file, but we set
    # use_input_elmo and use_integrator_output_elmo to True anyway.

    # use_input_elmo set to True
    tmp_params = deepcopy(params)
    tmp_params["model"]["use_input_elmo"] = True
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

    # use_integrator_output_elmo set to True
    tmp_params = deepcopy(params)
    tmp_params["model"]["use_input_elmo"] = False
    tmp_params["model"]["use_integrator_output_elmo"] = True
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

    # both use_input_elmo and use_integrator_output_elmo set to True
    tmp_params = deepcopy(params)
    tmp_params["model"]["use_input_elmo"] = True
    tmp_params["model"]["use_integrator_output_elmo"] = True
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file : ``str``
        The archive file to load the model from.
    weights_file : ``str``, optional (default = None)
        The weights file to use. If unspecified, weights.th in the archive_file
        will be used.
    cuda_device : ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides : ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    tempdir = None
    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)
        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    if tempdir:
        # Clean up temp dir
        shutil.rmtree(tempdir)

    return Archive(model=model, config=config)
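# A minimal usage sketch for ``load_archive``. The archive path and the
# override key are hypothetical; the overrides string patches the unarchived
# config the same way ``Params.from_file`` overrides do, before the model is
# instantiated.
archive = load_archive('/path/to/model.tar.gz',
                       cuda_device=-1,
                       overrides='{"model.text_field_embedder.tokens.type": "embedding"}')
model = archive.model    # the instantiated Model, with weights loaded
config = archive.config  # the (possibly overridden) Params the model was built from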