Example #1
    def test_known_configs(self):
        configs = os.listdir(self.PROJECT_ROOT / "training_config")

        # Our configs use environment variable substitution, and the _jsonnet parser
        # will fail if we don't pass it correct environment variables.
        forced_variables = [
            # constituency parser
            'PTB_TRAIN_PATH', 'PTB_DEV_PATH', 'PTB_TEST_PATH',

            # srl_elmo_5.5B
            'SRL_TRAIN_DATA_PATH', 'SRL_VALIDATION_DATA_PATH',

            # coref
            'COREF_TRAIN_DATA_PATH', 'COREF_DEV_DATA_PATH', 'COREF_TEST_DATA_PATH',

            # ner
            'NER_TRAIN_DATA_PATH', 'NER_TEST_A_PATH', 'NER_TEST_B_PATH'
        ]

        for var in forced_variables:
            os.environ[var] = os.environ.get(var) or str(self.TEST_DIR)

        for config in configs:
            try:
                Params.from_file(self.PROJECT_ROOT / "training_config" / config)
            except Exception as e:
                raise AssertionError(f"unable to load params for {config}, because {e}")

        for var in forced_variables:
            if os.environ[var] == str(self.TEST_DIR):
                del os.environ[var]
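
The set-then-conditionally-delete pattern above can leak state if an assertion fails midway. A minimal alternative sketch (a hypothetical forced_env helper, not part of the example above) that scopes the same behavior in a context manager and always restores the prior environment:

import contextlib
import os

@contextlib.contextmanager
def forced_env(variables, default):
    # Temporarily ensure each variable is set; restore the prior state on exit.
    previous = {var: os.environ.get(var) for var in variables}
    for var in variables:
        os.environ.setdefault(var, default)
    try:
        yield
    finally:
        for var, old in previous.items():
            if old is None:
                os.environ.pop(var, None)
            else:
                os.environ[var] = old

Used as "with forced_env(forced_variables, str(self.TEST_DIR)): ..." around the config-loading loop, cleanup happens even when a config fails to parse.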
Example #2
    def test_fine_tune_nograd_regex(self):
        original_model = load_archive(self.model_archive).model
        name_parameters_original = dict(original_model.named_parameters())
        regex_lists = [[],
                       [".*attend_feedforward.*", ".*token_embedder.*"],
                       [".*compare_feedforward.*"]]
        for regex_list in regex_lists:
            params = Params.from_file(self.config_file)
            params["trainer"]["no_grad"] = regex_list
            shutil.rmtree(self.serialization_dir, ignore_errors=True)
            tuned_model = fine_tune_model(
                model=original_model,
                params=params,
                serialization_dir=self.serialization_dir)
            # If a regex matches the parameter name, requires_grad should be
            # False; otherwise it should match the originally loaded model.
            for name, parameter in tuned_model.named_parameters():
                if any(re.search(regex, name) for regex in regex_list):
                    assert not parameter.requires_grad
                else:
                    assert parameter.requires_grad \
                        == name_parameters_original[name].requires_grad
        # If every parameter ends up with requires_grad=False, fine-tuning
        # should raise.
        with pytest.raises(Exception):
            params = Params.from_file(self.config_file)
            params["trainer"]["no_grad"] = ["*"]
            shutil.rmtree(self.serialization_dir, ignore_errors=True)
            tuned_model = fine_tune_model(
                model=original_model,
                params=params,
                serialization_dir=self.serialization_dir)
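
The behavior this test verifies, freezing every parameter whose name matches a no_grad regex, boils down to a small loop. An illustrative sketch (not AllenNLP's actual implementation), assuming model is a torch.nn.Module:

import re

def freeze_by_regex(model, no_grad_regexes):
    # Disable gradients for parameters whose names match any regex.
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)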
Example #3
    def test_regexes_with_backslashes(self):
        bad_regex = self.TEST_DIR / 'bad_regex.jsonnet'
        good_regex = self.TEST_DIR / 'good_regex.jsonnet'

        with open(bad_regex, 'w') as f:
            f.write(r'{"myRegex": "a\.b"}')

        with open(good_regex, 'w') as f:
            f.write(r'{"myRegex": "a\\.b"}')

        with pytest.raises(RuntimeError):
            Params.from_file(bad_regex)

        params = Params.from_file(good_regex)
        regex = params['myRegex']

        assert re.match(regex, "a.b")
        assert not re.match(regex, "a-b")

        # Check roundtripping
        good_regex2 = self.TEST_DIR / 'good_regex2.jsonnet'
        with open(good_regex2, 'w') as f:
            f.write(json.dumps(params.as_dict()))
        params2 = Params.from_file(good_regex2)

        assert params.as_dict() == params2.as_dict()
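
The escaping rule exercised here is plain JSON: a lone backslash before a dot is an invalid escape, so a regex like a\.b must be written with a doubled backslash in the config file. A quick standard-library check, independent of Params:

import json
import re

good = json.loads(r'{"myRegex": "a\\.b"}')["myRegex"]
assert good == r"a\.b"                      # the backslash survives decoding
assert re.match(good, "a.b") and not re.match(good, "a-b")

try:
    json.loads(r'{"myRegex": "a\.b"}')      # \. is not a valid JSON escape
except json.JSONDecodeError:
    pass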
Example #4
    def test_mismatching_dimensions_throws_configuration_error(self):
        params = Params.from_file(self.param_file)
        # Make the phrase layer wrong - it should be 150 to match
        # the embedding + binary feature dimensions.
        params["model"]["encoder"]["input_size"] = 10
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=params.pop("model"))
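
The underlying failure mode here is an encoder whose input_size disagrees with the dimension of the vectors it receives. AllenNLP surfaces this as a ConfigurationError at construction time; in raw PyTorch the same mismatch only appears at the first forward pass, as in this sketch:

import torch

encoder = torch.nn.LSTM(input_size=10, hidden_size=5, batch_first=True)
embeddings = torch.randn(1, 4, 150)  # a sequence of 4 tokens with dimension 150, not 10
try:
    encoder(embeddings)
except RuntimeError as error:
    print(error)  # complains that input.size(-1) must equal input_size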
Example #5
    def test_elmo_num_repr_set_flags_mismatch_throws_configuration_error(self):
        # pylint: disable=line-too-long
        params = Params.from_file(self.FIXTURES_ROOT /
                                  'biattentive_classification_network' /
                                  'elmo_experiment.json')
        # Elmo is specified in the model, with num_output_representations=2. Set
        # only one flag to true.
        tmp_params = deepcopy(params)
        tmp_params["model"]["use_input_elmo"] = False
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

        tmp_params = deepcopy(params)
        tmp_params["model"]["use_input_elmo"] = True
        tmp_params["model"]["use_integrator_output_elmo"] = False
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

        # set num_output_representations to 1, and set both flags to True.
        tmp_params = deepcopy(params)
        tmp_params["model"]["elmo"]["num_output_representations"] = 1
        tmp_params["model"]["use_input_elmo"] = True
        tmp_params["model"]["use_integrator_output_elmo"] = True
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))
Example #6
    def test_mismatching_dimensions_throws_configuration_error(self):
        params = Params.from_file(self.param_file)
        # Make the encoder wrong - it should be 2 to match
        # the embedding dimension from the text_field_embedder.
        params["model"]["encoder"]["input_size"] = 10
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=params.pop("model"))
Example #7
    def test_load_from_file(self):
        filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
        params = Params.from_file(filename)

        assert "dataset_reader" in params
        assert "trainer" in params

        model_params = params.pop("model")
        assert model_params.pop("type") == "bidaf"
Example #8
    def test_fine_tune_does_not_expand_vocab_by_default(self):
        params = Params.from_file(self.config_file)
        # snli2 has a new token in it
        params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' /
                                        'snli2.jsonl')

        model = load_archive(self.model_archive).model

        # By default, no vocab expansion.
        fine_tune_model(model, params, self.serialization_dir)
Example #9
    def test_elmo_but_no_set_flags_throws_configuration_error(self):
        # pylint: disable=line-too-long
        params = Params.from_file(self.FIXTURES_ROOT /
                                  'biattentive_classification_network' /
                                  'elmo_experiment.json')
        # Elmo is specified in the model, but both flags are set to False.
        params["model"]["use_input_elmo"] = False
        params["model"]["use_integrator_output_elmo"] = False
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=params.get("model"))
Example #10
    def test_env_var_substitution(self):
        substitutor = self.TEST_DIR / 'substitutor.jsonnet'
        key = 'TEST_ENV_VAR_SUBSTITUTION'

        assert os.environ.get(key) is None

        with open(substitutor, 'w') as f:
            f.write(f'{{"path": std.extVar("{key}")}}')

        # raises without environment variable set
        with pytest.raises(RuntimeError):
            Params.from_file(substitutor)

        os.environ[key] = "PERFECT"

        params = Params.from_file(substitutor)
        assert params['path'] == "PERFECT"

        del os.environ[key]
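
Under the hood, Params.from_file evaluates the file with the jsonnet library and exposes environment variables as jsonnet external variables, which is what std.extVar reads. Roughly, as a sketch assuming the _jsonnet Python bindings:

import json
import os
import _jsonnet

def evaluate_with_env(filename):
    # Environment variables become jsonnet external variables (std.extVar).
    return json.loads(_jsonnet.evaluate_file(filename, ext_vars=dict(os.environ)))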
Example #11
    def setUp(self):
        super().setUp()
        param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
        self.set_up_model(param_file,
                          self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
        params = Params.from_file(param_file)
        self.reader = DatasetReader.from_params(params['dataset_reader'])
        self.iterator = DataIterator.from_params(params['iterator'])
        self.trainer = Trainer.from_params(self.model, self.TEST_DIR,
                                           self.iterator, self.dataset, None,
                                           params.get('trainer'))
Example #12
def make_vocab_from_args(args: argparse.Namespace):
    """
    Just converts from an ``argparse.Namespace`` object to params.
    """
    parameter_path = args.param_path
    overrides = args.overrides
    serialization_dir = args.serialization_dir

    params = Params.from_file(parameter_path, overrides)

    make_vocab_from_params(params, serialization_dir)
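
Since make_vocab_from_args only reads three attributes, a bare Namespace is enough to drive it. A hypothetical invocation (the paths are illustrative):

import argparse

args = argparse.Namespace(param_path="experiment.jsonnet",
                          overrides="",
                          serialization_dir="/tmp/vocab")
make_vocab_from_args(args)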
Example #13
    def test_fine_tune_runtime_errors_with_vocab_expansion(self):
        params = Params.from_file(self.config_file)
        params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' /
                                        'snli2.jsonl')

        model = load_archive(self.model_archive).model

        # If we do vocab expansion, we get a runtime error because of the embedding.
        with pytest.raises(RuntimeError):
            fine_tune_model(model,
                            params,
                            self.serialization_dir,
                            extend_vocab=True)
Example #14
    def test_overrides(self):
        filename = self.FIXTURES_ROOT / 'bidaf' / 'experiment.json'
        overrides = '{ "train_data_path": "FOO", "model": { "type": "BAR" },'\
                    '"model.text_field_embedder.tokens.type": "BAZ" }'
        params = Params.from_file(filename, overrides)

        assert "dataset_reader" in params
        assert "trainer" in params
        assert params["train_data_path"] == "FOO"

        model_params = params.pop("model")
        assert model_params.pop("type") == "BAR"
        assert model_params["text_field_embedder"]["tokens"]["type"] == "BAZ"
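
Note the two override styles above: a nested object for "model", and a flat dotted key for "model.text_field_embedder.tokens.type". The dotted form is expanded into nested dictionaries before being merged over the file's values; conceptually (an illustrative sketch, not AllenNLP's parse_overrides):

def unflatten_dotted(flat):
    # {"a.b.c": 1} -> {"a": {"b": {"c": 1}}}
    nested = {}
    for dotted_key, value in flat.items():
        current = nested
        *parents, leaf = dotted_key.split(".")
        for part in parents:
            current = current.setdefault(part, {})
        current[leaf] = value
    return nested

assert unflatten_dotted({"model.text_field_embedder.tokens.type": "BAZ"}) == \
    {"model": {"text_field_embedder": {"tokens": {"type": "BAZ"}}}}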
Example #15
    def test_jsonnet_features(self):
        config_file = self.TEST_DIR / 'config.jsonnet'
        with open(config_file, 'w') as f:
            f.write("""{
                            // This example is copied straight from the jsonnet docs
                            person1: {
                                name: "Alice",
                                welcome: "Hello " + self.name + "!",
                            },
                            person2: self.person1 { name: "Bob" },
                        }""")

        params = Params.from_file(config_file)

        alice = params.pop("person1")
        bob = params.pop("person2")

        assert alice.as_dict() == {"name": "Alice", "welcome": "Hello Alice!"}
        assert bob.as_dict() == {"name": "Bob", "welcome": "Hello Bob!"}

        params.assert_empty("TestParams")
Example #16
    def test_no_elmo_but_set_flags_throws_configuration_error(self):
        params = Params.from_file(self.param_file)
        # There is no elmo specified in self.param_file, but set
        # use_input_elmo and use_integrator_output_elmo to True.
        # use_input_elmo set to True
        tmp_params = deepcopy(params)
        tmp_params["model"]["use_input_elmo"] = True
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

        # use_integrator_output_elmo set to True
        tmp_params = deepcopy(params)
        tmp_params["model"]["use_input_elmo"] = False
        tmp_params["model"]["use_integrator_output_elmo"] = True
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))

        # both use_input_elmo and use_integrator_output_elmo set to True
        tmp_params = deepcopy(params)
        tmp_params["model"]["use_input_elmo"] = True
        tmp_params["model"]["use_integrator_output_elmo"] = True
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=tmp_params.get("model"))
Example #17
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    tempdir = None
    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(
            f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)

        serialization_dir = tempdir
    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir,
                                                f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)
    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME),
                              overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)
    if tempdir:
        # Clean up temp dir
        shutil.rmtree(tempdir)
    return Archive(model=model, config=config)
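
Typical usage of load_archive, with an illustrative path: load an archive produced by training, then pull out the model and its config:

archive = load_archive("model.tar.gz", cuda_device=-1)
model = archive.model
config = archive.config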