Example #1
    def build_vocab(self,
                    model_id,
                    config,
                    storage,
                    model_storage,
                    image,
                    push_model=True):
        start_time = time.time()
        local_config = self._finalize_config(config)
        objects, tokenization_config = self._generate_vocabularies(
            local_config)
        end_time = time.time()

        local_config['tokenization'] = utility.resolve_environment_variables(
            tokenization_config)
        config['tokenization'] = tokenization_config
        config['model'] = model_id
        config['modelType'] = 'base'
        config['imageTag'] = image
        config['build'] = {
            'containerId': os.uname()[1],
            'endDate': end_time,
            'startDate': start_time
        }

        # bundle_dependencies and should_check_integrity are helpers defined at
        # module level in the original source (not shown in this snippet).
        bundle_dependencies(objects, config, local_config)
        objects_dir = os.path.join(self._models_dir, model_id)
        utility.build_model_dir(objects_dir, objects, config,
                                should_check_integrity)
        if push_model:
            storage.push(objects_dir, storage.join(model_storage, model_id))
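
The method stamps build metadata onto the configuration before packaging the model directory. A hypothetical snapshot of the keys assigned above (all values are illustrative, not taken from the project):

# Hypothetical result of the assignments in build_vocab (illustrative values):
config = {
    "tokenization": {"source": {"vocabulary": "vocab.src"}},  # from _generate_vocabularies()
    "model": "my-model-id",            # model_id
    "modelType": "base",
    "imageTag": "my/image:latest",     # image
    "build": {
        "containerId": "host-01",      # os.uname()[1], i.e. the hostname
        "startDate": 1577836800.0,     # time.time() before vocabulary generation
        "endDate": 1577836860.0,       # time.time() after vocabulary generation
    },
}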
Example #2
    def _finalize_config(self, config, training=True):
        config = utility.resolve_environment_variables(
            config, training=training)
        config = self._upgrade_data_config(config, training=training)
        config = utility.resolve_remote_files(
            config, self._shared_dir, self._storage)
        return config
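
For context, _finalize_config chains three transformations in a fixed order. A comment-only sketch of that order (the example path and described behaviors are assumptions, not taken from the project):

# Hypothetical input, assuming DATA_DIR=/data is set in the environment:
#   {"vocabulary": "${DATA_DIR}/vocab.txt"}
# 1. resolve_environment_variables -> {"vocabulary": "/data/vocab.txt"}
# 2. _upgrade_data_config          -> older data configurations upgraded to the
#                                     current schema
# 3. resolve_remote_files          -> remote paths fetched into self._shared_dir
#                                     and replaced by their local copies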
Example #3
    def build_vocab(self,
                    model_id,
                    config,
                    storage,
                    model_storage,
                    image,
                    push_model=True):
        start_time = time.time()
        local_config = self._finalize_config(config)
        objects, preprocess_config, vocab_config = self._generate_vocabularies(
            local_config)
        end_time = time.time()

        # Old PN9 tokenization / buildvocab configuration
        if isinstance(preprocess_config, dict):
            local_config["tokenization"] = utility.resolve_environment_variables(
                preprocess_config)
            config["tokenization"] = preprocess_config
        elif isinstance(preprocess_config, list):
            local_config["preprocess"] = utility.resolve_environment_variables(
                preprocess_config)
            config["preprocess"] = preprocess_config
            local_config["vocabulary"] = utility.resolve_environment_variables(
                vocab_config)
            config["vocabulary"] = vocab_config
        else:
            raise RuntimeError(
                'Unknown preprocess configuration after buildvocab: "{}"'.format(
                    preprocess_config))

        config['model'] = model_id
        config['modelType'] = 'base'
        config['imageTag'] = image
        config['build'] = {
            'containerId': os.uname()[1],
            'endDate': end_time,
            'startDate': start_time
        }

        bundle_dependencies(objects, config, local_config)
        objects_dir = os.path.join(self._models_dir, model_id)
        utility.build_model_dir(objects_dir, objects, config,
                                should_check_integrity)
        if push_model:
            storage.push(objects_dir, storage.join(model_storage, model_id))
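
The isinstance branching above supports two generations of configurations. A hypothetical example of each shape (field names and contents are illustrative only):

# Old-style: _generate_vocabularies returns a tokenization dict, stored under
# the "tokenization" key.
preprocess_config = {
    "source": {"vocabulary": "vocab.src", "mode": "aggressive"},
    "target": {"vocabulary": "vocab.tgt", "mode": "aggressive"},
}

# New-style: it returns a list of preprocess operators, with the vocabularies
# moved to a separate "vocabulary" section.
preprocess_config = [
    {"op": "tokenization", "source": {"mode": "aggressive"}},
]
vocab_config = {
    "source": {"path": "vocab.src"},
    "target": {"path": "vocab.tgt"},
}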
Example #4
def test_resolve_env():
    config = {"a": "${A_DIR}/a", "b": ["${B_DIR}/b", "${A_TRAIN_DIR}/a"]}
    os.environ["A_DIR"] = "foo"
    os.environ["B_DIR"] = "bar"
    config = utility.resolve_environment_variables(config)
    assert config["a"] == "foo/a"
    assert config["b"] == ["bar/b", "foo/a"]
    del os.environ["A_DIR"]
    del os.environ["B_DIR"]

def test_resolve_env_no_training():
    config = {
        "a": "${A_DIR}/a",
        "b": "${A_TRAIN_DIR}/a"
    }
    os.environ["A_DIR"] = "foo"
    config = utility.resolve_environment_variables(config, training=False)
    assert config["a"] == "foo/a"
    assert config["b"] == "${A_TRAIN_DIR}/a"
Example #6
    def _get_vocabs_info(self,
                         config,
                         local_config,
                         model_config=None,
                         tokens_to_add=None,
                         keep_previous=False):
        if tokens_to_add is None:
            tokens_to_add = {}
        vocab_config = config.get('vocabulary', {})
        vocab_local_config = local_config.get('vocabulary', {})
        # For compatibility with old configurations
        tok_config = config.get('tokenization', {})
        tok_local_config = local_config.get('tokenization', {})
        joint_vocab = is_joint_vocab(vocab_local_config)
        parent_dependencies = {}
        if model_config:
            model_config = config_util.old_to_new_config(model_config)
            model_vocab_config = model_config.get('vocabulary', {})
            model_vocab_local_config = utility.resolve_remote_files(
                utility.resolve_environment_variables(model_vocab_config),
                self._shared_dir, self._storage)
            model_joint_vocab = is_joint_vocab(model_vocab_local_config)
            if joint_vocab != model_joint_vocab:
                raise ValueError(
                    "Changing joint vocabularies to split vocabularies "
                    "(or vice-versa) is currently not supported.")
            if keep_previous:
                bundle_dependencies(parent_dependencies,
                                    copy.deepcopy(model_vocab_config),
                                    copy.deepcopy(model_vocab_local_config))
        else:
            model_vocab_config = None
            model_vocab_local_config = None
        source_tokens_to_add = tokens_to_add.get('source') or []
        target_tokens_to_add = tokens_to_add.get('target') or []
        if joint_vocab:
            source_tokens_to_add = set(
                list(source_tokens_to_add) + list(target_tokens_to_add))
            target_tokens_to_add = source_tokens_to_add
        src_info = self._get_vocab_info(
            'source',
            vocab_config,
            vocab_local_config,
            tok_config,
            tok_local_config,
            model_config=model_vocab_config,
            model_local_config=model_vocab_local_config,
            tokens_to_add=source_tokens_to_add,
            keep_previous=keep_previous,
            joint_vocab=joint_vocab)
        tgt_info = self._get_vocab_info(
            'target',
            vocab_config,
            vocab_local_config,
            tok_config,
            tok_local_config,
            model_config=model_vocab_config,
            model_local_config=model_vocab_local_config,
            tokens_to_add=target_tokens_to_add,
            keep_previous=keep_previous,
            joint_vocab=joint_vocab)

        if vocab_config:
            config.pop('tokenization', None)
            local_config.pop('tokenization', None)

        return src_info, tgt_info, parent_dependencies
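
With a joint vocabulary, tokens requested for either side are merged and added to both sides. A quick illustration with hypothetical tokens:

tokens_to_add = {"source": ["<city>"], "target": ["<city>", "<name>"]}
# joint_vocab=True:  both sides receive the deduplicated set {"<city>", "<name>"}
# joint_vocab=False: source keeps ["<city>"], target keeps ["<city>", "<name>"]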