Exemplo n.º 1
0
    def build_vocab(self,
                    model_id,
                    config,
                    storage,
                    model_storage,
                    image,
                    push_model=True):
        """Generate vocabularies and package them as a 'base' model.

        The tokenization configuration returned by the vocabulary generation
        is recorded in both the resolved (local) config and the user config,
        the model package is assembled on disk and, when ``push_model`` is
        True, pushed to ``model_storage`` under ``model_id``.
        """
        started_at = time.time()
        resolved_config = self._finalize_config(config)
        objects, tokenization_config = self._generate_vocabularies(
            resolved_config)
        finished_at = time.time()

        # Local config gets the fully resolved values; the user config keeps
        # the raw (unresolved) tokenization section.
        resolved_config['tokenization'] = utility.resolve_environment_variables(
            tokenization_config)
        config.update({
            'tokenization': tokenization_config,
            'model': model_id,
            'modelType': 'base',
            'imageTag': image,
            'build': {
                'containerId': os.uname()[1],  # hostname of the running container
                'endDate': finished_at,
                'startDate': started_at,
            },
        })

        bundle_dependencies(objects, config, resolved_config)
        package_dir = os.path.join(self._models_dir, model_id)
        utility.build_model_dir(package_dir, objects, config,
                                should_check_integrity)
        if push_model:
            storage.push(package_dir, storage.join(model_storage, model_id))
Exemplo n.º 2
0
 def release_wrapper(self,
                     config,
                     model_path,
                     storage,
                     image,
                     destination,
                     optimization_level=None,
                     gpuid=0,
                     push_model=True):
     """Export the model at *model_path* as a '<model>_release' package.

     Training-only sections of the configuration are stripped, the release
     package is assembled on disk and, when ``push_model`` is True, pushed
     to ``destination``.
     """
     inference_config = self._finalize_config(config, training=False)
     objects = self.release(inference_config,
                            model_path,
                            optimization_level=optimization_level,
                            gpuid=gpuid)
     extract_model_resources(objects, config)
     release_id = config['model'] + '_release'
     config['model'] = release_id
     config['modelType'] = 'release'
     config['imageTag'] = image
     # Drop metadata that only applies to trainable models.
     for stale_key in ('parent_model', 'build', 'data'):
         config.pop(stale_key, None)
     release_dir = os.path.join(self._models_dir, release_id)
     build_model_dir(release_dir, objects, config, should_check_integrity)
     if push_model:
         storage.push(release_dir, storage.join(destination, release_id))
Exemplo n.º 3
0
 def release_wrapper(self,
                     config,
                     model_path,
                     storage,
                     image,
                     destination,
                     optimization_level=None,
                     gpuid=0,
                     push_model=True):
     """Export the model at *model_path* as a '<model>_release' package.

     Training-only configuration is stripped, declared inference options
     are validated and written next to the model as ``options.json``, and
     the package is pushed to ``destination`` when ``push_model`` is True.
     """
     inference_config = self._finalize_config(config, training=False)
     objects = self.release(inference_config,
                            model_path,
                            optimization_level=optimization_level,
                            gpuid=gpuid)
     bundle_dependencies(objects, config, inference_config)
     release_id = config['model'] + '_release'
     config['model'] = release_id
     config['modelType'] = 'release'
     config['imageTag'] = image
     # Drop metadata that only applies to trainable models.
     for stale_key in ('parent_model', 'build', 'data'):
         config.pop(stale_key, None)
     # Validate and ship the inference options schema with the model.
     inference_options = config.get('inference_options')
     if inference_options is not None:
         schema = config_util.validate_inference_options(
             inference_options, config)
         options_path = os.path.join(self._output_dir, 'options.json')
         with open(options_path, 'w') as options_file:
             json.dump(schema, options_file)
         objects[os.path.basename(options_path)] = options_path
     release_dir = os.path.join(self._models_dir, release_id)
     build_model_dir(release_dir, objects, config, should_check_integrity)
     if push_model:
         storage.push(release_dir, storage.join(destination, release_id))
Exemplo n.º 4
0
    def preprocess_into_model(self,
                              model_id,
                              config,
                              storage,
                              model_storage,
                              image,
                              parent_model=None,
                              model_path=None,
                              push_model=True):
        """Sample and preprocess training data and package the result as a
        'preprocess' model.

        Args:
            model_id: Identifier of the model package to produce.
            config: User configuration; updated in place with model metadata.
            storage: Storage client used to push the packaged model.
            model_storage: Remote location receiving the packaged model.
            image: Image tag recorded in the model description.
            parent_model: Optional identifier of the parent model.
            model_path: Optional local path to the parent model package;
                its extra files are forwarded into the new package.
            push_model: When True, push the package to ``model_storage``.

        Returns:
            A dict with the number of sentences from the build summary
            (None when 'sentenceCount' is absent).

        Raises:
            RuntimeError: If data sampling produced no sentences.
        """
        logger.info('Starting preprocessing %s', model_id)
        start_time = time.time()

        local_config = self._finalize_config(config)
        data_dir, train_dir, num_samples, distribution_summary, samples_metadata = (
            self._generate_training_data(local_config))
        if num_samples == 0:
            raise RuntimeError('data sampling generated 0 sentences')
        # Backends that cannot consume multiple training files get a single
        # merged file instead.
        if not self._support_multi_training_files:
            data_dir = self._merge_multi_training_files(
                data_dir, train_dir, config['source'], config['target'])

        end_time = time.time()
        logger.info('Finished preprocessing %s in %s seconds', model_id, str(end_time-start_time))

        # Fill training details.
        if parent_model:
            config['parent_model'] = parent_model
        config['model'] = model_id
        config['modelType'] = 'preprocess'
        config['imageTag'] = image
        # Sampling info is recorded so a later training run can reuse it.
        config['sampling'] = {
            'numSamples': num_samples,
            'samplesMetadata': samples_metadata}
        parent_build_info = config.get('build')
        build_info = {
            'containerId': os.uname()[1],  # hostname of the running container
            'endDate': end_time,
            'startDate': start_time
        }

        build_info = self._summarize_data_distribution(
            build_info, distribution_summary, parent_build_info=parent_build_info)
        config['build'] = build_info

        # Build and push the model package.
        objects = {'data': data_dir}
        bundle_dependencies(objects, config, local_config)
        # Forward other files from the parent model.
        if model_path is not None:
            for f in os.listdir(model_path):
                if f not in objects:
                    objects[f] = os.path.join(model_path, f)
        objects_dir = os.path.join(self._models_dir, model_id)
        build_model_dir(objects_dir, objects, config, should_check_integrity)
        if push_model:
            storage.push(objects_dir, storage.join(model_storage, model_id))
        return {
            'num_sentences': build_info.get('sentenceCount')
        }
Exemplo n.º 5
0
    def build_vocab(self,
                    model_id,
                    config,
                    storage,
                    model_storage,
                    image,
                    push_model=True):
        """Generate vocabularies and package them as a 'base' model.

        Handles both the legacy single-dict tokenization configuration and
        the newer list-based preprocess + vocabulary configuration returned
        by the vocabulary generation step.
        """
        started_at = time.time()
        resolved_config = self._finalize_config(config)
        objects, preprocess_config, vocab_config = self._generate_vocabularies(
            resolved_config)
        finished_at = time.time()

        if isinstance(preprocess_config, dict):
            # Old PN9 tokenization / buildvocab configuration.
            resolved_config["tokenization"] = (
                utility.resolve_environment_variables(preprocess_config))
            config["tokenization"] = preprocess_config
        elif isinstance(preprocess_config, list):
            # New-style preprocess pipeline plus a separate vocabulary section.
            resolved_config["preprocess"] = (
                utility.resolve_environment_variables(preprocess_config))
            config["preprocess"] = preprocess_config
            resolved_config["vocabulary"] = (
                utility.resolve_environment_variables(vocab_config))
            config["vocabulary"] = vocab_config
        else:
            raise RuntimeError(
                "Unknown preprocess configuration after buildvocab: \"{}\"".
                format(preprocess_config))

        config['model'] = model_id
        config['modelType'] = 'base'
        config['imageTag'] = image
        config['build'] = {
            'containerId': os.uname()[1],  # hostname of the running container
            'endDate': finished_at,
            'startDate': started_at
        }

        bundle_dependencies(objects, config, resolved_config)
        package_dir = os.path.join(self._models_dir, model_id)
        utility.build_model_dir(package_dir, objects, config,
                                should_check_integrity)
        if push_model:
            storage.push(package_dir, storage.join(model_storage, model_id))
Exemplo n.º 6
0
    def preprocess_into_model(self,
                              model_id,
                              config,
                              storage,
                              model_storage,
                              image,
                              parent_model=None,
                              model_path=None,
                              model_config=None,
                              push_model=True):
        """Build the training data and package it as a 'preprocess' model.

        Args:
            model_id: Identifier of the model package to produce.
            config: User configuration; updated in place with model metadata.
            storage: Storage client used to push the packaged model.
            model_storage: Remote location receiving the packaged model.
            image: Image tag recorded in the model description.
            parent_model: Optional identifier of the parent model.
            model_path: Optional local path to the parent model package;
                its extra files are forwarded into the new package.
            model_config: Optional configuration of the parent model,
                forwarded to the vocabulary info lookup.
            push_model: When True, push the package to ``model_storage``.

        Returns:
            A dict with the number of sentences from the build summary
            (None when 'sentenceCount' is absent).
        """
        logger.info('Starting preprocessing %s', model_id)
        start_time = time.time()

        local_config = self._finalize_config(config)
        data_dir, num_samples, distribution_summary, samples_metadata, tokens_to_add = (
            self._build_data(local_config))

        end_time = time.time()
        logger.info('Finished preprocessing %s in %s seconds', model_id,
                    str(end_time - start_time))

        # Only the parent dependency list is needed here; the vocab info
        # itself is consumed by the training step.
        _, _, parent_dependencies = self._get_vocabs_info(
            config,
            local_config,
            model_config=model_config,
            tokens_to_add=tokens_to_add,
            keep_previous=True)

        # Fill training details.
        if parent_model:
            config['parent_model'] = parent_model
        config['model'] = model_id
        config['modelType'] = 'preprocess'
        config['imageTag'] = image
        # Sampling info is recorded so a later training run can reuse it.
        config['sampling'] = {
            'numSamples': num_samples,
            'samplesMetadata': samples_metadata
        }
        parent_build_info = config.get('build')
        build_info = {
            'containerId': os.uname()[1],  # hostname of the running container
            'endDate': end_time,
            'startDate': start_time
        }

        build_info = self._summarize_data_distribution(
            build_info,
            distribution_summary,
            parent_build_info=parent_build_info)
        config['build'] = build_info

        # Build and push the model package.
        objects = {'data': data_dir}
        bundle_dependencies(objects, config, local_config)
        # Forward other files from the parent model that are not tracked by the config.
        if model_path is not None:
            for f in os.listdir(model_path):
                if f not in objects and f not in parent_dependencies:
                    objects[f] = os.path.join(model_path, f)
        objects_dir = os.path.join(self._models_dir, model_id)
        utility.build_model_dir(objects_dir, objects, config,
                                should_check_integrity)
        if push_model:
            storage.push(objects_dir, storage.join(model_storage, model_id))
        return {'num_sentences': build_info.get('sentenceCount')}
Exemplo n.º 7
0
    def train_wrapper(self,
                      model_id,
                      config,
                      storage,
                      model_storage,
                      image,
                      parent_model=None,
                      model_path=None,
                      model_config=None,
                      gpuid=0,
                      push_model=True):
        """Run a training and package the result as a 'checkpoint' model.

        Args:
            model_id: Identifier of the model package to produce.
            config: User configuration; updated in place with model metadata.
            storage: Storage client used to push the packaged model.
            model_storage: Remote location receiving the packaged model.
            image: Image tag recorded in the model description.
            parent_model: Optional identifier of the parent model.
            model_path: Optional local path to the parent model package.
            model_config: Optional configuration of the parent model.
            gpuid: GPU identifier forwarded to the training backend.
            push_model: When True, push the package to ``model_storage``.

        Returns:
            A dict with the number of sentences from the build summary
            (None when 'sentenceCount' is absent).
        """
        logger.info('Starting training model %s', model_id)
        start_time = time.time()

        # The parent model type is only meaningful when a parent package is
        # actually available locally.
        parent_model_type = config.get(
            'modelType') if model_path is not None else None
        local_config = self._finalize_config(config)
        if parent_model_type == 'preprocess':
            # Reuse the data and sampling info prepared by the parent
            # 'preprocess' run instead of rebuilding them.
            data_dir = os.path.join(model_path, 'data')
            num_samples = config['sampling']['numSamples']
            samples_metadata = config['sampling']['samplesMetadata']
            tokens_to_add = {}
            del config['sampling']
            # Bug fix: use lazy %-style logging arguments instead of eager
            # string interpolation inside the logging call.
            logger.info('Using preprocessed data from %s', data_dir)
        else:
            # NOTE: distribution_summary is only bound on this branch; it is
            # only read on the matching non-preprocess branch below.
            data_dir, num_samples, distribution_summary, samples_metadata, tokens_to_add = (
                self._build_data(local_config))

        src_vocab_info, tgt_vocab_info, _ = self._get_vocabs_info(
            config,
            local_config,
            model_config=model_config,
            tokens_to_add=tokens_to_add)

        # A 'base' parent only provides vocabularies, not checkpoint weights.
        if parent_model_type in ('base', ):
            model_path = None
        objects = self.train_multi_files(local_config,
                                         data_dir,
                                         src_vocab_info,
                                         tgt_vocab_info,
                                         model_path=model_path,
                                         num_samples=num_samples,
                                         samples_metadata=samples_metadata,
                                         gpuid=gpuid)

        end_time = time.time()
        logger.info('Finished training model %s in %s seconds', model_id,
                    str(end_time - start_time))

        # Fill training details.
        config['model'] = model_id
        config['modelType'] = 'checkpoint'
        config['imageTag'] = image
        build_info = {
            'containerId': os.uname()[1],  # hostname of the running container
            'endDate': end_time,
            'startDate': start_time
        }

        if parent_model_type == 'preprocess':
            # Inherit distribution summary and the parent from the preprocess run.
            config['build'].update(build_info)
        else:
            if parent_model:
                config['parent_model'] = parent_model
            parent_build_info = config.get('build')
            build_info = self._summarize_data_distribution(
                build_info,
                distribution_summary,
                parent_build_info=parent_build_info)
            config['build'] = build_info

        # Build and push the model package.
        bundle_dependencies(objects, config, local_config)
        objects_dir = os.path.join(self._models_dir, model_id)
        utility.build_model_dir(objects_dir, objects, config,
                                should_check_integrity)
        if push_model:
            storage.push(objects_dir, storage.join(model_storage, model_id))
        return {'num_sentences': config['build'].get('sentenceCount')}