Example No. 1
    def _serving_state(self, config):
        # Build source/target tokenizers for serving when a 'tokenization'
        # section is present in the configuration.
        state = {}
        if 'tokenization' in config:
            tok_config = config['tokenization']
            state['src_tokenizer'] = tokenizer.build_tokenizer(tok_config['source'])
            state['tgt_tokenizer'] = tokenizer.build_tokenizer(tok_config['target'])
        return state
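A minimal sketch of the configuration shape this method expects when tokenization is enabled; the 'aggressive' mode value is borrowed from Example No. 7 below, and any further tokenizer options are assumptions about the underlying library:

config = {
    'tokenization': {
        'source': {'mode': 'aggressive'},
        'target': {'mode': 'aggressive'},
    },
}
# From inside the class, self._serving_state(config) would then return a dict
# with 'src_tokenizer' and 'tgt_tokenizer' entries; without the 'tokenization'
# section it returns an empty dict.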
Example No. 2
    def __init__(self, tok_config):
        # Prefer the side-specific section; fall back to a shared 'multi'
        # section when it is not defined.
        self._src_tokenizer = ('source' in tok_config and
                               tokenizer.build_tokenizer(tok_config['source'])) or \
                              ('multi' in tok_config and
                               tokenizer.build_tokenizer(tok_config['multi']))

        self._tgt_tokenizer = ('target' in tok_config and
                               tokenizer.build_tokenizer(tok_config['target'])) or \
                              ('multi' in tok_config and
                               tokenizer.build_tokenizer(tok_config['multi']))
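For comparison, a sketch of a configuration that only provides the shared section; the 'multi' key comes from the snippet above and the 'mode' value from Example No. 7, everything else is a placeholder:

tok_config = {'multi': {'mode': 'aggressive'}}
# Neither 'source' nor 'target' is present, so both attributes fall back to the
# tokenizer built from the shared 'multi' configuration.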
Example No. 3
    def _generate_training_data(self, config):
        if 'data' in config and 'train_dir' in config['data']:
            train_dir = config['data']['train_dir']
        else:
            train_dir = 'train'
        data_path = os.path.join(self._corpus_dir, train_dir)
        num_samples = None
        summary = None
        metadata = None
        logger.info('Generating training data from %s', data_path)
        # Optionally sub-sample the training files according to the configured distribution.
        if 'data' in config and 'sample_dist' in config['data']:
            sample_dir = os.path.join(self._data_dir, 'sample')
            if not os.path.exists(sample_dir):
                os.mkdir(sample_dir)
            sample_path = os.path.join(sample_dir, train_dir)
            logger.info('Sampling training data to %s', sample_path)
            summary, metadata = sample(config['data']['sample'],
                                       config['data']['sample_dist'],
                                       data_path, sample_path,
                                       config['source'], config['target'])
            num_samples = sum(six.itervalues(summary['file']))
            if num_samples == 0:
                raise RuntimeError('data sampling generated 0 sentences')
            data_path = sample_path
        # Tokenize the (possibly sampled) training files when tokenization is configured.
        if 'tokenization' in config:
            tok_config = config['tokenization']
            src_tokenizer = tokenizer.build_tokenizer(
                tok_config['source'] if 'source' in tok_config else tok_config)
            tgt_tokenizer = tokenizer.build_tokenizer(
                tok_config['target'] if 'target' in tok_config else tok_config)
            tokenized_dir = os.path.join(self._data_dir, 'tokenized')
            if not os.path.exists(tokenized_dir):
                os.mkdir(tokenized_dir)
            tokenized_path = os.path.join(tokenized_dir, train_dir)
            logger.info('Tokenizing training data to %s', tokenized_path)
            tokenizer.tokenize_directory(data_path, tokenized_path,
                                         src_tokenizer, tgt_tokenizer,
                                         config['source'], config['target'])
            data_path = tokenized_path
        # Frameworks that expect a single training file per side get the files merged.
        if not self._support_multi_training_files:
            merged_dir = os.path.join(self._data_dir, 'merged')
            if not os.path.exists(merged_dir):
                os.mkdir(merged_dir)
            merged_path = os.path.join(merged_dir, train_dir)
            logger.info('Merging training data to %s/train.{%s,%s}',
                        merged_path, config['source'], config['target'])
            data.merge_files_in_directory(data_path, merged_path,
                                          config['source'], config['target'])
            data_path = merged_path

        return data_path, num_samples, summary, metadata
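A hypothetical call site for this method, from inside the same class, only to show how the returned tuple is unpacked (the variable names are placeholders):

data_path, num_samples, summary, metadata = self._generate_training_data(config)
# data_path points at the last directory produced by the pipeline: the raw
# corpus, the sampled copy, the tokenized copy, or the merged copy.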
Example No. 4
    def _postprocess_file(self, config, source, target):
        # Detokenize the target file when tokenization is configured; otherwise
        # return it unchanged.
        if 'tokenization' in config:
            tok_config = config['tokenization']
            tgt_tokenizer = tokenizer.build_tokenizer(tok_config['target'])
            output = "%s.detok" % target
            tokenizer.detokenize_file(tgt_tokenizer, target, output)
            return output
        return target
Example No. 5
    def _preprocess_file(self, config, input):
        # Tokenize the input file with the source tokenizer when configured.
        if 'tokenization' in config:
            tok_config = config['tokenization']
            src_tokenizer = tokenizer.build_tokenizer(tok_config['source'])
            output = "%s.tok" % input
            tokenizer.tokenize_file(src_tokenizer, input, output)
            return output
        return input
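A hypothetical usage of this helper, assuming it is called from inside the same class; the file name is a placeholder:

tokenized_path = self._preprocess_file(config, 'newstest.src')
# Returns 'newstest.src.tok' when a 'tokenization' section is configured,
# otherwise the original path unchanged.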
Example No. 6
    def _postprocess_file(self, config, input):
        # Detokenize with the target configuration, falling back to the whole
        # tokenization section when no target-specific entry is given.
        if 'tokenization' in config:
            tok_config = config['tokenization']
            tgt_tokenizer = tokenizer.build_tokenizer(
                tok_config['target'] if 'target' in tok_config else tok_config)
            output = "%s.detok" % input
            tokenizer.detokenize_file(tgt_tokenizer, input, output)
            return output
        return input
Example No. 7
    def build_tokenizer_by_config(self, tok_config, lang):
        if tok_config is None:
            tok_config = {"mode": "aggressive"}
            if lang == 'zh':
                tok_config['segment_alphabet'] = ['Han']
                tok_config['segment_alphabet_change'] = True
        # Disable SentencePiece subword sampling so tokenization stays deterministic.
        if 'sp_nbest_size' in tok_config:
            tok_config['sp_nbest_size'] = 0
        return tokenizer.build_tokenizer(tok_config)
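For lang == 'zh' with no explicit configuration, the default dictionary built above (before the sp_nbest_size check, which does not apply to it) is:

tok_config = {
    'mode': 'aggressive',
    'segment_alphabet': ['Han'],
    'segment_alphabet_change': True,
}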
Example No. 8
    def _generate_training_data(self, config):
        if 'data' in config and 'train_dir' in config['data']:
            train_dir = config['data']['train_dir']
        else:
            train_dir = 'train'
        data_path = os.path.join(self._corpus_dir, train_dir)
        num_samples = None
        summary = None
        metadata = None
        # Optionally sub-sample the training files according to the configured distribution.
        if 'data' in config and 'sample_dist' in config['data']:
            sample_dir = os.path.join(self._data_dir, 'sample')
            if not os.path.exists(sample_dir):
                os.mkdir(sample_dir)
            sample_path = os.path.join(sample_dir, train_dir)
            logger.info('Sampling training data to %s', sample_path)
            summary, metadata = sample(
                config['data']['sample'],
                config['data']['sample_dist'],
                data_path,
                sample_path,
                config['source'],
                config['target'])
            num_samples = sum(six.itervalues(summary['file']))
            data_path = sample_path
        # Tokenize the (possibly sampled) training files when tokenization is configured.
        if 'tokenization' in config:
            tok_config = config['tokenization']
            # Each tokenizer is built only if its section is present; otherwise
            # the variable is left as the boolean False.
            src_tokenizer = ('source' in tok_config
                             and tokenizer.build_tokenizer(tok_config['source']))
            tgt_tokenizer = ('target' in tok_config
                             and tokenizer.build_tokenizer(tok_config['target']))
            tokenized_dir = os.path.join(self._data_dir, 'tokenized')
            if not os.path.exists(tokenized_dir):
                os.mkdir(tokenized_dir)
            tokenized_path = os.path.join(tokenized_dir, train_dir)
            logger.info('Tokenizing training data to %s', tokenized_path)
            tokenizer.tokenize_directory(
                data_path,
                tokenized_path,
                src_tokenizer,
                tgt_tokenizer,
                config['source'],
                config['target'])
            data_path = tokenized_path

        return data_path, train_dir, num_samples, summary, metadata
Example No. 9
    def __init__(self, tok_config):
        self._src_tokenizer = 'source' in tok_config and \
                              tokenizer.build_tokenizer(tok_config['source'])

        self._tgt_tokenizer = 'target' in tok_config and \
                              tokenizer.build_tokenizer(tok_config['target'])
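Note that with this idiom the attribute is the boolean False, not None, when a section is missing. A minimal sketch of both outcomes (the config values are placeholders):

tok_config = {'source': {'mode': 'aggressive'}}
# 'source' in tok_config and tokenizer.build_tokenizer(...)  -> a tokenizer object
# 'target' in tok_config and tokenizer.build_tokenizer(...)  -> False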