def _serving_state(self, config):
    state = {}
    if 'tokenization' in config:
        tok_config = config['tokenization']
        state['src_tokenizer'] = tokenizer.build_tokenizer(tok_config['source'])
        state['tgt_tokenizer'] = tokenizer.build_tokenizer(tok_config['target'])
    return state
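# Illustrative only: a minimal shape of the 'config' dict that _serving_state()
# reads. The tokenization options follow the conventions used elsewhere in this
# code ("mode": "aggressive" appears in build_tokenizer_by_config below); the
# exact values are assumptions, not a canonical configuration.
_example_serving_config = {
    'tokenization': {
        'source': {'mode': 'aggressive'},
        'target': {'mode': 'aggressive'},
    },
}
# state = self._serving_state(_example_serving_config)
# state['src_tokenizer'] and state['tgt_tokenizer'] are then ready to use.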
def __init__(self, tok_config):
    # Prefer a side-specific tokenizer config; otherwise fall back to a shared
    # 'multi' config. The short-circuit idiom leaves the attribute False when
    # neither key is present.
    self._src_tokenizer = ('source' in tok_config and
                           tokenizer.build_tokenizer(tok_config['source'])) or \
                          ('multi' in tok_config and
                           tokenizer.build_tokenizer(tok_config['multi']))
    self._tgt_tokenizer = ('target' in tok_config and
                           tokenizer.build_tokenizer(tok_config['target'])) or \
                          ('multi' in tok_config and
                           tokenizer.build_tokenizer(tok_config['multi']))
def _generate_training_data(self, config):
    # Resolve the training corpus location, defaulting to the 'train' subdirectory.
    if 'data' in config and 'train_dir' in config['data']:
        train_dir = config['data']['train_dir']
    else:
        train_dir = 'train'
    data_path = os.path.join(self._corpus_dir, train_dir)
    num_samples = None
    summary = None
    metadata = None
    logger.info('Generating training data from %s', data_path)
    # Optional sampling pass over the corpus, driven by the 'sample_dist' rules.
    if 'data' in config and 'sample_dist' in config['data']:
        sample_dir = os.path.join(self._data_dir, 'sample')
        if not os.path.exists(sample_dir):
            os.mkdir(sample_dir)
        sample_path = os.path.join(sample_dir, train_dir)
        logger.info('Sampling training data to %s', sample_path)
        summary, metadata = sample(config['data']['sample'],
                                   config['data']['sample_dist'],
                                   data_path,
                                   sample_path,
                                   config['source'],
                                   config['target'])
        num_samples = sum(six.itervalues(summary['file']))
        if num_samples == 0:
            raise RuntimeError('data sampling generated 0 sentences')
        data_path = sample_path
    # Optional tokenization pass; a side-specific config is used when present,
    # otherwise the whole 'tokenization' block is shared by both sides.
    if 'tokenization' in config:
        tok_config = config['tokenization']
        src_tokenizer = tokenizer.build_tokenizer(
            tok_config['source'] if 'source' in tok_config else tok_config)
        tgt_tokenizer = tokenizer.build_tokenizer(
            tok_config['target'] if 'target' in tok_config else tok_config)
        tokenized_dir = os.path.join(self._data_dir, 'tokenized')
        if not os.path.exists(tokenized_dir):
            os.mkdir(tokenized_dir)
        tokenized_path = os.path.join(tokenized_dir, train_dir)
        logger.info('Tokenizing training data to %s', tokenized_path)
        tokenizer.tokenize_directory(data_path,
                                     tokenized_path,
                                     src_tokenizer,
                                     tgt_tokenizer,
                                     config['source'],
                                     config['target'])
        data_path = tokenized_path
    # Frameworks that expect a single training file get the corpus merged.
    if not self._support_multi_training_files:
        merged_dir = os.path.join(self._data_dir, 'merged')
        if not os.path.exists(merged_dir):
            os.mkdir(merged_dir)
        merged_path = os.path.join(merged_dir, train_dir)
        logger.info('Merging training data to %s/train.{%s,%s}',
                    merged_path, config['source'], config['target'])
        data.merge_files_in_directory(data_path, merged_path,
                                      config['source'], config['target'])
        data_path = merged_path
    return data_path, num_samples, summary, metadata
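# Illustrative only: the top-level keys _generate_training_data() consults.
# 'sample' and 'sample_dist' are passed straight through to sample() and are
# not inspected here, so their values below are placeholders and assumptions
# for illustration, not a documented schema.
_example_training_config = {
    'source': 'en',
    'target': 'fr',
    'data': {
        'train_dir': 'train',
        'sample': 10000,       # assumed: forwarded to sample() unchanged
        'sample_dist': [],     # placeholder: forwarded to sample() unchanged
    },
    'tokenization': {
        'source': {'mode': 'aggressive'},
        'target': {'mode': 'aggressive'},
    },
}
# data_path, num_samples, summary, metadata = \
#     self._generate_training_data(_example_training_config)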
def _postprocess_file(self, config, source, target):
    if 'tokenization' in config:
        tok_config = config['tokenization']
        tgt_tokenizer = tokenizer.build_tokenizer(tok_config['target'])
        output = "%s.detok" % target
        tokenizer.detokenize_file(tgt_tokenizer, target, output)
        return output
    return target
def _preprocess_file(self, config, input):
    if 'tokenization' in config:
        tok_config = config['tokenization']
        src_tokenizer = tokenizer.build_tokenizer(tok_config['source'])
        output = "%s.tok" % input
        tokenizer.tokenize_file(src_tokenizer, input, output)
        return output
    return input
def _postprocess_file(self, config, input):
    if 'tokenization' in config:
        tok_config = config['tokenization']
        tgt_tokenizer = tokenizer.build_tokenizer(
            tok_config['target'] if 'target' in tok_config else tok_config)
        output = "%s.detok" % input
        tokenizer.detokenize_file(tgt_tokenizer, input, output)
        return output
    return input
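# A minimal sketch of how the hooks above pair up around translation
# (file names are hypothetical): the source file is tokenized before decoding
# and the hypothesis is detokenized afterwards, mirroring the ".tok"/".detok"
# suffixes used in the code above.
# tok_path = self._preprocess_file(config, 'input.txt')    # -> 'input.txt.tok'
# ... translate tok_path into 'output.txt' ...
# out_path = self._postprocess_file(config, 'output.txt')  # -> 'output.txt.detok'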
def build_tokenizer_by_config(self, tok_config, lang):
    if tok_config is None:
        tok_config = {"mode": "aggressive"}
        if lang == 'zh':
            tok_config['segment_alphabet'] = ['Han']
            tok_config['segment_alphabet_change'] = True
    # to avoid SentencePiece sampling
    if 'sp_nbest_size' in tok_config:
        tok_config['sp_nbest_size'] = 0
    return tokenizer.build_tokenizer(tok_config)
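# Illustrative calls (hypothetical values): with no explicit config, Chinese
# falls back to aggressive tokenization with segmentation of the Han alphabet;
# and any configured SentencePiece n-best sampling is disabled so tokenization
# stays deterministic outside training.
# zh_tok = self.build_tokenizer_by_config(None, 'zh')
# de_tok = self.build_tokenizer_by_config(
#     {'mode': 'aggressive', 'sp_nbest_size': 64}, 'de')  # forced to 0 above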
def _generate_training_data(self, config):
    if 'data' in config and 'train_dir' in config['data']:
        train_dir = config['data']['train_dir']
    else:
        train_dir = 'train'
    data_path = os.path.join(self._corpus_dir, train_dir)
    num_samples = None
    summary = None
    metadata = None
    if 'data' in config and 'sample_dist' in config['data']:
        sample_dir = os.path.join(self._data_dir, 'sample')
        if not os.path.exists(sample_dir):
            os.mkdir(sample_dir)
        sample_path = os.path.join(sample_dir, train_dir)
        logger.info('Sampling training data to %s', sample_path)
        summary, metadata = sample(
            config['data']['sample'],
            config['data']['sample_dist'],
            data_path,
            sample_path,
            config['source'],
            config['target'])
        num_samples = sum(six.itervalues(summary['file']))
        data_path = sample_path
    if 'tokenization' in config:
        tok_config = config['tokenization']
        # Short-circuit idiom: the tokenizer is False when that side is not
        # configured, and tokenize_directory() receives it as-is.
        src_tokenizer = 'source' in tok_config and \
            tokenizer.build_tokenizer(tok_config['source'])
        tgt_tokenizer = 'target' in tok_config and \
            tokenizer.build_tokenizer(tok_config['target'])
        tokenized_dir = os.path.join(self._data_dir, 'tokenized')
        if not os.path.exists(tokenized_dir):
            os.mkdir(tokenized_dir)
        tokenized_path = os.path.join(tokenized_dir, train_dir)
        logger.info('Tokenizing training data to %s', tokenized_path)
        tokenizer.tokenize_directory(
            data_path,
            tokenized_path,
            src_tokenizer,
            tgt_tokenizer,
            config['source'],
            config['target'])
        data_path = tokenized_path
    return data_path, train_dir, num_samples, summary, metadata
def __init__(self, tok_config):
    self._src_tokenizer = 'source' in tok_config and \
        tokenizer.build_tokenizer(tok_config['source'])
    self._tgt_tokenizer = 'target' in tok_config and \
        tokenizer.build_tokenizer(tok_config['target'])
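# Note: with the short-circuit idiom above, a missing side leaves the
# attribute set to False rather than None, so callers should test truthiness:
# if self._src_tokenizer:
#     ...  # tokenize with it
# else:
#     ...  # pass the text through untouched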