def tabbed_parsing_character_generator(tmp_dir, train):
  """Generate source and target data from a single file."""
  character_vocab = text_encoder.ByteTextEncoder()
  filename = "parsing_{0}.pairs".format("train" if train else "dev")
  pair_filepath = os.path.join(tmp_dir, filename)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_tab_iterator(pair_filepath),
      character_vocab)

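# For reference, a minimal sketch of the contract the helpers in this file
# rely on: text2text_generate_encoded wraps a sample iterator and encodes
# each side with the given vocabularies, appending EOS. This is a paraphrase
# of the tensor2tensor behavior for illustration, not the library source.
from tensor2tensor.data_generators import text_encoder


def _text2text_generate_encoded_sketch(sample_generator, vocab,
                                       targets_vocab=None, has_inputs=True):
  targets_vocab = targets_vocab or vocab
  for sample in sample_generator:
    if has_inputs:
      sample["inputs"] = vocab.encode(sample["inputs"])
      sample["inputs"].append(text_encoder.EOS_ID)
    sample["targets"] = targets_vocab.encode(sample["targets"])
    sample["targets"].append(text_encoder.EOS_ID)
    yield sample
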
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  source_datasets = _INPUT_FILES
  target_datasets = _OUTPUT_FILES
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "test"
  filename_src = "en_{}.src".format(tag)
  filename_dst = "ru_{}.dst".format(tag)
  data_path = "./shad_nlp18_contextNMT/data_fused/"
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + filename_src,
                                           data_path + filename_dst),
      source_vocab, target_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = self.source_data_files(dataset_split)
  # Disabled sanity check: verify the raw files are present in tmp_dir.
  # for item in datasets:
  #   dummy_file_name = item[0].split("/")[-1]
  #   create_dummy_tar(tmp_dir, dummy_file_name)
  #   s_file, t_file = item[1][0], item[1][1]
  #   if not os.path.exists(os.path.join(tmp_dir, s_file)):
  #     raise Exception("Make sure file '%s' exists in tmp dir" % s_file)
  #   if not os.path.exists(os.path.join(tmp_dir, t_file)):
  #     raise Exception("Make sure file '%s' exists in tmp dir" % t_file)
  # Note: these lists are not used below; the encoders read existing
  # vocab files. The target list is fixed to take the target-side file.
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
  target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
  source_encoder = text_encoder.TokenTextEncoder(
      source_vocab_filename, replace_oov=self.oov_token)
  target_encoder = text_encoder.TokenTextEncoder(
      target_vocab_filename, replace_oov=self.oov_token)
  tag = "train" if train else "dev"
  filename_base = "%s-compiled-%s" % (self.name, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_encoder, target_encoder)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  """Main entry point for producing encoded samples during data generation."""
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  encoder = self.get_vocab(data_dir)
  target_encoder = self.get_vocab(data_dir, is_target=True)
  return text_problems.text2text_generate_encoded(
      generator, encoder, target_encoder, has_inputs=self.has_inputs)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)

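# For orientation: the `source_datasets`/`target_datasets` lists built above
# follow the tensor2tensor convention of [url, [filenames]] entries. A
# hypothetical example (the URL and filenames are placeholders, not real
# resources):
_EXAMPLE_DATASETS = [
    ["http://example.com/training-parallel.tgz",  # archive to download
     ["corpus.en", "corpus.zh"]],  # source/target files inside the archive
]
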
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  encoders = self.get_or_create_vocab(data_dir, tmp_dir)
  return text_problems.text2text_generate_encoded(
      sample_generator=generator,
      vocab=encoders["inputs"],
      targets_vocab=encoders["targets"],
      has_inputs=self.has_inputs)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = my_spm_utils.get_or_generate_spm(
      data_dir, tmp_dir, vocab_size=self.approx_vocab_size,
      model_prefix=self.source_vocab_name, sources=source_datasets,
      file_byte_budget=1e10)
  target_vocab = my_spm_utils.get_or_generate_spm(
      data_dir, tmp_dir, vocab_size=int(self.approx_vocab_size / 2),
      model_prefix=self.target_vocab_name, sources=target_datasets,
      file_byte_budget=1e10)
  tag = "train" if train else "dev"
  filename_base = "wmt_enja_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset  # if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = tag
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".source",
                                           data_path + ".target"),
      source_vocab, target_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  source_encoder = self.feature_encoders(data_dir)["inputs"]
  target_encoder = self.feature_encoders(data_dir)["targets"]
  return text_problems.text2text_generate_encoded(
      generator, source_encoder, targets_vocab=target_encoder,
      has_inputs=True)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  """Overridden only to pass replace_oov='<UNK>' to the token encoder."""
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  vocab_filename = os.path.join(data_dir, self.vocab_filename)
  encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                          replace_oov="<UNK>")
  return text2text_generate_encoded(generator, encoder,
                                    has_inputs=self.has_inputs)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  """Main entry point for producing encoded samples during data generation.

  Args:
    data_dir: directory for the data produced by t2t-datagen.
    tmp_dir: directory where the raw data is stored.
  """
  # A generator that yields one sentence pair at a time.
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  encoder = self.get_vocab(data_dir)
  target_encoder = self.get_vocab(data_dir, is_target=True)
  # Encode the sentences; has_inputs=True encodes the source side as
  # "inputs", has_inputs=False encodes the target side only.
  return text_problems.text2text_generate_encoded(
      generator, encoder, target_encoder, has_inputs=self.has_inputs)

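# A hypothetical usage sketch for the method above; `MyProblem`,
# "/path/to/data", and "/path/to/tmp" are placeholders, not names from the
# original code:
#
#   problem_instance = MyProblem()
#   for sample in problem_instance.generate_encoded_samples(
#       "/path/to/data", "/path/to/tmp", problem.DatasetSplit.TRAIN):
#     # each sample is a dict of token-id lists ending in EOS
#     print(sample["inputs"], sample["targets"])
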
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _TRAIN_DATASETS if train else _DEV_DATASETS
  print("=======Get Vocab from ", self.vocab_name, "...", end="")
  with open(self.vocab_name, "r", encoding="utf-8") as f:
    vocab_list = f.read().splitlines()
  print("=======Done")
  vocab = text_encoder.TokenTextEncoder(
      vocab_filename=None, vocab_list=vocab_list, replace_oov="<UNK>",
      num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(datasets[0], datasets[1]),
      vocab, vocab)

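# A self-contained sketch of the in-memory TokenTextEncoder used above, with
# a toy vocabulary; the sentence and tokens are made up for illustration.
# OOV words are mapped to "<UNK>".
from tensor2tensor.data_generators import text_encoder

toy_vocab = ["<UNK>", "hello", "world"]
toy_encoder = text_encoder.TokenTextEncoder(
    vocab_filename=None, vocab_list=toy_vocab, replace_oov="<UNK>",
    num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS)
ids = toy_encoder.encode("hello unseen world")  # "unseen" becomes <UNK>
print(ids, toy_encoder.decode(ids))
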
def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
                                   source_vocab_size, target_vocab_size):
  """Generate source and target data from a single file."""
  filename = "parsing_{0}.pairs".format("train" if train else "dev")
  source_vocab = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, filename, 0,
      prefix + "_source.tokens.vocab.%d" % source_vocab_size,
      source_vocab_size)
  target_vocab = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, filename, 1,
      prefix + "_target.tokens.vocab.%d" % target_vocab_size,
      target_vocab_size)
  pair_filepath = os.path.join(tmp_dir, filename)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_tab_iterator(pair_filepath),
      source_vocab, target_vocab)

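# A small sketch of the tab-separated pair file this generator expects. The
# path and sentences are made up; text2text_txt_tab_iterator yields
# {"inputs": ..., "targets": ...} for each line containing a tab.
import os
from tensor2tensor.data_generators import text_problems

tmp_dir = "/tmp"  # placeholder
pair_path = os.path.join(tmp_dir, "parsing_train.pairs")
with open(pair_path, "w") as f:
  f.write("the cat sat\t(S (NP the cat) (VP sat))\n")
for sample in text_problems.text2text_txt_tab_iterator(pair_path):
  print(sample["inputs"], "->", sample["targets"])
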
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_vocab = text_encoder.TokenTextEncoder(
      os.path.join(data_dir, self.source_vocab_name),
      replace_oov=self.oov_token)
  target_vocab = text_encoder.TokenTextEncoder(
      os.path.join(data_dir, self.target_vocab_name),
      replace_oov=self.oov_token)
  tag = "train" if train else "dev"
  filename_base = "wmt_pdre_tok_%s" % tag
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      _ZHZH_TRAIN_DATASETS, file_byte_budget=1e8)
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(
      tmp_dir, datasets, "mydata_enzh_tok_%s" % tag)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      symbolizer_vocab, symbolizer_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  for item in datasets:
    dummy_file_name = item[0].split("/")[-1]
    create_dummy_tar(tmp_dir, dummy_file_name)
    s_file, t_file = item[1][0], item[1][1]
    if not os.path.exists(os.path.join(tmp_dir, s_file)):
      raise Exception("Make sure file '%s' exists in tmp dir" % s_file)
    if not os.path.exists(os.path.join(tmp_dir, t_file)):
      raise Exception("Make sure file '%s' exists in tmp dir" % t_file)
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  # If a vocab file already exists it is used for encoding directly;
  # otherwise one is generated. This builds the encoders and can create the
  # vocab files at the same time.
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_sub_%s" % (self.approx_vocab_size, tag)
  # Concatenate all corpora into a single compiled file.
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  if dataset_split == problem.DatasetSplit.TRAIN:
    datasets = LM_TRAIN_DATASETS
    tag = "train"
  elif dataset_split == problem.DatasetSplit.EVAL:
    datasets = LM_DEV_DATASETS
    tag = "dev"
  else:
    datasets = LM_TEST_DATASETS
    tag = "test"
  source_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir=data_dir,
      vocab_filename=self.source_vocab_name,
      vocab_size=self.approx_vocab_size,
      generator=self.generate(tmp_dir=tmp_dir,
                              source_filenames=self.source_filenames,
                              index=1),
      max_subtoken_length=None)
  target_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir=data_dir,
      vocab_filename=self.target_vocab_name,
      vocab_size=self.approx_vocab_size,
      generator=self.generate(tmp_dir=tmp_dir,
                              source_filenames=self.source_filenames,
                              index=2),
      max_subtoken_length=1)
  filename_base = "thchs_pinyinzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = self.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  source_dataset = ["en_corpus"]
  target_dataset = ["ch_corpus"]
  source_vocab = get_or_generate_vocab(
      data_dir, self.source_vocab_name, self.approx_vocab_size,
      source_dataset)
  target_vocab = get_or_generate_vocab(
      data_dir, self.target_vocab_name, self.approx_vocab_size,
      target_dataset)
  tag = "train" if train else "dev"
  filename_base = "challenger_enzh_%sk_tok_%s" % (self.approx_vocab_size,
                                                  tag)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(
          os.path.join(data_dir, "en_corpus"),
          os.path.join(data_dir, "ch_corpus")),
      source_vocab, target_vocab)

def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  return text_problems.text2text_generate_encoded(
      generator, encoder, has_inputs=self.has_inputs)

def feature_encoders(self, data_dir):
  source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
  target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
  source_token = text_encoder.SubwordTextEncoder(source_vocab_filename)
  target_token = text_encoder.SubwordTextEncoder(target_vocab_filename)
  return {
      "inputs": source_token,
      "targets": target_token,
  }


if __name__ == "__main__":
  root_path = "/home/zhangpengpeng/PycharmProjects/challenger_mt/mt/data1"
  source_vocab = text_encoder.TokenTextEncoder(
      os.path.join(root_path, "en_vocab"), num_reserved_ids=2,
      replace_oov=UNK)
  target_vocab = text_encoder.TokenTextEncoder(
      os.path.join(root_path, "ch_vocab"), num_reserved_ids=2,
      replace_oov=UNK)
  encoded = text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(
          os.path.join(root_path, "en_corpus"),
          os.path.join(root_path, "ch_corpus")),
      source_vocab, target_vocab)
  for _ in range(100):
    # text2text_generate_encoded returns a generator, so draw samples
    # with next() rather than calling it.
    print(next(encoded))