def testCompileData(self):
  filename = "out"
  filepath = os.path.join(self.tmp_dir, filename)
  translate.compile_data(self.tmp_dir, self.DATASETS, filename)
  count = 0
  for i, example in enumerate(
      text_problems.text2text_txt_iterator(filepath + ".lang1",
                                           filepath + ".lang2")):
    expected = self.data[i]
    self.assertEqual(list(expected), [example["inputs"], example["targets"]])
    count += 1
  self.assertEqual(count, len(self.data))
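# The test above relies on the compile_data contract used throughout this
# section: concatenate the source/target files of every dataset into a single
# "<filename>.lang1" / "<filename>.lang2" pair and return the path without
# the extension. A minimal sketch of that behavior, assuming all files are
# already present locally (the real implementation also downloads and
# unpacks the archives); function name and scope are illustrative:
import os

def compile_data_sketch(tmp_dir, datasets, filename):
  """Hypothetical, simplified stand-in for translate.compile_data."""
  out_base = os.path.join(tmp_dir, filename)
  with open(out_base + ".lang1", "w") as f1, \
       open(out_base + ".lang2", "w") as f2:
    for _url, (source_file, target_file) in datasets:
      with open(os.path.join(tmp_dir, source_file)) as src:
        f1.write(src.read())
      with open(os.path.join(tmp_dir, target_file)) as tgt:
        f2.write(tgt.read())
  return out_base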
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): train = dataset_split == problem.DatasetSplit.TRAIN train_dataset = self.get_training_dataset(tmp_dir) datasets = self.source_data_files(dataset_split) """ for item in datasets: dummy_file_name = item[0].split('/')[-1] create_dummy_tar(tmp_dir, dummy_file_name) s_file, t_file = item[1][0], item[1][1] if not os.path.exists(os.path.join(tmp_dir, s_file)): raise Exception("Be sure file '%s' is exists in tmp dir" % s_file) if not os.path.exists(os.path.join(tmp_dir, t_file)): raise Exception("Be sure file '%s' is exists in tmp dir" % t_file) """ source_datasets = [[item[0], [item[1][0]]] for item in train_dataset] target_datasets = [[item[0], [item[1][0]]] for item in train_dataset] source_vocab_filename = os.path.join(data_dir, self.source_vocab_name) target_vocab_filename = os.path.join(data_dir, self.target_vocab_name) source_encoder = text_encoder.TokenTextEncoder( source_vocab_filename, replace_oov=self.oov_token) target_encoder = text_encoder.TokenTextEncoder( target_vocab_filename, replace_oov=self.oov_token) tag = "train" if train else "dev" filename_base = "%s-compiled-%s" % (self.name, tag) data_path = translate.compile_data(tmp_dir, datasets, filename_base) return text_problems.text2text_generate_encoded( text_problems.text2text_txt_iterator(data_path + '.lang1', data_path + '.lang2'), source_encoder, target_encoder)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset  # if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = tag
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".source",
                                           data_path + ".target"),
      source_vocab, target_vocab)
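# The item[0] / item[1][0] / item[1][1] indexing above (and throughout this
# section) reflects the dataset-spec shape these problems share: each entry
# is [archive_url, (source_filename, target_filename)]. A hypothetical
# illustration (the URL and filenames are made up):
_EXAMPLE_DATASETS = [
    [
        "http://example.com/parallel-corpus.tgz",  # item[0]: archive to fetch
        ("corpus/train.en",                        # item[1][0]: source side
         "corpus/train.zh"),                       # item[1][1]: target side
    ],
]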
def generator(self, data_dir, tmp_dir, train):
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
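# Several functions in this section return text2text_generate_encoded.
# Roughly, it maps each plain-text {"inputs", "targets"} sample through the
# two encoders and appends EOS to both sides. A sketch of that behavior, not
# the library source; it assumes the same text_encoder import as the
# surrounding code and encoders whose encode() returns a list of ids:
def text2text_generate_encoded_sketch(sample_generator, vocab,
                                      targets_vocab=None):
  """Encodes text samples to token ids, appending EOS to both sides."""
  targets_vocab = targets_vocab or vocab
  for sample in sample_generator:
    sample["inputs"] = vocab.encode(sample["inputs"])
    sample["inputs"].append(text_encoder.EOS_ID)
    sample["targets"] = targets_vocab.encode(sample["targets"])
    sample["targets"].append(text_encoder.EOS_ID)
    yield sample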
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = my_spm_utils.get_or_generate_spm(
      data_dir, tmp_dir, vocab_size=self.approx_vocab_size,
      model_prefix=self.source_vocab_name, sources=source_datasets,
      file_byte_budget=1e10)
  target_vocab = my_spm_utils.get_or_generate_spm(
      data_dir, tmp_dir, vocab_size=int(self.approx_vocab_size / 2),
      model_prefix=self.target_vocab_name, sources=target_datasets,
      file_byte_budget=1e10)
  tag = "train" if train else "dev"
  filename_base = "wmt_enja_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def generator(self, data_dir, tmp_dir, train):
  datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
  tag = "train" if train else "dev"
  vocab_datasets = []
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "czeng57m_encs_tok_%s" % tag)
  # CzEng contains 100 gz files with tab-separated columns, so let's expect
  # it is the first dataset in datasets and use the newly created *.lang{1,2}
  # files for vocab construction.
  if datasets[0][0].endswith("czeng57m.tar"):
    vocab_datasets.append([
        datasets[0][0],
        ["czeng57m_encs_tok_%s.lang1" % tag,
         "czeng57m_encs_tok_%s.lang2" % tag]
    ])
    datasets = datasets[1:]
  vocab_datasets += [[item[0], [item[1][0], item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      vocab_datasets)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, text_encoder.EOS_ID)
def generate_samples(self, data_dir, tmp_dir, dataset_split): datasets = self.source_data_files(dataset_split) tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev" # create shared vocabulary if self.vocab_type == "subwords": data_path = translate.compile_data( tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag)) self.get_or_create_vocab(data_dir, tmp_dir) sample_iterator = text_problems.text2text_txt_iterator( data_path + ".lang1", data_path + ".lang2") elif self.vocab_type == "tokens": sample_iterator = super().generate_samples(data_dir, tmp_dir, dataset_split) else: raise ValueError("VocabType not supported") # create source feature vocabularies data_path = self.compile_sfeat_data( tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag)) self.create_src_feature_vocabs(data_dir, tmp_dir) sfeat_iterator = text_problems.txt_line_iterator(data_path + ".sfeat") def _generate(sample_iterator, sfeat_iterator): for sample in sample_iterator: sample["sfeats"] = next(sfeat_iterator) yield sample return _generate(sample_iterator, sfeat_iterator)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_encs_chr_%s" % tag)
  return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2")
def generator(self, _, tmp_dir, train):
  character_vocab = text_encoder.ByteTextEncoder()
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_ende_chr_%s" % tag)
  return translate.character_generator(
      data_path + ".lang1", data_path + ".lang2", character_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _ENDE_TRAIN_DATASETS)
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_ende_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  character_vocab = text_encoder.ByteTextEncoder()
  if self.use_small_dataset:
    datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
  else:
    datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_enfr_chr_%s" % tag)
  return translate.character_generator(
      data_path + ".lang1", data_path + ".lang2", character_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _ENFR_TRAIN_SMALL_DATA)
  if self.use_small_dataset:
    datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
  else:
    datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_enfr_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      source_datasets + target_datasets)
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "setimes_mken_tok_%s" % tag)
  # We generate English->X data by convention; to train reverse translation,
  # just add the "_rev" suffix to the problem name, e.g.:
  # --problems=translate_enmk_setimes32k_rev
  return translate.token_generator(data_path + ".lang2", data_path + ".lang1",
                                   symbolizer_vocab, EOS)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_vocab = text_encoder.TokenTextEncoder(
      os.path.join(data_dir, self.source_vocab_name),
      replace_oov=self.oov_token)
  target_vocab = text_encoder.TokenTextEncoder(
      os.path.join(data_dir, self.target_vocab_name),
      replace_oov=self.oov_token)
  tag = "train" if train else "dev"
  filename_base = "wmt_pdre_tok_%s" % tag
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
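# TokenTextEncoder above loads a pre-built vocabulary file (unlike the
# get_or_generate_vocab variants elsewhere in this section, it does not
# create one), and `replace_oov` maps unknown tokens to self.oov_token. If I
# recall the format correctly, the file is one token per line, e.g. this
# made-up fragment:
#   <UNK>
#   the
#   house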
def generator(self, data_dir, tmp_dir, train):
  datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
  target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_enzh_tok_%s" % tag)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  datasets = self.get_datasets(train)
  # Build the vocabularies from the training datasets.
  source_datasets = [[item[0], [item[1][0]]]
                     for item in self.get_datasets(train=True)]
  target_datasets = [[item[0], [item[1][1]]]
                     for item in self.get_datasets(train=True)]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      _ZHZH_TRAIN_DATASETS, file_byte_budget=1e8)
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "mydata_enzh_tok_%s" % tag)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      symbolizer_vocab, symbolizer_vocab)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  for item in datasets:
    dummy_file_name = item[0].split("/")[-1]
    create_dummy_tar(tmp_dir, dummy_file_name)
    s_file, t_file = item[1][0], item[1][1]
    if not os.path.exists(os.path.join(tmp_dir, s_file)):
      raise Exception("Make sure file '%s' exists in tmp dir" % s_file)
    if not os.path.exists(os.path.join(tmp_dir, t_file)):
      raise Exception("Make sure file '%s' exists in tmp dir" % t_file)
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  # If a vocabulary file already exists it is used for encoding directly;
  # otherwise one is built first. This constructs the encoders and, as a
  # side effect, can create the vocabulary files.
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_sub_%s" % (self.approx_vocab_size, tag)
  # Concatenate all corpora into a single pair of files.
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def generate_samples(self, data_dir, tmp_dir, dataset_split): datasets = self.source_data_files(dataset_split) tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev" data_path = translate.compile_data( tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag)) # For eval, use authentic data. if dataset_split != problem.DatasetSplit.TRAIN: for example in text_problems.text2text_txt_iterator( data_path + ".lang1", data_path + ".lang2"): yield example else: # For training, mix synthetic and authentic data as follows. for (file1, file2) in self.backtranslate_data_filenames: path1 = os.path.join(tmp_dir, file1) path2 = os.path.join(tmp_dir, file2) # Synthetic data first. for example in text_problems.text2text_txt_iterator(path1, path2): yield example # Now authentic data. for example in text_problems.text2text_txt_iterator( data_path + ".lang1", data_path + ".lang2"): yield example
def generator(self, data_dir, tmp_dir, train):
  datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS]
  target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_zhen_tok_%s" % tag)
  # We generate English->X data by convention; to train reverse translation,
  # just add the "_rev" suffix to the problem name, e.g.:
  # --problems=translate_enzh_wmt8k_rev
  return translate.bi_vocabs_token_generator(data_path + ".lang2",
                                             data_path + ".lang1",
                                             source_vocab, target_vocab, EOS)
def generate_samples(self, data_dir, tmp_dir, dataset_split): datasets = self.source_data_files(dataset_split) tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev" data_path = translate.compile_data( tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag)) # Iterator over authentic data. it_auth = text_problems.text2text_txt_iterator( data_path + ".lang1", data_path + ".lang2") # For eval, use authentic data. if dataset_split != problem.DatasetSplit.TRAIN: for example in it_auth: yield example else: # For training, mix synthetic and authentic data as follows. for (file1, file2) in self.backtranslate_data_filenames: path1 = os.path.join(tmp_dir, file1) path2 = os.path.join(tmp_dir, file2) # Synthetic data first. for example in text_problems.text2text_txt_iterator(path1, path2): yield example # Now authentic data. for example in it_auth: yield example