def generator(self, data_dir, tmp_dir, train):
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)

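# For orientation: per the docstring of translate.compile_data (a fragment of
# which is quoted in a later snippet), it concatenates all `datasets` and
# saves them under tmp_dir, returning tmp_dir/filename_base. The sketch below
# is a simplified, hypothetical reconstruction that assumes each dataset entry
# is (url, (source_file, target_file)) and the files are already unpacked in
# tmp_dir; the real helper also downloads and decompresses archives, and
# compile_data_sketch is not a tensor2tensor name.
import os

def compile_data_sketch(tmp_dir, datasets, filename_base):
  """Concatenate all `datasets` and save to `filename_base` (sketch only)."""
  filename = os.path.join(tmp_dir, filename_base)
  with open(filename + ".lang1", "w") as source_out, \
       open(filename + ".lang2", "w") as target_out:
    for _, (source_file, target_file) in datasets:
      # Append each parallel file pair to the combined .lang1/.lang2 files.
      with open(os.path.join(tmp_dir, source_file)) as f:
        source_out.write(f.read())
      with open(os.path.join(tmp_dir, target_file)) as f:
        target_out.write(f.read())
  return filename
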
def generator(self, data_dir, tmp_dir, train):
  datasets = _TRAIN_DATASETS if train else _TEST_DATASETS
  source_datasets = [[FLAGS.raw_data_dir, [item[0]]] for item in datasets]
  target_datasets = [[FLAGS.raw_data_dir, [item[1]]] for item in datasets]
  # Copy vocab to data directory.
  source_vocab_path = os.path.join(data_dir, self.source_vocab_name)
  target_vocab_path = os.path.join(data_dir, self.target_vocab_name)
  if os.path.exists(source_vocab_path):
    os.remove(source_vocab_path)
  if os.path.exists(target_vocab_path):
    os.remove(target_vocab_path)
  copyVocab(os.path.join(FLAGS.raw_data_dir, _VOCABS[0]), source_vocab_path)
  copyVocab(os.path.join(FLAGS.raw_data_dir, _VOCABS[1]), target_vocab_path)
  source_token_vocab = text_encoder.TokenTextEncoder(
      source_vocab_path, replace_oov="<unk>")
  target_token_vocab = text_encoder.TokenTextEncoder(
      target_vocab_path, replace_oov="<unk>")
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "generic_tok_%s" % tag)
  return translate.bi_vocabs_token_generator(data_path + ".src",
                                             data_path + ".trg",
                                             source_token_vocab,
                                             target_token_vocab, EOS)

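# copyVocab is not defined in this snippet. A minimal stand-in, assuming it
# simply copies the pre-built vocabulary file verbatim into data_dir so that
# text_encoder.TokenTextEncoder can load it; some forks may instead prepend
# reserved tokens (e.g. <pad>, <EOS>) at this point.
import shutil

def copyVocab(src_path, dst_path):
  # Assumed behavior: byte-for-byte copy of the vocabulary file.
  shutil.copyfile(src_path, dst_path)
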
def generator(self, data_dir, tmp_dir, train):
  source_vocab = generator_utils.get_local_vocab(
      data_dir, self.source_vocab_name())
  target_vocab = generator_utils.get_local_vocab(
      data_dir, self.target_vocab_name())
  tag = "train" if train else "dev"
  filename_base = "%s.%s" % (self.targeted_vocab_size, tag)
  # The pre-shuffled parallel files <vocab_size>.<tag>.mn.shuf/.ch.shuf are
  # expected to already live in data_dir.
  data_path = os.path.join(data_dir, filename_base)
  return translate.bi_vocabs_token_generator(data_path + ".mn.shuf",
                                             data_path + ".ch.shuf",
                                             source_vocab, target_vocab, EOS)

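# For reference, a minimal sketch of translate.bi_vocabs_token_generator,
# built around its original docstring; the loop body is a hedged
# reconstruction and may differ in minor details from the tensor2tensor
# version these snippets target.
import tensorflow as tf

def bi_vocabs_token_generator(source_path, target_path,
                              source_token_vocab, target_token_vocab,
                              eos=None):
  """Generator for sequence-to-sequence tasks that uses tokens.

  This generator assumes the files at source_path and target_path have
  the same number of lines and yields dictionaries of "inputs" and
  "targets" where inputs are token ids from the " "-split source (and
  target, resp.) lines converted to integers using the token vocabulary.

  Args:
    source_path: path to the file with source sentences.
    target_path: path to the file with target sentences.
    source_token_vocab: text_encoder.TextEncoder object.
    target_token_vocab: text_encoder.TextEncoder object.
    eos: integer to append at the end of each sequence (default: None).

  Yields:
    A dictionary {"inputs": source-line, "targets": target-line} where
    the lines are integer lists converted from tokens in the file lines.
  """
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      while source and target:
        source_ints = source_token_vocab.encode(source.strip()) + eos_list
        target_ints = target_token_vocab.encode(target.strip()) + eos_list
        yield {"inputs": source_ints, "targets": target_ints}
        source, target = source_file.readline(), target_file.readline()
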
def generator(self, data_dir, tmp_dir, train):
  tag = "train" if train else "dev"
  source_path = "{}/{}/".format(self.root_dir, tag)
  vocab_file = "{}/{}".format(self.root_dir, self.vocabulary_file)
  symbolizer_vocab = text_encoder.TokenTextEncoder(
      vocab_file, replace_oov="<UNK>")
  mesh_vocab_filename = os.path.join(self.root_dir, self.mesh_vocabulary_file)
  mesh_vocab = text_encoder.TokenTextEncoder(
      mesh_vocab_filename, replace_oov="<UNK>")
  return translate.bi_vocabs_token_generator(
      source_path + self.data_file, source_path + self.tag_file,
      symbolizer_vocab, mesh_vocab, EOS)

def generator(self, data_dir, tmp_dir, train):
  datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
  # Vocabularies are always built from the training datasets.
  source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
  target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_enzh_tok_%s" % tag)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)

def generator(self, data_dir, tmp_dir, train):
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)

def generator(self, data_dir, tmp_dir, train):
  datasets = self.get_datasets(train)
  # Build vocabularies from the training datasets.
  source_datasets = [[item[0], [item[1][0]]]
                     for item in self.get_datasets(train=True)]
  target_datasets = [[item[0], [item[1][1]]]
                     for item in self.get_datasets(train=True)]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1",
                                   data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)

def generator(self, data_dir, tmp_dir, train):
  datasets = _KOEN_TRAIN_DATASETS if train else _KOEN_TEST_DATASETS
  source_datasets = [item[0] for item in _KOEN_TRAIN_DATASETS]
  target_datasets = [item[1] for item in _KOEN_TRAIN_DATASETS]
  tf.gfile.MakeDirs(data_dir)
  source_vocab = get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.sourced_vocab_size,
      source_datasets, mode=self.source_mode)
  target_vocab = get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets, mode=self.target_mode)
  tag = "train" if train else "dev"
  data_path = get_or_compile_data(tmp_dir, datasets,
                                  "simple_koen_tok_%s" % tag)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab,
                                             text_encoder.EOS_ID)

def generator(self, data_dir, tmp_dir, train):
  datasets = _TRAIN_DATASETS if train else _TEST_DATASETS
  source_datasets = [[FLAGS.raw_data_dir, [item[0]]] for item in datasets]
  target_datasets = [[FLAGS.raw_data_dir, [item[1]]] for item in datasets]
  source_vocab = generator_utils.get_or_generate_vocab_nocompress(
      data_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab_nocompress(
      data_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "generic_tok_%s" % tag)
  return translate.bi_vocabs_token_generator(data_path + ".src",
                                             data_path + ".trg",
                                             source_vocab, target_vocab, EOS)

def generator(self, data_dir, tmp_dir, train):
  datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS]
  target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_zhen_tok_%s" % tag)
  # We generate English->X data by convention; to train reverse translation
  # just add the "_rev" suffix to the problem name, e.g.:
  # --problems=translate_enzh_wmt8k_rev
  return translate.bi_vocabs_token_generator(data_path + ".lang2",
                                             data_path + ".lang1",
                                             source_vocab, target_vocab, EOS)

def generator(self, data_dir, tmp_dir, train):
  datasets = (_ENKR_SUBTITLE_TRAIN_DATASETS if train
              else _ENKR_SUBTITLE_TEST_DATASETS)
  source_datasets = [os.path.join(data_dir, path[1][0]) for path in datasets]
  target_datasets = [os.path.join(data_dir, path[1][1]) for path in datasets]
  source_vocab = generator_utils.get_or_generate_txt_vocab(
      data_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_txt_vocab(
      data_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = compile_data_from_txt(tmp_dir, datasets,
                                    "zero_shot_enkrch_tok_%s" % tag)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)
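
# All of the variants above return the same kind of Python generator, so a
# driver can consume any of them identically. A hypothetical smoke test
# (smoke_test is not a tensor2tensor helper; problem is an instance of any
# of the problem classes these methods belong to):
def smoke_test(problem, data_dir, tmp_dir, num_examples=3):
  # Pull a few examples off the generator and sanity-check their structure:
  # each example is a dict of integer token-id lists.
  for i, example in enumerate(
      problem.generator(data_dir, tmp_dir, train=True)):
    assert isinstance(example["inputs"], list)
    assert isinstance(example["targets"], list)
    if i + 1 >= num_examples:
      break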