def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  source_datasets = _INPUT_FILES
  target_datasets = _OUTPUT_FILES
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "test"
  filename_src = "en_{}.src".format(tag)
  filename_dst = "ru_{}.dst".format(tag)
  data_path = './shad_nlp18_contextNMT/data_fused/'
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + filename_src,
                                           data_path + filename_dst),
      source_vocab, target_vocab)
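# --- Illustrative sketch, not part of the original problem class. ---
# Shows the shape of what text2text_generate_encoded yields above: dicts of
# token-id lists with EOS appended. The WhitespaceEncoder stub and the tiny
# in-memory corpus are assumptions standing in for the real subword vocabulary
# and the en_*.src / ru_*.dst files.
class WhitespaceEncoder(object):
  """Minimal stand-in for a vocab encoder: one id per whitespace token."""

  def __init__(self, corpus_lines):
    tokens = sorted({tok for line in corpus_lines for tok in line.split()})
    self._ids = {tok: i + 2 for i, tok in enumerate(tokens)}  # 0=PAD, 1=EOS

  def encode(self, text):
    return [self._ids[tok] for tok in text.split()]


def stub_text2text_generate_encoded(sample_generator, source_vocab,
                                    target_vocab, eos_id=1):
  """Mirrors the encode-and-append-EOS step of text2text_generate_encoded."""
  for sample in sample_generator:
    yield {
        "inputs": source_vocab.encode(sample["inputs"]) + [eos_id],
        "targets": target_vocab.encode(sample["targets"]) + [eos_id],
    }


def _demo_encoded_samples():
  sources = ["hello world", "good morning"]
  targets = ["privet mir", "dobroe utro"]
  pairs = ({"inputs": s, "targets": t} for s, t in zip(sources, targets))
  for sample in stub_text2text_generate_encoded(
      pairs, WhitespaceEncoder(sources), WhitespaceEncoder(targets)):
    print(sample)  # First pair: {'inputs': [3, 5, 1], 'targets': [4, 3, 1]}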
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def parsing_token_generator(data_dir, tmp_dir, train, source_vocab_size,
                            target_vocab_size):
  """Generator for parsing as a sequence-to-sequence task that uses tokens.

  This generator assumes the files parsing_{train,dev}.trees, which contain
  trees in wsj format.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to temporary storage directory.
    train: whether we're training or not.
    source_vocab_size: source vocab size.
    target_vocab_size: target vocab size.

  Returns:
    A generator to a dictionary of inputs and outputs.
  """
  source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "wsj_source.vocab.%d" % source_vocab_size,
      source_vocab_size)
  target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "wsj_target.vocab.%d" % target_vocab_size,
      target_vocab_size)
  filename = "%s_%s.trees" % (FLAGS.parsing_path, "train" if train else "dev")
  tree_filepath = os.path.join(tmp_dir, filename)
  return token_generator(tree_filepath, source_symbolizer_vocab,
                         target_symbolizer_vocab, 1)
def parsing_token_generator(tmp_dir, train, source_vocab_size,
                            target_vocab_size):
  """Generator for parsing as a sequence-to-sequence task that uses tokens.

  This generator assumes the files parsing_{train,dev}.trees, which contain
  trees in wsj format.

  Args:
    tmp_dir: path to temporary storage directory.
    train: whether we're training or not.
    source_vocab_size: source vocab size.
    target_vocab_size: target vocab size.

  Returns:
    A generator to a dictionary of inputs and outputs.
  """
  source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "wsj_source.tokens.vocab.%d" % source_vocab_size,
      source_vocab_size)
  target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "wsj_target.tokens.vocab.%d" % target_vocab_size,
      target_vocab_size)
  filename = "%s_%s.trees" % (FLAGS.parsing_path, "train" if train else "dev")
  tree_filepath = os.path.join(tmp_dir, filename)
  return token_generator(tree_filepath, source_symbolizer_vocab,
                         target_symbolizer_vocab, 1)
def generator(self, data_dir, tmp_dir, train):
  source_vocab_size = self.targeted_vocab_size
  target_vocab_size = self.targeted_vocab_size
  source_datasets = [["pronoun_enfr_train.lang1",
                      ["pronoun_enfr_train.lang1"]]]
  target_datasets = [["pronoun_enfr_train.lang2",
                      ["pronoun_enfr_train.lang2"]]]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "vocab.pronoun-en.%d" % source_vocab_size,
      source_vocab_size, source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "vocab.pronoun-fr.%d" % target_vocab_size,
      target_vocab_size, target_datasets)
  data_path = tmp_dir + "/pronoun_enfr_"
  data_path += "train" if train else "dev"
  return bi_vocabs_token_generator(data_path + ".lang2", data_path + ".lang1",
                                   source_vocab, target_vocab, EOS)
def parsing_token_generator(tmp_dir, train, source_vocab_size,
                            target_vocab_size):
  """Generator for parsing as a sequence-to-sequence task that uses tokens.

  This generator assumes the files parsing_{train,dev}.trees, which contain
  trees in wsj format, and wsj_{source,target}.tokens.vocab.<vocab_size>
  exist in tmp_dir.

  Args:
    tmp_dir: path to temporary storage directory.
    train: whether we're training or not.
    source_vocab_size: source vocab size.
    target_vocab_size: target vocab size.

  Returns:
    A generator to a dictionary of inputs and outputs.
  """
  source_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "wsj_source.tokens.vocab.%d" % source_vocab_size,
      source_vocab_size)
  target_symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "wsj_target.tokens.vocab.%d" % target_vocab_size,
      target_vocab_size)
  filename = "parsing_%s.trees" % ("train" if train else "dev")
  tree_filepath = os.path.join(tmp_dir, filename)
  return token_generator(tree_filepath, source_symbolizer_vocab,
                         target_symbolizer_vocab, 1)
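# --- Illustrative sketch of the parsing-as-seq2seq setup above (assumption,
# simplified). Each line of parsing_{train,dev}.trees holds one WSJ-style
# tree; the source side is the terminal words and the target side is the
# linearized tree without the words. words_and_tags is a rough stand-in for
# t2t's words_and_tags_from_wsj_tree and only handles toy trees.
def words_and_tags(tree_line):
  """Splits "(S (NP (DT the) (NN cat)))" into words and structure tokens."""
  words, tags = [], []
  for token in tree_line.replace("(", " ( ").replace(")", " ) ").split():
    if token in "()" or token.isupper():
      tags.append(token)
    else:
      words.append(token)
  return " ".join(words), " ".join(tags)


def stub_parsing_token_generator(tree_lines, source_vocab, target_vocab,
                                 eos_id=1):
  """Yields encoded (sentence, linearized tree) pairs, one per input line."""
  for line in tree_lines:
    words, tags = words_and_tags(line.strip())
    yield {
        "inputs": source_vocab.encode(words) + [eos_id],
        "targets": target_vocab.encode(tags) + [eos_id],
    }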
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def generator(self, data_dir, tmp_dir, train):
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  data_path = compile_data(tmp_dir, datasets,
                           "%s-compiled-%s" % (self.name, tag))
  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())
  return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2")
def parsing_token_generator(tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  filename = "parsing_%s" % ("train" if train else "dev")
  text_filepath = os.path.join(tmp_dir, filename + ".text")
  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
  return token_generator(text_filepath, tags_filepath, symbolizer_vocab, EOS)
def parsing_token_generator(tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  filename = "parsing_%s" % ("train" if train else "dev")
  text_filepath = os.path.join(tmp_dir, filename + ".text")
  tags_filepath = os.path.join(tmp_dir, filename + ".tags")
  return token_generator(text_filepath, tags_filepath, symbolizer_vocab, 1)
def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size)
  filename = "%s_%s.trees" % (FLAGS.parsing_path, "train" if train else "dev")
  tree_filepath = os.path.join(tmp_dir, filename)
  return wsj_parsing.token_generator(tree_filepath, symbolizer_vocab,
                                     symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  source_vocab_size = self.targeted_vocab_size
  target_vocab_size = self.targeted_vocab_size
  datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS]
  target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "vocab.zhen-zh.%d" % source_vocab_size,
      source_vocab_size, source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, "vocab.zhen-en.%d" % target_vocab_size,
      target_vocab_size, target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      [_TRAIN_DATASETS["sv"]])
  datasets = _TRAIN_DATASETS["sv"] if train else _TEST_DATASETS["sv"]
  fulltext_file, summaries_file = download_and_extract_data(tmp_dir, datasets)
  return token_generator(fulltext_file, summaries_file, vocab, EOS)
def zhen_wordpiece_token_generator(tmp_dir, train, source_vocab_size,
                                   target_vocab_size):
  """Wordpiece generator for the WMT'17 zh-en dataset."""
  datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  source_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.zh.%d" % source_vocab_size, source_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.en.%d" % target_vocab_size, target_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)
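# --- Illustrative sketch (assumption) of the bi-vocab pattern used above. ---
# bi_vocabs_token_generator pairs line i of the *.lang1 file with line i of
# the *.lang2 file and encodes each side with its own vocabulary, appending
# EOS. This stub takes iterables of lines instead of file paths so it stays
# self-contained; any object with an encode() method works as a vocab.
def stub_bi_vocabs_token_generator(source_lines, target_lines, source_vocab,
                                   target_vocab, eos_id):
  for source_line, target_line in zip(source_lines, target_lines):
    yield {
        "inputs": source_vocab.encode(source_line.strip()) + [eos_id],
        "targets": target_vocab.encode(target_line.strip()) + [eos_id],
    }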
def generator(self, data_dir, tmp_dir, train):
  datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
  target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_enzh_tok_%s" % tag)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
  tag = "train" if train else "dev"
  vocab_datasets = []
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "czeng57m_encs_tok_%s" % tag)
  # CzEng contains 100 gz files with tab-separated columns, so let's expect
  # it is the first dataset in datasets and use the newly created *.lang{1,2}
  # files for vocab construction.
  if datasets[0][0].endswith("czeng57m.tar"):
    vocab_datasets.append([
        datasets[0][0],
        ["czeng57m_encs_tok_%s.lang1" % tag,
         "czeng57m_encs_tok_%s.lang2" % tag]
    ])
    datasets = datasets[1:]
  vocab_datasets += [[item[0], [item[1][0], item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      vocab_datasets)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, text_encoder.EOS_ID)
def timit_generator(data_dir,
                    tmp_dir,
                    training,
                    how_many,
                    start_from=0,
                    eos_list=None,
                    vocab_filename=None,
                    vocab_size=0):
  """Data generator for TIMIT transcription problem.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to temporary storage directory.
    training: a Boolean; if true, we use the train set, otherwise the test set.
    how_many: how many inputs and labels to generate.
    start_from: from which input to start.
    eos_list: optional list of end of sentence tokens, otherwise use default
      value `1`.
    vocab_filename: file within `tmp_dir` to read vocabulary from. If this is
      not provided then the target sentence will be encoded by character.
    vocab_size: integer target to generate vocabulary size to.

  Yields:
    A dictionary representing the audio examples with the following fields:
    * inputs: a float sequence containing the audio data
    * audio/channel_count: an integer
    * audio/sample_count: an integer
    * audio/sample_width: an integer
    * targets: an integer sequence representing the encoded sentence
  """
  eos_list = [1] if eos_list is None else eos_list
  if vocab_filename is not None:
    vocab_symbolizer = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, vocab_filename, vocab_size)
  _get_timit(tmp_dir)
  datasets = _TIMIT_TRAIN_DATASETS if training else _TIMIT_TEST_DATASETS
  i = 0
  for data_dir, (audio_ext, transcription_ext) in datasets:
    data_dir = os.path.join(tmp_dir, data_dir)
    data_files = _collect_data(data_dir, audio_ext, transcription_ext)
    data_pairs = data_files.values()
    for input_file, target_file in sorted(data_pairs)[start_from:]:
      if i == how_many:
        return
      i += 1
      audio_data, sample_count, sample_width, num_channels = _get_audio_data(
          input_file)
      text_data = _get_text_data(target_file)
      if vocab_filename is None:
        label = [ord(c) for c in text_data] + eos_list
      else:
        label = vocab_symbolizer.encode(text_data) + eos_list
      yield {
          "inputs": audio_data,
          "audio/channel_count": [num_channels],
          "audio/sample_count": [sample_count],
          "audio/sample_width": [sample_width],
          "targets": label
      }
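# --- Illustrative sketch (assumption) of the audio example format above. ---
# Builds one TIMIT-style example from a synthetic mono sine wave to show how
# the metadata fields relate to the raw samples; the character-level target
# matches the vocab_filename=None branch of timit_generator. The 16 kHz rate
# and 16-bit width are assumptions for the demo.
import math


def make_audio_example(transcript, sample_rate=16000, seconds=0.01):
  sample_count = int(sample_rate * seconds)
  audio_data = [math.sin(2 * math.pi * 440 * t / sample_rate)
                for t in range(sample_count)]
  return {
      "inputs": audio_data,                 # float samples
      "audio/channel_count": [1],           # mono
      "audio/sample_count": [sample_count],
      "audio/sample_width": [2],            # bytes per sample (16-bit)
      "targets": [ord(c) for c in transcript] + [1],  # char ids + EOS
  }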
def generator(self, data_dir, tmp_dir, train):
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                             data_path + ".lang2",
                                             source_vocab, target_vocab, EOS)
def ende_wordpiece_token_generator(tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size)
  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
def ende_wordpiece_token_generator(tmp_dir, train, vocab_size):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_ende_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, 1)
def generator(self, data_dir, tmp_dir, train):
  datasets = self.get_datasets(train)
  # Build vocab from the training datasets.
  source_datasets = [[item[0], [item[1][0]]]
                     for item in self.get_datasets(train=True)]
  target_datasets = [[item[0], [item[1][1]]]
                     for item in self.get_datasets(train=True)]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)
def generate_vocab(self, data_dir, tmp_dir, **kwargs):
  datasets = get_dataset(tmp_dir)
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  _ = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  _ = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name,
      int(self.approx_vocab_size / 2), target_datasets, file_byte_budget=1e8)
def generator(self, data_dir, tmp_dir, train):
  datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS]
  target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  # We generate English->X data by convention, to train reverse translation
  # just add the "_rev" suffix to the problem name, e.g., like this.
  #   --problems=translate_enzh_wmt8k_rev
  return bi_vocabs_token_generator(data_path + ".lang2", data_path + ".lang1",
                                   source_vocab, target_vocab, EOS)
def enfr_wordpiece_token_generator(tmp_dir, train, vocab_size):
  """Instance of token generator for the WMT en->fr task."""
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
def enfr_wordpiece_token_generator(tmp_dir, train, vocab_size):
  """Instance of token generator for the WMT en->fr task."""
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size)
  datasets = _ENFR_TRAIN_DATASETS if train else _ENFR_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_enfr_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, 1)
def generator(self, data_dir, tmp_dir, train):
  vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      [_TRAIN_DATASETS["facts-result"]])
  datasets = (_TRAIN_DATASETS["facts-result"]
              if train else _TEST_DATASETS["facts-result"])
  document_file, labels_file = download_and_extract_data(tmp_dir, datasets)
  return token_generator(document_file, labels_file, vocab, EOS)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  for item in datasets:
    dummy_file_name = item[0].split("/")[-1]
    create_dummy_tar(tmp_dir, dummy_file_name)
    s_file, t_file = item[1][0], item[1][1]
    if not os.path.exists(os.path.join(tmp_dir, s_file)):
      raise Exception("Make sure file '%s' exists in tmp dir" % s_file)
    if not os.path.exists(os.path.join(tmp_dir, t_file)):
      raise Exception("Make sure file '%s' exists in tmp dir" % t_file)
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  # If a vocabulary file already exists it is used to encode directly;
  # otherwise one is built first. This constructs the encoders and can create
  # the vocabularies at the same time.
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_sub_%s" % (self.approx_vocab_size, tag)
  # Concatenate all corpora into a single pair of files.
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _ENDE_TRAIN_DATASETS)
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_ende_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)
  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)
  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)
  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())
  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
def generator(self, data_dir, tmp_dir, train):
  datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      source_datasets + target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _TRAIN_DATASETS["en-sv"])
  datasets = _TRAIN_DATASETS["en-sv"] if train else _TEST_DATASETS["en-sv"]
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "legal_ensv_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
def mken_wordpiece_token_generator(tmp_dir, train, vocab_size):
  """Wordpiece generator for the SETimes Mk-En dataset."""
  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size,
      source_datasets + target_datasets)
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _ENFR_TRAIN_SMALL_DATA)
  if self.use_small_dataset:
    datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
  else:
    datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_enfr_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      source_datasets + target_datasets)
  tag = "train" if train else "dev"
  data_path = translate._compile_data(tmp_dir, datasets,
                                      "setimes_mken_tok_%s" % tag)
  # We generate English->X data by convention, to train reverse translation
  # just add the "_rev" suffix to the problem name, e.g., like this.
  #   --problems=translate_enmk_setimes32k_rev
  return translate.token_generator(data_path + ".lang2", data_path + ".lang1",
                                   symbolizer_vocab, EOS)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      _ZHZH_TRAIN_DATASETS, file_byte_budget=1e8)
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "mydata_enzh_tok_%s" % tag)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      symbolizer_vocab, symbolizer_vocab)
def generator(self, data_dir, tmp_dir, train):
  datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "wmt_encs_tok_%s" % tag)
  vocab_datasets = []
  # CzEng contains 100 gz files with tab-separated columns, so let's expect
  # it is the first dataset in datasets and use the newly created *.lang{1,2}
  # files instead.
  if datasets[0][0].endswith("data-plaintext-format.tar"):
    vocab_datasets.append([
        datasets[0][0],
        ["wmt_encs_tok_%s.lang1" % tag, "wmt_encs_tok_%s.lang2" % tag]
    ])
    datasets = datasets[1:]
  vocab_datasets += [[item[0], [item[1][0], item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      vocab_datasets)
  return token_generator(data_path + ".lang1", data_path + ".lang2",
                         symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  # Called twice: once for train and once for test.

  # Get the list of the training samples (coding challenge samples).
  samples = list(generator_samples(tmp_dir, self.pb_constants))

  # Split between train and dev.
  # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and
  # difficulties in each set.
  # Need to sort the samples first before shuffling (as walk() isn't
  # deterministic).
  samples.sort(key=lambda x: x.desc_file)  # in-place
  rng = random.Random(7531)  # Local fixed seed
  rng.shuffle(samples)  # in-place

  # Train: 5019/5228 problems
  # Dev: 209/5228 problems
  len_samples = len(samples)
  split = len_samples // 25
  samples = samples[split:] if train else samples[:split]
  tf.logging.info("Number of samples for {}: {}/{}".format(
      "train" if train else "dev", len(samples), len_samples))

  def generator_samples_content(get_source, get_target):
    source, target = None, None
    # Iterate over the coding samples.
    for sample in samples:
      if get_source:
        with tf.gfile.GFile(sample.desc_file, mode="r") as source_file:
          source = source_file.read()
      if get_target:
        # Each challenge can have multiple implementations (or none).
        for code_file in sample.code_files:
          with tf.gfile.GFile(code_file, mode="r") as target_file:
            target = target_file.read()
            target = self.preprocess_target(target)
          yield source, target
      elif sample.code_files:  # Only take the source if a target exists.
        yield source, target

  def generator_target():
    for _, target in generator_samples_content(False, True):
      yield target.strip()

  # Generate vocab for both source and target.
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_input_filename, self.input_vocab_size)
  target_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir=data_dir,
      vocab_filename=self.vocab_target_filename,
      vocab_size=self.target_vocab_size,
      generator=generator_target(),
  )

  # Yield the training and testing samples.
  eos_list = [EOS]
  for source, target in generator_samples_content(True, True):
    source_ints = source_vocab.encode(source.strip()) + eos_list
    target_ints = target_vocab.encode(target.strip()) + eos_list
    yield {
        "inputs": source_ints,
        "targets": target_ints,
    }
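# --- Illustrative sketch of the deterministic split used above. ---
# Sorting before shuffling with a fixed seed makes the train/dev partition
# reproducible across machines even though os.walk() order is not. The names
# and dev fraction below are demo assumptions.
import random


def deterministic_split(names, dev_fraction=25, seed=7531):
  names = sorted(names)
  random.Random(seed).shuffle(names)
  split = len(names) // dev_fraction
  return names[split:], names[:split]  # (train, dev)

# Same partition on every run:
# train, dev = deterministic_split(["problem_%03d" % i for i in range(100)])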
def mscoco_generator(data_dir,
                     tmp_dir,
                     training,
                     how_many,
                     start_from=0,
                     eos_list=None,
                     vocab_filename=None,
                     vocab_size=0):
  """Image generator for MSCOCO captioning problem with token-wise captions.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to temporary storage directory.
    training: a Boolean; if true, we use the train set, otherwise the test set.
    how_many: how many images and labels to generate.
    start_from: from which image to start.
    eos_list: optional list of end of sentence tokens, otherwise use default
      value `1`.
    vocab_filename: file within `tmp_dir` to read vocabulary from.
    vocab_size: integer target to generate vocabulary size to.

  Yields:
    A dictionary representing the images with the following fields:
    * image/encoded: the string encoding the image as JPEG,
    * image/format: the string "jpeg" representing image format,
    * image/class/label: a list of integers representing the caption,
    * image/height: an integer representing the height,
    * image/width: an integer representing the width.
    Every field is actually a list of the corresponding type.
  """
  eos_list = [1] if eos_list is None else eos_list
  if vocab_filename is not None:
    vocab_symbolizer = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, vocab_filename, vocab_size)
  _get_mscoco(tmp_dir)
  caption_filepath = (
      _MSCOCO_TRAIN_CAPTION_FILE if training else _MSCOCO_EVAL_CAPTION_FILE)
  caption_filepath = os.path.join(tmp_dir, caption_filepath)
  prefix = _MSCOCO_TRAIN_PREFIX if training else _MSCOCO_EVAL_PREFIX
  caption_file = io.open(caption_filepath)
  caption_json = json.load(caption_file)
  # Dictionary from image_id to ((filename, height, width), captions).
  image_dict = dict()
  for image in caption_json["images"]:
    image_dict[image["id"]] = [(image["file_name"], image["height"],
                                image["width"]), []]
  annotations = caption_json["annotations"]
  annotation_count = len(annotations)
  image_count = len(image_dict)
  tf.logging.info("Processing %d images and %d labels\n" %
                  (image_count, annotation_count))
  for annotation in annotations:
    image_id = annotation["image_id"]
    image_dict[image_id][1].append(annotation["caption"])
  data = list(image_dict.values())[start_from:start_from + how_many]
  random.shuffle(data)
  for image_info, labels in data:
    image_filename = image_info[0]
    image_filepath = os.path.join(tmp_dir, prefix, image_filename)
    with tf.gfile.Open(image_filepath, "r") as f:
      encoded_image_data = f.read()
      height, width = image_info[1], image_info[2]
      for label in labels:
        if vocab_filename is None:
          label = [ord(c) for c in label] + eos_list
        else:
          label = vocab_symbolizer.encode(label) + eos_list
        yield {
            "image/encoded": [encoded_image_data],
            "image/format": ["jpeg"],
            "image/class/label": label,
            "image/height": [height],
            "image/width": [width]
        }
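# --- Illustrative sketch (assumption) of the caption grouping above. ---
# MSCOCO's caption JSON lists images and annotations separately; annotations
# point back to images via image_id, so captions are grouped with one pass
# over each list.
def group_captions(caption_json):
  image_dict = {image["id"]: [(image["file_name"], image["height"],
                               image["width"]), []]
                for image in caption_json["images"]}
  for annotation in caption_json["annotations"]:
    image_dict[annotation["image_id"]][1].append(annotation["caption"])
  return image_dict

# group_captions({"images": [{"id": 1, "file_name": "a.jpg",
#                             "height": 2, "width": 3}],
#                 "annotations": [{"image_id": 1, "caption": "a cat"}]})
# -> {1: [('a.jpg', 2, 3), ['a cat']]}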