def make_vocal_file():
  aaer = aaer_corpus.AAERParserTokens()
  generator_utils.get_or_generate_vocab_inner(
      data_dir=const.T2T_DATA_DIR,
      vocab_filename=const.T2T_AAER_VOLCAB_NAME,
      vocab_size=40000,
      generator=aaer.get_tokens())
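For orientation, every example in this listing follows the same basic pattern: pass a directory where the vocab file is cached, a vocab filename, a target size, and a generator of raw text, and get back a SubwordTextEncoder. Below is a minimal stand-alone sketch of that pattern; the corpus path and vocab filename are hypothetical, and the exact keyword names can differ between tensor2tensor versions.

import tensorflow as tf
from tensor2tensor.data_generators import generator_utils


def line_generator(path):
  # Yield one stripped line of raw text at a time; the vocab builder only
  # needs an iterable of strings.
  with tf.gfile.GFile(path, mode="r") as f:
    for line in f:
      yield line.strip()


# Builds a SubwordTextEncoder with roughly the requested vocabulary size,
# or loads it if the vocab file already exists in data_dir.
encoder = generator_utils.get_or_generate_vocab_inner(
    data_dir="t2t_data",                     # where the vocab file is cached
    vocab_filename="vocab.example.32768",    # hypothetical filename
    vocab_size=32768,
    generator=line_generator("corpus.txt"))  # hypothetical corpus path
print(encoder.vocab_size)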
def generator(self, data_dir, tmp_dir, train):
  """Generate examples."""
  # Download and extract
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  imdb_dir = os.path.join(tmp_dir, "aclImdb")
  if not tf.gfile.Exists(imdb_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  # Generate vocab
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      lambda: self.doc_generator(imdb_dir, "train"))

  # Generate examples
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
    yield {
        "inputs": encoder.encode(doc) + [EOS],
        "targets": [int(label)],
    }
def generator(self, data_dir, tmp_dir, train):
  """Generate examples."""
  data_file = TRAIN_DATASETS if train else TEST_DATASETS

  # Generate vocab
  raw_gen = RawDataGenerator()
  vocab_encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      raw_gen.generator(data_file, for_vocab=True))
  label_encoder = text_encoder.ClassLabelEncoder(class_labels_fname=LABEL_FILE)

  # Generate examples
  for label, entities, sentence in raw_gen.generator(data_file):
    entities = [vocab_encoder.encode(e) for e in entities]
    sentence = vocab_encoder.encode(sentence)
    entities_pos = raw_gen.find_start_position(entities, sentence)
    yield {
        "inputs": sentence,
        "targets": [label_encoder.encode(label)],
        'lexical': raw_gen.lexical_feature(entities_pos, sentence),
        'position1': raw_gen.position_feature(entities_pos[0], sentence),
        'position2': raw_gen.position_feature(entities_pos[1], sentence),
    }
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if self.vocab_type == VocabType.CHARACTER:
    encoder = text_encoder.ByteTextEncoder()
  elif self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    else:
      other_problem = self.use_vocab_from_other_problem
      if other_problem:
        return other_problem.get_or_create_vocab(data_dir, tmp_dir, force_get)
      encoder = generator_utils.get_or_generate_vocab_inner(
          data_dir, self.vocab_filename, self.approx_vocab_size,
          self.generate_text_for_vocab(data_dir, tmp_dir),
          max_subtoken_length=self.max_subtoken_length,
          reserved_tokens=(
              text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
  elif self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                            replace_oov=self.oov_token)
  else:
    raise ValueError("Unrecognized VocabType: %s" % str(self.vocab_type))
  return encoder
def get_or_generate_vocab(self, data_dir, tmp_dir=None):
  filename_base = "nst_enzh_%sk_tok_%s" % (self.approx_vocab_size, "train")
  # Collect training files to be passed to generate the source vocab.
  src_file_name = self.compile_data(tmp_dir, "train", filename_base + ".lang1",
                                    file_ext="trans.txt")
  # Collect training files to be passed to generate the target vocab
  # (*pinyin.txt or *tmp.txt).
  trg_file_name = self.compile_data(tmp_dir, "train", filename_base + ".lang2",
                                    file_ext="pinyin.txt")

  def traverse_samples(filepath, file_byte_budget):
    with tf.gfile.GFile(filepath, mode="r") as source_file:
      file_byte_budget_ = file_byte_budget
      counter = 0
      countermax = int(source_file.size() / file_byte_budget_ / 2)
      for line in source_file:
        if counter < countermax:
          counter += 1
        else:
          if file_byte_budget_ <= 0:
            break
          line = line.strip()
          file_byte_budget_ -= len(line)
          counter = 0
          yield line

  src_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.source_vocab_name, self.approx_vocab_size,
      traverse_samples(src_file_name, file_byte_budget=1e8))
  trg_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.target_vocab_name, self.approx_vocab_size,
      traverse_samples(trg_file_name, file_byte_budget=1e8))
  return src_vocab, trg_vocab
def feature_encoders(self, data_dir):
  vocab_filename = os.path.join(data_dir, self.vocab_file)
  # encoder = text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir=const.T2T_DATA_DIR,
      vocab_filename=vocab_filename,
      vocab_size=self.targeted_vocab_size,
      generator=aaer.AAERExParserTokens().get_tokens())
  return {"inputs": encoder, "targets": encoder}
def generator(self, data_dir, tmp_dir, _):
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      lambda: story_generator(tmp_dir))
  for story in story_generator(tmp_dir):
    summary, rest = _story_summary_split(story)
    encoded_summary = encoder.encode(summary) + [EOS]
    encoded_story = encoder.encode(rest) + [EOS]
    yield {"inputs": encoded_story, "targets": encoded_summary}
def generator(self, data_dir, tmp_dir, _):
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      page_generator(tmp_dir, max_docs=10000))
  for page in page_generator(tmp_dir):
    title = _page_title(page)
    encoded = encoder.encode(page) + [EOS]
    encoded_title = encoder.encode(title) + [EOS]
    yield {"inputs": encoded_title, "targets": encoded}
def generator(self, data_dir, tmp_dir, _):
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      lambda: page_generator(tmp_dir, max_docs=10000))
  for page in page_generator(tmp_dir):
    title = _page_title(page)
    encoded = encoder.encode(page) + [EOS]
    encoded_title = encoder.encode(title) + [EOS]
    yield {"inputs": encoded_title, "targets": encoded}
def generate_vocab(self):
  # Generate vocab
  token_generator = self.raw_gen.generator([TRAIN_FILE, TEST_FILE],
                                           for_vocab=True)
  self.vocab_encoder = generator_utils.get_or_generate_vocab_inner(
      OUTPUT_DIR, VOCAB_FILE, VOCAB_SIZE, token_generator)
  tf.logging.info('vocab_size: %d', self.vocab_encoder.vocab_size)
  self.label_encoder = text_encoder.ClassLabelEncoder(
      class_labels_fname=LABEL_FILE)
def generator(self, data_dir, tmp_dir, is_training):
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      example_generator(tmp_dir, is_training, sum_token=False))
  for example in example_generator(tmp_dir, is_training, sum_token=True):
    story, summary = _story_summary_split(example)
    encoded_summary = encoder.encode(summary) + [EOS]
    encoded_story = encoder.encode(story) + [EOS]
    yield {"inputs": encoded_story, "targets": encoded_summary}
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  # train_dataset = self.get_training_dataset(tmp_dir)
  if dataset_split == problem.DatasetSplit.TRAIN:
    datasets = LM_TRAIN_DATASETS
    tag = "train"
  elif dataset_split == problem.DatasetSplit.EVAL:
    datasets = LM_DEV_DATASETS
    tag = "dev"
  else:
    datasets = LM_TEST_DATASETS
    tag = "test"
  # train = dataset_split == problem.DatasetSplit.TRAIN
  # datasets = train_dataset if train else LM_TEST_DATASETS
  # source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  # target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir=data_dir,
      vocab_filename=self.source_vocab_name,
      vocab_size=self.approx_vocab_size,
      generator=self.generate(tmp_dir=tmp_dir,
                              source_filenames=self.source_filenames,
                              index=1),
      max_subtoken_length=None)
  target_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir=data_dir,
      vocab_filename=self.target_vocab_name,
      vocab_size=self.approx_vocab_size,
      generator=self.generate(tmp_dir=tmp_dir,
                              source_filenames=self.source_filenames,
                              index=2),
      max_subtoken_length=1)
  # tag = "train" if train else "dev"
  filename_base = "thchs_pinyinzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = self.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def generator(self, data_dir, tmp_dir, is_training):
  all_files, urls_path = _maybe_download_corpora(tmp_dir, is_training)
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      example_generator(all_files, urls_path, sum_token=False))
  write_raw_text_to_files(all_files, urls_path, data_dir, tmp_dir, is_training)
  for example in example_generator(all_files, urls_path, sum_token=True):
    story, summary = _story_summary_split(example)
    encoded_summary = encoder.encode(summary) + [EOS]
    encoded_story = encoder.encode(story) + [EOS]
    yield {"inputs": encoded_story, "targets": encoded_summary}
def generator(self, data_dir, tmp_dir, _):
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      page_generator(tmp_dir, max_docs=1000))
  case_num = 0
  for page in page_generator(tmp_dir):
    encoded = encoder.encode(page)
    for i in xrange(len(encoded) // self.sequence_length):
      case_num += 1
      if self.max_cases and case_num > self.max_cases:
        return
      targets = encoded[
          i * self.sequence_length:(i + 1) * self.sequence_length]
      inputs = self.scramble(targets)
      yield {"inputs": inputs, "targets": targets}
def generator(self, data_dir, tmp_dir, _):
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      lambda: page_generator(tmp_dir, max_docs=1000))
  case_num = 0
  for page in page_generator(tmp_dir):
    encoded = encoder.encode(page)
    for i in xrange(len(encoded) // self.sequence_length):
      case_num += 1
      if self.max_cases and case_num > self.max_cases:
        return
      targets = encoded[
          i * self.sequence_length:(i + 1) * self.sequence_length]
      inputs = self.scramble(targets)
      yield {"inputs": inputs, "targets": targets}
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if force_get:
    vocab_filepath = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    encoders = {"inputs": encoder, "targets": encoder}
  else:
    encoder = generator_utils.get_or_generate_vocab_inner(
        data_dir=data_dir,
        vocab_filename=self.vocab_filename,
        vocab_size=self.approx_vocab_size,
        generator=self.generate_text_for_vocab(data_dir, tmp_dir),
        max_subtoken_length=50)
    encoders = {"inputs": encoder, "targets": encoder}
  return encoders
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(data_dir, self.num_shards,
                                        shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
  train_examples = self._examples(data_dir, tmp_dir, train=True)
  dev_examples = self._examples(data_dir, tmp_dir, train=False)
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      (e['sentence1'] + ' ' + e['sentence2']
       for e in train_examples + dev_examples))
  generator_utils.generate_dataset_and_shuffle(
      self._inputs_and_targets(encoder, train_examples), train_paths,
      self._inputs_and_targets(encoder, dev_examples), dev_paths)
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if self.vocab_type == VocabType.CHARACTER:
    encoder = text_encoder.ByteTextEncoder()
  elif self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    else:
      encoder = generator_utils.get_or_generate_vocab_inner(
          data_dir, self.vocab_filename, self.approx_vocab_size,
          self.generate_text_for_vocab(data_dir, tmp_dir))
  elif self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.TokenTextEncoder(vocab_filename)
  else:
    raise ValueError("Unrecognized VocabType")
  return encoder
def generate_vocab(self, data_dir, tmp_dir, vocab_filename, vocab_size,
                   sources):
  """Generate a vocabulary from the datasets in sources."""

  def generate():
    tf.logging.info("Generating vocab from: %s", str(sources))
    for lang_file in sources:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)

      # # Extract from tar if needed.
      # if not tf.gfile.Exists(filepath):
      #   read_type = "r:gz" if filename.endswith("tgz") else "r"
      #   with tarfile.open(compressed_file, read_type) as corpus_tar:
      #     corpus_tar.extractall(tmp_dir)
      #
      # # For some datasets a second extraction is necessary.
      # if lang_file.endswith(".gz"):
      #   new_filepath = os.path.join(tmp_dir, lang_file[:-3])
      #   if tf.gfile.Exists(new_filepath):
      #     tf.logging.info(
      #         "Subdirectory %s already exists, skipping unpacking" % filepath)
      #   else:
      #     tf.logging.info("Unpacking subdirectory %s" % filepath)
      #     gunzip_file(filepath, new_filepath)
      #   filepath = new_filepath

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = self.file_byte_budget
        counter = 0
        countermax = int(source_file.size() / file_byte_budget / 2)
        for line in source_file:
          if counter < countermax:
            counter += 1
          else:
            if file_byte_budget <= 0:
              break
            line = line.strip()
            file_byte_budget -= len(line)
            counter = 0
            yield line

  return generator_utils.get_or_generate_vocab_inner(data_dir, vocab_filename,
                                                      vocab_size, generate())
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if self.vocab_type == VocabType.CHARACTER:
    encoder = text_encoder.ByteTextEncoder()
  elif self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    else:
      other_problem = self.use_vocab_from_other_problem
      if other_problem:
        return other_problem.get_or_create_vocab(data_dir, tmp_dir, force_get)
      encoder = generator_utils.get_or_generate_vocab_inner(
          data_dir, self.vocab_filename, self.approx_vocab_size,
          self.generate_text_for_vocab(data_dir, tmp_dir),
          max_subtoken_length=self.max_subtoken_length,
          reserved_tokens=(text_encoder.RESERVED_TOKENS +
                           self.additional_reserved_tokens))
  elif self.vocab_type == VocabType.SENTENCEPIECE:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename + '.model')
      encoder = text_encoder.SentencePieceEncoder(vocab_filepath)
    else:
      _, tmp_file_path = tempfile.mkstemp()
      with open(tmp_file_path, 'w') as fh:
        for i, text in enumerate(
            self.generate_text_for_vocab(data_dir, tmp_dir)):
          fh.write(text.rstrip() + '\n')
      encoder = text_encoder.SentencePieceEncoder.get_or_generate_vocab(
          data_dir, self.vocab_filename, self.approx_vocab_size,
          [tmp_file_path])
  elif self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                            replace_oov=self.oov_token)
  else:
    raise ValueError("Unrecognized VocabType: %s" % str(self.vocab_type))
  return encoder
def get_or_generate_vocabulary(data_dir,
                               tmp_dir,
                               data_prefix,
                               max_page_size_exp,
                               approx_vocab_size=32768,
                               strip=True):
  """Get or generate the vocabulary.

  Args:
    data_dir: a string
    tmp_dir: a string
    data_prefix: a string
    max_page_size_exp: an integer
    approx_vocab_size: an integer
    strip: a boolean

  Returns:
    a TextEncoder
  """
  num_pages_for_vocab_generation = approx_vocab_size // 3
  vocab_file = vocab_filename(approx_vocab_size, strip)

  def my_generator(data_prefix):
    """Line generator for vocab."""
    count = 0
    for page in corpus_page_generator(
        all_corpus_files(data_prefix)[::-1], tmp_dir, max_page_size_exp):
      revisions = page["revisions"]
      if revisions:
        text = get_text(revisions[-1], strip=strip)
        yield text
        count += 1
        if count % 100 == 0:
          tf.logging.info("reading pages for vocab %d" % count)
        if count > num_pages_for_vocab_generation:
          break

  return generator_utils.get_or_generate_vocab_inner(
      data_dir, vocab_file, approx_vocab_size, my_generator(data_prefix))
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if self.vocab_type == VocabType.CHARACTER:
    encoder = text_encoder.ByteTextEncoder()
  elif self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    else:
      encoder = generator_utils.get_or_generate_vocab_inner(
          data_dir, self.vocab_filename, self.approx_vocab_size,
          self.generate_text_for_vocab(data_dir, tmp_dir),
          max_subtoken_length=self.max_subtoken_length,
          reserved_tokens=(
              text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
  elif self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                            replace_oov=self.oov_token)
  else:
    raise ValueError("Unrecognized VocabType: %s" % str(self.vocab_type))
  return encoder
def generator(self, data_dir, tmp_dir, train):
  """Generate examples."""
  # Download and extract
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  imdb_dir = os.path.join(tmp_dir, "aclImdb")
  if not tf.gfile.Exists(imdb_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  # Generate vocab
  encoder = generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      self.doc_generator(imdb_dir, "train"))

  # Generate examples
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
    yield {
        "inputs": encoder.encode(doc) + [EOS],
        "targets": [int(label)],
    }
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if force_get:
    vocab_filepath = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    encoders = {"inputs": encoder, "targets": encoder}
    for name in self.get_contexts_name():
      encoders[name] = encoder
  else:
    encoder = generator_utils.get_or_generate_vocab_inner(
        data_dir=data_dir,
        vocab_filename=self.vocab_filename,
        vocab_size=self.approx_vocab_size,
        generator=self.generate_text_for_vocab(data_dir, tmp_dir),
        max_subtoken_length=50,
        reserved_tokens=(text_encoder.RESERVED_TOKENS +
                         self.additional_reserved_tokens))
    encoders = {"inputs": encoder, "targets": encoder}
    for name in self.get_contexts_name():
      encoders[name] = encoder
  return encoders
def get_or_generate_vocab(data_dir, tmp_dir, vocab_filename, vocab_size,
                          sources, file_byte_budget=1e7):
  """Generate a vocabulary from the datasets in sources."""

  def generate():
    tf.logging.info("Generating vocab from: %s", str(sources))
    for source in sources:
      path = source[0]
      for lang_file in source[1]:
        tf.logging.info("Reading file: %s" % lang_file)
        filepath = os.path.join(path, lang_file)

        # Use Tokenizer to count the word occurrences.
        # Sample lines spread across the file: skip `countermax` lines, then
        # yield one and charge its length against the byte budget, until
        # roughly `file_byte_budget` bytes have been yielded.
        with tf.gfile.GFile(filepath, mode="r") as source_file:
          file_byte_budget_ = file_byte_budget
          counter = 0
          countermax = int(source_file.size() / file_byte_budget_ / 2)
          for line in source_file:
            if counter < countermax:
              counter += 1
            else:
              if file_byte_budget_ <= 0:
                break
              line = line.strip()
              file_byte_budget_ -= len(line)
              counter = 0
              yield line

  return generator_utils.get_or_generate_vocab_inner(data_dir, vocab_filename,
                                                      vocab_size, generate())
def get_or_generate_vocab(self, data_dir, tmp_dir):
  return generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      self.file_generator(
          self.train_text_filepaths(tmp_dir),
          max_chars_total=self.max_chars_for_vocab))
import sys
import warnings

warnings.filterwarnings("ignore")

from tensor2tensor.data_generators import generator_utils

# The next check is not strictly necessary; it only illustrates that you
# should not use the whole parent corpus.
print("Checking length of file")
with open("mixed.txt") as f:
    for i, l in enumerate(f):
        pass
if i > 200000 or i < 100000:
    print(
        "Your 'mixed.txt' does not contain roughly 150k rows, which means "
        "that you have not balanced all languages. In our toy example this "
        "would not be a problem, but whenever you are dealing with a "
        "high-resource parent and a low-resource child it can quickly happen "
        "that most of the mixed corpus contains only parent sentences, and "
        "the generated vocabulary would then contain mainly parent subwords.")
    sys.exit(0)


def get_generator():
    with open("mixed.txt") as f:
        for line in f:
            yield line.strip()


gen = get_generator()

print("Generating vocabulary. It will take a moment. "
      "Please read the next section of the tutorial.\n\n")
generator_utils.get_or_generate_vocab_inner("t2t_data", "vocab.cseten.wp",
                                            32000, gen)
def generate_vocab(self, data_dir, wikis_dir, refs_dir):
  # Produce a SubwordTextEncoder from a subset of the data.
  return generator_utils.get_or_generate_vocab_inner(
      data_dir, self.vocab_filename, self.target_vocab_size,
      self.generate_lines_for_vocab(wikis_dir, refs_dir))
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  # Called twice: for train and test

  # Get the list of the training samples (coding challenge samples)
  samples = list(generator_samples(tmp_dir, self.pb_constants))

  # Split between train and dev
  # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and
  # difficulties in each set.
  # Need to sort the samples first before shuffling (as walk() isn't
  # deterministic)
  samples.sort(key=lambda x: x.desc_file)  # in-place
  rng = random.Random(7531)  # Local fixed seed
  rng.shuffle(samples)  # in-place

  # Train: 5019/5228 problems
  # Dev: 209/5228 problems
  len_samples = len(samples)
  split = len_samples // 25
  samples = samples[split:] if train else samples[:split]
  tf.logging.info("Number of samples for {}: {}/{}".format(
      "train" if train else "dev", len(samples), len_samples))

  def generator_samples_content(get_source, get_target):
    """Generate samples."""
    source, target = None, None
    # Iterate over the coding samples
    for sample in samples:
      if get_source:
        with tf.gfile.GFile(sample.desc_file, mode="r") as source_file:
          source = source_file.read()
      if get_target:
        # Each challenge can have multiple implementations (or none)
        for code_file in sample.code_files:
          with tf.gfile.GFile(code_file, mode="r") as target_file:
            target = target_file.read()
          target = self.preprocess_target(target)
          yield source, target
      elif sample.code_files:  # Only take the source if a target exists
        yield source, target

  def generator_target():
    for _, target in generator_samples_content(False, True):
      yield target.strip()

  # Generate vocab for both source and target
  # TODO(lukaszkaiser): Fix vocab generation call. No sources given.
  assert not self.vocab_input_filename
  source_vocab = None
  # source_vocab = generator_utils.get_or_generate_vocab(
  #     data_dir, tmp_dir, self.vocab_input_filename, self.input_vocab_size)
  target_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir=data_dir,
      vocab_filename=self.vocab_target_filename,
      vocab_size=self.target_vocab_size,
      generator=generator_target())

  # Yield the training and testing samples
  eos_list = [EOS]
  for source, target in generator_samples_content(True, True):
    source_ints = source_vocab.encode(source.strip()) + eos_list
    target_ints = target_vocab.encode(target.strip()) + eos_list
    yield {
        "inputs": source_ints,
        "targets": target_ints,
    }
def train_generator(self, data_dir, tmp_dir, train):
  # Called twice: for train and test

  # Get the list of the training samples (coding challenge samples)
  samples = list(generator_samples(tmp_dir))

  # Split between train and dev
  # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and
  # difficulties in each set.
  # Need to sort the samples first before shuffling (as walk() isn't
  # deterministic)
  samples.sort(key=lambda x: x.desc_file)  # in-place
  rng = random.Random(7531)  # Local fixed seed
  rng.shuffle(samples)  # in-place

  # Train: 5019/5228 problems
  # Dev: 209/5228 problems
  len_samples = len(samples)
  split = len_samples // 25
  samples = samples[split:] if train else samples[:split]
  tf.logging.info("Number of samples for {}: {}/{}".format(
      "train" if train else "dev", len(samples), len_samples))

  def generator_samples_content(get_source, get_target):
    source, target = None, None
    # Iterate over the coding samples
    for sample in samples:
      if get_source:
        with tf.gfile.GFile(sample.desc_file, mode="r") as source_file:
          source = source_file.read()
      if get_target:
        # Each challenge can have multiple implementations (or none)
        for code_file in sample.code_files:
          with tf.gfile.GFile(code_file, mode="r") as target_file:
            target = target_file.read()
          yield source, target
      elif sample.code_files:  # Only take the source if a target exists
        yield source, target

  def generator_target():
    for _, target in generator_samples_content(False, True):
      yield target.strip()

  # Generate vocab for both source and target
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_input_filename, self.input_vocab_size)
  target_vocab = generator_utils.get_or_generate_vocab_inner(
      data_dir=data_dir,
      vocab_filename=self.vocab_target_filename,
      vocab_size=self.target_vocab_size,
      generator_fn=generator_target,
  )

  # Yield the training and testing samples
  eos_list = [EOS]
  for source, target in generator_samples_content(True, True):
    source_ints = source_vocab.encode(source.strip()) + eos_list
    target_ints = target_vocab.encode(target.strip()) + eos_list
    yield {
        "inputs": source_ints,
        "targets": target_ints,
    }
def get_or_generate_vocab(data_dir, vocab_filename, vocab_size, source):
  vocab_generator = generate_lines_for_vocab(data_dir, source)
  return generator_utils.get_or_generate_vocab_inner(data_dir, vocab_filename,
                                                      vocab_size,
                                                      vocab_generator)
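Finally, a minimal sketch of using the encoder once the vocab file has been written, as many of the examples above do with `encoder.encode(...) + [EOS]`. The vocab path below is hypothetical; SubwordTextEncoder, encode/decode, and EOS_ID come from the standard tensor2tensor text_encoder module.

from tensor2tensor.data_generators import text_encoder

EOS = text_encoder.EOS_ID  # reserved id 1

# Load the cached vocab file produced by get_or_generate_vocab_inner.
encoder = text_encoder.SubwordTextEncoder("t2t_data/vocab.example.32768")
ids = encoder.encode("A short example sentence.") + [EOS]
print(ids)                       # list of subword ids
print(encoder.decode(ids[:-1]))  # drop EOS; round-trips back to the text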