def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generate encoded samples for every dataset split and shuffle the output.

    For each entry of `self.dataset_splits` the matching filepath function
    (training / dev / test) is asked for `entry["shards"]` paths. Samples come
    from `self.generate_encoded_samples`; when `self.is_generate_per_split` is
    true each split is generated separately into its own paths, otherwise a
    single TRAIN generator is written across the paths of all splits.

    Args:
        data_dir: forwarded to the filepath functions and to
            `generate_encoded_samples`.
        tmp_dir: forwarded to `generate_encoded_samples`.
        task_id: not used by this implementation.
    """
    path_builders = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    # Resolve (split, shard paths) pairs in the order the splits are declared.
    paths_by_split = []
    for spec in self.dataset_splits:
        build_paths = path_builders[spec["split"]]
        shard_paths = build_paths(
            data_dir, spec["shards"], shuffled=self.already_shuffled)
        paths_by_split.append((spec["split"], shard_paths))

    every_path = [
        path for _, shard_paths in paths_by_split for path in shard_paths
    ]

    if not self.is_generate_per_split:
        # One TRAIN generator feeds the shards of all splits.
        samples = self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN)
        generator_utils.generate_files(samples, every_path)
    else:
        for split_name, shard_paths in paths_by_split:
            samples = self.generate_encoded_samples(
                data_dir, tmp_dir, split_name)
            generator_utils.generate_files(samples, shard_paths)

    generator_utils.shuffle_dataset(every_path, extra_fn=self._pack_fn())
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generate the test, train and dev files for this problem.

    The test files are always generated first from `self.TEST_DATASETS`.
    Train and dev files are then produced in one of two ways, depending on
    `self.use_train_shards_for_dev`:

    * true  -- `self.TRAIN_DATASETS` is written across the train and dev
      paths together, and those paths are shuffled afterwards;
    * false -- train and dev are generated from `self.TRAIN_DATASETS` and
      `self.DEV_DATASETS` respectively via
      `generator_utils.generate_dataset_and_shuffle`.

    Args:
        data_dir: forwarded to the filepath functions and to `self.generator`.
        tmp_dir: forwarded to `self.generator`.
        task_id: not used by this implementation.
    """
    train_files = self.training_filepaths(
        data_dir, self.num_shards, shuffled=False)
    dev_files = self.dev_filepaths(
        data_dir, self.num_dev_shards, shuffled=False)
    # Test paths are requested with shuffled=True; nothing here shuffles them.
    test_files = self.test_filepaths(
        data_dir, self.num_test_shards, shuffled=True)

    test_examples = self.generator(data_dir, tmp_dir, self.TEST_DATASETS)
    generator_utils.generate_files(test_examples, test_files)

    if not self.use_train_shards_for_dev:
        train_examples = self.generator(
            data_dir, tmp_dir, self.TRAIN_DATASETS)
        dev_examples = self.generator(data_dir, tmp_dir, self.DEV_DATASETS)
        generator_utils.generate_dataset_and_shuffle(
            train_examples, train_files, dev_examples, dev_files)
        return

    # Train data is spread over the train and dev shards alike.
    combined_files = train_files + dev_files
    train_examples = self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS)
    generator_utils.generate_files(train_examples, combined_files)
    generator_utils.shuffle_dataset(combined_files)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generate the training and dev files for this problem.

    Examples come from the generator of an `AlgorithmicIdentityBinary40`
    instance, parameterised with this problem's `num_symbols`. One training
    file and one dev file are written.

    Args:
        data_dir: forwarded to the filepath functions.
        tmp_dir: unused.
        task_id: unused.
    """
    del tmp_dir, task_id  # Explicitly unused.

    source_problem = AlgorithmicIdentityBinary40()

    # Generator arguments are (num_symbols, 40, 100000) for training and
    # (num_symbols, 400, 10000) for dev; the trailing 100 goes to
    # generate_files. NOTE(review): their meaning is defined by the callees,
    # which are not visible here -- confirm before changing.
    train_examples = source_problem.generator(self.num_symbols, 40, 100000)
    train_files = self.training_filepaths(data_dir, 1, shuffled=True)
    utils.generate_files(train_examples, train_files, 100)

    dev_examples = source_problem.generator(self.num_symbols, 400, 10000)
    dev_files = self.dev_filepaths(data_dir, 1, shuffled=True)
    utils.generate_files(dev_examples, dev_files, 100)
def testGenerateFiles(self):
    """Check that `generate_files` creates the expected single train shard.

    A temporary file is created only to obtain a unique base name inside the
    test's temp directory; the shard written by `generate_files` is expected
    at `<that path>-train-00000-of-00001`.
    """
    tmp_dir = self.get_temp_dir()
    fd, tmp_file_path = tempfile.mkstemp(dir=tmp_dir)
    # mkstemp returns an already-open OS-level descriptor; close it right
    # away so the test does not leak a file descriptor.
    os.close(fd)
    tmp_file_name = os.path.basename(tmp_file_path)
    shard_path = tmp_file_path + "-train-00000-of-00001"

    # Generate a trivial file and assert the file exists.
    def test_generator():
        yield {"inputs": [1], "target": [1]}

    try:
        filenames = generator_utils.train_data_filenames(
            tmp_file_name, tmp_dir, 1)
        generator_utils.generate_files(test_generator(), filenames)
        self.assertTrue(tf.gfile.Exists(shard_path))
    finally:
        # Clean up even when generation or the assertion fails. Existence is
        # checked first so a missing shard does not mask the real failure.
        for path in (shard_path, tmp_file_path):
            if os.path.exists(path):
                os.remove(path)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generate and shuffle the single output shard owned by `task_id`.

    Args:
        data_dir: forwarded to `_task_id_to_output_file` and to
            `generate_encoded_samples`.
        tmp_dir: forwarded to `generate_encoded_samples`.
        task_id: index of the output shard to write; must satisfy
            `0 <= task_id < self.num_output_shards`. The default of -1
            fails that check.

    Raises:
        AssertionError: if `task_id` is outside the valid range.
    """
    # Valid task ids are exactly [0, self.num_output_shards).
    assert 0 <= task_id < self.num_output_shards

    # Each task writes exactly one output shard, but may read from several
    # *input* shards to do so.
    source_files = self._task_id_to_input_files(task_id)
    shard_path = self._task_id_to_output_file(data_dir, task_id)

    # Only the first element (the split this task writes to) is needed.
    target_split, _, _ = self._task_id_to_output_split(task_id)

    # Produce the examples for this shard...
    samples = self.generate_encoded_samples(
        data_dir, tmp_dir, target_split, source_files)
    generator_utils.generate_files(samples, [shard_path])

    # ...and shuffle the shard that was just written.
    generator_utils.shuffle_dataset([shard_path], extra_fn=self._pack_fn())
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generate and shuffle the one training or dev shard owned by `task_id`.

    Task ids below `self.num_train_shards` select a training shard; the
    remaining ids select a dev shard, indexed from zero.

    Args:
        data_dir: a string, forwarded to `get_or_create_vocab` and to the
            filepath functions.
        tmp_dir: a string, forwarded to `get_or_create_vocab` and
            `example_generator`.
        task_id: an integer in `[0, self.num_generate_tasks)`. The default
            of -1 fails the range check.

    Raises:
        AssertionError: if `task_id` is outside the valid range.
    """
    tf.logging.info("generate_data task_id=%s" % task_id)
    # The vocabulary is obtained before the range check, as in the original.
    vocab_encoder = self.get_or_create_vocab(data_dir, tmp_dir)
    assert 0 <= task_id < self.num_generate_tasks

    if task_id >= self.num_train_shards:
        # Dev shards are numbered after all the training shards.
        dev_files = self.dev_filepaths(
            data_dir, self.num_dev_shards, shuffled=False)
        shard_path = dev_files[task_id - self.num_train_shards]
    else:
        train_files = self.training_filepaths(
            data_dir, self.num_train_shards, shuffled=False)
        shard_path = train_files[task_id]

    examples = self.example_generator(vocab_encoder, tmp_dir, task_id)
    generator_utils.generate_files(examples, [shard_path])
    generator_utils.shuffle_dataset([shard_path])
def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generate the train, dev and test files, then shuffle all of them.

    Args:
        data_dir: forwarded to the filepath functions.
        tmp_dir: forwarded to `self.generator`.
        task_id: not used by this implementation.
    """
    # NOTE(review): these look like per-split example counts, with the third
    # generator argument being the running start offset -- confirm against
    # self.generator, which is not visible here.
    train_size = 162770
    dev_size = 19867
    test_size = 19962

    train_examples = self.generator(tmp_dir, train_size)
    train_files = self.training_filepaths(
        data_dir, self.train_shards, shuffled=False)
    generator_utils.generate_files(train_examples, train_files)

    dev_examples = self.generator(tmp_dir, dev_size, train_size)
    dev_files = self.dev_filepaths(
        data_dir, self.dev_shards, shuffled=False)
    generator_utils.generate_files(dev_examples, dev_files)

    test_examples = self.generator(tmp_dir, test_size, train_size + dev_size)
    test_files = self.test_filepaths(
        data_dir, self.test_shards, shuffled=False)
    generator_utils.generate_files(test_examples, test_files)

    # All three splits are shuffled in a single call.
    every_file = train_files + dev_files + test_files
    generator_utils.shuffle_dataset(every_file)