def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
        all_paths)

  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = dict([(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                      for split in self.dataset_splits])
  all_paths = []
  for paths in split_paths.values():
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths.items():
      generator_utils.generate_files(
          self._maybe_pack_examples(
              self.generate_encoded_samples(data_dir, tmp_dir, split)), paths)
  else:
    generator_utils.generate_files(
        self._maybe_pack_examples(
            self.generate_encoded_samples(
                data_dir, tmp_dir, problem.DatasetSplit.TRAIN)), all_paths)

  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1): """ Generates training/dev data. Args: data_dir: a string tmp_dir: a string task_id: an optional integer Returns: shard or shards for which data was generated. """ # In case of parallel execution, each shard is generated by a different # process if self.multiprocess_generate: tf.logging.info("generate_data task_id=%s" % task_id) assert task_id >= 0 and task_id < self.num_generate_tasks if task_id < self.num_train_shards: out_file = self.training_filepaths(data_dir, self.num_train_shards, shuffled=False)[task_id] dataset_split = problem.DatasetSplit.TRAIN else: out_file = self.dev_filepaths( data_dir, self.num_dev_shards, shuffled=False)[task_id - self.num_train_shards] dataset_split = problem.DatasetSplit.EVAL generator_utils.generate_files( self.generator(data_dir, tmp_dir, dataset_split, task_id), [out_file]) generator_utils.shuffle_dataset([out_file]) else: filepath_fns = { problem.DatasetSplit.TRAIN: self.training_filepaths, problem.DatasetSplit.EVAL: self.dev_filepaths, problem.DatasetSplit.TEST: self.test_filepaths, } split_paths = [ (split["split"], filepath_fns[split["split"]](data_dir, split["shards"], shuffled=self.already_shuffled)) for split in self.dataset_splits ] all_paths = [] for _, paths in split_paths: all_paths.extend(paths) if self.is_generate_per_split: for split, paths in split_paths: generator_utils.generate_files( self.generator(data_dir, tmp_dir, split), paths) else: generator_utils.generate_files( self.generator(data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths) generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  self.data_dir = data_dir
  # Determine whether we are in training, validation, or test mode.
  self.mode = {
      problem.DatasetSplit.TRAIN: 'train',
      problem.DatasetSplit.EVAL: 'dev',
      problem.DatasetSplit.TEST: 'test'
  }
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths
  }

  split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      # Create the source and target txt files from the raw data.
      self.preprocess_data(self.mode[split])
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    self.preprocess_data(self.mode[problem.DatasetSplit.TRAIN])
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # Note: tmp_dir is used below, so it must not be deleted here.
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(data_dir, tmp_dir,
                                      problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # Dict mapping each split to the function that builds its output filenames.
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  # Build the per-split filename lists according to dataset_splits.
  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  # split is train/eval/test; paths is the list of shard filenames generated
  # for that split.
  if self.is_generate_per_split:
    for split, paths in split_paths:
      # generate_files(generator, filenames) writes the token-id samples
      # produced by the generator into the given files.
      generator_utils.generate_files(
          # generate_encoded_samples yields samples as token ids.
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(data_dir, tmp_dir,
                                      problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)

  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)

  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)

  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
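# A minimal sketch (not taken from any of the examples above) of the
# dataset_splits property these generate_data variants iterate over. Based on
# the usage shown (split["split"], split["shards"]), each entry is assumed to
# be a dict with a "split" key holding a problem.DatasetSplit value and a
# "shards" key holding the number of output files for that split. The shard
# counts below are hypothetical.
@property
def dataset_splits(self):
  return [{
      "split": problem.DatasetSplit.TRAIN,
      "shards": 100,
  }, {
      "split": problem.DatasetSplit.EVAL,
      "shards": 1,
  }]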
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  all_paths = train_paths + dev_paths
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir), all_paths)
  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  data_dir = os.path.join(data_dir, self.name)
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = []
  test_paths = []
  for split in self.dataset_splits:
    if split["split"] is not problem.DatasetSplit.TEST:
      split_paths.append((split["split"], filepath_fns[split["split"]](
          data_dir, split["shards"], shuffled=False)))
    else:
      test_paths.append((split["split"], filepath_fns[split["split"]](
          data_dir, split["shards"], shuffled=True)))

  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self._maybe_pack_examples(
              self.generate_encoded_samples(data_dir, tmp_dir, split)), paths)
  else:
    generator_utils.generate_files(
        self._maybe_pack_examples(
            self.generate_encoded_samples(
                data_dir, tmp_dir, problem.DatasetSplit.TRAIN)), all_paths)

  generator_utils.shuffle_dataset(all_paths)

  test_split_paths = []
  for _, paths in test_paths:
    test_split_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in test_paths:
      generator_utils.generate_files(
          self._maybe_pack_examples(
              self.generate_encoded_samples(data_dir, tmp_dir, split)), paths)
  else:
    generator_utils.generate_files(
        self._maybe_pack_examples(
            self.generate_encoded_samples(
                data_dir, tmp_dir, problem.DatasetSplit.TEST)),
        test_split_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  try:
    # Download source data if download_url specified.
    h5_filepath = generator_utils.maybe_download(
        tmp_dir, self.h5_file, self.download_url)
  except NotImplementedError:
    # Otherwise, look for it locally.
    h5_filepath = os.path.join(tmp_dir, self.h5_file)

  with h5py.File(h5_filepath, "r") as h5_file:
    num_train_examples = h5_file["train_in"].len()
    num_dev_examples = h5_file["valid_in"].len()
    num_test_examples = h5_file["test_in"].len()

  # Collect all_filepaths to later shuffle.
  all_filepaths = []
  # Collect created shard processes to start and join.
  processes = []

  datasets = [(self.training_filepaths, self.num_shards, "train",
               num_train_examples),
              (self.dev_filepaths, 10, "valid", num_dev_examples),
              (self.test_filepaths, 10, "test", num_test_examples)]
  for fname_fn, nshards, key_prefix, num_examples in datasets:
    outfiles = fname_fn(data_dir, nshards, shuffled=False)
    all_filepaths.extend(outfiles)
    for start_idx, end_idx, outfile in generate_shard_args(
        outfiles, num_examples):
      p = mp.Process(
          target=generate_dataset,
          args=(h5_filepath, key_prefix, [outfile], self.chunk_size,
                start_idx, end_idx))
      processes.append(p)

  # 1 per training shard + 10 for dev + 10 for test.
  assert len(processes) == self.num_shards + 20

  # Start and wait for processes in batches.
  num_batches = int(
      math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES))
  for i in xrange(num_batches):
    start = i * MAX_CONCURRENT_PROCESSES
    end = start + MAX_CONCURRENT_PROCESSES
    current = processes[start:end]
    for p in current:
      p.start()
    for p in current:
      p.join()

  # Shuffle.
  generator_utils.shuffle_dataset(all_filepaths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(data_dir, 10, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, 1, shuffled=True)

  midi_files = glob.glob('data/maestro/maestro-v2.0.0/*/*.midi')
  random.seed(13)
  random.shuffle(midi_files)

  generator_utils.generate_files(self.generator(midi_files[:50]), dev_paths)
  generator_utils.generate_files(self.generator(midi_files[50:]), train_paths)
  generator_utils.shuffle_dataset(train_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, True), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, True), train_paths,
        self.generator(data_dir, tmp_dir, False), dev_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_gen = self.generator(tmp_dir, 162770)
  train_paths = self.training_filepaths(
      data_dir, self.train_shards, shuffled=False)
  generator_utils.generate_files(train_gen, train_paths)

  dev_gen = self.generator(tmp_dir, 19867, 162770)
  dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
  generator_utils.generate_files(dev_gen, dev_paths)

  test_gen = self.generator(tmp_dir, 19962, 162770 + 19867)
  test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
  generator_utils.generate_files(test_gen, test_paths)

  generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1): """ TODO """ train_paths = self.training_filepaths( data_dir, self.num_shards, shuffled=False) dev_paths = self.dev_filepaths( data_dir, self.num_dev_shards, shuffled=False) if self.use_train_shards_for_dev: all_paths = train_paths + dev_paths generator_utils.generate_files( self.generator(data_dir, tmp_dir, True), all_paths) generator_utils.shuffle_dataset(all_paths) else: generator_utils.generate_dataset_and_shuffle( self.generator(data_dir, tmp_dir, True), train_paths, self.generator(data_dir, tmp_dir, False), dev_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # problem_name-train-00000-of-00001
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  # problem_name-dev-00000-of-00001
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, True), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, True), train_paths,
        self.generator(data_dir, tmp_dir, False), dev_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  _maybe_get_pmjtc_dataset(tmp_dir)
  self._maybe_save_image_meta(data_dir, tmp_dir)
  self._maybe_build_vocab(data_dir, tmp_dir)

  train_meta = self._load_image_meta(data_dir, 'train')
  dev_meta = self._load_image_meta(data_dir, 'dev')
  encoder = text_encoder.TokenTextEncoder(
      os.path.join(data_dir, self.vocab_name), replace_oov=OOV)

  train_paths = self.training_filepaths(
      data_dir, self.train_shards, shuffled=False)
  dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)

  train_meta_shards = _split_into_shards(train_meta, self.train_shards)
  dev_meta_shards = _split_into_shards(dev_meta, self.dev_shards)
  datasets = ((train_meta_shards, train_paths), (dev_meta_shards, dev_paths))

  all_paths = []
  threads = []
  thread_counter = 0
  for i in xrange(len(datasets)):
    for j in xrange(len(datasets[i][0])):
      meta_list = datasets[i][0][j]
      out_file = datasets[i][1][j]
      all_paths.append(out_file)
      t = threading.Thread(
          target=self.generate_data_shard,
          args=(thread_counter, meta_list, out_file, encoder))
      threads.append(t)
      thread_counter += 1

  num_batches = int(ceil(float(len(threads)) / MAX_CONCURRENT_THREADS))
  for i in xrange(num_batches):
    coord = tf.train.Coordinator()
    start = i * MAX_CONCURRENT_THREADS
    end = start + MAX_CONCURRENT_THREADS
    current = threads[start:end]
    for t in current:
      t.start()
    coord.join(current)

  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, TEST_DATASET), test_paths)

  all_paths = train_paths + dev_paths
  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, TRAIN_DATASET), all_paths)
  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  gen = self.generator(tmp_dir, shard_id=task_id)
  if self.mode == "train":
    paths = self.training_filepaths(
        data_dir, self.train_shards, shuffled=False)
    paths = sharded_subset_list(paths, self.train_shards, task_id)
  else:
    paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
    paths = sharded_subset_list(paths, self.test_shards, task_id)
  generator_utils.generate_files(gen, paths)
  generator_utils.shuffle_dataset(paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # task_id should be in [0, self.num_output_shards).
  assert (0 <= task_id) and (task_id < self.num_output_shards)

  # A task_id writes only one *output* shard, but it can operate over
  # multiple *input* shards.
  input_files = self._task_id_to_input_files(task_id)
  output_file = self._task_id_to_output_file(data_dir, task_id)

  # Which output split is this task writing to?
  split, _, _ = self._task_id_to_output_split(task_id)

  # Actually generate examples.
  generator_utils.generate_files(
      self.generate_encoded_samples(data_dir, tmp_dir, split, input_files),
      [output_file])

  # Shuffle the output.
  generator_utils.shuffle_dataset([output_file], extra_fn=self._pack_fn())
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # Generate the data and write it out as TFRecord files.
  filepath_fns = {
      # A dict with three entries, one per split; each value is the function
      # that returns the shard filenames for that split.
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  # self.dataset_splits holds the split configuration: a list (typically two
  # entries), each a dict with two keys, the split name and the shard count.
  # training_filepaths(self, data_dir, num_shards, shuffled)
  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                 for split in self.dataset_splits]

  # Each split has num_shards filenames; all_paths collects every filename
  # into a single list.
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  # generate_encoded_samples returns an iterator of dicts with "inputs" and
  # "targets" keys whose values are lists of token ids.
  # _maybe_pack_examples appears to do some length-related packing; the
  # sample format should be unchanged.
  # generate_files writes the generator's samples out in TFRecord format.
  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self._maybe_pack_examples(
              self.generate_encoded_samples(data_dir, tmp_dir, split)), paths)
  else:
    generator_utils.generate_files(
        self._maybe_pack_examples(
            self.generate_encoded_samples(
                data_dir, tmp_dir, problem.DatasetSplit.TRAIN)), all_paths)

  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, num_shards=None):
  if num_shards is None:
    num_shards = 100

  # Download source data.
  h5_filepath = generator_utils.maybe_download(
      tmp_dir, self.h5_file, self.download_url)
  with h5py.File(h5_filepath, "r") as h5_file:
    num_train_examples = h5_file["train_in"].len()
    num_dev_examples = h5_file["valid_in"].len()
    num_test_examples = h5_file["test_in"].len()

  # Collect all_filepaths to later shuffle.
  all_filepaths = []
  # Collect created shard processes to start and join.
  processes = []

  datasets = [(self.training_filepaths, num_shards, "train",
               num_train_examples),
              (self.dev_filepaths, 1, "valid", num_dev_examples),
              (self.test_filepaths, 1, "test", num_test_examples)]
  for fname_fn, nshards, key_prefix, num_examples in datasets:
    outfiles = fname_fn(data_dir, nshards, shuffled=False)
    all_filepaths.extend(outfiles)
    for start_idx, end_idx, outfile in generate_shard_args(
        outfiles, num_examples):
      p = mp.Process(
          target=generate_dataset,
          args=(h5_filepath, key_prefix, [outfile], start_idx, end_idx))
      processes.append(p)

  # Start and wait for processes.
  # 1 per training shard + dev + test.
  assert len(processes) == num_shards + 2
  for p in processes:
    p.start()
  for p in processes:
    p.join()

  # Shuffle.
  generator_utils.shuffle_dataset(all_filepaths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  # dev_paths = self.dev_filepaths(
  #     data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  data = self.generator(
      data_dir, tmp_dir, _LJSPEECH_TTS_DATASET, start_from=0, how_many=100)
  generator_utils.generate_files(data, test_paths)

  data = self.generator(
      data_dir, tmp_dir, _LJSPEECH_TTS_DATASET, start_from=100, how_many=-1)
  generator_utils.generate_files(data, train_paths)
  generator_utils.shuffle_dataset(train_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  raw_data_dir = _prepare_serchqa_data(tmp_dir)
  metadata_path = os.path.join(data_dir, "meta_data.json")
  train_file = os.path.join(raw_data_dir, "train.txt")
  dev_file = os.path.join(raw_data_dir, "val.txt")
  test_file = os.path.join(raw_data_dir, "test.txt")

  _build_vocab(
      generate_text_for_vocab(train_file), data_dir, self.vocab_filename,
      self.vocab_size)
  encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  self._extract_searchqa_metadata(
      encoder, [train_file, dev_file, test_file], metadata_path)

  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  for split, paths in split_paths:
    generator_utils.generate_files(
        self.generate_encoded_samples(data_dir, tmp_dir, split, encoder),
        paths)

  generator_utils.shuffle_dataset(all_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1): """Generates training/dev data. Args: data_dir: a string tmp_dir: a string task_id: an optional integer Returns: shard or shards for which data was generated. """ tf.logging.info("generate_data task_id=%s" % task_id) encoder = self.get_or_create_vocab(data_dir, tmp_dir) assert task_id >= 0 and task_id < self.num_generate_tasks if task_id < self.num_train_shards: out_file = self.training_filepaths( data_dir, self.num_train_shards, shuffled=False)[task_id] else: out_file = self.dev_filepaths( data_dir, self.num_dev_shards, shuffled=False)[task_id - self.num_train_shards] generator_utils.generate_files( self.example_generator(encoder, tmp_dir, task_id), [out_file]) generator_utils.shuffle_dataset([out_file])
def main(_):
  train_shards = 100
  dev_shards = 1
  pred_shards = 1

  train_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-{1}-train-000{2}-of-00{3}".format(FLAGS.src, FLAGS.tgt, i,
                                                 train_shards))
      for i in range(train_shards)
  ]
  dev_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-{1}-dev-000{2}-of-00{3}".format(FLAGS.src, FLAGS.tgt, i,
                                               dev_shards))
      for i in range(dev_shards)
  ]
  pred_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-{1}-pred-000{2}-of-00{3}".format(FLAGS.src, FLAGS.tgt, i,
                                                pred_shards))
      for i in range(pred_shards)
  ]

  train_generator = example_generator(FLAGS.tmp_dir, "train")
  eval_generator = example_generator(FLAGS.tmp_dir, "dev")
  pred_generator = example_generator(FLAGS.tmp_dir, "pred")

  generator_utils.generate_files(
      train_generator, train_file_names, cycle_every_n=10)
  generator_utils.generate_files(
      eval_generator, dev_file_names, cycle_every_n=10)
  generator_utils.generate_files(
      pred_generator, pred_file_names, cycle_every_n=10)

  generator_utils.shuffle_dataset(train_file_names)
  generator_utils.shuffle_dataset(dev_file_names)
  generator_utils.shuffle_dataset(pred_file_names)
def generate_data(self, data_dir, tmp_dir, task_id=-1): """Generates sharded Train, Dev and Test splits of the KGS and GoGoD Datasets. Assumes GoGoD zip from https://gogodonline.co.uk/ and KGS tar.gz from https://u-go.net/gamerecords/ are already downloaded and in the tmp_dir! Uses split fractions defined in self.split_fractions and num shards per split defined in self.{train/dev/test]_shards. Args: data_dir: (str), final data directory. tmp_dir: (str), directory containing KGS and GoGoD zips task_id: (int), task id. """ data = self.generate_dataset(tmp_dir) for k, v in data.items(): if v == []: raise ValueError("No {} files found!".format(k)) # generate sharded TFRecord files of the train sgf's and shuffle tf.logging.info("Generating GoGoD and KGS train data") train_gen = self.generator(data["train"]) train_paths = self.training_filepaths(data_dir, self.train_shards, shuffled=False) generator_utils.generate_files(train_gen, train_paths) generator_utils.shuffle_dataset(train_paths) # generate sharded TFRecord files of the dev sgf's and shuffle tf.logging.info("Generating GoGoD and KGS dev data") dev_gen = self.generator(data["dev"]) dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False) generator_utils.generate_files(dev_gen, dev_paths) generator_utils.shuffle_dataset(dev_paths) # generate sharded TFRecord files of the test sgf's and shuffle tf.logging.info("Generating GoGoD and KGS test data") test_gen = self.generator(data["test"]) test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False) generator_utils.generate_files(test_gen, test_paths) generator_utils.shuffle_dataset(test_paths)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  if task_id == -1 or task_id is None:
    for i in range(FLAGS.wiki_revision_num_train_shards +
                   FLAGS.wiki_revision_num_dev_shards):
      self.generate_data(data_dir, tmp_dir, i)
    return

  tf.logging.info(
      "Flags for job (task_id {}): "
      "Dev shards: {}, Train shards: {}, "
      "Revision skip factor: {}, Max page size: 2**{}, Introduce errors: {}, "
      "Percent Identical Examples: {}"
      "".format(task_id, FLAGS.wiki_revision_num_dev_shards,
                FLAGS.wiki_revision_num_train_shards,
                FLAGS.wiki_revision_revision_skip_factor,
                FLAGS.wiki_revision_max_page_size_exp,
                FLAGS.wiki_revision_introduce_errors,
                FLAGS.wiki_revision_percent_identical_examples))

  if FLAGS.wiki_revision_vocab_file:
    encoder = wiki_revision_utils.get_encoder_from_vocab(
        FLAGS.wiki_revision_vocab_file)
  else:
    encoder = wiki_revision_utils.get_or_generate_vocabulary(
        data_dir, tmp_dir, FLAGS.wiki_revision_data_prefix,
        FLAGS.wiki_revision_max_page_size_exp, self.approx_vocab_size,
        self.strip)

  random.seed(123)
  if task_id < FLAGS.wiki_revision_num_train_shards:
    out_file = self.training_filepaths(
        data_dir, FLAGS.wiki_revision_num_train_shards,
        shuffled=False)[task_id]
  else:
    out_file = self.dev_filepaths(
        data_dir, FLAGS.wiki_revision_num_dev_shards,
        shuffled=False)[task_id - FLAGS.wiki_revision_num_train_shards]

  tf.logging.info("Generating files for path: %s", out_file)
  self.corpus_files = wiki_revision_utils.corpus_files_for_shard(
      task_id, FLAGS.wiki_revision_num_train_shards,
      FLAGS.wiki_revision_num_dev_shards, FLAGS.wiki_revision_data_prefix)
  example_generator = self.generator(encoder, self.corpus_files, tmp_dir)

  packed_example_generator = self._maybe_pack_examples(example_generator)
  generator_utils.generate_files(packed_example_generator, [out_file])
  generator_utils.shuffle_dataset([out_file])

  tf.logging.info(
      "Job stats: identity examples: {}, total examples {}, ratio: {}".format(
          self.num_identity_examples, self.num_total_examples,
          (1 + self.num_identity_examples) / (1 + self.num_total_examples)))

  job_stats_string = self.aggregate_job_stats()
  out_dir, filename = out_file.replace("-unshuffled", "").rsplit("/", 1)
  stats_prefix = "/stats_"
  stats_file_path = "".join([out_dir, stats_prefix, filename])
  if tf.gfile.Exists(
      stats_file_path) and tf.gfile.Open(stats_file_path).size() != 0:
    tf.logging.info("Skipping writing stats because output file exists.")
  else:
    with tf.gfile.Open(stats_file_path, "w") as out:
      tf.logging.info("Writing job stats to {}".format(stats_file_path))
      out.write(job_stats_string)

  tf.logging.info(job_stats_string)
def main(_):
  train_shards = 10
  dev_shards = 1
  pred_shards = 1

  user_item_train_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-train-000{1}-of-00{2}".format(FLAGS.user_item, i, train_shards))
      for i in range(train_shards)
  ]
  user_item_dev_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-dev-000{1}-of-00{2}".format(FLAGS.user_item, i, dev_shards))
      for i in range(dev_shards)
  ]
  user_item_pred_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-pred-000{1}-of-00{2}".format(FLAGS.user_item, i, pred_shards))
      for i in range(pred_shards)
  ]

  user_item_train_generator = user_item_example_generator(
      FLAGS.tmp_dir, "train")
  user_item_dev_generator = user_item_example_generator(FLAGS.tmp_dir, "dev")
  user_item_pred_generator = user_item_example_generator(
      FLAGS.tmp_dir, "pred")

  generator_utils.generate_files(
      user_item_train_generator, user_item_train_file_names, cycle_every_n=10)
  generator_utils.generate_files(
      user_item_dev_generator, user_item_dev_file_names, cycle_every_n=10)
  generator_utils.generate_files(
      user_item_pred_generator, user_item_pred_file_names, cycle_every_n=10)

  generator_utils.shuffle_dataset(user_item_train_file_names)
  generator_utils.shuffle_dataset(user_item_dev_file_names)
  generator_utils.shuffle_dataset(user_item_pred_file_names)

  also_view_train_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-train-000{1}-of-00{2}".format(FLAGS.also_view, i, train_shards))
      for i in range(train_shards)
  ]
  also_view_dev_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-dev-000{1}-of-00{2}".format(FLAGS.also_view, i, dev_shards))
      for i in range(dev_shards)
  ]
  also_view_pred_file_names = [
      os.path.join(
          FLAGS.data_dir,
          "{0}-pred-000{1}-of-00{2}".format(FLAGS.also_view, i, pred_shards))
      for i in range(pred_shards)
  ]

  also_view_train_generator = also_view_example_generator(
      FLAGS.tmp_dir, "train")
  also_view_dev_generator = also_view_example_generator(FLAGS.tmp_dir, "dev")
  also_view_pred_generator = also_view_example_generator(
      FLAGS.tmp_dir, "pred")

  generator_utils.generate_files(
      also_view_train_generator, also_view_train_file_names, cycle_every_n=10)
  generator_utils.generate_files(
      also_view_dev_generator, also_view_dev_file_names, cycle_every_n=10)
  generator_utils.generate_files(
      also_view_pred_generator, also_view_pred_file_names, cycle_every_n=10)

  generator_utils.shuffle_dataset(also_view_train_file_names)
  generator_utils.shuffle_dataset(also_view_dev_file_names)
  generator_utils.shuffle_dataset(also_view_pred_file_names)
def generate_data(self, data_dir, tmp_dir, task_id=-1):
  test = 20
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  try_num = 0
  if test:
    try_num = test

  manager = Manager()
  lock = manager.Lock()
  # shared_dict = manager.dict({'current_id': id_init,
  #                             "current_last_updated": 0,
  #                             "record_num": 0, "source_index": 0})

  def process_files(train_paths, datasets, num_run, shared_dict):
    total_file_num = len(train_paths)
    num_per_partition = int(math.floor(total_file_num / num_run))
    train_paths_list = []
    for i in range(num_run):
      if i == num_run - 1:
        train_paths_list.append(train_paths[i * num_per_partition:])
      else:
        train_paths_list.append(
            train_paths[i * num_per_partition:(i + 1) * num_per_partition])
    generator_list = []
    for i in range(num_run):
      generator_list.append(
          self.generator(
              data_dir, tmp_dir, datasets, lock, shared_dict,
              how_many=try_num))
    p = []
    for i in range(num_run):
      p.append(
          Process(
              target=generator_utils.generate_files,
              args=(generator_list[i], train_paths_list[i], try_num)))
      p[i].start()
    my_logger.error("Time: {} All processes started".format(
        str(datetime.datetime.now())))
    for q in p:
      q.join()
    my_logger.error("Time: {} All processes ended".format(
        str(datetime.datetime.now())))

  shared_dict = manager.dict({
      'current_id': id_init,
      "current_last_updated": 0,
      "record_num": 0,
      "source_index": 0
  })
  num_run = min(self.process_num, self.num_shards)
  process_files(train_paths, self.train_sources, num_run, shared_dict)

  if len(self.eval_sources) == 0:
    generator_utils.shuffle_dataset(train_paths)
  else:
    shared_dict["current_id"] = id_init
    shared_dict["current_last_updated"] = 0
    shared_dict["record_num"] = 0
    shared_dict["source_index"] = 0
    num_run = min(self.process_num, self.num_dev_shards)
    my_logger.error("Time: {} process dev dataset".format(
        str(datetime.datetime.now())))
    process_files(dev_paths, self.eval_sources, num_run, shared_dict)
    my_logger.error("Time: {} shuffle dataset".format(
        str(datetime.datetime.now())))
    generator_utils.shuffle_dataset(train_paths + dev_paths)

  shared_dict["current_id"] = id_init
  shared_dict["current_last_updated"] = 0
  shared_dict["record_num"] = 0
  shared_dict["source_index"] = 0
  num_run = min(self.process_num, self.num_test_shards)
  process_files(test_paths, self.test_sources, num_run, shared_dict)
import youtokentome as yttm

MAX_LEN = 128
bpe = yttm.BPE(model="models/model_ruzh_47k.yttm")


def gen(path):
  dfs = spark.read.load(path)
  for row in dfs.head(1000000000):
    sample = {
        "inputs": bpe.encode([row.zh], eos=True)[0][:MAX_LEN],
        "targets": bpe.encode([row.ru], eos=True)[0][:MAX_LEN]
    }
    yield sample


total = 32
for i in range(0, total):
  generator_utils.generate_files(
      gen("hdfs://ryzen:9000/user/root/dataset/mt/shuffled-ru-zh.parquet/part-%05d-*" % i),
      ["tfrecords/translate_zhru-train-%05d-of-%05d-unshuffled" % (i, total)])

all_paths = [
    "tfrecords/translate_zhru-train-%05d-of-%05d-unshuffled" % (i, total)
    for i in range(total)
]
generator_utils.shuffle_dataset(all_paths)
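# A minimal usage sketch, not taken from the examples above: in tensor2tensor
# a registered Problem's generate_data is usually invoked through the problems
# registry (e.g. from a t2t-datagen-style script). The problem name and the
# directory paths below are hypothetical placeholders.
from tensor2tensor import problems

prob = problems.problem("translate_zhru")  # hypothetical registered name
prob.generate_data("/path/to/data_dir", "/path/to/tmp_dir", task_id=-1)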