def list_files(datapath, days, match_pattern):
  with ops.name_scope("list_files"):
    file_pattern = []
    for day in days:
      file_pattern.append(os.path.join(datapath, day, match_pattern))
    file_pattern = ops.convert_to_tensor(
        file_pattern, dtype=dtypes.string, name="file_pattern")
    matching_files = gen_io_ops.matching_files(file_pattern)

    # Raise an exception if `file_pattern` does not match any files.
    condition = math_ops.greater(array_ops.shape(matching_files)[0], 0,
                                 name="match_not_empty")
    message = math_ops.add(
        "No files matched pattern: ",
        string_ops.reduce_join(file_pattern, separator=", "),
        name="message")

    assert_not_empty = control_flow_ops.Assert(
        condition, [message], summarize=1, name="assert_not_empty")
    with ops.control_dependencies([assert_not_empty]):
      matching_files = array_ops.identity(matching_files)

    dataset = dataset_ops.Dataset.from_tensor_slices(matching_files)
    return dataset
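# Usage sketch (paths and day partitions below are hypothetical). The same
# behavior -- one glob per day, failing fast when nothing matches -- is
# available through the public `tf.data.Dataset.list_files`, which accepts a
# list of patterns and raises InvalidArgumentError on an empty match.
import os
import tensorflow as tf

datapath = "/data/logs"               # hypothetical root directory
days = ["2021-01-01", "2021-01-02"]   # hypothetical day partitions
patterns = [os.path.join(datapath, day, "*.tfrecord") for day in days]

dataset = tf.data.Dataset.list_files(patterns, shuffle=False)
for filename in dataset.take(2):
    print(filename.numpy())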
def _count_num_records(self):
    """Counts the number of non-empty lines (the data samples) in the
    data_files. This function is called from get_size the first time.

    :return int: the number of non-empty lines in the data_files
    """
    # TODO in TF 1.3 use:
    # dataset = Dataset.list_files(self.data_files_pattern).repeat(1)
    from tensorflow.python.ops import gen_io_ops
    dataset = Dataset.from_tensor_slices(
        gen_io_ops.matching_files(self.data_files_pattern)).repeat(1)
    files = self._read_files_once(dataset)
    with tf.Graph().as_default():
        dataset = self.dataset_class(files).repeat(1)
        samples = 0
        next_element = dataset.make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            try:
                # Run until the one-shot iterator is exhausted.
                while True:
                    sess.run(next_element)
                    samples += 1
            except tf.errors.OutOfRangeError:
                pass
        return samples
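# For reference, a minimal sketch of the same count in TF 2.x eager mode,
# assuming `dataset` yields one element per sample; it avoids the
# Session-draining loop above.
import tensorflow as tf

def count_records(dataset: tf.data.Dataset) -> int:
    # reduce carries a running int64 counter across the whole dataset
    return int(dataset.reduce(tf.constant(0, tf.int64),
                              lambda count, _: count + 1))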
def stateless_list_files(file_pattern, shuffle=None, seed=None):
  """A dataset of all files matching one or more glob patterns.

  Note that if `shuffle` is enabled, a stateless shuffle implementation is
  used, so the returned dataset supports the TF1 compatibility API
  `tf.data.make_one_shot_iterator()` in TF2.

  Example:
    >>> dataset = tf.stateless_list_files("some_file_pattern")

  Args:
    file_pattern: A string, a list of strings, or a `tf.Tensor` of string type
      (scalar or vector), representing the filename glob (i.e. shell wildcard)
      pattern(s) that will be matched.
    shuffle: (Optional.) If `True`, the file names will be shuffled randomly
      based on a stateless implementation. Defaults to `True`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
      seed that will be used to create the distribution. See
      `tf.random.set_seed` for behavior.

  Returns:
    Dataset: A `Dataset` of strings corresponding to file names.
  """
  with ops.name_scope("list_files"):
    if shuffle is None:
      shuffle = True
    file_pattern = ops.convert_to_tensor(
        file_pattern, dtype=dtypes.string, name="file_pattern")
    matching_files = gen_io_ops.matching_files(file_pattern)

    # Raise an exception if `file_pattern` does not match any files.
    condition = math_ops.greater(array_ops.shape(matching_files)[0], 0,
                                 name="match_not_empty")
    message = math_ops.add(
        "No files matched pattern: ",
        string_ops.reduce_join(file_pattern, separator=", "),
        name="message")

    assert_not_empty = control_flow_ops.Assert(
        condition, [message], summarize=1, name="assert_not_empty")
    with ops.control_dependencies([assert_not_empty]):
      matching_files = array_ops.identity(matching_files)

    dataset = dataset_ops.Dataset.from_tensor_slices(matching_files)
    if shuffle:
      buffer_size = math_ops.maximum(
          array_ops.shape(matching_files, out_type=dtypes.int64)[0], 1)
      # Use a stateless shuffled dataset so one-shot iterators work in TF2.
      dataset = dataset.apply(
          stateless_shuffle_dataset(buffer_size, seed=seed))
    return dataset
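# A hedged approximation with only public APIs: shuffle the matched filenames
# outside the graph with a fixed seed, then build the dataset from the
# already-shuffled list. No stateful shuffle op is created, so TF1-style
# one-shot iterators keep working. `stateless_shuffle_dataset` above is
# assumed to be defined elsewhere in the original codebase.
import numpy as np
import tensorflow as tf

def list_files_stateless(pattern, seed=42):
    files = sorted(tf.io.gfile.glob(pattern))  # deterministic base order
    if not files:
        raise ValueError("No files matched pattern: %s" % pattern)
    np.random.RandomState(seed).shuffle(files)  # seeded, reproducible shuffle
    return tf.data.Dataset.from_tensor_slices(files)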
def list_files(file_pattern):
  """A dataset of all files matching a pattern.

  Example:
    If we had the following files on our filesystem:
      - /path/to/dir/a.txt
      - /path/to/dir/b.py
      - /path/to/dir/c.py
    If we pass "/path/to/dir/*.py" as the directory, the dataset would
    produce:
      - /path/to/dir/b.py
      - /path/to/dir/c.py

  Args:
    file_pattern: A string or scalar string `tf.Tensor`, representing
      the filename pattern that will be matched.

  Returns:
    A `Dataset` of strings corresponding to file names.
  """
  return Dataset.from_tensor_slices(
      gen_io_ops.matching_files(file_pattern))
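# Usage sketch for the simple list_files above (directory contents are
# hypothetical). `tf.io.matching_files` is the public wrapper around
# gen_io_ops.matching_files; note this variant does not raise when nothing
# matches -- it simply yields an empty dataset.
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(
    tf.io.matching_files("/path/to/dir/*.py"))
for f in dataset:
    print(f.numpy())  # b'/path/to/dir/b.py', b'/path/to/dir/c.py'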
def read(self, batch_size, num_epochs=1, shuffle=False, task_spec=None):
    """Reads the data and returns a tuple of (inputs, outputs).

    :param batch_size: the batch size of the returned inputs/outputs
    :param num_epochs: the number of epochs to read the dataset
    :param shuffle: whether to shuffle the data or not
    :param task_spec: the task spec of the training. It helps to know
        whether this is distributed training or not
    :return: The result of calling dataset.make_one_shot_iterator().get_next()
    """
    # create the dataset of files with the data
    # TODO in TF 1.3 use: dataset = Dataset.list_files(self.data_files_pattern)
    from tensorflow.python.ops import gen_io_ops
    dataset = Dataset.from_tensor_slices(
        gen_io_ops.matching_files(self.data_files_pattern))
    if shuffle:
        # read one sample per file
        # TODO in TF 1.3 use:
        # dataset = dataset.interleave(self.dataset_class,
        #     # number of readers the same as number of CPUs
        #     cycle_length=multiprocessing.cpu_count() + 1,
        #     # block size is 1 to get directly a flat map
        #     block_length=1)
        files = self._read_files_once(dataset)
        import random
        random.shuffle(files)
        dataset = self.dataset_class(files)
    else:
        # reads files sequentially
        files = self._read_files_once(dataset)
        dataset = self.dataset_class(files)
    # set the number of epochs
    dataset = dataset.repeat(num_epochs)

    if task_spec and task_spec.num_workers > 1:
        # split the dataset in shards
        # TODO in TF 1.4 use:
        # dataset = dataset.shard(task_spec.num_workers, task_spec.index)
        from tensorflow.python.ops import math_ops

        def filter_fn(elem_index, _):
            mod_result = math_ops.mod(elem_index, task_spec.num_workers)
            return math_ops.equal(mod_result, task_spec.index)

        dataset = dataset.enumerate().filter(filter_fn).map(
            lambda _, elem: elem)

    if shuffle:
        # shuffle the samples
        if self.shuffle_size is None:
            raise ValueError('shuffle_size has not been set')
        dataset = dataset.shuffle(buffer_size=self.shuffle_size)

    # process each example. We check the method is defined in the child class:
    if self._flat_map.__func__ not in TFDataSet.__dict__.values():
        dataset = dataset.flat_map(self._flat_map)
    if self._map.__func__ not in TFDataSet.__dict__.values():
        dataset = dataset.map(
            self._map,
            # use as many threads as CPUs + 1
            # TODO in TF 1.4 use:
            # num_parallel_calls=multiprocessing.cpu_count() + 1,
            num_threads=multiprocessing.cpu_count() + 1,
            # buffer the data as CPUs * batch_size + minimum_size
            output_buffer_size=batch_size * multiprocessing.cpu_count() +
            self.min_queue_examples)

    if self.padded_shapes is not None:
        dataset = dataset.padded_batch(batch_size, self.padded_shapes,
                                       self.padded_values)
    else:
        dataset = dataset.batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()
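# Hedged sketch of the same pipeline once the APIs named in the TODOs exist
# (TF 1.4+). Names like `dataset_class`, `map_fn`, and `shuffle_size` are
# stand-ins for the attributes used in read() above; `dataset_class` is
# assumed to map a filename to a Dataset, as in the original class.
import multiprocessing
import tensorflow as tf

def read_v2(data_files_pattern, dataset_class, map_fn, batch_size,
            num_epochs=1, shuffle=False, shuffle_size=10000,
            num_workers=1, worker_index=0):
    dataset = tf.data.Dataset.list_files(data_files_pattern, shuffle=shuffle)
    # interleave replaces the manual "read one sample per file" shuffle
    dataset = dataset.interleave(
        dataset_class,
        cycle_length=multiprocessing.cpu_count() + 1,
        block_length=1)
    dataset = dataset.repeat(num_epochs)
    if num_workers > 1:
        # shard replaces the enumerate/filter/map workaround
        dataset = dataset.shard(num_workers, worker_index)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=shuffle_size)
    # num_parallel_calls replaces num_threads/output_buffer_size
    dataset = dataset.map(
        map_fn, num_parallel_calls=multiprocessing.cpu_count() + 1)
    return dataset.batch(batch_size)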