def __call__(self, params):
    """Input function which provides a single batch for train or eval."""
    # Retrieves the batch size for the current shard. The number of shards
    # is computed according to the input pipeline deployment. See
    # `tf.contrib.tpu.RunConfig` for details.
    batch_size = params['batch_size']
    logging.info('call ToyModelInput() with batch size {}'.format(batch_size))

    # Repeat indefinitely, drop the ragged final batch (TPUs require static
    # batch shapes), and prefetch to overlap host and device work.
    ds = Dataset.from_tensor_slices((self._images, self._labels)).repeat()
    dataset = ds.batch(batch_size, drop_remainder=True).prefetch(2)

    return dataset
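# --- Hedged usage sketch (assumption, not part of the original source) ---
# Shows how an input class exposing the __call__ above is typically wired
# into a TF 1.x TPUEstimator, which fills in params['batch_size'] for each
# shard. `ToyModelInput`, `model_fn`, `images`, `labels` and `master` are
# assumed to exist elsewhere; treat this as a sketch, not the actual setup.
def _sketch_train_with_tpu_estimator(model_fn, images, labels, master=''):
    import tensorflow as tf

    run_config = tf.contrib.tpu.RunConfig(
        master=master,
        tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=100))
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        config=run_config,
        use_tpu=bool(master),       # fall back to CPU/GPU when no TPU master
        train_batch_size=64,
        eval_batch_size=64)
    # ToyModelInput(images, labels) holds the in-memory arrays; its __call__
    # above serves as the Estimator input_fn.
    estimator.train(input_fn=ToyModelInput(images, labels), max_steps=1000)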
def _get_dataset(self, x, y, name_scope='data'):
    '''
    Build the training input pipeline.
    :param name_scope: name scope under which the pipeline ops are created.
    :param x: input features (e.g. images) to slice into examples.
    :param y: labels aligned with `x`.
    :return: (iterator, next_op) -- an initializable iterator (run
        `iterator.initializer` before use) and the op yielding the next
        (x, y) batch.
    '''
    batch_size = self._batch_size
    with tf.name_scope(name=name_scope):
        dataset = Dataset.from_tensor_slices((x, y))
        # Shuffle, repeat indefinitely, apply the per-example preprocessing
        # function `map_fn` (defined elsewhere in the module), batch, and
        # prefetch to keep the pipeline ahead of the consumer.
        dataset = dataset.shuffle(buffer_size=batch_size * 10) \
            .repeat() \
            .map(map_func=map_fn, num_parallel_calls=8) \
            .batch(batch_size=batch_size) \
            .prefetch(buffer_size=batch_size * 10)
        iterator = dataset.make_initializable_iterator(shared_name='init_op')
        next_op = iterator.get_next(name='next_op')
        return iterator, next_op
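# --- Hedged usage sketch (assumption, not part of the original source) ---
# Shows one way the (iterator, next_op) pair returned by _get_dataset above
# could be consumed in a TF 1.x session. `builder` (an instance of the class
# owning _get_dataset), `x_data` and `y_data` are assumptions made for
# illustration only.
def _sketch_consume_dataset(builder, x_data, y_data, num_steps=10):
    import tensorflow as tf

    iterator, next_op = builder._get_dataset(x_data, y_data)
    with tf.Session() as sess:
        # The iterator is initializable, so it must be initialized first.
        sess.run(iterator.initializer)
        for _ in range(num_steps):
            x_batch, y_batch = sess.run(next_op)  # one (x, y) batch per call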
def __init__(self, mode, batch_size, num_classes, shuffle=True,
             buffer_size=1000):
    """Create a new ImageDataGenerator.

    Receives a path string to a text file, which consists of many lines,
    where each line has first a path string to an image and, separated by
    a space, an integer referring to the class number. Using this data,
    this class will create TensorFlow datasets that can be used to train
    e.g. a convolutional neural network.

    Args:
        mode: Either 'training' or 'inference'. Depending on this value,
            different parsing functions will be used.
        batch_size: Number of images per batch.
        num_classes: Number of classes in the dataset.
        shuffle: Whether or not to shuffle the data in the dataset and the
            initial file list.
        buffer_size: Number of images used as buffer for TensorFlow's
            shuffling of the dataset.

    Raises:
        ValueError: If an invalid mode is passed.
    """
    self.num_classes = num_classes

    # Read the 81 NUS-WIDE concept names and record the indices at which
    # they appear in the 1000-tag vocabulary (TagList1k.txt).
    self.duplicated_tag_list = []
    with open("/media/jiwunghyun/DATA/nus-wide/Concepts81.txt", 'r') as fp:
        tag_list = [fp.readline().split('\n')[0] for _ in range(81)]
    with open("/media/jiwunghyun/DATA/nus-wide/NUS_WID_Tags/TagList1k.txt",
              'r') as fp2:
        tag1000_list = [fp2.readline().split('\n')[0] for _ in range(1000)]
    for i in range(len(tag1000_list)):
        for j in range(len(tag_list)):
            if tag1000_list[i] == tag_list[j]:
                self.duplicated_tag_list.append(i)

    # retrieve the data from the text file; fail early on an invalid mode
    # so the ValueError promised in the docstring is actually raised.
    if mode == 'training':
        self._read_train_txt_file()
    elif mode == 'inference':
        self._read_test_txt_file()
    else:
        raise ValueError("Invalid mode '%s'." % mode)

    # number of samples in the dataset
    self.data_size = len(self.labels)

    # initial shuffling of the file and label lists (together!)
    if shuffle:
        self._shuffle_lists()

    # convert lists to TF tensors
    self.img_paths = convert_to_tensor(self.img_paths, dtype=dtypes.string)
    self.labels = convert_to_tensor(self.labels, dtype=dtypes.int32)
    self.noisy_tags = convert_to_tensor(self.noisy_tags, dtype=dtypes.int32)

    # create dataset
    data = Dataset.from_tensor_slices(
        (self.img_paths, self.labels, self.noisy_tags))

    # distinguish between train/infer. when calling the parsing functions
    if mode == 'training':
        data = data.map(self._parse_function_train)
    elif mode == 'inference':
        data = data.map(self._parse_function_inference)

    # shuffle the first `buffer_size` elements of the dataset
    if shuffle:
        data = data.shuffle(buffer_size=buffer_size)

    # create a new dataset with batches of images
    data = data.batch(batch_size)

    self.data = data
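# --- Hedged usage sketch (assumption, not part of the original source) ---
# Shows how the `self.data` tf.data.Dataset built above is often consumed
# with an initializable iterator in TF 1.x. The constructor arguments and
# the (image, label, noisy_tags) element structure follow the code above;
# everything else is an assumption for illustration.
def _sketch_iterate_image_data_generator(batch_size=32, num_classes=81):
    import tensorflow as tf

    train_gen = ImageDataGenerator(mode='training',
                                   batch_size=batch_size,
                                   num_classes=num_classes,
                                   shuffle=True)
    iterator = train_gen.data.make_initializable_iterator()
    next_batch = iterator.get_next()  # (images, labels, noisy_tags)

    with tf.Session() as sess:
        sess.run(iterator.initializer)
        images, labels, noisy_tags = sess.run(next_batch)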