def read_data_sets(train_dir,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None,
                   source_url=None):  # omit url since we are using our own dataset
    # type: DataSets
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 'test-images-idx3-ubyte.gz'
    TEST_LABELS = 'test-labels-idx1-ubyte.gz'

    # Omit the url; local_file will be a path.
    local_file = base.maybe_download(TRAIN_IMAGES, train_dir, None)
    with gfile.Open(local_file, 'rb') as f:
        train_images = mnist_module.extract_images(f)
    print(train_images.shape)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir, None)  # omit url
    with gfile.Open(local_file, 'rb') as f:
        train_labels = mnist_module.extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir, None)  # omit url
    with gfile.Open(local_file, 'rb') as f:
        test_images = mnist_module.extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir, None)  # omit url
    with gfile.Open(local_file, 'rb') as f:
        test_labels = mnist_module.extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = mnist_module.DataSet(train_images, train_labels, **options)
    validation = mnist_module.DataSet(validation_images, validation_labels, **options)
    test = mnist_module.DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
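
# A sketch of the imports and a call, assuming the TF 1.x contrib layout this file
# appears to target; the alias `mnist_module` is inferred from the calls above:
#
#   from tensorflow.contrib.learn.python.learn.datasets import base
#   from tensorflow.contrib.learn.python.learn.datasets import mnist as mnist_module
#   from tensorflow.python.framework import dtypes
#   from tensorflow.python.platform import gfile
#
#   data = read_data_sets('my-data', one_hot=True)
#   images, labels = data.train.next_batch(50)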
def apply_gaussian_to_dataset(dataset):
    # Blur every split, then rebuild a Datasets namedtuple holding the blurred copies.
    Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test'])
    train_images = apply_gaussian_filter(dataset.train.images)
    test_images = apply_gaussian_filter(dataset.test.images)
    validation_images = apply_gaussian_filter(dataset.validation.images)
    train = mnist.DataSet(train_images, dataset.train.labels, reshape=False)
    test = mnist.DataSet(test_images, dataset.test.labels, reshape=False)
    validation = mnist.DataSet(validation_images, dataset.validation.labels,
                               reshape=False)
    return Datasets(train=train, validation=validation, test=test)
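
# `apply_gaussian_filter` is not defined in this file. A minimal sketch, assuming the
# images arrive as flattened float32 rows of 28x28 pixels and that a scipy-based blur
# is intended; the default sigma is a hypothetical choice:
import numpy as np
from scipy.ndimage import gaussian_filter


def apply_gaussian_filter(images, sigma=1.0):
    # Blur each image independently, preserving the flattened (N, 784) layout.
    blurred = np.empty_like(images)
    for i, row in enumerate(images):
        blurred[i] = gaussian_filter(row.reshape(28, 28), sigma=sigma).reshape(-1)
    return blurred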
def convert_to_data_sets(data_gzs,
                         one_hot=False,
                         dtype=dtypes.float32,
                         reshape=True,
                         validation_size=5000,
                         seed=None):
    """Modified version of
    tensorflow/tensorflow/contrib/learn/python/learn/datasets/mnist.py
    """
    with gfile.Open(data_gzs['train-images'][0], 'rb') as f:
        train_images = tf_mnist.extract_images(f)
    with gfile.Open(data_gzs['train-labels'][0], 'rb') as f:
        train_labels = tf_mnist.extract_labels(f, one_hot=one_hot)
    with gfile.Open(data_gzs['t10k-images'][0], 'rb') as f:
        test_images = tf_mnist.extract_images(f)
    with gfile.Open(data_gzs['t10k-labels'][0], 'rb') as f:
        test_labels = tf_mnist.extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = tf_mnist.DataSet(train_images, train_labels, **options)
    validation = tf_mnist.DataSet(validation_images, validation_labels, **options)
    test = tf_mnist.DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
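
# A sketch of the expected `data_gzs` layout, inferred from the subscripting above:
# each key maps to a sequence whose first element is the local path of the gzip
# file (the paths here are hypothetical):
#
#   data_gzs = {
#       'train-images': ['data/train-images-idx3-ubyte.gz'],
#       'train-labels': ['data/train-labels-idx1-ubyte.gz'],
#       't10k-images': ['data/t10k-images-idx3-ubyte.gz'],
#       't10k-labels': ['data/t10k-labels-idx1-ubyte.gz'],
#   }
#   datasets = convert_to_data_sets(data_gzs, one_hot=True)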
def extract_n_data_sets(datasets, label=(1, 2, 3)):
    """Extract the examples whose one-hot label is one of `label` as a new DataSet."""
    images = datasets.images
    labels = datasets.labels
    extract_images = []
    extract_labels = []
    for i in range(datasets.num_examples):
        if np.argmax(labels[i]) in label:
            extract_images.append(images[i])
            extract_labels.append(labels[i])
    cnt = len(extract_images)
    extract_images = np.asarray(extract_images, dtype=np.float32)
    extract_labels = np.asarray(extract_labels)
    # dtype=uint8 tells DataSet not to rescale: the images are already float32.
    return mnist.DataSet(extract_images.reshape(cnt, 784),
                         extract_labels.reshape(cnt, 10),
                         dtype=dtypes.uint8, reshape=False)
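
# Hypothetical usage: keep only the digits 1-3 from a one-hot encoded split.
#
#   subset = extract_n_data_sets(datasets.test, label=(1, 2, 3))
#   print(subset.num_examples)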
def ConvertImg(imgFolder):
    RawImgSize = (512, 512)
    if not os.path.isdir(imgFolder):
        logging.warning("Raw image folder doesn't exist")
        return None
    train_directory = os.path.join(imgFolder)
    dirnames = [entry for entry in os.listdir(train_directory)
                if os.path.isdir(os.path.join(train_directory, entry))]
    arr = []
    label = []
    for dirname in dirnames:
        for file in os.listdir(os.path.join(train_directory, dirname)):
            # Read the file as a grayscale image.
            img = Image.open(os.path.join(train_directory, dirname, file)).convert('L')
            if img.size != RawImgSize:
                print('Error on Image Size !=', RawImgSize)
            else:
                # The label is the folder name.
                label.append(dirname)
                for i in range(RawImgSize[0]):
                    for j in range(RawImgSize[1]):
                        arr.append(float(img.getpixel((j, i))))
    # 'arr' is a flat 1-D vector; reshape it to (num_files, rows, cols, 1), then
    # wrap images and labels in the MNIST default 'DataSet' class and return the
    # MNIST-like dataset.
    train_labels = np.array(label)
    train_images = np.array(arr).reshape(
        (len(label), RawImgSize[0], RawImgSize[1], 1))
    options = dict(dtype=dtypes.float32, reshape=True, seed=None)
    return mnist.DataSet(train_images, train_labels, **options)
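
# The per-pixel getpixel loop above is very slow on 512x512 images. A vectorized
# equivalent (same row-major pixel order) via numpy, as a sketch:
#
#   pixels = np.asarray(img, dtype=np.float32)  # shape (height, width)
#   arr.extend(pixels.ravel())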
def select_data(n=10, expand_with_deform=0, train_dir='MNIST-data'):
    """Extracts a subset of MNIST train data.

    n is the number of examples per digit/class. Each pass of
    `expand_with_deform` appends a deformed copy of every selected image,
    so expand_with_deform=1 doubles the dataset size.

    Returns (normal_dataset, expanded_dataset).
    """
    train_images, train_labels = load_mnist_data(train_dir)
    numbers = [[] for _ in range(10)]  # 10 classes, 10 index lists.

    # Take n datapoints for each digit.
    for i in range(len(train_labels)):
        if sum(len(x) for x in numbers) == 10 * n:
            break
        number = np.where(train_labels[i] == 1)[0][0]
        if len(numbers[number]) < n:
            numbers[number].append(i)

    # Scramble the subset. 'numbers' contains indices into train_labels.
    numbers = np.asarray(numbers).reshape(10 * n)
    np.random.shuffle(numbers)

    # Actually retrieve the subset.
    subset_images = [train_images[i] for i in numbers]
    subset_labels = [train_labels[i] for i in numbers]

    options = dict(dtype=dtypes.float32, reshape=True, seed=None)
    # Construct the normal dataset.
    normal = mnist.DataSet(np.asarray(subset_images),
                           np.asarray(subset_labels), **options)

    for _ in range(expand_with_deform):
        print("Deforming all 'train' images..")
        count = 0
        for i in numbers:
            shape = train_images[i].shape
            image = train_images[i].reshape((28, 28))
            new_image = ed.rotate(image)
            new_image = ed.translate(new_image)
            new_image = ed.deform(new_image)
            subset_images.append(new_image.reshape(shape))
            subset_labels.append(train_labels[i])
            count += 1
            print('Processed image {}'.format(count), end='\r')
        print("\nDeformation done.")

    # Shuffle the expanded set.
    subset_images = np.asarray(subset_images)
    subset_labels = np.asarray(subset_labels)
    perm = np.arange(len(subset_images))
    np.random.shuffle(perm)
    subset_images = subset_images[perm]
    subset_labels = subset_labels[perm]
    expanded = mnist.DataSet(subset_images, subset_labels, **options)

    return normal, expanded
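
# `load_mnist_data` and the deformation module `ed` are defined elsewhere. A minimal
# sketch of the loader, assuming it returns flattened one-hot training arrays (which
# matches the reshape((28, 28)) and one-hot lookup above):
from tensorflow.examples.tutorials.mnist import input_data


def load_mnist_data(train_dir):
    data = input_data.read_data_sets(train_dir, one_hot=True)
    return data.train.images, data.train.labels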
def read_semeion(fname='semeion/semeion.data'):
    with open(fname, 'r') as f:
        lines = f.readlines()
    width = 16
    height = 16
    size = width * height
    classes = 10
    images = []
    labels = []
    for line in lines:
        data = line.split(' ')
        image = [int(float(data[i])) for i in range(size)]
        images.append(image)
        label = [int(float(data[i])) for i in range(size, size + classes)]
        labels.append(label)

    # Upscale each 16x16 image to 28x28 so it matches the MNIST shape.
    for i in range(len(images)):
        ii = scale(numpy.reshape(images[i], (width, height)), 28, 28)
        images[i] = numpy.reshape(ii, (28, 28, 1))
    width = 28
    height = 28

    # Shuffle the data.
    indexes = list(range(len(images)))
    random.shuffle(indexes)
    images = [images[i] for i in indexes]
    labels = [labels[i] for i in indexes]
    for i in range(len(labels)):
        labels[i] = numpy.reshape(labels[i], (10,))

    train_samples = 1400
    val_samples = 120
    test_samples = 73  # 1400 + 120 + 73 = 1593 examples in total

    # Train set
    image_train = numpy.array(images[:train_samples], dtype=numpy.float32)
    image_train = image_train.reshape(train_samples, width, height, 1)
    label_train = numpy.array(labels[:train_samples], dtype=numpy.float32)

    # Validation set
    image_val = numpy.array(images[train_samples:train_samples + val_samples],
                            dtype=numpy.float32)
    image_val = image_val.reshape(val_samples, width, height, 1)
    label_val = numpy.array(labels[train_samples:train_samples + val_samples],
                            dtype=numpy.float32)

    # Test set
    image_test = numpy.array(images[train_samples + val_samples:],
                             dtype=numpy.float32)
    image_test = image_test.reshape(test_samples, width, height, 1)
    label_test = numpy.array(labels[train_samples + val_samples:],
                             dtype=numpy.float32)

    options = dict(dtype=dtypes.float32, reshape=True, seed=None)
    train = mnist.DataSet(image_train, label_train, **options)
    validation = mnist.DataSet(image_val, label_val, **options)
    test = mnist.DataSet(image_test, label_test, **options)
    return base.Datasets(train=train, validation=validation, test=test)
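
# The `scale` helper used above (and in read_opt below) is not defined in this file.
# A plausible sketch using scipy's zoom; the interpolation order is an assumption:
from scipy.ndimage import zoom


def scale(image, new_width, new_height):
    # Resize a 2-D array to (new_height, new_width).
    h, w = image.shape
    return zoom(image, (new_height / h, new_width / w), order=1)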
def read_opt(fname='optical/optdigits_csv.csv'):
    with open(fname, 'r') as f:
        lines = f.readlines()
    lines = lines[1:]  # skip the CSV header row
    width = 8
    height = 8
    size = width * height
    images = []
    labels = []
    for line in lines:
        data = line.split(',')
        image = [int(float(data[i])) for i in range(size)]
        images.append(image)
        # The last column is the class index; turn it into a one-hot vector.
        label = numpy.zeros((10,))
        label[int(data[-1])] = 1
        labels.append(label)

    # Upscale each 8x8 image to 28x28 and flatten it back to a vector.
    images_scale = []
    for i in range(len(images)):
        im_8 = numpy.reshape(images[i], (8, 8))
        im_reshape = scale(im_8, 28, 28)
        images_scale.append(numpy.reshape(im_reshape, -1))
    images = images_scale

    # Shuffle the data.
    indexes = list(range(len(images)))
    random.shuffle(indexes)
    images = [images[i] for i in indexes]
    labels = [labels[i] for i in indexes]

    width = 28
    height = 28
    train_samples = 1400
    val_samples = 1400
    test_samples = 2800

    # Train set
    image_train = numpy.array(images[:train_samples], dtype=numpy.float32)
    image_train = image_train.reshape(train_samples, width, height, 1)
    label_train = numpy.array(labels[:train_samples], dtype=numpy.float32)

    # Validation set
    image_val = numpy.array(images[train_samples:train_samples + val_samples],
                            dtype=numpy.float32)
    image_val = image_val.reshape(val_samples, width, height, 1)
    label_val = numpy.array(labels[train_samples:train_samples + val_samples],
                            dtype=numpy.float32)

    # Test set
    image_test = numpy.array(
        images[train_samples + val_samples:train_samples + val_samples + test_samples],
        dtype=numpy.float32)
    image_test = image_test.reshape(test_samples, width, height, 1)
    label_test = numpy.array(
        labels[train_samples + val_samples:train_samples + val_samples + test_samples],
        dtype=numpy.float32)

    options = dict(dtype=dtypes.float32, reshape=True, seed=None)
    train = mnist.DataSet(image_train, label_train, **options)
    validation = mnist.DataSet(image_val, label_val, **options)
    test = mnist.DataSet(image_test, label_test, **options)
    return base.Datasets(train=train, validation=validation, test=test)
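
# Both read_semeion and read_opt return the same Datasets namedtuple as the TF MNIST
# loader, so downstream code can consume them identically (hypothetical usage):
#
#   data = read_opt()
#   batch_images, batch_labels = data.train.next_batch(100)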
def fake():
    # Build an empty, fake DataSet; `one_hot`, `dtype` and `seed` are free
    # variables resolved from the enclosing scope.
    return mnist.DataSet([], [], fake_data=True, one_hot=one_hot,
                         dtype=dtype, seed=seed)
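
# In the upstream tf mnist module this kind of helper backs the fake_data path of
# read_data_sets; a sketch of how it is typically wired up there:
#
#   if fake_data:
#       train = fake()
#       validation = fake()
#       test = fake()
#       return base.Datasets(train=train, validation=validation, test=test)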