def load_data(task_name=None):
    """Load the arrays used by the requested task.

    Args:
      task_name: str, 'mnist' (images only) or 'cmnist' (images together
        with one-hot encoded digit labels as inputs).

    Returns:
      Tuple (inputs, data_to_generate). Each array is min-max normalized and
      then moved to the range [-1, 1]. `inputs` is None for 'mnist'.

    Raises:
      ValueError: if `task_name` is not one of the known tasks.
    """
    assert task_name is not None

    def _to_symmetric_range(arr):
        # Min-max normalize to [0, 1], then stretch to [-1, 1].
        lowest, highest = arr.min(), arr.max()
        unit = (arr - lowest) / (highest - lowest)
        return (unit * 2) - 1

    if task_name not in ('mnist', 'cmnist'):
        raise ValueError('Unknown task: {}'.format(task_name))

    # Both tasks generate the same MNIST images.
    with open(MNIST_IMGS_PATH, 'rb') as imgs_file:
        data = extract_images(imgs_file)

    inputs = None
    if task_name == 'cmnist':
        with open(MNIST_LABELS_PATH, 'rb') as labels_file:
            digits = extract_labels(labels_file)
        # Row lookup in the identity matrix yields one-hot vectors.
        inputs = np.eye(10)[digits]

    data = _to_symmetric_range(data)
    if inputs is not None:
        inputs = _to_symmetric_range(inputs)
    return inputs, data
def maybe_download_minst(train_dir, SOURCE_URL, train=True, one_hot=True):
    """Fetch (if needed) and decode the four MNIST archives.

    All four archives are downloaded and decoded regardless of `train`; the
    flag only selects which pair is returned.

    Args:
      train_dir: directory handed to `base.maybe_download` for the archives.
      SOURCE_URL: URL prefix; the archive file name is appended to it.
      train: if true return the training pair, otherwise the test pair.
      one_hot: forwarded to `extract_labels`.

    Returns:
      Tuple (images, labels) for the selected split.
    """
    def _fetch(filename):
        return base.maybe_download(filename, train_dir, SOURCE_URL + filename)

    def _read_images(filename):
        with open(_fetch(filename), 'rb') as stream:
            return extract_images(stream)

    def _read_labels(filename):
        with open(_fetch(filename), 'rb') as stream:
            return extract_labels(stream, one_hot=one_hot)

    # Same order as before: train images, train labels, test images, test labels.
    train_images = _read_images('train-images-idx3-ubyte.gz')
    train_labels = _read_labels('train-labels-idx1-ubyte.gz')
    test_images = _read_images('t10k-images-idx3-ubyte.gz')
    test_labels = _read_labels('t10k-labels-idx1-ubyte.gz')

    if train:
        return train_images, train_labels
    return test_images, test_labels
def import_mnist():
    """Import MNIST and wrap it in objects of our DataSet class.

    Every returned DataSet pairs the processed images with themselves (the
    images are used as their own targets); the label archives are still
    downloaded and decoded but are not part of the result. The validation
    split is empty because VALIDATION_SIZE is 0.

    :return: tuple (data, test, val) of DataSet objects
    """
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    VALIDATION_SIZE = 0
    ONE_HOT = True
    TRAIN_DIR = 'MNIST_data'

    # The archives are gzip data: open them in binary mode and close the
    # handles deterministically instead of leaking them.
    local_file = base.maybe_download(TRAIN_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=ONE_HOT)

    local_file = base.maybe_download(TEST_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=ONE_HOT)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    ## Process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    ## Standardize data
    # The statistics are computed but standardization is currently disabled.
    train_mean, train_std = get_data_info(train_images)

    data = DataSet(train_images, train_images)
    test = DataSet(test_images, test_images)
    val = DataSet(validation_images, validation_images)

    return data, test, val
def import_mnist(validation_size=0):
    """Import MNIST and wrap it in objects of our DataSet class.

    Images are processed with `process_mnist` and standardized with the mean
    and standard deviation reported by `get_data_info` for the training split.

    :param validation_size: number of leading training examples that are
        moved out of the training split into the validation split
    :return: tuple (data, test, val) of DataSet objects
    """
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    ONE_HOT = True
    TRAIN_DIR = 'experiments/data/MNIST_data'

    # The archives are gzip data, so they must be opened in binary mode.
    local_file = base.maybe_download(TRAIN_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=ONE_HOT)

    local_file = base.maybe_download(TEST_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=ONE_HOT)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    # process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    # standardize data (statistics come from the training split only)
    train_mean, train_std = get_data_info(train_images)
    train_images = standardize_data(train_images, train_mean, train_std)
    validation_images = standardize_data(validation_images, train_mean,
                                         train_std)
    test_images = standardize_data(test_images, train_mean, train_std)

    data = DataSet(train_images, train_labels)
    test = DataSet(test_images, test_labels)
    val = DataSet(validation_images, validation_labels)

    return data, test, val
def load_data():
    """Read the four MNIST archives from ../../data/mnist/.

    Image arrays are passed through `np.squeeze`; labels are returned as
    produced by `extract_labels`.

    Returns:
      Tuple (train_images, train_labels, test_images, test_labels).
    """
    mnist_dir = '../../data/mnist/'

    def _decode(filename, extractor):
        with open(mnist_dir + filename, 'rb') as stream:
            return extractor(stream)

    train_images = np.squeeze(
        _decode('train-images-idx3-ubyte.gz', extract_images))
    train_labels = _decode('train-labels-idx1-ubyte.gz', extract_labels)
    test_images = np.squeeze(
        _decode('t10k-images-idx3-ubyte.gz', extract_images))
    test_labels = _decode('t10k-labels-idx1-ubyte.gz', extract_labels)
    return train_images, train_labels, test_images, test_labels
def read_data_sets(
    fake_data=False,
    one_hot=False,
    dtype=dtypes.float32,
    reshape=True,
    validation_size=5000,
    seed=None,
):
    """Build train/validation/test DataSets from locally configured archives.

    The archive locations come from the names `train_data_dir`,
    `train_labels_dir`, `eval_data_dir` and `eval_labels_dir`, which are not
    defined in this function.

    Args:
      fake_data: if true, skip all file access and return three fake DataSets.
      one_hot: forwarded to `extract_labels` (and to the fake DataSets).
      dtype, reshape, seed: forwarded to the DataSet constructor.
      validation_size: number of leading training examples moved into the
        validation split.

    Returns:
      A `base.Datasets` with `train`, `validation` and `test` members.

    Raises:
      ValueError: if `validation_size` is not within [0, len(train_images)].
    """
    if fake_data:
        train, validation, test = [
            DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                    seed=seed)
            for _ in range(3)
        ]
        return base.Datasets(train=train, validation=validation, test=test)

    def _decode_images(location):
        with gfile.Open(location, 'rb') as stream:
            return extract_images(stream)

    def _decode_labels(location):
        with gfile.Open(location, 'rb') as stream:
            return extract_labels(stream, one_hot=one_hot)

    train_images = _decode_images(train_data_dir)
    train_labels = _decode_labels(train_labels_dir)
    test_images = _decode_images(eval_data_dir)
    test_labels = _decode_labels(eval_labels_dir)

    if not 0 <= validation_size <= len(train_images):
        message = 'Validation size should be between 0 and {}. Received: {}.'
        raise ValueError(message.format(len(train_images), validation_size))

    # The head of the training data becomes the validation split.
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    return base.Datasets(
        train=DataSet(train_images, train_labels, **options),
        validation=DataSet(validation_images, validation_labels, **options),
        test=DataSet(test_images, test_labels, **options))
def loadModelData():
    """Load the EMNIST arrays and build the (untrained) tflearn model.

    Returns:
      Tuple (model, X, Y, testX, testY): the `tflearn.DNN` wrapping the
      network, the images reshaped to [-1, 28, 28, 1], and the labels
      converted with `to_categorical(..., nb_classes=62)`.
    """
    def _decode(path, extractor):
        with open(path, 'rb') as stream:
            return extractor(stream)

    # load EMNIST data, named as in the tflearn tutorial
    # https://github.com/tflearn/tflearn/blob/master/examples/images/convnet_mnist.py
    X = _decode(TRAIN_IMAGES_PATH, extract_images)
    Y = _decode(TRAIN_LABELS_PATH, extract_labels)
    testX = _decode(TEST_IMAGES_PATH, extract_images)
    testY = _decode(TEST_LABELS_PATH, extract_labels)

    # data preprocessing
    X = X.reshape([-1, 28, 28, 1])
    testX = testX.reshape([-1, 28, 28, 1])
    Y = to_categorical(Y, nb_classes=62)
    testY = to_categorical(testY, nb_classes=62)

    # Building convolutional network
    # the input is a 28x28 image with 1 channel
    network = input_data(shape=[None, 28, 28, 1], name='input')

    # 3 x convolution, followed by max pooling
    for _ in range(3):
        network = conv_2d(network, 32, 3, activation='relu',
                          regularizer="L2")
    network = max_pool_2d(network, 2)

    # fully connected with 512 nodes + some dropout
    network = fully_connected(network, 512, activation='relu')
    network = dropout(network, 0.5)

    # fully connected with 62 nodes which are the outputs
    network = fully_connected(network, 62, activation='softmax')

    # train the network with regression
    network = regression(network, optimizer='adam',
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0,
                        checkpoint_path='classifier.tfl.ckpt')
    return model, X, Y, testX, testY
def read_data_sets(train_dir, one_hot=False, dtype=dtypes.float32,
                   reshape=True, validation_size=5000, seed=None,
                   source_url=None):  # omit url since we are using our own dataset
    """Build train/validation/test DataSets from archives in `train_dir`.

    `base.maybe_download` is always called with a URL of None, so it only
    resolves the local path; `source_url` is accepted but never read.

    Args:
      train_dir: directory holding the four archives.
      one_hot: forwarded to `mnist_module.extract_labels`.
      dtype, reshape, seed: forwarded to `mnist_module.DataSet`.
      validation_size: number of leading training examples moved into the
        validation split.
      source_url: unused.

    Returns:
      A `base.Datasets` with `train`, `validation` and `test` members.

    Raises:
      ValueError: if `validation_size` is not within [0, len(train_images)].
    """
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 'test-images-idx3-ubyte.gz'
    TEST_LABELS = 'test-labels-idx1-ubyte.gz'

    def _local_path(filename):
        # omit url, local file will be a path
        return base.maybe_download(filename, train_dir, None)

    def _decode_images(filename):
        with gfile.Open(_local_path(filename), 'rb') as stream:
            return mnist_module.extract_images(stream)

    def _decode_labels(filename):
        with gfile.Open(_local_path(filename), 'rb') as stream:
            return mnist_module.extract_labels(stream, one_hot=one_hot)

    train_images = _decode_images(TRAIN_IMAGES)
    print(train_images.shape)
    train_labels = _decode_labels(TRAIN_LABELS)
    test_images = _decode_images(TEST_IMAGES)
    test_labels = _decode_labels(TEST_LABELS)

    if not 0 <= validation_size <= len(train_images):
        message = 'Validation size should be between 0 and {}. Received: {}.'
        raise ValueError(message.format(len(train_images), validation_size))

    # The head of the training data becomes the validation split.
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    return base.Datasets(
        train=mnist_module.DataSet(train_images, train_labels, **options),
        validation=mnist_module.DataSet(validation_images, validation_labels,
                                        **options),
        test=mnist_module.DataSet(test_images, test_labels, **options))
def read_data_sets(train_dir, one_hot=False, dtype=dtypes.float32,
                   reshape=True, seed=None, source_url=DEFAULT_SOURCE_URL):
    """Download (if needed) and load only the training split.

    Args:
      train_dir: directory handed to `base.maybe_download`.
      one_hot: forwarded to `extract_labels`.
      dtype, reshape, seed: forwarded to the DataSet constructor.
      source_url: URL prefix of the archives; a falsy value (such as the
        empty string) falls back to DEFAULT_SOURCE_URL.

    Returns:
      A `base.Datasets` whose `validation` and `test` members are None.
    """
    # empty string check
    url_prefix = source_url if source_url else DEFAULT_SOURCE_URL

    def _fetch(filename):
        return base.maybe_download(filename, train_dir, url_prefix + filename)

    with gfile.Open(_fetch('train-images-idx3-ubyte.gz'), 'rb') as stream:
        train_images = extract_images(stream)

    with gfile.Open(_fetch('train-labels-idx1-ubyte.gz'), 'rb') as stream:
        train_labels = extract_labels(stream, one_hot=one_hot)

    train = DataSet(train_images, train_labels,
                    **dict(dtype=dtype, reshape=reshape, seed=seed))
    return base.Datasets(train=train, validation=None, test=None)
def write_mnist_data(input_images, input_labels, output, partitions):
    """Convert MNIST archives into numbered TFRecord files under `output`.

    Examples are written in order to `<output>/0.tfrecords`,
    `<output>/1.tfrecords`, ... with ceil(num_images / partitions) examples
    per file. Each example has an 'image_raw' bytes feature and a 'label'
    feature built from the one-hot label cast to int.

    Args:
      input_images: path of the gzip image archive.
      input_labels: path of the gzip label archive.
      output: existing directory that receives the .tfrecords files.
      partitions: number of files the examples are spread over.
    """
    with open(input_images, 'rb') as f:
        images = numpy.array(mnist.extract_images(f))

    with open(input_labels, 'rb') as f:
        labels = numpy.array(mnist.extract_labels(f, one_hot=True))

    shape = images.shape
    print("images.shape: {0}".format(shape))
    print("labels.shape: {0}".format(labels.shape))

    # Drop the trailing channel axis before serializing the pixels.
    images = images.reshape(shape[0], shape[1], shape[2])
    num_per_part = int(math.ceil(float(shape[0]) / partitions))

    def _open_writer(index):
        filename = output + "/" + str(index) + ".tfrecords"
        return tf.python_io.TFRecordWriter(filename)

    seq = 0
    writer = _open_writer(seq)
    try:
        for i in range(shape[0]):
            # Roll over to the next file once the current one is full.
            if i != 0 and i % num_per_part == 0:
                writer.close()
                writer = None  # avoid a second close if reopening fails
                seq += 1
                writer = _open_writer(seq)

            # tobytes() is the supported spelling of the deprecated tostring().
            image_raw = images[i].tobytes()
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_raw': _bytes_feature(image_raw),
                    'label': _int64_feature(labels[i].astype(int))
                }))
            writer.write(example.SerializeToString())
    finally:
        # Close the open file even when serialization fails part-way.
        if writer is not None:
            writer.close()
def load_data_fashion_mnist(data_dir, one_hot=False, num_classes=10):
    """Load the Fashion-MNIST archives stored in `data_dir`.

    Args:
      data_dir: directory containing the four gzip archives.
      one_hot: currently ignored; labels are always one-hot encoded. It is
        kept as-is because honoring its default (False) would change what
        existing callers receive.
      num_classes: number of classes forwarded to `extract_labels`.

    Returns:
      Tuple (train_images, train_labels, test_images, test_labels).
    """
    train_image_file = os.path.join(data_dir, 'train-images-idx3-ubyte.gz')
    train_labels_file = os.path.join(data_dir, 'train-labels-idx1-ubyte.gz')
    test_image_file = os.path.join(data_dir, 't10k-images-idx3-ubyte.gz')
    test_labels_file = os.path.join(data_dir, 't10k-labels-idx1-ubyte.gz')

    with gfile.Open(train_image_file, 'rb') as f:
        train_images = extract_images(f)

    with gfile.Open(train_labels_file, 'rb') as f:
        # Pass the caller's class count through instead of a hard-coded 10.
        train_labels = extract_labels(f, one_hot=True,
                                      num_classes=num_classes)

    with gfile.Open(test_image_file, 'rb') as f:
        test_images = extract_images(f)

    with gfile.Open(test_labels_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=True,
                                     num_classes=num_classes)

    return train_images, train_labels, test_images, test_labels
def load(data_dir, subset):
    """Download (if needed) and decode the MNIST images of one subset.

    Args:
      data_dir: directory handed to `base.maybe_download`.
      subset: 'train' or 'validate' (both read the training archive) or
        'test'.

    Returns:
      The value produced by `extract_images` for the selected archive.

    Raises:
      Exception: if `subset` is none of the accepted names.
    """
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'

    # Pick the archive first so an invalid subset fails before any download.
    if subset in ('train', 'validate'):
        archive = TRAIN_IMAGES
    elif subset == 'test':
        archive = TEST_IMAGES
    else:
        raise Exception('subset must be train or test')

    local_file = base.maybe_download(archive, data_dir, SOURCE_URL + archive)
    with open(local_file, 'rb') as stream:
        images = extract_images(stream)
    return images
def load_data():
    """Read the four MNIST archives from the current working directory.

    Returns:
      ((train_images, train_labels), (test_images, test_labels))
    """
    cwd = os.getcwd()

    def _decode(filename, extractor):
        with open(os.path.join(cwd, filename), "rb") as stream:
            return extractor(stream)

    # training split
    train_images = _decode("train-images-idx3-ubyte.gz", extract_images)
    train_labels = _decode("train-labels-idx1-ubyte.gz", extract_labels)
    # testing split
    test_images = _decode("t10k-images-idx3-ubyte.gz", extract_images)
    test_labels = _decode("t10k-labels-idx1-ubyte.gz", extract_labels)

    return (train_images, train_labels), (test_images, test_labels)
def load_minst(src=None, path=None, one_hot=False):
    """Build the MNIST data sets from the tutorial loader and/or local files.

    Args:
      src: if truthy, start from `input_data.read_data_sets("MNIST_data/")`.
      path: if truthy, directory with the archives; the train, validation
        and test members are then (re)built from those files.
      one_hot: forwarded to the label extraction / tutorial loader.

    Returns:
      The data-sets object; an empty `DataSets()` when neither `src` nor
      `path` is given.
    """
    mnist = DataSets()
    if src:
        mnist = input_data.read_data_sets("MNIST_data/", one_hot=one_hot)

    if not path:
        return mnist

    # Make sure the directory ends with a separator before appending names.
    if path[-1] != '/':
        path += '/'

    train_images = extract_images(path + TRAIN_IMAGES)
    train_labels = extract_labels(path + TRAIN_LABELS, one_hot=one_hot)
    test_images = extract_images(path + TEST_IMAGES)
    test_labels = extract_labels(path + TEST_LABELS, one_hot=one_hot)

    # The head of the training data becomes the validation split.
    head = slice(None, VALIDATION_SIZE)
    tail = slice(VALIDATION_SIZE, None)
    validation_images, validation_labels = train_images[head], train_labels[head]
    train_images, train_labels = train_images[tail], train_labels[tail]

    mnist.train = DataSet(train_images, train_labels)
    mnist.validation = DataSet(validation_images, validation_labels)
    mnist.test = DataSet(test_images, test_labels)
    return mnist
def convert_to_data_sets(data_gzs, one_hot=False, dtype=dtypes.float32,
                         reshape=True, validation_size=5000, seed=None):
    """Turn a mapping of archive paths into train/validation/test DataSets.

    Modified version of
    tensorflow/tensorflow/contrib/learn/python/learn/datasets/mnist.py

    Args:
      data_gzs: mapping with the keys 'train-images', 'train-labels',
        't10k-images' and 't10k-labels'; element [0] of each value is opened
        as the archive.
      one_hot: forwarded to `tf_mnist.extract_labels`.
      dtype, reshape, seed: forwarded to `tf_mnist.DataSet`.
      validation_size: number of leading training examples moved into the
        validation split.

    Returns:
      A `base.Datasets` with `train`, `validation` and `test` members.

    Raises:
      ValueError: if `validation_size` is not within [0, len(train_images)].
    """
    def _decode_images(key):
        with gfile.Open(data_gzs[key][0], 'rb') as stream:
            return tf_mnist.extract_images(stream)

    def _decode_labels(key):
        with gfile.Open(data_gzs[key][0], 'rb') as stream:
            return tf_mnist.extract_labels(stream, one_hot=one_hot)

    train_images = _decode_images('train-images')
    train_labels = _decode_labels('train-labels')
    test_images = _decode_images('t10k-images')
    test_labels = _decode_labels('t10k-labels')

    if not 0 <= validation_size <= len(train_images):
        message = 'Validation size should be between 0 and {}. Received: {}.'
        raise ValueError(message.format(len(train_images), validation_size))

    # The head of the training data becomes the validation split.
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    return base.Datasets(
        train=tf_mnist.DataSet(train_images, train_labels, **options),
        validation=tf_mnist.DataSet(validation_images, validation_labels,
                                    **options),
        test=tf_mnist.DataSet(test_images, test_labels, **options))
def mnist_data_loader(one_hot=False, reshape=True):
    """Load MNIST dataset.

    Args:
      one_hot: forwarded to `mnist.extract_labels`.
      reshape: if true, flatten every image to a vector of
        width * length features.

    Returns:
      Tuple (train_data, train_labels, test_data, test_labels); the image
      arrays are divided by 255.
    """
    # Download the dataset if not exist.
    # CVDF mirror of http://yann.lecun.com/exdb/mnist/
    DATA_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    file_list = [TRAIN_IMAGES, TRAIN_LABELS, TEST_IMAGES, TEST_LABELS]
    file_list = [
        maybe_download(MNIST_DIR, DATA_URL + fil) for fil in file_list
    ]

    # Divide by a float: with an int divisor, integer pixel arrays are
    # floor-divided under Python 2 (no __future__ division), which would
    # collapse the images to 0/1. On Python 3 the result is unchanged.
    with gfile.Open(file_list[0], 'rb') as f:
        train_data = mnist.extract_images(f) / 255.0
    with gfile.Open(file_list[1], 'rb') as f:
        train_labels = mnist.extract_labels(f, one_hot)
    with gfile.Open(file_list[2], 'rb') as f:
        test_data = mnist.extract_images(f) / 255.0
    with gfile.Open(file_list[3], 'rb') as f:
        test_labels = mnist.extract_labels(f, one_hot)

    # Convert the shape of image, if reshape
    # [n_samples, width, length, 1] ==> [n_samples, n_features]
    if reshape:
        assert train_data.shape[1:] == test_data.shape[1:]
        n_train, width, length, _ = train_data.shape
        n_test = test_data.shape[0]
        train_data = train_data.reshape(n_train, width * length)
        test_data = test_data.reshape(n_test, width * length)

    return train_data, train_labels, test_data, test_labels
def load_mnist_data():
    """Load MNIST from the Drive folder and hold out a validation split.

    The last 10000 training examples become the validation split.

    Returns:
      ((train_images, train_labels), (valid_images, valid_labels),
       (test_images, test_labels))
    """
    path = '/content/gdrive/My Drive/542HW4/'

    def _decode(filename, extractor):
        with open((path + filename), "rb") as stream:
            return extractor(stream)

    train_allimages = _decode("train-images-idx3-ubyte.gz", extract_images)
    train_alllabels = _decode("train-labels-idx1-ubyte.gz", extract_labels)

    # Everything before `split` is training data, the rest is validation.
    valid_set_size = 10000
    split = len(train_allimages) - valid_set_size
    train_images, valid_images = train_allimages[:split], train_allimages[split:]
    train_labels, valid_labels = train_alllabels[:split], train_alllabels[split:]

    test_images = _decode("t10k-images-idx3-ubyte.gz", extract_images)
    test_labels = _decode("t10k-labels-idx1-ubyte.gz", extract_labels)

    return ((train_images, train_labels),
            (valid_images, valid_labels),
            (test_images, test_labels))
def load_mnist_data(train_dir, one_hot=True):
    """Returns all 'train' data --- images and labels.

    Args:
      train_dir: directory handed to `base.maybe_download`.
      one_hot: forwarded to `mnist.extract_labels`.

    Returns:
      Tuple (train_images, train_labels).
    """
    def _fetch(filename):
        return base.maybe_download(filename, train_dir,
                                   mnist.SOURCE_URL + filename)

    with open(_fetch('train-images-idx3-ubyte.gz'), 'rb') as stream:
        train_images = mnist.extract_images(stream)

    with open(_fetch('train-labels-idx1-ubyte.gz'), 'rb') as stream:
        train_labels = mnist.extract_labels(stream, one_hot=one_hot)

    return train_images, train_labels
def load_gz(self):
    """Read the archives of the current mode and parallelize (image, label) pairs.

    Returns:
      The result of `self.sc.parallelize` over the zipped images and labels.
    """
    archives = {
        'train': ['train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz'],
        'test': ['t10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz'],
    }
    image_name, label_name = archives[self.__mode]
    image_path = os.path.join(self.data_dir, image_name)
    label_path = os.path.join(self.data_dir, label_name)

    with open(image_path, 'rb') as image_file:
        images = extract_images(image_file)
    with open(label_path, 'rb') as label_file:
        labels = extract_labels(label_file, self.one_hot)

    return self.sc.parallelize(zip(images, labels))
def writeMNIST(sc, input_images, input_labels, output, format, num_partitions):
    """Writes MNIST image/label vectors into parallelized files on HDFS

    Args:
      sc: context object providing `parallelize`.
      input_images: path of the gzip image archive.
      input_labels: path of the gzip label archive.
      output: output location; "pickle" and "csv" write to `<output>/images`
        and `<output>/labels`, the other formats write to `output` itself.
      format: "pickle", "csv", "csv2" (label|csv-image lines); any other
        value is treated as "tfr".
      num_partitions: number of partitions of the image and label RDDs.
    """
    # load MNIST gzip into memory
    with open(input_images, 'rb') as image_file:
        images = numpy.array(mnist.extract_images(image_file))

    # csv2 stores the plain label, every other format the one-hot vector.
    want_one_hot = format != "csv2"
    with open(input_labels, 'rb') as label_file:
        labels = numpy.array(
            mnist.extract_labels(label_file, one_hot=want_one_hot))

    shape = images.shape
    print("images.shape: {0}".format(shape))          # 60000 x 28 x 28
    print("labels.shape: {0}".format(labels.shape))   # 60000 x 10

    # create RDDs of vectors (one flattened image per row)
    flat_images = images.reshape(shape[0], shape[1] * shape[2])
    imageRDD = sc.parallelize(flat_images, num_partitions)
    labelRDD = sc.parallelize(labels, num_partitions)

    output_images = output + "/images"
    output_labels = output + "/labels"

    def _label_bar_image(pair):
        # pair is (csv image line, label)
        return str(pair[1]) + "|" + pair[0]

    def _as_tf_record(pair):
        return (bytearray(toTFExample(pair[0], pair[1])), None)

    # save RDDs as specific format
    if format == "pickle":
        imageRDD.saveAsPickleFile(output_images)
        labelRDD.saveAsPickleFile(output_labels)
        return
    if format == "csv":
        imageRDD.map(toCSV).saveAsTextFile(output_images)
        labelRDD.map(toCSV).saveAsTextFile(output_labels)
        return
    if format == "csv2":
        imageRDD.map(toCSV).zip(labelRDD).map(
            _label_bar_image).saveAsTextFile(output)
        return

    # format == "tfr":
    tfRDD = imageRDD.zip(labelRDD).map(_as_tf_record)
    # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar
    tfRDD.saveAsNewAPIHadoopFile(
        output, "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
        keyClass="org.apache.hadoop.io.BytesWritable",
        valueClass="org.apache.hadoop.io.NullWritable")
def import_mnist():
    """Import the infinite-MNIST data and wrap it in our DataSet class.

    Training images are decoded with `extract_images_2`, test images with
    `extract_images`. The validation split is empty because VALIDATION_SIZE
    is 0, and standardization is currently disabled.

    :return: tuple (data, test, val) of DataSet objects
    """
    VALIDATION_SIZE = 0
    ONE_HOT = True
    TRAIN_DIR = 'INFMNIST_data/'

    # The archives are gzip data: open them in binary mode and close the
    # handles deterministically instead of leaking them.
    with open(TRAIN_DIR + 'mnist8m-patterns-idx3-ubyte.gz', 'rb') as f:
        train_images = extract_images_2(f)

    with open(TRAIN_DIR + 'mnist8m-labels-idx1-ubyte.gz', 'rb') as f:
        train_labels = extract_labels(f, one_hot=ONE_HOT)

    with open(TRAIN_DIR + 'test10k-patterns.gz', 'rb') as f:
        test_images = extract_images(f)

    with open(TRAIN_DIR + 'test10k-labels.gz', 'rb') as f:
        test_labels = extract_labels(f, one_hot=ONE_HOT)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    ## Process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    ## Standardize data
    # The statistics are computed but standardization is currently disabled.
    train_mean, train_std = get_data_info(train_images)

    data = DataSet(train_images, train_labels)
    test = DataSet(test_images, test_labels)
    val = DataSet(validation_images, validation_labels)

    return data, test, val
def _get_data(self):
    """Download MNIST (train or test) and group the images by digit.

    Reads `self.is_train` and `self.path`; stores the result on `self.data`
    and the per-digit fill counters on `self.l_iter`.

    Returns:
      A list of ten float32 arrays; entry d has one [1, 28, 28] slot per
      image labelled d, holding the pixels divided by 255.
    """
    from tensorflow.contrib.learn.python.learn.datasets.base \
        import maybe_download
    from tensorflow.contrib.learn.python.learn.datasets.mnist \
        import extract_images, extract_labels

    if self.is_train:
        IMAGES = 'train-images-idx3-ubyte.gz'
        LABELS = 'train-labels-idx1-ubyte.gz'
    else:
        print('using test dataset..')
        IMAGES = 't10k-images-idx3-ubyte.gz'
        LABELS = 't10k-labels-idx1-ubyte.gz'
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'

    # maybe_download fetches exactly the URL it is given, so the file name
    # has to be appended to the directory URL.
    local_file = maybe_download(IMAGES, self.path, SOURCE_URL + IMAGES)
    with open(local_file, 'rb') as f:
        images = extract_images(f)
    local_file = maybe_download(LABELS, self.path, SOURCE_URL + LABELS)
    with open(local_file, 'rb') as f:
        labels = extract_labels(f, one_hot=False)

    # counts[d] is the number of images of digit d (np.unique sorts values).
    _, counts = np.unique(labels, return_counts=True)
    data = []
    for i in range(10):
        count = counts[i]
        arr = np.empty([count, 1, 28, 28], dtype=np.float32)
        data.append(arr)

    # l_iter[d] is the next free slot in data[d].
    l_iter = [0] * 10
    for i in range(labels.shape[0]):
        label = labels[i]
        data[label][l_iter[label]] = np.reshape(images[i], [1, 28, 28]) / 255.
        l_iter[label] += 1

    self.data = data
    self.l_iter = l_iter
    return data
def generate_mnist_jpg(subdatadir, source_image_path, source_label_path):
    """Export every MNIST image as a JPEG, one folder per label.

    Images are saved as `<subdatadir><label>/<label>_<index>.jpg`; the label
    is appended to `subdatadir` without inserting a separator. The download
    directory and URL prefix come from the names `train_dir` and
    `SOURCE_URL`, which are not defined in this function.

    Args:
      subdatadir: output prefix, created with `create_folder`.
      source_image_path: file name of the image archive.
      source_label_path: file name of the label archive.
    """
    create_folder(subdatadir)

    def _fetch(filename):
        return base.maybe_download(filename, train_dir, SOURCE_URL + filename)

    with open(_fetch(source_image_path), 'rb') as stream:
        images = mnist.extract_images(stream)

    with open(_fetch(source_label_path), 'rb') as stream:
        labels = mnist.extract_labels(stream, one_hot=False)

    for index in range(labels.size):
        digit = str(labels[index])
        subdirpath = subdatadir + digit
        create_folder(subdirpath)
        filepath = subdirpath + '/' + digit + '_' + str(index) + '.jpg'
        # Channel 0 is the only channel that gets exported.
        Image.fromarray(images[index, :, :, 0]).save(filepath)
def _get_data(self):
    """Read MNIST (train or test) from MNIST_data/ and group images by digit.

    Reads `self.is_train`. Downloading is disabled; the archives are opened
    directly from the "MNIST_data/" directory.

    Returns:
      A list of ten float32 arrays; entry d has one [28, 28, 1] slot per
      image labelled d, holding the pixels divided by 255.
    """
    from tensorflow.examples.tutorials.mnist import input_data
    from tensorflow.contrib.learn.python.learn.datasets.base \
        import maybe_download
    from tensorflow.contrib.learn.python.learn.datasets.mnist \
        import extract_images, extract_labels

    if self.is_train:
        IMAGES, LABELS = ('train-images-idx3-ubyte.gz',
                          'train-labels-idx1-ubyte.gz')
    else:
        IMAGES, LABELS = ('t10k-images-idx3-ubyte.gz',
                          't10k-labels-idx1-ubyte.gz')
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'

    with open("MNIST_data/" + IMAGES, 'rb') as image_file:
        images = extract_images(image_file)
    with open("MNIST_data/" + LABELS, 'rb') as label_file:
        labels = extract_labels(label_file, one_hot=False)

    # counts[d] is the number of images of digit d (np.unique sorts values).
    _, counts = np.unique(labels, return_counts=True)
    data = [
        np.empty([counts[digit], 28, 28, 1], dtype=np.float32)
        for digit in range(10)
    ]

    # l_iter[d] is the next free slot in data[d].
    l_iter = [0] * 10
    for index, label in enumerate(labels):
        slot = l_iter[label]
        data[label][slot] = images[index] / 255.
        l_iter[label] = slot + 1

    return data
def writeMNIST(sc, input_images, input_labels, output, format, num_partitions):
    """Writes MNIST image/label vectors into parallelized files on HDFS

    `format` selects the layout: "pickle" and "csv" write separate
    `<output>/images` and `<output>/labels` data sets, "csv2" writes
    `label|csv-image` text lines to `output`, and any other value writes
    TFRecords to `output`.
    """
    # load MNIST gzip into memory
    with open(input_images, 'rb') as src:
        images = numpy.array(mnist.extract_images(src))

    with open(input_labels, 'rb') as src:
        # Only csv2 keeps the plain (non one-hot) label.
        plain_labels = format == "csv2"
        labels = numpy.array(
            mnist.extract_labels(src, one_hot=not plain_labels))

    shape = images.shape
    print("images.shape: {0}".format(shape))          # 60000 x 28 x 28
    print("labels.shape: {0}".format(labels.shape))   # 60000 x 10

    # create RDDs of vectors
    num_images, pixels_per_image = shape[0], shape[1] * shape[2]
    imageRDD = sc.parallelize(images.reshape(num_images, pixels_per_image),
                              num_partitions)
    labelRDD = sc.parallelize(labels, num_partitions)

    output_images = output + "/images"
    output_labels = output + "/labels"

    # save RDDs as specific format
    if format == "pickle":
        imageRDD.saveAsPickleFile(output_images)
        labelRDD.saveAsPickleFile(output_labels)
    elif format == "csv":
        csv_images = imageRDD.map(toCSV)
        csv_images.saveAsTextFile(output_images)
        csv_labels = labelRDD.map(toCSV)
        csv_labels.saveAsTextFile(output_labels)
    elif format == "csv2":
        image_with_label = imageRDD.map(toCSV).zip(labelRDD)
        lines = image_with_label.map(lambda rec: str(rec[1]) + "|" + rec[0])
        lines.saveAsTextFile(output)
    else:  # format == "tfr":
        image_with_label = imageRDD.zip(labelRDD)
        tfRDD = image_with_label.map(
            lambda rec: (bytearray(toTFExample(rec[0], rec[1])), None))
        # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar
        tfRDD.saveAsNewAPIHadoopFile(
            output,
            "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
            keyClass="org.apache.hadoop.io.BytesWritable",
            valueClass="org.apache.hadoop.io.NullWritable")
print("Completed processing 10000 images") if i == 20000: print("Completed processing 20000 images") if i == 30000: print("Completed processing 30000 images") if i == 40000: print("Completed processing 40000 images") if i == 50000: print("Completed processing 50000 images") if i == 59999: print("Completed processing 60000 images") return w, bias with open('train-images-idx3-ubyte.gz', 'rb') as f: X_train = extract_images(f) with open('train-labels-idx1-ubyte.gz', 'rb') as f: Y_train = extract_labels(f) with open('t10k-images-idx3-ubyte.gz', 'rb') as f: x_test = extract_images(f) with open('t10k-labels-idx1-ubyte.gz', 'rb') as f: y_test = extract_labels(f) num_pixels = X_train.shape[1] * X_train.shape[2] X_train = X_train.reshape((X_train.shape[0], num_pixels)).astype('float32') x_test = x_test.reshape((x_test.shape[0], num_pixels)).astype('float32') Y_train_new = convert(Y_train) y_test_new = convert(y_test) # normalize inputs from 0-255 to 0-1 X_train = X_train / 255
"""
Created on Wed May 16 00:35:00 2018

@author: ram
"""
import numpy as np

# ==============================================================================
# Reading MNIST data
# ==============================================================================
from tensorflow.contrib.learn.python.learn.datasets.mnist import extract_images, extract_labels


def _read_idx_archive(path, extractor):
    """Open a gzip IDX archive in binary mode and decode it with `extractor`."""
    with open(path, 'rb') as stream:
        return extractor(stream)


train_images = _read_idx_archive('train-images-idx3-ubyte.gz', extract_images)
train_labels = _read_idx_archive('train-labels-idx1-ubyte.gz', extract_labels)
test_images = _read_idx_archive('t10k-images-idx3-ubyte.gz', extract_images)
test_labels = _read_idx_archive('t10k-labels-idx1-ubyte.gz', extract_labels)
# ==============================================================================

#printing shapes of all train and test data
"""JYI, 11/13/2018
"""

# load data set, data exploration
from tensorflow.contrib.learn.python.learn.datasets.mnist import extract_images, extract_labels


def _read_idx_archive(path, extractor):
    """Open a gzip IDX archive in binary mode and decode it with `extractor`."""
    with open(path, 'rb') as stream:
        return extractor(stream)


train_x = _read_idx_archive('train-images-idx3-ubyte.gz', extract_images)
train_y = _read_idx_archive('train-labels-idx1-ubyte.gz', extract_labels)
test_x = _read_idx_archive('t10k-images-idx3-ubyte.gz', extract_images)
test_y = _read_idx_archive('t10k-labels-idx1-ubyte.gz', extract_labels)

import matplotlib.pyplot as plt

# Show the first training and the first test image in separate figures.
fig1 = plt.figure(1, figsize=(9, 6))
plt.imshow(train_x[0].reshape((28, 28)))
fig1.suptitle('Training data sample', fontsize=10)

fig2 = plt.figure(2, figsize=(9, 6))
plt.imshow(test_x[0].reshape((28, 28)))
fig2.suptitle('Testing data sample', fontsize=10)
plt.show()

print('train_y[0]:{}'.format(train_y[0]))          # 5
print('train_x.shape:{}'.format(train_x.shape))    # (60000, 28, 28, 1)
print('train_y.shape:{}'.format(train_y.shape))    # (60000,)
print('test_x.shape:{}'.format(test_x.shape))      # (10000, 28, 28, 1)
print('test_y.shape:{}'.format(test_y.shape))      # (10000,)

# data set pre-processing
import numpy as np

num_class = 10
num_feature = 784
def load_data():
    """Read the EMNIST "bymerge" archives from the data/ directory.

    Each archive is opened inside a `with` block so its file handle is
    closed as soon as it has been decoded.

    Returns:
      Tuple (train_images, train_labels, test_images, test_labels).
    """
    def _decode(path, extractor):
        with open(path, 'rb') as stream:
            return extractor(stream)

    return (
        _decode('data/emnist-bymerge-train-images-idx3-ubyte.gz',
                extract_images),
        _decode('data/emnist-bymerge-train-labels-idx1-ubyte.gz',
                extract_labels),
        _decode('data/emnist-bymerge-test-images-idx3-ubyte.gz',
                extract_images),
        _decode('data/emnist-bymerge-test-labels-idx1-ubyte.gz',
                extract_labels),
    )
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import random
import matplotlib.pyplot as plt
import Dataset_main

tf.set_random_seed(777)  # for reproducibility

from tensorflow.contrib.learn.python.learn.datasets.mnist import extract_images, extract_labels

# Raw strings keep the backslashes of the Windows paths literally; the values
# are the same as before, without relying on invalid escape sequences.
_TEST_DATA_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-test-images-idx3-ubyte.gz'
_TEST_LABELS_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-test-labels-idx1-ubyte.gz'
_TRAIN_DATA_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-train-images-idx3-ubyte.gz'
_TRAIN_LABELS_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-train-labels-idx1-ubyte.gz'

# Image archives are decoded with extract_images and label archives with
# extract_labels (the two were swapped for train labels / test images).
with open('my/directory/train-images-idx3-ubyte.gz', 'rb') as f:
    train_images = extract_images(f)

with open('my/directory/train-labels-idx1-ubyte.gz', 'rb') as f:
    train_labels = extract_labels(f)

with open('my/directory/t10k-images-idx3-ubyte.gz', 'rb') as f:
    test_images = extract_images(f)

with open('my/directory/t10k-labels-idx1-ubyte.gz', 'rb') as f:
    test_labels = extract_labels(f)

mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

nb_classes = 10

# MNIST data image of shape 28 * 28 = 784
X = tf.placeholder(tf.float32, [None, 784])
def read_data_sets(train_dir, fake_data=False, one_hot=False, shuffle=False,
                   validation_percentage=0.1):
    """Load MNIST on MPI rank 0, broadcast it, and keep one training shard per rank.

    Rank 0 downloads and decodes the four MNIST archives and broadcasts each
    array to every other rank of `MPI.COMM_WORLD`. The training examples are
    then cut into `size` contiguous shards whose lengths differ by at most
    one, and each rank keeps only its own shard. The test set is kept whole
    on every rank.

    Args:
        train_dir: Directory passed to `maybe_download` for the archives.
        fake_data: If true, skip all loading and return a container whose
            `train`, `validation` and `test` members are
            `DataSet([], [], fake_data=True, one_hot=one_hot)`.
        one_hot: Forwarded to `extract_labels`.
        shuffle: If true, rank 0 applies one random permutation to the
            training images and labels before broadcasting them.
        validation_percentage: Accepted for interface compatibility but not
            used here; the validation split is fixed at zero examples.

    Returns:
        An object with `train`, `validation` and `test` attributes, each a
        `DataSet`.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    class DataSets(object):
        pass

    data_sets = DataSets()

    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot)

        data_sets.train = fake()
        data_sets.validation = fake()
        data_sets.test = fake()
        return data_sets

    SOURCE_URL = "http://yann.lecun.com/exdb/mnist/"
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    def share(array):
        """Broadcast rank 0's `array` to all ranks; other ranks pass None."""
        meta = None
        if rank == 0:
            meta = (array.shape, array.dtype.str)
        # The dtype travels with the shape so the receive buffer always
        # matches what rank 0 sends. Label buffers used to be allocated with
        # NumPy's default float64 regardless of the dtype rank 0 held.
        shape, dtype = comm.bcast(meta, root=0)
        if rank != 0:
            array = np.empty(shape, dtype=dtype)
        comm.Bcast(array, root=0)
        return array

    # Every rank must call share() the same number of times, in the same order.
    if rank == 0:
        local_file = maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES)
        train_images = extract_images(local_file)
        if shuffle:
            perm = np.arange(train_images.shape[0])
            np.random.shuffle(perm)
            train_images = train_images[perm]
        train_images = share(train_images)

        local_file = maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS)
        train_labels = extract_labels(local_file, one_hot=one_hot)
        if shuffle:
            # Same permutation as the images so the pairs stay aligned.
            train_labels = train_labels[perm]
        train_labels = share(train_labels)

        local_file = maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES)
        test_images = share(extract_images(local_file))

        local_file = maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS)
        test_labels = share(extract_labels(local_file, one_hot=one_hot))
    else:
        train_images = share(None)
        train_labels = share(None)
        test_images = share(None)
        test_labels = share(None)

    VALIDATION_SIZE = 0
    total = train_images.shape[0] - VALIDATION_SIZE
    count = total // size  # base shard length (floor division on Python 2 and 3)
    remain = total % size  # the first `remain` ranks get one extra example
    if rank == 0:
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("total images %s" % total)
        print("image subset (%d,%d)=%d" % (total, size, count))
        print("image subset remainder %s" % remain)

    start = rank * count + min(rank, remain)
    stop = start + count + (1 if rank < remain else 0)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:][start:stop]
    train_labels = train_labels[VALIDATION_SIZE:][start:stop]

    data_sets.train = DataSet(train_images, train_labels)
    data_sets.validation = DataSet(validation_images, validation_labels)
    data_sets.test = DataSet(test_images, test_labels)

    if rank == 0:
        print("Rank Start Stop NumExamples")
        sys.stdout.flush()
    # Ranks take turns, separated by barriers, so the report comes out in rank order.
    for i in range(size):
        if rank == i:
            print("%s %s %s %s" % (i, start, stop, data_sets.train.num_examples))
            sys.stdout.flush()
        comm.Barrier()
    return data_sets
def read_data_sets(train_dir, fake_data=False, one_hot=False, shuffle=False,
                   validation_percentage=0.1):
    """Load MNIST on MPI rank 0, broadcast it, and keep one training shard per rank.

    Rank 0 fetches each archive with `maybe_download`, decodes it, and
    broadcasts the array to every other rank of `MPI.COMM_WORLD`. The leading
    `validation_percentage` fraction of the training set becomes the
    validation set, identical on every rank. The remaining training examples
    are cut into `size` contiguous shards whose lengths differ by at most
    one, and each rank keeps only its own shard. The test set is kept whole
    on every rank.

    Args:
        train_dir: Not used by this implementation; kept for interface
            compatibility.
        fake_data: If true, skip all loading and return a container whose
            `train`, `validation` and `test` members are
            `DataSet([], [], fake_data=True, one_hot=one_hot)`.
        one_hot: Forwarded to `extract_labels`.
        shuffle: If true, rank 0 applies one random permutation to the
            training images and labels before broadcasting them.
        validation_percentage: Fraction of the training examples moved to
            the validation set; the resulting count is truncated to an int.

    Returns:
        An object with `train`, `validation` and `test` attributes, each a
        `DataSet`.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    class DataSets(object):
        pass

    data_sets = DataSets()

    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot)

        data_sets.train = fake()
        data_sets.validation = fake()
        data_sets.test = fake()
        return data_sets

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    def read_archive(filename, extractor, **kwargs):
        """Fetch `filename` via maybe_download and decode it with `extractor`."""
        local_file = maybe_download(filename)
        # Binary mode, and closed as soon as decoding is done. The archives
        # are gzip data and were previously opened with 'r' and never closed.
        with open(local_file, 'rb') as stream:
            return extractor(stream, **kwargs)

    def share(array):
        """Broadcast rank 0's `array` to all ranks; other ranks pass None."""
        meta = None
        if rank == 0:
            meta = (array.shape, array.dtype.str)
        # The dtype travels with the shape so the receive buffer always
        # matches what rank 0 sends. Label buffers used to be allocated with
        # NumPy's default float64 regardless of the dtype rank 0 held.
        shape, dtype = comm.bcast(meta, root=0)
        if rank != 0:
            array = np.empty(shape, dtype=dtype)
        comm.Bcast(array, root=0)
        return array

    # Every rank must call share() the same number of times, in the same order.
    if rank == 0:
        train_images = read_archive(TRAIN_IMAGES, extract_images)
        if shuffle:
            perm = np.arange(train_images.shape[0])
            np.random.shuffle(perm)
            train_images = train_images[perm]
        train_images = share(train_images)

        train_labels = read_archive(TRAIN_LABELS, extract_labels, one_hot=one_hot)
        if shuffle:
            # Same permutation as the images so the pairs stay aligned.
            train_labels = train_labels[perm]
        train_labels = share(train_labels)

        test_images = share(read_archive(TEST_IMAGES, extract_images))
        test_labels = share(read_archive(TEST_LABELS, extract_labels, one_hot=one_hot))
    else:
        train_images = share(None)
        train_labels = share(None)
        test_images = share(None)
        test_labels = share(None)

    # Truncate to an int before partitioning so the shard arithmetic is exact.
    # With a float here, `count` could be fractional and shard sizes uneven.
    VALIDATION_SIZE = int(train_images.shape[0] * validation_percentage)
    total = train_images.shape[0] - VALIDATION_SIZE
    count = total // size  # base shard length (floor division on Python 2 and 3)
    remain = total % size  # the first `remain` ranks get one extra example
    if rank == 0:
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("total images %s" % total)
        print("image subset (%d,%d)=%d" % (total, size, count))
        print("image subset remainder %s" % remain)

    start = rank * count + min(rank, remain)
    stop = start + count + (1 if rank < remain else 0)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:][start:stop]
    train_labels = train_labels[VALIDATION_SIZE:][start:stop]

    data_sets.train = DataSet(train_images, train_labels)
    data_sets.validation = DataSet(validation_images, validation_labels)
    data_sets.test = DataSet(test_images, test_labels)

    if rank == 0:
        print("Rank Start Stop NumExamples")
        sys.stdout.flush()
    # Ranks take turns, separated by barriers, so the report comes out in rank order.
    for i in range(size):
        if rank == i:
            print("%s %s %s %s" % (i, start, stop, data_sets.train.num_examples))
            sys.stdout.flush()
        comm.Barrier()
    return data_sets