示例#1
0
def loadData(path):
    """Load the EMNIST "byclass" files stored in ``path`` and preprocess them.

    Args:
        path: Directory holding the four EMNIST idx files and
            ``emnist-byclass-mapping.txt``.

    Returns:
        A tuple ``(trainingData, trainingLabels, testData, testLabels,
        mapping)``. Image arrays have gone through ``normalize`` and then
        ``reshape``; label arrays through ``preprocess_labels``; ``mapping``
        is the list of characters read from the mapping file.
    """
    # Read the training set and the test set from their respective files.
    loader = MNIST(path)
    trainImages, trainTargets = loader.load(
        path + '/emnist-byclass-train-images-idx3-ubyte',
        path + '/emnist-byclass-train-labels-idx1-ubyte')
    testImages, testTargets = loader.load(
        path + '/emnist-byclass-test-images-idx3-ubyte',
        path + '/emnist-byclass-test-labels-idx1-ubyte')

    # The second field of every mapping line is an ASCII code; convert it to
    # the character it stands for.
    with open(path + '/emnist-byclass-mapping.txt') as mappingFile:
        mapping = [chr(int(row.split()[1])) for row in mappingFile]

    # Rebind each name as soon as it is converted so the raw lists can be
    # released early.
    trainImages = np.array(trainImages)
    trainTargets = np.array(trainTargets)
    testImages = np.array(testImages)
    testTargets = np.array(testTargets)

    trainImages = normalize(trainImages)
    testImages = normalize(testImages)

    trainImages = reshape(trainImages)
    testImages = reshape(testImages)

    # The number of classes is the number of entries in the mapping file.
    classCount = len(mapping)
    trainTargets = preprocess_labels(trainTargets, classCount)
    testTargets = preprocess_labels(testTargets, classCount)

    return trainImages, trainTargets, testImages, testTargets, mapping
示例#2
0
def load_train_and_test_dataset(data_path='data/gzip'):
    """Load the EMNIST "byclass" train and test splits as column vectors.

    Args:
        data_path: Directory containing the four EMNIST idx files.

    Returns:
        dict with the keys ``"X_train"``, ``"y_train"``, ``"X_test"`` and
        ``"y_test"``. Image arrays are divided by 255.0 and shaped
        ``(samples, 784, 1)``; label arrays are the loaded labels wrapped
        in ``np.array``.
    """
    loader = MNIST(data_path)

    train_images, train_labels = loader.load(
        os.path.join(data_path, 'emnist-byclass-train-images-idx3-ubyte'),
        os.path.join(data_path, 'emnist-byclass-train-labels-idx1-ubyte'),
    )
    test_images, test_labels = loader.load(
        os.path.join(data_path, 'emnist-byclass-test-images-idx3-ubyte'),
        os.path.join(data_path, 'emnist-byclass-test-labels-idx1-ubyte'),
    )

    def as_columns(pixels):
        # Normalization: scale the raw values by 1/255.
        scaled = np.array(pixels) / 255.0
        count = scaled.shape[0]
        # Formatting: first 28x28 images, then flattened 784x1 columns.
        return scaled.reshape(count, 28, 28).reshape(count, 784, 1)

    train_images = as_columns(train_images)
    train_labels = np.array(train_labels)
    test_images = as_columns(test_images)
    test_labels = np.array(test_labels)

    dataset = {}
    dataset["X_train"] = train_images
    dataset["y_train"] = train_labels
    dataset["X_test"] = test_images
    dataset["y_test"] = test_labels
    return dataset
def load_data(path):
    """Read the EMNIST "byclass" dataset and prepare it for a CNN.

    The files in ``path`` must keep the original names used by the EMNIST
    website (https://www.nist.gov/itl/iad/image-group/emnist-dataset).

    Args:
        path (str): Directory containing all data files.

    Returns:
        ``(X_train, y_train, X_test, y_test, mapping)``: the image arrays
        after ``normalize`` and ``reshape_for_cnn``, the label arrays after
        ``preprocess_labels``, and the list of characters read from the
        mapping file.
    """
    reader = MNIST(path)

    # Raw train and test splits.
    train_images, train_targets = reader.load(
        path + '/emnist-byclass-train-images-idx3-ubyte',
        path + '/emnist-byclass-train-labels-idx1-ubyte')
    test_images, test_targets = reader.load(
        path + '/emnist-byclass-test-images-idx3-ubyte',
        path + '/emnist-byclass-test-labels-idx1-ubyte')

    # Each mapping line carries an ASCII code in its second column; keep the
    # corresponding character.
    with open(path + '/emnist-byclass-mapping.txt') as mapping_file:
        mapping = [chr(int(entry.split()[1])) for entry in mapping_file]

    # Convert one name at a time so each raw list is dropped right away.
    train_images = np.array(train_images)
    train_targets = np.array(train_targets)
    test_images = np.array(test_images)
    test_targets = np.array(test_targets)

    train_images = normalize(train_images)
    test_images = normalize(test_images)

    train_images = reshape_for_cnn(train_images)
    test_images = reshape_for_cnn(test_images)

    # One class per mapping entry.
    n_classes = len(mapping)
    train_targets = preprocess_labels(train_targets, n_classes)
    test_targets = preprocess_labels(test_targets, n_classes)

    return train_images, train_targets, test_images, test_targets, mapping
def getData():
    """Load the EMNIST "letters" split from ``data/`` and prepare it for Keras.

    Returns:
        ``(x_train, y_train, x_test, y_test)``: images divided by 255.0,
        shaped ``(samples, 28, 28, 1)`` and re-oriented; labels shifted down
        by one and one-hot encoded over 26 classes.

    Side effects:
        Prints a progress message and displays the first ten training images
        with matplotlib.
    """
    reader = MNIST('data')
    x_train, y_train = reader.load(
        'data/emnist-letters-train-images-idx3-ubyte',
        'data/emnist-letters-train-labels-idx1-ubyte')
    x_test, y_test = reader.load(
        'data/emnist-letters-test-images-idx3-ubyte',
        'data/emnist-letters-test-labels-idx1-ubyte')

    print("Finished loading data.\n")

    def orient(pixels):
        # Scale 0..255 down to 0..1.
        scaled = np.array(pixels) / 255.0
        # Keras layout: (samples, rows, columns, channels); a single channel
        # because the images are gray-scale.
        images = scaled.reshape(scaled.shape[0], 28, 28, 1)
        # Rotate and then mirror each image so it looks "normal" to a reader.
        rotated = np.rot90(images, axes=(1, 2), k=3)
        return rotated[:, :, ::-1, :]

    x_train = orient(x_train)
    x_test = orient(x_test)

    # The labels run from 1 to 26; shift them so they start at 0.
    y_train = np.array(y_train) - 1
    y_test = np.array(y_test) - 1

    # Preview of the first ten samples; comment this loop out once it has
    # been tried.
    for index in range(10):
        figure = plt.figure()
        preview = 255 * x_train[index].reshape(28, 28)
        plt.imshow(preview, interpolation="nearest", cmap="gray")
        # Label 0 is shown as 'A' (ASCII 65), label 1 as 'B', and so on.
        figure.suptitle(chr(y_train[index] + 65), fontsize=20)
        plt.show()

    # One-hot encode the labels. Background reading:
    # https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f
    y_train = np_utils.to_categorical(y_train, 26)
    y_test = np_utils.to_categorical(y_test, 26)

    return x_train, y_train, x_test, y_test
示例#5
0
def load_preprocess_EMNIST():
    """Load the EMNIST "byclass" split from ``emnist_data/`` for a tanh model.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: float32 image arrays computed
        as ``pixels / 127.5 - 1`` and labels one-hot encoded with
        ``to_categorical`` over ``n_label`` classes (``n_label`` comes from
        the enclosing module).
    """
    reader = MNIST('emnist_data')
    train_pixels, train_targets = reader.load(
        'emnist_data/emnist-byclass-train-images-idx3-ubyte',
        'emnist_data/emnist-byclass-train-labels-idx1-ubyte')
    test_pixels, test_targets = reader.load(
        'emnist_data/emnist-byclass-test-images-idx3-ubyte',
        'emnist_data/emnist-byclass-test-labels-idx1-ubyte')

    # Convert one name at a time so each raw list is released immediately.
    train_pixels = np.array(train_pixels)
    train_targets = np.array(train_targets)
    test_pixels = np.array(test_pixels)
    test_targets = np.array(test_targets)

    # Dividing by half of 255 and subtracting 1 centres the values around 0,
    # which is the scaling chosen for tanh.
    half_range = 255 / 2
    train_pixels = train_pixels.astype('float32') / half_range - 1
    test_pixels = test_pixels.astype('float32') / half_range - 1

    train_targets = to_categorical(train_targets, n_label)
    test_targets = to_categorical(test_targets, n_label)

    return train_pixels, train_targets, test_pixels, test_targets
def load_data(path):
    """Load the EMNIST "bymerge" split from ``path`` and preprocess it.

    Args:
        path: Directory containing the four ``emnist-bymerge-*`` idx files.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: image arrays passed through
        ``normalize`` and label arrays passed through ``preprocess_labels``
        with 47 classes.
    """
    # Class count handed to preprocess_labels for this split.
    n_classes = 47

    reader = MNIST(path)
    train_images, train_targets = reader.load(
        path + '/emnist-bymerge-train-images-idx3-ubyte',
        path + '/emnist-bymerge-train-labels-idx1-ubyte')
    test_images, test_targets = reader.load(
        path + '/emnist-bymerge-test-images-idx3-ubyte',
        path + '/emnist-bymerge-test-labels-idx1-ubyte')

    # Convert one name at a time so each raw list is released immediately.
    train_images = np.array(train_images)
    train_targets = np.array(train_targets)
    test_images = np.array(test_images)
    test_targets = np.array(test_targets)

    train_images = normalize(train_images)
    test_images = normalize(test_images)

    train_targets = preprocess_labels(train_targets, n_classes)
    test_targets = preprocess_labels(test_targets, n_classes)

    return train_images, train_targets, test_images, test_targets
示例#7
0
#import other libraries
import numpy as np
import matplotlib.pyplot as plt
from fg import freeze_graph
from mnist import MNIST

# Loader from the `mnist` package, created with the 'data' directory.
mndata = MNIST('data')

# Paths of the EMNIST "letters" idx files (images and labels, train and test).
EMNIST_TRAINING_IMAGES = "data/emnist-letters-train-images-idx3-ubyte"
EMNIST_TRAINING_LABELS = "data/emnist-letters-train-labels-idx1-ubyte"
EMNIST_TEST_IMAGES = "data/emnist-letters-test-images-idx3-ubyte"
EMNIST_TEST_LABELS = "data/emnist-letters-test-labels-idx1-ubyte"

# Load each split as an (images, labels) pair.
X_train, y_train = mndata.load(EMNIST_TRAINING_IMAGES, EMNIST_TRAINING_LABELS)
X_test, y_test = mndata.load(EMNIST_TEST_IMAGES, EMNIST_TEST_LABELS)

# Convert the data to numpy arrays.
# The images are also divided by 255.0 so that 0..255 pixel values fall in [0, 1].
X_train = np.array(X_train) / 255.0
y_train = np.array(y_train)
X_test = np.array(X_test) / 255.0
y_test = np.array(y_test)

# Reshape every image to 28x28 for pre-processing; the first axis stays the
# sample index.
X_train = X_train.reshape(X_train.shape[0], 28, 28)
X_test = X_test.reshape(X_test.shape[0], 28, 28)

#for train data
for t in range(112800):
示例#8
0
# Summed cross-entropy between the predictions Y and the targets Y_.
# Y, Y_, X, init and next_batch are not defined in this fragment; they must
# already exist when this code runs.
cross_entropy = -tf.reduce_sum(Y_ * tf.log(Y))

# Fraction of correct answers found in the batch: a prediction counts as
# correct when the argmax of Y matches the argmax of Y_.
is_correct = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Plain gradient descent with learning rate 0.003, minimizing the cross-entropy.
optimizer = tf.train.GradientDescentOptimizer(0.003)
train_step = optimizer.minimize(cross_entropy)

# Start the session (instantiate the network) and run the initializer.
sess = tf.Session()
sess.run(init)

# Load the data sets.
# NOTE(review): mn_train_data is read from the t10k-* files while
# mn_full_data is read from the train-* files; the names look swapped
# relative to the usual MNIST train/test naming - confirm which set is meant
# to feed the training loop below.
# NOTE(review): load() is called on the MNIST class, not on an instance;
# this only works if the installed loader exposes load() that way - confirm.
mn_full_data = MNIST.load('train-images.idx3-ubyte', 'train-labels.idx1-ubyte')
mn_train_data = MNIST.load('t10k-images.idx3-ubyte', 't10k-labels.idx1-ubyte')

idx = 0
for i in range(100):
    # Fetch the next batch of 100 images and correct answers; next_batch also
    # returns the updated position in the data set.
    batch_X, batch_Y, idx = next_batch(100, idx, mn_train_data)
    train_data = {X: batch_X, Y_: batch_Y}

    # One optimization step on this batch.
    sess.run(train_step, feed_dict=train_data)

# Measure accuracy and loss on the last training batch.
# Session.run takes the feed through the `feed_dict` keyword; the previous
# `feed=` keyword is not a parameter of run() and raised a TypeError.
a, c = sess.run([accuracy, cross_entropy], feed_dict=train_data)

#Run full test
from PIL import Image

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Import the modules
from sklearn.externals import joblib
from sklearn import datasets
from skimage.feature import hog
from sklearn.svm import LinearSVC
import numpy as np

# Create the loader for the EMNIST data directory and read the "letters"
# training split (images and labels). The directory is a hard-coded absolute
# path, so this script only runs where that path exists.
mndata = MNIST('C:/Users/manoj/Documents/py_workspace/ocr/EMNIST/data')

X_train, y_train = mndata.load('C:/Users/manoj/Documents/py_workspace/ocr/EMNIST/data/emnist-letters-train-images-idx3-ubyte', 
							'C:/Users/manoj/Documents/py_workspace/ocr/EMNIST/data/emnist-letters-train-labels-idx1-ubyte')


# The triple-quoted string below is never executed: it is a disabled
# alternative that would read the "letters" test split into the same names.
'''
X_train, y_train = mndata.load('C:/Users/manoj/Documents/py_workspace/ocr/EMNIST/data/emnist-letters-test-images-idx3-ubyte', 
							'C:/Users/manoj/Documents/py_workspace/ocr/EMNIST/data/emnist-letters-test-labels-idx1-ubyte')
'''
# Read the label mapping: the second whitespace-separated field of each line
# is an ASCII code, which is converted to its character.
print("Read mapping of the labels and convert ASCII values to chars")
mapping = []
with open('C:/Users/manoj/Documents/py_workspace/ocr/EMNIST/data/emnist-letters-mapping.txt') as f:
	for line in f:
		mapping.append(chr(int(line.split()[1])))
print("Convert data to numpy arrays and normalize images to the interval [0, 1]")
# Only the announcement and the sample count are printed here; the numpy
# conversion and normalization announced above are not part of this fragment.
print(len(X_train))
示例#10
0
    def load_data(self, get_first: bool = False):
        """Read the unpacked EMNIST files and return CNN-ready numpy arrays.

        Args:
            get_first (bool): When true, call ``self.get_data()`` before
                reading any file.

        Returns:
            tuple: ``(x_train, y_train, x_test, y_test, mapping)``. Images
            are float32 arrays divided by 255 and shaped
            ``(samples, 28, 28, 1)``; labels are one-hot encoded with one
            class per mapping entry; ``mapping`` is the list of characters
            read from the mapping file.
        """
        if get_first:
            self.get_data()

        # Selects the 'balanced' or 'byclass' file names.
        variant = 'balanced' if self.balanced else 'byclass'
        gzip_dir = os.path.join(self.data_path, 'gzip')

        train_images_path = os.path.join(
            gzip_dir, f'emnist-{variant}-train-images-idx3-ubyte')
        train_labels_path = os.path.join(
            gzip_dir, f'emnist-{variant}-train-labels-idx1-ubyte')
        test_images_path = os.path.join(
            gzip_dir, f'emnist-{variant}-test-images-idx3-ubyte')
        test_labels_path = os.path.join(
            gzip_dir, f'emnist-{variant}-test-labels-idx1-ubyte')

        # The second field of each mapping line is an ASCII code; keep the
        # character it stands for.
        mapping_path = os.path.join(gzip_dir, f'emnist-{variant}-mapping.txt')
        with open(mapping_path, mode='r') as mapping_file:
            mapping = [chr(int(row.split()[1])) for row in mapping_file]

        reader = MNIST(gzip_dir)
        reader.gz = True

        print("\n[~] Loading dataset...")

        x_train, y_train = reader.load(train_images_path, train_labels_path)
        x_test, y_test = reader.load(test_images_path, test_labels_path)

        # Convert one name at a time so each raw container is released early.
        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        # Scale values from the interval [0, 255] to [0, 1].
        x_train = x_train.astype('float32') / 255
        x_test = x_test.astype('float32') / 255

        # Image layout used by the CNN: (samples, 28, 28, 1).
        x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
        x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)

        # One-hot encode the labels, one class per mapping entry.
        class_count = len(mapping)
        y_train = tf.keras.utils.to_categorical(y_train, class_count)
        y_test = tf.keras.utils.to_categorical(y_test, class_count)

        print("[!] Dataset loaded!")

        return x_train, y_train, x_test, y_test, mapping