def load(self): """ Downloads the dataset following the specifification of the instantiated instance """ # donwload proper dataset if self.onlyDigits: logging.info('Downloading digits from EMNIST repository') X, y = extract_test_samples('digits') if self.kerasDB: logging.info( 'Downloading the Original MNIST dataset from TensorFlow Datasets' ) (X_train, y_train), (X_test, y_test) = mnist.load_data() # concatenate train and test together # we want to perform the training with the highest number of training instances X = np.concatenate((X_train, X_test)) y = np.concatenate((y_train, y_test)) else: logging.info( 'Downloading letters and digits from EMNIST repository') X, y = extract_test_samples('balanced') # save downloded data in the dataset folder as numpy array np.save(os.path.join(self.dirData, 'data.npy'), X) np.save(os.path.join(self.dirData, 'label.npy'), y) if self.verbose: self.printInfo(y) return X, y
def load_dataset():
    """Load EMNIST 'letters' and build a few-shot training subset.

    The training split is shuffled and at most ``shot`` (300) samples are
    kept per label. Images are reshaped to ``(N, 28, 28, 1)``.

    Returns:
        tuple: ``(tX, tY, testX, testY)`` where ``tX`` is a Python list of
        ``(28, 28, 1)`` training images, ``tY`` the one-hot encoding of the
        kept training labels, ``testX`` the reshaped test images and
        ``testY`` the one-hot encoded test labels.
    """
    shot = 300

    # load dataset
    trainX, trainY = em.extract_training_samples('letters')
    trainX, trainY = shuffle(trainX, trainY)
    testX, testY = em.extract_test_samples('letters')

    # reshape dataset to have a single channel
    trainX = trainX.reshape((trainX.shape[0], 28, 28, 1))
    testX = testX.reshape((testX.shape[0], 28, 28, 1))

    # one hot encode target values (training labels are encoded after
    # sub-sampling, because the raw label is used as a list index below)
    testY = to_categorical(testY)

    # One "remaining quota" counter per label value; 27 slots as in the
    # original, labels index the list directly.
    remaining = [shot] * 27
    tX = []
    tY = []
    for image, label in zip(trainX, trainY):
        # Check the quota BEFORE consuming it: decrementing first (as the
        # previous version did) kept only ``shot - 1`` samples per class.
        if remaining[label] > 0:
            remaining[label] -= 1
            tX.append(image)
            tY.append(label)

    print("tX.shape", len(tX))
    tY = to_categorical(tY)
    return tX, tY, testX, testY
def loadTestData(self, category='letters'):
    """
    Fetch the EMNIST test split named by ``category`` and return it after
    passing images and labels through ``reformat``.
    """
    samples = emnist.extract_test_samples(category)
    images, labels = samples
    return reformat(images, labels)
def train(mode, dataset):
    """Train a Keras model on an EMNIST split and save it to ``./<dataset>_<name>.h5``.

    Args:
        mode: Pair ``(name, config)``. ``config["reshape"]`` toggles the
            reshape of the images to ``(N, 28, 28, 1)``;
            ``config["architecture"]`` is an iterable of layers added, in
            order, to a ``keras.Sequential`` model.
        dataset: EMNIST split name passed to the ``emnist`` extractors; also
            used as the prefix of the saved model file.

    Returns:
        None. The trained model is written to disk and then evaluated on the
        test split (Keras reports the metrics on stdout).
    """
    import gc

    import numpy as np
    import tensorflow as tf
    from emnist import extract_training_samples, extract_test_samples
    from numpy.random import seed
    from tensorflow import keras

    name, config = mode[0], mode[1]

    # Fixed seeds for reproducibility. ``tf.set_random_seed`` only exists in
    # TensorFlow 1.x; TensorFlow 2.x exposes ``tf.random.set_seed`` instead.
    seed(4)
    if hasattr(tf, "set_random_seed"):
        tf.set_random_seed(4)
    else:
        tf.random.set_seed(4)

    train_images, train_labels = extract_training_samples(dataset)
    test_images, test_labels = extract_test_samples(dataset)

    train_labels = keras.utils.to_categorical(train_labels)
    test_labels = keras.utils.to_categorical(test_labels)

    if config["reshape"]:
        # Reshaping the array to 4-dims so that it can work with the Keras API
        # The last number is 1, which signifies that the images are greyscale.
        train_images = np.reshape(train_images, (train_images.shape[0], 28, 28, 1))
        test_images = np.reshape(test_images, (test_images.shape[0], 28, 28, 1))

    train_images = keras.utils.normalize(train_images, axis=1)
    test_images = keras.utils.normalize(test_images, axis=1)

    model = keras.Sequential()
    for layer in config["architecture"]:
        model.add(layer)

    es = keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=2)
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    # ``summary()`` prints by itself and returns None, so it is not wrapped in
    # ``print`` (that only appended a stray "None" line).
    model.summary()

    model.fit(x=train_images,
              y=train_labels,
              epochs=100,
              validation_split=0.1,
              callbacks=[es])

    model_name = dataset + "_" + name
    model.save("./" + model_name + ".h5")
    print("saved model to " + model_name + ".h5")

    print("evaluating...")
    model.evaluate(x=test_images, y=test_labels)

    # Drop the large arrays and force a collection before returning, as the
    # original did, so repeated calls do not accumulate memory.
    del train_images, train_labels, test_images, test_labels
    gc.collect()
def load_data() -> tuple:
    """Return all EMNIST 'letters' samples (train followed by test).

    Images get a trailing axis of size 1 appended; labels are shifted down
    by one.
    """
    train_split = extract_training_samples("letters")
    test_split = extract_test_samples("letters")

    stacked_images = np.concatenate((train_split[0], test_split[0]))
    stacked_labels = np.concatenate((train_split[1], test_split[1]))

    return stacked_images[..., np.newaxis], stacked_labels - 1
def load_emnist_balanced():
    """
    Load EMNIST Balanced

    :return: training inputs, training outputs, test inputs, test outputs,
        number of distinct labels found in the training outputs
    """
    split = 'balanced'
    training_images, training_labels = emnist.extract_training_samples(split)
    test_images, test_labels = emnist.extract_test_samples(split)
    class_count = len(set(training_labels))
    return (training_images, training_labels, test_images, test_labels,
            class_count)
def load_emnist_letters():
    """
    Load EMNIST Letters

    :return: training inputs, training outputs, test inputs, test outputs,
        number of distinct labels found in the training outputs
    """
    split = 'letters'
    training_images, training_labels = emnist.extract_training_samples(split)
    test_images, test_labels = emnist.extract_test_samples(split)
    class_count = len(set(training_labels))
    return (training_images, training_labels, test_images, test_labels,
            class_count)
def save_emnist_uppercase_reduced_letters64_dataset():
    """Build a 64x64 letters dataset from EMNIST 'byclass' and pickle it.

    Samples are selected with ``emnsit_uppercase_label_filter``, resized to
    64x64 (nearest neighbour), their labels shifted down by 10 and passed
    through ``replace_x_letter_label``. 10% of the training samples are split
    off as a validation set. Images are divided by 255, stored as float64 and
    given a trailing channel axis. The result is written to
    ``eng_uppercase_letters64_dataset.bin`` as a dict with the keys
    ``x_train``, ``y_train``, ``x_val``, ``y_val``, ``x_test``, ``y_test``.
    """

    def _select_and_resize(images, labels):
        # Filter one split, upscale its images and remap its labels.
        mask = emnsit_uppercase_label_filter(labels)
        resized = [
            cv2.resize(img, (64, 64), interpolation=cv2.INTER_NEAREST)
            for img in images[mask]
        ]
        kept = labels[mask]
        # shift to 0 label
        kept -= 10
        return resized, replace_x_letter_label(kept)

    def _to_model_input(images):
        # Scale by 1/255 as float64 and append the single channel axis.
        scaled = np.divide(images, 255).astype("float64")
        return scaled.reshape(scaled.shape + (1,))

    x_train, y_train = emnist.extract_training_samples('byclass')
    x_test, y_test = emnist.extract_test_samples('byclass')

    x_train_reduced, y_train_reduced = _select_and_resize(x_train, y_train)
    x_test_reduced, y_test_reduced = _select_and_resize(x_test, y_test)

    x_train_reduced, x_val_reduced, y_train_reduced, y_val_reduced = train_test_split(
        x_train_reduced, y_train_reduced, test_size=0.1)

    # All three splits get the same layout. The flattened source leaves it
    # unclear whether the x_train reshape was active, so it is applied
    # explicitly to keep train/val/test consistent.
    letters_dataset = {
        "x_train": _to_model_input(x_train_reduced),
        "y_train": y_train_reduced,
        "x_val": _to_model_input(x_val_reduced),
        "y_val": y_val_reduced,
        "x_test": _to_model_input(x_test_reduced),
        "y_test": y_test_reduced
    }

    with open("eng_uppercase_letters64_dataset.bin", "wb") as file:
        pickle.dump(letters_dataset, file)
def load_dataset():
    """Load EMNIST 'letters' for training.

    Returns ``(trainX, trainY, testX, testY)``: images reshaped to
    ``(N, 28, 28, 1)`` and labels one-hot encoded via ``to_categorical``.
    """
    raw_train_images, raw_train_labels = extract_training_samples('letters')
    raw_test_images, raw_test_labels = extract_test_samples('letters')

    # add the single channel axis
    trainX = raw_train_images.reshape((raw_train_images.shape[0], 28, 28, 1))
    testX = raw_test_images.reshape((raw_test_images.shape[0], 28, 28, 1))

    return (trainX, to_categorical(raw_train_labels),
            testX, to_categorical(raw_test_labels))
def preprocess():
    """Load the EMNIST 'mnist' split as float32 arrays of shape (N, 1, 28, 28).

    Images are divided by 255. Only the training labels are passed through
    ``one_hot`` (10 classes); the test labels are returned as extracted.

    Returns:
        ``((train_images, train_labels), (test_images, test_labels))``
    """

    def _prepare(images):
        # (N, 1, 28, 28) float32, divided by 255
        batch = images.reshape((images.shape[0], 1, 28, 28)).astype(np.float32)
        batch /= 255
        return batch

    raw_train, raw_train_labels = emnist.extract_training_samples('mnist')
    train_images = _prepare(raw_train)
    label_column = raw_train_labels.reshape(raw_train_labels.shape[0], 1)
    train_labels = one_hot(label_column, 10)

    raw_test, test_labels = emnist.extract_test_samples('mnist')
    test_images = _prepare(raw_test)

    return (train_images, train_labels), (test_images, test_labels)
def saveDataSet(dataSetType):
    """Extract an EMNIST split, reshape it with ``manageDataSet`` and save it.

    Args:
        dataSetType: ``'digits'`` or ``'letters'``. Any other value is
            ignored and nothing is written.

    Writes four ``.npy`` files in the working directory:
    ``images_<tag>_{train,test}.npy`` and ``labels_<tag>_{train,test}.npy``,
    where ``<tag>`` is ``numbers`` for digits and ``letters`` for letters.
    """
    # (EMNIST split name, tag used in the output file names)
    for split, tag in (('digits', 'numbers'), ('letters', 'letters')):
        if dataSetType != split:
            continue

        # Extract Dataset
        print('Extraction Dataset')
        X_train, y_train = extract_training_samples(split)
        X_test, y_test = extract_test_samples(split)

        # Reshape Dataset
        print('Reshaping Dataset ')
        images_train, labels_train = manageDataSet(len(y_train), X_train, y_train)
        images_test, labels_test = manageDataSet(len(y_test), X_test, y_test)

        # Save the Dataset (the letters branch used to announce this step as
        # 'Extraction Dataset', a copy-paste slip)
        print('Saving Dataset')
        save("images_" + tag + "_train.npy", images_train)
        save("labels_" + tag + "_train.npy", labels_train)
        save("images_" + tag + "_test.npy", images_test)
        save("labels_" + tag + "_test.npy", labels_test)
def loadEmnist(self):
    """
    Load Emnist dataset and do some data pre-processing

    Keeps the samples of the 'balanced' split whose label is in 10..35 and
    shifts those labels to 0..25.
    Splits the training set 90/10% into training and validation sets.
    Scales images to float32 divided by 255 with a trailing channel axis.
    Converts y labels to 1-k hot arrays (26 classes).

    Sets: nclass, width, height, total_train_size, ntrain, nval, ntest,
    train_counter, train_index, x_train, x_val, x_test, y_train, y_val,
    y_test.
    """
    x_train, y_train = extract_training_samples('balanced')
    x_test, y_test = extract_test_samples('balanced')

    # Get only the upper case letters
    train_alphabet_list = (np.array(y_train) < 36) & (np.array(y_train) > 9)
    test_alphabet_list = (np.array(y_test) < 36) & (np.array(y_test) > 9)

    y_train = y_train[train_alphabet_list] - 10
    x_train = x_train[train_alphabet_list]
    y_test = y_test[test_alphabet_list] - 10
    x_test = x_test[test_alphabet_list]

    self.nclass = 26
    self.width = x_train.shape[1]
    self.height = x_train.shape[2]
    self.total_train_size = len(x_train)
    self.ntrain = int(0.9 * self.total_train_size)
    # Whatever is left after the training slice is the validation set, so the
    # count always equals len(self.x_val) (two separate int() truncations of
    # 0.9*n and 0.1*n need not add up to n).
    self.nval = self.total_train_size - self.ntrain
    self.ntest = len(x_test)
    self.train_counter = 0
    self.train_index = np.arange(self.ntrain)

    x_train = x_train.reshape(x_train.shape[0], self.width, self.height, 1)
    x_test = x_test.reshape(x_test.shape[0], self.width, self.height, 1)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    self.x_test = x_test / 255

    self.x_val = x_train[self.ntrain:self.total_train_size]
    self.x_train = x_train[0:self.ntrain]
    y_val = y_train[self.ntrain:self.total_train_size]
    y_train = y_train[0:self.ntrain]

    # convert class vectors to binary class matrices
    self.y_train = keras.utils.to_categorical(y_train, 26)
    self.y_val = keras.utils.to_categorical(y_val, 26)
    self.y_test = keras.utils.to_categorical(y_test, 26)

    print(self.x_train.shape)
    print(self.x_val.shape)
    print(self.x_test.shape)
def __init__(self,
             number_of_authors,
             number_of_pixels=4,
             poisoned_ratio=0.2,
             backdoor_value=1,
             initial_shuffle=True,
             seed=None):
    """Build the dataset from all EMNIST 'digits' samples (train + test).

    Samples are assigned to ``number_of_authors`` authors in consecutive,
    equally sized runs; samples that do not fill a complete run are dropped.
    Images are flattened to 784 values and divided by 255 before everything
    is handed to the parent constructor with ``number_of_classes=10``.
    """
    train_part = emnist.extract_training_samples('digits')
    test_part = emnist.extract_test_samples('digits')
    X = np.concatenate((train_part[0], test_part[0]))
    y = np.concatenate((train_part[1], test_part[1]))

    # IMPORTANT:
    # the per-author class distribution is NOT balanced (digit counts may
    # differ between authors), but every author owns the same number of
    # samples.
    samples_per_author = len(X) // number_of_authors
    author = np.repeat(np.arange(number_of_authors), samples_per_author)

    # drop the tail that cannot be distributed evenly
    kept = len(author)
    skip_at_end = len(X) - kept
    assert skip_at_end < samples_per_author, "Why do you throw so many samples away?"
    if skip_at_end > 0:
        print(
            f"Warning: throwing {skip_at_end} samples away to have balanced number of samples per author"
        )
    X, y = X[:kept], y[:kept]

    # flatten every image into one row of 784 pixels
    print(X.shape)
    X = X.reshape((len(X), 784))
    print(X.shape)

    # scale pixel values (no binarisation)
    X = X / 255

    super(PoisonedDataset_EMNIST_DIGITS, self).__init__(
        X,
        y,
        author,
        number_of_classes=10,
        number_of_pixels=number_of_pixels,
        poisoned_ratio=poisoned_ratio,
        backdoor_value=backdoor_value,
        initial_shuffle=initial_shuffle,
        seed=seed)
def load_data():
    """Return EMNIST 'byclass' samples, train and test merged.

    Both splits are passed through ``remove_upper`` before being stacked
    (images with ``np.vstack``, labels with ``np.hstack``).

    Returns:
        ``(X, labels)``
    """
    # numbers and letters from EMNIST, each split filtered by remove_upper
    train_images, train_targets = remove_upper(
        *extract_training_samples('byclass'))
    test_images, test_targets = remove_upper(
        *extract_test_samples('byclass'))

    merged_images = np.vstack((train_images, test_images))
    merged_targets = np.hstack((train_targets, test_targets))
    return merged_images, merged_targets
def load_data():
    """Load EMNIST 'byclass', filter both splits with ``remove_upper``.

    Images are reshaped to ``(N, 28, 28, 1)``.

    Returns:
        ``(X_train, X_test, train_labels, test_labels)``
    """
    X_train, train_labels = extract_training_samples('byclass')
    X_test, test_labels = extract_test_samples('byclass')

    X_train, train_labels = remove_upper(X_train, train_labels)
    X_test, test_labels = remove_upper(X_test, test_labels)

    # The previous version also built an unused character table
    # ('0'-'9' + 'a'-'z') and its length here; neither was read, so both are
    # gone.
    X_train = X_train.reshape(-1, 28, 28, 1)
    X_test = X_test.reshape(-1, 28, 28, 1)

    return X_train, X_test, train_labels, test_labels
def save_emnist_reduced_letters_dataset():
    """Build a reduced letters dataset from EMNIST 'letters' and pickle it.

    Labels are shifted down by one, samples are selected with
    ``label_filter`` and their labels passed through
    ``replace_x_letter_label``. 10% of the training samples are split off as
    a validation set. Images are divided by 255, stored as float64 and given
    a trailing channel axis. The result is written to
    ``eng_letters_dataset.bin`` as a dict with the keys ``x_train``,
    ``y_train``, ``x_val``, ``y_val``, ``x_test``, ``y_test``.
    """

    def _select(images, labels):
        # Filter one split and remap the labels of the kept samples.
        mask = label_filter(labels)
        return images[mask], replace_x_letter_label(labels[mask])

    def _to_model_input(images):
        # Scale by 1/255 as float64 and append the single channel axis.
        scaled = np.divide(images, 255).astype("float64")
        return scaled.reshape(scaled.shape + (1,))

    x_train, y_train = emnist.extract_training_samples('letters')
    x_test, y_test = emnist.extract_test_samples('letters')

    # Shift the labels into the range [0..25]
    y_train = np.subtract(y_train, 1)
    y_test = np.subtract(y_test, 1)

    x_train_reduced, y_train_reduced = _select(x_train, y_train)
    x_test_reduced, y_test_reduced = _select(x_test, y_test)

    x_train_reduced, x_val_reduced, y_train_reduced, y_val_reduced = train_test_split(
        x_train_reduced, y_train_reduced, test_size=0.1)

    # All three splits get the same layout. The flattened source leaves it
    # unclear whether the x_train reshape was active, so it is applied
    # explicitly to keep train/val/test consistent.
    letters_dataset = {
        "x_train": _to_model_input(x_train_reduced),
        "y_train": y_train_reduced,
        "x_val": _to_model_input(x_val_reduced),
        "y_val": y_val_reduced,
        "x_test": _to_model_input(x_test_reduced),
        "y_test": y_test_reduced
    }

    with open("eng_letters_dataset.bin", "wb") as file:
        pickle.dump(letters_dataset, file)
def get_data(self, s0):
    """Load EMNIST 'byclass' into the instance and carve out a substitute set.

    Images become float32 arrays divided by 255 with shape (N, 28, 28, 1);
    labels are encoded with ``oneHotEncodeY`` (62 classes) and cast to
    float32. The first ``s0`` test samples are stored as ``x_sub`` / ``y_sub``
    (``y_sub`` holds class indices via argmax) and removed from
    ``x_test`` / ``y_test``.
    """
    num_classes = 62

    raw_train_x, raw_train_y = extract_training_samples('byclass')
    raw_test_x, raw_test_y = extract_test_samples('byclass')

    def _images(raw):
        scaled = raw.astype('float32') / 255.
        return np.reshape(scaled, (scaled.shape[0], 28, 28, 1))

    test_targets = oneHotEncodeY(raw_test_y, num_classes)
    train_targets = oneHotEncodeY(raw_train_y, num_classes)

    self.x_train = _images(raw_train_x)
    self.y_train = train_targets.astype('float32')
    test_inputs = _images(raw_test_x)
    test_targets = test_targets.astype('float32')

    self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4]
    self.nb_classes = num_classes

    # first s0 test samples form the substitute set, the rest stays as test
    self.x_sub = test_inputs[:s0]
    self.y_sub = np.argmax(test_targets[:s0], axis=1)
    self.x_test = test_inputs[s0:]
    self.y_test = test_targets[s0:]
def load_data(self):
    """
    Load the EMNIST 'digits' split through the emnist package.

    Labels are one-hot encoded (10 columns), then ``self.shuffle()`` is
    called.

    # Returns:
        all_data : the value of ``self.data``
    """
    identity = np.eye(10)

    train_images, train_digits = emnist.extract_training_samples('digits')
    self._train_data = train_images
    self._train_labels = identity[train_digits]

    test_images, test_digits = emnist.extract_test_samples('digits')
    self._test_data = test_images
    self._test_labels = identity[test_digits]

    self.shuffle()
    return self.data
def get_data(experiment, occlusion=None, bars_type=None, one_hot=False):
    """Return all EMNIST 'balanced' samples (train followed by test).

    Args:
        experiment, occlusion, bars_type: forwarded unchanged to
            ``add_noise`` together with the image array.
        one_hot: when true, labels are converted with ``to_categorical``.

    Returns:
        tuple: ``(all_data, all_labels)``. ``all_data`` is float32, divided
        by 255, with shape
        ``(n_samples, img_columns, img_rows, constants.colors)``.
    """
    # Load EMNIST data through the ``emnist`` package.
    train_images, train_labels = emnist.extract_training_samples('balanced')
    test_images, test_labels = emnist.extract_test_samples('balanced')

    all_data = np.concatenate((train_images, test_images), axis=0)
    all_labels = np.concatenate((train_labels, test_labels), axis=0)

    # Disabled variant kept for reference (for table 1 and experiment 2):
    # merge labels 36..46 into other classes before use:
    #   36->10, 37->11, 38->13, 39->14, 40->15, 41->16,
    #   42->17, 43->23, 44->26, 45->27, 46->29
    # When loading the 'letters' split instead, shift labels to 0-based with
    # ``all_labels = all_labels - 1``.

    all_data = add_noise(all_data, experiment, occlusion, bars_type)

    # The sample count comes from the labels instead of the literal 131600,
    # so the reshape still fails loudly if add_noise changes the number of
    # samples but no longer breaks when a different split is loaded.
    all_data = all_data.reshape(
        (len(all_labels), img_columns, img_rows, constants.colors))
    all_data = all_data.astype('float32') / 255

    if one_hot:
        # Changes labels to binary rows. Each label correspond to a column, and only
        # the column for the corresponding label is set to one.
        all_labels = to_categorical(all_labels)

    return (all_data, all_labels)
def test():
    """Run the global network ``nn`` over the EMNIST 'letters' test split.

    Every flattened image is passed through ``norm`` and ``nn.guess``; the
    index from ``whatIndex`` plus one is compared with the label. Label and
    guess are printed for each sample.

    Returns:
        Percentage of test samples guessed correctly.
    """
    testData, testLabels = emnist.extract_test_samples('letters')
    total = testData.shape[0]

    correct = 0
    for x in range(total):
        expected = testLabels[x]
        inputs = norm(np.ndarray.flatten(testData[x]))
        print("the number was: " + str(expected))
        guess = whatIndex(nn.guess(inputs)) + 1
        print("it guessed it was :" + str(guess))
        correct += 1 if guess == expected else 0

    return (correct / total * 100)
def load_data(plot=True):
    """Load EMNIST 'letters' with every image flattened to one row.

    Args:
        plot: when true, show a 5x5 grid of 25 randomly chosen training
            images, each titled via the module-level ``letters`` lookup.

    Returns:
        ``(flat_train_images, flat_test_images, labels_train, labels_test)``
    """
    images_train, labels_train = extract_training_samples('letters')
    images_test, labels_test = extract_test_samples('letters')

    if plot:
        # randomly plot 25 letters
        f, axarr = plt.subplots(5, 5)
        indices = random.sample(range(labels_train.shape[0]), 25)
        # axarr.flat walks the grid row by row, matching the picked indices
        for cell, idx in zip(axarr.flat, indices):
            cell.imshow(images_train[idx], cmap="gray")
            cell.set_title(f"{letters[labels_train[idx] - 1]}")
        plt.show()

    # collapse the last two dimensions: (N, H, W) -> (N, H * W)
    n_train, train_h, train_w = images_train.shape[:3]
    n_test, test_h, test_w = images_test.shape[:3]
    flat_train = images_train.reshape((n_train, train_h * train_w))
    flat_test = images_test.reshape((n_test, test_h * test_w))
    return flat_train, flat_test, labels_train, labels_test
def test(networkFilePath):
    """Score a saved network on the EMNIST 'letters' test split.

    Args:
        networkFilePath: path handed to ``NeuralNetwork.loadFrom``.

    For every test image the expected label (shifted down by one) and the
    network's answer (argmax of ``query``) are printed; finally the full
    scorecard, the totals and the accuracy ratio are printed.
    """
    n = NN.NeuralNetwork()
    n.loadFrom(networkFilePath)

    test_images, test_labels = emnist.extract_test_samples("letters")

    scorecard = []
    for image, raw_label in zip(test_images, test_labels):
        correctLabel = raw_label - 1
        print("Corret label is", correctLabel)

        # Rescale pixels from 0..255 into 0.01..1.00.
        # ``np.asfarray`` was removed in NumPy 2.0; ``np.asarray`` with an
        # explicit float64 dtype is the equivalent call.
        pixels = np.asarray(image.flatten(), dtype=np.float64)
        inputs = (pixels / 255 * 0.99) + 0.01

        outputs = n.query(inputs)
        label = np.argmax(outputs)
        print("Network respone is", label)
        print()

        scorecard.append(correctLabel == label)

    print("Report: ", scorecard)
    print("Total:", len(scorecard))
    print("Correct:", sum(scorecard))
    print(sum(scorecard) / len(scorecard))
def load_mnist_data(type='channel_last'):
    """Load EMNIST 'letters' as float32 images and one-hot labels.

    Args:
        type: ``'channel_first'`` gives images of shape ``(N, 1, 28, 28)``;
            any other value gives ``(N, 28, 28, 1)``.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)`` where the labels are shifted
        down by one and one-hot encoded with 26 classes.
    """
    from emnist import extract_training_samples, extract_test_samples
    from keras.utils import np_utils

    nb_classes = 26
    # input image dimensions
    img_rows, img_cols = 28, 28

    X_train, Y_1 = extract_training_samples('letters')
    X_test, Y_2 = extract_test_samples('letters')

    # Vectorised label shift (replaces two per-element Python loops). The
    # cast to a signed integer type comes first so the subtraction cannot
    # wrap around if the labels arrive in an unsigned dtype.
    y_train = np.asarray(Y_1).astype('int64') - 1
    y_test = np.asarray(Y_2).astype('int64') - 1

    if type == 'channel_first':
        image_shape = (1, img_rows, img_cols)
    else:
        image_shape = (img_rows, img_cols, 1)
    X_train = X_train.reshape((X_train.shape[0],) + image_shape)
    X_test = X_test.reshape((X_test.shape[0],) + image_shape)

    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # convert class vectors to binary class matrices
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    return X_train, Y_train, X_test, Y_test
def main():
    """Train a small network on 10000 EMNIST digits and test it on 500.

    Inputs are the 784 pixel values of each image divided by 255. Targets
    are 10-element lists holding 0.99 at the label position and 0.01
    elsewhere. The number of correct guesses on the first 500 test images is
    printed at the end.
    """
    np.set_printoptions(suppress=True)

    # prepare training and testing datasets
    training_images, training_labels = extract_training_samples('digits')
    test_images, test_labels = extract_test_samples('digits')
    training_images = training_images[0:10000]
    training_labels = training_labels[0:10000]

    def as_input(image):
        # 784 pixel values scaled by 1/255, as a plain Python list
        return (image.flatten().reshape(784) / 255.0).tolist()

    tr_i = [as_input(image) for image in training_images]

    tr_o = []
    for label in training_labels:
        target = [0.01] * 10
        target[label] = 0.99
        tr_o.append(target)

    # initialize and train the network
    nn = NeuralNetwork(784, [16, 16], 10)
    nn.train(tr_i, tr_o, 1000)

    # gauge performance
    correct = 0
    for test_image, test_label in zip(test_images[0:500], test_labels[0:500]):
        # Test images go through the same 1/255 scaling as the training
        # images; they used to be fed in raw (0..255), which did not match
        # what the network was trained on.
        result = nn.feed_forward(as_input(test_image))
        print("network result:\n", result)

        # pick the output with the highest activation; stays -1 when no
        # output is greater than 0
        best = 0
        guess = -1
        for i, res in enumerate(result):
            if res > best:
                best = res
                guess = i

        print('network thinks this is a: ', guess)
        print("real answer:", test_label)
        if guess == int(test_label):
            correct += 1

    print('network was correct on ', correct, '/', 500, 'images')
def build_dataset_manual(dataset_name, opts):
    """
    Build one split (train, valid or test) of MNIST or EMNIST 'bymerge'
    according to the given options.

    ``opts`` is expanded into ``MnistDataOptions``; its ``split``,
    ``rotate_degs`` and ``roll_pixels`` fields are read here. Returns
    ``(images, labels)`` with images divided by 255 and carrying a trailing
    axis of size 1.

    Raises ValueError for an unsupported dataset name or split.
    """
    opts = MnistDataOptions(**opts)
    logging.info('Building dataset with options: %s', opts)

    # Load train and test data
    if dataset_name == 'MNIST':
        train, test = tf.keras.datasets.mnist.load_data()
    elif dataset_name == 'EMNIST':
        train = extract_training_samples('bymerge')
        test = extract_test_samples('bymerge')
    else:
        raise ValueError('Dataset is not supported!')

    # Only the test split is really needed; train/valid are fixed index
    # windows into the training data.
    split = opts.split
    if split == 'test':
        images, labels = test
    elif split == 'train':
        images, labels = train[0][0:50000], train[1][0:50000]
    elif split == 'valid':
        images, labels = train[0][50000:60000], train[1][50000:60000]
    else:
        raise ValueError('opts.split is not valid!')

    # Change images size
    if dataset_name in ('MNIST', 'EMNIST'):
        images = np.expand_dims(images, -1)

    images = images / 255

    rotation = opts.rotate_degs
    if rotation:
        images = scipy.ndimage.rotate(images, rotation, axes=[-2, -3])
        images = _crop_center(images, 28)

    shift = opts.roll_pixels
    if shift:
        images = np.roll(images, shift, axis=-2)

    return images, labels
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already.

    Steps:
      1. Return early when ``self._check_exists()`` is true.
      2. Create the raw and processed folders under ``self.root``.
      3. Download and gunzip every URL in ``self.urls`` into the raw folder.
      4. Read the train/test label and image files with ``read_label_file``
         / ``read_image_file`` using ``self.few_shot_class``.
      5. When ``self.test_emnist`` is set, build the test set from the
         EMNIST 'letters' test samples whose label is below 10 (shuffled
         with NumPy seed 10) instead of the MNIST test files.
      6. Truncate the test set to ``self.max_test_sample`` when that is set.
      7. Save ``(images, labels)`` tuples with ``torch.save`` to
         ``self.training_file`` and ``self.test_file``.
    """
    from six.moves import urllib
    import gzip

    print("download: trying to download")
    if self._check_exists():
        print("download: already exists so exiting")
        return

    # Create each folder on its own: with a single shared try-block, an
    # already existing raw folder made the processed folder creation be
    # skipped.
    for folder in (self.raw_folder, self.processed_folder):
        try:
            os.makedirs(os.path.join(self.root, folder))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # download files
    for url in self.urls:
        print('Downloading ' + url)
        response = urllib.request.urlopen(url)
        try:
            payload = response.read()
        finally:
            response.close()
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(payload)
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')
    raw_dir = os.path.join(self.root, self.raw_folder)

    train_label, train_non_few_shot_ids, _ = read_label_file(
        os.path.join(raw_dir, 'train-labels-idx1-ubyte'),
        self.few_shot_class)
    train_img = read_image_file(
        os.path.join(raw_dir, 'train-images-idx3-ubyte'),
        non_few_shot_ids=train_non_few_shot_ids)
    training_set = (train_img, train_label)

    test_label, _, test_few_shot_ids = read_label_file(
        os.path.join(raw_dir, 't10k-labels-idx1-ubyte'),
        self.few_shot_class)
    test_img = read_image_file(
        os.path.join(raw_dir, 't10k-images-idx3-ubyte'),
        few_shot_ids=test_few_shot_ids)

    if self.test_emnist:
        print("Download: Entering Emnist test")
        from emnist import extract_test_samples
        images, labels = extract_test_samples('letters')
        print(images.shape)
        print(labels.shape)

        # keep the samples whose label is below 10, in a seeded random order
        test_sample_ids = np.where(labels < 10)[0]
        np.random.seed(10)
        np.random.shuffle(test_sample_ids)
        print('test_sample_ids_len', len(test_sample_ids))

        labels = labels[test_sample_ids]
        images = images[test_sample_ids]
        print("After selecting one class")
        print(images.shape)
        print(labels.shape)

        if self.max_test_sample:
            images = images[:self.max_test_sample]
            labels = labels[:self.max_test_sample]
        # An (images, labels) tuple like training_set and the MNIST test set.
        # The previous ``{...}`` literal built an unordered *set* of two
        # tensors.
        test_set = (torch.ByteTensor(list(images)).view(-1, 28, 28),
                    torch.LongTensor(list(labels)))
    else:
        if self.max_test_sample:
            print('testing max test sample')
            test_set = (test_img[:self.max_test_sample],
                        test_label[:self.max_test_sample])
        else:
            test_set = (test_img, test_label)

    print('confirming test size')

    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(
            os.path.join(self.root, self.processed_folder, self.test_file),
            'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):
    """Build train / validation / test ``DataSet`` objects from EMNIST 'letters'.

    With ``fake_data`` set, three empty fake ``DataSet`` objects are returned.
    Otherwise the samples come from the ``emnist`` extractors, labels are
    always converted with ``makes_one_hot_vectors``, and the first
    ``validation_size`` training samples become the validation set.

    ``train_dir`` is not read, and ``one_hot`` only reaches the fake
    datasets.

    Raises:
        ValueError: if ``validation_size`` is outside
            ``0..len(train_images)``.
    """
    if fake_data:
        train, validation, test = (
            DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
            for _ in range(3))
        return base.Datasets(train=train, validation=validation, test=test)

    train_images, raw_train_labels = extract_training_samples('letters')
    test_images, raw_test_labels = extract_test_samples('letters')
    train_labels = makes_one_hot_vectors(raw_train_labels)
    test_labels = makes_one_hot_vectors(raw_test_labels)

    available = len(train_images)
    if not 0 <= validation_size <= available:
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                available, validation_size))

    def _dataset(images, labels):
        return DataSet(images, labels, dtype=dtype, reshape=reshape)

    # the head of the training data is held out for validation
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train = _dataset(train_images[validation_size:],
                     train_labels[validation_size:])
    validation = _dataset(validation_images, validation_labels)
    test = _dataset(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
# from sklearn.preprocessing import MinMaxScaler from tensorflow import keras from tensorflow.keras.utils import to_categorical from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D # from keras.layers.convolutional import Conv2D # from keras.layers.convolutional import MaxPooling2D # from keras import backend as K # from tensorflow.keras.datasets import mnist # from sklearn.model_selection import train_test_split # from sklearn.metrics import confusion_matrix from emnist import extract_training_samples images_train, labels_train = extract_training_samples('balanced') from emnist import extract_test_samples images_test, labels_test = extract_test_samples('balanced') dims = images_train.shape[1] * images_train.shape[2] ## DENSE NN X_train = images_train.reshape(images_train.shape[0], dims) X_test = images_test.reshape(images_test.shape[0], dims) ## CONV NN # X_train = images_train.reshape(images_train.shape[0], 28,28,1) # X_test = images_test.reshape(images_test.shape[0], 28,28,1) print("Training Shape:", X_train.shape) print("Testing Shape:", X_test.shape) X_train = X_train.astype('float32') / 255
#ignore warning messages import warnings warnings.filterwarnings('ignore') sns.set() # pip install emnist # Import Dataset(s) from emnist import list_datasets list_datasets() from emnist import extract_training_samples images_train, labels_train = extract_training_samples('letters') from emnist import extract_test_samples images_test, labels_test = extract_test_samples('letters') # Flatten Data dims = images_train.shape[1] * images_train.shape[2] X_train = images_train.reshape(images_train.shape[0], dims) X_test = images_test.reshape(images_test.shape[0], dims) # Rescale to 0 -> 1 by dividing by max pixel value (255) X_train = X_train.astype('float32') / 255 X_test = X_test.astype('float32') / 255 # One-Hot Encoding from keras.utils import np_utils # used to convert array of labeled data to one-hot vector # should be 26 but out of index? # Effects accuracy as have a class where their will be no results
# Notebook export: mount Google Drive, load the EMNIST 'bymerge' split and
# prepare it for a Keras model.
from google.colab import drive
drive.mount('/content/gdrive/')
# Commented out IPython magic to ensure Python compatibility.
# NOTE(review): the line below is a notebook shell command and is not valid
# in a plain Python file.
!pip install emnist
import emnist
import tensorflow as tf
from tensorflow import keras
from keras.utils import np_utils
import matplotlib.pyplot as plt
import os
# %matplotlib inline
# Load dataset
train_data,train_labels=emnist.extract_training_samples('bymerge')
test_data,test_labels=emnist.extract_test_samples('bymerge')
# keep the integer test labels before they are one-hot encoded below
test=test_labels
plt.imshow(test_data[0])
# Reshape the images to 28x28 with one channel and cast to float32
train_data=train_data.reshape(train_data.shape[0],28,28,1).astype('float32')
test_data=test_data.reshape(test_data.shape[0],28,28,1).astype('float32')
# Normalize the data by dividing by 255
train_data=train_data/255
test_data=test_data/255
# One-hot encoding
train_labels=np_utils.to_categorical(train_labels)
test_labels=np_utils.to_categorical(test_labels)