def load_train():
    datasets = load_pickle('notMNIST_clean.pickle')
    train_dataset = datasets['train_dataset']
    train_labels = datasets['train_labels']
    valid_dataset = datasets['valid_dataset']
    valid_labels = datasets['valid_labels']

    # Train a logistic-regression baseline once and cache it on disk.
    classifier_name = 'classifier.pickle'
    if os.path.exists(classifier_name):
        classifier = load_pickle(classifier_name)
    else:
        classifier = LogisticRegression()
        classifier.fit(train_dataset.reshape(train_dataset.shape[0], -1), train_labels)
        save_obj(classifier_name, classifier)

    # simple valid: predict a small slice of the validation set and compare with the labels
    valid_idx_s = 3000
    valid_idx_e = 3014
    x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1)[valid_idx_s:valid_idx_e])
    print(x)
    print(valid_labels[valid_idx_s:valid_idx_e])

    # whole valid: success rate over the full validation set
    x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1))
    fail_cnt = 0
    for i, pred in enumerate(x):
        if pred != valid_labels[i]:
            fail_cnt += 1
    print("success rate: " + str((1 - float(fail_cnt) / len(x)) * 100) + "%")
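# NOTE: `load_pickle` and `save_obj` come from not_mnist/img_pickle.py and are not shown
# in this section. The sketch below is an assumption of what they do (pickle an object
# to disk / load it back, returning None when the file is missing), not the repository's
# actual implementation.
import os
import pickle


def save_obj(filename, obj):
    # Serialize `obj` to `filename` with the highest pickle protocol.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_pickle(filename):
    # Return the unpickled object, or None if the file has not been created yet.
    if not os.path.exists(filename):
        return None
    with open(filename, 'rb') as f:
        return pickle.load(f)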
def load_reformat_not_mnist(image_size, num_labels, num_channels):
    pickle_file = '../not_mnist/notMNIST_clean.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

    train_dataset, train_labels = reformat(train_dataset, train_labels, image_size, num_labels, num_channels)
    valid_dataset, valid_labels = reformat(valid_dataset, valid_labels, image_size, num_labels, num_channels)
    test_dataset, test_labels = reformat(test_dataset, test_labels, image_size, num_labels, num_channels)
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
    return train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels
def load_reformat_not_mnist(image_size, num_labels):
    pickle_file = '../not_mnist/notMNIST_clean.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

    train_dataset, train_labels = reformat(train_dataset, train_labels, image_size, num_labels)
    valid_dataset, valid_labels = reformat(valid_dataset, valid_labels, image_size, num_labels)
    test_dataset, test_labels = reformat(test_dataset, test_labels, image_size, num_labels)
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
    return train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels
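# NOTE: both loaders above delegate to a `reformat` helper defined elsewhere in the
# repository. The sketch below is an assumption based on the usual notMNIST
# preprocessing (flatten images, or keep a channel axis for convolutional models, and
# one-hot encode the labels); it is not the repository's actual implementation.
import numpy as np


def reformat(dataset, labels, image_size, num_labels, num_channels=None):
    if num_channels is None:
        # Flat matrix for fully connected models: (N, image_size * image_size).
        dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    else:
        # 4-D tensor for convolutional models: (N, image_size, image_size, num_channels).
        dataset = dataset.reshape((-1, image_size, image_size, num_channels)).astype(np.float32)
    # Integer labels -> float32 one-hot rows of length num_labels.
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    return dataset, labels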
def clean():
    datasets = load_pickle('notMNIST.pickle')
    test_dataset = datasets['test_dataset']
    test_labels = datasets['test_labels']
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))

    # drop validation images that also appear in the test set
    except_valid_idx, valid_dataset = imgs_idx_hash_except(datasets['valid_dataset'], test_dataset)
    valid_labels = np.delete(datasets['valid_labels'], except_valid_idx)
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))

    # drop training images that also appear in the validation set
    except_train_idx, train_dataset = imgs_idx_hash_except(datasets['train_dataset'], valid_dataset)
    train_labels = np.delete(datasets['train_labels'], except_train_idx)

    # drop training images that also appear in the test set
    except_train_idx, train_dataset = imgs_idx_hash_except(train_dataset, test_dataset)
    train_labels = np.delete(train_labels, except_train_idx)

    print('train_dataset:%d' % len(train_dataset))
    print('train_labels:%d' % len(train_labels))
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))

    pickle_file = 'notMNIST_clean.pickle'
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    save_obj(pickle_file, save)
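# NOTE: `imgs_idx_hash_except` is defined elsewhere in the repository; clean() uses it
# to remove images from one split that also occur in another. The sketch below is an
# assumed hash-based overlap filter, not the repository's actual implementation.
import hashlib

import numpy as np


def imgs_idx_hash_except(dataset, other_dataset):
    # Hash every image in `other_dataset`, collect the indices of images in `dataset`
    # whose hash also appears there, and return those indices plus the filtered dataset.
    other_hashes = {hashlib.sha1(img.tobytes()).hexdigest() for img in other_dataset}
    except_idx = [i for i, img in enumerate(dataset)
                  if hashlib.sha1(img.tobytes()).hexdigest() in other_hashes]
    return except_idx, np.delete(dataset, except_idx, axis=0)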
import zipfile

import numpy as np
import tensorflow as tf

from not_mnist.img_pickle import save_obj, load_pickle
from not_mnist.load_data import maybe_download


def read_data(filename):
    # Return the contents of the first file inside the zip archive as a string.
    with zipfile.ZipFile(filename) as f:
        return tf.compat.as_str(f.read(f.namelist()[0]))


data_set = load_pickle('text8_text.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)
    # read data
    text = read_data(filename)
    print('Data size %d' % len(text))
    save_obj('text8_text.pickle', text)
else:
    text = data_set

# Create a small validation set.
valid_size = 1000
valid_text = text[:valid_size]
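# NOTE: `maybe_download` is imported from not_mnist/load_data.py and not shown here.
# The sketch below assumes the usual behavior (fetch the file once, then verify its
# size against the expected byte count); it is not the repository's actual code.
import os
from urllib.request import urlretrieve


def maybe_download(filename, expected_bytes, url):
    # Download `filename` from `url` only if it is not already on disk.
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size != expected_bytes:
        raise Exception('Failed to verify ' + filename + ': unexpected size %d' % statinfo.st_size)
    return filename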
                if bj == target:
                    met_target = True
                if met_target:
                    batchs[bj, i * num_skips + j] = buffer[bj + 1]
                else:
                    batchs[bj, i * num_skips + j] = buffer[bj]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # print('generate batch')
    # print(batchs)
    return batchs, labels


vocabulary_size = 50000

data_set = load_pickle('text8_data.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)
    # read data
    words = read_data(filename)
    print('Data size %d' % len(words))
    data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10])
    del words  # Hint to reduce memory.
    data_set = {
        'data': data,
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if step % 500 == 0:
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))


if __name__ == '__main__':
    # First reload the data we generated in 1_notmnist.ipynb.
    pickle_file = '../not_mnist/notMNIST.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

    # Reformat into a shape that's more adapted to the models we're going to train:
    # data as a flat matrix,
    # labels as float 1-hot encodings.
    image_size = 28
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if step % 500 == 0:
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(
                    valid_prediction.eval(), valid_labels))
        print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))


if __name__ == '__main__':
    # First reload the data we generated in 1_notmnist.ipynb.
    pickle_file = '../not_mnist/notMNIST.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

    # Reformat into a shape that's more adapted to the models we're going to train:
    # data as a flat matrix,
    # labels as float 1-hot encodings.
    image_size = 28
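# NOTE: the training loops above rely on an `accuracy` helper that is not part of this
# section. The sketch below assumes the standard definition for one-hot labels
# (percentage of rows whose arg-max prediction matches the label); it is not necessarily
# the repository's exact implementation.
import numpy as np


def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])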
            for bj in range(context_size):
                if bj == target:
                    met_target = True
                if met_target:
                    batchs[bj, i * num_skips + j] = buffer[bj + 1]
                else:
                    batchs[bj, i * num_skips + j] = buffer[bj]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # print('generate batch')
    # print(batchs)
    return batchs, labels


vocabulary_size = 50000

data_set = load_pickle('text8_data.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)
    # read data
    words = read_data(filename)
    print('Data size %d' % len(words))
    data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10])
    del words  # Hint to reduce memory.
    data_set = {
        'data': data,
        'count': count,
        'dictionary': dictionary,
        'reverse_dictionary': reverse_dictionary,
    }
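# NOTE: `build_dataset` is referenced above but not defined in this section. The sketch
# below follows the common word2vec preprocessing it appears to implement (map each word
# to an integer id, reserving id 0 for the UNK token); it is an assumption, not the
# repository's actual code.
import collections


def build_dataset(words, vocabulary_size):
    # Keep the (vocabulary_size - 1) most frequent words; everything else becomes UNK.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count  # record how many words were mapped to UNK
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary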