Example #1
import os

from sklearn.linear_model import LogisticRegression

# load_pickle and save_obj are the project's pickle helpers (a sketch follows this example)


def load_train():
    datasets = load_pickle('notMNIST_clean.pickle')
    train_dataset = datasets['train_dataset']
    train_labels = datasets['train_labels']
    valid_dataset = datasets['valid_dataset']
    valid_labels = datasets['valid_labels']

    classifier_name = 'classifier.pickle'

    if os.path.exists(classifier_name):
        classifier = load_pickle(classifier_name)
    else:
        classifier = LogisticRegression()
        classifier.fit(train_dataset.reshape(train_dataset.shape[0], -1), train_labels)
        save_obj(classifier_name, classifier)

    # spot-check a small slice of the validation set
    valid_idx_s = 3000
    valid_idx_e = 3014
    x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1)[valid_idx_s: valid_idx_e])
    print(x)
    print(valid_labels[valid_idx_s:valid_idx_e])

    # evaluate on the full validation set
    x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1))
    fail_cnt = 0
    for i, pred in enumerate(x):
        if pred != valid_labels[i]:
            fail_cnt += 1
    print("success rate:" + str((1 - float(fail_cnt) / len(x)) * 100) + "%")
Example #2
def load_reformat_not_mnist(image_size, num_labels, num_channels):
    pickle_file = '../not_mnist/notMNIST_clean.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
    train_dataset, train_labels = reformat(train_dataset, train_labels,
                                           image_size, num_labels,
                                           num_channels)
    valid_dataset, valid_labels = reformat(valid_dataset, valid_labels,
                                           image_size, num_labels,
                                           num_channels)
    test_dataset, test_labels = reformat(test_dataset, test_labels, image_size,
                                         num_labels, num_channels)
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
    return train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels
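The reformat helper itself is not shown in this example; a minimal sketch of the shape it is assumed to produce for a convolutional model (4-D image tensors plus one-hot labels), inferred from the arguments passed above:

import numpy as np


def reformat(dataset, labels, image_size, num_labels, num_channels):
    # Reshape to (samples, height, width, channels) and one-hot encode the labels.
    dataset = dataset.reshape((-1, image_size, image_size, num_channels)).astype(np.float32)
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    return dataset, labels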
Example #3
def load_reformat_not_mnist(image_size, num_labels):
    pickle_file = '../not_mnist/notMNIST_clean.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
    train_dataset, train_labels = reformat(train_dataset, train_labels, image_size, num_labels)
    valid_dataset, valid_labels = reformat(valid_dataset, valid_labels, image_size, num_labels)
    test_dataset, test_labels = reformat(test_dataset, test_labels, image_size, num_labels)
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
    return train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels
def clean():
    datasets = load_pickle('notMNIST.pickle')
    test_dataset = datasets['test_dataset']
    test_labels = datasets['test_labels']
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))

    except_valid_idx, valid_dataset = imgs_idx_hash_except(datasets['valid_dataset'], test_dataset)
    valid_labels = np.delete(datasets['valid_labels'], except_valid_idx)
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))

    # drop training images that also appear in the validation set
    except_train_idx, train_dataset = imgs_idx_hash_except(datasets['train_dataset'], valid_dataset)
    train_labels = np.delete(datasets['train_labels'], except_train_idx)
    # drop training images that also appear in the test set
    except_train_idx, train_dataset = imgs_idx_hash_except(train_dataset, test_dataset)
    train_labels = np.delete(train_labels, except_train_idx)

    print('train_dataset:%d' % len(train_dataset))
    print('train_labels:%d' % len(train_labels))
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))

    pickle_file = 'notMNIST_clean.pickle'
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    save_obj(pickle_file, save)
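imgs_idx_hash_except is the project's overlap filter; a sketch of the behaviour assumed here (hash every image, drop those whose hash also occurs in the reference set, and return both the dropped indices and the cleaned array, matching how clean() uses the result with np.delete on the labels):

import hashlib

import numpy as np


def imgs_idx_hash_except(dataset, reference):
    # Hashes of every image in the reference set we want to exclude against.
    reference_hashes = {hashlib.sha1(img.tobytes()).hexdigest() for img in reference}
    # Indices of images in `dataset` that also appear in `reference`.
    except_idx = [i for i, img in enumerate(dataset)
                  if hashlib.sha1(img.tobytes()).hexdigest() in reference_hashes]
    return except_idx, np.delete(dataset, except_idx, axis=0)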
Example #5
import zipfile

import numpy as np
import tensorflow as tf

from not_mnist.img_pickle import save_obj, load_pickle
from not_mnist.load_data import maybe_download


def read_data(filename):
    # Return the contents of the first file in the zip archive as a string.
    with zipfile.ZipFile(filename) as f:
        for name in f.namelist():
            return tf.compat.as_str(f.read(name))


data_set = load_pickle('text8_text.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)

    # read data
    text = read_data(filename)
    print('Data size %d' % len(text))
    save_obj('text8_text.pickle', text)
else:
    text = data_set

# Create a small validation set.
valid_size = 1000
valid_text = text[:valid_size]
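maybe_download comes from not_mnist.load_data; a minimal sketch of the assumed behaviour (download only if the file is absent, then verify the expected size):

import os
from urllib.request import urlretrieve


def maybe_download(filename, expected_bytes, url):
    # Fetch the file if it is not on disk yet, then sanity-check its size.
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    if os.stat(filename).st_size != expected_bytes:
        raise Exception('Failed to verify ' + filename)
    return filename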
Example #7
                # Once bj reaches the target position, read one slot further ahead so the
                # target word itself is skipped when copying the surrounding context.
                if bj == target:
                    met_target = True
                if met_target:
                    batchs[bj, i * num_skips + j] = buffer[bj + 1]
                else:
                    batchs[bj, i * num_skips + j] = buffer[bj]

        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # print('generate batch')
    # print(batchs)
    return batchs, labels


vocabulary_size = 50000
data_set = load_pickle('text8_data.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)

    # read data
    words = read_data(filename)
    print('Data size %d' % len(words))
    data, count, dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size)
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10])
    del words  # Hint to reduce memory.
    data_set = {
        'data': data,
Example #8
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if step % 500 == 0:
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" %
                      accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" %
                      accuracy(valid_prediction.eval(), valid_labels))
        print("Test accuracy: %.1f%%" %
              accuracy(test_prediction.eval(), test_labels))


if __name__ == '__main__':
    # First reload the data we generated in 1_notmnist.ipynb.
    pickle_file = '../not_mnist/notMNIST.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

    # Reformat into a shape that's more adapted to the models we're going to train:
    # data as a flat matrix,
    # labels as float 1-hot encodings.
    image_size = 28
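    The snippet cuts off before the reformat and accuracy helpers appear; minimal sketches of what they are assumed to do, matching the comment above (data as a flat matrix, labels as float one-hot encodings, accuracy as an argmax match percentage):

    # Assumed helpers, not shown in the original snippet.
    import numpy as np

    def reformat(dataset, labels, image_size, num_labels):
        # Flatten each image into a vector and one-hot encode the labels.
        dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
        labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
        return dataset, labels

    def accuracy(predictions, labels):
        # Percentage of samples whose predicted class (argmax) matches the one-hot label.
        return 100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0]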
Example #9
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if step % 500 == 0:
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(
                    valid_prediction.eval(), valid_labels))
        print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))


if __name__ == '__main__':
    # First reload the data we generated in 1_notmnist.ipynb.
    pickle_file = '../not_mnist/notMNIST.pickle'
    save = load_pickle(pickle_file)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

    # Reformat into a shape that's more adapted to the models we're going to train:
    # data as a flat matrix,
    # labels as float 1-hot encodings.
    image_size = 28
Example #10
            for bj in range(context_size):
                if bj == target:
                    met_target = True
                if met_target:
                    batchs[bj, i * num_skips + j] = buffer[bj + 1]
                else:
                    batchs[bj, i * num_skips + j] = buffer[bj]

        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # print('generate batch')
    # print(batchs)
    return batchs, labels

vocabulary_size = 50000
data_set = load_pickle('text8_data.pickle')
if data_set is None:
    # load data
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)

    # read data
    words = read_data(filename)
    print('Data size %d' % len(words))
    data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10])
    del words  # Hint to reduce memory.
    data_set = {
        'data': data, 'count': count, 'dictionary': dictionary, 'reverse_dictionary': reverse_dictionary,
    }
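build_dataset is not shown in these snippets; a sketch of the assumed behaviour (the usual word2vec preprocessing: keep the vocabulary_size - 1 most common words, map everything else to UNK, and return the encoded corpus together with both lookup tables):

import collections


def build_dataset(words, vocabulary_size):
    # Count words, keeping an UNK bucket for everything outside the top vocabulary_size - 1.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {word: idx for idx, (word, _) in enumerate(count)}
    data, unk_count = [], 0
    for word in words:
        idx = dictionary.get(word, 0)  # 0 is the UNK index
        if idx == 0:
            unk_count += 1
        data.append(idx)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary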