def merge_prune(train_folders, test_folders):
    train_datasets = maybe_pickle(train_folders, 45000)
    test_datasets = maybe_pickle(test_folders, 1800)
    train_size = 200000
    valid_size = 10000
    test_size = 10000
    valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
        train_datasets, train_size, valid_size)
    _, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)
    print('Training:', train_dataset.shape, train_labels.shape)
    print('Validation:', valid_dataset.shape, valid_labels.shape)
    print('Testing:', test_dataset.shape, test_labels.shape)
    train_dataset, train_labels = randomize(train_dataset, train_labels)
    test_dataset, test_labels = randomize(test_dataset, test_labels)
    valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)
    pickle_file = 'notMNIST.pickle'
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    save_obj(pickle_file, save)
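# merge_prune leans on helpers that are not defined in this section. A
# minimal sketch, assuming save_obj is a thin pickle wrapper and randomize
# shuffles images and labels in unison; names and signatures are inferred
# from the calls above, not from shown definitions.
import pickle

import numpy as np


def save_obj(pickle_file, obj):
    # Persist any Python object with the highest available pickle protocol.
    with open(pickle_file, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def randomize(dataset, labels):
    # One shared permutation keeps each image paired with its label.
    permutation = np.random.permutation(labels.shape[0])
    return dataset[permutation, :, :], labels[permutation]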
def load_train():
    datasets = load_pickle('notMNIST_clean.pickle')
    train_dataset = datasets['train_dataset']
    train_labels = datasets['train_labels']
    valid_dataset = datasets['valid_dataset']
    valid_labels = datasets['valid_labels']
    classifier_name = 'classifier.pickle'
    if os.path.exists(classifier_name):
        classifier = load_pickle(classifier_name)
    else:
        # Flatten each image into a feature vector before fitting.
        classifier = LogisticRegression()
        classifier.fit(train_dataset.reshape(train_dataset.shape[0], -1),
                       train_labels)
        save_obj(classifier_name, classifier)
    # Spot-check predictions on a small slice of the validation set.
    valid_idx_s = 3000
    valid_idx_e = 3014
    x = classifier.predict(
        valid_dataset.reshape(valid_dataset.shape[0], -1)[valid_idx_s:valid_idx_e])
    print(x)
    print(valid_labels[valid_idx_s:valid_idx_e])
    # Score the whole validation set.
    x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1))
    fail_cnt = 0
    for i, pred in enumerate(x):
        if pred != valid_labels[i]:
            fail_cnt += 1
    print('success rate: ' + str((1 - float(fail_cnt) / len(x)) * 100) + '%')
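# load_train assumes a load_pickle counterpart to save_obj; the None-check
# pattern used elsewhere (e.g. for 'text8_text.pickle') suggests it returns
# None for a missing file. A sketch under that assumption:
import os
import pickle


def load_pickle(pickle_file):
    # Return the unpickled object, or None if the file does not exist yet.
    if not os.path.exists(pickle_file):
        return None
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)

# Note: the manual fail counter above computes the same number as
# classifier.score(valid_features, valid_labels) * 100 in scikit-learn.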
def clean():
    datasets = load_pickle('notMNIST.pickle')
    test_dataset = datasets['test_dataset']
    test_labels = datasets['test_labels']
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))
    # Drop validation images that also appear in the test set.
    except_valid_idx, valid_dataset = imgs_idx_hash_except(
        datasets['valid_dataset'], test_dataset)
    valid_labels = np.delete(datasets['valid_labels'], except_valid_idx)
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))
    # Drop training images that also appear in the cleaned validation set...
    except_train_idx, train_dataset = imgs_idx_hash_except(
        datasets['train_dataset'], valid_dataset)
    train_labels = np.delete(datasets['train_labels'], except_train_idx)
    # ...and in the test set.
    except_train_idx, train_dataset = imgs_idx_hash_except(
        train_dataset, test_dataset)
    train_labels = np.delete(train_labels, except_train_idx)
    print('train_dataset:%d' % len(train_dataset))
    print('train_labels:%d' % len(train_labels))
    print('valid_dataset:%d' % len(valid_dataset))
    print('valid_labels:%d' % len(valid_labels))
    print('test_dataset:%d' % len(test_dataset))
    print('test_labels:%d' % len(test_labels))
    pickle_file = 'notMNIST_clean.pickle'
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    save_obj(pickle_file, save)
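# clean() depends on imgs_idx_hash_except, which is not defined in this
# section. A plausible sketch, assuming it hashes raw image bytes to find
# images in the first array that also occur in the second, returning the
# overlapping indices plus the de-duplicated array (signature inferred from
# the calls above):
import hashlib

import numpy as np


def imgs_idx_hash_except(imgs, except_imgs):
    # Hash every image in the exclusion set once for O(1) membership tests.
    except_hashes = {hashlib.sha1(img.tobytes()).digest() for img in except_imgs}
    # Indices of images that also appear in the exclusion set.
    except_idx = [i for i, img in enumerate(imgs)
                  if hashlib.sha1(img.tobytes()).digest() in except_hashes]
    return except_idx, np.delete(imgs, except_idx, axis=0)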
def read_data(filename):
    """Extract the first file enclosed in a zip file as a string."""
    with zipfile.ZipFile(filename) as f:
        return tf.compat.as_str(f.read(f.namelist()[0]))


data_set = load_pickle('text8_text.pickle')
if data_set is None:
    # Download and read the corpus on first run.
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)
    text = read_data(filename)
    print('Data size %d' % len(text))
    save_obj('text8_text.pickle', text)
else:
    text = data_set

# Create a small validation set.
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

# Utility functions to map characters to vocabulary IDs and back.
vocabulary_size = len(string.ascii_lowercase) + 1  # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])  # ASCII code for 'a'
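# The mapping utilities announced above are cut off in this section. A
# minimal reconstruction consistent with first_letter and the [a-z] + ' '
# vocabulary: ID 0 is the space, IDs 1..26 are 'a'..'z'.
def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    else:
        print('Unexpected character: %s' % char)
        return 0


def id2char(dictid):
    return chr(dictid + first_letter - 1) if dictid > 0 else ' '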
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()
    save_obj('text8_embed.pickle', final_embeddings)

num_points = 400
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points + 1, :])


def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15, 15))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i, :]
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), xytext=(5, 2),
                       textcoords='offset points', ha='right', va='bottom')
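# plot is defined above but never invoked in this fragment. A likely usage,
# assuming labels come from reverse_dictionary for the same num_points words
# that were embedded (index 0 is reserved for UNK, hence the 1-based range):
words = [reverse_dictionary[i] for i in range(1, num_points + 1)]
plot(two_d_embeddings, words)
pylab.show()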
if data_set is None:
    # Download and tokenize the corpus on first run.
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', 31344016, url=url)
    words = read_data(filename)
    print('Data size %d' % len(words))
    data, count, dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size)
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10])
    del words  # Hint to reduce memory.
    data_set = {
        'data': data,
        'count': count,
        'dictionary': dictionary,
        'reverse_dictionary': reverse_dictionary,
    }
    save_obj('text8_data.pickle', data_set)
else:
    data = data_set['data']
    count = data_set['count']
    dictionary = data_set['dictionary']
    reverse_dictionary = data_set['reverse_dictionary']

# Sanity-check the batch generator with two (num_skips, skip_window) settings.
data_index = 0
print('data:', [reverse_dictionary[di] for di in data[:8]])
for num_skips, skip_window in [(2, 1), (4, 2)]:
    test_size = 8
    batch, labels = generate_batch(batch_size=test_size, num_skips=num_skips,
                                   skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' %
          (num_skips, skip_window))
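# generate_batch is called above but its definition is not part of this
# section. A sketch of the standard skip-gram batch generator it appears to
# be: for each center word it emits num_skips (center, context) pairs drawn
# from a window of skip_window words on each side. The module-level `data`
# list and `data_index` cursor match the names used above.
import collections
import random


def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window ... target ... skip_window ]
    buffer = collections.deque(maxlen=span)
    # Prime the sliding window with the first `span` word IDs.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # center word index within the window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Pick a context word not yet used for this center word.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word to the right.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels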