def __init__(self, use_mnist=False):
    self.use_mnist = use_mnist
    if self.use_mnist:
        # self.digits = fetch_mldata('MNIST original')
        self.mnist_digits_train = fetch_mldata('MNIST original', subset='train')
        self.mnist_digits_test = fetch_mldata('MNIST original', subset='test')
    else:
        self.digits = load_digits()
        self.X = self.digits.data
        self.y = self.digits.target
    self.best_f1_score = 0
    self.best_score = 0
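# Note: to my knowledge fetch_mldata takes no `subset` keyword, so the two calls above
# would raise a TypeError. A minimal standalone sketch of the conventional 60k/10k MNIST
# split follows; the helper name and return layout are illustrative, not from the
# original code.
from sklearn.datasets import fetch_mldata

def load_mnist_train_test():
    mnist = fetch_mldata('MNIST original')
    X_train, y_train = mnist.data[:60000], mnist.target[:60000]
    X_test, y_test = mnist.data[60000:], mnist.target[60000:]
    return (X_train, y_train), (X_test, y_test)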
def testScript():
    print "\n---> Started Logistic Regression - Iris dataset - Own function - k class...\n"
    attributes, outcomes = getDataFromFile("../Data/iriskc.data.shuffled")
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    attributes, outcomes = min_max_scaler.fit_transform(np.array(attributes)), np.array(outcomes)
    # attributes, outcomes = np.array(attributes), np.array(outcomes)
    accrValues, presValues, recallValues, fMeasValues = crossValidate(attributes, outcomes, 10, learningRate=0.01, iterCountMax=750, threshold=0.005, ownFunction=True)
    for itr in range(10):
        print "Fold %d: \tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f" % (itr+1, accrValues[itr], presValues[itr], recallValues[itr], fMeasValues[itr])
    print "\nMean values:\tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f\n" % (np.mean(accrValues), np.mean(presValues), np.mean(recallValues), np.mean(fMeasValues))

    print "---> Started Logistic Regression - Iris dataset - Inbuilt function - k class...\n"
    attributes, outcomes = getDataFromFile("../Data/iriskc.data.shuffled")
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    attributes, outcomes = min_max_scaler.fit_transform(np.array(attributes)), np.array(outcomes)
    # attributes, outcomes = np.array(attributes), np.array(outcomes)
    accrValues, presValues, recallValues, fMeasValues = crossValidate(attributes, outcomes, 10, learningRate=0.01, iterCountMax=750, threshold=0.005, ownFunction=False)
    for itr in range(10):
        print "Fold %d: \tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f" % (itr+1, accrValues[itr], presValues[itr], recallValues[itr], fMeasValues[itr])
    print "\nMean values:\tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f\n" % (np.mean(accrValues), np.mean(presValues), np.mean(recallValues), np.mean(fMeasValues))

    print "---> Started Logistic Regression - Digits dataset - Own function - k class...\n"
    mnist = datasets.fetch_mldata('MNIST original')
    X, y = mnist.data / 255., mnist.target
    attributes = X[:20000]
    outcomes = y[:20000]
    # print list(set(outcomes))
    accrValues, presValues, recallValues, fMeasValues = crossValidate(attributes, outcomes, 10, learningRate=0.01, iterCountMax=100, threshold=0.005, ownFunction=True)
    for itr in range(10):
        print "Fold %d: \tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f" % (itr+1, accrValues[itr], presValues[itr], recallValues[itr], fMeasValues[itr])
    print "\nMean values:\tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f\n" % (np.mean(accrValues), np.mean(presValues), np.mean(recallValues), np.mean(fMeasValues))

    print "---> Started Logistic Regression - Digits dataset - Inbuilt function - k class...\n"
    mnist = datasets.fetch_mldata('MNIST original')
    X, y = mnist.data / 255., mnist.target
    attributes = X[:20000]
    outcomes = y[:20000]
    # print list(set(outcomes))
    accrValues, presValues, recallValues, fMeasValues = crossValidate(attributes, outcomes, 10, learningRate=0.01, iterCountMax=100, threshold=0.005, ownFunction=False)
    for itr in range(10):
        print "Fold %d: \tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f" % (itr+1, accrValues[itr], presValues[itr], recallValues[itr], fMeasValues[itr])
    print "\nMean values:\tAccuracy: %f\tPrecision: %f\tRecall: %f\tF-Measure: %f\n" % (np.mean(accrValues), np.mean(presValues), np.mean(recallValues), np.mean(fMeasValues))
def get_data():
    """Get MNIST data ready to learn with.

    Returns
    -------
    dict
        With keys 'train' and 'test'. Both have the keys 'X' (features)
        and 'y' (labels).
    """
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
    x = mnist.data
    y = mnist.target

    # Scale data to [-1, 1] - this is of major importance!
    x = x / 255.0 * 2 - 1

    from sklearn.cross_validation import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    data = {'train': {'X': x_train, 'y': y_train},
            'test': {'X': x_test, 'y': y_test}}
    return data
def main(): from sklearn import preprocessing from sklearn.datasets import fetch_mldata from sklearn.model_selection import cross_val_score db_name = 'iris' hid_num = 1000 data_set = fetch_mldata(db_name) data_set.data = preprocessing.scale(data_set.data) print(db_name) print('ECOBELM', hid_num) e = ECOBELM(hid_num, c=2**5) ave = 0 for i in range(10): scores = cross_val_score( e, data_set.data, data_set.target, cv=5, scoring='accuracy') ave += scores.mean() ave /= 10 print("Accuracy: %0.2f " % (ave)) print('ELM', hid_num) e = ELM(hid_num) ave = 0 for i in range(10): scores = cross_val_score( e, data_set.data, data_set.target, cv=5, scoring='accuracy') ave += scores.mean() ave /= 10 print("Accuracy: %0.2f " % (ave))
def get_data(downsample=10,plshow=False): global data, target, mnist print("get_data ...") # Get the MNist Database from Internet or local disk # This is very powerful for me (the author renyuan) custom_data_home = '.' # the current directory mnist = datasets.fetch_mldata('MNIST original', data_home= custom_data_home) # Downsample mnist as the training set # I know that there are 70000 pictures in MNist database # I wish sample a small fraction of the pictures data = mnist.data[0:60000:downsample] target = mnist.target[0:60000:downsample] if plshow: n_sample = len(data) data_image = data.reshape(n_sample,28,28) image_and_target = list(zip(data_image, target)) pl.figure() for i, (im, tg) in enumerate(image_and_target): if i>=100: break pl.subplot(10, 10, i+ 1) pl.axis('off') pl.imshow(im, cmap=pl.cm.gray_r) pl.title('tg=%d'%tg, color='blue') pl.show() return data, target
def main(): files = [ join(SCRIPT_DIR, "train_x.npy"), join(SCRIPT_DIR, "train_y.npy"), join(SCRIPT_DIR, "validate_x.npy"), join(SCRIPT_DIR, "validate_y.npy"), join(SCRIPT_DIR, "test_x.npy"), join(SCRIPT_DIR, "test_y.npy") ] if all([exists(fname) and stat(fname).st_size > 100 for fname in files]): print("Already downloaded. Skipping") else: mnist = fetch_mldata('MNIST original') np.random.seed(1234) data = mnist.data target = mnist.target indices = np.arange(len(data)) np.random.shuffle(indices) data = data[indices] target = target[indices] train_x, train_y = (data[:-10000].astype(np.float32) / 255.0).astype(np.float32), target[:-10000].astype(np.int32) test_x, test_y = (data[-10000:].astype(np.float32) / 255.0).astype(np.float32), target[-10000:].astype(np.int32) np.save(join(SCRIPT_DIR, "train_x.npy"), train_x[:int(0.9 * train_x.shape[0])]) np.save(join(SCRIPT_DIR, "train_y.npy"), train_y[:int(0.9 * train_y.shape[0])]) np.save(join(SCRIPT_DIR, "validate_x.npy"), train_x[int(0.9 * train_x.shape[0]):]) np.save(join(SCRIPT_DIR, "validate_y.npy"), train_y[int(0.9 * train_y.shape[0]):]) np.save(join(SCRIPT_DIR, "test_x.npy"), test_x) np.save(join(SCRIPT_DIR, "test_y.npy"), test_y) print("Done.")
def load(config, test=False):
    """Load MNIST dataset using scikit-learn.

    Returns a dict with the following entries:
      - images: n x 28 x 28 array
      - data:   n x 784 array
      - target: n array
    """
    dataset = fetch_mldata('mnist-original')
    X, y = dataset.data, dataset.target
    X = X.astype(np.float32) / 255.0

    if test:
        idx_start, idx_end = config['test_set']
    else:
        idx_start, idx_end = config['train_set']

    X, y = shuffle(X, y, random_state=42)
    X = X[idx_start:idx_end]
    y = y[idx_start:idx_end]

    return {
        'images': X.reshape(-1, 28, 28),
        'data': X,
        'target': y,
    }
def run(data_path):
    print "Reading the dataset:", data_path
    mnist = fetch_mldata('MNIST original')
    mnist.data, mnist.target = shuffle(mnist.data, mnist.target)

    # Truncate the data
    n_train = 600
    n_test = 400

    # Define training and testing sets
    indices = arange(len(mnist.data))
    random.seed(0)
    train_idx = random.sample(indices, n_train)
    test_idx = random.sample(indices, n_test)
    X_train, y_train = mnist.data[train_idx], mnist.target[train_idx]
    X_test, y_test = mnist.data[test_idx], mnist.target[test_idx]

    # Apply a learning algorithm
    print "Applying a learning algorithm..."
    clf = RandomForestClassifier(n_estimators=10, n_jobs=1)
    clf.fit(X_train, y_train)

    # Make a prediction
    print "Making predictions..."
    y_pred = clf.predict(X_test)
    print y_pred

    # Evaluate the prediction
    print "Evaluating results..."
    print "Precision: \t", metrics.precision_score(y_test, y_pred)
    print "Recall: \t", metrics.recall_score(y_test, y_pred)
    print "F1 score: \t", metrics.f1_score(y_test, y_pred)
    print "Mean accuracy: \t", clf.score(X_test, y_test)
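    # Note: on recent scikit-learn releases precision_score / recall_score / f1_score
    # default to average='binary' and raise an error for the 10-class MNIST labels used
    # above; older releases averaged silently. A hedged sketch of the explicit calls:
    #
    #     print "Precision: \t", metrics.precision_score(y_test, y_pred, average='weighted')
    #     print "Recall: \t", metrics.recall_score(y_test, y_pred, average='weighted')
    #     print "F1 score: \t", metrics.f1_score(y_test, y_pred, average='weighted')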
def get_mnist(start=None, end=None, random=False, num=None):
    mnist = fetch_mldata('MNIST original', data_home='~/diss/mnist')
    # `random` is a boolean flag, so test its truth value; `random is not None`
    # was always true (False is never None), which hid the start/end branch.
    if random and num is not None:
        idx = np.random.choice(range(mnist.data.shape[0]), num)
    elif start is not None and end is not None:
        idx = range(start, end)
    return mnist.data[idx], mnist.target[idx]
def getdataset(datasetname, onehot_encode_strings=True): # load dataset = fetch_mldata(datasetname) # get X and y X = dshape(dataset.data) try: target = dshape(dataset.target) except: print("WARNING: No target found. Taking last column of data matrix as target") target = X[:, -1] X = X[:, :-1] if len(target.shape) > 1 and target.shape[1] > X.shape[1]: # some mldata sets are mixed up... X = target target = dshape(dataset.data) if len(X.shape) == 1 or X.shape[1] <= 1: for k in dataset.keys(): if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]: X = np.hstack((X, dshape(dataset[k]))) # one-hot for categorical values if onehot_encode_strings: cat_ft = [i for i in range(X.shape[1]) if 'str' in str( type(unpack(X[0, i]))) or 'unicode' in str(type(unpack(X[0, i])))] if len(cat_ft): for i in cat_ft: X[:, i] = tonumeric(X[:, i]) X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X) # if sparse, make dense try: X = X.toarray() except: pass # convert y to monotonically increasing ints y = tonumeric(target).astype(int) return np.nan_to_num(X.astype(float)), y
def main(description, gpu, output): logging.basicConfig(level=logging.INFO) logging.info('fetch MNIST dataset') mnist = fetch_mldata(description) mnist.data = mnist.data.astype(numpy.float32) mnist.data /= 255 mnist.target = mnist.target.astype(numpy.int32) data_train, data_test, target_train, target_test = train_test_split(mnist.data, mnist.target) data = data_train, data_test target = target_train, target_test start_time = time.time() if gpu >= 0: cuda.check_cuda_available() cuda.get_device(gpu).use() logging.info("Using gpu device {}".format(gpu)) else: logging.info("Not using gpu device") mlp = MLP(data=data, target=target, gpu=gpu) mlp.train_and_test(n_epoch=1) end_time = time.time() logging.info("time = {} min".format((end_time - start_time) / 60.0)) logging.info('saving trained mlp into {}'.format(output)) with open(output, 'wb') as fp: pickle.dump(mlp, fp)
def get_mnist(): np.random.seed(1234) # set seed for deterministic ordering mnist = fetch_mldata('MNIST original', data_home='../../data') p = np.random.permutation(mnist.data.shape[0]) X = mnist.data[p].astype(np.float32)*0.02 Y = mnist.target[p] return X, Y
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression(),
                            order=np.array([0, 2, 4, 6, 8, 10, 12,
                                            1, 3, 5, 7, 9, 11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
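# Note: jaccard_similarity_score was deprecated and later removed from scikit-learn.
# For the multilabel indicator matrices used above, the closest current equivalent is,
# to the best of my knowledge, jaccard_score with per-sample averaging:
#
#     from sklearn.metrics import jaccard_score
#     score = jaccard_score(Y_test, Y_pred_chain, average='samples')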
def get_datasets():
    mnist = fetch_mldata('MNIST original')
    data = mnist['data']
    target = mnist['target']
    data = (data - data.mean(axis=0))
    std = data.std(axis=0)
    data[:, std > 0] /= std[std > 0]
    train_split = 60000
    output_size = 10
    train_ordered = data[:train_split]
    train_labels_ordered = target[:train_split]
    # list() so random.shuffle also works on Python 3, where zip returns an iterator
    training_data = list(zip(train_ordered, train_labels_ordered))
    random.shuffle(training_data)
    train = np.array([p[0] for p in training_data])
    train_labels = np.array([p[1] for p in training_data])
    train_outs = np.array([one_hot(i, output_size) for i in train_labels])
    test = data[train_split:]
    test_labels = target[train_split:]
    test_outs = np.array([one_hot(i, output_size) for i in test_labels])
    return train, train_outs, test, test_outs
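# The code above relies on a one_hot() helper that is not shown here. A minimal sketch
# of what it presumably does (class index -> one-hot row vector); the real
# implementation may differ:
import numpy as np

def one_hot(i, size):
    v = np.zeros(size)
    v[int(i)] = 1.0
    return v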
def load_script(script_vars): def define(var_name, fun, overwrite=False): if script_vars.has_key(var_name) and not overwrite: print('%s is already defined' % var_name) return script_vars[var_name] else: print('computing variables %s' % var_name) value = fun() script_vars[var_name] = value globals()[var_name] = value return value print(globals().keys()) custom_data_home="/home/stefan2/mnistdata" custom_data_home="/home/stefan2/mnistdata" define('mnist', lambda: fetch_mldata('MNIST original', data_home=custom_data_home)) data = mnist.data.astype(float) #[0:100,:] #convert to float labels = mnist.target #[0:100] n,m = data.shape print("num data points %s" % n) #run the method after successive othogonalization for j in range(0, 50): print("iteration: " + str(j)) res = find_dominant_directions(data) plot_vector_png("pattern_" + str(j), res) for i in range(0, n): v = data[i,:] proj = np.reshape(v, (1, m)).dot(np.reshape(res, (m,1)))[0,0] data[i,:] = v - proj*res
def load_dataset(randomize = False, overfit = False): mnist = fetch_mldata('mnist-original', data_home=DATA_DIR) data = mnist.data target = mnist.target data = data.reshape((-1, 28, 28)) target = target.astype(np.uint8) operator_train = pickle.load(open(DATA_DIR + 'four_operators.pickle', 'rb')) operator_data = np.array([x[0].reshape((28, 28)) for x in operator_train]) operator_target = np.array([y[1] for y in operator_train]) # # # Overfitting meme if overfit == True: temp_data = operator_data temp_target = operator_target while operator_data.shape[0] < 20000: operator_data = np.concatenate((operator_data, temp_data)) operator_target = np.concatenate((operator_target, temp_target)) # # # Overfitting meme data = np.concatenate((data, operator_data)) target = np.concatenate((target, operator_target)) if randomize: print 'shuffling data' data, target = shuffle(data, target, random_state=0) target = target.astype(np.uint8) return data, target
def get_mnist_data(data_home=None):
    """Load MNIST from your ~/scikit_learn_data/mldata/ directory.

    If the data doesn't exist there yet, it is downloaded from the mldata site.
    """
    mnist = fetch_mldata('MNIST original', data_home=data_home)
    return mnist
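# Note: mldata.org has been offline for years and fetch_mldata was removed from
# scikit-learn (deprecated in 0.20, dropped in 0.22). On current versions the same data
# can be pulled from OpenML; a hedged equivalent sketch (labels arrive as strings, and
# newer releases may return a pandas DataFrame for the features):
#
#     from sklearn.datasets import fetch_openml
#     import numpy as np
#     mnist = fetch_openml('mnist_784', version=1)
#     X = np.asarray(mnist.data, dtype=np.float32)
#     y = mnist.target.astype(int)   # targets come back as strings '0'..'9'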
def run(): logging.info("Starting test") # cone_pipeline = Pipeline([('feature selection', SelectKBest(k=100)), # ('classification', ConeEstimator())]) # cone_pipeline = Pipeline([('random PCA', RandomizedPCA(n_components=50)), # ('classification', ConeEstimator(3))]) # classifiers = [DecisionTreeClassifier(), # MultinomialNB(), # LinearSVC(), # ConeEstimator(10)] classifiers = [ConeEstimator(10)] #cone_pipeline] dataset = fetch_mldata('mnist-original') #dataset = fetch_mldata('sonar') print "Dataset size: ", len(dataset.data) print "Features: ", len(dataset.data[0]) binary_map = np.vectorize(lambda x : 1 if x == 1 else 0) dataset.target = binary_map(dataset.target) for classifier in classifiers: method = ShuffleSplit(len(dataset.data), n_iterations = 1, train_size=400, test_size=400) result = cross_val_score( classifier, dataset.data, dataset.target, cv = method, score_func = f1_score) print classifier, result logging.info("Test complete")
def load_pure_mnist(): mnist = fetch_mldata('mnist-original', data_home=DATA_DIR) data = mnist.data target = mnist.target data = data.reshape((-1, 28, 28)) target = target.astype(np.uint8) return data, target
def MNIST(): add_fit_and_score(RegularizedNet) from sklearn.datasets import fetch_mldata mnist = fetch_mldata('MNIST original') X = numpy.asarray(mnist.data, dtype='float32') #X = numpy.asarray(mnist.data, dtype='float64') if SCALE: #X = preprocessing.scale(X) X /= 255. y = numpy.asarray(mnist.target, dtype='int32') #y = numpy.asarray(mnist.target, dtype='int64') print("Total dataset size:") print("n samples: %d" % X.shape[0]) print("n features: %d" % X.shape[1]) print("n classes: %d" % len(set(y))) from sklearn import cross_validation, preprocessing x_train, x_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=42) dnn=RegularizedNet(numpy_rng=numpy.random.RandomState(123), theano_rng=None, n_ins=x_train.shape[1], layers_types=[ReLU, ReLU, LogisticRegression], layers_sizes=[200, 200], n_outs=10, rho=0.95, eps=1.E-6, max_norm=0., debugprint=False, L1_reg=0., L2_reg=1./x_train.shape[0])#, dnn.fit(x_train, y_train, max_epochs=60, method='adadelta', verbose=True, plot=False) test_error = dnn.score(x_test, y_test) print("score: %f" % (1. - test_error))
def load_banana(): data = da.fetch_mldata("Banana IDA") x = data.data y = data.target x_test = x y_test = y return x, x_test, y, y_test
def main(): """TODO: Docstring for main. :returns: TODO """ alpha = 1. decay = 0.0006 iter_num = 600 finetune_iter = 220 hyper_params = { 'hidden_layers_sizes':[196,], 'iter_nums':[400,], 'alphas':[1.,], 'decays':[0.003,], 'betas':[3,], 'rhos':[0.1,] } enc = OneHotEncoder(sparse=False) mnist = fetch_mldata('MNIST original', data_home='./') x_train, x_test, y_train, y_test = \ train_test_split(scale(mnist.data.astype(float)).astype('float32'), mnist.target.astype('float32'), test_size=0.5, random_state=0) x_unlabeled = scale(mnist.data[mnist.target>=5,:].astype(float)).astype('float32') y_train = enc.fit_transform(y_train.reshape(y_train.shape[0],1)).astype('float32') t_x = T.matrix() params, extracted = pretrain_sae(x_unlabeled, hyper_params) extracted = function(inputs=[t_x], outputs=[sae_extract(t_x, params)])(x_train)[0] params.append(train_softmax(extracted, y_train, iter_num, alpha, decay)) weights = finetune_sae(x_train, y_train, params, finetune_iter, alpha, decay) all_label = np.array(range(0, 10)) pred = all_label[softmax2class_max(sae_predict(x_test, weights))] print accuracy_score(y_test, pred) print classification_report(y_test, pred) print confusion_matrix(y_test, pred)
def run(data_path): print "Reading the dataset:", data_path ## http://continuum.io/blog/wiserf-use-cases-and-benchmarks mnist = fetch_mldata('MNIST original') # Define training and testing sets inds = arange(len(mnist.data)) test_i = random.sample(xrange(len(inds)), int(0.1 * len(inds))) train_i = numpy.delete(inds, test_i) X_train = mnist.data[train_i].astype(numpy.double) y_train = mnist.target[train_i].astype(numpy.double) X_test = mnist.data[test_i].astype(numpy.double) y_test = mnist.target[test_i].astype(numpy.double) # Trunk the data X_digits, y_digits = shuffle(X_train, y_train) X_digits_train = X_digits[:1000] y_digits_train = y_digits[:1000] X_digits_valid = X_digits[1000:2000] y_digits_valid = y_digits[1000:2000] X_digits_test = X_digits[2000:3000] y_digits_test = y_digits[2000:3000] knn_digits = KNeighborsClassifier(n_neighbors=10) knn_digits.fit(X_digits_train, y_digits_train) print "KNN validation accuracy on MNIST digits: ", print knn_digits.score(X_digits_valid, y_digits_valid)
def load(train_n, test_n): mnist = fetch_mldata('MNIST original', data_home='.') mnist.data = mnist.data.astype(np.float32) / 256.0 mnist.target = mnist.target.astype(np.int32) N = len(mnist.data) order = np.random.permutation(N) train = {i: [] for i in range(10)} test = {i: [] for i in range(10)} train_m = math.ceil(train_n / 10) train_sum = 0 test_m = math.ceil(test_n / 10) test_sum = 0 for i in range(N): x = mnist.data[order[i]] y = mnist.target[order[i]] if train_sum < train_n and len(train[y]) < train_m: train[y].append(x) train_sum += 1 if test_sum < test_n and len(test[y]) < test_m: test[y].append(x) test_sum += 1 return train, test
def main():
    print '... get mnist data'
    mnist = fetch_mldata('MNIST original', data_home='.')

    fig, axes = plt.subplots(5, 3, figsize=(6, 8))
    data = mnist.data[[0, 7000, 14000, 21000, 28000]]

    print '... start training'
    for i, (axrow, img) in enumerate(zip(axes, data)):
        img = img.reshape(28, 28)
        img = (img >= 128).astype(int)
        corrupted = get_corrupted_input(img, 0.05)
        mrf = MRF(corrupted)
        if i == 0:
            axes[i][0].set_title('original image')
            axes[i][1].set_title('with noise')
            axes[i][2].set_title('denoised')
        axes[i][0].imshow(img, cmap=cm.Greys_r)
        axes[i][1].imshow(corrupted, cmap=cm.Greys_r)
        axes[i][2].imshow(mrf.denoised, cmap=cm.Greys_r)
        for ax in axrow:
            ax.xaxis.set_visible(False)
            ax.yaxis.set_visible(False)
    plt.show()
def make_data(N):
    print("fetch MNIST dataset")
    mnist = fetch_mldata('MNIST original', data_home='.')
    mnist.data = mnist.data.astype(np.float32)
    mnist.data /= 255
    mnist.target = mnist.target.astype(np.int32)

    # make y label (one-hot)
    mnist_target = np.zeros((mnist.target.shape[0], 10))
    for index, num in enumerate(mnist.target):
        mnist_target[index][num] = 1.
    # print(mnist_target)

    # shuffle the samples
    index = random.sample(range(mnist.target.shape[0]), (mnist.target.shape[0]))
    tmp_target = [mnist_target[i] for i in index]
    tmp_data = [mnist.data[i] for i in index]
    # print("N : ", len(tmp_target))
    # print("tmp_target : ", tmp_target)

    x_train, x_test = np.split(tmp_data, [N])
    y_train, y_test = np.split(tmp_target, [N])
    return [x_train, x_test, y_train, y_test]
def download__by_category(): # mnist = fetch_mldata('MNIST original') mnist = fetch_mldata('MNIST original') # mnist.data = random.sample(mnist.data, 1000) # mnist.target = random.sample(mnist.target, 1000) # mnist.data (70000, 784), mnist.target (70000, 1) trainX, trainY = mnist.data[:-10000], mnist.target[:-10000] testX, testY = mnist.data[-10000:], mnist.target[-10000:] if not exists('train'): os.makedirs('train') x = {i:[] for i in range(10)} for i in range(len(trainY)): tmp = x[trainY[i]] tmp.append(trainX[i]) x[trainY[i]] = tmp for i in range(10): cPickle.dump(x[i], open(join('train', '{}.pkl'.format(i)), 'w+')) if not exists('test'): os.makedirs('test') x = {i:[] for i in range(10)} for i in range(len(testY)): tmp = x[testY[i]] tmp.append(testX[i]) x[testY[i]] = tmp for i in range(10): cPickle.dump(x[i], open(join('test', '{}.pkl'.format(i)), 'w+'))
def prepare_dataset(): print('load MNIST dataset') mnist = fetch_mldata('MNIST original') mnist['data'] = mnist['data'].astype(np.float32) mnist['data'] /= 255 mnist['target'] = mnist['target'].astype(np.int32) return mnist
def iris_binary(): iris = fetch_mldata('iris') X = iris.data y = iris.target idx = y < 3 # only binary y[y == 2] = -1 return X[idx, :], y[idx]
def test_configs(): from sklearn import datasets from datetime import datetime import sys import os import logging log = logging.getLogger() handler = logging.StreamHandler(sys.stdout) fmt = logging.Formatter('%(asctime)s %(levelname)s: %(message)s','%Y-%m-%d %H:%M:%S') handler.setFormatter(fmt) log.addHandler(handler) log.setLevel(logging.DEBUG) custom_data_home = os.getcwd() + '/sk_data' digits = datasets.fetch_mldata('MNIST original', data_home=custom_data_home) X = np.asarray(digits.data, 'float32') X = X # images = [imresize(im.reshape(28, 28), (32, 32)) for im in X] # X = np.vstack([im.flatten() for im in images]) X[X < 128] = 0 X[X >= 128] = 1 X /= 256. models = [] for w_sigma in [.1, .5, 1, 2, 5]: for sparsity in [.001, .01, .05, .1, .5]: log.info('Building RBM_dl:\n w_sigma=%s\n sparsity=%s' %(w_sigma,sparsity,)) model = ConvRBM((28, 28), 40, w_size=11, n_iter=3, verbose=True, w_sigma=w_sigma, sparsity=sparsity) model.fit(X) models.append({ 'model' : model, 'w_sigma' : w_sigma, 'sparsity' : sparsity, }) log.info('Done') return models
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_mldata
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Fetch Data
mnist = fetch_mldata('MNIST original', data_home='data/mnist')

# Show parts of Image
# counter = 1
# for i in range(0, 10):
#     for j in range(1, 11):
#         plt.subplot(10, 10, counter)
#         plt.imshow(mnist.data[i*7000+j].reshape(28, 28), cmap=plt.cm.gray)
#         plt.axis('off')
#         counter += 1
# plt.show()

# Data
X, y = mnist.data, mnist.target
X = X / 255.0 * 2 - 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)
print(y_train)

# SVC
clf = SVC(kernel='rbf', C=3, gamma=0.01)
clf.fit(X_train[:10000], y_train[:10000])

# Prediction
predictions = clf.predict(X_test)
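# classification_report is imported above but never used; a natural follow-up
# (a sketch, not part of the original script) would be:
print(classification_report(y_test, predictions))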
random.seed(1) np.random.seed(1) NUM_USERS = 50 # Setup directory for train/test data train_path = './data/train/all_data_0_niid_0_keep_10_train_9.json' test_path = './data/test/all_data_0_niid_0_keep_10_test_9.json' dir_path = os.path.dirname(train_path) if not os.path.exists(dir_path): os.makedirs(dir_path) dir_path = os.path.dirname(test_path) if not os.path.exists(dir_path): os.makedirs(dir_path) # Get MNIST data, normalize, and divide by level mnist = fetch_mldata('MNIST original', data_home='./data') mu = np.mean(mnist.data.astype(np.float32), 0) sigma = np.std(mnist.data.astype(np.float32), 0) mnist.data = (mnist.data.astype(np.float32) - mu) / (sigma + 0.001) mnist_data = [] for i in trange(10): idx = mnist.target == i mnist_data.append(mnist.data[idx]) print("\nNumb samples of each label:\n", [len(v) for v in mnist_data]) ###### CREATE USER DATA SPLIT ####### # Assign 100 samples to each user X = [[] for _ in range(NUM_USERS)] y = [[] for _ in range(NUM_USERS)] idx = np.zeros(10, dtype=np.int64)
n_labels = y_.shape[1] mi = np.zeros((n_labels, n_labels)) for i in xrange(n_labels): for j in xrange(n_labels): mi[i, j] = mutual_info_score(y_[:, i], y_[:, j]) mst = minimum_spanning_tree(sparse.csr_matrix(-mi)) edges = np.vstack(mst.nonzero()).T edges.sort(axis=1) return edges dataset = "scene" #dataset = "yeast" if dataset == "yeast": yeast = fetch_mldata("yeast") X = yeast.data X = np.hstack([X, np.ones((X.shape[0], 1))]) y = yeast.target.toarray().astype(np.int).T X_train, X_test = X[:1500], X[1500:] y_train, y_test = y[:1500], y[1500:] else: scene = load_scene() X_train, X_test = scene['X_train'], scene['X_test'] y_train, y_test = scene['y_train'], scene['y_test'] n_labels = y_train.shape[1] full = np.vstack([x for x in itertools.combinations(range(n_labels), 2)])
                                       beta2=0.999)
    train_step = optimizer.minimize(loss)
    return train_step


def accuracy(y, t):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(t, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy


if __name__ == '__main__':
    '''
    Generate the data
    '''
    mnist = datasets.fetch_mldata('MNIST original', data_home='.')

    n = len(mnist.data)
    N = 30000  # use only a subset of MNIST
    N_train = 20000
    N_validation = 4000
    indices = np.random.permutation(range(n))[:N]  # pick N images at random

    X = mnist.data[indices]
    X = X / 255.0
    X = X - X.mean(axis=1).reshape(len(X), 1)
    y = mnist.target[indices]
    Y = np.eye(10)[y.astype(int)]  # convert to one-of-K (one-hot) representation

    X_train, X_test, Y_train, Y_test = \
        train_test_split(X, Y, train_size=N_train)
from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA import pydot, io import time #######################End imports################################### ####################Do not change anything below # Load MNIST data. fetch_mldata will download the dataset and put it in a folder called mldata. # Some things to be aware of: # The folder mldata will be created in the folder in which you started the notebook # So to make your life easy, always start IPython notebook from same folder. # Else the following code will keep downloading MNIST data try: mnist = fetch_mldata("MNIST original") except Exception as ex: import tensorflow.examples.tutorials.mnist.input_data as input_data m = input_data.read_data_sets("MNIST") data = np.concatenate((m.train.images, m.test.images)) target = np.concatenate((m.train.labels, m.test.labels)) class dataFrame: def __init__(self, data, target): self.data = data self.target = target mnist = dataFrame(data, target) # mnist = fetch_mldata("MNIST original")
################################ # Set hyperparameters ################################ no_of_hidden_units = 200 learning_rate = 1 batch_size = 100 ################################ # Prepare train and test sets ################################ # Fetching the dataset and performing minor normalization # to help with training print('Fetching MNIST dataset. Please wait...\n') dataset = fetch_mldata('MNIST original', data_home='datasets') dataset.data = dataset.data / 255 # Shuffling the ids to prepare for creation of train and test sets ids = np.arange(len(dataset.data)) np.random.shuffle(ids) # The full dataset consists of 70000 labelled examples. # We will use 60000 examples for training and 10000 for our test set. n_rows_train = 60000 n_rows_test = len(dataset.target) - n_rows_train data_train = np.c_[np.ones((n_rows_train, 1)), dataset.data[ids[:n_rows_train], :]] targets_train = np.zeros((n_rows_train, 10)) targets_train[np.arange(n_rows_train), dataset.target[ids[:n_rows_train]].astype(int)] = 1
# 9. Load the MNIST dataset (introduced in Chapter 3) and split it into a training set and a test set
#    (take the first 60,000 instances for training, and the remaining 10,000 for testing). Train a
#    Random Forest classifier on the dataset and time how long it takes, then evaluate the resulting
#    model on the test set. Next, use PCA to reduce the dataset's dimensionality, with an explained
#    variance ratio of 95%. Train a new Random Forest classifier on the reduced dataset and see how
#    long it takes. Was training much faster? Next evaluate the classifier on the test set: how does
#    it compare to the previous classifier?
import time
start = time.time()

from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')  # 70,000 digit images with labels 0-9
# Rows are instances, columns are features (784 features = 28x28 pixels).
# Each pixel ranges from 0 (white) to 255 (black).
X, y = mnist['data'], mnist['target']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=60000, random_state=42)

from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
reduced_clf = Pipeline([('pca', PCA(n_components=0.95)),
                        ('rnd_reduced_clf', RandomForestClassifier(random_state=42))])

from sklearn.metrics import accuracy_score
for clf in (rnd_clf, reduced_clf):  # final performance on the test set
    start = time.time()
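    # The snippet stops inside the evaluation loop; a hedged sketch of how the loop body
    # presumably continues (fit, time the training, then score on the held-out set):
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, "training time: %.1f s" % (time.time() - start))
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, "accuracy:", accuracy_score(y_test, y_pred))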
from sklearn import datasets from neupy import algorithms, layers, environment environment.reproducible() theano.config.floatX = 'float32' def reduce_dimension(network, data): """ Function minimize input data dimention using pre-trained autoencoder. """ minimized_data = network.input_layer.output(data) return minimized_data.eval() mnist = datasets.fetch_mldata('MNIST original') data = mnist.data / 255. features_mean = data.mean(axis=0) data = (data - features_mean).astype(np.float32) np.random.shuffle(data) x_train, x_test = data[:60000], data[60000:] autoencoder = algorithms.Momentum( [ layers.Dropout(proba=0.5), layers.Sigmoid(784), layers.Sigmoid(100), layers.Output(784), ],
from sklearn.datasets import fetch_mldata from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier) from sklearn.linear_model import SGDClassifier from sklearn.model_selection import train_test_split from sklearn.svm import LinearSVC from sklearn.svm import SVC from sklearn.utils import shuffle import matplotlib.pyplot as plt import numpy as np mnist = fetch_mldata( 'MNIST original', data_home='/Users/henryliu/mspc/ml_dev/ml_quantitative/data') markers = ['o', '*', 's', 'd', 'D', '8', '.'] start0 = time.time() data, target = shuffle(mnist.data / 255, mnist.target, random_state=0) n = data.shape[0] #n = 10000 X, y = data[0:n], target[0:n] classifiers = [("ada_boost_10", AdaBoostClassifier(DecisionTreeClassifier(criterion='entropy', splitter='best'), n_estimators=10)), ("ada_boost_50", AdaBoostClassifier(DecisionTreeClassifier(criterion='entropy',
# -*- coding: utf-8 -*- """ Created on Wed Feb 21 17:56:23 2018 @author: TIM """ import matplotlib.pyplot as plt from sklearn.datasets import fetch_mldata from sklearn.neural_network import MLPClassifier mnist = fetch_mldata("MNIST") # rescale the data, use the traditional train/test split X, y = mnist.data / 255., mnist.target X_train, X_test = X[:60000], X[60000:] y_train, y_test = y[:60000], y[60000:] mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, solver='sgd', verbose=10, tol=1e-4, random_state=1) #mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4, #solver='sgd', verbose=10, tol=1e-4, random_state=1, #learning_rate_init=.1) mlp.fit(X_train, y_train) print("Training set score: %f" % mlp.score(X_train, y_train)) print("Test set score: %f" % mlp.score(X_test, y_test)) fig, axes = plt.subplots(4, 4) # use global min / max to ensure all weights are shown on the same scale
def download(): mnist = fetch_mldata('MNIST original') X = mnist.data.astype('float64') y = mnist.target print ('MNIST:', X.shape, y.shape) return (X, y)
import numpy.random as rand import matplotlib.pyplot as plt from sklearn.datasets import fetch_mldata from sklearn.linear_model import ElasticNetCV from sklearn.metrics import confusion_matrix, accuracy_score from sklearn.cross_validation import train_test_split __doc__ = "See newcomparison.m" l1_ratio = 0.5 k_fold = 10 test_frac = 0.5 data_root = path.expanduser('~/data') # Load MNIST data mnist = fetch_mldata('MNIST original', data_home=data_root) X = mnist.data y = mnist.target # Split into train/test_frac X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_frac, random_state=0) # Construct and fit model en = ElasticNetCV(cv=k_fold, n_jobs=-1, random_state=0) en.fit(X_train, y_train) # Evaluate performance y_pred = np.round(en.predict(X_test))
def set_datasets(data="mnist", is_one_hot=True, is_normalize=True, **kwarg): data_home = "/".join(__file__.split("/")[:-1]) + "/data_dir_for_optimizer" if data == "mnist": data_dic = fetch_mldata('MNIST original', data_home=data_home) if is_one_hot == True: idx = data_dic["target"] num = int(idx.max() + 1) arr = np.zeros((idx.shape[0], num)).flatten() arr[idx.flatten().astype(int) + np.arange(idx.shape[0]) * num] = 1 data_dic["target"] = arr.reshape(idx.size, num) if is_normalize == True: data_dic["data"] = data_dic["data"] / 255 elif data == "boston": data_dic = load_boston() if is_normalize == True: data_dic["data"] = data_dic["data"] / data_dic["data"].max(axis=0) elif data == "digits": data_dic = load_digits() elif data == "iris": data_dic = load_iris() if is_one_hot == True: data_dic["target"] = gen_one_hot(data_dic["target"]) if is_normalize == True: data_dic["data"] = data_dic["data"] / data_dic["data"].max(axis=0) elif data == "linnerud": data_dic = load_linnerud() elif data == "wine": arr = np.loadtxt(data_home + "/wine.csv", delimiter=",", skiprows=1) data_dic = {"data": arr[:, :-1], "target": arr[:, -1]} if is_one_hot == True: data_dic["target"] = gen_one_hot(data_dic["target"]) if is_normalize == True: data_dic["data"] = data_dic["data"] / data_dic["data"].max(axis=0) elif data == "xor": data_dic = { "data": np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), ##.repeat(20, axis=0), "target": np.array([0, 1, 1, 0]) } #.repeat(20, axis=0)} elif data == "serial": data_dic = { "data": np.array(np.arange(20).reshape(5, 4)).repeat(20, axis=0), "target": np.arange(5).repeat(20, axis=0) } # elif data == "sin": # data_dic = {"data": np.arange(0,10,0.01)[:,None], # "target": np.sin(np.arange(0,10,0.01) * np.pi)} # # elif data == "sin": # # data_dic = {"data": np.arange(0,10,0.01)[:,None], # "target": np.sin(np.arange(0,10,0.01) * np.pi)} elif data == "sin": v = np.sin(np.pi * np.arange(1000) / 100) if not "data_length" in kwarg: data_length = 100 else: data_length = kwarg["data_length"] if not "predict_length" in kwarg: predict_length = 1 else: data_length = kwarg["data_length"] x, y, xidx, yidx = gen_time_series(v, data_length, predict_length) data_dic = {"data": x, "target": y} elif data == "decay": v = np.sin(np.pi * np.arange(10000) / np.arange(1, 10001)[::-1] * 10) * np.arange(10000)[::-1] v = v[:-1000] x, y, xidx, yidx = gen_time_series(v, 10, 1) data_dic = {"data": x, "target": y} if "data_only" in kwarg: data_dic["target"] = data_dic["data"] return data_dic["data"], data_dic["target"]
import numpy from sklearn.utils import shuffle from sklearn.datasets import fetch_mldata from sklearn.decomposition import PCA from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import StandardScaler from machine_learning import MyGaussian print("Loading ... ") mnist = fetch_mldata('MNIST original') #, data_home='data' ) X = mnist.data Y = mnist.target print("Transforming ... ") new_X = X #new_X = numpy.ndarray( [ len(X), 56 ] ) ##for n in range(len(X)): ## sample = X[n].reshape(28,28) ## for i in range(28): new_X[n,i] = sample[i,:].sum() ## for j in range(28): new_X[n,28+j] = sample[:,j].sum() ### new_X[n] = new_X[n] / new_X[n].sum() #X2=X.reshape( len(X), 28, 28 ) #new_X[:,:28] = X2.sum(axis=1) #new_X[:,28:] = X2.sum(axis=2) #new_X[:,:] = new_X[:,:] / new_X.sum(axis=1)[:,numpy.newaxis]
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
import numpy as np
import argparse

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-o", "--output", required=True,
                help="path to the output loss/accuracy plot")
args = vars(ap.parse_args())

# grab the MNIST dataset (if this is your first time running this
# script, the download may take a minute -- the 55MB MNIST dataset
# will be downloaded)
print("[INFO] loading MNIST (full) dataset...")
dataset = datasets.fetch_mldata("MNIST Original")

# scale the raw pixel intensities to the range [0, 1.0], then
# construct the training and testing splits
data = dataset.data.astype("float") / 255.0
(trainX, testX, trainY, testY) = train_test_split(data, dataset.target, test_size=0.25)

# convert the labels from integers to vectors
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# define the 784-256-128-10 architecture using Keras
model = Sequential()
model.add(Dense(256, input_shape=(784,), activation='sigmoid'))
import numpy as np import random from sklearn.datasets import fetch_mldata from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA import matplotlib.pyplot as plt import scikitplot as skplt # Get the MNIST data, if not already there DATA_DIR = "./" mnist = fetch_mldata('MNIST original', data_home=DATA_DIR) # Initiate the random generator rnd = random.Random() # Separate the mnist data into two arrays mnist_train_1 = np.array( [mnist.data[i] for i in range(60000) if mnist.target[i] == 1.0]) mnist_train_5 = np.array( [mnist.data[i] for i in range(60000) if mnist.target[i] == 5.0]) mnist_test_1 = np.array( [mnist.data[i] for i in range(60000, 70000) if mnist.target[i] == 1.0]) mnist_test_5 = np.array( [mnist.data[i] for i in range(60000, 70000) if mnist.target[i] == 5.0]) x = np.concatenate((mnist_train_1, mnist_train_5)) def kmeans(data, k, lam=0.001, lam_dec=0.00005): # Pick some centroids centroids = np.array([np.copy(data[rnd.randint(0, data.shape[0] - 1)])])
from sklearn import datasets, svm, metrics
from sklearn.datasets import fetch_mldata
from sklearn.externals import joblib

MODEL_PATH = 'mnist_svm_model_full.pkl'

mnist = fetch_mldata('MNIST original', data_home='./scikit_learn_data')
X_data = mnist.data / 255.0
Y = mnist.target

# print('svm')
classifier = svm.SVC(C=5, gamma=0.05)
classifier.fit(X_data, Y)
joblib.dump(classifier, MODEL_PATH, compress=3)
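# To reuse the persisted model later (a sketch; the path matches MODEL_PATH above):
#
#     classifier = joblib.load(MODEL_PATH)
#     predictions = classifier.predict(X_data[:10])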
parameters, losses, test_losses = \ StochasticMLP(X, y, layer_dims, 'multiclass', X_test, y_test, optimizer, lr, batch_size, beta1, beta2, eps, num_epochs, print_loss, add_del, reg_param, delta,prob,epsilon,max_hidden_size,tau) return parameters, losses, test_losses if __name__ == '__main__': # data_size = 7 # num_features = 10 # num_classes = 3 # # X_train = 10.*np.random.rand(num_features,data_size) # y_train = np.array([[1,0,0],[0,1,0],[0,0,1],[1,0,0],[0,1,0],[0,0,1],[1,0,0]]).T mnist = fetch_mldata('MNIST original', data_home=os.getcwd()) X = mnist.data.astype(np.float32) / 255. y_orig = mnist.target # one-hot encode the labels y_orig: i=0,...,9 --> [0,...,1,...,0] y = pd.get_dummies(y_orig).values.astype(np.float32) # pca = PCA(n_components=324) # pca.fit(X) # X_pca = pca.transform(X) X,y = shuffle(X,y) down_sample = 5000 X_ds = X[:down_sample,:] y_ds = y[:down_sample,:]
wavelet = 'db5' level = 1 psi = np.load('./wavelet_mat/{}_{}.npz'.format(wavelet, level))['psi'] def_params = dict(rho = rho, psi = psi) # L-inf attack budget, corresponding to images in the range [-1, 1] epsilon = 0.2 proj_iter = False # Change to True to run attack with iterated projections # Read MNIST data digit_1 = 3 digit_2 = 7 fetch_mnist() mnist = datasets.fetch_mldata("MNIST original") digit_1_data = 2.0*mnist.data[mnist.target==digit_1]/255.0 - 1.0 digit_2_data = 2.0*mnist.data[mnist.target==digit_2]/255.0 - 1.0 data = np.vstack([digit_1_data, digit_2_data]) labels = np.hstack([np.repeat(digit_1, digit_1_data.shape[0]), np.repeat(digit_2, digit_2_data.shape[0])]) data, labels = utils.shuffle(data, labels, random_state=1234) x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=0.25, random_state=1234, stratify=labels) num_test = x_test.shape[0] print("\n*************************************") print("{:} vs. {:} classification via linear SVM".format(digit_1,digit_2)) print("*************************************") print("Attacks use epsilon = {:.2f} \nImages are in the range [-1, 1]\n".format(epsilon)) print("**********") print("No defense") print("**********")
if len(sys.argv) == 1: print( "ERROR: Please specify implementation to benchmark, 'sknn' or 'nolearn'." ) sys.exit(-1) np.set_printoptions(precision=4) np.set_printoptions(suppress=True) from sklearn.base import clone from sklearn.cross_validation import train_test_split from sklearn.datasets import fetch_mldata from sklearn.metrics import classification_report mnist = fetch_mldata('mnist-original') X_train, X_test, y_train, y_test = train_test_split( (mnist.data / 255.0).astype(np.float32), mnist.target.astype(np.int32), test_size=1.0 / 7.0, random_state=1234) classifiers = [] if 'sknn' in sys.argv: from sknn.platform import gpu32 from sknn.mlp import Classifier, Layer, Convolution clf = Classifier( layers=[ # Convolution("Rectifier", channels=10, pool_shape=(2,2), kernel_shape=(3, 3)),
from sklearn.preprocessing import LabelBinarizer from sklearn.metrics import confusion_matrix, classification_report import renom as rm from renom.cuda import cuda from renom.optimizer import Sgd, Adam from renom.core import DEBUG_NODE_STAT, DEBUG_GRAPH_INIT, DEBUG_NODE_GRAPH from renom.operation import sum DEBUG_GRAPH_INIT(True) np.random.seed(10) cuda.set_cuda_active(True) mnist = fetch_mldata('MNIST original', data_home="dataset") X = mnist.data y = mnist.target X = X.astype(np.float32) X /= X.max() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) labels_train = LabelBinarizer().fit_transform(y_train).astype(np.float32) labels_test = LabelBinarizer().fit_transform(y_test).astype(np.float32) class MNist(rm.Model): def __init__(self): super(MNist, self).__init__()
# Import the modules import os from sklearn.externals import joblib from sklearn import datasets from skimage.feature import hog from sklearn.svm import LinearSVC import numpy as np from collections import Counter from sklearn_porter import Porter from sklearn.model_selection import train_test_split import cv2 # Load the dataset custom_data_home = 'D:\Christian-Data\Proyectos\Python\data' dataset = datasets.fetch_mldata('MNIST original', data_home=custom_data_home) # Extract the features and labels features = np.array(dataset.data, 'int16') labels = np.array(dataset.target, 'int') list_hog_fd = [] for feature in features: fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False) list_hog_fd.append(fd) hog_features = np.array(list_hog_fd, 'float64') print("Features inicial:" + str(len(hog_features))) print("Elementos inicial :" + str(labels.size))
# Import the modules from sklearn.externals import joblib from sklearn import datasets from skimage.feature import hog from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier import numpy as np from collections import Counter from sklearn.model_selection import GridSearchCV # Load the dataset dataset = datasets.fetch_mldata("MNIST Original", data_home='/home/sahil/virtualenvs/ALPR/') # Extract the features and labels features = np.array(dataset.data, 'int16') labels = np.array(dataset.target, 'int') # Extract the hog features list_hog_fd = [] for feature in features: fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False) list_hog_fd.append(fd) hog_features = np.array(list_hog_fd, 'float64') print("Count of digits in dataset", Counter(labels)) # Create an linear SVM object
def get_MNIST(): """Returns a (name, data, target) tuple of the MNIST dataset (70 000 items)""" mnist = fetch_mldata('MNIST original', data_home='./data/') return ('MNIST', pd.DataFrame(mnist.data / 255.), pd.DataFrame(mnist.target))
def get_mnist_data(): mnist = fetch_mldata('MNIST Original') X = mnist['data'] y = mnist['target'] return random_split(X, y, ratio=0.2)
from sklearn import datasets import numpy as np from sklearn.svm import LinearSVC from skimage.feature import hog from sklearn.externals import joblib from collections import Counter dataset = datasets.fetch_mldata('MNIST Original') features = np.array(dataset.data, 'int16') labels = np.array(dataset.target, 'int') list_hog_fd = [] for feature in features: fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1)) list_hog_fd.append(fd) hog_features = np.array(list_hog_fd, 'float64') print('done with count', Counter(labels)) for i in range(1, 2, 1): clf = LinearSVC(C=3.0, max_iter=5000, random_state=1, tol=1e-5) clf.fit(hog_features, labels) joblib.dump(clf, 'C:\\Users\\tusha\Desktop\ocrclf{}.pkl'.format(i), compress=3) print('{} done'.format(i))
import numpy as np
from matplotlib import pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
from sklearn.datasets import fetch_mldata

data = fetch_mldata('mauna-loa-atmospheric-co2').data
X = data[:, [1]]
y = data[:, 0]

# Kernel with parameters given in GPML book
k1 = 66.0**2 * RBF(length_scale=67.0)  # long term smooth rising trend
k2 = 2.4**2 * RBF(length_scale=90.0) \
    * ExpSineSquared(length_scale=1.3, periodicity=1.0)  # seasonal component
# medium term irregularity
k3 = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78)
k4 = 0.18**2 * RBF(length_scale=0.134) \
    + WhiteKernel(noise_level=0.19**2)  # noise terms
kernel_gpml = k1 + k2 + k3 + k4

gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0,
                              optimizer=None, normalize_y=True)
gp.fit(X, y)

print("GPML kernel: %s" % gp.kernel_)
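# Note: 'mauna-loa-atmospheric-co2' lived on the now-defunct mldata.org. On current
# scikit-learn releases the same measurements can be fetched from OpenML instead; a
# hedged sketch (the data_id and column layout are from memory of the OpenML dataset,
# not from this script, so the X / y extraction above would need to be adapted):
#
#     from sklearn.datasets import fetch_openml
#     co2 = fetch_openml(data_id=41187, as_frame=True)
#     # co2.frame holds year/month/day columns plus the co2 measurement.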
def train(self, n_epochs, batch_size=128, save_interval=50): mnist = fetch_mldata('MNIST original') X = mnist.data y = mnist.target # Rescale [-1, 1] X = (X.astype(np.float32) - 127.5) / 127.5 half_batch = int(batch_size / 2) for epoch in range(n_epochs): # --------------------- # Train Discriminator # --------------------- self.discriminator.set_trainable(True) # Select a random half batch of images idx = np.random.randint(0, X.shape[0], half_batch) imgs = X[idx] # Sample noise to use as generator input noise = np.random.normal(0, 1, (half_batch, self.latent_dim)) # Generate a half batch of images gen_imgs = self.generator.predict(noise) # Valid = [1, 0], Fake = [0, 1] valid = np.concatenate((np.ones( (half_batch, 1)), np.zeros((half_batch, 1))), axis=1) fake = np.concatenate((np.zeros( (half_batch, 1)), np.ones((half_batch, 1))), axis=1) # Train the discriminator d_loss_real, d_acc_real = self.discriminator.train_on_batch( imgs, valid) d_loss_fake, d_acc_fake = self.discriminator.train_on_batch( gen_imgs, fake) d_loss = 0.5 * (d_loss_real + d_loss_fake) d_acc = 0.5 * (d_acc_real + d_acc_fake) # --------------------- # Train Generator # --------------------- # We only want to train the generator for the combined model self.discriminator.set_trainable(False) # Sample noise and use as generator input noise = np.random.normal(0, 1, (batch_size, self.latent_dim)) # The generator wants the discriminator to label the generated samples as valid valid = np.concatenate((np.ones( (batch_size, 1)), np.zeros((batch_size, 1))), axis=1) # Train the generator g_loss, g_acc = self.combined.train_on_batch(noise, valid) # Display the progress print("%d [D loss: %f, acc: %.2f%%] [G loss: %f, acc: %.2f%%]" % (epoch, d_loss, 100 * d_acc, g_loss, 100 * g_acc)) # If at save interval => save generated image samples if epoch % save_interval == 0: self.save_imgs(epoch)
# -*- coding: utf-8 -*-
from sklearn.datasets import fetch_mldata
from matplotlib.pyplot import *
from numpy import *

mnist = fetch_mldata('MNIST original')
data = array(mnist.data != 0, dtype=bool)  # binarize

# display 15 randomly chosen samples
N = len(data)
choice = random.choice(arange(N), 15)

figure(figsize=(18, 8))
gray()
for i in range(15):
    subplot(3, 5, i + 1)
    imshow(data[choice[i]].reshape(28, 28), interpolation='none')
savefig('fig19-2.png')
from cnn.neural_network import CNN from keras.utils import np_utils from keras.optimizers import SGD from sklearn.datasets import fetch_mldata from sklearn.model_selection import train_test_split # Parse the Arguments ap = argparse.ArgumentParser() ap.add_argument("-s", "--save_model", type=int, default=-1) ap.add_argument("-l", "--load_model", type=int, default=-1) ap.add_argument("-w", "--save_weights", type=str) args = vars(ap.parse_args()) # Read/Download MNIST Dataset print('Loading MNIST Dataset...') dataset = fetch_mldata('MNIST Original') # Read the MNIST data as array of 784 pixels and convert to 28x28 image matrix mnist_data = dataset.data.reshape((dataset.data.shape[0], 28, 28)) mnist_data = mnist_data[:, np.newaxis, :, :] # Divide data into testing and training sets. train_img, test_img, train_labels, test_labels = train_test_split( mnist_data / 255.0, dataset.target.astype("int"), test_size=0.1) # Now each image rows and columns are of 28x28 matrix type. img_rows, img_columns = 28, 28 # Transform training and testing data to 10 classes in range [0,classes] ; num. of classes = 0 to 9 = 10 classes total_classes = 10 # 0 to 9 labels train_labels = np_utils.to_categorical(train_labels, 10)