import numpy as np
import six
from sklearn.datasets import mldata


def load_mnist(N_labeled=100, N_test=10000, pruning=False):
    data = mldata.fetch_mldata('MNIST original')
    x = data['data'].astype(np.float32) / 255
    y = data['target'].astype(np.int32)
    if pruning:
        x = prune_by_stddev(x)
    D = len(x[0])
    T = 10
    N_labeled //= T  # integer division: labeled examples per class
    x_split = [np.split(x[y == i], [N_labeled]) for i in six.moves.range(T)]
    x_train = np.concatenate([x_[0] for x_ in x_split])
    x_rest = np.concatenate([x_[1] for x_ in x_split])
    y_split = [np.split(y[y == i], [N_labeled]) for i in six.moves.range(T)]
    y_train = np.concatenate([y_[0] for y_ in y_split])
    y_rest = np.concatenate([y_[1] for y_ in y_split])
    N = 70000
    N_rest = N - N_labeled * T
    perm = np.random.permutation(N_rest)
    x_unlabeled, x_test = np.split(x_rest[perm], [N_rest - N_test])
    _, y_test = np.split(y_rest[perm], [N_rest - N_test])
    return x_train, y_train, x_test, y_test, x_unlabeled, D, T
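# Note: mldata.org has been offline for years and fetch_mldata was removed
# from scikit-learn (deprecated in 0.20, removed in 0.22). A minimal sketch
# of an equivalent loader built on fetch_openml -- assuming scikit-learn
# >= 0.24 for the as_frame flag -- not part of the original code:
from sklearn.datasets import fetch_openml


def load_mnist_openml():
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    x = mnist.data.astype(np.float32) / 255
    y = mnist.target.astype(np.int32)  # OpenML ships the labels as strings
    return x, y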
import os
import sqlite3
import sys
import time

from sklearn.datasets import fetch_mldata

dbname = "mnist_arrays.sqlite"  # assumed module-level path for the SQLite file


def test_save_sqlite_arrays(nmax=1000):
    "Load MNIST database (70000 samples) and store in a compressed SQLite db"
    os.path.exists(dbname) and os.unlink(dbname)
    con = sqlite3.connect(dbname, detect_types=sqlite3.PARSE_DECLTYPES)
    cur = con.cursor()
    cur.execute(
        "create table test (idx integer primary key, X array, y integer );")
    mnist = fetch_mldata('MNIST original')
    X, y = mnist.data[:nmax], mnist.target[:nmax]
    m = X.shape[0]
    t0 = time.time()
    for i, x in enumerate(X):
        # insert the image row itself, not the whole target vector
        cur.execute("insert into test (idx, X, y) values (?,?,?)",
                    (i, x, int(y[i])))
        if not i % 100 and i > 0:
            elapsed = time.time() - t0
            remain = float(m - i) / i * elapsed
            print("\r[%5d]: %3d%% remain: %d secs"
                  % (i, 100 * i / m, remain), end="")
            sys.stdout.flush()
    con.commit()
    con.close()
    elapsed = time.time() - t0
    print()
    print("Storing %d images in %0.1f secs" % (m, elapsed))
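# The "X array" column type above only works if numpy arrays have been
# registered with the sqlite3 module. A minimal sketch of the usual
# adapter/converter pair, assuming arrays are serialized into a BLOB with
# np.save (this registration is not shown in the original snippet):
import io

import numpy as np


def adapt_array(arr):
    out = io.BytesIO()
    np.save(out, arr)
    return sqlite3.Binary(out.getvalue())


def convert_array(blob):
    return np.load(io.BytesIO(blob))


sqlite3.register_adapter(np.ndarray, adapt_array)
sqlite3.register_converter("array", convert_array)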
def mnist_data(tries):
    """Fetch the MNIST dataset from MLDATA and split the data and the target.

    http://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_mldata.html
    http://mldata.org/repository/data/viewslug/mnist/
    """
    mnist = fetch_mldata('mnist original')
    X, y = mnist.data, mnist.target
    title = "Learning Curve Comparison for MNIST"
    plot_learning_curve(X, y, tries, title)
def load_dataset(num_patches, patch_size):
    '''utility function to load data set'''
    global verbosity_level
    print('======loading dataset=======\n')
    mnist = fetch_mldata('MNIST original')
    sss = StratifiedShuffleSplit(mnist.target, 1, test_size=0.1,
                                 train_size=20000, random_state=0)
    for train_index, test_index in sss:
        trainX, testX = mnist.data[train_index], mnist.data[test_index]
        trainY, testY = mnist.target[train_index], mnist.target[test_index]
    no_of_images = trainX.shape[0]

    # the resulting dataset has one column per patch:
    # shape [patch_size*patch_size, num_patches]
    dataset = np.zeros((patch_size * patch_size, num_patches))

    # randomly sample images
    rand = np.random.RandomState(23455)
    image_number = rand.randint(no_of_images, size=num_patches)
    for i in range(num_patches):
        # get the patch index and extract the patch from the original image
        index3 = image_number[i]
        dataset[:, i] = trainX[index3]
    if verbosity_level == 2:
        print('=========patches extracted========\n')

    # normalize the dataset: scale pixel values to the [0, 1] range
    if verbosity_level == 2:
        print('***********scaling features to [0, 1] range***********\n')
    dataset = dataset / 255.0
    # alternative: min-max feature scaling, done feature-wise on the
    # transposed matrix (attributes as columns):
    # dataset = np.transpose(dataset)          # new size: 10,000*64
    # min_max_scaler = preprocessing.MinMaxScaler()
    # dataset = min_max_scaler.fit_transform(dataset)
    # dataset = np.transpose(dataset)          # transpose back to 64*10,000
    print('======loading dataset : completed ========\n')
    return dataset
def main():
    # set the timer
    start = time.time()

    # load the data
    mnist = fetch_mldata('MNIST original')
    mnist.target = mnist.target.astype(np.int32)
    seed = np.random.randint(1, 30000)
    rand = np.random.RandomState(seed)
    items = len(mnist.target)
    indices = rand.randint(items, size=70000)
    trindex = indices[0:30000]
    tsindex = indices[30000:]

    # scale down features to the range [0, 1]
    mnist.data = mnist.data / 255.0
    mnist.data = mnist.data.astype(np.float32)
    trainX = mnist.data[trindex]
    testX = mnist.data[tsindex]
    trainY = mnist.target[trindex]
    testY = mnist.target[tsindex]

    # extract the features using KPCA
    kpca = KernelPCA(kernel='precomputed')
    kpca_train = arc_cosine(trainX[0:1000], trainX[0:1000])
    # fit the model from data in X
    kpca.fit(kpca_train)
    kernel_train = arc_cosine(trainX, trainX[0:1000])
    kernel_test = arc_cosine(testX, trainX[0:1000])
    trainX_kpca = kpca.transform(kernel_train)
    testX_kpca = kpca.transform(kernel_test)
    print(testX_kpca.shape)

    # fit the svm model and compute the accuracy measure
    clf = svm.SVC(kernel=arc_cosine)
    clf.fit(trainX_kpca, trainY)
    pred = clf.predict(testX_kpca)
    print(accuracy_score(testY, pred))
    print('total : %d, correct : %d, incorrect : %d\n'
          % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))
    print('Test Time : %f Minutes\n' % ((time.time() - start) / 60))
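# The arc_cosine kernel used above is defined elsewhere in the original
# project. A minimal sketch of the degree-1 arc-cosine kernel of Cho & Saul
# (2009), the usual choice for this construction -- an assumption, not
# necessarily the author's exact implementation:
def arc_cosine(X, Y):
    # pairwise angles between rows of X and rows of Y
    norm_x = np.linalg.norm(X, axis=1)[:, None]
    norm_y = np.linalg.norm(Y, axis=1)[None, :]
    cos_theta = np.clip(X.dot(Y.T) / (norm_x * norm_y), -1.0, 1.0)
    theta = np.arccos(cos_theta)
    # K(x, y) = (1/pi) * |x||y| * (sin(theta) + (pi - theta) * cos(theta))
    return (norm_x * norm_y / np.pi) * (np.sin(theta)
                                        + (np.pi - theta) * np.cos(theta))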
def mnist():
    mnist = fetch_mldata('MNIST original', data_home='./mldata')
    onehot = np.zeros((mnist.target.size, 10))
    onehot[np.arange(onehot.shape[0]), mnist.target.astype(int)] = 1
    train_X, test_X, train_Y, test_Y = train_test_split(
        mnist.data, onehot, test_size=1 / 7.0, random_state=0)
    train_X = train_X.astype(float) / 255
    test_X = test_X.astype(float) / 255
    # plt.figure(figsize=(20, 4))
    # for index, (image, label) in enumerate(zip(train_X[0:5], train_Y[0:5])):
    #     plt.subplot(1, 5, index + 1)
    #     plt.imshow(np.reshape(image, (28, 28)), cmap=plt.cm.gray)
    #     plt.axis('off')
    return train_X, train_Y, test_X, test_Y
def load_data2(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]
    X = np.array([np.reshape(element, (784,)) for element in X])
    print(y[1])  # debug: inspect one raw label before one-hot encoding
    y = np.array([np.reshape(vectorize(element), (10,)) for element in y])

    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test
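# vectorize() above is not shown in this snippet; from the reshape to (10,)
# it evidently one-hot encodes a digit label. A minimal sketch under that
# assumption:
def vectorize(label):
    v = np.zeros(10, dtype=np.float32)
    v[int(label)] = 1.0
    return v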
def TrainingMachine():
    dataset = fetch_mldata('mnist-original',
                           data_home="home/nishchit/Major Project/Finalproject")

    # save the digit images as numpy arrays of features and corresponding labels
    features = np.array(dataset.data, 'int16')
    labels = np.array(dataset.target, 'int')

    # calculate the HOG feature of each image and collect them in a numpy array
    list_hog_fd = []
    for feature in features:
        fd = skimage.feature.hog(feature.reshape((28, 28)), orientations=9,
                                 pixels_per_cell=(14, 14),
                                 cells_per_block=(1, 1), visualise=False)
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, 'float64')

    # create an object of LinearSVC, fit it, and persist the model
    clf = LinearSVC()
    clf.fit(hog_features, labels)
    joblib.dump(clf, "digits_cls.pkl", compress=3)
import time

from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata
from sklearn.ensemble import RandomForestClassifier

from BigDataRuleListClassifier import *
from SVMBigDataRuleListClassifier import *

feature_labels = [
    "#Pregnant",
    "Glucose concentration test",
    "Blood pressure(mmHg)",
    "Triceps skin fold thickness(mm)",
    "2-Hour serum insulin (mu U/ml)",
    "Body mass index",
    "Diabetes pedigree function",
    "Age (years)",
]

data = fetch_mldata("diabetes")  # get dataset
y = (data.target + 1) / 2  # target labels (0 or 1)

###############################################################################
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)  # split

t0 = time.time()
# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class0label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
print("RuleListClassifier Accuracy:", clf.score(Xtest, ytest),
      "Learned interpretable model:\n", clf)
t1 = time.time()

# train classifier (allow more iterations for better accuracy)
bclf = BigDataRuleListClassifier(training_subset=0.1,
                                 subset_estimator=RandomForestClassifier())
# (call closed with constructor defaults; the original snippet is truncated here)
import numpy as np
from mpi4py import MPI
from sklearn.datasets.mldata import fetch_mldata

from intro_forward_backward_source import soft_thresholding

anysource = MPI.ANY_SOURCE
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

############################################################################
# Loading and visualizing the data
############################################################################

dataset_name = 'liver-disorders'
data = fetch_mldata(dataset_name)
X = data.data
y = data.target
X = X.astype(float)
y = y.astype(float)
y[y == 2] = -1

# standardize data
X -= X.mean(axis=0)
X /= X.std(axis=0)
X[np.isnan(X)] = 0.

############################################################################
# Dataset splitting for the distributed setting
############################################################################

n, p = np.shape(X)
def main():
    '''
    use of an autoencoder

    param path: path to folder where you are loading MNIST
    param type: type of gradient function (sgd, sgd_momentum, rmsprop, adam)
    param train_size: train data size
    param test_size: test data size
    param num_epoch: number of epochs
    param minibatch_size: minibatch size
    param momentum: momentum
    param display: print to display
    '''
    options = parse_args()
    mnist = fetch_mldata('MNIST original', data_home=options['path'])
    data = mnist.data.astype('float64')
    train_size = options['train_size']
    train_data = data[np.random.choice(data.shape[0], train_size, False), :]
    test_size = options['test_size']
    test_data = data[np.random.choice(data.shape[0], test_size, False), :]
    autoencoder = Autoencoder([
        FCLayer((784, 250), SigmoidActivationFunction(), True),
        FCLayer((250, 50), SigmoidActivationFunction(), True),
        FCLayer((50, 2), SigmoidActivationFunction(), True),
        FCLayer((2, 50), LinearActivationFunction(), True),
        FCLayer((50, 250), SigmoidActivationFunction(), True),
        FCLayer((250, 784), SigmoidActivationFunction(), True)
    ])
    if options['type'] == 'sgd':
        res = autoencoder.run_sgd(train_data.transpose(), step_size=1.0,
                                  momentum=0, num_epoch=options['num_epoch'],
                                  minibatch_size=options['minibatch_size'],
                                  l2_coef=1e-4,
                                  test_inputs=test_data.transpose(),
                                  display=options['display'])
    elif options['type'] == 'sgd_momentum':
        res = autoencoder.run_sgd(train_data.transpose(), step_size=1.0,
                                  momentum=options['momentum'],
                                  num_epoch=options['num_epoch'],
                                  minibatch_size=options['minibatch_size'],
                                  l2_coef=1e-4,
                                  test_inputs=test_data.transpose(),
                                  display=options['display'])
    elif options['type'] == 'rmsprop':
        res = autoencoder.run_rmsprop(train_data.transpose(), step_size=1.0,
                                      num_epoch=options['num_epoch'],
                                      minibatch_size=options['minibatch_size'],
                                      l2_coef=1e-4,
                                      test_inputs=test_data.transpose(),
                                      display=options['display'])
    elif options['type'] == 'adam':
        res = autoencoder.run_adam(train_data.transpose(), step_size=1.0,
                                   num_epoch=options['num_epoch'],
                                   minibatch_size=options['minibatch_size'],
                                   l2_coef=1e-4,
                                   test_inputs=test_data.transpose(),
                                   display=options['display'])
    print(res)
    plt.title('test loss')
    plt.scatter(np.arange(len(res['test_loss'])), res['test_loss'])
    plt.show()
import random

import numpy as np
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier

from frameworks.CPLELearning import CPLELearningModel
from methods.scikitWQDA import WQDA

# load data
heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0

# label a few points
labeled_N = 2
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = \
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
    random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised score", basemodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("semi-supervised score", ssmodel.score(X, ytrue))
def main():
    global lambdaa, ntrain, num_classes
    start = time.time()
    lambdaa = 0.0001
    max_iterations = 100

    # Load the digit data set
    mnist = fetch_mldata("MNIST original")
    # min_max_scaler = preprocessing.MinMaxScaler()
    # mnist.data = min_max_scaler.fit_transform(mnist.data)
    mnist.data = mnist.data / 255.0
    mnist.target = mnist.target.astype(np.int32)
    seed = np.random.randint(1, 30000)
    rand = np.random.RandomState(seed)
    items = len(mnist.target)
    indices = rand.randint(items, size=70000)
    trindex = indices[0:50000]
    tsindex = indices[50000:]
    trainX = mnist.data[trindex]
    testX = mnist.data[tsindex]
    trainY = mnist.target[trindex]
    testY = mnist.target[tsindex]
    # trainX, testX, trainY, testY = train_test_split(mnist.data, mnist.target, test_size=0.3)
    mtrain, ntrain = trainX.shape
    mtest, ntest = testX.shape

    # prepend a column of ones (the bias term) to the data
    ones = np.ones((mtrain, 1), dtype=int)
    trainX = np.append(ones, trainX, axis=1)
    ones = np.ones((mtest, 1), dtype=int)
    testX = np.append(ones, testX, axis=1)
    ntrain = ntest = ntrain + 1

    # shift the digit labels into the range 1-10
    trainY = trainY + 1
    testY = testY + 1
    num_classes = len(np.unique(trainY))
    theta = np.random.random_sample((num_classes, ntrain)).flatten()

    # do the optimization using the L-BFGS algorithm
    result = scipy.optimize.minimize(
        costFunction, theta, args=(trainX, trainY), method="L-BFGS-B",
        jac=True, options={"maxiter": max_iterations, "disp": True},
    )
    theta = result.x.reshape((num_classes, ntrain))
    theta_dash = np.transpose(theta)

    # classify the test datapoints using the learned parameters
    pred = np.ones(mtest, dtype=int)
    for i in range(mtest):
        temp = np.exp(np.dot(testX[i], theta_dash))
        pred[i] = np.argmax(temp) + 1

    print(accuracy_score(testY, pred))
    print("total : %d, correct : %d, incorrect : %d\n"
          % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))
    print("execution time(in Minutes):%f\n" % ((time.time() - start) / 60))
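# costFunction above is defined elsewhere in the original project. Given the
# flattened (num_classes, ntrain) theta, the 1-based labels, and jac=True, it
# must return a (cost, gradient) pair for regularized softmax regression. A
# minimal sketch consistent with that call -- an assumption, not the author's
# exact code:
def costFunction(theta, X, y):
    m = X.shape[0]
    theta = theta.reshape((num_classes, ntrain))
    scores = X.dot(theta.T)                      # (m, num_classes)
    scores -= scores.max(axis=1, keepdims=True)  # for numerical stability
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)
    Y = np.eye(num_classes)[y - 1]               # one-hot; labels are 1..K
    cost = (-np.sum(Y * np.log(probs)) / m
            + 0.5 * lambdaa * np.sum(theta ** 2))
    grad = (probs - Y).T.dot(X) / m + lambdaa * theta
    return cost, grad.flatten()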
from __future__ import print_function

import tempfile

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

test_data_home = tempfile.mkdtemp()
digits = fetch_mldata('uci-20070111 wisconsin', data_home=test_data_home)

# Loading the Digits dataset
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.data)
X = digits.data
y = digits.target
print(digits.data.shape)
print(digits.target.shape)
print(digits.data[0])

# Split the dataset in two parts
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.1, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    # (linear-kernel grid assumed to close the truncated list)
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
if dataset == "synthetic":
    # Generate data set
    n_samples = 100
    n_features = 200
    sigma = 1.
    sparsity = 0.9
    corr = 0.5
    random_state = np.random.randint(0, 100)
    X, y, true_beta, true_sigma = generate_data(n_samples, n_features, sigma,
                                                sparsity, corr,
                                                random_state=random_state)

if dataset == "leukemia":
    data = fetch_mldata('leukemia')
    X = data.data
    y = data.target
    X = X.astype(float)
    y = y.astype(float)
    n_samples, n_features = X.shape

NO_SCREENING = 0
GAPSAFE = 1
WSTRT_SIGMA_0 = 2
BOUND = 3
BOUND2 = 4

# Number of elements in the path (set to 100 for the paper's results)
n_lambdas = 10
# Preparing data to compare our method
import pandas as pd
from sklearn.datasets import fetch_covtype, fetch_kddcup99
from sklearn.datasets.mldata import fetch_mldata

# Loading and editing datasets
#
# The target variable contains the label of abnormality.
# 0 : Normal
# 1 : Anomaly
covtype = fetch_covtype()
SF = fetch_kddcup99(subset='SF')
http = fetch_kddcup99(subset='http')
shuttle = fetch_mldata('shuttle')

# We use the rules proposed in [Learning hyperparameters for unsupervised
# anomaly detection. A. Thomas, S. Clémençon, V. Feuillard, A. Gramfort.
# Anomaly Detection Workshop, ICML 2016]

# For the Forest Cover dataset, cover types 4 and 5 are considered abnormal
# while cover type 2 is considered normal
df_covtype = pd.DataFrame(covtype.data)
df_covtype['target'] = covtype.target
df_covtype = df_covtype.query('target in [2,4,5]')
df_covtype.target = df_covtype.target.replace(2, 0).replace(4, 1).replace(5, 1)

# For the SF and http datasets, all categories not flagged as normal are
# considered abnormal
df_sf = pd.DataFrame(SF.data)
df_sf['target'] = SF.target
import time

from sklearn.datasets.mldata import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier

t0 = time.clock()
mnist = fetch_mldata('MNIST original')
X, y = mnist.data / 255., mnist.target
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
time_dataset = time.clock() - t0
print("Dataset created in time " + str(time_dataset))

t0 = time.clock()
# print(mnist.data.shape)
neigh = KNeighborsClassifier(n_neighbors=3, metric="chebyshev", n_jobs=2)
neigh.fit(X_train, y_train)
time_train = time.clock() - t0
print("Dataset trained in time " + str(time_train))

t0 = time.clock()
y_pred = neigh.predict(X_test)
time_pred = time.clock() - t0
print("Predicted in time " + str(time_pred))
print(y_pred.tolist())

score = 0
for x in range(len(y_pred)):
    if y_pred[x] == y_test[x]:
        score = score + 1
score = float(score) / len(y_pred)
print("score = " + str(score))
'''
Created on Aug 16, 2014

@author: ryanshiroma
'''
from sklearn.datasets import fetch_mldata

custom_data_home = './mldata'  # assumed local cache directory for the download

mnist = fetch_mldata('MNIST original', data_home=custom_data_home)

if __name__ == '__main__':
    pass
def __init__(self):
    self.dataset = fetch_mldata("mnist-original", data_home="./")
    self.features = np.array(self.dataset.data, 'int16')
    self.labels = np.array(self.dataset.target, 'str')
import sklearn.ensemble
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata

from RuleListClassifier import *

dataseturls = ["https://archive.ics.uci.edu/ml/datasets/Iris",
               "https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes"]
datasets = ["iris", "diabetes"]
data_feature_labels = [
    ["Sepal length", "Sepal width", "Petal length", "Petal width"],
    ["#Pregnant", "Glucose concentration demo", "Blood pressure(mmHg)",
     "Triceps skin fold thickness(mm)", "2-Hour serum insulin (mu U/ml)",
     "Body mass index", "Diabetes pedigree function", "Age (years)"]
]
data_class1_labels = ["Iris Versicolour", "No Diabetes"]

for i in range(len(datasets)):
    print("--------")
    print("DATASET: ", datasets[i], "(", dataseturls[i], ")")
    data = fetch_mldata(datasets[i])
    y = data.target
    y[y > 1] = 0
    y[y < 0] = 0

    Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)

    clf = RuleListClassifier(max_iter=50000, n_chains=3,
                             class1label=data_class1_labels[i], verbose=False)
    clf.fit(Xtrain, ytrain, feature_labels=data_feature_labels[i])
    print("accuracy:", clf.score(Xtest, ytest))
    print("rules:\n", clf)
    print("Random Forest accuracy:",
          sklearn.ensemble.RandomForestClassifier().fit(Xtrain, ytrain)
          .score(Xtest, ytest))
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import mldata
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier

mnist = mldata.fetch_mldata('MNIST Original')


def shuffle_split(data, labels, test_ratio=0.2):
    indices = np.random.permutation(data.shape[0])
    test_size = int(test_ratio * data.shape[0])
    return data[indices[test_size:]], labels[indices[test_size:]], \
        data[indices[:test_size]], labels[indices[:test_size]]


def print_image(data):
    data_image = data.reshape(28, 28)
    plt.imshow(data_image, cmap=matplotlib.cm.binary, interpolation='nearest')
    plt.show()


def evaluate_model(data, labels):
    classifier = SGDClassifier(random_state=42)
    labels_predict = cross_val_predict(classifier, data, labels == 5, cv=3)
    # compare against the same binary "is it a 5?" target the model was
    # trained on, not the raw multiclass labels
    return confusion_matrix(labels == 5, labels_predict)
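# A hypothetical usage sketch of the helpers above (the train_X/train_y names
# are illustrative, not from the original): split the data, inspect one
# digit, and evaluate the binary "is it a 5?" detector.
train_X, train_y, test_X, test_y = shuffle_split(mnist.data, mnist.target)
print_image(train_X[0])
print(evaluate_model(train_X, train_y))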
import random

import numpy as np
import sklearn.svm
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier

from frameworks.CPLELearning import CPLELearningModel
from frameworks.SelfLearning import SelfLearningModel
from methods.scikitWQDA import WQDA

# load data
cancer = fetch_mldata("Lung cancer (Ontario)")
X = cancer.target.T
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = \
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
    random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log', penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
def run_lcm(type, filename, min_support):
    # (function signature assumed; the original snippet begins mid-function)
    p1 = subprocess.Popen(
        ["./lcm", '{}f'.format(type), filename, str(min_support), "-"],
        stdout=subprocess.PIPE)
    output = p1.communicate(str.encode("utf-8"))[0]
    output = output.decode('utf-8')
    itemsets = output.split('\n')
    itemsets = list(map(split_itemset, itemsets))
    itemsets = list(filter(None.__ne__, itemsets))
    return itemsets


mnist_path = "mnist"
digits = fetch_mldata('mnist-original', data_home=mnist_path)
features = digits.data
labels = digits.target

# one transaction per image of digit 0: the indices of its bright pixels
class_0 = features[labels == 0, :]
transactions = []
for i in range(class_0.shape[0]):
    transactions.append(np.where(class_0[i, :] > 50)[0].tolist())

with open('s.txt', 'w') as fp:
    for i in range(class_0.shape[0]):
        fp.write(' '.join(map(str, transactions[i])))
        if (i + 1 != class_0.shape[0]):
            fp.write('\n')
# -*- coding: utf-8 -*-
from shiftpixels import *
from sklearn.datasets.mldata import fetch_mldata

mnist = fetch_mldata('mnist-original',
                     data_home='/Users/maxim/Python AI/Hands on ML/datasets')
X, y = mnist["data"], mnist["target"]

import matplotlib
import matplotlib.pyplot as plt

some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

dataset = fetch_mldata('MNIST original')

# Importing the data from the dataset
X = dataset["data"]
y = dataset["target"]

# Print a random image (position 10062) to verify that the fetch succeeded
element_num = 10062
temp = X[element_num].reshape(28, 28)
plt.imshow(temp)
plt.show()
print(y[element_num])  # print the label associated with the printed image

# Separating the training and test data (MNIST's standard 60000/10000 split)
Train_X = X[:60000]
Train_y = y[:60000]
Test_X = X[60000:]
Test_y = y[60000:]

# The training and test data are now properly separated. Since we are going
# to cross-validate on the training set, shuffle it so the digits are evenly
# distributed and no digit is missing from any validation or training fold;
# some algorithms only perform well on evenly distributed data.
reviced_index = np.random.permutation(Train_X.shape[0])
print("=============================================") # plt.subplot(211) # plt.bar(range(len(y_test)), y_test) # plt.subplot(212) # plt.bar(range(len(y_test)), y_test_dw) # plt.show() return if __name__ == '__main__': if '--mnist' in sys.argv: mnist = fetch_mldata('MNIST original', data_home='data') idx_01 = np.where(mnist.target <= 10)[0] np.random.shuffle(idx_01) idx_01 = idx_01[:5000] idx_train = idx_01[:2*len(idx_01)//3] idx_test = idx_01[2*len(idx_01)//3:] X_train = mnist.data[idx_train] X_test = mnist.data[idx_test] y_train = 2*(mnist.target[idx_train] <= 4) - 1 y_test = 2*(mnist.target[idx_test] <= 4) - 1 clfs = train_model(X_train, y_train, X_test, y_test, 1.0)
    # (this snippet begins inside a dataset-dispatch chain; the opening
    # branches are not shown)
    target_name = "int3"
    if parsed.target_name != target_name:
        logging.warning(
            "{} target is {}".format(parsed.dataset, target_name)
        )
        parsed.target_name = target_name
elif parsed.dataset == "uci-20070111-liver-disorders":
    target_name = "int2"
    if parsed.target_name != target_name:
        logging.warning(
            "{} target is {}".format(parsed.dataset, target_name)
        )
        parsed.target_name = target_name

bunch = fetch_mldata(
    parsed.dataset, target_name=parsed.target_name, data_home=data_home
)
data, labels = scale(bunch['data']), bunch['target']
old_labels = np.empty_like(labels)
np.copyto(old_labels, labels)
for i, label in enumerate(np.unique(labels)):
    labels[old_labels == label] = i + 1
labels = np.ravel(labels).astype(int)

skf = StratifiedKFold(
    y=labels, n_folds=2, shuffle=False, random_state=42
)
# get the last of the two splits
for train_idx, test_idx in skf:
    pass
    8. ANOREXIA: no, yes
    9. LIVER BIG: no, yes
    10. LIVER FIRM: no, yes
    11. SPLEEN PALPABLE: no, yes
    12. SPIDERS: no, yes
    13. ASCITES: no, yes
    14. VARICES: no, yes
    15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00 -- see the note below
    16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250
    17. SGOT: 13, 100, 200, 300, 400, 500,
    18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0
    19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90
    20. HISTOLOGY: no, yes
"""
data = fetch_mldata("datasets-UCI hepatitis")  # get dataset

# some data cleaning (due to horrible mldata format)
# target
y = [1 if 'live' in v[0].lower() else 0 for v in data['Class'][0]]
# categorical variables
data['SEX'] = data.data
feature_labels = [col for col in data['COL_NAMES'] if col == col.upper()]
columns = {}
for label in feature_labels:
    column = data[label] if len(data[label]) > 1 else data[label][0]
    while type(column[0]) == list or type(column[0]) == np.ndarray:
        column = [c[0] for c in column]
    columns[label] = pd.Series(column)
# numeric variables
columns['AGE'] = data.target
def __init__(self):
    self.data = fetch_mldata('MNIST original')
    self._preprocess_data()
import tempfile

from sklearn.datasets.mldata import fetch_mldata

test_data_home = tempfile.mkdtemp()
breast = fetch_mldata('datasets-UCI breast-w', transpose_data=True,
                      data_home=test_data_home)
# breast = fetch_mldata('housing_scale', data_home=test_data_home)
print(breast.data.shape)
n_samples, n_features = breast.data.shape
print(n_samples, n_features)
print(breast.data.shape)
# print(breast.data[0])
print(breast.target.shape)
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata
from sklearn.ensemble import RandomForestClassifier

from RuleListClassifier import *

feature_labels = ["#Pregnant", "Glucose concentration test",
                  "Blood pressure(mmHg)", "Triceps skin fold thickness(mm)",
                  "2-Hour serum insulin (mu U/ml)", "Body mass index",
                  "Diabetes pedigree function", "Age (years)"]

data = fetch_mldata("diabetes")  # get dataset
y = (data.target + 1) / 2  # target labels (0 or 1)
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)  # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
print("RuleListClassifier Accuracy:", clf.score(Xtest, ytest),
      "Learned interpretable model:\n", clf)
print("RandomForestClassifier Accuracy:",
      RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest))
# <markdowncell>

# For files that are not large, save with .npy

# <markdowncell>

# Related reference: http://www.astrobetter.com/blog/2013/07/29/python-tip-storing-data/

# <codecell>

from sklearn.datasets.mldata import fetch_mldata

# <codecell>

dataset = fetch_mldata('MNIST Original')

# <headingcell level=2>

# [Reading .mat files](http://stackoverflow.com/questions/874461/read-mat-files-in-python)

# <codecell>

from scipy.io import loadmat

# <codecell>

mat = loadmat('New/mauna-loa-atmospheric-co2.mat')

# <headingcell level=2>
import tempfile

from sklearn.datasets.mldata import fetch_mldata

test_data_home = tempfile.mkdtemp()
# data = fetch_mldata('uci-20070111 breastTumor', data_home=test_data_home)
dat = fetch_mldata('housing_scale', transpose_data=True,
                   data_home=test_data_home)
print(dat.data.shape)
print(dat.target.shape)
print(dat.data[1])
print(dat.target_names)
import sys

sys.path.append('../')

import random

import numpy as np
import sklearn.svm
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier

from frameworks.CPLELearning import CPLELearningModel
from frameworks.SelfLearning import SelfLearningModel
from methods.scikitWQDA import WQDA

# load data
# cancer = fetch_mldata("Lung cancer (Ontario)")
cancer = fetch_mldata("heart")  # load data

X = cancer.target.T  # label (270, )
ytrue = np.copy(cancer.data).flatten()  # data (3510, )
ytrue[ytrue > 0] = 1  # set the value = 1 where the original value > 0

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# print(list(np.where(ytrue == 0)[0]))
random_labeled_points = \
    random.sample(list(np.where(ytrue == 0)[0]), int(labeled_N / 2)) + \
    random.sample(list(np.where(ytrue == 1)[0]), int(labeled_N / 2))
# set the labels of the labeled samples
# Feature importance
# ---------------------

"""
# on Mac, the following code didn't work
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
mnist
"""

import tempfile

from sklearn.datasets.mldata import fetch_mldata
from sklearn.ensemble import RandomForestClassifier

test_data_home = tempfile.mkdtemp()
mnist = fetch_mldata('MNIST original', data_home=test_data_home)

rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(mnist["data"], mnist["target"])

import matplotlib
import matplotlib.pyplot as plt


def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap=matplotlib.cm.hot, interpolation="nearest")
    plt.axis("off")
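# A natural use of plot_digit (a hypothetical continuation, not in the
# original snippet): visualize each pixel's importance as learned by the
# random forest above.
plot_digit(rnd_clf.feature_importances_)
plt.show()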
import random

import numpy as np
import sklearn.svm
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier

from frameworks.CPLELearning import CPLELearningModel
from frameworks.SelfLearning import SelfLearningModel
from methods.scikitWQDA import WQDA

# load data
cancer = fetch_mldata("Lung cancer (Ontario)")
X = cancer.target.T
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = \
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
    random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
import tempfile

import numpy
from sklearn.datasets import *
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
# from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.metrics import classification_report, accuracy_score
from sklearn import neighbors
from sklearn.datasets.mldata import fetch_mldata

test_data_home = tempfile.mkdtemp()
data = fetch_mldata('uci-20070111 wine', data_home=test_data_home)
# print(data.DESCR)

n_trials = 3
train_percentage = [90, 70, 50]

# Set the parameters by cross-validation
tuned_parameters = [{'n_neighbors': [2, 4, 5, 6, 7, 8, 9, 10],
                     'weights': ['uniform', 'distance']}]
print("All used parameters :", tuned_parameters)

test_accuracies = numpy.zeros(n_trials)
for n in train_percentage:
    print("K neighbour classifier using brute approach for Wine data")
    print("")
    print("")
    print("training percentage ::", n)
    print("")
#!/usr/bin/env python3
__author__ = "Thibaut Thonet, Maziar Moradi Fard"
__license__ = "GPL"

import tensorflow as tf
from sklearn.datasets.mldata import fetch_mldata

from utils import read_list

# Fetch the dataset
dataset = fetch_mldata("USPS")
print("Dataset USPS loaded...")
data = dataset.data
target = dataset.target - 1  # Labels between 0 and 9 to match digits
n_samples = data.shape[0]  # Number of samples in the dataset
n_clusters = 10  # Number of clusters to obtain

# Get the split between training/test set and validation set
test_indices = read_list("split/usps/test")
validation_indices = read_list("split/usps/validation")

# Auto-encoder architecture
input_size = data.shape[1]
hidden_1_size = 500
hidden_2_size = 500
hidden_3_size = 2000
embedding_size = n_clusters
dimensions = [
    hidden_1_size,
    hidden_2_size,
    hidden_3_size,
bench_active_set = 0

if dataset_id == 1:
    dataset = "synthetic"
    n_samples, n_features, n_tasks = (47, 1177, 20)
    # generate dataset
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_targets=n_tasks)  # , random_state=2)
    X = X.astype(float)
    y = y.astype(float)
    eps = 1e-3
elif dataset_id == 2:
    dataset = "leukemia"
    data = fetch_mldata(dataset)
    X = data.data  # [:, ::10]
    y = data.target[:, None]
    X = X.astype(float)
    y = y.astype(float)
    eps = 1e-3
if dataset_id == 3:
    # The data can be found in
    # https://drive.google.com/open?id=139nKKy0AkpkZntB80n-LmcuzGgC8pQHi
    # Please unzip the file "meg_data.tar.gz"
    dataset = 'meg_full'
    data = io.loadmat('meg_Xy_full.mat')
    X = np.array(data['X'], dtype=float, order='F')
    Y = np.array(data['Y'], dtype=float)
    y = Y
    idx = np.argmax(np.sum(Y ** 2, axis=0))
# [Classifying images with a Gaussian process](http://www.pyimagesearch.com/2014/09/22/getting-started-deep-learning-python/)

# <codecell>

from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets
from sklearn import gaussian_process
import numpy as np
import cv2
from sklearn.datasets.mldata import fetch_mldata
import pickle

# <codecell>

mnist = fetch_mldata('MNIST original')

# <markdowncell>

# On first use, fetch the data with fetch_mldata:
#     dataset = fetch_mldata('MNIST Original')
#
# Afterwards you can simply load the pickled file:
#     fid = open('MnistData.pkl', 'wb')
#     pickle.dump(dataset, fid)

# <markdowncell>

# Loading the data via serialization

# <codecell>
the predictions of a :class:`sklearn.ensemble.RandomForestRegressor` object.

The data used here are a classical machine learning data-set, describing
various features of different cars, and their MPG.
"""

# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.cross_validation as xval
from sklearn.datasets.mldata import fetch_mldata
import forestci as fci

# retrieve mpg data from machine learning library
mpg_data = fetch_mldata('mpg')

# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]

# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42)

# create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets.mldata import fetch_mldata
import matplotlib.pyplot as plt
# from display_network import *

mnist = fetch_mldata('mnist-original',
                     data_home='/media/Vancouver/apps/mnist_dataset/')
print(mnist)
X_all = mnist.data
y_all = mnist.target

X0 = X_all[np.where(y_all == 0)[0]]  # all digit 0
X1 = X_all[np.where(y_all == 1)[0]]  # all digit 1
y0 = np.zeros(X0.shape[0])  # class 0 label
y1 = np.ones(X1.shape[0])  # class 1 label

X = np.concatenate((X0, X1), axis=0)  # all digits
y = np.concatenate((y0, y1))  # all labels

# split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2000)

#################
print(X_train.shape)

model = LogisticRegression(C=1e5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(y_pred)
print("Accuracy %.2f%%" % (100 * accuracy_score(y_test, y_pred.tolist())))
mis = np.where((y_pred - y_test) != 0)[0]
print(mis)
import pickle

import numpy as np
import pandas as pd
from sklearn.datasets.mldata import fetch_mldata

from dcgan.trainer import Trainer
from dcgan.generator import Generator
from dcgan.discreminator import Discriminator

if __name__ == '__main__':
    gen = Generator(100)
    dis = Discriminator()

    data = fetch_mldata('mnist-original', data_home=".")
    X = data['data']
    n_train = X.shape[0]
    X = np.array(X, dtype=np.float32)
    X /= 255.
    X = X.reshape(n_train, 1, 28, 28)

    trainer = Trainer(gen, dis)
    trainer.fit(X, batch_size=1000, epochs=1000)

    df_loss = pd.DataFrame(trainer.loss)
    df_loss.to_csv('loss.csv')

    gen.to_cpu()
    dis.to_cpu()
import struct

import numpy as np
from sklearn.datasets.mldata import fetch_mldata
from sklearn.neural_network import MLPClassifier

if __name__ == '__main__':
    mnist = fetch_mldata('MNIST original')

    # rescale the data, use the traditional train/test split
    X, y = mnist.data / 255., mnist.target
    X_train, X_test = X[:60000], X[60000:]
    y_train, y_test = y[:60000], y[60000:]

    clf = MLPClassifier(alpha=0.01, hidden_layer_sizes=(200, 150),
                        random_state=1, max_iter=10)
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print("Test set score: %f" % clf.score(X_test, y_test))
the predictions of a :class:`sklearn.ensemble.RandomForestRegressor` object.

The data used here are a classical machine learning data-set, describing
various features of different cars, and their MPG.
"""

# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets.mldata import fetch_mldata
import forestci as fci

# retrieve mpg data from machine learning library
mpg_data = fetch_mldata('mpg')

# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]

# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42
)

# create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
pl.show()

print("Values: 7, 15")
net5.trainf = nl.train.train_gd
error5 = net5.train(inp, tar, epochs=500, show=100, goal=0.002)
out5 = net5.sim(inp)
x2 = np.linspace(1, 2.5, 150)
y2 = net5.sim(x2.reshape(x2.size, 1)).reshape(x2.size)
y3 = net5.sim(inp).reshape(size)
pl.plot(x2, y2, '-', x, y, '.', x, y3, 'p')
pl.legend(['actual value', 'learning result'])
pl.show()

# Task 3:
data = fetch_mldata('MNIST')
train, test, train_targets, test_targets = train_test_split(
    data.data, data.target, test_size=0.5, random_state=42)

mlp = MLPClassifier(solver='adam', alpha=0.0001)
mlp.fit(train, train_targets).predict(test)
print(mlp.score(test, test_targets))

mlp = MLPClassifier(solver='lbfgs', alpha=0.0001)
mlp.fit(train, train_targets).predict(test)
print(mlp.score(test, test_targets))

mlp = MLPClassifier(alpha=0.000001)
mlp.fit(train, train_targets).predict(test)
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 22 13:26:02 2015

@author: LegendsUser
"""
import random

import numpy as np
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier
import sklearn.svm

from scikitWQDA import WQDA
from SelfLearning import SelfLearningModel

heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0

labeled_N = 2
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(
    np.where(ytrue == 0)[0]), int(labeled_N / 2)) + random.sample(
    list(np.where(ytrue == 1)[0]), int(labeled_N / 2))
ys[random_labeled_points] = ytrue[random_labeled_points]

basemodel = SGDClassifier(loss='log', penalty='l1')
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])