Example #1
File: data.py, Project: delta2323/ADGM
def load_mnist(N_labeled=100, N_test=10000, pruning=False):
    data = mldata.fetch_mldata('MNIST original')
    x = data['data'].astype(np.float32) / 255
    y = data['target'].astype(np.int32)

    if pruning:
        x = prune_by_stddev(x)
    D = len(x[0])

    T = 10
    N_labeled //= T  # integer number of labeled examples per class

    x_split = [np.split(x[y == i], [N_labeled]) for i in six.moves.range(T)]
    x_train = np.concatenate([x_[0] for x_ in x_split])
    x_rest = np.concatenate([x_[1] for x_ in x_split])
    y_split = [np.split(y[y == i], [N_labeled]) for i in six.moves.range(T)]
    y_train = np.concatenate([y_[0] for y_ in y_split])
    y_rest = np.concatenate([y_[1] for y_ in y_split])

    N = 70000
    N_rest = N - N_labeled * T
    perm = np.random.permutation(N_rest)
    x_unlabeled, x_test = np.split(x_rest[perm], [N_rest - N_test])
    _, y_test = np.split(y_rest[perm], [N_rest - N_test])
    return x_train, y_train, x_test, y_test, x_unlabeled, D, T
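Note: every example on this page uses fetch_mldata, which depended on the now-defunct mldata.org service and was removed from scikit-learn in 0.22. To actually run the MNIST examples today, a hedged drop-in loader based on fetch_openml can be used; the helper name and dtypes below are assumptions, not part of Example #1.

import numpy as np
from sklearn.datasets import fetch_openml

def fetch_mnist_original():
    # 'mnist_784' on OpenML holds the same 70000 x 784 digit images as 'MNIST original' did on mldata
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    data = mnist.data.astype(np.float32)    # pixel values 0..255
    target = mnist.target.astype(np.int32)  # OpenML returns the labels as strings
    return data, target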
Example #2
def test_save_sqlite_arrays(nmax=1000):
    "Load MNIST database (70000 samples) and store in a compressed SQLite db"
    os.path.exists(dbname) and os.unlink(dbname)
    con = sqlite3.connect(dbname, detect_types=sqlite3.PARSE_DECLTYPES)
    cur = con.cursor()
    cur.execute(
        "create table test (idx integer primary key, X array, y integer );")

    mnist = fetch_mldata('MNIST original')

    X, y = mnist.data[:nmax], mnist.target[:nmax]
    m = X.shape[0]
    t0 = time.time()
    for i, x in enumerate(X):
        cur.execute("insert into test (idx, X, y) values (?,?,?)",
                    (i, y, int(y[i])))
        if not i % 100 and i > 0:
            elapsed = time.time() - t0
            remain = float(m - i) / i * elapsed
            print("\r[%5d]: %3d%% remain: %d secs" %
                  (i, 100 * i / m, remain), )
            sys.stdout.flush()

    con.commit()
    con.close()
    elapsed = time.time() - t0
    print()
    print("Storing %d images in %0.1f secs" % (m, elapsed))
Example #3
def mnist_data(tries):
    """This function fetches the MNIST dataset from MLDATA and splits the data and the
    target.
    http://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_mldata.html
    http://mldata.org/repository/data/viewslug/mnist/
    """
    mnist = fetch_mldata('mnist original')
    X, y = mnist.data, mnist.target
    title = "Learning Curve Comparison for MNIST"
    plot_learning_curve(X, y, tries, title)
Example #4
def load_dataset(num_patches, patch_size):
	'''utility function to load data set'''

	global verbosity_level
	print('======loading dataset=======\n')


	mnist = fetch_mldata('MNIST original')
	sss = StratifiedShuffleSplit(mnist.target, 1, test_size=0.1, train_size=20000, random_state=0)
	for train_index, test_index in sss:
		trainX, testX = mnist.data[train_index], mnist.data[test_index]
		trainY, testY = mnist.target[train_index], mnist.target[test_index]

	no_of_images = trainX.shape[0]
	# 'dataset' will hold one randomly sampled (flattened) training image per column,
	# with shape [patch_size*patch_size, num_patches]
	dataset = np.zeros((patch_size*patch_size, num_patches))

	"""Randomly sample images"""
	rand = np.random.RandomState(23455)
	image_number = rand.randint(no_of_images, size = num_patches)

	for i in xrange(num_patches):
		# pick a random training-image index
		index3 = image_number[i]

		# use the whole flattened image as the "patch"
		# (this matches the allocated shape only when patch_size == 28)
		dataset[:, i] = trainX[index3]

	if verbosity_level==2:
		print('=========patches extracted========\n')
	"""normalize the dataset(min max feature scaling is used)"""
	#transpose 'dataset' to form attributes as columns of the matrix, since scaling
	#is to be done featurewise
	if verbosity_level==2:
		print('***********scaling features to [0, 1] range***********\n')

	#dataset = normalizeDataset(dataset)	
	dataset = dataset / 255.0
	#dataset = np.transpose(dataset) # newsize = 10,000*64
	#min_max_scaler = preprocessing.MinMaxScaler()
	#dataset = min_max_scaler.fit_transform(dataset)
	#dataset = np.transpose(dataset) #transpose to 64*10,000

	print('======loading dataset : completed ========\n')
	return dataset
Example #5
def main():

	#set the timer
	start = time.time()

	#load the data
	mnist = fetch_mldata('MNIST original')
	mnist.target = mnist.target.astype(np.int32)

	seed = np.random.randint(1,30000)
	rand = np.random.RandomState(seed)
	items = len(mnist.target)
	indices = rand.randint(items, size = 70000)
	trindex = indices[0:30000]
	tsindex = indices[30000:]

	#scale down features to the range [0, 1]
	mnist.data = mnist.data/255.0
	mnist.data = mnist.data.astype(np.float32)

	trainX = mnist.data[trindex]
	testX = mnist.data[tsindex]
	trainY = mnist.target[trindex]
	testY = mnist.target[tsindex]

	#extract the features using KPCA
	kpca = KernelPCA(kernel='precomputed')
	kpca_train = arc_cosine(trainX[0:1000], trainX[0:1000])
	#Fit the model from data in X
	kpca.fit(kpca_train)

	kernel_train = arc_cosine(trainX, trainX[0:1000])
	kernel_test = arc_cosine(testX, trainX[0:1000])

	trainX_kpca = kpca.transform(kernel_train)
	testX_kpca = kpca.transform(kernel_test)
	print(testX_kpca.shape)

	#fit the svm model and compute the accuracy measure
	clf = svm.SVC(kernel=arc_cosine)
	clf.fit(trainX_kpca, trainY)

	pred = clf.predict(testX_kpca)
	print(accuracy_score(testY, pred))
	print('total : %d, correct : %d, incorrect : %d\n' %(len(pred), np.sum(pred == testY), np.sum(pred != testY)))

	print('Test Time : %f Minutes\n' %((time.time()-start)/60))
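Example #5 calls an arc_cosine helper that the excerpt does not show. A sketch of a degree-1 arc-cosine kernel (Cho & Saul, 2009), which is presumably what was meant; treat the exact formula as an assumption about the missing code.

import numpy as np

def arc_cosine(X, Y):
    # degree-1 arc-cosine kernel between the rows of X and Y (assumed form)
    norm_x = np.linalg.norm(X, axis=1)[:, None]
    norm_y = np.linalg.norm(Y, axis=1)[None, :]
    cos_theta = np.clip(X.dot(Y.T) / (norm_x * norm_y + 1e-12), -1.0, 1.0)
    theta = np.arccos(cos_theta)
    return (norm_x * norm_y / np.pi) * (np.sin(theta) + (np.pi - theta) * cos_theta)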
Example #6
File: loadData.py, Project: pwang724/OLD
def mnist():
    mnist = fetch_mldata('MNIST original', data_home='./mldata')
    onehot = np.zeros((mnist.target.size, 10))
    onehot[np.arange(onehot.shape[0]), mnist.target.astype(int)] = 1

    train_X, test_X, train_Y, test_Y = train_test_split(mnist.data,
                                                        onehot,
                                                        test_size=1 / 7.0,
                                                        random_state=0)
    train_X = train_X.astype(float) / 255
    test_X = test_X.astype(float) / 255

    # plt.figure(figsize=(20, 4))
    # for index, (image, label) in enumerate(zip(train_X[0:5], train_Y[0:5])):
    #     plt.subplot(1, 5, index + 1)
    #     plt.imshow(np.reshape(image, (28, 28)), cmap=plt.cm.gray)
    #     plt.axis('off')

    return train_X, train_Y, test_X, test_Y
Example #7
def load_data2(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    X = np.array([np.reshape(element, (784, )) for element in X])
    print(y[1])
    y = np.array([np.reshape(vectorize(element), (10, )) for element in y])
    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    return X_train, X_test, y_train, y_test
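load_data2 uses a vectorize helper that the excerpt omits; from the reshape to (10,) it evidently one-hot encodes a digit label. A minimal sketch under that assumption:

import numpy as np

def vectorize(label):
    # one-hot encode a digit label 0..9 as a length-10 vector
    v = np.zeros(10, dtype=np.float32)
    v[int(label)] = 1.0
    return v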
Example #8
def TrainingMachine():
    dataset = fetch_mldata('mnist-original', data_home="home/nishchit/Major Project/Finalproject")


    # store the digit images (features) and their labels in numpy arrays
    features = np.array(dataset.data, 'int16') 
    labels = np.array(dataset.target, 'int')
 
    # compute the HOG feature descriptor for each image and collect them in a numpy array
    list_hog_fd = []
    for feature in features:
        fd = skimage.feature.hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, 'float64')
     
    # create a LinearSVC object
    clf = LinearSVC()
       

    clf.fit(hog_features, labels)

    joblib.dump(clf, "digits_cls.pkl", compress=3)
Example #9
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata
from BigDataRuleListClassifier import *
from SVMBigDataRuleListClassifier import *
from sklearn.ensemble import RandomForestClassifier
import time

feature_labels = [
    "#Pregnant", "Glucose concentration test", "Blood pressure(mmHg)",
    "Triceps skin fold thickness(mm)", "2-Hour serum insulin (mu U/ml)",
    "Body mass index", "Diabetes pedigree function", "Age (years)"
]

data = fetch_mldata("diabetes")  # get dataset
y = (data.target + 1) / 2  # target labels (0 or 1)

###############################################################################

Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)  # split

t0 = time.time()
# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
print "RuleListClassifier Accuracy:", clf.score(
    Xtest, ytest), "Learned interpretable model:\n", clf
t1 = time.time()

# train classifier (allow more iterations for better accuracy)
bclf = BigDataRuleListClassifier(training_subset=0.1,
                                 subset_estimator=RandomForestClassifier(
Example #10
from intro_forward_backward_source import soft_thresholding

from mpi4py import MPI

anysource = MPI.ANY_SOURCE
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

############################################################################
#            Loading and visualizing the data
############################################################################

dataset_name = 'liver-disorders'
data = fetch_mldata(dataset_name)
X = data.data
y = data.target
X = X.astype(float)
y = y.astype(float)
y[y == 2] = -1

# standardize data
X -= X.mean(axis=0)
X /= X.std(axis=0)
X[np.isnan(X)] = 0.

############################################################################
#            Dataset splitting for the distributed setting
############################################################################
n, p = np.shape(X)
Example #11
def main():
    '''
    train and visualize an autoencoder on MNIST
    param path: path to the folder where MNIST is cached
    param type: type of gradient function (sgd, sgd_momentum, rmsprop, adam)
    param train_size: train data size
    param test_size: test data size
    param num_epoch: number of epochs
    param minibatch_size: minibatch size
    param momentum: momentum
    param display: print to display
    '''
    options = parse_args()
    mnist = fetch_mldata('MNIST original', data_home=options['path'])
    data = mnist.data.astype('float64')

    train_size = options['train_size']
    train_data = data[np.random.choice(data.shape[0], train_size, False), :]
    test_size = options['test_size']
    test_data = data[np.random.choice(data.shape[0], test_size, False), :]

    autoencoder = Autoencoder([
        FCLayer((784, 250), SigmoidActivationFunction(), True),
        FCLayer((250, 50), SigmoidActivationFunction(), True),
        FCLayer((50, 2), SigmoidActivationFunction(), True),
        FCLayer((2, 50), LinearActivationFunction(), True),
        FCLayer((50, 250), SigmoidActivationFunction(), True),
        FCLayer((250, 784), SigmoidActivationFunction(), True)
    ])

    if options['type'] == 'sgd':
        res = autoencoder.run_sgd(train_data.transpose(),
                                  step_size=1.0,
                                  momentum=0,
                                  num_epoch=options['num_epoch'],
                                  minibatch_size=options['minibatch_size'],
                                  l2_coef=1e-4,
                                  test_inputs=test_data.transpose(),
                                  display=options['display'])
    elif options['type'] == 'sgd_momentum':
        res = autoencoder.run_sgd(train_data.transpose(),
                                  step_size=1.0,
                                  momentum=options['momentum'],
                                  num_epoch=options['num_epoch'],
                                  minibatch_size=options['minibatch_size'],
                                  l2_coef=1e-4,
                                  test_inputs=test_data.transpose(),
                                  display=options['display'])
    elif options['type'] == 'rmsprop':
        res = autoencoder.run_rmsprop(train_data.transpose(),
                                      step_size=1.0,
                                      num_epoch=options['num_epoch'],
                                      minibatch_size=options['minibatch_size'],
                                      l2_coef=1e-4,
                                      test_inputs=test_data.transpose(),
                                      display=options['display'])
    elif options['type'] == 'adam':
        res = autoencoder.run_adam(train_data.transpose(),
                                   step_size=1.0,
                                   num_epoch=options['num_epoch'],
                                   minibatch_size=options['minibatch_size'],
                                   l2_coef=1e-4,
                                   test_inputs=test_data.transpose(),
                                   display=options['display'])

    print(res)

    plt.title('test loss')
    plt.scatter(np.arange(len(res['test_loss'])), res['test_loss'])
    plt.show()
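The parse_args helper used in Example #11 is not shown. A sketch of what it might look like, inferred from the option names that main() reads; the defaults are assumptions.

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='autoencoder demo on MNIST')
    parser.add_argument('--path', default='.')
    parser.add_argument('--type', default='sgd',
                        choices=['sgd', 'sgd_momentum', 'rmsprop', 'adam'])
    parser.add_argument('--train_size', type=int, default=10000)
    parser.add_argument('--test_size', type=int, default=1000)
    parser.add_argument('--num_epoch', type=int, default=10)
    parser.add_argument('--minibatch_size', type=int, default=100)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--display', action='store_true')
    return vars(parser.parse_args())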
Example #12
import numpy as np
import random
from frameworks.CPLELearning import CPLELearningModel
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from methods.scikitWQDA import WQDA

# load data
heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0

# label a few points
labeled_N = 2
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(np.where(ytrue == 0)[0], labeled_N / 2) + random.sample(
    np.where(ytrue == 1)[0], labeled_N / 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised score", basemodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "semi-supervised score", ssmodel.score(X, ytrue)
Example #13
def main():

    global lambdaa, ntrain, num_classes
    start = time.time()
    lambdaa = 0.0001
    max_iterations = 100

    # Load the Digit Data Set
    mnist = fetch_mldata("MNIST original")
    # min_max_scaler = preprocessing.MinMaxScaler()
    # mnist.data = min_max_scaler.fit_transform(mnist.data)
    mnist.data = mnist.data / 255.0
    mnist.target = mnist.target.astype(np.int32)
    seed = np.random.randint(1, 30000)
    rand = np.random.RandomState(seed)
    items = len(mnist.target)
    indices = rand.randint(items, size=70000)
    trindex = indices[0:50000]
    tsindex = indices[50000:]

    trainX = mnist.data[trindex]
    testX = mnist.data[tsindex]
    trainY = mnist.target[trindex]
    testY = mnist.target[tsindex]

    # trainX,testX,trainY,testY = train_test_split(mnist.data,mnist.target,test_size=0.3)
    mtrain, ntrain = trainX.shape
    mtest, ntest = testX.shape

    # Append one to the first column of the training data
    ones = np.ones((mtrain, 1), dtype=int)
    trainX = np.append(ones, trainX, axis=1)
    ones = np.ones((mtest, 1), dtype=int)
    testX = np.append(ones, testX, axis=1)
    ntrain = ntest = ntrain + 1

    # make digits range as 1-10
    trainY = trainY + 1
    testY = testY + 1
    num_classes = len(np.unique(trainY))
    theta = np.random.random_sample((num_classes, ntrain)).flatten()

    """do the optimization using L-BFGS algoritm"""
    result = scipy.optimize.minimize(
        costFunction,
        theta,
        args=(trainX, trainY),
        method="L-BFGS-B",
        jac=True,
        options={"maxiter": max_iterations, "disp": True},
    )

    theta = result.x.reshape((num_classes, ntrain))
    theta_dash = np.transpose(theta)

    """" classify the test datapoints using the learned parameters"""
    pred = np.ones(mtest, dtype=int)
    for i in xrange(mtest):
        temp = np.exp(np.dot(testX[i], theta_dash))
        pred[i] = np.argmax(temp) + 1

    print(accuracy_score(testY, pred))
    print ("total : %d, correct : %d, incorrect : %d\n" % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))

    print ("execution time(in Minutes):%f\n" % ((time.time() - start) / 60))
Example #14
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.datasets.mldata import fetch_mldata
import tempfile
test_data_home = tempfile.mkdtemp()
digits=fetch_mldata('uci-20070111 wisconsin', data_home=test_data_home)


# Loading the Digits dataset


# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.data)
X = digits.data
y = digits.target
print (digits.data.shape)
print (digits.target.shape)
print (digits.data[0])
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,digits.target, test_size=0.1, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
if dataset == "synthetic":
    # Generate data set
    n_samples = 100
    n_features = 200
    sigma = 1.
    sparsity = 0.9
    corr = 0.5
    random_state = np.random.randint(0, 100)

    X, y, true_beta, true_sigma = generate_data(n_samples, n_features, sigma,
                                                sparsity, corr,
                                                random_state=random_state)

if dataset == "leukemia":

    data = fetch_mldata('leukemia')
    X = data.data
    y = data.target
    X = X.astype(float)
    y = y.astype(float)
    n_samples, n_features = X.shape

NO_SCREENING = 0
GAPSAFE = 1
WSTRT_SIGMA_0 = 2
BOUND = 3
BOUND2 = 4


# Number of elements in the path (set to 100 for papers results)
n_lambdas = 10
Example #16
# Preparing data to compare our method

import pandas as pd
from sklearn.datasets import fetch_covtype, fetch_kddcup99
from sklearn.datasets.mldata import fetch_mldata

# Loading and editing datasets
# 
# The target variable contains the label of abnormality.
# 0 : Normal
# 1 : Anomaly

covtype = fetch_covtype()
SF = fetch_kddcup99(subset='SF')
http = fetch_kddcup99(subset='http')
shuttle = fetch_mldata('shuttle')

# We use the rules proposed in [Learning hyperparameters for unsupervised anomaly detection,
# A. Thomas, S. Clémençon, V. Feuillard, A. Gramfort. Anomaly Detection Workshop, ICML 2016]

# For the Forest Cover dataset, cover types 4 and 5 are considered abnormal while cover type 2 is considered normal

df_covtype = pd.DataFrame(covtype.data)
df_covtype['target'] = covtype.target
df_covtype = df_covtype.query('target in [2,4,5]')
df_covtype.target = df_covtype.target.replace(2, 0).replace(4, 1).replace(5, 1)

# For the SF and http datasets, all the categories not flagged as normal are considered abnormal

df_sf = pd.DataFrame(SF.data)
df_sf['target'] = SF.target
Example #17
import sklearn, sys, time
from sklearn.neighbors import KNeighborsClassifier
import numpy as numpy
from sklearn.datasets.mldata import fetch_mldata
t0 = time.clock()
mnist = fetch_mldata('MNIST original')
X, y = mnist.data / 255., mnist.target
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

time_dataset= time.clock() - t0
print("Dataset created in time "+str(time_dataset))

t0 = time.clock()
# print(mnist.data.shape)
neigh = KNeighborsClassifier(n_neighbors=3, metric="chebyshev", n_jobs=2)
neigh.fit(X_train, y_train)
time_train= time.clock() - t0
print("Dataset trained in time "+str(time_train))

t0 = time.clock()
y_pred = neigh.predict(X_test)
time_pred = time.clock() - t0
print("Predicted in time "+ str(time_pred))
print(y_pred.tolist())
score = 0
for x in range(len(y_pred)):
	if y_pred[x] == y_test[x]:
		score = score + 1
score = float(score)/len(y_pred)
print("score = "+ str(score))
Example #18
'''
Created on Aug 16, 2014

@author: ryanshiroma
'''
from sklearn import datasets
from sklearn.datasets.mldata import fetch_mldata
from sklearn.datasets import fetch_mldata
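# note: custom_data_home below is an undefined placeholder for a local cache directory
# (for example a path string, or tempfile.mkdtemp() as in the other examples on this page)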
mnist = fetch_mldata('MNIST original', data_home=custom_data_home)



if __name__ == '__main__':
    pass

Example #19
 def __init__(self):
     self.dataset = fetch_mldata("mnist-original", data_home="./")
     self.features = np.array(self.dataset.data, 'int16')
     self.labels = np.array(self.dataset.target, 'str')
Example #20
from RuleListClassifier import *
import sklearn.ensemble
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata

dataseturls = ["https://archive.ics.uci.edu/ml/datasets/Iris", "https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes"]
datasets = ["iris", "diabetes"]
data_feature_labels = [
    ["Sepal length", "Sepal width", "Petal length", "Petal width"],
    ["#Pregnant","Glucose concentration demo","Blood pressure(mmHg)","Triceps skin fold thickness(mm)","2-Hour serum insulin (mu U/ml)","Body mass index","Diabetes pedigree function","Age (years)"]
]
data_class1_labels = ["Iris Versicolour", "No Diabetes"]
for i in range(len(datasets)):
    print "--------"
    print "DATASET: ", datasets[i], "(", dataseturls[i], ")"
    data = fetch_mldata(datasets[i])
    y = data.target
    y[y>1] = 0
    y[y<0] = 0

    Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y)    
    
    clf = RuleListClassifier(max_iter=50000, n_chains=3, class1label=data_class1_labels[i], verbose=False)
    clf.fit(Xtrain, ytrain, feature_labels=data_feature_labels[i])
    
    print "accuracy:", clf.score(Xtest, ytest)
    print "rules:\n", clf
    print "Random Forest accuracy:", sklearn.ensemble.RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)
Example #21
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import mldata
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

mnist = mldata.fetch_mldata('MNIST Original')


def shuffle_split(data, labels, test_ratio=0.2):
    indices = np.random.permutation(data.shape[0])
    test_size = int(test_ratio*data.shape[0])
    return data[indices[test_size:]], labels[indices[test_size:]], \
           data[indices[: test_size]], labels[indices[: test_size]]


def print_image(data):
    data_image = data.reshape(28, 28)
    plt.imshow(data_image, cmap=matplotlib.cm.binary, interpolation='nearest')
    plt.show()


def evaluate_model(data, labels):
    classifier = SGDClassifier(random_state=42)
    labels_predict = cross_val_predict(classifier, data, labels == 5, cv=3)
    return confusion_matrix(labels == 5, labels_predict)

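Example #21 defines its helpers but the excerpt never calls them; a hypothetical driver wiring them together might look like this (the split ratio is an assumption):

train_X, train_y, test_X, test_y = shuffle_split(mnist.data, mnist.target, test_ratio=0.2)
print_image(train_X[0])
print(evaluate_model(train_X, train_y))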
Example #22
import numpy as np
import random
from frameworks.CPLELearning import CPLELearningModel
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier
import sklearn.svm
from methods.scikitWQDA import WQDA
from frameworks.SelfLearning import SelfLearningModel

# load data
cancer = fetch_mldata("Lung cancer (Ontario)")
X = cancer.target.T
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(np.where(ytrue == 0)[0], labeled_N/2)+\
                        random.sample(np.where(ytrue == 1)[0], labeled_N/2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
#basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
Example #23
    p1 = subprocess.Popen(
        ["./lcm", '{}f'.format(type), filename,
         str(min_support), "-"],
        stdout=subprocess.PIPE)

    output = p1.communicate(str.encode("utf-8"))[0]
    output = output.decode('utf-8')
    itemsets = output.split('\n')

    itemsets = list(map(split_itemset, itemsets))
    itemsets = list(filter(None.__ne__, itemsets))
    return itemsets
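
# The function above relies on a split_itemset helper that the excerpt does not show.
# A hypothetical sketch, assuming LCM prints one space-separated itemset per line;
# the exact output format handled by the original helper is an assumption.
def split_itemset(line):
    items = line.strip().split()
    if not items:
        return None            # empty lines are dropped by the None.__ne__ filter above
    return [int(item) for item in items if item.isdigit()]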


mnist_path = "mnist"
digits = fetch_mldata('mnist-original', data_home=mnist_path)

features = digits.data
labels = digits.target

class_0 = features[labels == 0, :]

transactions = []
for i in range(class_0.shape[0]):
    transactions.append(np.where(class_0[i, :] > 50)[0].tolist())

with open('s.txt', 'w') as fp:
    for i in range(class_0.shape[0]):
        fp.write(' '.join(map(str, transactions[i])))
        if (i + 1 != class_0.shape[0]):
            fp.write('\n')
Example #24
# -*- coding: utf-8 -*-
from shiftpixels import *
from sklearn.datasets.mldata import fetch_mldata
mnist = fetch_mldata('mnist-original', data_home='/Users/maxim/Python AI/Hands on ML/datasets')
mnist

X, y = mnist["data"], mnist["target"]

import matplotlib
import matplotlib.pyplot as plt

some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)

plt.imshow(some_digit_image, cmap = matplotlib.cm.binary,
           interpolation="nearest")
plt.axis("off")
plt.show()

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]




y_train_5 = (y_train == 5) 
y_test_5 = (y_test == 5)
Example #25
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

dataset = fetch_mldata('MNIST original')

#Importing the Data from the dataset
X = dataset["data"]
y = dataset["target"]

#Print the image at position 10062 to verify that the data fetch was successful
element_num = 10062
temp = X[element_num].reshape(28, 28)
plt.imshow(temp)
plt.show()
print(y[element_num])  #Printing the label associated with the printed image

#Separating the training and test data
Train_X = X[:6000]
Train_y = y[:6000]
Test_X = X[60000:]
Test_y = y[60000:]

#The training data and the test data are now separated properly, so we don't need to worry about them.
#But since we are going to perform cross-validation on the training set, let's shuffle it so that the digits
#are distributed evenly and none of them are missing from any validation or training fold; some algorithms
#only perform well on evenly distributed data.
reviced_index = np.random.permutation(Train_X.shape[0])
Example #26
File: demo.py, Project: vgoliber/qboost
    print("=============================================")

    # plt.subplot(211)
    # plt.bar(range(len(y_test)), y_test)
    # plt.subplot(212)
    # plt.bar(range(len(y_test)), y_test_dw)
    # plt.show()

    return


if __name__ == '__main__':

    if '--mnist' in sys.argv:

        mnist = fetch_mldata('MNIST original', data_home='data')

        idx_01 = np.where(mnist.target <= 10)[0]

        np.random.shuffle(idx_01)
        idx_01 = idx_01[:5000]
        idx_train = idx_01[:2*len(idx_01)//3]
        idx_test = idx_01[2*len(idx_01)//3:]

        X_train = mnist.data[idx_train]
        X_test = mnist.data[idx_test]

        y_train = 2*(mnist.target[idx_train] <= 4) - 1
        y_test = 2*(mnist.target[idx_test] <= 4) - 1

        clfs = train_model(X_train, y_train, X_test, y_test, 1.0)
Example #27
        target_name = "int3"
        if parsed.target_name != target_name:
            logging.warning(
                "{} target is {}".format(parsed.dataset, target_name)
            )
        parsed.target_name = target_name
    elif parsed.dataset == "uci-20070111-liver-disorders":
        target_name = "int2"
        if parsed.target_name != target_name:
            logging.warning(
                "{} target is {}".format(parsed.dataset, target_name)
            )
        parsed.target_name = target_name

    bunch = fetch_mldata(
        parsed.dataset, target_name=parsed.target_name,
        data_home=data_home
    )

    data, labels = scale(bunch['data']), bunch['target']
    old_labels = np.empty_like(labels)
    np.copyto(old_labels, labels)
    for i, label in enumerate(np.unique(labels)):
        labels[old_labels == label] = i + 1
    labels = np.ravel(labels).astype(int)

    skf = StratifiedKFold(
        y=labels, n_folds=2, shuffle=False, random_state=42
    )
    # get the last of the two splits
    for train_idx, test_idx in skf: pass
Example #28
8. ANOREXIA: no, yes
9. LIVER BIG: no, yes 
10. LIVER FIRM: no, yes 
11. SPLEEN PALPABLE: no, yes 
12. SPIDERS: no, yes 
13. ASCITES: no, yes 
14. VARICES: no, yes 
15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00 
-- see the note below 
16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250 
17. SGOT: 13, 100, 200, 300, 400, 500, 
18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0 
19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90 
20. HISTOLOGY: no, yes 
""" 
data = fetch_mldata("datasets-UCI hepatitis") # get dataset

#some data cleaning (due to horrible mldata format)
# target
y = [1 if 'live' in v[0].lower() else 0 for v in data['Class'][0]]
# categorical variables
data['SEX'] = data.data
feature_labels = [col for col in data['COL_NAMES'] if col == col.upper()]
columns = {}
for label in feature_labels:
    column = data[label] if len(data[label])>1 else data[label][0]
    while type(column[0]) == list or type(column[0]) == np.ndarray:
        column = [c[0] for c in column]
    columns[label] = pd.Series(column)
# numeric variables
columns['AGE'] = data.target 
Example #29
 def __init__(self):
     self.data = fetch_mldata('MNIST original')
     self._preprocess_data()
Example #30
from sklearn.datasets.mldata import fetch_mldata
import tempfile
test_data_home = tempfile.mkdtemp()
breast=fetch_mldata('datasets-UCI breast-w', transpose_data=True, data_home=test_data_home)
#breast=fetch_mldata('housing_scale', data_home=test_data_home)
print breast.data.shape
n_samples, n_features = breast.data.shape

print n_samples,n_features
print breast.data.shape
#print breast.data[0]
print breast.target.shape



Example #31
from sklearn.cross_validation import train_test_split
from sklearn.datasets.mldata import fetch_mldata
from RuleListClassifier import *
from sklearn.ensemble import RandomForestClassifier

feature_labels = ["#Pregnant","Glucose concentration test","Blood pressure(mmHg)","Triceps skin fold thickness(mm)","2-Hour serum insulin (mu U/ml)","Body mass index","Diabetes pedigree function","Age (years)"]
    
data = fetch_mldata("diabetes") # get dataset
y = (data.target+1)/2 # target labels (0 or 1)
Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y) # split

# train classifier (allow more iterations for better accuracy)
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(Xtrain, ytrain, feature_labels=feature_labels)

print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)
Example #32
8. ANOREXIA: no, yes
9. LIVER BIG: no, yes 
10. LIVER FIRM: no, yes 
11. SPLEEN PALPABLE: no, yes 
12. SPIDERS: no, yes 
13. ASCITES: no, yes 
14. VARICES: no, yes 
15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00 
-- see the note below 
16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250 
17. SGOT: 13, 100, 200, 300, 400, 500, 
18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0 
19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90 
20. HISTOLOGY: no, yes 
"""
data = fetch_mldata("datasets-UCI hepatitis")  # get dataset

#some data cleaning (due to horrible mldata format)
# target
y = [1 if 'live' in v[0].lower() else 0 for v in data['Class'][0]]
# categorical variables
data['SEX'] = data.data
feature_labels = [col for col in data['COL_NAMES'] if col == col.upper()]
columns = {}
for label in feature_labels:
    column = data[label] if len(data[label]) > 1 else data[label][0]
    while type(column[0]) == list or type(column[0]) == np.ndarray:
        column = [c[0] for c in column]
    columns[label] = pd.Series(column)
# numeric variables
columns['AGE'] = data.target
Example #33
# <markdowncell>

# For files that are not too large, save them as .npy

# <markdowncell>

# Related reference: http://www.astrobetter.com/blog/2013/07/29/python-tip-storing-data/

# <codecell>

from sklearn.datasets.mldata import fetch_mldata

# <codecell>

dataset = fetch_mldata('MNIST Original')

# <headingcell level=2>

# [Reading .mat files](http://stackoverflow.com/questions/874461/read-mat-files-in-python)

# <codecell>

from scipy.io import loadmat

# <codecell>

mat = loadmat('New/mauna-loa-atmospheric-co2.mat')

# <headingcell level=2>
Example #34
from sklearn.datasets.mldata import fetch_mldata
import tempfile
test_data_home = tempfile.mkdtemp()

#data = fetch_mldata('uci-20070111 breastTumor', data_home=test_data_home)
dat = fetch_mldata('housing_scale',transpose_data=True, data_home=test_data_home)
print (dat.data.shape)
print (dat.target.shape)
print (dat.data[1])
print (dat.target_names)
Example #35
sys.path.append('../')

import numpy as np
import random
from frameworks.CPLELearning import CPLELearningModel
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier
import sklearn.svm
from methods.scikitWQDA import WQDA
from frameworks.SelfLearning import SelfLearningModel

# load data
# fetch_mldata
# cancer = fetch_mldata("Lung cancer (Ontario)")        # load data

cancer = fetch_mldata("heart")  # load data

X = cancer.target.T  # label (270, )
ytrue = np.copy(cancer.data).flatten()  # data (3510, )
ytrue[ytrue > 0] = 1  # set the value = 1 where the original value > 0

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point

# print(list(np.where(ytrue == 0)[0]))

random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), int(labeled_N/2))+\
                        random.sample(list(np.where(ytrue == 1)[0]), int(labeled_N/2))

# set the labels of the labeled samples
Example #36
# Feature importance
#---------------------
"""
# on Mac, following code didn't work 
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
mnist
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets.mldata import fetch_mldata
import tempfile

test_data_home = tempfile.mkdtemp()

mnist = fetch_mldata('MNIST original', data_home=test_data_home)
mnist

rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(mnist["data"], mnist["target"])

import matplotlib
import matplotlib.pyplot as plt


def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap=matplotlib.cm.hot, interpolation="nearest")
    plt.axis("off")

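The excerpt stops before plot_digit is called; the usual continuation of this feature-importance example is to plot the forest's per-pixel importances, sketched here as an assumed next step:

# assumed continuation: visualize per-pixel feature importances as a 28x28 heat map
plot_digit(rnd_clf.feature_importances_)
plt.show()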
Example #37
import numpy as np
import random
from frameworks.CPLELearning import CPLELearningModel
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier
import sklearn.svm
from methods.scikitWQDA import WQDA
from frameworks.SelfLearning import SelfLearningModel

# load data
cancer = fetch_mldata("Lung cancer (Ontario)")
X = cancer.target.T
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(np.where(ytrue == 0)[0], labeled_N / 2) + random.sample(
    np.where(ytrue == 1)[0], labeled_N / 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
Example #38
File: nn_final.py, Project: pknelakuditi/ML
from sklearn.datasets import *
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
#from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.metrics import classification_report,accuracy_score
from sklearn import neighbors


from sklearn.datasets.mldata import fetch_mldata
import tempfile
test_data_home = tempfile.mkdtemp()



data = fetch_mldata('uci-20070111 wine', data_home=test_data_home)
#print data.DESCR
n_trials = 3
train_percentage = [90,70,50]
# Set the parameters by cross-validation
tuned_parameters = [{'n_neighbors' : [2,4,5,6,7,8,9,10], 'weights' : ['uniform', 'distance']}]
print "All used parameters :",tuned_parameters
test_accuracies = numpy.zeros(n_trials)

for n in train_percentage:
    print "K neighbour classfier using brute approach for Wine data"
    print ""
    print ""
    
    print "training percentage ::",n
    print ""
Example #39
#!/usr/bin/env python3

__author__ = "Thibaut Thonet, Maziar Moradi Fard"
__license__ = "GPL"

import tensorflow as tf
from utils import read_list
from sklearn.datasets.mldata import fetch_mldata

# Fetch the dataset
dataset = fetch_mldata("USPS")
print("Dataset USPS loaded...")
data = dataset.data
target = dataset.target - 1  # Labels between 0 and 9 to match digits
n_samples = data.shape[0]  # Number of samples in the dataset
n_clusters = 10  # Number of clusters to obtain

# Get the split between training/test set and validation set
test_indices = read_list("split/usps/test")
validation_indices = read_list("split/usps/validation")

# Auto-encoder architecture
input_size = data.shape[1]
hidden_1_size = 500
hidden_2_size = 500
hidden_3_size = 2000
embedding_size = n_clusters
dimensions = [
    hidden_1_size,
    hidden_2_size,
    hidden_3_size,
Example #40
bench_active_set = 0

if dataset_id == 1:
    dataset = "synthetic"
    n_samples, n_features, n_tasks = (47, 1177, 20)
    # generate dataset
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_features,
                           n_targets=n_tasks)  # , random_state=2)
    X = X.astype(float)
    y = y.astype(float)
    eps = 1e-3

elif dataset_id == 2:
    dataset = "leukemia"
    data = fetch_mldata(dataset)
    X = data.data  # [:, ::10]
    y = data.target[:, None]
    X = X.astype(float)
    y = y.astype(float)
    eps = 1e-3

if dataset_id == 3:
    # The data can be found in https://drive.google.com/open?id=139nKKy0AkpkZntB80n-LmcuzGgC8pQHi
    # Please unzip the file "meg_data.tar.gz"
    dataset = 'meg_full'
    data = io.loadmat('meg_Xy_full.mat')
    X = np.array(data['X'], dtype=float, order='F')
    Y = np.array(data['Y'], dtype=float)
    y = Y
    idx = np.argmax(np.sum(Y**2, axis=0))
Example #41
# [Classifying images with a Gaussian process](http://www.pyimagesearch.com/2014/09/22/getting-started-deep-learning-python/)

# <codecell>

from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets
from sklearn import gaussian_process
import numpy as np
import cv2
from sklearn.datasets.mldata import fetch_mldata
import pickle

# <codecell>

mnist = fetch_mldata('MNIST original')

# <markdowncell>

# On the first run, fetch the data with fetch_mldata:
# dataset = fetch_mldata('MNIST Original')
# 
# Afterwards you can simply load the pickled file:
# fid = open('MnistData.pkl','wb')
# pickle.dump(dataset,fid)

# <markdowncell>

# Importing the serialized data

# <codecell>
Example #42
the predictions of a :class:`sklearn.ensemble.RandomForestRegressor` object.

The data used here are a classical machine learning data-set, describing
various features of different cars, and their MPG.
"""

# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.cross_validation as xval
from sklearn.datasets.mldata import fetch_mldata
import forestci as fci

# retrieve mpg data from the machine learning library
mpg_data = fetch_mldata('mpg')

# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]

# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42)

# create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)
Example #43
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets.mldata import fetch_mldata
import matplotlib.pyplot as plt
#from display_network import *
mnist = fetch_mldata('mnist-original',
                     data_home='/media/Vancouver/apps/mnist_dataset/')
print(mnist)
X_all = mnist.data
y_all = mnist.target
X0 = X_all[np.where(y_all == 0)[0]]  # all digit 0
X1 = X_all[np.where(y_all == 1)[0]]  # all digit 1
y0 = np.zeros(X0.shape[0])  # class 0 label
y1 = np.ones(X1.shape[0])  # class 1 label
X = np.concatenate((X0, X1), axis=0)  # all digits
y = np.concatenate((y0, y1))  # all labels
# split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2000)
#################
print(X_train.shape)
model = LogisticRegression(C=1e5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(y_pred)
print("Accuracy %.2f%%" % (100 * accuracy_score(y_test, y_pred.tolist())))
mis = np.where((y_pred - y_test) != 0)[0]
print(mis)
Example #44
from dcgan.trainer import Trainer
from dcgan.generator import Generator
from dcgan.discreminator import Discriminator

from sklearn.datasets.mldata import fetch_mldata
import numpy as np
import pandas as pd

import pickle

if __name__ == '__main__':
    gen = Generator(100)
    dis = Discriminator()

    data = fetch_mldata('mnist-original', data_home=".")
    X = data['data']
    n_train = X.shape[0]
    X = np.array(X, dtype=np.float32)
    X /= 255.
    X = X.reshape(n_train, 1, 28, 28)

    trainer = Trainer(gen, dis)

    trainer.fit(X, batch_size=1000, epochs=1000)

    df_loss = pd.DataFrame(trainer.loss)
    df_loss.to_csv('loss.csv')

    gen.to_cpu()
    dis.to_cpu()
Example #45
from sklearn.neural_network import MLPClassifier
import struct
import numpy as np
from sklearn.datasets.mldata import fetch_mldata

if __name__ == '__main__':
    mnist = fetch_mldata('MNIST original')
    # rescale the data, use the traditional train/test split
    X, y = mnist.data / 255., mnist.target
    X_train, X_test = X[:60000], X[60000:]
    y_train, y_test = y[:60000], y[60000:]
    clf = MLPClassifier(alpha=0.01,
                        hidden_layer_sizes=(200, 150),
                        random_state=1,
                        max_iter=10)
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    print("Test set score: %f" % clf.score(X_test, y_test))
Example #46
the predictions of a :class:`sklearn.ensemble.RandomForestRegressor` object.

The data used here are a classical machine learning data-set, describing
various features of different cars, and their MPG.
"""

# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets.mldata import fetch_mldata
import forestci as fci

# retrieve mpg data from the machine learning library
mpg_data = fetch_mldata('mpg')

# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]

# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
                                                   mpg_X, mpg_y,
                                                   test_size=0.25,
                                                   random_state=42
                                                   )

# create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
Example #47
pl.show()
print("Wartosci: 7, 15")
net5.trainf = nl.train.train_gd
error5 = net5.train(inp, tar, epochs=500, show=100, goal=0.002)
out5 = net5.sim(inp)
x2 = np.linspace(1, 2.5, 150)
y2 = net5.sim(x2.reshape(x2.size, 1)).reshape(x2.size)
y3 = net5.sim(inp).reshape(size)

pl.plot(x2, y2, '-', x, y, '.', x, y3, 'p')
pl.legend(['actual value', 'learning result'])
pl.show()

# Task 3:

data = fetch_mldata('MNIST')
train, test, train_targets, test_targets = train_test_split(data.data,
                                                            data.target,
                                                            test_size=0.5,
                                                            random_state=42)

mlp = MLPClassifier(solver='adam', alpha=0.0001)
mlp.fit(train, train_targets).predict(test)
print(mlp.score(test, test_targets))

mlp = MLPClassifier(solver='lbfgs', alpha=0.0001)
mlp.fit(train, train_targets).predict(test)
print(mlp.score(test, test_targets))

mlp = MLPClassifier(alpha=0.000001)
mlp.fit(train, train_targets).predict(test)
Example #48
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 22 13:26:02 2015

@author: LegendsUser
"""

import numpy as np
import random
from sklearn.datasets.mldata import fetch_mldata
from sklearn.linear_model.stochastic_gradient import SGDClassifier
import sklearn.svm
from scikitWQDA import WQDA
from SelfLearning import SelfLearningModel

heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0

labeled_N = 2
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(
    np.where(ytrue == 0)[0]), int(labeled_N / 2)) + random.sample(
        list(np.where(ytrue == 1)[0]), int(labeled_N / 2))

ys[random_labeled_points] = ytrue[random_labeled_points]

basemodel = SGDClassifier(loss='log', penalty='l1')

basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])