Example #1
def leave_one_out_cv(gram_matrix, labels, alg = 'SVM'):
    """
    leave-one-out cross-validation
    """
    scores = []
    preds = []
    loo = sklearn.cross_validation.LeaveOneOut(len(labels))
    for train_index, test_index in loo:
        X_train, X_test = gram_matrix[train_index][:,train_index], gram_matrix[test_index][:, train_index]
        y_train, y_test = labels[train_index], labels[test_index]
        if alg == 'SVM':
            svm = sklearn.svm.SVC(kernel = 'precomputed')
            svm.fit(X_train, y_train)
            preds += svm.predict(X_test).tolist()
            score = svm.score(X_test, y_test)
        elif alg == 'kNN':
            knn = sklearn.neighbors.KNeighborsClassifier()
            knn.fit(X_train, y_train)
            preds += knn.predict(X_test).tolist()
            score = knn.score(X_test, y_test)
        scores.append(score)

    print "Mean accuracy: %f" %(np.mean(scores))
    print "Stdv: %f" %(np.std(scores))

    return preds, scores
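A minimal usage sketch for the helper above (the data, shapes, and linear kernel are all illustrative assumptions; it also presumes the legacy sklearn.cross_validation API the function targets):

import numpy as np
X = np.random.randn(20, 5)                 # 20 samples, 5 features
labels = np.random.randint(0, 2, size=20)  # binary labels as a numpy array
gram_matrix = X.dot(X.T)                   # linear-kernel Gram matrix, shape (20, 20)
preds, scores = leave_one_out_cv(gram_matrix, labels, alg='SVM')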
def svm_iterkernel(train_data, train_labels, test_data, test_labels, op_name_dir):


	label_set=np.unique(train_labels)

	if op_name_dir not in ('None', 'none'):
		fo=open(op_name_dir,'a')

	predict_list={}
	for kernel in ['linear']: #, 'poly', 'rbf']:
		t0=time.time()
		svm = SVC(C=1., kernel=kernel, cache_size=10240)
		svm.fit(train_data, train_labels)
		prediction=svm.predict(test_data)
		predict_list[kernel]=prediction
		pred_acc_tot =(float(np.sum(prediction == test_labels)))/len(test_labels)
		print time.time() - t0, ',kernel = '+kernel, ',pred acc = '+str(round(pred_acc_tot*100))
		if op_name_dir not in ('None', 'none'):
			fo.write('time='+str(time.time() - t0)+'sec,kernel='+kernel+',pred acc='+str(round(pred_acc_tot*100))+'\n')
		for lab_unq in label_set:	
			pred_acc=(prediction == lab_unq) & (test_labels == lab_unq)
			pred_acc=float(pred_acc.sum())/(len(test_labels[test_labels == lab_unq]))
			print 'pred_'+str(lab_unq)+','+str(round(pred_acc*100))	
			if op_name_dir not in ('None', 'none'):
				fo.write('pred_'+str(lab_unq)+','+str(round(pred_acc*100))+'\n')

	if op_name_dir not in ('None', 'none'):
		fo.close()

	return predict_list
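A hypothetical call for the wrapper above (the argument names are assumptions); passing 'None' as op_name_dir skips the log file:

predict_list = svm_iterkernel(X_train, y_train, X_test, y_test, 'None')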
def trainSVM(filteredFaces, labels, subjects, e):
    uniqueSubjects = np.unique(subjects)
    accuracies = []
    masterK = filteredFaces.dot(filteredFaces.T)
    for testSubject in uniqueSubjects:
        idxs = np.nonzero(subjects != testSubject)[0]
        someFilteredFacesTrain = filteredFaces[idxs]
        someLabels = labels[idxs]
        y = someLabels == e
        K = masterK[idxs, :]
        K = K[:, idxs]
        svm = sklearn.svm.SVC(kernel="precomputed")
        svm.fit(K, y)

        idxs = np.nonzero(subjects == testSubject)[0]
        someFilteredFaces = filteredFaces[idxs]
        someLabels = labels[idxs]
        y = someLabels == e
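        # test-vs-train kernel block: each held-out sample dotted with the training samples used to fit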
        yhat = svm.decision_function(someFilteredFaces.dot(someFilteredFacesTrain.T))

        if len(np.unique(y)) > 1:
            auc = sklearn.metrics.roc_auc_score(y, yhat)
        else:
            auc = np.nan
        print "{}: {}".format(testSubject, auc)
        accuracies.append(auc)
    accuracies = np.array(accuracies)
    accuracies = accuracies[np.isfinite(accuracies)]
    print np.mean(accuracies), np.median(accuracies)
def train():
    training_set = []
    training_labels = []
    os.chdir("/Users/muyunyan/Desktop/EC500FINAL/logo/")
    for i in os.listdir("."):
        os.chdir(i)
        print(i)
        for d in os.listdir("."):
            img = cv2.imread(d)
            res = cv2.resize(img, (250, 250))
            gray_image = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
            xarr = np.squeeze(np.array(gray_image).astype(np.float32))
            m, v = cv2.PCACompute(xarr)
            arr = np.array(v)
            flat_arr = arr.ravel()
            training_set.append(flat_arr)
            training_labels.append(i)
        os.chdir("..")
    # Train a single classifier on all folders; note the estimator must not be
    # named `svm`, or it would shadow the sklearn `svm` module in this function.
    trainData = training_set
    responses = training_labels
    clf = svm.SVC()
    clf.fit(trainData, responses)
    return clf
Example #5
File: svm.py Project: aflower15/pythia
def run_model(train_data, train_labels, test_data, test_labels):
    '''
    Takes in a set of (already vectorized) training data and labels to train a
    support vector machine, which is then used to predict the labels for a second set of data.
    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
    Args:
        train_data: Training set features.  Needs to be iterable
        train_labels: Training set labels
        test_data: The data to predict labels for
    Returns:
        pred_labels: The labels predicted by the support vector machine
        perform_results: Dictionary of performance metrics for the run
    '''

    #train a Support Vector Machine model
    svm = SVC()

    # we create an instance of the SVC classifier and fit the data.
    svm.fit(train_data, train_labels)

    #Now that we have something trained we can check if it is accurate with the test set
    pred_labels = svm.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(test_labels, pred_labels)

    #Perform_results is a dictionary, so we should add other pertinent information to the run
    perform_results['vector'] = 'Bag_of_Words'
    perform_results['alg'] = 'Support_Vector_Machine'

    return pred_labels, perform_results
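run_model expects already-vectorized features, so a vectorization step like the following would precede it; this is a hedged sketch (the toy corpus is invented, and the commented-out call still needs the project's own performance_metrics module):

from sklearn.feature_extraction.text import CountVectorizer
train_texts = ["good movie", "bad movie"]   # toy corpus
test_texts = ["good film"]
train_labels, test_labels = [1, 0], [1]
vec = CountVectorizer()
train_X = vec.fit_transform(train_texts)
test_X = vec.transform(test_texts)
# pred_labels, results = run_model(train_X, train_labels, test_X, test_labels)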
def trainOneSVM(masterK, y, subjects):
    Cs = 1.0 / np.array([0.1, 0.5, 2.5, 12.5, 62.5, 312.5])
    # Cs = 10. ** np.arange(-5, +6)/2.
    uniqueSubjects, subjectIdxs = np.unique(subjects, return_inverse=True)
    highestAccuracy = -float("inf")
    NUM_MINI_FOLDS = 4
    for C in Cs:  # For each regularization value
        # print "C={}".format(C)
        accuracies = []
        for i in range(NUM_MINI_FOLDS):  # For each test subject
            testIdxs = np.nonzero(subjectIdxs % NUM_MINI_FOLDS == i)[0]
            trainIdxs = np.nonzero(subjectIdxs % NUM_MINI_FOLDS != i)[0]
            if len(np.unique(y[testIdxs])) > 1:
                K = masterK[trainIdxs, :]
                K = K[:, trainIdxs]
                svm = sklearn.svm.SVC(kernel="precomputed", C=C)
                svm.fit(K, y[trainIdxs])

                K = masterK[testIdxs, :]
                K = K[:, trainIdxs]  # I.e., need trainIdxs dotted with testIdxs
                accuracy = sklearn.metrics.roc_auc_score(y[testIdxs], svm.decision_function(K))
                # print accuracy
                accuracies.append(accuracy)
        if np.mean(accuracies) > highestAccuracy:
            highestAccuracy = np.mean(accuracies)
            bestC = C
    svm = sklearn.svm.SVC(kernel="precomputed", C=bestC)
    svm.fit(masterK, y)
    return svm
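An illustrative driver for trainOneSVM (every shape and name below is invented): rows of X are samples, subjects holds one id per row, and the precomputed kernel is the linear Gram matrix.

import numpy as np
import sklearn.svm
np.random.seed(0)
X = np.random.randn(40, 6)
y = np.random.rand(40) > 0.5           # boolean labels, as in trainSVM above
subjects = np.repeat(np.arange(8), 5)  # 8 subjects, 5 samples each
masterK = X.dot(X.T)                   # linear Gram matrix
model = trainOneSVM(masterK, y, subjects)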
Example #7
def main():
    data = pickle.load(open('../submodular_20.pickle'))
    train, train_labels, test, test_labels = Load20NG()
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True,
            lowercase=False) 
    vectorizer.fit(train + test)                                                          
    train_vectors = vectorizer.transform(train)
    test_vectors = vectorizer.transform(test)                                             
    svm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10,gamma=0.001)               
    svm.fit(train_vectors, train_labels)                                                  
    
    json_ret = {}
    json_ret['class_names'] = ['Atheism', 'Christianity']
    json_ret['instances'] = []
    explanations = data['explanations']['20ng']['svm']
    idxs = data['submodular_idx']['20ng']['svm'][:10]
    for i in idxs:
        json_obj = {}
        json_obj['id'] = i
        idx = i
        instance = test_vectors[idx]
        json_obj['true_class'] = test_labels[idx]
        json_obj['c1'] = {}
        json_obj['c1']['predict_proba'] = list(svm.predict_proba(test_vectors[idx])[0])
        exp = explanations[idx]
        json_obj['c1']['exp'] = exp 
        json_obj['c1']['data'] = get_pretty_instance(test[idx], exp, vectorizer)
        json_ret['instances'].append(json_obj)
    import json
    open('static/exp2_local.json', 'w').write('data = %s' % json.dumps(json_ret))
Example #8
File: 1.py Project: pjhades/coursera
def q20():
    X, y = load_data('/Users/pjhades/code/lab/ml/train.dat')
    y = set_binlabel(y, 0)

    # init hit counts
    gammas = [1, 10, 100, 1000, 10000]
    hits = {}
    for gamma in gammas:
        hits[gamma] = 0

    repeat = 100
    for round in range(repeat):
        print('round {0}/{1}'.format(round, repeat), end=', ')

        err_min = 1
        gamma_min = max(gammas) + 1

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=1000)
        for gamma in gammas:
            svm = sklearn.svm.SVC(C=0.1, kernel='rbf', gamma=gamma)
            svm.fit(X_train, y_train)
            err = get_error(svm, X_val, y_val)
            if err < err_min or (err == err_min and gamma < gamma_min):
                err_min = err
                gamma_min = gamma
        hits[gamma_min] += 1
        print('gamma={0}'.format(gamma_min))

    for gamma in gammas:
        print('{0} hits {1} times'.format(gamma, hits[gamma]))
Example #9
File: 1.py Project: pjhades/coursera
def q15():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')
    y = set_binlabel(y_train, 0)

    svm = sklearn.svm.SVC(C=0.01, kernel='linear') 
    svm.fit(X_train, y)
    print(linalg.norm(svm.coef_))
Example #10
def k_fold_cv(gram_matrix, labels, folds = 10, alg = 'SVM', shuffle = True):
    """
    K-fold cross-validation
    """
    scores = []
    preds = []
    loo = sklearn.cross_validation.KFold(len(labels), folds, shuffle = shuffle, random_state = random.randint(0,100))
    #loo = sklearn.cross_validation.LeaveOneOut(len(labels))
    for train_index, test_index in loo:
        X_train, X_test = gram_matrix[train_index][:,train_index], gram_matrix[test_index][:, train_index]
        y_train, y_test = labels[train_index], labels[test_index]
        if alg == 'SVM':
            svm = sklearn.svm.SVC(kernel = 'precomputed')
            svm.fit(X_train, y_train)
            preds += svm.predict(X_test).tolist()
            score = svm.score(X_test, y_test)
        elif alg == 'kNN':
            knn = sklearn.neighbors.KNeighborsClassifier()
            knn.fit(X_train, y_train)
            preds += knn.predict(X_test).tolist()
            score = knn.score(X_test, y_test)

        scores.append(score)

    print "Mean accuracy: %f" %(np.mean(scores))
    print "Stdv: %f" %(np.std(scores))

    return preds, scores
def svm_train(X,y,k):
	C_range = 10.0 ** np.arange(-2, 9)
	gamma_range = 10.0 ** np.arange(-5, 4)
	param_grid = dict(gamma=gamma_range, C=C_range)
	cv = StratifiedKFold(y=y,n_folds=k)
	svm = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
	svm.fit(X,y)
	return svm
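The object returned by svm_train is a fitted GridSearchCV, so the usual attributes are available afterwards; a toy sketch (random data, assuming the legacy StratifiedKFold(y=..., n_folds=...) API the function targets):

import numpy as np
X = np.random.randn(60, 4)
y = np.random.randint(0, 2, 60)
model = svm_train(X, y, k=3)
print(model.best_params_)   # best C/gamma combination found
print(model.best_score_)    # mean cross-validated accuracy for that combination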
Example #12
def svm_liblinear_solver(X, y, C, tol=1e-6, max_iter=100, verbose=False):
    svm = sklearn.svm.LinearSVC(loss='hinge', tol=tol, C=C, verbose=verbose,
                                intercept_scaling=10, max_iter=max_iter)
    now = time.clock()
    svm.fit(X, y)
    res_time = time.clock() - now
    return {'w0': svm.intercept_[0],
            'w': svm.coef_.copy()[0],
            'time': res_time}
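A toy run of the wrapper above (shapes invented; time.clock() also ties it to the same legacy Python environment the rest of this example assumes):

import numpy as np
X = np.random.randn(200, 3)
y = np.sign(X[:, 0] + 0.1 * np.random.randn(200))
res = svm_liblinear_solver(X, y, C=1.0)
print(res['w0'], res['w'], res['time'])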
Example #13
def trainSVM(svm, sv, y):
	print "\ntraining SVM"
	# cross validate 5 times
	scores = cross_val_score(svm, sv, y, cv=5)
	print scores

	# fit the data to the labels
	svm.fit(sv, y)
	return svm
Example #14
File: 1.py Project: pjhades/coursera
def q16_17():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')

    for goal in [0, 2, 4, 6, 8]:
        y = set_binlabel(y_train, goal)
        svm = sklearn.svm.SVC(C=0.01, kernel='poly', degree=2, coef0=1, gamma=1)
        svm.fit(X_train, y)
        ein = get_error(svm, X_train, y)
        print('{0} vs not {0}, ein={1}'.format(goal, ein), end=', ')
        print('sum of alphas={0}'.format(np.sum(np.abs(svm.dual_coef_))))
Example #15
File: 1.py Project: pjhades/coursera
def q19():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')
    X_test, y_test = load_data('/Users/pjhades/code/lab/ml/test.dat')

    y_train = set_binlabel(y_train, 0)
    y_test = set_binlabel(y_test, 0)

    for gamma in [10000, 1000, 1, 10, 100]:
        svm = sklearn.svm.SVC(C=0.1, kernel='rbf', gamma=gamma)
        svm.fit(X_train, y_train)
        print('gamma={0:<10}, Eout={1}'.format(gamma, get_error(svm, X_test, y_test)))
def hw1q18():
    print "----------------------------------------"
    print "         Homework 1 Question 18         "
    print "----------------------------------------"

    Y_train_0 = (Y_train == 0).astype(int)
    Y_test_0 = (Y_test == 0).astype(int)

    print "in the training set:"
    print "n(+) =", np.count_nonzero(Y_train_0 == 1), "n(-) =", np.count_nonzero(Y_train_0 == 0)

    print "in the test set:"
    print "n(+) =", np.count_nonzero(Y_test_0 == 1), "n(-) =", np.count_nonzero(Y_test_0 == 0)

    for C in (0.001, 0.01, 0.1, 1, 10):
        svm = sklearn.svm.SVC(C=C, kernel="rbf", gamma=100, tol=1e-7, shrinking=True, verbose=False)
        svm.fit(X_train, Y_train_0)

        print "----------------------------------------"
        print "C =", C

        support = svm.support_
        coef = svm.dual_coef_[0]
        b = svm.intercept_[0]

        print "nSV =", len(support)
        Y_predict = svm.predict(X_test)

        print "in the prediction:"
        print "n(+) =", np.count_nonzero(Y_predict == 1), "n(-) =", np.count_nonzero(Y_predict == 0)

        print "E_out =", np.count_nonzero(Y_test_0 != Y_predict)
        print

        fig = plt.figure()
        plt.suptitle("C =" + str(C))
        plt.subplot(311)
        plt.title("Training data: green +, red -")
        plot_01(X_train, Y_train_0)
        plt.tick_params(axis="x", labelbottom="off")

        plt.subplot(312)
        plt.title("Prediction on test data: green +, red -")
        plot_01(X_test, Y_predict)
        plt.tick_params(axis="x", labelbottom="off")

        plt.subplot(313)
        plt.title("Support vectors: blue")
        plt.plot(X_train[:, 0], X_train[:, 1], "r.")
        plt.plot(X_train[support, 0], X_train[support, 1], "b.")

    plt.show()
Example #17
File: 1.py Project: pjhades/coursera
def q18():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')
    X_test, y_test = load_data('/Users/pjhades/code/lab/ml/test.dat')

    y_train = set_binlabel(y_train, 0)
    y_test = set_binlabel(y_test, 0)

    for C in [0.001, 0.01, 0.1, 1, 10]:
        svm = sklearn.svm.SVC(C=C, kernel='rbf', gamma=100)
        svm.fit(X_train, y_train)

        print('C={0}'.format(C))
        print('# support vectors =', np.sum(svm.n_support_))
        print('Eout =', get_error(svm, X_test, y_test))
    def runSVM(self):
        """
        Runs the SVM on 5 different splits of cross validation data
        """
        for train, test in self.kf:
            svm = self.models["SVM"]

            train_set, train_labels = self.getCurrFoldTrainData(train)
            test_set, test_labels = self.getCurrFoldTestData(test)
            svm.fit(train_set, train_labels)

            preds = svm.predict(test_set)
            acc = self.getAccuracy(test_labels, preds)
            print "(SVM) Percent correct is", acc
def hw1q15():
    svm = sklearn.svm.SVC(C=0.01, kernel="linear", shrinking=False, verbose=True)

    X_train_0 = X_train
    Y_train_0 = (Y_train == 0).astype(int)

    svm.fit(X_train_0, Y_train_0)

    w = svm.coef_[0]
    b = svm.intercept_[0]

    print "w =", w
    print "norm(w) =", np.linalg.norm(w, ord=2)
    print "b =", b
Example #20
File: a1.py Project: daveguy/Comp599
def trainTest():

    data2010, labels2010 = read_tac('2010')
    data2011, labels2011 = read_tac("2011")

    #classifiers
    gnb = naive_bayes.GaussianNB()
    # use a name other than `svm` so the sklearn `svm` module is not shadowed
    # inside this function (otherwise svm.SVC raises UnboundLocalError)
    svc = svm.SVC(kernel="linear")
    logReg = linear_model.LogisticRegression()


    gnb.fit(data2010, labels2010)
    svc.fit(data2010, labels2010)
    logReg.fit(data2010, labels2010)

    gnbPrediction = gnb.predict(data2011)
    svmPrediction = svc.predict(data2011)
    logRegPrediction = logReg.predict(data2011)

    gnbAccuracy = accuracy(labels2011, gnbPrediction)
    svmAccuracy = accuracy(labels2011, svmPrediction)
    logRegAccuracy = accuracy(labels2011, logRegPrediction)

    confusionMatrix = metrics.confusion_matrix(labels2011, logRegPrediction)

    print "Results:"
    print "Gaussian Naive Bayes: " 
    print gnbAccuracy
    print "Support Vector Machine: " 
    print svmAccuracy
    print "Logistic Regression: " 
    print logRegAccuracy
    print confusionMatrix

    fh.write("Results:" + "\n")
    fh.write("Gaussian Naive Bayes: "  + "\n")
    fh.write(gnbAccuracy + "\n")
    fh.write("Support Vector Machine: "  + "\n")
    fh.write(svmAccuracy + "\n")
    fh.write("Logistic Regression: "  + "\n")
    fh.write(logRegAccuracy + "\n")
    for i in confusionMatrix:
        fh.write(str(i))
        fh.write("\n")
    fh.write("-------------------------------------------------\n")
    fh.write("\n\n")    
def hw1q16():
    print "----------------------------------------"
    print "         Homework 1 Question 16         "
    print "----------------------------------------"

    # polynomial kernel: (coef0 + gamma * x1.T * x2) ** degree

    for idx in (0, 2, 4, 6, 8):
        svm = sklearn.svm.SVC(
            C=0.01, kernel="poly", degree=2, gamma=1, coef0=1, tol=1e-4, shrinking=True, verbose=False
        )

        Y_train_i = (Y_train == idx).astype(int)

        svm.fit(X_train, Y_train_i)
        Y_predict_i = svm.predict(X_train)

        support = svm.support_
        coef = svm.dual_coef_[0]
        b = svm.intercept_[0]
        E_in = np.count_nonzero(Y_train_i != Y_predict_i)

        print "For class %d:" % (idx)
        print "sum(alpha) =", np.sum(np.abs(coef))
        print "b =", b
        print "E_in =", E_in

        fig = plt.figure()
        # plt.suptitle('%d vs rest' % (idx))
        plt.subplot(311)
        plt.title("Training data: green +, red -")
        plot_01(X_train, Y_train_i)
        plt.tick_params(axis="x", labelbottom="off")

        plt.subplot(312)
        plt.title("Prediction: green +, red -")
        plot_01(X_train, Y_predict_i)
        plt.tick_params(axis="x", labelbottom="off")

        plt.subplot(313)
        plt.title("Support vectors: blue")
        plt.plot(X_train[:, 0], X_train[:, 1], "r.")
        plt.plot(X_train[support, 0], X_train[support, 1], "b.")

    plt.show()
Example #22
def trainRBM_SVM(features, Cparam, nComponents):
    [X, Y] = listOfFeatures2Matrix(features)
    rbm = BernoulliRBM(n_components = nComponents, n_iter = 30, learning_rate = 0.2,  verbose = True)
    rbm.fit(X,Y)
    newX = rbm.transform(X)
#    colors = ["r","g","b"]
#    for i in range(1,Y.shape[0],5):
#        plt.plot(newX[i,:], colors[int(Y[i])])
#    plt.show()

    classifier = {}
    classifier["rbm"] = rbm    
    svm = sklearn.svm.SVC(C = Cparam, kernel = 'linear',  probability = True)        
    svm.fit(newX,Y)

    classifier["svm"] = svm

    return classifier    
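The same RBM-then-SVM chain can also be expressed as an sklearn Pipeline; a sketch only (not the original project's API, parameters copied from above):

from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
import sklearn.svm

clf = Pipeline([
    ('rbm', BernoulliRBM(n_components=10, n_iter=30, learning_rate=0.2)),
    ('svm', sklearn.svm.SVC(C=1.0, kernel='linear', probability=True)),
])
# clf.fit(X, Y) learns the RBM features and the SVM in one call;
# predictions then chain the RBM transform and the SVM automatically.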
Example #23
def svm_libsvm_solver(X, y, C, tol=1e-6, max_iter=100, verbose=False, gamma=0):
    if gamma == 0:
        svm = sklearn.svm.SVC(C=C, kernel='linear', tol=tol, verbose=verbose,
                              max_iter=max_iter)
    else:
        svm = sklearn.svm.SVC(C=C, kernel='rbf', gamma=gamma, tol=tol,
                              verbose=verbose, max_iter=max_iter)

    now = time.clock()
    svm.fit(X, y)
    res_time = time.clock() - now

    A = np.zeros(X.shape[0])
    A[svm.support_] = np.abs(svm.dual_coef_)

    return {'w0': svm.intercept_[0],
            'w': compute_w(X, y, A),
            'A': A,
            'time': res_time}
def svm_test():
    X_train = np.array([[0, 0], [1, 0], [0, 2], [-2, 0]])
    Y_train = np.array([1, 1, 0, 0])
    svm = sklearn.svm.SVC(C=100000, kernel="linear", shrinking=False, verbose=False)
    svm.fit(X_train, Y_train)
    Y_predict = svm.predict(X_train)
    print Y_predict
    b = svm.intercept_[0]
    print b

    plt.figure()
    plt.suptitle("svm test")
    plt.subplot(211)
    plot_01(X_train, Y_train)
    plt.subplot(212)
    plot_01(X_train, Y_predict)
    plt.plot(X_train[Y_predict == 0, 0], X_train[Y_predict == 0, 1], "ro")
    plt.plot(X_train[Y_predict == 1, 0], X_train[Y_predict == 1, 1], "go")
    plt.show()
def outlier_detection_with_SVM(dataframe, kernel, gamma, outlier_percentage):
	"""
	Note that the SVM parameters are highly sensitive to the dataset, so they have to be manually selected for each dataset
	"""
	assert isinstance(dataframe, DataFrame), "Expected pandas DataFrame, but got %s."%type(dataframe)
	from scipy.stats import scoreatpercentile
	from sklearn import svm
	svm = svm.OneClassSVM(kernel=kernel, gamma=gamma)
	
	points = dataframe.values
	svm.fit(points)
	assignment = svm.decision_function(points)
	score = scoreatpercentile(assignment.ravel(), 1 - outlier_percentage)
	
	inliers_idx, dummy = np.where(assignment <= score)
	outliers_idx, dummy = np.where(assignment > score)
	
	print "%s inliers and %s outliers"%(len(inliers_idx), len(outliers_idx))
	return inliers_idx, outliers_idx
Example #26
def train_svm_from_saved_dataset():
    dataset = get_thing_from_file("training_dataset.txt")
    svm = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                    alpha=1e-3, n_iter=5, random_state=42))])

    save_thing_to_file(svm, "svm.txt")
    svm = svm.fit(dataset.data, dataset.target)
    save_thing_to_file(svm, "svm_model.txt")
def hw1q19():
    print "----------------------------------------"
    print "         Homework 1 Question 19         "
    print "----------------------------------------"

    Y_train_0 = (Y_train == 0).astype(int)
    Y_test_0 = (Y_test == 0).astype(int)

    for gamma in (1, 10, 100, 1000, 10000):
        svm = sklearn.svm.SVC(C=0.1, kernel="rbf", gamma=gamma, tol=1e-7, shrinking=True, verbose=False)
        svm.fit(X_train, Y_train_0)
        print "----------------------------------------"
        print "gamma =", gamma
        Y_predict_0 = svm.predict(X_test)
        print "in the prediction:"
        print "n(+) =", np.count_nonzero(Y_predict_0 == 1), "n(-) =", np.count_nonzero(Y_predict_0 == 0)

        print "E_out =", np.count_nonzero(Y_test_0 != Y_predict_0)
        print
def trainSVM_RBF(features, Cparam):
    '''
    Train a multi-class probabilistic SVM classifier.
    Note:     This function is simply a wrapper to the sklearn functionality for SVM training
              See function trainSVM_feature() to use a wrapper on both the feature extraction and the SVM training (and parameter tuning) processes.
    ARGUMENTS:
        - features:         a list ([numOfClasses x 1]) whose elements contain numpy matrices of features
                            each matrix features[i] of class i is [numOfSamples x numOfDimensions]
        - Cparam:           SVM parameter C (cost of constraints violation)
    RETURNS:
        - svm:              the trained SVM variable

    NOTE:
        This function trains an RBF-kernel SVM for a given C value. For a different kernel, other types of parameters should be provided.
    '''

    [X, Y] = listOfFeatures2Matrix(features)
    svm = sklearn.svm.SVC(C = Cparam, kernel = 'rbf',  probability = True)        
    svm.fit(X,Y)

    return svm
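An illustrative call, assuming listOfFeatures2Matrix (from the same project) stacks the per-class matrices into a single (X, Y) pair; the sizes are made up:

import numpy as np
features = [np.random.randn(10, 4),        # class 0: 10 samples, 4 dims
            np.random.randn(10, 4) + 2.0]  # class 1, shifted
model = trainSVM_RBF(features, Cparam=1.0)
# probability=True above makes model.predict_proba(...) available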
Example #29
def train_svm_from_scratch():
    dataset = datasets.load_files(training_data, encoding='utf-8',
                                  decode_error='ignore')

    svm = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                    alpha=1e-3, n_iter=5, random_state=42))])

    save_thing_to_file(svm, "svm.txt")
    svm = svm.fit(dataset.data, dataset.target)
    save_thing_to_file(svm, "svm_model.txt")
def hw1q20():
    print "----------------------------------------"
    print "         Homework 1 Question 20         "
    print "----------------------------------------"

    Y_train_0 = (Y_train == 0).astype(int)

    C = 0.1
    m = len(Y_train_0)
    gammas = [1, 10, 100, 1000, 10000]
    counts = [0] * len(gammas)

    for nrun in range(10):
        print "run", nrun

        # generate a random order of m indices
        arr = np.arange(m)
        np.random.shuffle(arr)

        # pick 1000 for cross validation
        X_curval_0 = X_train[arr[:1000]]
        Y_curval_0 = Y_train_0[arr[:1000]]
        X_curtrain_0 = X_train[arr[1000:]]
        Y_curtrain_0 = Y_train_0[arr[1000:]]

        E_vals = [0.0] * len(gammas)
        for i in range(len(gammas)):
            gamma = gammas[i]

            svm = sklearn.svm.SVC(C=C, kernel="rbf", gamma=gamma, tol=1e-3, shrinking=True, verbose=False)
            svm.fit(X_curtrain_0, Y_curtrain_0)
            Y_curpredict_0 = svm.predict(X_curval_0)
            E_val = np.count_nonzero(Y_curval_0 != Y_curpredict_0)

            E_vals[i] = E_val

        counts[np.argmin(E_vals)] += 1

    for i in range(len(gammas)):
        print "gamma", gammas[i], "got picked", counts[i], "times"
Example #31
def main(args):
    # Get the arguments.
    filename_lars = args.f
    algoname = args.n
    alpha = args.a
    group_splits = args.g
    assert os.path.isfile(filename_lars)

    warnings.simplefilter("ignore", ConvergenceWarning)

    # Read the files.
    with h5py.File(filename_lars) as f:
        values = f["values"].value
        col_index = f["col_index"].value
        row_ptr = f["row_ptr"].value
        labels = f["labels"].value
    m = scipy.sparse.csr_matrix((values, col_index, row_ptr))

    if algoname == "forest_garrote" and len(group_splits) == 0:
        # Do the lasso.
        coefs = sklearn.linear_model.lasso_path(m,
                                                labels,
                                                positive=True,
                                                max_iter=100,
                                                alphas=[alpha])[1]
        coefs = coefs[:, -1]
    elif algoname == "l2_svm":
        # Use an l2 svm.
        svm = sklearn.svm.LinearSVC(C=1.0, penalty="l2")
        svm.fit(m, labels)
        coefs = svm.coef_[0, :]
    elif algoname == "l1_svm":
        # Use an l1 svm.
        svm = sklearn.svm.LinearSVC(C=1.0, penalty="l1", dual=False)
        svm.fit(m, labels)
        coefs = svm.coef_[0, :]
    elif algoname == "forest_garrote" and len(group_splits) > 0:
        # Make groups and run the forest garrote on each group.
        group_splits = [0] + group_splits + [m.shape[1]]
        n_groups = len(group_splits) - 1

        if args.n_threads < 1:
            args.n_threads = cpu_count()
        n_threads = min(n_groups, args.n_threads)
        if n_threads == 1:
            coef_list = []
            for i in xrange(n_groups):
                begin = group_splits[i]
                end = group_splits[i + 1]
                sub_m = m[:, begin:end]
                coefs = sklearn.linear_model.lasso_path(sub_m,
                                                        labels,
                                                        positive=True,
                                                        max_iter=100,
                                                        alphas=[alpha])[1]
                coefs = coefs[:, -1]
                coef_list.append(coefs)
            coefs = numpy.concatenate(coef_list)
            coefs = coefs / n_groups
        else:
            in_qu = Queue()
            out_qu = Queue()
            procs = [
                Process(target=lars_worker,
                        args=(in_qu, out_qu, m, labels, alpha))
                for _ in xrange(n_threads)
            ]
            for i in xrange(n_groups):
                begin = group_splits[i]
                end = group_splits[i + 1]
                in_qu.put((i, begin, end))
            for p in procs:
                in_qu.put(None)
                p.start()
            coef_list = [None] * n_groups
            for i in xrange(n_groups):
                k, coefs = out_qu.get()
                coef_list[k] = coefs
            for p in procs:
                p.join()
            coefs = numpy.concatenate(coef_list)
            coefs = coefs / n_groups

        # Use an additional l2 svm to refine the weights.
        nnz = coefs.nonzero()[0]
        m_sub = m[:, nnz]
        svm = sklearn.svm.LinearSVC(C=1.0, penalty="l2")
        svm.fit(m_sub, labels)
        new_coefs = svm.coef_[0, :]
        coefs = numpy.zeros(m.shape[1])
        coefs[nnz] = new_coefs
    else:
        raise Exception("Unknown algorithm: " + algoname)

    # Save the results.
    nnz = coefs.nonzero()[0]
    nnz_coefs = coefs[nnz]
    with h5py.File(filename_lars) as f:
        if "result_nnz" in f:
            del f["result_nnz"]
        if "result_nnz_coefs" in f:
            del f["result_nnz_coefs"]
        f.create_dataset("result_nnz",
                         data=nnz,
                         compression="gzip",
                         compression_opts=5)
        f.create_dataset("result_nnz_coefs",
                         data=nnz_coefs,
                         compression="gzip",
                         compression_opts=5)
Example #32
    newRows = len(X_train)
    newCols = len(X_train[0])
    newRowst = len(X_test)
    newColst = len(X_test[0])

    newRowsL = len(y_train)
    Features = chiSqr(X_train, y_train, feat)

    allFeatures.append(Features)
    argument = copy.deepcopy(Features)

    data_fea = CreateDataSet(argument, X_train)
    # print("New Data Made, rows= ",len(data_fea)," cols= ",len(data_fea[0]))

    svm.fit(data_fea, y_train)
    logReg.fit(data_fea, y_train)
    NaiveB.fit(data_fea, y_train)
    NearestCen.fit(data_fea, y_train)

    TestFeatures = chiSqr(X_test, y_test, feat)
    test_fea = CreateDataSet(TestFeatures, X_test)

    len_test_fea = len(test_fea)
    count_svm = 0
    count_log = 0
    count_nb = 0
    count_nc = 0
    count = 0
    for j in range(0, len_test_fea, 1):
        predLab_svm = int(svm.predict([test_fea[j]]))
# Find the accuracy on Testing Dataset
predicted_label = blrPredict(W, test_data)
print('\n Testing set Accuracy:' + str(100 * np.mean((predicted_label == test_label).astype(float))) + '%')
print(confusion_matrix(test_label, predicted_label, labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
stop_time_LR = time.time() - start_time_LR
print("Time taken for Logistic Regression {}.seconds\n".format(str(stop_time_LR)))

# Code for SVM
print("Learning SVM Using Linear Kernel")

svm = SVC(kernel = 'linear')
#train_label = train_label.flatten()
indexes = np.random.randint(50000, size = 10000)
sample_data = train_data[indexes, :]
sample_label = train_label[indexes, :]
svm.fit(sample_data, sample_label.flatten())

training_accuracy = svm.score(train_data, train_label)
training_accuracy = str(100*training_accuracy)
print("Training data Accuracy for Linear Kernel: {}%\n".format(training_accuracy))
validation_accuracy = svm.score(validation_data, validation_label)
validation_accuracy = str(100*validation_accuracy)
print("Validation data Accuracy for Linear Kernel: {}%\n".format(validation_accuracy))
test_accuracy = svm.score(test_data, test_label)
test_accuracy = str(100*test_accuracy)
print("Test data Accuracy for Linear Kernel: {}%\n".format(test_accuracy))
time_linear_kernel = time.time() - start_time_linear_kernel

print("Time taken for SVM using Linear Kernel {}.seconds\n\n\n".format(str(time_linear_kernel)))

Example #34
print('Accuracy of GNB classifier on training set: {:.2f}'.format(
    gnb.score(X_train, Y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'.format(
    gnb.score(X_test, Y_test)))


def svc_param_selection(X, y, nfolds):
    from sklearn import svm
    import numpy as np

    GridSearchCV = sklearn.model_selection.GridSearchCV
    Cs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    # gammas = [0.001, 0.01, 0.1, 1]
    kernels = ['linear', 'rbf']
    param_grid = {'C': Cs, 'kernel': kernels}
    grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_


# Learning
params = svc_param_selection(X_train, Y_train, 3)
print(params)
svm = SVC(**params)
# svm = SVC()
svm.fit(X_train, Y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'.format(
    svm.score(X_train, Y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(
    svm.score(X_test, Y_test)))
def run(subj_id, acq_date, subj_data=None):

    conf = ul_sens_fmri.config.get_conf()
    conf.ana = ul_sens_analysis.config.get_conf()

    cm = np.zeros((
        len(conf.ana.roi_names),
        2,  # pres loc (upper, lower),
        conf.exp.n_img,
        conf.exp.n_src_locs,  # (above, below)
        2  # (above, below) predicted
    ))

    for (i_vf, vf) in enumerate(("upper", "lower")):

        # get the data for this VF
        subj_data = ul_sens_analysis.mvpa.data.get_mvpa_data(
            subj_id, acq_date, vf)

        for (i_roi, roi_name) in enumerate(conf.ana.roi_names):

            beta_data = subj_data[0][roi_name]
            loc_data = subj_data[1][roi_name]

            # beta_data needs to be z-scored
            # combine the images and source locations together so we can get a
            # mean and std for each run
            temp_beta = np.concatenate((beta_data[:, 0, ...], beta_data[:, 1,
                                                                        ...]))
            run_mean = np.mean(temp_beta, axis=0)
            run_std = np.std(temp_beta, axis=0)

            # do the z-scoring
            beta_data = ((beta_data - run_mean[np.newaxis, np.newaxis, ...]) /
                         run_std[np.newaxis, np.newaxis, ...])

            node_k = len(loc_data)

            for i_test_run in xrange(conf.exp.n_runs):

                # exclude the current 'test' run
                i_train_runs = np.setdiff1d(range(conf.exp.n_runs),
                                            [i_test_run])

                train_data = np.empty(
                    (len(i_train_runs) * conf.exp.n_src_locs * conf.exp.n_img,
                     node_k))
                train_data.fill(np.NAN)

                train_labels = np.empty(train_data.shape[0])
                train_labels.fill(np.NAN)

                i_flat = 0
                for i_train_run in i_train_runs:

                    for i_img in xrange(conf.exp.n_img):

                        for (i_sl, sl_label) in enumerate([-1, 1]):

                            train_data[i_flat, :] = beta_data[i_img, i_sl,
                                                              i_train_run, :]

                            train_labels[i_flat] = sl_label

                            i_flat += 1

                svm = sklearn.svm.SVC(kernel="linear")

                svm.fit(train_data, train_labels)

                # testing
                for i_img in xrange(conf.exp.n_img):

                    curr_pred = svm.predict(beta_data[i_img, :, i_test_run, :])

                    for (true_val, pred_val) in zip([-1, 1], curr_pred):

                        if true_val == -1:
                            i_true = 0
                        else:
                            i_true = 1

                        if pred_val == -1:
                            i_pred = 0
                        else:
                            i_pred = 1

                        cm[i_roi, i_vf, i_img, i_true, i_pred] += 1

    return cm
Example #36
#from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors = 100)
knn.fit(x_train,y_train)
prediction = knn.predict(x_test)
print("knn score:", knn.score(x_test,y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, prediction))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, prediction))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, prediction)))

#%% svm

from sklearn import svm
svm = svm.SVC(random_state=1)
svm.fit(x_train,y_train)
prediction_svm = svm.predict(x_test)
print("svm accuary: ",svm.score(x_test,y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, prediction_svm))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, prediction_svm))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, prediction_svm)))
#%% rf classification

from sklearn import ensemble
rf= ensemble.RandomForestClassifier(n_estimators=10,random_state=1)
rf.fit(x_train,y_train)
prediction_rf = rf.predict(x_test)
print("rf accuracy: ",rf.score(x_test,y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, prediction_rf))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, prediction_rf))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, prediction_rf)))
Example #37
        temp[0][0] = 1
    else:
        temp[0][1] = 1
    x_test.append(temp[0])
X_test = [(x, np.empty((0, 2), dtype=np.int)) for x in x_test]
print len(x_test)
for i in range(len(test_labels)):
    test_labels = test_labels.astype(int)
"""
print len(test_labels)
pbl = GraphCRF(inference_method='ad3')
svm = NSlackSSVM(pbl, C=1,n_jobs = 1,verbose = 1)
start = time()
print len(X_valid)
print len(valid_Y)
svm.fit(X_valid, valid_Y)
print "fit finished"
time_svm = time() - start
print X_test[i][0].shape
print svm.score(X_valid,valid_Y)
print svm.score(X_test,test_Y)
y_pred = np.vstack(svm.predict(np.array(X_valid)))
print("Score with pystruct crf svm: %f (took %f seconds)"
      % (np.mean(y_pred == valid_Y), time_svm))
y_predt = np.vstack(svm.predict(np.array(X_test)))
print("Score with pystruct crf svm: %f (took %f seconds)"
      % (np.mean(y_predt == test_Y), time_svm))


#we throw away void superpixels and flatten everything
#y_pred, y_true = np.hstack(y_pred), np.hstack(valid_Y)
Example #38
accuracy_score(test_target, knn_pred)

##################
# Random Forest  #
##################

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500)
rf.fit(train, train_target)
rf_pred = rf.predict(test)

#Confusion Matrix and Accuracy Score
print(confusion_matrix(test_target, rf_pred))
accuracy_score(test_target, rf_pred)

###################
##      SVM      ##
###################

from sklearn import svm

svm = svm.SVC()

svm.fit(train, train_target)
svm_pred = svm.predict(test)

#Confusion Matrix and Accuracy Score
print(confusion_matrix(test_target, svm_pred))
accuracy_score(test_target, svm_pred)
Example #39
def main():
    mnist = fetch_openml(name='mnist_784')
    echantillon = np.random.randint(70000, size=5000)
    data = mnist.data[echantillon]
    target = mnist.target[echantillon]

    xtrain, xtest, ytrain, ytest = train_test_split(data,
                                                    target,
                                                    train_size=0.7)

    classifier = svm.SVC(kernel='linear')
    classifier.fit(xtrain, ytrain)
    error = 1 - classifier.score(xtest, ytest)
    print(f"Score SVM linéaire : {error}")

    kernels = []
    print("Modification du kernel : ")
    for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
        classifier = svm.SVC(kernel=kernel)

        start_training = time.time()
        classifier.fit(xtrain, ytrain)
        final_training = time.time() - start_training

        start_prediction = time.time()
        ypred = classifier.predict(xtest)
        final_prediction = time.time() - start_prediction

        error = metrics.zero_one_loss(ytest, ypred)
        kernels.append((kernel, final_training, final_prediction, error))
        print(f"\t {kernels[-1]}")

    kernels_liste = list(zip(*kernels))

    plot_fig(kernels_liste)

    tol = []
    print("Varying the tolerance: ")
    for tolerance in np.linspace(0.1, 1.0, num=5):
        # assigning to the name `svm` would shadow the sklearn `svm` module
        # inside main() (UnboundLocalError), so use a separate estimator name
        clf = svm.SVC(C=tolerance)

        start_training = time.time()
        clf.fit(xtrain, ytrain)
        final_training = time.time() - start_training

        start_prediction = time.time()
        ypred = clf.predict(xtest)
        final_prediction = time.time() - start_prediction

        error = metrics.zero_one_loss(ytest, ypred)
        error_training = clf.score(xtrain, ytrain)
        tol.append((tolerance, final_training, final_prediction, error,
                    error_training))
        print(f"\t {tol[-1]}")

    tol_list = list(zip(*tol))

    plot_fig(tol_list)

    plt.figure(figsize=(19, 9))
    plt.plot(tol_list[0], tol_list[3], 'x-', color='blue')  # test error
    plt.plot(tol_list[0], tol_list[-1], 'x-',
             color='orange')  # training error
    plt.grid(True)
    plt.show()

    best_kernel = 'rbf'
    best_tolerance = 1.0
    best_svm = svm.SVC(kernel=best_kernel, C=best_tolerance)

    start_training = time.time()
    best_svm.fit(xtrain, ytrain)
    best_final_entrainement = time.time() - start_training

    start_prediction = time.time()
    ypred = best_svm.predict(xtest)
    best_final_prediction = time.time() - start_prediction

    cross_val = model_selection.cross_val_score(best_svm, data, target, cv=10)
    meilleure_erreur = 1 - np.mean(cross_val)

    print(f"Durée de l'entraînement : {best_final_entrainement}")
    print(f"Durée de la prédiction : {best_final_prediction}")
    print(f"Erreur : {meilleure_erreur}")

    cm = confusion_matrix(ytest, ypred)
    df_cm = pd.DataFrame(cm, columns=np.unique(ytest), index=np.unique(ytest))
    df_cm.index.name = 'True value'
    df_cm.columns.name = 'Predicted value'
    plt.figure(figsize=(16, 9))
    sn.heatmap(df_cm, cmap="Blues", annot=True)
    plt.show()
Example #40
def generate_roc_curves(tr_data, tr_labels, split_number, te_data, te_labels):

    # zero-one losses for naive bayes (nb) and support vector machine (svm)
    params_to_keep = set()
    for j in range(num_features):
        best_feature = 0
        best_loss = 1.0
        for k in range(tr_data.shape[1]):
            if k in params_to_keep: continue
            svm = LinearSVC(penalty='l2', C=0.5, dual=False)
            params_to_keep.add(k)

            lparams = list(params_to_keep)
            svm.fit(tr_data[:, lparams], tr_labels)
            preds = svm.predict(tr_data[:, lparams])
            loss = zero_one_loss(preds, tr_labels)
            params_to_keep.discard(k)
            if (loss <= best_loss):
                best_feature = k
                best_loss = loss
        params_to_keep.add(best_feature)

    # We now have the best features
    lparams = list(params_to_keep)
    nb1 = ClassifierResult('Naive Bayes (L1 features)', [], [])
    svm1 = SVMClassifierResult('svm:_c_=_1.0', [], [], [])
    svm2 = SVMClassifierResult('svm:_c_=_0.75', [], [], [])
    svm3 = SVMClassifierResult('svm:_c_=_0.50', [], [], [])
    svm4 = SVMClassifierResult('svm:_c_=_0.25', [], [], [])
    naive = ClassifierResult('Naive Classifier', [], [])

    ## SVM DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION
    classifiers = {
        'svm:_c_=_1': svm1,
        'svm:_c_=_.75': svm2,
        'svm:_c_=_.50': svm3,
        'svm:_c_=_.25': svm4
    }
    random_state = np.random.RandomState(0)

    for model_type in classifiers:
        train_data = tr_data
        test_data = te_data
        if model_type == "svm:_c_=_1":
            model = LinearSVC(penalty='l1', C=1, dual=False)
        elif model_type == "svm:_c_=_.75":
            model = LinearSVC(penalty='l1', C=0.75, dual=False)
        elif model_type == "svm:_c_=_.50":
            model = LinearSVC(penalty='l1', C=0.50, dual=False)
        elif model_type == "svm:_c_=_.25":
            model = LinearSVC(penalty='l1', C=0.25, dual=False)
        # elif model_type == "naive bayes":
        #     model = MultinomialNB()
        model.fit(train_data, tr_labels)
        y_score = model.decision_function(test_data)
        print(y_score)
        print("-=-=-")
        print(te_labels)
        fpr, tpr, _ = roc_curve(te_labels - 1, y_score)
        print(fpr, tpr)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        lw = 2
        plt.plot(fpr,
                 tpr,
                 color="darkorange",
                 lw=lw,
                 label="ROC Curve Area = {:.4f}".format(roc_auc))
        plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        plt.legend(loc="lower right")
        fn = "figures/ROC_" + model_type + ".png"
        plt.savefig(fn, bbox_inches='tight')
        plt.clf()
Example #41
def greedy_subset_svm(tr_data, tr_labels, num_features, split_number, te_data,
                      te_labels):
    nb1 = ClassifierResult('GS Naive Bayes', [], [])
    svm1 = GSSVMClassifierResult('GS svm: c = 0.5', [], [], [])
    naive = ClassifierResult('Naive Classifier', [], [])
    greed = GreedyResult('Greedy SVM', num_features)

    for i in range(split_number):
        print('Fold: ' + str(i))
        cv_tr_data,cv_tr_labels,cv_te_data,cv_te_labels\
            = split_data(tr_data,tr_labels,split_number,i)

        # We want to use some of the training data as 'validation' data for picking
        # the best subset.  We will use 90% of the data for training, 10% for validation.
        cv_training_data = np.array(cv_tr_data[:int(len(cv_tr_data) * .9)])
        cv_training_labels = cv_tr_labels[:int(len(cv_tr_labels) * .9)]
        cv_validation_data = np.array(cv_tr_data[int(len(cv_tr_data) * .9):])
        cv_validation_labels = cv_tr_labels[int(len(cv_tr_labels) * .9):]

        params_to_keep = set()
        for j in range(num_features):
            # print('Feature: ' + str(j))

            best_feature = 0
            best_loss = 1.0
            for k in range(cv_training_data.shape[1]):
                if k in params_to_keep: continue
                svm = LinearSVC(penalty='l2', C=0.5, dual=False)
                params_to_keep.add(k)

                lparams = list(params_to_keep)
                svm.fit(cv_training_data[:, lparams], cv_training_labels)
                preds = svm.predict(cv_validation_data[:, lparams])
                loss = zero_one_loss(preds, cv_validation_labels)
                params_to_keep.discard(k)
                if (loss <= best_loss):
                    best_feature = k
                    best_loss = loss
            params_to_keep.add(best_feature)
            greed.losses[j] = greed.losses[j] + [best_loss]

        # We now have the best features
        lparams = list(params_to_keep)
        svm = LinearSVC(penalty='l2', C=0.5, dual=False)
        svm.fit(cv_training_data[:, lparams], cv_training_labels)
        # Use the real cross validation testing data now to get an accurate loss
        preds = svm.predict(cv_te_data[:, lparams])
        loss = zero_one_loss(preds, cv_te_labels)
        svm1.zero_one_loss += [loss]
        params = [0 for x in range(cv_training_data.shape[1])]
        coefs = svm.coef_.ravel()
        for i in range(0, len(lparams)):
            params[lparams[i]] = coefs[i]
        svm1.params += [params]
        svm1.columns += [lparams]

        preds = svm.predict(te_data[:, lparams])
        loss = zero_one_loss(preds, te_labels)
        svm1.test_loss += [loss]

        nb = MultinomialNB()
        nb.fit(cv_training_data[:, lparams], cv_training_labels)
        preds = nb.predict(cv_te_data[:, lparams])
        loss = zero_one_loss(preds, cv_te_labels)
        nb1.zero_one_loss += [loss]

        preds = nb.predict(te_data[:, lparams])
        loss = zero_one_loss(preds, te_labels)
        nb1.test_loss += [loss]

        # Naive
        preds = [2 for x in range(len(cv_te_labels))]
        loss = zero_one_loss(preds, cv_te_labels)
        naive.zero_one_loss += [loss]

        preds = [2 for x in range(len(te_labels))]
        loss = zero_one_loss(preds, te_labels)
        naive.test_loss += [loss]

    return nb1, svm1, naive, greed
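For reference, newer scikit-learn releases package this same greedy forward search as SequentialFeatureSelector; a hedged sketch, not part of the original project:

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import LinearSVC

sfs = SequentialFeatureSelector(
    LinearSVC(penalty='l2', C=0.5, dual=False),
    n_features_to_select=10,   # analogous to num_features above
    direction='forward')
# sfs.fit(tr_data, tr_labels); tr_data[:, sfs.get_support()] keeps the chosen columns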
Example #42
def k_fold_cross_validation(tr_data, tr_labels, split_number, te_data,
                            te_labels):

    # zero-one losses for naive bayes (nb) and support vector machine (svm)

    nb1 = ClassifierResult('Naive Bayes (L1 features)', [], [])
    svm1 = SVMClassifierResult('svm: c = 1.0', [], [], [])
    svm2 = SVMClassifierResult('svm: c = 0.75', [], [], [])
    svm3 = SVMClassifierResult('svm: c = 0.50', [], [], [])
    svm4 = SVMClassifierResult('svm: c = 0.25', [], [], [])
    naive = ClassifierResult('Naive Classifier', [], [])

    for i in range(split_number):
        cv_tr_data,cv_tr_labels,cv_te_data,cv_te_labels\
            = split_data(tr_data,tr_labels,split_number,i)

        ## SVM DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION
        svm = LinearSVC(penalty='l1', C=1.0, dual=False)
        svm.fit(cv_tr_data, cv_tr_labels)
        preds = svm.predict(cv_te_data)
        loss = zero_one_loss(preds, cv_te_labels)
        svm1.zero_one_loss += [loss]
        svm1.params += [svm.coef_.ravel()]

        preds = svm.predict(te_data)
        loss = zero_one_loss(preds, te_labels)
        svm1.test_loss += [loss]

        ## SVM_C1 DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION
        svm_c1 = LinearSVC(penalty='l1', C=0.75, dual=False)
        svm_c1.fit(cv_tr_data, cv_tr_labels)
        preds = svm_c1.predict(cv_te_data)
        loss = zero_one_loss(preds, cv_te_labels)
        svm2.zero_one_loss += [loss]
        svm2.params += [svm_c1.coef_.ravel()]

        preds = svm_c1.predict(te_data)
        loss = zero_one_loss(preds, te_labels)
        svm2.test_loss += [loss]

        ## SVM_C2 DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION
        svm_c2 = LinearSVC(penalty='l1', C=.50, dual=False)
        svm_c2.fit(cv_tr_data, cv_tr_labels)
        preds = svm_c2.predict(cv_te_data)
        loss = zero_one_loss(preds, cv_te_labels)
        svm3.zero_one_loss += [loss]
        svm3.params += [svm_c2.coef_.ravel()]

        preds = svm_c2.predict(te_data)
        loss = zero_one_loss(preds, te_labels)
        svm3.test_loss += [loss]

        ## SVM_C3 DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION
        svm_c3 = LinearSVC(penalty='l1', C=0.25, dual=False)
        svm_c3.fit(cv_tr_data, cv_tr_labels)
        preds = svm_c3.predict(cv_te_data)
        loss = zero_one_loss(preds, cv_te_labels)
        svm4.zero_one_loss += [loss]
        svm4.params += [svm_c3.coef_.ravel()]

        preds = svm_c3.predict(te_data)
        loss = zero_one_loss(preds, te_labels)
        svm4.test_loss += [loss]

        nb = MultinomialNB()
        params_to_use = [
            i for i, x in enumerate(svm_c2.coef_.ravel()) if x != 0
        ]
        nb.fit(cv_tr_data[:, list(params_to_use)], cv_tr_labels)
        preds = nb.predict(cv_te_data[:, list(params_to_use)])
        loss = zero_one_loss(preds, cv_te_labels)
        nb1.zero_one_loss += [loss]

        preds = nb.predict(te_data[:, list(params_to_use)])
        loss = zero_one_loss(preds, te_labels)
        nb1.test_loss += [loss]

        # Naive
        preds = [2 for x in range(len(cv_te_labels))]
        loss = zero_one_loss(preds, cv_te_labels)
        naive.zero_one_loss += [loss]

        preds = [2 for x in range(len(te_labels))]
        loss = zero_one_loss(preds, te_labels)
        naive.test_loss += [loss]

    return nb1, svm1, svm2, svm3, svm4, naive
Example #43
        # decision boundry vector hyperplane

        db1 = hyperplane(hyp_x_min, self.w, self.b,0)
        db2 = hyperplane(hyp_x_max, self.w, self.b,0)
        self.ax.plot([hyp_x_min, hyp_x_max], [db1, db2],'y--')        
    
        plt.show()


data_dict = { -1:np.array([[1,7],[2,8],[3,8]]),
              1:np.array([[5,1],[6,-1],[7,3]])}




svm = Support_Vector_Machine()
svm.fit(data=data_dict)
predict_us = [[0,10],
              [1,3],
              [3,4],
              [3,5],
              [5,5],
              [5,6],
              [6,-5],
              [5,8]]

for p in predict_us:
    print(svm.predict(p))
              
svm.visualize()
Example #44
File: svm.py Project: laurence-lin/SVM
def PCA(x):
    
    cov_x = np.cov(x.T)
    u, s, v = np.linalg.svd(cov_x)
    k = 2
    proj = u[:, 0:k]
    pca_x = np.matmul(x, proj)
    
    return pca_x

x_train_std = PCA(x_train_std)
x_test_std = PCA(x_test_std)

svm = svm.SVC(kernel = 'linear', probability = True)
svm.fit(x_train_std, y_train)

predict_y = svm.predict(x_test_std)
label_y = y_test
print(predict_y)
print(label_y)

correct = (label_y == predict_y).astype(int)
correct_rate = np.mean(correct)

print('Correct test rate', correct_rate)


def plot_decision_boundary(X, y, clf, test_ind = None, resolution = 0.02):
    '''
    x: 2D array, size [batch, features] , features = 2
Example #45
        y = self.inputs["in1"]
        self.results["out0"] = (x == y)


# Load iris dataset
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Setup data feeder component
feeder = brica1.ConstantComponent()
feeder.make_out_port("out0", 2)

# Setup  components
svm = SVMComponent(2)
svm.fit(X, y)

RFC = RandomForestClassifierComponent(2)
RFC.fit(X, y)

SR = SVMvsRFC_Component(1)

# Connect the components
brica1.connect((feeder, "out0"), (svm, "in0"))
brica1.connect((feeder, "out0"), (RFC, "in0"))
brica1.connect((svm, "out0"), (SR, "in0"))
brica1.connect((RFC, "out0"), (SR, "in1"))

# Add components to module
mod = brica1.Module()
mod.add_component("feeder", feeder)
Example #46
test_tfidf = tfidf_transformer.transform(test_data)

#2 train Naive Bayes
multi_naive_bayes = MultinomialNB()
multi_naive_bayes.fit(train_tfidf, train_labels)

#3 test Naive Bayes
test_predicted_nb = multi_naive_bayes.predict(test_tfidf)

#4 Display per-category results (false/true positives, false/true negatives)
#confusion matrix
print_results("Naive Bayes", test_predicted_nb, test_labels)

#2 train SVM
svm = svm.SVC(C=1000)
svm.fit(train_tfidf, train_labels)

#3 test SVM
test_predicted_svm = svm.predict(test_tfidf)

#4 Display per-category results (false/true positives, false/true negatives)
print_results("SVM", test_predicted_svm, test_labels)

#2 train Passive Aggressive
pa = PassiveAggressiveClassifier(C=0.5, random_state=5)
pa.fit(train_tfidf, train_labels)

#3 test Passive Aggressive
test_predicted_pa = pa.predict(test_tfidf)

#4 Display per-category results (false/true positives, false/true negatives)
Example #47
iris = datasets.load_iris()
X = iris.data
y = iris.target  # reduce to a binary classification problem
X, y = X[y != 2], y[y != 2]
#print(X)
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,random_state=0)
# Learn to predict each class against the other
svm = svm.SVC(kernel='linear', probability=True,random_state=random_state)
# y_score computed via decision_function() is what roc_curve() consumes
y_score = svm.fit(X_train, y_train).decision_function(X_test)
# Compute ROC curve and ROC area for each class
fpr,tpr,threshold = roc_curve(y_test, y_score)  # compute true and false positive rates
roc_auc = auc(fpr,tpr)  # compute the AUC value
plt.figure()
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# false positive rate on the x-axis, true positive rate on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
Example #48
    print '- loading the vocabulary'
    voc = joblib.load(CONST_DICTIONARY_PATH)

print '- setting vocabulary to extractor'
extract_bow.setVocabulary(voc)

if CONST_RETRAIN_MODEL == 1:
    print '- creating feature_extraction data'
    traindata, trainlabels = [], []
    for i in range(213): # 20
        traindata.extend(bow_features(path(pos, i))); trainlabels.append(1)
        traindata.extend(bow_features(path(neg, i))); trainlabels.append(-1)

    print '- feature_extraction the model'
    svm = svm.NuSVC(nu=0.5, kernel='rbf', gamma=0.1, probability=True)
    svm.fit(np.array(traindata), np.array(trainlabels))
    joblib.dump(svm, CONST_SVM_MODEL_PATH, compress=3)
else:
    print '- loading the model'
    svm = joblib.load(CONST_SVM_MODEL_PATH)

predictions = []
tot_geral_pos, tot_geral_neg = 0, 0
tot_pos_pos, tot_pos_neg = 0, 0
tot_neg_pos, tot_neg_neg = 0, 0
#pos=0
print '- testing the model (pos) -> ' + CONST_TEST_DATA_POS_PATH
for file in os.listdir(CONST_TEST_DATA_POS_PATH):
    if file != '.DS_Store':
        tot_geral_pos += 1
        obj_predict = predict(CONST_TEST_DATA_POS_PATH + file)
示例#49
0
X, y = X[y != 2], y[y != 2]

# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=.3, random_state=0)

# Learn to predict each class against the other
svm = svm.SVC(kernel='linear', probability=True, random_state=random_state)

# decision_function() returns w*x + b; the resulting y_score feeds roc_curve()
svm_model = svm.fit(X_train, y_train)
y_score = svm_model.decision_function(X_test)

# Compute ROC curve and ROC area for each class
# FP rate: fraction of actual negatives predicted as positive (lower is better; 0 is ideal)
# TP rate: fraction of actual positives predicted as positive (higher is better; 1 is ideal)
# scores are thresholded into classes 0/1, e.g. 0.36 < 0.5 => class 0
fpr, tpr, threshold = roc_curve(y_test, y_score)  # compute TPR and FPR
roc_auc = auc(fpr, tpr)  # compute the AUC
print(roc_auc)

lw = 2
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc)
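# With fpr/tpr/threshold in hand, an operating point is often chosen by
# maximizing Youden's J = TPR - FPR (a sketch, not in the source):
best_idx = np.argmax(tpr - fpr)
print('best threshold: %.3f' % threshold[best_idx])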
示例#50
0
    def _evaluate(self, Gs, Gs_kwargs, num_gpus):
        minibatch_size = num_gpus * self.minibatch_per_gpu

        # Construct TensorFlow graph for each GPU.
        result_expr = []
        for gpu_idx in range(num_gpus):
            with tf.device('/gpu:%d' % gpu_idx):
                Gs_clone = Gs.clone()

                # Generate images.
                latents = tf.random_normal([self.minibatch_per_gpu] +
                                           Gs_clone.input_shape[1:])
                labels = self._get_random_labels_tf(self.minibatch_per_gpu)
                dlatents = Gs_clone.components.mapping.get_output_for(
                    latents, labels, **Gs_kwargs)
                images = Gs_clone.get_output_for(latents, None, **Gs_kwargs)

                # Downsample to 256x256. The attribute classifiers were built for 256x256.
                if images.shape[2] > 256:
                    factor = images.shape[2] // 256
                    images = tf.reshape(images, [
                        -1, images.shape[1], images.shape[2] // factor, factor,
                        images.shape[3] // factor, factor
                    ])
                    images = tf.reduce_mean(images, axis=[3, 5])

                # Run classifier for each attribute.
                result_dict = dict(latents=latents, dlatents=dlatents[:, -1])
                for attrib_idx in self.attrib_indices:
                    classifier = misc.load_pkl(classifier_urls[attrib_idx])
                    logits = classifier.get_output_for(images, None)
                    predictions = tf.nn.softmax(
                        tf.concat([logits, -logits], axis=1))
                    result_dict[attrib_idx] = predictions
                result_expr.append(result_dict)

        # Sampling loop.
        results = []
        for begin in range(0, self.num_samples, minibatch_size):
            self._report_progress(begin, self.num_samples)
            results += tflib.run(result_expr)
        results = {
            key: np.concatenate([value[key] for value in results], axis=0)
            for key in results[0].keys()
        }

        # Calculate conditional entropy for each attribute.
        conditional_entropies = defaultdict(list)
        for attrib_idx in self.attrib_indices:
            # Prune the least confident samples.
            pruned_indices = list(range(self.num_samples))
            pruned_indices = sorted(
                pruned_indices, key=lambda i: -np.max(results[attrib_idx][i]))
            pruned_indices = pruned_indices[:self.num_keep]

            # Fit SVM to the remaining samples.
            svm_targets = np.argmax(results[attrib_idx][pruned_indices],
                                    axis=1)
            for space in ['latents', 'dlatents']:
                svm_inputs = results[space][pruned_indices]
                try:
                    svm = sklearn.svm.LinearSVC()
                    svm.fit(svm_inputs, svm_targets)
                    svm.score(svm_inputs, svm_targets)
                    svm_outputs = svm.predict(svm_inputs)
                except:
                    svm_outputs = svm_targets  # assume perfect prediction

                # Calculate conditional entropy.
                p = [[
                    np.mean([
                        case == (row, col)
                        for case in zip(svm_outputs, svm_targets)
                    ]) for col in (0, 1)
                ] for row in (0, 1)]
                conditional_entropies[space].append(conditional_entropy(p))
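# conditional_entropy() is defined elsewhere in the metric module; a minimal
# sketch consistent with the 2x2 joint table p built above, where
# p[row][col] = P(svm_output=row, svm_target=col) (an assumption):
def conditional_entropy(p):
    p = np.asarray(p, dtype=np.float64)
    h = 0.0
    for row in range(p.shape[0]):
        p_row = p[row].sum()  # marginal P(output=row)
        for col in range(p.shape[1]):
            if p[row, col] > 0:
                h += p[row, col] * (np.log2(p_row) - np.log2(p[row, col]))
    return h  # H(target | output) in bits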
示例#51
0
'Delhi Capitals',
'Chennai Super Kings',
'Gujarat Lions',
'Rising Pune Supergiant',
'Pune Warriors India',
'Kochi Tuskers Kerala',
'Deccan Chargers']

teams.sort()
teams_le = le.fit_transform(teams)

#Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.30, random_state=1008)

#--------------------------------------SVM
'''
svm = svm.SVC(gamma = 'scale')
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

print(f1_score(y_test,y_pred, average='micro'))

#--------------------------------------Random Forest
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)
y_pred1 = classifier.predict(x_test)

print(f1_score(y_test,y_pred1,average='micro'))
'''
#--------------------------------------XGBoost
xg = XGBClassifier()
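#A plausible continuation, mirroring the commented-out classifiers above
#(the listing is truncated here, so this is an assumption):
xg.fit(x_train, y_train)
y_pred2 = xg.predict(x_test)
print(f1_score(y_test, y_pred2, average='micro'))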
示例#52
0
pred = model.predict(x_cols)
nb_max = pd.crosstab(pred, y_cols)
print('NB classification accuracy')
print(nb_max)

acc = (nb_max.iloc[0, 0] + nb_max.iloc[1, 1]) / len(test_set)
print("acc = ", acc)


# SVM model
x_cols = train_set[cols[1:]] # terms
y_cols = train_set[cols[0]] # sms_type

svm = svm.SVC(kernel = 'linear')
model = svm.fit(x_cols, y_cols)

x_cols = test_set[cols[1:]] # terms
y_cols = test_set[cols[0]] # sms_type

pred = model.predict(x_cols)
svm_max = pd.crosstab(pred, y_cols)
print("SVM 분류정확도")
print(svm_max)

acc=(svm_max.ix[0,0]+svm_max.ix[1,1])/len(test_set)
print("acc = ",acc)

'''
[1 rows x 6823 columns]
NB classification accuracy
示例#53
0
print ''
print 'Report Logistic Regression:'
print(classification_report(y_val, y_pred))

# 2.- Support Vector Machine: (https://es.wikipedia.org/wiki/M%C3%A1quinas_de_vectores_de_soporte)
#Import the model:
from sklearn import svm
#If you want to change any of these hyperparameters, see:
#http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
svm = svm.SVC()
print 'Hyperparameters defined in SVM:'
print ''
print svm
print ''
#Train the model:
svm.fit(X_train, y_train)

#Make predictions in validation set:
y_pred = svm.predict(X_val)

#Create the confusion matrix:
#[[True Negatives, False Negatives]]
#[[False Positives, True Positives]]
print 'Confusion Matrix SVM:'
print ''
print confusion_matrix(y_pred, y_val)
print ''
print 'Report SVM:'
print ''
print(classification_report(y_val, y_pred))
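#A quick way to explore those hyperparameters is a grid search (a sketch; on
#older scikit-learn versions the import path is sklearn.grid_search rather
#than sklearn.model_selection):
from sklearn.grid_search import GridSearchCV
grid = GridSearchCV(svm, {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}, cv=5)
grid.fit(X_train, y_train)
print 'Best hyperparameters:', grid.best_params_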
示例#54
0
def train_svm_regression(features, labels, c_param, kernel='linear'):
    svm = sklearn.svm.SVR(C=c_param, kernel=kernel)
    svm.fit(features, labels)
    train_err = np.mean(np.abs(svm.predict(features) - labels))
    return svm, train_err
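# Hypothetical usage on random data (the data and names are assumptions):
X_demo = np.random.rand(50, 4)
y_demo = np.random.rand(50)
model, err = train_svm_regression(X_demo, y_demo, c_param=1.0)
print('training MAE: %.3f' % err)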
示例#55
0
    vectorizer = CountVectorizer(ngram_range=args.ngrange,
                                 stop_words=stop_words,
                                 vocabulary=vectorizer.vocabulary_,
                                 binary=args.onehot,
                                 analyzer='word',
                                 token_pattern=r'\b[^\W\d]+\b')
    Trained_Vectors = vectorizer.fit_transform(training_corpus).toarray()

    print "Features Used in this iteration were:"
    print vectorizer.vocabulary_.keys()
    print "\n"

    test_vectors = vectorizer.transform(test_corpus).toarray()

    #Training each Classifier
    svm_clf = svm.fit(Trained_Vectors, train_labels)
    knn_clf = knn.fit(Trained_Vectors, train_labels)
    dt_clf = dt.fit(Trained_Vectors, train_labels)
    gnb_clf = gnb.fit(Trained_Vectors, train_labels)
    lr_clf = lr.fit(Trained_Vectors, train_labels)

    #Making Predictions
    svm_predictions = svm_clf.predict(test_vectors)
    knn_predictions = knn_clf.predict(test_vectors)
    dt_predictions = dt_clf.predict(test_vectors)
    gnb_predictions = gnb_clf.predict(test_vectors)
    lr_predictions = lr_clf.predict(test_vectors)
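    # calc_acc() is defined elsewhere; a plausible sketch that compares
    # predictions against this split's test labels (test_labels here is an
    # assumption, not the source):
    def calc_acc(predictions):
        return np.mean(np.array(predictions) == np.array(test_labels))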

    svm_accuracy_list.append(calc_acc(svm_predictions))
    knn_accuracy_list.append(calc_acc(knn_predictions))
    dt_accuracy_list.append(calc_acc(dt_predictions))
示例#56
0
    for image_path, descriptor in des_list_test[1:]:
        descriptors_test = np.vstack((descriptors_test, descriptor))

    #Initialize an SVM classifier
    svm = SVC(C=1.0,
              cache_size=200,
              class_weight=None,
              coef0=0.0,
              decision_function_shape=None,
              probability=True,
              degree=2,
              gamma='auto',
              kernel='rbf',
              verbose=False)

    clf = svm.fit(descriptors_train, y[train])

    ##Accuracy
    score = clf.score(descriptors_test, y[test])
    cvscores.append(score)
    print score

    ##Confusion matrix
    conf1 = confusion_matrix(y[test], clf.predict(descriptors_test))
    conf = conf + conf1

    ####ROC curve
    probas_ = clf.fit(descriptors_train,
                      y[train]).predict_proba(descriptors_test)
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
示例#57
0
    X_test.loc[seg_id, 'kurt'] = kurtosis(x)

X = X_train.copy()
y = y_train.copy()
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Standardize X (zero mean, unit variance)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Feature engineering ends.")

print("SVM...")
svm = NuSVR()
svm.fit(X_train_scaled, y_train.values.flatten())
y_pred_svm = svm.predict(X_train_scaled)
score = mean_absolute_error(y_train.values.flatten(), y_pred_svm)
print(f'Score: {score:0.3f}')
y_pred_svm = svm.predict(X_test_scaled)
submission['time_to_failure'] = y_pred_svm
submission.to_csv('submission_svm.csv')
print("SVM ends.")

print("LightGBM...")
folds = KFold(n_splits=5, shuffle=True, random_state=42)
params = {
    'objective': "regression",
    'boosting': "gbdt",
    'metric': "mae",
    'boost_from_average': "false",
示例#58
0
def trainSVMregression_rbf(Features, Y, Cparam):
    svm = sklearn.svm.SVR(C=Cparam, kernel='rbf')
    svm.fit(Features, Y)
    train_err = numpy.mean(numpy.abs(svm.predict(Features) - Y))
    return svm, train_err
示例#59
0
# Get an numpy ndarray of the pixels
labels = dataset_train[[0]].values.ravel()
train = dataset_train.iloc[:, 1:].values
test = dataset_test.values

# Get PCA
start_time_pca = time.time()
# With 0 < n_components < 1, PCA keeps enough components to explain that fraction of the variance (0.8 here)
pca = PCA(n_components=0.8, whiten=True)
train_pca = pca.fit_transform(train)
end_time_pca = time.time()

# Create and train the SVM
start_time_svm = time.time()
svm = svm.SVC(C=10.0, verbose=True)
svm.fit(train_pca, labels)
end_time_svm = time.time()

# Make the prediction on test but first has to use PCA
test_pca = pca.transform(test)
pred = svm.predict(test_pca)

# Save as a dataSet
np.savetxt('./data/solution_svm.csv',
           np.c_[range(1,
                       len(test) + 1), pred],
           delimiter=',',
           header='ImageId,Label',
           comments='',
           fmt='%d')
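# The PCA/SVM timings captured above are never reported; printing them is a
# natural addition (a sketch):
print('PCA: %.1fs, SVM training: %.1fs' %
      (end_time_pca - start_time_pca, end_time_svm - start_time_svm))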
示例#60
0
                #res = cv2.resize(img,(250,250))
                res = img
                gray_image = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
                xarr = np.squeeze(np.array(gray_image).astype(np.float32))
                m, v = cv2.PCACompute(xarr, mean=np.array([]))
                arr = np.array(v)
                flat_arr = arr.ravel()
                training_set.append(flat_arr)
                training_labels.append(label)
    print 'done ', root

trainData = np.float32(training_set)
responses = training_labels
#svm = cv2.SVM()
svm = sklearn.svm.LinearSVC(C=1.0, random_state=0)
svm.fit(trainData, responses)
#svm.save('svm_data.dat')
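# The cv2-style save() above does not apply to the sklearn model; joblib is
# the usual substitute (a sketch; the import is an assumption):
import joblib
joblib.dump(svm, 'svm_data.joblib')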
print 'training done!'

print 'testing...'

path = 'test/'
testing_set = []
testing_labels = []

for root, dirs, files in os.walk(path):
    for name in files:
        if name.endswith((".png")):
            if (os.path.getsize(root + str('/') + name)) != 0:
                label = root.split('/')[1]
                img = cv2.imread(root + str('/') + name)