def evalParams(m1=MEAN_COEFF, m2=STD_COEFF, epsilon=EPSILON_FACTOR,
               gap=GAP_FACTOR, overlap=OVERLAP_FACTOR):
    total = 0
    for stitch in xrange(stitchesNum):
        data = []
        tags = []
        for subject in subjects:
            for index in xrange(8):
                try:
                    input = getAMCInput(joint, subject, index)
                except:
                    continue
                parts = st.createParts(input, partsAmount)
                stitched = st.stitch(parts, m1, m2, epsilon, gap, overlap)
                #plt.figure()
                #plt.plot(stitched)
                periods = pr.breakToPeriods(stitched)
                periods = ut.alignByMaxMany(periods)
                periods = inter.getUniformSampledVecs(periods, 100)
                data = data + periods
                tags = tags + [subject]*len(periods)
                #st.plotParts(periods)
        
        cl = KNeighborsClassifier()
        cl.n_neighbors = 5
        cl.weights = 'distance' 
        testSize = 1
        score = crossValidate(cl, data, tags, testSize, testAmount)
        #print str(m2)+' '+ str(score)
        total += score
    score = float(total) / stitchesNum
    scores[m1, m2] = score
    return score
Example #2
def evaluate_kNN(x_pos, y_pos, x, y, folds, n_params, runs, steps,
                 k_neighbors):
    print("in evaluate kNN")
    neigh = KNeighborsClassifier(n_neighbors=k_neighbors)
    svd_m = decomposition.TruncatedSVD(algorithm='randomized',
                                       n_components=n_params,
                                       n_iter=7)
    scores = []
    run = []
    for i in np.arange(n_params, runs, steps):
        # svd_model = svd_m.fit(x_pos, y_pos)
        # x_svd = svd_model.transform(x)
        # test_svd = svd_model.transform(test)
        # neigh.fit(x_svd, y)
        # val_list = cross_val_score(neigh, x_ch2, y, cv=folds, scoring='f1').mean()
        neigh.n_neighbors = i
        ch2_model = SelectKBest(chi2, k=i).fit(x, y)
        x_ch2 = ch2_model.transform(x)
        neigh.fit(x_ch2, y)
        val_list = cross_val_score(neigh, x_ch2, y, cv=folds,
                                   scoring='f1').mean()
        scores.append(val_list)
        run.append(i)
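    # evaluate_knn_csv and chi2_n_params are assumed to be defined elsewhere
    # in this module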
    evaluate_knn_csv(scores, k_neighbors, folds, chi2_n_params, run)
    print(scores)
Example #3
File: run_model.py Project: t36li/FINRA
def KNN(x_train,y_train,x_test, udf_kneighbors=100, do_CV=False):
	import csv
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.cross_validation import train_test_split
	from sklearn.metrics import roc_auc_score

	### variables may be in different scales, so mean standardize the variables ###
	### Mean Normalize variables before regression ###
	from sklearn.preprocessing import StandardScaler
	ss=StandardScaler()
	x_train=ss.fit_transform(x_train)
	x_test=ss.transform(x_test)  # scale with the training-set statistics instead of refitting on the test set

	neigh=KNeighborsClassifier(weights='distance')	
	if do_CV:
		k_list=[25,125,255,387] #important to have odd numbers

		### Try different parameters of K for optimal value ###
		### Randomly divide training set into 80/20 split ###
		cv_score=list()		
		for k in k_list:
			neigh.n_neighbors=k
			x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(x_train,y_train,test_size=0.20, random_state=42)

			neigh.fit(x_train_cv,y_train_cv)
			y_pred=neigh.predict_proba(x_test_cv)[:,1]
			cv_score.append(roc_auc_score(y_test_cv,y_pred))			

		neigh.fit(x_train,y_train)
		y_pred=neigh.predict_proba(x_test)[:,1]

		print 'Cross Validation KNN Results........'
		print 'Parameters, CV_Scores'
		for i in range(len(cv_score)):
			print k_list[i], cv_score[i]
	else:
		print 'Making Prediction with optimal K neighbors...'
		neigh.n_neighbors=udf_kneighbors
		neigh.fit(x_train,y_train)
		y_pred=neigh.predict_proba(x_test)[:,1]
		print 'Writing submission file....'
		with open('KNN_Submission.csv','wb') as testfile:
			w=csv.writer(testfile)
			w.writerow(('Id','Probability'))
			for i in range(len(y_pred)):
				w.writerow(((i+1),y_pred[i]))
		print 'File written to disk...'
Example #4
def useLibraryClassifier(avgRGBList, classifierVal, prnt=1):
    dataArr = []
    if classifierVal == 0:
        classifier = KNeighborsClassifier()
    elif classifierVal == 1:
        classifier = GaussianNB()
    elif classifierVal == 2:
        classifier = SVC()
    startTime = time.process_time()
    for currentFold in range(0, 10):
        x_train, y_train, x_test, y_test = [], [], [], []
        for y in range(1, 5):
            for x in range(0, len(avgRGBList[y])):
                x_train.append(avgRGBList[y][x][0])
                y_train.append(avgRGBList[y][x][1])
        for z in range(0, len(avgRGBList[0])):
            x_test.append(avgRGBList[0][z][0])
            y_test.append(avgRGBList[0][z][1])
        classifier.fit(x_train, y_train)
        if classifierVal == 0:
            for k in range(0, 10):
                classifier.n_neighbors = k + 1
                accuracy = float(classifier.score(x_test, y_test))
                dataArr.append(accuracy)
        else:
            accuracy = float(classifier.score(x_test, y_test))
            dataArr.append(accuracy)
        avgRGBList = avgRGBList[0] + avgRGBList[1] + avgRGBList[2] + avgRGBList[3] + avgRGBList[4]
        avgRGBList = crossValidation(avgRGBList)
    for x in range(0, 10):
        total = 0.0
        avgAccuracy = 0.0
        if classifierVal == 0:
            for y in range(0, 10):
                total += dataArr[10 * y + x]
            avgAccuracy = total / 10
            if prnt == 1: print("Average library-KNN accuracy for k == " + str(x + 1) + ": " + str(avgAccuracy))
        elif classifierVal == 1:
            if prnt == 1: print("Average Gaussian Naive Bayes accuracy for fold " + str(x + 1) + ": " + str(dataArr[x]))
            for acc in dataArr:  # separate name so the outer loop variable x is not clobbered
                total += acc
            avgAccuracy = total / 10
        elif classifierVal == 2:
            if prnt == 1: print("Average Support Vector Classifier accuracy for fold " + str(x + 1) + ": " + str(dataArr[x]))
            for acc in dataArr:
                total += acc
            avgAccuracy = total / 10
    endTime = time.process_time()
    if prnt == 1: print("Run-time: " + str(endTime - startTime) + " fractional seconds.    *NOTE: module time used, NOT module timeit")
    print("\n")
    return [classifierVal, avgAccuracy, endTime - startTime]
Example #5
def optimal_k(x, y):
    opt_k = 0
    max_quality = 0
    generator = KFold(
        n_splits=5, shuffle=True,
        random_state=42)  # shuffles the dataset and breaks it into n (5) parts
    classifier = KNeighborsClassifier()
    for __k in range(1, 50):
        classifier.n_neighbors = __k
        qualities = cross_val_score(estimator=classifier,
                                    X=x,
                                    y=y,
                                    cv=generator)
        avg_quality = sum(qualities) / float(len(qualities))
        if avg_quality >= max_quality:
            max_quality = avg_quality
            opt_k = __k
    return [opt_k, max_quality]
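# A minimal usage sketch (x and y are assumed to be a feature matrix and a
# label vector loaded elsewhere):
# best_k, best_quality = optimal_k(x, y)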
Example #6
def KNN(train_X, train_Y, test_X, test_Y, K_list):
    knn = KNeighborsClassifier(weights='distance')
    knn_acc_list = []
    for K in K_list:
        knn.n_neighbors = K
        acc = classify(knn, train_X, train_Y, test_X, test_Y)
        knn_acc_list.append(acc)
        print K, acc

    # plot after the loop, then return (in the original the return came first,
    # which made the plotting code unreachable)
    plt.plot(K_list, knn_acc_list, '--^', label='KNN')
    plt.title('KNN')
    plt.xlabel('K')
    plt.ylabel('accuracy')
    plt.ylim((0.45, 0.65))

    plt.legend(loc='lower right', numpoints=1)
    plt.show()

    return knn_acc_list
Example #7
File: model_trainer.py Project: SeanU/waq
def find_best_model(df, contaminant, verbose=False):
    train_data, test_data = splitData(df[df.contaminant == contaminant])

    ### make sure the values make sense:
    if verbose:
        print('Contaminant ', contaminant)
        print('Status Levels: ', df.status.unique())
        print('Status Codes: ', df.status_numeric.unique())
        print('train data sample size', train_data.size)
        print('test data sample size', test_data.size)

    train_labels = train_data.status_numeric

    # create model templates
    RF = RandomForestClassifier()
    kNN = KNeighborsClassifier()

    kNN_scores = []
    RF_scores = []
    for p in range(2, 100):
        kNN.n_neighbors = p
        RF.n_estimators = p

        kNN.fit(X=train_data[['lat', 'lng', 'time_delta']],
                y=train_data.status_numeric)
        kNN_scores.append((p,
                           kNN.score(X=test_data[['lat', 'lng', 'time_delta']],
                                     y=test_data.status_numeric)))

        RF.fit(X=train_data[['lat', 'lng', 'time_delta']],
               y=train_data.status_numeric)
        RF_scores.append((p,
                          RF.score(X=test_data[['lat', 'lng', 'time_delta']],
                                   y=test_data.status_numeric)))

    # find the most accurate model and parameter
    best_kNN = max(kNN_scores, key=lambda s: s[1])
    best_RF = max(RF_scores, key=lambda s: s[1])
    if best_kNN[1] > best_RF[1]:
        return contaminant, "kNN", best_kNN
    else:
        return contaminant, "RF", best_RF
Example #8
def heart(dataType):

    package = data.createData(dataType)

    xTrain = package.xTrain
    xTest = package.xTest
    yTrain = package.yTrain
    yTest = package.yTest
    xLabel = 'K'
    scoreList = util.ScoreList(xLabel)
    title = '{0} KNN'.format(dataType)

    # searcher.searchKNN(xTrain, yTrain, xTest, yTest)
    # params = {'algorithm': 'auto', 'p': 1, 'weights': 'uniform'}  # earlier choice, superseded below
    params = {'algorithm': 'ball_tree', 'p': 1, 'weights': 'distance'}
    # params = searcher.searchKNN(xTrain, yTrain, xTest, yTest)

    param = 'n_neighbors'
    param_range = list(range(1, 50))  #np.linspace(1, 50, 50)

    clf = KNeighborsClassifier()
    clf.set_params(**params)

    plotter.plotValidationCurve(clf,
                                xTrain,
                                yTrain,
                                param,
                                param_range,
                                graphTitle=title)

    clf = KNeighborsClassifier()
    clf.set_params(**params)
    clf.n_neighbors = 12
    plotter.plotLearningCurve(clf, title=title, xTrain=xTrain, yTrain=yTrain)
    # plotter.plotAll(clf, title, param, param_range, xTrain, yTrain, xTest, yTest)
    title = 'Heart'
    clf.fit(xTrain, yTrain)
    plotter.plotConfusion(clf, title,
                          ['Diameter narrowing', 'Diameter not narrowing'],
                          xTest, yTest)
Example #9
import os
import time

from PIL import Image
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from util import Segs_Dir

Xlist = []
Ylist = []
for charDir in os.listdir(Segs_Dir):
    for file in os.listdir(os.path.join(Segs_Dir, charDir)):
        img = Image.open(os.path.join(Segs_Dir, charDir, file))
        featureVector = np.array(img).flatten()
        Xlist.append(featureVector)
        Ylist.append(charDir)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xlist, Ylist, test_size=0.2)

clf = KNeighborsClassifier(n_jobs=4)
clf.fit(Xtrain, Ytrain)

for n in list(range(1,15)) + list(range(15,100,5)):
    start_t = time.time()
    clf.n_neighbors = n
    Ypredict = clf.predict(Xtest)
    accuracy = accuracy_score(Ytest, Ypredict)
    end_t = time.time()
    ms_per_sample = (end_t-start_t)/len(Xtest) * 1000
    print("{:3d}\t{:.4f}\t{:.3f} ms".format(n, accuracy, ms_per_sample))

Example #10

kn = KNeighborsClassifier()
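# `fish_data` / `fish_target` are assumed to be defined earlier: 49 samples of
# [length, weight] with binary labels, 35 of them in the majority class (hence
# the 35/49 baseline printed below)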

kn.fit(fish_data, fish_target)

# compute the accuracy
print(kn.score(fish_data, fish_target))

# predict on a new sample
print(kn.predict([[30, 600]]))

print(kn._fit_X)
print(kn._y)

# a model that consults all the data (k = 49)
kn49 = KNeighborsClassifier(n_neighbors=49)

kn49.fit(fish_data, fish_target)

print(kn49.score(fish_data, fish_target))
print(35 / 49)  # with k = 49 every prediction is the majority class, so accuracy is 35/49

tkn = KNeighborsClassifier()
tkn.fit(fish_data, fish_target)

for n in range(5, 50):
    tkn.n_neighbors = n
    score = tkn.score(fish_data, fish_target)
    if score < 1:
        print(n, score)
        break
Example #11
#sample tests
#test_images = test_images[:1000]
#test_labels = test_labels.tolist()[:1000]


k = 2
knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
knn.fit(images, labels.tolist())
predictions = knn.predict(test_images)

print("KNN k=2")
print_report(predictions, test_labels)
print()

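# changing n_neighbors on an already-fitted KNeighborsClassifier requires no
# refit: fit() only stores the training data, and the neighbor search happens
# at predict time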
knn.n_neighbors=3
predictions = knn.predict(test_images)
print("KNN k=3")
print_report(predictions, test_labels)    
print()

knn.n_neighbors=4
predictions = knn.predict(test_images)
print("KNN k=4")
print_report(predictions, test_labels)    
print()

knn.n_neighbors=5
predictions = knn.predict(test_images)
print("KNN k=5")
print_report(predictions, test_labels)    
Example #12
from sklearn.neighbors import KNeighborsClassifier

from projet.lib.data_functions.config import *
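# X_train / y_train and X_test / y_test used below are assumed to be provided
# by the star import above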

###############################
# COMPUTE MISCLASSIFICATION ERROR
###############################
results = {}
knn = KNeighborsClassifier(weights='distance')
for k in range(1, 101):
    knn.n_neighbors = k
    knn.fit(X_train, y_train)
    predicted_returns = knn.predict(X_test)
    results[k] = knn.score(X_test, y_test)
    print "For k = %s, %ssuccess = %s" % (k, '%', results[k])

for k in range(0, 41):
    knn.n_neighbors = 10 * k + 100
    knn.fit(X_train, y_train)
    predicted_returns = knn.predict(X_test)
    results[10 * k + 100] = knn.score(X_test, y_test)
    print "For k = %s, %ssuccess = %s" % (10 * k + 100, '%',
                                          results[10 * k + 100])
Example #13
############ Initializing Variables ############
k = 1  # K nearest neighbors
fold = 5  # Number of folds
k_scores = {}

# Creating Knn classifier with k nearest neighbors
knn = KNeighborsClassifier(n_neighbors=k)

# Fit training data to classifier
knn.fit(features_train, labels_train)

while k != 5:
    score = cross_validate(knn, features_train, labels_train, cv=fold)
    k_scores[k] = np.mean(score['test_score'])
    k += 1
    knn.n_neighbors = k

MaxScore = max(k_scores, key=k_scores.get)
print("The Maximum Score on the training set is : " + str(k_scores[MaxScore]) +
      ", The Best K value is: " + str(MaxScore))

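# Note: knn is left configured with n_neighbors = 5 (the last value set in the
# loop above), not necessarily the best k just reported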
# Test set Prediction
test_prediction = knn.predict(features_test)

print("The Predicted output: " + str(test_prediction))
print("The Real output: " + str(labels_test))

final_score = accuracy_score(labels_test, test_prediction)

print("The Accuracy of prediction: " + str(final_score))
Example #14
def main():
    start_time = time.time()
    column_names = ["preg", "plas", "pres", "skin", "insu", "mass", "pedi", "age", "class"]
    with open('/Users/tyler/machine/data/pima-indians-diabetes copy.csv') as f:
        data = pandas.read_csv(f, sep=',', names=column_names)
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

    cv = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=0)
    print "Results with 15 Neighbors"


    estimator = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    y_train_pred = estimator.predict(X_train)

    print("--- %s seconds ---" % (time.time() - start_time))

    title = "Learning Curves (kNN, K=15)"
    train_sizes, average_train_scores, average_test_scores = plot_learning_curve(estimator, title, X_train, y_train,
                                                                                 cv=cv)
    plot = err_plot(train_sizes, average_train_scores, average_test_scores)
    print 'train accuracy: {}'.format(estimator.score(X_train, y_train))
    print 'test accuracy: {}'.format(estimator.score(X_test, y_test))
    print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.confusion_matrix(y_test, y_pred)


    start_time = time.time()
    print "Results with 9 Neighbors"
    print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])

    estimator = KNeighborsClassifier(n_neighbors=9)

    estimator.fit(X_train, y_train)

    title = "Learning Curves (kNN, K=9)"
    train_sizes, average_train_scores, average_test_scores = plot_learning_curve(estimator, title, X_train, y_train,
                                                                                 cv=cv)
    plot = err_plot(train_sizes, average_train_scores, average_test_scores)

    y_pred = estimator.predict(X_test)
    y_train_pred = estimator.predict(X_train)

    print("--- %s seconds ---" % (time.time() - start_time))
    print "Final Classification Report"
    print metrics.classification_report(y_test, y_pred)
    print 'train accuracy: {}'.format(estimator.score(X_train, y_train))
    print 'test accuracy: {}'.format(estimator.score(X_test, y_test))
    print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.confusion_matrix(y_test, y_pred)

    knn = KNeighborsClassifier()

    n_neighbors = np.arange(1, 141, 2)

    train_scores = list()
    test_scores = list()
    cv_scores = list()
    for n in n_neighbors:
        knn.n_neighbors = n
        knn.fit(X_train, y_train)
        train_scores.append(
            1 - metrics.accuracy_score(y_train, knn.predict(X_train)))
        test_scores.append(1 - metrics.accuracy_score(y_test, knn.predict(X_test)))
        cv_scores.append(1 - cross_val_score(knn, X_train, y_train, cv=cv).mean())
    # convert to arrays so the boolean masks below select the matching k values
    train_scores = np.array(train_scores)
    test_scores = np.array(test_scores)
    cv_scores = np.array(cv_scores)
    print(
        'The best values of k are:\n'
        '{} according to the Training Set\n'
        '{} according to the Test Set and\n'
        '{} according to Cross-Validation'.format(
            min(n_neighbors[train_scores == min(train_scores)]),
            min(n_neighbors[test_scores == min(test_scores)]),
            min(n_neighbors[cv_scores == min(cv_scores)])
        ))

    plt.figure(figsize=(10, 7.5))
    plt.plot(n_neighbors, train_scores, c="black", label="Training Set")
    plt.plot(n_neighbors, test_scores, c="black", linestyle="--", label="Test Set")
    plt.plot(n_neighbors, cv_scores, c="green", label="Cross-Validation")
    plt.xlabel('Number of K Nearest Neighbors')
    plt.ylabel('Classification Error')
    plt.gca().invert_xaxis()
    plt.legend(loc="lower left")
    plt.show()
Example #15
# If n_jobs = -1, the number of jobs is set to the number of CPU cores
# source: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
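
# `knn`, `resultKnn`, `trainImages`/`trainLabels` and `testImages` are not
# defined in this snippet; a hypothetical setup consistent with the k = 1 run
# below would be:
# knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
# resultKnn = []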

tempoInicial = time.time()
print("Training for k = 1")
knn.fit(trainImages, trainLabels.tolist())
# training performed with k = 1
resultKnn.append(knn.predict(testImages))
# collect the predictions and append them to the list.
printResul(resultKnn[len(resultKnn) - 1])
tempoAux = time.time()
tempo(int(tempoAux - tempoInicial))

tempoInicial = time.time()
# changing the value of k for a new training run
knn.n_neighbors = 10
print("Training for k = 10")
knn.fit(trainImages, trainLabels.tolist())
# training performed with k = 10
resultKnn.append(knn.predict(testImages))
# collect the predictions and append them to the list.
printResul(resultKnn[len(resultKnn) - 1])
tempoAux = time.time()
tempo(int(tempoAux - tempoInicial))

tempoInicial = time.time()
# changing the value of k for a new training run
knn.n_neighbors = 100
print("Training for k = 100")
knn.fit(trainImages, trainLabels.tolist())
# training performed with k = 100
Example #16
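# `clf` is assumed to be a classifier created earlier (a KNeighborsClassifier,
# judging by the n_neighbors sweeps below); X_train/y_train, X_test/y_test and
# the Xs_n/Ys_n and Xs/Ys arrays are likewise assumed to be loaded already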
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('for the normalized and grouped data')
print(classification_report(y_test, y_pred))

scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Accuracy: {} (+/- {})".format(scores.mean(), scores.std() * 2))

# experiments:
# non-normalized
k_s = list(range(1,50, 2))
experimentos_n = []

for i in k_s:
    clf.n_neighbors = i
    scores = cross_val_score(clf, Xs_n, Ys_n, cv=10)
    experimentos_n.append((i,scores.mean(), scores.std()*2))
    #print("─ K: {}, Accuracy: {} (+/- {})".format(i,scores.mean(), scores.std() * 2))


experimentos_n.sort(key=lambda tup: tup[1])
print("\n\nExperimentos na Base Abalone não Normalizados ─ K: {}, Accuracy: {} (+/- {})".format(experimentos_n[-1][0],experimentos_n[-1][1], experimentos_n[-1][2]))

# normalized
experimentos = []
for i in k_s:
    clf.n_neighbors = i
    scores = cross_val_score(clf, Xs, Ys, cv=10)
    experimentos.append((i,scores.mean(), scores.std()*2))
    #print("─ K: {}, Accuracy: {} (+/- {})".format(i,scores.mean(), scores.std() * 2))
Example #17

# (the header and opening lines of this example were lost; fig/fig_uniform,
#  frameSize, numOfFeatures, data and tags come from that missing setup, and
#  the enclosing loops are reconstructed from the matching pattern in
#  Example #1)
for subject in subjects:
    for index in xrange(8):
        try:
            input = getAMCInput(joint, subject, index)
        except:
            continue
        input = alignByMax(input)
        sub = fig.add_subplot(frameSize*110 + subjects.index(subject))
        sub.plot(range(len(input)), input)
        sub_uniform = fig_uniform.add_subplot(frameSize*110 + subjects.index(subject))
        new_time, uniform_input = inter.getUniformSampled(xrange(len(input)), input, numOfFeatures)
        sub_uniform.plot(xrange(numOfFeatures), uniform_input)
        data.append(uniform_input)
        tags.append(subject)
        plt.xlabel('Time (in frames)')
        plt.ylabel(joint + ' angle')
        plt.title('subject: ' + str(subject))

cl = KNeighborsClassifier()
cl.n_neighbors = 5
cl.weights = 'distance' 
testSize = 35
score = crossValidate(cl, data, tags, testSize)


outFile = 'out.txt'
out = open(outFile, 'r')
scores = []
testSizes = []
for line in out:
    fields = line.split()
    scores.append(float(fields[0]))
    testSizes.append(float(fields[1]))
plt.figure()
plt.plot(testSizes, scores)
Example #18
features, labels = load_dataset('seeds.tsv')  # custom parsing function needed since the file mixes float and string data

# initialize a classifier instance
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform',
                           algorithm='auto',
                           leaf_size=30,
                           p=2,
                           metric='minkowski',
                           metric_params=None)

# normalize all features to the same scale; wrap the classifier in the
# pipeline once (re-wrapping inside the loop would nest pipelines, and the
# n_neighbors assignment would no longer reach the kNN step)
classifier = Pipeline([('norm', StandardScaler()), ('knn', knn)])

# compute 10-fold cross-validation
for k in range(1, 20, 2):
    knn.n_neighbors = k
    means = []  # reset per k so each reported mean covers this k only

    for training, testing in KFold(features.shape[0], n_folds=10, shuffle=True):  # shuffle before folding since the labels are stored contiguously
        classifier.fit(features[training], labels[training])
        predictions = classifier.predict(features[testing])
        current_mean = np.mean(predictions == labels[testing])
        means.append(current_mean)

    print('10-fold cross-validation mean accuracy         = {0:.1%} for k={1:d}'.format(np.mean(means), k))

    crossed = cross_val_score(classifier, X=features, y=labels, scoring=None, cv=10, n_jobs=1)
    print('10-fold cross-validation using cross_val_score = {0:.1%} for k={1:d}'.format(np.mean(crossed), k))
Example #19
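# `logistic`, `skf` (a K-fold splitter), `X`, `y`, `accs` and the grid `C_s`
# are assumed to be defined earlier in this script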
for c in C_s:
    logistic.C = c
    temp = []
    for train, test in skf.split(X, y):
        logistic.fit(X[train], y[train])
        temp.append(logistic.score(X[test], y[test]))
    accs.append(temp)

accs = np.array(accs)
avg = np.mean(accs, axis=1)
avg
np.argmax(avg)
C_s[np.argmax(avg)]

ks = np.linspace(1, 10, 10)
knn3 = KNeighborsClassifier()

accs2 = []

for k in ks:
    knn3.n_neighbors = int(k)
    temp = []
    for train, test in skf.split(X, y):
        knn3.fit(X[train], y[train])
        temp.append(knn3.score(X[test], y[test]))
    accs2.append(temp)

np.mean(accs2, axis=1)
np.argmax(np.mean(accs2, axis=1))
ks[np.argmax(np.mean(accs2, axis=1))]
Example #20
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from utils.amcParser import getMergedData
import time

knn = KNeighborsClassifier()
knn.n_neighbors = 5
knn.weights = 'distance' 

def crossValidate(data, tags, trainSize):  
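    # dataTrain/tagTrain and dataTest/tagsTest are referenced below, but the
    # code that splits data/tags by trainSize is not shown in this snippet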
    fit = knn.fit(dataTrain, tagTrain)
    #print(fit)
    res = knn.predict(dataTest)
    hits = 0.0
    for t,r in zip(tagsTest,res):
        if(t==r):
            hits+=1.0
    #print(res)
    #print(tagsTest)
    return hits/float(len(res))

sum = 0.0
numOftests = 100
for i in range(numOftests):
    joints = np.array(['lradius', 'rradius', 'ltibia', 'rtibia', 'lwrist', 'rwrist', 'lfingers', 'rfingers'])
    chosen = np.array([1,1,1,0,0,0,1,1])
    data, tags = getMergedData(joints[chosen.astype(np.bool)])
    numOfsamples, numOfFeatures = data.shape
    np.random.seed(i)
    trainSize = numOfsamples - 1  # int(0.9*numOfsamples)
Example #21
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

breast_cancer_data = load_breast_cancer()
data = breast_cancer_data.data
target = breast_cancer_data.target

subsets = train_test_split(data, target, train_size=0.8, random_state=120)
training_data, validation_data, training_labels, validation_labels = subsets

classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(training_data, training_labels)


k_list = [i for i in range(1,101)]
accuracies = []

for k in k_list:
    classifier.n_neighbors = k
    score = classifier.score(validation_data, validation_labels)
    accuracies.append(score)


plt.plot(k_list, accuracies)
plt.xlabel('k')
plt.ylabel("Validation Accuracy")
plt.title('Breast Cancer Classifier Accuracy')
plt.show()
Example #22

#    X_train, X_test = X[train_index], X[test_index]
#    y_train, y_test = y[train_index], y[test_index]
#    print("%s %s | %s %s" % (X_train, X_test, y_train, y_test))

# Now I will look at how cross-validation can be used to choose the right model
# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

print('')
knn = KNeighborsClassifier()

# Test for the optimal n_neighbors for knn
for i in range(20):
    knn.n_neighbors = i+1
    print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

# Set up the two models we want to pit against each other
knn.n_neighbors = 20
logreg = LogisticRegression()

# Use cross_val_score to get accuracy scores for the two models
# cv=10 is the number of folds we want, in this case 10
# scoring='accuracy' is the evaluation metric we have chosen
# We call mean() at the end so we get the answer right away; without it we
# would have to compute the average ourselves (see further down for what I mean)
# From those answers we can then choose the model that performed best.
print('')
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())