def evalParams(m1=MEAN_COEFF, m2=STD_COEFF, epsilon=EPSILON_FACTOR, gap=GAP_FACTOR, overlap=OVERLAP_FACTOR):
    """Evaluate a stitching-parameter combination by averaging KNN
    cross-validation scores over `stitchesNum` stitching rounds.

    Relies on module-level names: stitchesNum, subjects, joint, partsAmount,
    testAmount, scores, and the helper modules st / pr / ut / inter
    (defined elsewhere in this file -- not visible here).

    Returns the mean cross-validation score; also records it in
    scores[m1, m2] as a side effect.
    """
    total = 0  # FIX: was `sum`, which shadowed the builtin
    for stitch in xrange(stitchesNum):
        data = []
        tags = []
        for subject in subjects:
            for index in xrange(8):
                try:
                    input = getAMCInput(joint, subject, index)
                except Exception:  # FIX: bare except also swallowed SystemExit/KeyboardInterrupt
                    # missing/unreadable AMC trial: skip it
                    continue
                parts = st.createParts(input, partsAmount)
                stitched = st.stitch(parts, m1, m2, epsilon, gap, overlap)
                periods = pr.breakToPeriods(stitched)
                periods = ut.alignByMaxMany(periods)
                periods = inter.getUniformSampledVecs(periods, 100)
                data = data + periods
                tags = tags + [subject] * len(periods)
        # Distance-weighted 5-NN, leave-one-out style (testSize = 1).
        cl = KNeighborsClassifier()
        cl.n_neighbors = 5
        cl.weights = 'distance'
        testSize = 1
        score = crossValidate(cl, data, tags, testSize, testAmount)
        total += score
    score = float(total) / stitchesNum
    scores[m1, m2] = score
    return score
def evaluate_kNN(x_pos, y_pos, x, y, folds, n_params, runs, steps, k_neighbors):
    """Sweep i from n_params to runs (step `steps`), using i both as the
    number of chi2-selected features and as the neighbour count, and record
    the mean cross-validated f1 score for each i.

    Results are written out via evaluate_knn_csv and printed.
    x_pos / y_pos are accepted for interface compatibility but unused here
    (the original only used them in commented-out SVD code).
    """
    print("in evaluate kNN")
    neigh = KNeighborsClassifier(n_neighbors=k_neighbors)
    scores = []
    run = []
    for i in np.arange(n_params, runs, steps):
        neigh.n_neighbors = i
        # Select the i best features by chi2, then evaluate KNN on them.
        ch2_model = SelectKBest(chi2, k=i).fit(x, y)
        x_ch2 = ch2_model.transform(x)
        neigh.fit(x_ch2, y)
        val_list = cross_val_score(neigh, x_ch2, y, cv=folds, scoring='f1').mean()
        scores.append(val_list)
        run.append(i)
    # FIX: the original passed `chi2_n_params`, a name defined nowhere in
    # this file; the function parameter `n_params` is what was intended.
    evaluate_knn_csv(scores, k_neighbors, folds, n_params, run)
    print(scores)
def KNN(x_train,y_train,x_test, udf_kneighbors=100, do_CV=False): from sklearn.neighbors import KNeighborsClassifier from sklearn.cross_validation import train_test_split from sklearn.metrics import roc_auc_score ### variables may be in different scales, so mean standardize the variables ### ### Mean Normalize variables before regression ### from sklearn.preprocessing import StandardScaler ss=StandardScaler() x_train=ss.fit_transform(x_train) x_test=ss.fit_transform(x_test) neigh=KNeighborsClassifier(weights='distance') if do_CV: k_list=[25,125,255,387] #important to have odd numbers ### Try different parameters of K for optimal value ### ### Randomly divide training set into 80/20 split ### cv_score=list() for k in k_list: neigh.n_neighbors=k x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(x_train,y_train,test_size=0.20, random_state=42) neigh.fit(x_train_cv,y_train_cv) y_pred=neigh.predict_proba(x_test_cv)[:,1] cv_score.append(roc_auc_score(y_test_cv,y_pred)) neigh.fit(x_train,y_train) y_pred=neigh.predict_proba(x_test)[:,1] print 'Cross Validation KNN Results........' print 'Parameters, CV_Scores' for i in range(len(cv_score)): print k_list[i], cv_score[i] else: print 'Making Prediction with optimal K neighbors...' neigh.n_neighbors=udf_kneighbors neigh.fit(x_train,y_train) y_pred=neigh.predict_proba(x_test)[:,1] print 'Writing submission file....' with open('KNN_Submission.csv','wb') as testfile: w=csv.writer(testfile) w.writerow(('Id','Probability')) for i in range(len(y_pred)): w.writerow(((i+1),y_pred[i])) testfile.close() print 'File written to disk...'
def useLibraryClassifier(avgRGBList, classifierVal, prnt=1):
    """Run 10-fold evaluation of a library classifier on per-fold RGB data.

    avgRGBList: list of 5 folds; each fold appears to be a list of
    (featureVector, label) pairs -- confirm with the caller. Fold 0 is the
    test set and folds 1-4 the training set; after every fold the data is
    flattened and re-partitioned via crossValidation().
    classifierVal: 0 = KNN (k swept 1..10 per fold), 1 = GaussianNB, 2 = SVC.
    prnt: 1 to print per-k / per-fold accuracies and the total run time.

    Returns [classifierVal, avgAccuracy, elapsedSeconds].
    """
    dataArr = []
    if classifierVal == 0:
        classifier = KNeighborsClassifier()
    elif classifierVal == 1:
        classifier = GaussianNB()
    elif classifierVal == 2:
        classifier = SVC()
    startTime = time.process_time()
    for currentFold in range(0, 10):
        x_train, y_train, x_test, y_test = [], [], [], []
        # Folds 1-4 become training data; fold 0 is the held-out test fold.
        for y in range(1, 5):
            for x in range(0, len(avgRGBList[y])):
                x_train.append(avgRGBList[y][x][0])
                y_train.append(avgRGBList[y][x][1])
        for z in range(0, len(avgRGBList[0])):
            x_test.append(avgRGBList[0][z][0])
            y_test.append(avgRGBList[0][z][1])
        classifier.fit(x_train, y_train)
        if classifierVal == 0:
            # KNN: score the same fitted model for k = 1..10 (n_neighbors is
            # a query-time parameter, so no refit is needed per k).
            for k in range(0, 10):
                classifier.n_neighbors = k + 1
                accuracy = float(classifier.score(x_test, y_test))
                dataArr.append(accuracy)
        else:
            accuracy = float(classifier.score(x_test, y_test))
            dataArr.append(accuracy)
        # Flatten all folds and re-partition them for the next iteration.
        avgRGBList = avgRGBList[0] + avgRGBList[1] + avgRGBList[2] + avgRGBList[3] + avgRGBList[4]
        avgRGBList = crossValidation(avgRGBList)
    for x in range(0, 10):
        # NOTE(review): `sum` and `max` shadow the Python builtins below.
        sum = 0.0
        max = 0.0
        avgAccuracy = 0.0
        if classifierVal == 0:
            # For KNN, dataArr holds 10 accuracies per fold; index 10*y + x
            # is fold y's accuracy for k == x+1.
            for y in range(0, 10):
                sum += dataArr[10 * y + x]
                # NOTE(review): this compares the *running partial* sum/10,
                # not the final per-k mean -- confirm this is intentional.
                if (sum/10 > max):
                    max = sum/10
            avgAccuracy = max
            if prnt==1:
                print("Average library-KNN accuracy for k == " + str(x+1) + ": " + str(sum/10))
        elif classifierVal == 1:
            if prnt==1:
                print("Average Gaussian Naive Beyers accuracy for fold " + str(x+1) + ": " + str(dataArr[x]))
            # NOTE(review): this inner loop rebinds the outer loop variable x.
            for x in dataArr:
                sum += x
            avgAccuracy = sum/10
        elif classifierVal == 2:
            if prnt==1:
                print("Average Support Vector Classifier accuracy for fold " + str(x+1) + ": " + str(dataArr[x]))
            # NOTE(review): rebinds the outer loop variable x here as well.
            for x in dataArr:
                sum += x
            avgAccuracy = sum/10
    endTime = time.process_time()
    if prnt==1:
        print("Run-time: " + str(endTime - startTime) + " fractal seconds. *NOTE: Module time used, NOT Module timeit")
        print("\n")
    return [classifierVal, avgAccuracy, endTime - startTime]
def optimal_k(x, y):
    """Grid-search k in [1, 49] for a KNeighborsClassifier via 5-fold CV.

    The dataset is shuffled once (fixed seed 42) and split into 5 folds.
    Returns [best_k, best_mean_score]; ties favour the larger k, since a
    later candidate with an equal score replaces the earlier one.
    """
    cv_splitter = KFold(
        n_splits=5, shuffle=True,
        random_state=42)  # shuffle the dataset and break it into 5 parts
    model = KNeighborsClassifier()
    best_k, best_score = 0, 0
    for candidate in range(1, 50):
        model.n_neighbors = candidate
        fold_scores = cross_val_score(estimator=model, X=x, y=y, cv=cv_splitter)
        mean_score = sum(fold_scores) / float(len(fold_scores))
        if mean_score >= best_score:
            best_score, best_k = mean_score, candidate
    return [best_k, best_score]
def KNN(train_X, train_Y, test_X, test_Y, K_list): knn = KNeighborsClassifier(weights='distance') knn_acc_list = [] for K in K_list: knn.n_neighbors = K # knn_acc_list = [] acc = classify(knn, train_X, train_Y, test_X, test_Y) knn_acc_list.append(acc) print K, acc # print knn_acc_list return knn_acc_list plt.plot(K_list, knn_acc_list, '--^', label=K) plt.title('KNN') plt.xlabel('K') plt.ylabel('accuracy') plt.ylim((0.45, 0.65)) plt.legend(loc='lower right', numpoints=1) plt.show()
def find_best_model(df, contaminant, verbose=False):
    """For one contaminant, sweep p = 2..99 as both the KNN neighbour count
    and the random-forest tree count, scoring each on the held-out split.

    Returns (contaminant, model_name, (best_p, best_score)) for whichever
    of kNN / RF achieved the higher test score.
    """
    train_data, test_data = splitData(df[df.contaminant == contaminant])
    ### make sure the values make sense:
    if verbose:
        print('Contaminant ', contaminant)
        print('Status Levels: ', df.status.unique())
        print('Status Codes: ', df.status_numeric.unique())
        print('train data sample size', train_data.size)
        print('test data sample size', test_data.size)
    train_labels = train_data.status_numeric
    # create model templates
    RF = RandomForestClassifier()
    kNN = KNeighborsClassifier()
    features = ['lat', 'lng', 'time_delta']  # hoisted: used 8x below
    kNN_scores = []
    RF_scores = []
    for p in range(2, 100):
        kNN.n_neighbors = p
        RF.n_estimators = p
        kNN.fit(X=train_data[features], y=train_data.status_numeric)
        kNN_scores.append((p, kNN.score(X=test_data[features], y=test_data.status_numeric)))
        RF.fit(X=train_data[features], y=train_data.status_numeric)
        RF_scores.append((p, RF.score(X=test_data[features], y=test_data.status_numeric)))
    # FIX: compute each winner once -- the original re-ran max() in the
    # comparison and again in each return (four scans instead of two).
    best_kNN = max(kNN_scores, key=lambda s: s[1])
    best_RF = max(RF_scores, key=lambda s: s[1])
    if best_kNN[1] > best_RF[1]:
        return contaminant, "kNN", best_kNN
    else:
        return contaminant, "RF", best_RF
def heart(dataType):
    """Train KNN on the heart dataset variant named by dataType and plot
    its validation curve, learning curve, and confusion matrix."""
    package = data.createData(dataType)
    xTrain = package.xTrain
    xTest = package.xTest
    yTrain = package.yTrain
    yTest = package.yTest
    xLabel = 'K'
    scoreList = util.ScoreList(xLabel)
    title = '{0} KNN'.format(dataType)
    # searcher.searchKNN(xTrain, yTrain, xTest, yTest)
    # FIX: the original assigned `params` twice in a row; the first value
    # ({'algorithm': 'auto', 'p': 1, 'weights': 'uniform'}) was dead code
    # and has been removed.
    params = {'algorithm': 'ball_tree', 'p': 1, 'weights': 'distance'}
    # params = searcher.searchKNN(xTrain, yTrain, xTest, yTest)
    param = 'n_neighbors'
    param_range = list(range(1, 50))  # np.linspace(1, 50, 50)
    clf = KNeighborsClassifier()
    clf.set_params(**params)
    plotter.plotValidationCurve(clf, xTrain, yTrain, param, param_range, graphTitle=title)
    # Fresh estimator for the learning curve, fixed at k = 12.
    clf = KNeighborsClassifier()
    clf.set_params(**params)
    clf.n_neighbors = 12
    plotter.plotLearningCurve(clf, title=title, xTrain=xTrain, yTrain=yTrain)
    # plotter.plotAll(clf, title, param, param_range, xTrain, yTrain, xTest, yTest)
    title = 'Heart'
    clf.fit(xTrain, yTrain)
    plotter.plotConfusion(clf, title, ['Diameter narrowing ', 'Diameter not narrowing'], xTest, yTest)
# FIX: the script uses os.listdir/os.path and time.time below but imported
# neither module; `import os` and `import time` added.
import os
import time

from PIL import Image
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from util import Segs_Dir

# Build the dataset: one flattened pixel vector per segmented character
# image, labelled by the directory name it was found in.
Xlist = []
Ylist = []
for charDir in os.listdir(Segs_Dir):
    for file in os.listdir(os.path.join(Segs_Dir, charDir)):
        img = Image.open(os.path.join(Segs_Dir, charDir, file))
        featureVector = np.array(img).flatten()
        Xlist.append(featureVector)
        Ylist.append(charDir)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xlist, Ylist, test_size=0.2)
clf = KNeighborsClassifier(n_jobs=4)
clf.fit(Xtrain, Ytrain)

# Sweep k (fine-grained up to 15, then every 5th value up to 95) and report
# accuracy and per-sample prediction latency for each k. n_neighbors is a
# query-time parameter, so a single fit suffices.
for n in list(range(1, 15)) + list(range(15, 100, 5)):
    start_t = time.time()
    clf.n_neighbors = n
    Ypredict = clf.predict(Xtest)
    accuracy = accuracy_score(Ytest, Ypredict)
    end_t = time.time()
    ms_per_sample = (end_t - start_t) / len(Xtest) * 1000
    print("{:3d}\t{:.4f}\t{:.3f} ms".format(n, accuracy, ms_per_sample))
kn = KNeighborsClassifier()
kn.fit(fish_data, fish_target)

# Compute training-set accuracy.
print(kn.score(fish_data, fish_target))

# Predict a new sample (length 30, weight 600).
print(kn.predict([[30, 600]]))

# NOTE(review): _fit_X and _y are private sklearn attributes (the stored
# training data and encoded labels); they may change between versions.
print(kn._fit_X)
print(kn._y)

# A model that consults every data point (n_neighbors = 49, the whole set).
kn49 = KNeighborsClassifier(n_neighbors=49)
kn49.fit(fish_data, fish_target)
print(kn49.score(fish_data, fish_target))
print(35 / 49)  # the score expected when the majority class (35 of 49) always wins

# Find the first n_neighbors value at which training accuracy drops below 1.
tkn = KNeighborsClassifier()
tkn.fit(fish_data, fish_target)
for n in range(5, 50):
    tkn.n_neighbors = n
    score = tkn.score(fish_data, fish_target)
    if score < 1:
        print(n, score)
        break
#sample tests
#test_images = test_images[:1000]
#test_labels = test_labels.tolist()[:1000]

k = 2
knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
knn.fit(images, labels.tolist())

# FIX: the original repeated the predict/print block four times verbatim;
# collapsed into a loop with identical output. n_neighbors is a query-time
# parameter, so one fit covers every k. The blank-line pattern (one after
# each report except the last) is preserved.
for n_nb in (2, 3, 4, 5):
    knn.n_neighbors = n_nb
    predictions = knn.predict(test_images)
    print("KNN k={}".format(n_nb))
    print_report(predictions, test_labels)
    if n_nb != 5:
        print()
from sklearn.neighbors import KNeighborsClassifier from projet.lib.data_functions.config import * ############################### # COMPUTE MISCLASSIFICATION ERROR ############################### results = {} knn = KNeighborsClassifier(weights='distance') for k in range(1, 101): knn.n_neighbors = k knn.fit(X_train, y_train) predicted_returns = knn.predict(X_test) results[k] = knn.score(X_test, y_test) print "For k = %s, %ssuccess = %s" % (k, '%', results[k]) for k in range(0, 41): knn.n_neighbors = 10 * k + 100 knn.fit(X_train, y_train) predicted_returns = knn.predict(X_test) results[10 * k + 100] = knn.score(X_test, y_test) print "For k = %s, %ssuccess = %s" % (10 * k + 100, '%', results[10 * k + 100])
############ Initializing Variables ############
k = 1           # K nearest neighbors
fold = 5        # Number of folds
k_scores = {}   # mean CV score per k

# Creating Knn classifier with k nearest neighbors
knn = KNeighborsClassifier(n_neighbors=k)
# Fit training data to classifier (n_neighbors can change without refitting)
knn.fit(features_train, labels_train)

# Score k = 1..4 with cross-validation.
while k != 5:
    score = cross_validate(knn, features_train, labels_train, cv=fold)
    k_scores[k] = np.mean(score['test_score'])
    k += 1
    knn.n_neighbors = k

MaxScore = max(k_scores, key=k_scores.get)
print("The Maximum Score on the training set is : " + str(k_scores[MaxScore]) + ", The Best K value is: " + str(MaxScore))

# FIX: the original predicted with whatever k the loop ended on (k == 5)
# instead of the best k it just found; use the winning k for the test set.
knn.n_neighbors = MaxScore

# Test set Prediction
test_prediction = knn.predict(features_test)
print("The Predicted output: " + str(test_prediction))
print("The Real output: " + str(labels_test))
final_score = accuracy_score(labels_test, test_prediction)
print("The Accuracy of prediction: " + str(final_score))
def main():
    """Train and evaluate kNN classifiers (K=15, K=9, then a sweep over K)
    on the Pima Indians diabetes dataset, printing metrics and plotting
    learning/error curves.
    """
    start_time = time.time()
    column_names = ["preg", "plas", "pres", "skin", "insu", "mass", "pedi", "age", "class"]
    with open('/Users/tyler/machine/data/pima-indians-diabetes copy.csv') as f:
        data = pandas.read_csv(f, sep=',', names=column_names)
    # Last column is the class label; the rest are features.
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
    # Old-style ShuffleSplit signature (n, n_iter=...) -- pre-0.18 sklearn API.
    cv = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=0)
    print "Results with 15 Neighbors"
    estimator = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    y_train_pred = estimator.predict(X_train)
    print("--- %s seconds ---" % (time.time() - start_time))
    title = "Learning Curves (kNN, K=15)"
    # NOTE(review): plot_learning_curve is called twice with identical
    # arguments here (and again below for K=9); the first call's result is
    # discarded, so the work is performed twice.
    plot_learning_curve(estimator, title, X_train, y_train, cv=cv)
    train_sizes, average_train_scores, average_test_scores = plot_learning_curve(estimator, title, X_train, y_train, cv=cv)
    plot = err_plot(train_sizes, average_train_scores, average_test_scores)
    print 'train accuracy: {}'.format(estimator.score(X_train, y_train))
    print 'test accuracy: {}'.format(estimator.score(X_test, y_test))
    print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.confusion_matrix(y_test, y_pred)
    start_time = time.time()
    print "Results with 9 Neighbors"
    # NOTE(review): these two reports still use y_pred / y_train_pred from
    # the K=15 model -- the K=9 model is only fitted below, so the header is
    # misleading. Confirm intent before relying on this output.
    print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
    estimator = KNeighborsClassifier(n_neighbors=9)
    estimator.fit(X_train, y_train)
    title = "Learning Curves (kNN, K=9)"
    plot_learning_curve(estimator, title, X_train, y_train, cv=cv)
    train_sizes, average_train_scores, average_test_scores = plot_learning_curve(estimator, title, X_train, y_train, cv=cv)
    plot = err_plot(train_sizes, average_train_scores,
                    average_test_scores)
    y_pred = estimator.predict(X_test)
    y_train_pred = estimator.predict(X_train)
    print("--- %s seconds ---" % (time.time() - start_time))
    print "Final Classification Report"
    print metrics.classification_report(y_test, y_pred)
    print 'train accuracy: {}'.format(estimator.score(X_train, y_train))
    print 'test accuracy: {}'.format(estimator.score(X_test, y_test))
    print metrics.classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.classification_report(y_train, y_train_pred, target_names=['No Diabetes', 'Diabetes'])
    print metrics.confusion_matrix(y_test, y_pred)
    # Sweep K = 1, 3, ..., 139 and record classification error (1 - accuracy)
    # on the training set, the test set, and under cross-validation.
    knn = KNeighborsClassifier()
    n_neighbors = np.arange(1, 141, 2)
    train_scores = list()
    test_scores = list()
    cv_scores = list()
    for n in n_neighbors:
        knn.n_neighbors = n
        knn.fit(X_train, y_train)
        train_scores.append(
            1 - metrics.accuracy_score(y_train, knn.predict(X_train)))
        test_scores.append(1 - metrics.accuracy_score(y_test, knn.predict(X_test)))
        cv_scores.append(1 - cross_val_score(knn, X_train, y_train, cv=cv).mean())
    # NOTE(review): `train_scores == min(train_scores)` compares a Python
    # list to a float (always False), so this indexing likely does not
    # select the argmin as intended -- converting the score lists to numpy
    # arrays first would. Verify before trusting the reported "best" k.
    print(
        'The best values of k are:\n' \
        '{} according to the Training Set\n' \
        '{} according to the Test Set and\n' \
        '{} according to Cross-Validation'.format(
            min(n_neighbors[train_scores == min(train_scores)]),
            min(n_neighbors[test_scores == min(test_scores)]),
            min(n_neighbors[cv_scores == min(cv_scores)])
        ))
    # Plot the three error curves against K; the x-axis is reversed so that
    # model complexity (small K) increases to the right.
    plt.figure(figsize=(10, 7.5))
    plt.plot(n_neighbors, train_scores, c="black", label="Training Set")
    plt.plot(n_neighbors, test_scores, c="black", linestyle="--", label="Test Set")
    plt.plot(n_neighbors, cv_scores, c="green", label="Cross-Validation")
    plt.xlabel('Number of K Nearest Neighbors')
    plt.ylabel('Classification Error')
    plt.gca().invert_xaxis()
    plt.legend(loc="lower left")
    plt.show()
# Se n_jobs = -1, então, o número de trabalhos é definido para o número de núcleos da CPU # fonte: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html tempoInicial = time.time() print("Training for k = 1") knn.fit(trainImages, trainLabels.tolist()) # realizando o treinamento usando o k = 1 resultKnn.append(knn.predict(testImages)) # pegando o resultado das predições e salvando a lista. printResul(resultKnn[len(resultKnn) - 1]) tempoAux = time.time() tempo(int(tempoAux - tempoInicial)) tempoInicial = time.time() #mudando o valor de k para um novo treinamento knn.n_neighbors = 10 print("Training for k = 10") knn.fit(trainImages, trainLabels.tolist()) # realizando o treinamento usando o k = 10 resultKnn.append(knn.predict(testImages)) # pegando o resultado das predições e salvando a lista. printResul(resultKnn[len(resultKnn) - 1]) tempoAux = time.time() tempo(int(tempoAux - tempoInicial)) tempoInicial = time.time() #mudando o valor de k para um novo treinamento knn.n_neighbors = 100 print("Training for k = 100") knn.fit(trainImages, trainLabels.tolist()) # realizando o treinamento usando o k = 100
# Baseline: fit on the (normalized, grouped) training split and report.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('para os dados normalizados e agrupados')
print(classification_report(y_test, y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Accuracy: {} (+/- {})".format(scores.mean(), scores.std() * 2))

# experiments:
# non-normalized data
k_s = list(range(1, 50, 2))  # odd k values, 1..49
experimentos_n = []
for i in k_s:
    clf.n_neighbors = i
    scores = cross_val_score(clf, Xs_n, Ys_n, cv=10)
    experimentos_n.append((i, scores.mean(), scores.std()*2))
    #print("─ K: {}, Accuracy: {} (+/- {})".format(i,scores.mean(), scores.std() * 2))
# Sort by mean accuracy so the best configuration ends up last ([-1]).
experimentos_n.sort(key=lambda tup: tup[1])
print("\n\nExperimentos na Base Abalone não Normalizados ─ K: {}, Accuracy: {} (+/- {})".format(experimentos_n[-1][0],experimentos_n[-1][1], experimentos_n[-1][2]))

# normalized data (sorting/printing presumably follows outside this chunk)
experimentos = []
for i in k_s:
    clf.n_neighbors = i
    scores = cross_val_score(clf, Xs, Ys, cv=10)
    experimentos.append((i, scores.mean(), scores.std()*2))
    #print("─ K: {}, Accuracy: {} (+/- {})".format(i,scores.mean(), scores.std() * 2))
except: continue input = alignByMax(input) sub = fig.add_subplot(frameSize*110 + subjects.index(subject)) sub.plot(range(len(input)), input) sub_uniform = fig_uniform.add_subplot(frameSize*110 + subjects.index(subject)) new_time, uniform_input = inter.getUniformSampled(xrange(len(input)), input, numOfFeatures) sub_uniform.plot( xrange(numOfFeatures), uniform_input) data.append(uniform_input) tags.append(subject) plt.xlabel('Time (in frames)') plt.ylabel(joint + ' angle') plt.title('subject: ' + str(subject)) cl = KNeighborsClassifier() cl.n_neighbors = 5 cl.weights = 'distance' testSize = 35 score = crossValidate(cl, data, tags, testSize) outFile = 'out.txt' out = open(outFile, 'r') scores = [] testSizes = [] for line in out: splited = line.split() scores.append(splited[0]) testSizes.append(splited[1]) plt.figure() plt.plot(testSizes, scores)
features, labels = load_dataset('seeds.tsv')  # custom parser: the file mixes float and string data

# Keep the raw KNN estimator separate from the Pipeline that wraps it.
# FIX: the original rebound `classifier` to the Pipeline *inside* the loop,
# so (a) each iteration wrapped the previous Pipeline in another Pipeline,
# and (b) `classifier.n_neighbors = k` set an unused attribute on the
# Pipeline rather than on the KNN estimator -- k never actually changed.
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto',
                           leaf_size=30, p=2, metric='minkowski', metric_params=None)
# normalize all features to same scale
classifier = Pipeline([('norm', StandardScaler()), ('knn', knn)])

# compute 10-fold cross-validation for each odd k in [1, 19]
for k in range(1, 20, 2):
    knn.n_neighbors = k
    # FIX: reset per k -- the original accumulated means across all k
    # values, so every printed "mean accuracy" mixed in earlier k results.
    means = []
    # shuffle before creating folds: the labels are stored contiguously
    for training, testing in KFold(features.shape[0], n_folds=10, shuffle=True):
        classifier.fit(features[training], labels[training])
        predictions = classifier.predict(features[testing])
        means.append(np.mean(predictions == labels[testing]))
    print('10-fold cross-validation mean accuracy = {0:.1%} for k={1:d}'.format(np.mean(means), k))

crossed = cross_val_score(classifier, X=features, y=labels, scoring=None, cv=10, n_jobs=1)
print('10-fold cross-validation using cross_val_score = {0:.1%} for k={1:d}'.format(np.mean(crossed), k))
# Cross-validated accuracy of the logistic model for each candidate C.
for c in C_s:
    logistic.C = c
    temp = []
    for train, test in skf.split(X, y):
        logistic.fit(X[train], y[train])
        temp.append(logistic.score(X[test], y[test]))
    accs.append(temp)
accs = np.array(accs)
avg = np.mean(accs, axis=1)  # mean accuracy per C across folds
# NOTE(review): the next three bare expressions have no effect in a plain
# script -- they look like leftover notebook/REPL cells.
avg
np.argmax(avg)
C_s[np.argmax(avg)]  # best C by mean CV accuracy

# Same sweep for KNN over k = 1..10.
ks = np.linspace(1, 10, 10)
knn3 = KNeighborsClassifier()
accs2 = []
for k in ks:
    knn3.n_neighbors = int(k)  # linspace yields floats; n_neighbors must be an int
    temp = []
    for train, test in skf.split(X, y):
        knn3.fit(X[train], y[train])
        temp.append(knn3.score(X[test], y[test]))
    accs2.append(temp)
# NOTE(review): bare expressions again -- the results are neither stored
# nor printed here.
np.mean(accs2, axis=1)
np.argmax(np.mean(accs2, axis=1))
ks[np.argmax(np.mean(accs2, axis=1))]  # best k by mean CV accuracy
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from utils.amcParser import getMergedData
import time

# Shared classifier: 5 nearest neighbours with distance-weighted voting.
knn = KNeighborsClassifier()
knn.n_neighbors = 5
knn.weights = 'distance'

def crossValidate(data, tags, trainSize):
    """Fit the module-level knn and return its hit rate on the test split.

    NOTE(review): the parameters are ignored -- the body reads the
    module-level names dataTrain/tagTrain/dataTest/tagsTest, which are not
    defined in this chunk. Confirm they are assigned elsewhere in the file
    before this is called.
    """
    fit = knn.fit(dataTrain, tagTrain)
    #print(fit)
    res = knn.predict(dataTest)
    hits = 0.0
    # Count exact matches between predicted and true tags.
    for t, r in zip(tagsTest, res):
        if (t == r):
            hits += 1.0
    #print(res)
    #print(tagsTest)
    return hits / float(len(res))

sum = 0.0  # NOTE(review): shadows the builtin sum()
numOftests = 100
for i in range(numOftests):
    # Joint channels available; `chosen` is a 0/1 mask selecting a subset.
    joints = np.array(['lradius', 'rradius', 'ltibia', 'rtibia', 'lwrist', 'rwrist', 'lfingers', 'rfingers'])
    chosen = np.array([1, 1, 1, 0, 0, 0, 1, 1])
    data, tags = getMergedData(joints[chosen.astype(np.bool)])
    numOfsamples, numOfFeatures = data.shape
    np.random.seed(i)  # reproducible per-iteration randomness
    # Leave-one-out style split; the loop body appears to continue beyond
    # this chunk.
    trainSize = numOfsamples - 1  # int(0.9*numOfsamples)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Load the breast-cancer dataset and split it 80/20 into train/validation
# with a fixed seed for reproducibility.
breast_cancer_data = load_breast_cancer()
data = breast_cancer_data.data
target = breast_cancer_data.target
subsets = train_test_split(data, target, train_size=0.8, random_state=120)
training_data, validation_data, training_labels, validation_labels = subsets

# Fit once; n_neighbors is a query-time parameter for KNN, so it can be
# varied afterwards without refitting.
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(training_data, training_labels)

# Validation accuracy for every k from 1 to 100.
k_list = list(range(1, 101))
accuracies = []
for neighbour_count in k_list:
    classifier.n_neighbors = neighbour_count
    accuracies.append(classifier.score(validation_data, validation_labels))

# Plot validation accuracy against k.
plt.plot(k_list, accuracies)
plt.xlabel('k')
plt.ylabel("Validation Accuracy")
plt.title('Breast Cancer Classifier Accuracy')
plt.show()
# X_train, X_test = X[train_index], X[test_index] # y_train, y_test = y[train_index], y[test_index] # print("%s %s | %s %s" % (X_train, X_test, y_train, y_test)) # Nu vil jeg se på hvordan man kan bruge cross-validation til, at vælge den rigtige model # Loader iris datasæt iris = load_iris() X = iris.data y = iris.target print('') knn = KNeighborsClassifier() # Tester det optimale antal n_neighbors for knn for i in range(20): knn.n_neighbors = i+1 print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean()) # Opstiller de to modeller vi vil stille op mod hinanden knn.n_neighbors = 20 logreg = LogisticRegression() # Bruger cross_val_score til at få de to modeller nøjagtigheds scoring # cv=10 står for, hvor mange fold vi vil have. I dette tilfælde 10 # scoring='accuracy' er hvilken evaluation metric vi har valgt # Vi bruge mean() tilsidst for, at vi får svaret med det samme. Uden det skulle vi selv beregne gennemsnittet # Kig nederst for, at se hvad jeg mener. # Ud fra de svar vi kan vi, så vælge den model der klarede sig bedst. print('') print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean()) print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())