def main(): means = [[-1, -1], [1.0, 1.0]] variances = [np.random.rand] knn_models = [3, 5, 10] data_sizes = [10, 25, 50, 75, 100, 125, 150, 175, 200] points_per_class = 500 data = dg.generate_gaussian_mixture(class_means=means, class_variances=np.eye(2), num_components=5, num_desired_points_per_class=points_per_class) class_0 = np.hstack((data[0], np.zeros((len(data[0]), 1)))) class_1 = np.hstack((data[1], np.ones((len(data[0]), 1)))) results_train = np.empty((len(knn_models), len(data_sizes))) results_test = np.empty((len(knn_models), len(data_sizes))) train_data_class_0, test_data_class_0 = split_train_test(class_0) train_data_class_1, test_data_class_1 = split_train_test(class_1) print 'train size, test size', len(train_data_class_1), len(test_data_class_1) train_data = np.vstack((train_data_class_0, train_data_class_1)) test_data = np.vstack((test_data_class_0, test_data_class_1)) for i, knn_model in enumerate(knn_models): kncs = KNeighborsClassifier(n_neighbors=knn_model) for j, data_size in enumerate(data_sizes): curr_train_class_0, curr_train_class_1 = train_data_class_0[:data_size], train_data_class_1[:data_size] curr_train_data = np.vstack((curr_train_class_0, curr_train_class_1)) kncs.fit(curr_train_data[:, :2], curr_train_data[:, -1]) predictions_train = kncs.predict(train_data[:, :2]) predictions_test = kncs.predict(test_data[:, :2]) results_train[i][j] = len(np.where(predictions_train != train_data[:, -1])[0]) / float(len(train_data)) results_test[i][j] = len(np.where(predictions_test != test_data[:, -1])[0]) / float(len(test_data)) plt.plot(data_sizes, results_test[0, :], 'r') plt.plot(data_sizes, results_test[1, :], 'b') plt.plot(data_sizes, results_test[2, :], 'g') plt.plot(data_sizes, results_train[0, :], 'r--') plt.plot(data_sizes, results_train[1, :], 'b--') plt.plot(data_sizes, results_train[2, :], 'g--') plt.show()
def knn_est_cv(X, y, clf, n_neigh):
    """5-fold CV comparing KNN in three feature spaces.

    Per fold, `clf` (an already-configured ensemble with `estimators_`) is
    fit, then three KNN variants are scored while sweeping odd neighbour
    counts 1, 3, ... < n_neigh:
      - knn:      euclidean KNN on the raw features,
      - knn_est:  manhattan KNN on the base estimators' hard predictions,
      - knn_est1: manhattan KNN on the base estimators' class-0 probabilities.
    Returns the fold-averaged accuracies, shape (num_nn_settings, 3).
    """
    knn_est = KNeighborsClassifier(n_neigh, metric="manhattan", algorithm="brute")
    knn_est1 = KNeighborsClassifier(n_neigh, metric="manhattan", algorithm="brute")
    knn = KNeighborsClassifier(n_neigh, metric="euclidean", algorithm="brute")
    acc_folds = []
    # NOTE(review): StratifiedKFold(y, 5) is the pre-0.18 scikit-learn API,
    # and the list-returning `map` below is Python 2 behaviour — this block
    # assumes that environment.
    for train, test in StratifiedKFold(y, 5):
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        y_test = y[test]
        clf.fit(X_train, y_train)
        estimators = clf.estimators_
        # Transposed so rows are samples, columns are base estimators.
        preds_train = np.array(map(lambda e: e.predict(X_train), estimators)).T
        preds_test = np.array(map(lambda e: e.predict(X_test), estimators)).T
        preds_train_proba = np.array(map(lambda e: e.predict_proba(X_train), estimators))
        preds_test_proba = np.array(map(lambda e: e.predict_proba(X_test), estimators))
        # (estimator, sample, class) -> (sample, estimator), keeping only the
        # class-0 probability per estimator.
        p_train = preds_train_proba.swapaxes(0, 1)[:, :, 0]
        p_test = preds_test_proba.swapaxes(0, 1)[:, :, 0]
        acc = []
        # Sweep odd neighbour counts so votes cannot tie.
        for nn in xrange(1, n_neigh, 2):
            knn.set_params(n_neighbors=nn)
            knn_est.set_params(n_neighbors=nn)
            knn_est1.set_params(n_neighbors=nn)
            knn.fit(X_train, y_train)
            knn_est.fit(preds_train, y_train)
            knn_est1.fit(p_train, y_train)
            acc.append(
                [
                    accuracy_score(y_test, knn.predict(X_test)),
                    accuracy_score(y_test, knn_est.predict(preds_test)),
                    accuracy_score(y_test, knn_est1.predict(p_test)),
                ]
            )
        acc_folds.append(acc)
    # Mean over the five folds.
    return np.mean(acc_folds, axis=0)
def DecisionTreeClassifier(TrainData): features=['Month','Date','Year'] season=['Fall','Spring','Summer','Winter'] district=['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'] days=['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday','Wednesday'] time=['first','second','third'] features2 = [x for x in range(0,24)] Minute=[x for x in range(100,160)] latitude=[x for x in range(948,964)] longitude=[x for x in range(2070,2083)] features=district+Minute+features2+season+time train,validation= train_test_split(TrainData, test_size=0.4) knn = KNeighborsClassifier() knn.fit(train[features],train['Category']) KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',metric_params=None, n_jobs=1, n_neighbors=5, p=2,weights='uniform',multilabel=True) predicted=np.array(knn.predict_proba(validation[features])) model=knn.predict(validation[features]) model1=knn.predict(train[features]) print "Precision is ",precision_score(validation['Category'].values.tolist(),model,average='macro') print "Recall is ",recall_score(validation['Category'].values.tolist(),model,average='macro') print "Accuracy is ", accuracy_score(validation['Category'].values.tolist(),model) print "Training Accuracy is ", accuracy_score(train['Category'].values.tolist(),model1) result=pd.DataFrame(predicted, columns=le_crime.classes_) result['Predicted']=model result.to_csv('knnProbabilities.csv', index = True, index_label = 'Id' )
def knnSimulate(param):
    """Simulate one train/test round of KNN on synthetic two-group data.

    `param` is dict-like with keys 'n', 'p', 'effect' and 'k'.
    Returns an OrderedDict holding the simulated data, predictions,
    probabilities and resubstitution/test confusion tables and accuracies.
    """
    # Independent train and test sets drawn from the same 2-group model.
    trainSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    knnFit = KNeighborsClassifier(n_neighbors=int(param['k']))
    knnFit.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    testSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    out = OrderedDict()
    out['p'] = int(param['p'])
    out['k'] = int(param['k'])
    out['train'] = trainSet
    out['test'] = testSet
    # "resub" = resubstitution: predictions on the data the model was fit to.
    out['resubPreds'] = knnFit.predict(trainSet['x'])
    out['resubProbs'] = knnFit.predict_proba(trainSet['x'])
    out['testPreds'] = knnFit.predict(testSet['x'])
    out['testProbs'] = knnFit.predict_proba(testSet['x'])
    # Confusion table: predicted labels (rows) vs true labels (columns).
    out['resubTable'] = pd.crosstab(
        Series(out['resubPreds'], index=trainSet['y'].index),
        trainSet['y']
    )
    # Accuracy = trace / total; 1.0* forces float division under Python 2.
    out['resubAccuracy'] = (np.sum(np.diag(out['resubTable'])) /
                            (1.0 * np.sum(np.sum(out['resubTable']))))
    out['testTable'] = pd.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y']
    )
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
def knn(train,test,labels,neighbours=10, runAll=None, median=True, runPCA=True, components=80):
    """Run kd-tree KNN over the data, writing predictions to CSV files.

    Paths, checked in order:
      median=True  - median-filter the training data, fit and predict on it,
                     write output.knn-kdtree-median.csv and return 0;
      runPCA=True  - reduce train/test to `components` PCA dimensions and
                     write output.knn-kdtree-pca.csv;
      finally      - fit on the raw matrix and write output.knn-kdtree.csv.
    `runAll` is currently unused.
    """
    print "Putting training data into matrix"
    trainM = np.mat(train)
    print "Running K nearest Neighbour, default = 10"
    knn = KNeighborsClassifier(n_neighbors=neighbours, algorithm="kd_tree")
    if(median == True):
        print "Running data through Median filter..."
        trainfilter = medianfilter(train)
        knn.fit(trainfilter,labels)
        # NOTE(review): predictions here are made on the (filtered) training
        # data, not on `test` — confirm this is intentional.
        result = knn.predict(trainfilter)
        print "Writing to output file output.knn-kdtree-median.csv\n"
        fwrite(result,fname='output.knn-kdtree-median.csv')
        return(0)
    if (runPCA==True):
        trainReduce, testReduce = pca(train,test,components)
        knn.fit(trainReduce,labels) #print at beginning of this
        result = knn.predict(testReduce)
        print "Writing output to file output.knn-kdtree-pca.csv\n"
        fwrite(result,fname='output.knn-kdtree-pca.csv')
    print "Running without PCA\n"
    knn.fit(trainM,labels) #need this here - might as well print
    result = knn.predict(test)
    print "Writing output to file output.knn-kdtree.csv\n"
    fwrite(result,fname="output.knn-kdtree.csv")
def main(): init_means = [-1, 1] mean_dimensions = 10 points_per_class = 250 knc = KNeighborsClassifier(n_neighbors=5) means = [[init_mean for mean_dim in range(mean_dimensions)]for init_mean in init_means] variances = [np.eye(len(means[0])), np.eye(len(means[0]))] data = dg.generate_prob_mixture(class_means=means, class_variances=variances, num_components=5, num_desired_points=points_per_class, dim_uniform=2) class_0 = np.hstack((data[0], np.zeros((len(data[0]), 1)))) class_1 = np.hstack((data[1], np.ones((len(data[0]), 1)))) train_data_class_0, test_data_class_0 = split_train_test(class_0) train_data_class_1, test_data_class_1 = split_train_test(class_1) train_data = np.vstack((train_data_class_0, train_data_class_1)) test_data = np.vstack((test_data_class_0, test_data_class_1)) corr_ranked_features, _ = CorrelationCoefficient.rank_features(train_data[:, :-1], train_data[:, -1]) relief_ranked_features, _ = Relief.rank_features(train_data[:, :-1], train_data[:, -1]) knc.fit(train_data[:, :-1], train_data[:, -1]) pred = pred_test_default = knc.predict(test_data[:, :-1]) print len(np.where(pred != test_data[:, -1])[0]) corr_train_removed_features = remove_features(train_data[:, :-1], corr_ranked_features) corr_test_removed_features = remove_features(test_data[:, :-1], corr_ranked_features) knc.fit(corr_train_removed_features, train_data[:, -1]) pred = knc.predict(corr_test_removed_features) print len(np.where(pred != test_data[:, -1])[0]) corr_train_removed_features = remove_features(train_data[:, :-1], relief_ranked_features) corr_test_removed_features = remove_features(test_data[:, :-1], relief_ranked_features) knc.fit(corr_train_removed_features, train_data[:, -1]) pred = knc.predict(corr_test_removed_features) print len(np.where(pred != test_data[:, -1])[0]) return
class KNN_strings(object):
    '''
    KNN over Freeman-encoded images using Levenshtein (edit) distance
    between code strings.

    Because sklearn metrics operate on numeric vectors, each sample is
    represented by its *index* into self.data; the custom metric looks the
    actual strings up by index and compares them with edit_dist.
    '''
    def __init__(self, n_neighbors=1):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        self.data = []  # Freeman code strings, addressed by surrogate index
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='auto', metric=self.lev_metric)

    def lev_metric(self, x, y):
        # x and y are 1-element index vectors; compare the underlying
        # strings by edit distance.
        i, j = int(x[0]), int(y[0]) # extract indices
        # if self.data[i] == self.data[j]:
        # print self.data[i], self.data[j], edit_dist(self.data[i], self.data[j])
        return edit_dist(self.data[i], self.data[j])

    def knn_train(self, dataset, cv=1, datasplit=0.7):
        """Encode the dataset and either fit (cv<=1) or cross-validate (cv>1)."""
        images_dataset= self.dsr.read_dataset_images(dataset)
        freeman_code_dict = self.fenc.encode_freeman_dataset(images_dataset)
        _, codes, labels = self.dsr.gen_labelled_arrays(freeman_code_dict)
        self.data = codes
        # Surrogate feature matrix: one index per training string.
        X = np.arange(len(self.data)).reshape(-1, 1)
        if cv <= 1:
            self.knn.fit(X, labels)
        elif cv > 1:
            cv_result = cross_validation.cross_val_score(self.knn, X, labels, cv=cv)
            print cv_result
        print 'Training Done!'

    def knn_predict(self, test_data, score=False):
        """Encode test_data and predict labels; optionally print accuracy.

        NOTE(review): X_pred holds indices 0..len(codes)-1, but lev_metric
        resolves indices against self.data (the *training* strings), so the
        test strings never reach the metric — confirm this is intended.
        """
        images_dataset= self.dsr.read_dataset_images(test_data)
        freeman_code_dict = self.fenc.encode_freeman_dataset(images_dataset)
        _, codes, labels = self.dsr.gen_labelled_arrays(freeman_code_dict)
        X_pred = np.arange(len(codes)).reshape(-1, 1)
        predictions = self.knn.predict(X_pred)
        if score == True:
            accuracy = self.knn.score(X_pred, labels)
            print "Test Accuracy: ", accuracy
        return predictions

    def knn_predict_one(self, test_image):
        """Encode a single image and predict its label (same index caveat
        as knn_predict)."""
        image_code = self.fenc.encode_freeman(test_image)
        print image_code
        data = [image_code]
        X_pred = np.arange(len(data)).reshape(-1, 1)
        prediction = self.knn.predict(X_pred)
        return prediction
def knnRun(folds,train,test,features):
    '''
    Performs k-nearest-neighbours (k=5) with manual cross validation over the
    pre-assigned 'fold' column, using a list of features chosen from the data
    set. Returns (best fitted model, mean per-fold performance, test-set
    performance). (Docstring fixed: this trains KNN, not random forest.)
    '''
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors = 5)
    performanceList = []
    RFfit = []
    for m in range(1,folds+1):
        #Generate this fold's test and train
        foldTrain = train[train['fold'] != m]
        foldTest = train[train['fold'] == m]
        #fit the train for this fold
        RFfit.append(clf.fit(foldTrain[features],foldTrain['grantstatus']))
        #Test the fit against the fold's test
        Response = clf.predict(foldTest[features]) == foldTest['grantstatus']
        foldTest['Response'] = Response
        #Out of the response groupby grantid so we get one record for each grantid, and
        #sum the result and divide by the total number of records in a single grantid
        resultArray = foldTest['Response'].groupby(foldTest['grantid']).sum().apply(float)/foldTest.groupby(foldTest['grantid']).size().apply(float)
        #append performance for this fold to a list
        performanceList.append(sum(resultArray)/len(resultArray))
        # print 'In Sample Performance: for fold %d: %f' % (m, np.mean(performanceList))
    # Refit on the training portion of the best-scoring fold, then score on
    # the held-out test frame the same per-grantid way.
    Bestfitfold = performanceList.index(max(performanceList)) + 1
    BestfitTrain = train[train['fold'] != Bestfitfold]
    Bestfit = clf.fit(BestfitTrain[features],BestfitTrain['grantstatus'])
    testResponse = clf.predict(test[features]) == test['grantstatus']
    test['Response'] = testResponse
    #Out of the response groupby grantid so we get one record for each grantid, and
    #sum the result and divide by the total number of records in a single grantid
    resultArray = test['Response'].groupby(test['grantid']).sum().apply(float)/test.groupby(test['grantid']).size().apply(float)
    #append performance for this fold to a list
    testResult = sum(resultArray)/len(resultArray)
    return Bestfit, np.mean(performanceList), testResult
def supervisedTest01():
    """KNN demo on the iris dataset: train on all but the last 10 shuffled
    samples, then print predictions vs. true labels for those 10."""
    import numpy as np
    from sklearn import datasets
    iris = datasets.load_iris()
    iris_X = iris.data  # iris_X is the 150x4 feature matrix (2-D)
    iris_Y = iris.target  # iris_Y is the 150-element label vector (1-D)
    #print len(iris_X)
    #print len(iris_Y)
    #print np.unique(iris_Y)  # the distinct label values; 3 classes here
    np.random.seed(0)  # fixed seed so the shuffle is reproducible
    indices = np.random.permutation(len(iris_X))  # a random permutation of 0-149
    #print indices
    iris_x_train = iris_X[indices[:-10]]  # everything except the last 10 samples
    iris_y_train = iris_Y[indices[:-10]]  # labels matching iris_x_train
    iris_x_test = iris_X[indices[-10:]]  # the final 10 samples as test data
    iris_y_test = iris_Y[indices[-10:]]  # labels matching iris_x_test
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               metric_params=None, n_neighbors=3, p=2, weights='uniform')
    knn.fit(iris_x_train, iris_y_train)  # train on (train_x, train_y)
    print knn.predict(iris_x_test)
    print iris_y_test
def startknn(self):
    """Dispatch one of three KNN runs based on instance flags.

    self.wbr False    -> sklearn KNN, predictions printed to stdout;
    self.ngraph False -> sklearn KNN, predictions written to knnout.txt and
                         the k-neighbours graph of the training set printed;
    otherwise         -> the hand-rolled train/classify pair with the custom
                         distance self.dm, results written to output.txt.
    """
    temp=[]
    ff=open("knnout.txt","w+")
    if self.wbr==False :
        from sklearn.neighbors import KNeighborsClassifier
        neigh = KNeighborsClassifier(n_neighbors=self.k)
        neigh.fit(self.tr,self.classlabels)
        # Copy the test samples into a plain list before predicting.
        for i in self.te:
            temp.append(i)
        print neigh.predict(temp)
    elif self.ngraph==False:
        from sklearn.neighbors import KNeighborsClassifier
        neigh = KNeighborsClassifier(n_neighbors=self.k)
        neigh.fit(self.tr,self.classlabels)
        for i in self.te:
            temp.append(i)
        # Python 2 ">>" syntax: redirect the predictions into knnout.txt.
        print>>ff,neigh.predict(temp)
        A = neigh.kneighbors_graph(self.tr)
        print A
    else:
        # Custom implementation: classify each test point one at a time.
        a=train(self.tr,self.classlabels,self.k)
        out=open("output.txt","w+")
        for i in self.te:
            print>>out,i,classify(a,i,distance_fn=self.dm)
    print "-------KNN --------Done------------"
class KNNClassifier: def __init__(self): """ This is the constructor for the KNN Classifier """ self.outputHeader = "#knn" self.clf = None self.n_neighbors = 5 self.weights = "uniform" self.algorithm = "auto" def buildModel(self): """ This builds the model of the KNN Classifier """ self.clf = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights = self.weights, algorithm=self.algorithm) def setNeighbors(self, param): """ This sets the n neighbor for the KNN Classifier. """ self.n_neighbors = param def setAlgorithm(self, param): """ This sets the algorithm parameter for the KNN Classifier """ if param in ["auto", "ball_tree", "kd_tree", "brute"]: self.algorithm = param else: print "unknown parameter defaulting to auto." def setWeights(self, param): """ This sets the weights parameter for KNN Classifier """ self.weights = param def trainKNN(self,X, Y): """ Training the KNN Classifier """ self.clf.fit(X, Y) def validateKNN(self,X, Y): """ Validate the KNN Classifier """ YPred = self.clf.predict(X) print accuracy_score(Y, YPred) def testKNN(self,X, Y): """ Test the KNN Classifier """ YPred = self.clf.predict(X) print accuracy_score(Y, YPred)
def runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M): outFile = open('knnLog25.txt','a') print 'running mashable knn simulation' outFile.write('train==> %d, %d \n'%(train_M.shape[0],train_M.shape[1])) outFile.write('test==> %d, %d \n'%(test_M.shape[0],test_M.shape[1])) with SimpleTimer('time to train', outFile): clf = KNeighborsClassifier(weights='distance', ).fit(train_M, dataTrain.target) plot_learning_curve(clf, 'knn with %d neighbors' , train_M, dataTrain.target, cv=5, n_jobs=4) baseScore = clf.score(test_M, dataTest.target) baseParams = clf.get_params(True) baseNeighbors = baseParams['n_neighbors'] print 'baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors) outFile.write('baseline score %.3f base height %d \n' % (baseScore, baseNeighbors)) res = [] with SimpleTimer('time to fine tune number of neighbors', outFile): for neighbors in range(2,baseNeighbors * 10): # print 'training for neighbors %d' % neighbors clf = KNeighborsClassifier(n_neighbors=neighbors, weights='distance').fit(train_M, dataTrain.target) score = clf.score(hold_M, holdout.target) res.append((score, neighbors)) outFile.write('%d %.3f \n' % (neighbors, score)) res = sorted(res, key=lambda x:x[0], reverse=True) print res[:5] bestNeighbors = res[0][1] print ('best number of neighbors is %d' % bestNeighbors) outFile.write('best number of neighbors is %d and score is %.3f\n' % (bestNeighbors, res[0][0])) bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance') bestClf.fit(train_M, dataTrain.target) predicted = bestClf.predict(test_M) trainPredict = bestClf.predict(train_M) print 'testing score' outFile.write('testing score') outputScores(dataTest.target, predicted, outFile) print 'training score' outFile.write('testing score') outputScores(dataTrain.target, trainPredict, outFile) results = predicted == dataTest.target print numpy.mean(results) res = [] for i in range(len(results)): if not results[i]: res.append(i) print 'classifier got 
these wrong:' for i in res[:10]: print dataTest.data[i], dataTest.target[i] outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i])) ''' train_sizes, train_scores, valid_scores = learning_curve(DecisionTreeClassifier(), train_M, dataTrain.target, train_sizes=[50, 80, 110], cv=5) print train_sizes print train_scores print valid_scores ''' plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors, train_M, dataTrain.target, cv=5, n_jobs=4)
def predictAction(testMoments, trainMoments, trainLabels):
    """Fit a 5-NN classifier on the training moments and classify the test
    moments.

    Prints the test predictions, the resubstitution predictions and the
    training class probabilities for debugging, then returns the test
    predictions.
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(trainMoments, trainLabels)
    print(classifier.predict(testMoments))
    print(classifier.predict(trainMoments))
    print(classifier.predict_proba(trainMoments))
    return classifier.predict(testMoments)
def knn(): X = [[125,1], [200,0], [70,0], [240,1], [114,0], [120,0], [264,1], [85,0], [150,0], [90,0]] y = [ 0, 0, 0, 0, 1, 0, 0, 1, 0,1 ] model = KN(n_neighbors=3, weights='distance') model.fit(X,y) print '80k, Single, HO=Y', model.predict([80,1]) print '98k, Single, HO=N', model.predict([98,0]) print '130k, Married, HO=N', model.predict([260, 0])
def knn(X_vectors, t):
    """Evaluate 5-NN on (X_vectors, t) two ways.

    1. Leave-one-out accuracy over the whole set.
    2. 100 random 80/20 splits, averaging precision and recall for the
       positive class (label 1).
    Returns (accuracy, mean precision, mean recall, F1).
    """
    # --- leave-one-out accuracy ---
    n = len(t)
    true_num = 0
    for i in range(n):
        X_train = list(X_vectors)
        del X_train[i]
        t_train = list(t)
        del t_train[i]
        X_test = X_vectors[i]
        t_test = t[i]
        clf = KNeighborsClassifier(n_neighbors=5)
        clf.fit(X_train, t_train)
        # predict() expects 2-D input, so wrap the single held-out sample.
        y = clf.predict([X_test])[0]
        if y == t_test:
            true_num += 1
    accuracy = 1.0 * true_num / n
    # --- repeated 80/20 splits ---
    X = np.array(X_vectors)
    tt = list(t)
    pre = []
    rec = []
    for _ in range(100):
        X_train, X_test, t_train, t_test = train_test_split(X, tt, test_size=0.2)
        clf = KNeighborsClassifier(n_neighbors=5)
        clf.fit(X_train, t_train)
        y_test = clf.predict(X_test)
        # Confusion counts for the positive class.
        t_pos = 0
        f_pos = 0
        t_neg = 0
        f_neg = 0
        for i in range(len(y_test)):
            if t_test[i] == 1 and y_test[i] == 1:
                t_pos += 1
            elif t_test[i] == 0 and y_test[i] == 1:
                f_pos += 1
            elif t_test[i] == 0 and y_test[i] == 0:
                t_neg += 1
            elif t_test[i] == 1 and y_test[i] == 0:
                f_neg += 1
        if t_pos == 0:
            precision = 0
            recall = 0
        else:
            precision = 1.0 * t_pos / (t_pos + f_pos)
            recall = 1.0 * t_pos / (t_pos + f_neg)
        pre.append(precision)
        rec.append(recall)
    pre = sum(pre) / len(pre)
    rec = sum(rec) / len(rec)
    # Harmonic mean, guarded: the original computed 2/(1/pre + 1/rec) and
    # raised ZeroDivisionError whenever either average was 0.
    F = 2 * pre * rec / (pre + rec) if (pre + rec) else 0
    return accuracy, pre, rec, F
def knn_mesh_fitter(self):
    """Build meshgrid decision surfaces for uniform- and distance-weighted
    KNN over every neighbour count in self.nn_range (stored in self.Zs),
    and prepare (jittered) scatter points for the two predictor columns."""
    # Min/max of each of the two predictor variables.
    v1_min, v1_max = np.min(self.X[:, 0]), np.max(self.X[:, 0])
    v2_min, v2_max = np.min(self.X[:, 1]), np.max(self.X[:, 1])
    v1_range = v1_max - v1_min
    v2_range = v2_max - v2_min
    # Pad the plot limits by 1/buffer_denom of each range so no point sits
    # on an axis. (Fixed: the y lower bound previously subtracted
    # v2_min/buffer_denom instead of v2_range/buffer_denom.)
    self.x_min = v1_min - (v1_range / self.buffer_denom)
    self.x_max = v1_max + (v1_range / self.buffer_denom)
    self.y_min = v2_min - (v2_range / self.buffer_denom)
    self.y_max = v2_max + (v2_range / self.buffer_denom)
    # Dense grid of candidate points covering the padded plot area.
    self.xx, self.yy = np.meshgrid(np.linspace(self.x_min, self.x_max, self.granularity),
                                   np.linspace(self.y_min, self.y_max, self.granularity))
    # meshgrids:
    self.Zs = {'uniform': {}, 'distance': {}}
    for nn in self.nn_range:
        # One model per weighting scheme at this neighbour count.
        knn_mod_euc = KNeighborsClassifier(n_neighbors=nn, weights='uniform')
        knn_mod_euc.fit(self.X, self.y)
        knn_mod_w = KNeighborsClassifier(n_neighbors=nn, weights='distance')
        knn_mod_w.fit(self.X, self.y)
        # Classify every grid point so the class boundary can be drawn.
        Z = knn_mod_euc.predict(np.c_[self.xx.ravel(), self.yy.ravel()])
        Z = Z.reshape(self.xx.shape)
        self.Zs['uniform'][nn] = Z
        Z = knn_mod_w.predict(np.c_[self.xx.ravel(), self.yy.ravel()])
        Z = Z.reshape(self.xx.shape)
        self.Zs['distance'][nn] = Z
    # Jitter columns with many repeated values so points don't overplot.
    if len(np.unique(self.X[:, 0])) + 50 < self.X.shape[0]:
        self.v1_points = self.rand_jitter(self.X[:, 0])
    else:
        self.v1_points = self.X[:, 0]
    if len(np.unique(self.X[:, 1])) + 50 < self.X.shape[0]:
        self.v2_points = self.rand_jitter(self.X[:, 1])
    else:
        self.v2_points = self.X[:, 1]
def predictAction(testMoments, trainMoments, trainLabels):
    """Train a 5-nearest-neighbour model on the moment features and return
    its predictions for the test moments.

    Also echoes the resubstitution predictions as a quick sanity check.
    """
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(trainMoments, trainLabels)
    print(model.predict(trainMoments))
    return model.predict(testMoments)
def knn(X_train, y_train, X_test, y_test):
    """Fit a 1-NN classifier and report its train and test accuracy.

    Neighbour queries use up to a third of the available CPU cores.
    Returns (train_accuracy, test_accuracy).
    """
    workers = max(1, multiprocessing.cpu_count() // 3)
    model = KNeighborsClassifier(n_neighbors=1, n_jobs=workers)
    model.fit(X_train, y_train)
    acc_train = skl_metrics.accuracy_score(y_true=y_train, y_pred=model.predict(X_train))
    acc_test = skl_metrics.accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
    return acc_train, acc_test
def apply_knn(k_set, traing_data, train_labels, test_data, test_labels):
    """Return the test misclassification rate (rounded to 3 dp) for a KNN
    trained at each k in `k_set`."""
    misclass = []
    for k in k_set:
        # p = 2 and metric='minkowski' combination uses euclidean distance
        knn_train_test = KNeighborsClassifier(n_neighbors=k, weights='uniform', p=2, metric='minkowski')
        knn_train_test.fit(traing_data, train_labels)
        # (Removed a discarded knn_train_test.predict(test_data) call —
        # score() already performs the prediction internally, so the extra
        # call only doubled the work.)
        acc = knn_train_test.score(test_data, test_labels)
        misclass.append(round(1 - acc, 3))
    return misclass
def expAlgorithm(X, y, X_val, y_val, p):
    """Fit a distance-weighted 5-NN classifier and report error rates.

    Project naming convention:
      "train" - the labelled portion the model is fit on,
      "test"  - the held-out labelled portion used for validation,
      "quiz"  - the unlabelled points submitted to kaggle.
    Trains on (X, y), prints the training error, then evaluates on
    (X_val, y_val) and returns the validation error rate.
    """
    start = time.time()
    print('Received data, took this many seconds: ' + str(time.time() - start))
    # Training classifier
    clf1 = KNeighborsClassifier(
        n_neighbors=5,
        weights = "distance",
        algorithm= "auto"
        )
    clf1.fit(X, y)
    # Resubstitution error: fraction of training labels we get wrong.
    y_hat = clf1.predict(X)
    train_err = sum(1 for truth, guess in zip(y, y_hat) if truth != guess)
    train_err = float(train_err) / float(len(y))
    print("Train err: " + str(train_err))
    print("Beginning test validation...")
    y_val_hat = clf1.predict(X_val)
    test_err = sum(1 for truth, guess in zip(y_val, y_val_hat) if truth != guess)
    print(len(y_val))
    print(len(y_val_hat))
    test_err = float(test_err) / float(len((y_val)))
    print("Test error: " + str(test_err))
    return test_err
def classifier(train,test,train_target,test_target): kclass = KNeighborsClassifier(n_neighbors=13,algorithm='kd_tree',weights='uniform',p=1) kclass.fit(train,train_target) res = kclass.predict(train) print classification_report(train_target,res) res1 = kclass.predict(test) print classification_report(test_target,res1) return kclass
def predict_author(arr, yazar_features, yazar_classes): results = [] print "\n[DEBUG] K-NN result (neighbors: 10)" knn = KNeighborsClassifier(n_neighbors=10) knn.fit(yazar_features, yazar_classes) print knn.predict(arr) results.append(knn.predict(arr)[0]) print "\n[DEBUG] SVC result (linear) (degree=3)" svc = svm.SVC(kernel='linear', degree=3) svc.fit(yazar_features, yazar_classes) print svc.predict(arr) results.append(svc.predict(arr)[0]) print "\n[DEBUG] Logistic Regression result ()" regr = linear_model.LogisticRegression() regr.fit(yazar_features, yazar_classes) print regr.predict(arr) results.append(regr.predict(arr)[0]) print "\n[DEBUG] Gaussian Naive Bayes" gnb = GaussianNB() gnb.fit(yazar_features, yazar_classes) print gnb.predict(arr) results.append(gnb.predict(arr)[0]) print "\n[DEBUG] Decision Tree Classifier" dtc = tree.DecisionTreeClassifier() dtc.fit(yazar_features, yazar_classes) print dtc.predict(arr) results.append(dtc.predict(arr)[0]) print "\n[DEBUG] Gradient Boosting Classification" gbc = GradientBoostingClassifier() gbc.fit(yazar_features, yazar_classes) print gbc.predict(arr) results.append(gbc.predict(arr)[0]) # output = open('features.pkl', 'wb') # pickle.dump(yazar_features, output) # output.close() # output = open('classes.pkl', 'wb') # pickle.dump(yazar_classes, output) # output.close() # test_yazar_features = [] # for test data # test_yazar_classes = [] # for test classes # # yazar_features = [] # for train data # # yazar_classes = [] # for train classes return results
def knnclassifier(train, train_target, test, test_target, k): classif = KNeighborsClassifier(n_neighbors = k,algorithm='kd_tree',weights='uniform',p=1) classif.fit(train,train_target) res = classif.predict(train) print '*************************** knn ****************' print classification_report(train_target,res) res1 = classif.predict(test) print classification_report(test_target, res1) return classif
def get_iris_dataset():
    """Walk through the iris dataset: inspect it, split train/test, plot a
    scatter matrix, fit a 1-NN model, classify one new sample and score the
    test split."""
    iris_dataset = load_iris()
    # 1. The format of the dataset
    print("Keys of iris_dataset: \n{}".format(iris_dataset.keys()))
    print("Target names: {}".format(iris_dataset['target_names']))
    print("Feature names: \n{}".format(iris_dataset['feature_names']))
    # data -> numpy.ndarray; rows are samples, columns are features
    print("Type of data: {}".format(iris_dataset['data'].shape))  # (150,4)
    print("Type of target: {}".format(iris_dataset['target'].shape))  # (150,)
    # 2. split the dataset into training set and testing set: y = f(X)
    X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], test_size=0.2, random_state=0)
    print("X_train shape: {}".format(X_train.shape))
    print("y_train shape: {}".format(y_train.shape))
    print("X_test shape: {}".format(X_test.shape))
    print("y_test shape: {}".format(y_test.shape))
    # 3. inspect the data - visualize it
    # convert the NumPy array into a pandas DataFrame
    iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
    # Fixed: pd.scatter_matrix was deprecated and then removed from pandas;
    # the function now lives in pandas.plotting.
    grr = pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
                                     hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)
    plt.show()
    # The model
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=1)
    # build the model on the training set
    knn.fit(X_train, y_train)
    # the prediction for one new, unseen flower
    X_new = np.array([[5, 2.9, 1, 0.2]])
    prediction = knn.predict(X_new)
    print("Prediction: {}".format(prediction))
    print("Predicted target name: {}".format(iris_dataset['target_names'][prediction]))
    y_pred = knn.predict(X_test)
    print("Test set predictions:\n {}".format(y_pred))
    print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))
def train_and_test(X_train, y_train, X_test, y_test, distance, n_neighbors):
    """Fit a KNN with the given metric and neighbour count.

    Returns (train_acc, test_acc) as percentage strings formatted to two
    decimal places.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='auto', metric=distance)
    model.fit(X_train, y_train)
    train_acc = '%.2f' % (accuracy_score(y_train, model.predict(X_train)) * 100)
    test_acc = '%.2f' % (accuracy_score(y_test, model.predict(X_test)) * 100)
    return train_acc, test_acc
def classify(train_feature_matrix, train_classes, test_features, classifier):
    """Train the model selected by `classifier` (1-10) and return the
    predicted genre for `test_features`.

    1: KNN(23)   2: soft-voting KNN ensemble   3: GaussianNB
    4: SGD (modified huber)   5: RandomForest  6: DecisionTree
    7: default KNN   8: default SGD   9: AdaBoost(100)
    10: SVC with a precomputed gram kernel.
    """
    if classifier == 1:
        model = KNeighborsClassifier(23, weights='distance', p=1)
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 2:
        # Fixed: two ensemble members were both named 'knn4'; sklearn's
        # VotingClassifier requires unique estimator names.
        model = VotingClassifier(estimators=[
            ('knn1', KNeighborsClassifier(10, weights='distance', p=1)),
            ('knn2', KNeighborsClassifier(30, weights='distance', p=1)),
            ('knn3', KNeighborsClassifier(50, weights='distance', p=1)),
            ('knn4', KNeighborsClassifier(70, weights='distance', p=1)),
            ('knn5', KNeighborsClassifier(90, weights='distance', p=1))],
            voting='soft')
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 3:
        model = GaussianNB()
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 4:
        model = SGDClassifier(loss='modified_huber', class_weight='balanced', penalty='l1')
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 5:
        model = RandomForestClassifier()
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 6:
        model = DecisionTreeClassifier()
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 7:
        model = KNeighborsClassifier()
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 8:
        model = SGDClassifier()
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 9:
        model = AdaBoostClassifier(n_estimators=100)
        model.fit(train_feature_matrix, train_classes.ravel())
        prediction = model.predict(test_features)[0]
    elif classifier == 10:
        # Precomputed kernel: fit on the train x train gram matrix, predict
        # with the train x test gram matrix.
        model = SVC(kernel='precomputed')
        model.fit(gram(train_feature_matrix, train_feature_matrix), train_classes.ravel())
        prediction = model.predict(gram(train_feature_matrix, test_features))[0]
    return genre_from_int(prediction)
def main(output=RESULTS1B):
    """
    Using 1 nearest neighbor, predicts NYC Taxi trip times based on feature
    vectors (pickup latitude, pickup longitude, dropoff latitude, dropoff
    longitude). Tests on a subset of trip_data_1.csv.

    Uses sklearn to implement nearest neighbors; appends run stats to
    `output`.
    """
    features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'trip_time_in_secs']
    ## Extract necessary data into pandas dataframes
    numrows = 100000
    df_train_read = pd.read_csv(TRAIN_DATA)
    df_test_read = pd.read_csv(TRIP_DATA_1, nrows = numrows) # first 100k rows, for speed
    # Drop rows with missing coordinates/times before fitting.
    df_test = df_test_read[features].dropna()
    df_train = df_train_read[features].dropna()
    ## Use sklearn to run nearest neighbors
    # NOTE(review): trip time is continuous, yet this uses a *classifier*
    # (each distinct time becomes a class). With k=1 this amounts to copying
    # the nearest neighbour's time; a KNeighborsRegressor may have been
    # intended — confirm before changing.
    k = 1
    clf = KNeighborsClassifier(n_neighbors=k) # default distance metric: euclidean
    clf.fit(df_train[features[0:4]], df_train[features[-1]])
    preds = clf.predict(df_test[features[0:4]])
    # # Calculate statistics (Root Mean Squared Error, Correlation Coefficient, Mean Absolute Error)
    print "Calculating statistics"
    with open(output, "a+") as outputFile:
        outputFile.write("Ran knn with k={}".format(k) + \
            " Trained on {}. Tested on first".format(TRAIN_DATA) + \
            " {} rows of {}. Stats:".format(numrows, TRIP_DATA_1))
        calcAndLogStats(
            numpy.array(preds),
            numpy.array(df_test[features[-1]]),
            output=output)
def kann_classify(train_data, train_label, test_data):
    """Predict labels for test_data with a 5-NN model trained on
    (train_data, train_label).

    Writes the predictions to sklearn_knn_Result.csv and returns them.
    """
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(train_data, ravel(train_label))
    predictions = model.predict(test_data)
    save_result(predictions, 'sklearn_knn_Result.csv')
    return predictions
def main():
    """Grid-search a k-NN classifier over n_neighbors/algorithm/weights,
    then predict on the held-out test matrix and write a submission file."""
    print("k nearest neighbours classifier!")
    X,Y,Xtest = importdata()
    print(Y.shape)
    param_grid={
        "n_neighbors":[10,20,50,100,200],
        "algorithm":['auto','ball_tree','kd_tree','brute'],
        "weights":['uniform','distance']
    }
    knn = KNeighborsClassifier()
    # 5-fold grid search over the parameter grid above
    Gridsearch_impl(X,Y,knn,param_grid,5)
    # for i in range(10,11,5):
    # clf = DecisionTreeClassifier(min_samples_split=i)
    # rf = RandomForestClassifier(n_estimators = 100,random_state=0,min_samples_split=i)
    # ab = AdaBoostClassifier(rf,n_estimators = 10)
    #ab = GradientBoostingClassifier(n_estimators = 100)
    # score = cross_validation.cross_val_score(ab,X,Y,cv=3)
    # print(score)
    # print("average score %f"%np.mean(score))
    # print("std %f"%np.std(score))
    # ab.fit(X,Y)
    # NOTE(review): GridSearchCV fits clones, not the passed estimator — if
    # Gridsearch_impl does not fit `knn` in place, this predict raises
    # NotFittedError. Confirm Gridsearch_impl's behavior.
    Ytest = knn.predict(Xtest)
    output(Ytest,'submit3.csv')
def analyze_image(self):
    '''
    Load the image and analyze it with KNN.
    im_file - pre-processed with histogram specification.
    Returns a 2-D array of per-pixel class labels (masked pixels set to
    Labels.Masked) and displays it via display_current.
    '''
    # Lazily build the reference classes on first use.
    if self._avg_pixels.size == 0:
        self._process_annotations()
        self._get_initial_classes()
    im = self._image
    rows = im.shape[0]
    clf = KNeighborsClassifier(n_neighbors = self._n_neighbors)
    # Training set: one averaged pixel per class, labelled accordingly.
    clf.fit(self._avg_pixels, self._labels)
    # One row per pixel; assumes a 3-channel image — TODO confirm.
    im_1d = im.reshape(-1, 3)
    # calculate prediction reshape into image
    prediction = clf.predict(im_1d)
    prediction = prediction.reshape(rows, -1)
    # Pixels outside the mask are forced to the sentinel Masked label.
    prediction [self._mask == 0] = Labels.Masked
    self.display_current(prediction)
    return prediction
# Compare a hand-rolled KNN implementation against sklearn's on the mobile
# price-range dataset.
mobile_data = pd.read_csv('clearDataset.csv')
X, Y = mobile_data.drop(['price_range'], axis=1), mobile_data['price_range']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
# Keep an unmodified copy of the features for the sklearn model below,
# because x_train is about to get the label column appended.
x_1_train = copy.deepcopy(x_train)
x_train = pd.concat([x_train, y_train], axis=1)
num_neighbors = 5
n_folds = 5
model = KNN(num_neighbors, n_folds)
# scores = model.evaluate_algorithm(x_train.values, model.k_nearest_neighbors, n_folds, num_neighbors)
scores2 = model.fit(x_train, y_train)
y_pred = model.predict(x_train, x_test)
print(f'CV scores: {scores2}')
# Mean cross-validation score of the custom model.
mn = sum(scores2) / float(len(scores2))
print(f'Train data accuracy: {mn}')
test_score = model.accuracy_metric(y_test.values, y_pred)
print(f'Test data accuracy: {test_score}')
# Baseline: sklearn's KNeighborsClassifier on the same split.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(x_1_train, y_train)
y_pred = neigh.predict(x_test)
print(f'Sklearn accuracy score: {accuracy_score(y_test, y_pred)}')
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) #print(X_train.shape) #print(X_val.shape) #print(X_test.shape) #comparing models logr=linear_model.LogisticRegression(solver= 'liblinear').fit(X_train, y_train) logrPred= logr.predict(X_val) print("LR :", accuracy_score(y_val, logrPred)) result =[] result.append(logrPred) knn = KNeighborsClassifier().fit(X_train, y_train) knnPred= knn.predict(X_val) print("knn :", accuracy_score(y_val, knnPred)) result.append(knnPred) gus = GaussianNB().fit(X_train, y_train) gusPred= gus.predict(X_val) print("Gaussian :", accuracy_score(y_val, gusPred)) result.append(gusPred) svm = SVC().fit(X_train, y_train) svmPred = svm.predict(X_val) print("SVC :", accuracy_score(y_val, svmPred)) result.append(svmPred)
# Build a 1-NN image classifier from per-directory training images, then
# classify a captured image sequence.
labels = []
for d in dirs:
    files = glob.glob('C:/img/dataset/' + d + '/*.jpg')
    for file in files:
        img = cv2.imread(file)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Binarise: pixels > 200 become 1, everything else 0.
        (t, img) = cv2.threshold(img, 200, 1, cv2.THRESH_BINARY)
        imgs.append(cv2.resize(img, (45, 45)))
        labels.append(d)  # directory name doubles as the class label
# Flatten each 45x45 image into a 2025-element integer vector.
for img in imgs:
    img = img.flatten()
    dataset.append(np.array(img, dtype=int))
knn = KNN(n_neighbors=1)
#print(dataset)
knn.fit(dataset, labels)
imagepath = "C:/img/1502099642.3551896('192.168.8.119', 53690)"
# next(os.walk(...))[2] yields the plain files of that directory.
files1 = next(os.walk(imagepath + "/"))[2]
files1 = sorted(files1, key=lambda x: sortfiles(x))
images1 = []
# Apply the identical preprocessing pipeline to the query images.
for file in files1:
    img1 = cv2.resize(cv2.imread(imagepath + '/' + file), (45, 45))
    img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    (t, img1) = cv2.threshold(img1, 200, 1, cv2.THRESH_BINARY)
    img1 = img1.flatten()
    img1 = np.array(img1, dtype=int)
    images1.append(img1)
arr = knn.predict(images1)
print(arr)
end = time.time()
print(end - start)  # `start` is set earlier in the file
def Question3_B(Xapp, Yapp, Xdev, Ydev):
    """Benchmark a k-NN classifier with and without PCA dimensionality
    reduction: measures error rate and wall-clock time on (Xdev, Ydev) for a
    range of PCA sizes, then plots both curves (error on the left axis, time
    on the right) and saves the figure to question3_B.png."""
    # --- Baseline: k-NN on the raw features (no PCA) ---
    start_time = time()
    print("Taille de l'échantillon : ", Xdev.shape[0])
    print("Classifieur plus proches voisins")
    neigh = KNeighborsClassifier(n_jobs=-1)
    neigh.fit(Xapp, Yapp)
    print("START")
    erreur = 0
    result = neigh.predict(Xdev)
    tiBase = (time() - start_time)  # baseline fit+predict time
    for i in range(0, Xdev.shape[0]):
        if (result[i] != Ydev[i]):
            erreur += 1
    teBase = (erreur / Xdev.shape[0])  # baseline error rate
    print("--- ", tiBase, " seconds ---")
    print("END")
    print("Taux d\'erreur : ", teBase, "\n")
    # --- PCA sweep: sizes 10..40 step 10, then 50..n_features step 50 ---
    teList = []
    timeList = []
    sizeList = [i for i in range(10, 50, 10) ] + [i for i in range(50, Xapp.shape[1], 50)]
    for size in sizeList:
        start_time = time()
        pca = PCA(size)
        pca.fit(Xapp)  # PCA fit on training data only
        XappPCA = pca.transform(Xapp)
        X = pca.transform(Xdev)
        Y = Ydev
        print("Taille de l'échantillon : ", X.shape[0])
        print("Classifieur plus proches voisins, ACP de taille ", size)
        neigh = KNeighborsClassifier(n_jobs=-1)
        neigh.fit(XappPCA, Yapp)
        print("START")
        erreur = 0
        result = neigh.predict(X)
        # timing includes PCA fit/transform plus k-NN fit/predict
        ti = (time() - start_time)
        for i in range(0, X.shape[0]):
            if (result[i] != Y[i]):
                erreur += 1
        print("--- ", ti, " seconds ---")
        print("END")
        te = (erreur / X.shape[0])
        print("Taux d'erreur : ", te, "\n")
        teList.append(te)
        timeList.append(ti)
    # --- Plot error (ax1) and time (ax2) against PCA dimension ---
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    lgd1_1 = ax1.plot(
        sizeList,
        teList,
        'r-o',
        label="Taux d'erreur en fonction de la dimension de L'ACP")
    # Baseline plotted as a single point at the full feature dimension.
    lgd1_2 = ax1.plot(Xdev.shape[1], teBase, 'g-o', label="Taux d'erreur sans ACP")
    lgd2_1 = ax2.plot(
        sizeList,
        timeList,
        'b-x',
        label="Temps de traitemet en fonction de la dimension de L'ACP")
    lgd2_2 = ax2.plot(Xdev.shape[1], tiBase, 'g-x', label="temps de traitement sans ACP")
    # Merge the legend handles from both axes into a single legend box.
    lgds = lgd1_1 + lgd1_2 + lgd2_1 + lgd2_2
    lbls = [lgd.get_label() for lgd in lgds]
    ax1.legend(lgds, lbls, loc=2)
    ax1.set_xlabel("Dimension de l'ACP")
    ax1.set_ylabel("Taux d'erreur", color='r')
    ax2.set_ylabel("Temps de traitement de l'échantillon \nEn seconde", color='b')
    plt.show()
    fig.savefig("question3_B.png")
def assign_labels(X_total, X_pred, y_pred):
    """Propagate labels from the labelled points (X_pred, y_pred) to every
    point in X_total via a 1-nearest-neighbour lookup."""
    return (
        KNeighborsClassifier(n_neighbors=1)
        .fit(X_pred, y_pred)
        .predict(X_total)
    )
Y_test = test_file['Speaker'].values #Standardising the values. '''std_scaler_x = preprocessing.StandardScaler().fit(X_train) std_scaler_y = preprocessing.StandardScaler().fit(y_train) X_train_std = std_scaler_x.transform(X_train) X_test_std = std_scaler_x.transform(X_test) y_train_std = std_scaler_y.transform(y_train) y_test_std = std_scaler_y.transform(y_test)''' # Prediction #KNN algorithm KNN_clf= KNeighborsClassifier(n_neighbors=1) KNN_clf.fit(X_train, Y_train) Y_pred= KNN_clf.predict(X_test) print Y_pred KNN_accuracy= accuracy_score(Y_test, Y_pred) print KNN_accuracy Date = input_file['Date'].values test_date = np.array(Date[456:]) datetimes = [datetime.strptime(t, "%Y-%m-%d") for t in test_date] date = matplotlib.dates.date2num(datetimes) hfmt = matplotlib.dates.DateFormatter('%d-%m-%Y') fig = plt.figure() fig.canvas.set_window_title('Moisture Prediction using Machine Learning') ax = fig.add_subplot(1,1,1)
# Column 0 holds the label; everything else is a feature.
trainFeatures = train.drop([0], axis=1)
trainLabels = train[0]
valFeatures = val.drop([0], axis=1)
valLabels = val[0]
testFeatures = test.drop([0], axis=1)
testLabels = test[0]
#----------KNN----------
print("\n\n----------KNN----------")
knnClassifier = KNeighborsClassifier()
knnClassifier.fit(trainFeatures, trainLabels)
# Report accuracy on train, validation and test splits.
knnTrainPredictions = knnClassifier.predict(trainFeatures)
knnTrainAccuracy = metrics.accuracy_score(knnTrainPredictions, trainLabels)
print('Train Accuracy : ', knnTrainAccuracy)
knnValPredictions = knnClassifier.predict(valFeatures)
knnValAccuracy = metrics.accuracy_score(knnValPredictions, valLabels)
print('Val Accuracy : ', knnValAccuracy)
knnTestPredictions = knnClassifier.predict(testFeatures)
knnTestAccuracy = metrics.accuracy_score(knnTestPredictions, testLabels)
print('Test Accuracy : ', knnTestAccuracy)
#----------SVM----------
print("\n\n----------SVM----------")
svmClassifier = svm.LinearSVC()
#svmClassifier = svm.SVC()
from sklearn.neighbors import KNeighborsClassifier # make an instance of a KNeighborsClassifier object knn = KNeighborsClassifier(n_neighbors=1) type(knn) print knn # fit the knn model. What might the function be called? Documentation... knn = KNeighborsClassifier(n_neighbors=1) knn.fit(X, y) # make predictions on this input: [3, 5, 4, 2] # Again, what might the prediction function be called for knn? X1 = [[3, 5, 4, 2]] print(knn.predict(X1)) # now make predictions for [3, 5, 4, 2], [5, 4, 3, 2] X2 = [[3, 5, 4, 2], [5, 4, 3, 2]] print(knn.predict(X2)) # confirm prediction is an numpy array print type(knn.predict(X)) # instantiate the model (using the value K=5) knn = KNeighborsClassifier(n_neighbors=5, weights='distance') # fit the model with data knn.fit(X, y) X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
def main():
    """Extract mean-pooled AlexNet features for train/validation images,
    project them with PCA (fit on train only), and classify with k-NN,
    reporting train and validation accuracy."""
    folder, image_path = sys.argv[1], sys.argv[2]
    train_loader, val_loader = get_dataloader(folder, batch_size=1)
    #draw_train_loader, draw_val_loader = get_dataloader10(folder, batch_size = 1)
    if torch.cuda.is_available():
        extractor = alexnet(pretrained=True).features.cuda()
    else:
        extractor = alexnet(pretrained=True).features
    extractor.eval()  # hoisted: no need to re-enter eval mode per batch

    def _extract(loader):
        # Mean-pool each AlexNet feature map into a FEATURES_EXTRACTED-dim
        # vector; returns (features, labels) as numpy arrays.
        feats, ys = [], []
        for batch, (x, label) in enumerate(tqdm(loader), 1):
            if torch.cuda.is_available():
                x = x.cuda()
                label = label.cuda()
            feat = extractor(x).view(x.size(0), FEATURES_EXTRACTED, -1)
            feat = torch.mean(feat, 2)
            feats.append(feat.cpu().detach().numpy())
            ys.append(label.item())
        # reshape(-1, ...) replaces the hard-coded dataset sizes (6987/1526)
        # so the pipeline works for any dataset size.
        return np.array(feats).reshape(-1, FEATURES_EXTRACTED), np.array(ys)

    feats_train, train_y = _extract(train_loader)
    #pca_feats_train = do_pca(feats_train)
    # Fit the scaler and PCA on the training data only.
    scaler = StandardScaler().fit(feats_train)
    x_train_std = scaler.transform(feats_train)
    pca = PCA(n_components=25).fit(x_train_std)
    pca_feats_train = pca.transform(x_train_std)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(pca_feats_train, train_y)

    # parsing valid data
    feats_valid, valid_y = _extract(val_loader)
    # BUG FIX: the original fit a *new* StandardScaler on the validation set;
    # validation features must be transformed with the training scaler so the
    # two feature spaces are comparable.
    x_valid_std = scaler.transform(feats_valid)
    pca_feats_valid = pca.transform(x_valid_std)

    train_pred = knn.predict(pca_feats_train)
    val_pred = knn.predict(pca_feats_valid)
    train_accuracy = accuracy_score(train_pred, train_y)
    val_accuracy = accuracy_score(val_pred, valid_y)
    print("accuracy on training data: {}".format(train_accuracy))
    print("accuracy on validation data: {}".format(val_accuracy))
test_scores.append(knn1.score(X_test, y_test)) #training max_train_score = max(train_scores) train_scores_ind = [ i for i, v in enumerate(train_scores) if v == max_train_score ] print('Max train score {} % and k = {}'.format( max_train_score * 100, list(map(lambda x: x + 1, train_scores_ind)))) #testing max_test_score = max(test_scores) test_scores_ind = [i for i, v in enumerate(test_scores) if v == max_test_score] print('Max test score {} % and k = {}'.format( max_test_score * 100, list(map(lambda x: x + 1, test_scores_ind)))) plt.figure(figsize=(12, 5)) p = sns.lineplot(range(1, 20), train_scores, marker='*', label='Train Score') p = sns.lineplot(range(1, 20), test_scores, marker='o', label='Test Score') plt.show() y_pred = knn1.predict(X_test) confusion_matrix(y_test, y_pred) print( pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)) print(classification_report(y_test, y_pred))
# Cross-validate a list of candidate models, box-plot the fold scores, then
# evaluate the chosen model (k-NN) on the held-out validation split.
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# Evaluate each model in turn
results = []
names = []
for name, model in models:
    # Fixed seed so every model sees identical folds.
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
# Report SVM results, then train/evaluate a k-NN model the same way
# (Python 2: `print tabulate(...)` statements). The confusion matrices are
# converted to lists with row labels prepended for tabulate.
print(metrics.classification_report(expected,predicted_svm))
print(metrics.confusion_matrix(expected,predicted_svm))
cm_svm=metrics.confusion_matrix(expected,predicted_svm)
cm_svm_list=cm_svm.tolist()
cm_svm_list[0].insert(0,'Real True')
cm_svm_list[1].insert(0,'Real False')
print tabulate(cm_svm_list,headers=['Real/Pred','Pred True', 'Pred False'])
K_NN=KNeighborsClassifier()
K_NN.fit(x_train,y_train.ravel())
predicted_knn=K_NN.predict(x_test)
accuracy_knn=K_NN.score(x_test,y_test)
print(accuracy_knn)
print(metrics.classification_report(expected,predicted_knn))
print(metrics.confusion_matrix(expected,predicted_knn))
cm_knn=metrics.confusion_matrix(expected,predicted_knn)
cm_knn_list=cm_knn.tolist()
cm_knn_list[0].insert(0,'Real True')
cm_knn_list[1].insert(0,'Real False')
print tabulate(cm_knn_list,headers=['Real/Pred','Pred True', 'Pred False'])
# Next model in the comparison (trained later in the file).
dtc=DecisionTreeClassifier(random_state=42)
# -*- coding:utf-8 -*- from sklearn.datasets import load_iris import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.cross_validation import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import classification_report iris = load_iris() type(iris) type(iris.data) type(iris.target) iris.data.shape print(iris.DESCR) x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=33) ss = StandardScaler() x_train = ss.fit_transform(x_train) x_test = ss.transform(x_test) kn = KNeighborsClassifier() kn.fit(x_train, y_train) kn.score(x_test, y_test) y_predict = kn.predict(x_test) print(classification_report(y_test, y_predict, target_names=iris.target_names))
# Load the test set, then train four classifiers (DT/RF via RFE feature
# elimination, k-NN, SVM) on the training data and predict on the test set.
test_data = pd.read_csv('dataset/balanced_noTimestamp_mixTest.csv')
test_labels = test_data.iloc[:, -1]  # separate labels of testing set
test_data.drop(test_data.columns[label_index], axis=1, inplace=True)
dt_clf = DecisionTreeClassifier()  # Train DecisionTreeClassifier
# RFE with n_features_to_select=None keeps half the features by default.
selector_dt = RFE(dt_clf, None, step=1).fit(train_data, train_labels)
predicted_test_dt = selector_dt.predict(test_data)
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=6,
                                random_state=0)  # RandomForestClassifier
selector_rf = RFE(rf_clf, None, step=1).fit(train_data, train_labels)
predicted_test_rf = selector_rf.predict(test_data)
knn_clf = KNeighborsClassifier(n_neighbors=5).fit(
    train_data, train_labels)  # Train KNN classifier
predicted_test_knn = knn_clf.predict(test_data)
# Train SVM classifier
svc_clf = svm.SVC(gamma='auto',
                  kernel='rbf',
                  decision_function_shape='ovo',
                  max_iter=-1,
                  probability=False,
                  random_state=None,
                  shrinking=True,
                  tol=0.001,
                  verbose=False).fit(train_data, train_labels)
predicted_test_svc = svc_clf.predict(test_data)
# MLP constructor continues past this excerpt.
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
# (continuation: closes a call that began before this excerpt)
random_state=0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Scaler fit on training data only; test data only transformed.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting classifier to the Training set
# Create your classifier here
from sklearn.neighbors import KNeighborsClassifier
# minkowski with p=2 is the Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
# Dense grid over the (2-D) feature space for the decision-boundary plot.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
              step=0.01))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # In[99]: from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score knn = KNeighborsClassifier(n_neighbors=7) knn.fit(X_train, y_train) # In[100]: pred = knn.predict(X_test) knn.score(X_test, y_test) # In[101]: neighbors = [] cv_scores = [] from sklearn.model_selection import cross_val_score # perform 10 fold cross validation for k in range(1, 51, 2): neighbors.append(k) knn = KNeighborsClassifier(n_neighbors=k) scores = cross_val_score(knn, X_train, y_train, cv=20, scoring='accuracy') cv_scores.append(scores.mean())
#iris.data.shape
'''
View the dataset description
'''
# Compare a default k-NN classifier against one configured with the
# Manhattan distance on the iris dataset.
print (iris.DESCR)
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.25,random_state=33)
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# BUG FIX: `knc` was used below (fit/predict/score) but never created, which
# raised NameError. Instantiate the default (Euclidean) classifier alongside
# the configured one.
knc = KNeighborsClassifier()
knc1 = KNeighborsClassifier(n_neighbors=5,p=1,metric='minkowski')  # k value; p=1 Manhattan distance, p=2 Euclidean distance
knc.fit(X_train,y_train)
knc1.fit(X_train,y_train)
y_predict = knc.predict(X_test)
y_predict1 = knc1.predict(X_test)
from sklearn.metrics import classification_report
print ('Accuracy of K-Nearest Neighbor Classifier is:',knc.score(X_test,y_test))
print (classification_report(y_test,y_predict,target_names=iris.target_names))
# BUG FIX: the "with setting" line reported knc.score again; it must report
# the configured classifier knc1.
print ('Accuracy of K-Nearest Neighbor Classifier with setting is:',knc1.score(X_test,y_test))
print (classification_report(y_test,y_predict1,target_names=iris.target_names))
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
#def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
#
#    # setup marker generator and color map
#    markers = ('s', 'x', 'o', '^', 'v')
#    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
#    cmap = ListedColormap(colors[:len(np.unique(y))])
#
# Filter to female patients, fit a 5-NN heart-disease classifier, and show
# the predictions in a 3-D scatter of three chosen features.
plt.legend(labels=['Female', 'Male'])
female_data = df[df['sex'] == 0]  #Only consider female
female_data.info()
feature_lis = ['trestbps', 'oldpeak', 'age']
feature_1 = female_data[feature_lis[0]]
feature_2 = female_data[feature_lis[1]]
feature_3 = female_data[feature_lis[2]]
target = female_data['target']
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
# NOTE(review): `target` covers only female rows while X is defined earlier
# in the file — if X spans all patients the lengths mismatch and fit raises.
# Presumably X should be the female-only feature matrix; confirm upstream.
knn_clf.fit(X, target)
y_pred = knn_clf.predict(X)
fig3 = plt.figure(10, figsize=(8, 5))
ax = fig3.add_subplot(111, projection='3d')
ax.scatter(feature_1,
           feature_2,
           feature_3,
           c=y_pred,
           cmap=mcolors.ListedColormap(colors))
# `scatter` here refers to a handle created earlier in the file.
plt.legend(handles=scatter.legend_elements()[0],
           labels=['No Heart Desease', 'Heart Desease'])
ax.set_xlabel(feature_lis[0])
ax.set_ylabel(feature_lis[1])
ax.set_zlabel(feature_lis[2])
ax.set_title('KNN')
plt.show()
# Exported IPython session history: bare expressions (df_new, X, y_test, ...)
# were REPL echoes and have no effect when run as a script.
df_new
X
X
X = df_new
X
y
np.shape(X)
np.shape(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size = 0.2)
X
X_train
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
y_test
from sklearn.metrics import accuracy_score
# `l` was presumably a typo at the prompt; it raises NameError if executed.
l
knn.predict(X_test)
y_predict = knn.predict(X_test)
y_predict
y_test
y_test[1]
y_test[1]
np.array(y_test)
y_test = np.array(y_test)
y_predict
accuracy_score(y_test,y_predict)
# IPython magics recorded by the history export.
get_ipython().magic(u'ls ')
get_ipython().magic(u'pwd ')
# Intro-style iris example: split, fit a 1-NN model, classify one unseen
# flower, then score the model on the held-out test set.
from sklearn.model_selection import train_test_split
training_data, testing_data, training_labels, testing_labels = train_test_split(
    iris_dataset['data'], iris_dataset['target'])
print(len(training_data))
print(testing_data.shape)
iris_dataframe = pd.DataFrame(training_data, columns=iris_dataset.feature_names)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(training_data, training_labels)
import numpy as np
# Single unknown sample (note the double brackets: shape (1, 4)).
mystery_iris = np.array([[5, 2.9, 1, 0.2]])
print(mystery_iris.shape)
prediction = knn.predict(mystery_iris)
print(iris_dataset['target_names'][prediction])
test_predictions = knn.predict(testing_data)
print(test_predictions)
# Manual accuracy (mean of exact matches) should equal knn.score below.
print("Score: {:.2f}".format(np.mean(testing_labels == test_predictions)))
print(knn.score(testing_data, testing_labels))
def knncls(): """ K-近邻预测用户签到位置 :return:None """ # 读取数据 data = pd.read_csv("./data/FBlocation/train.csv") print(data.head(10)) # 处理数据 # 1、缩小数据,查询数据晒讯 data = data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75") # 处理时间的数据 time_value = pd.to_datetime(data['time'], unit='s') print(time_value) # 把日期格式转换成 字典格式 time_value = pd.DatetimeIndex(time_value) # 构造一些特征 data['day'] = time_value.day data['hour'] = time_value.hour data['weekday'] = time_value.weekday # 把时间戳特征删除 data = data.drop(['time'], axis=1) print(data) # 把签到数量少于n个目标位置删除 place_count = data.groupby('place_id').count() tf = place_count[place_count.row_id > 3].reset_index() data = data[data['place_id'].isin(tf.place_id)] # 取出数据当中的特征值和目标值 y = data['place_id'] x = data.drop(['place_id'], axis=1) # 进行数据的分割训练集合测试集 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25) # 特征工程(标准化) std = StandardScaler() # 对测试集和训练集的特征值进行标准化 x_train = std.fit_transform(x_train) x_test = std.transform(x_test) # 进行算法流程 # 超参数 knn = KNeighborsClassifier() # # fit, predict,score knn.fit(x_train, y_train) # # 得出预测结果 y_predict = knn.predict(x_test) # # print("预测的目标签到位置为:", y_predict) # # # 得出准确率 # print("预测的准确率:", knn.score(x_test, y_test)) # 构造一些参数的值进行搜索 param = {"n_neighbors": [3, 5, 10]} # 进行网格搜索 gc = GridSearchCV(knn, param_grid=param, cv=2) gc.fit(x_train, y_train) # 预测准确率 print("在测试集上准确率:", gc.score(x_test, y_test)) print("在交叉验证当中最好的结果:", gc.best_score_) print("选择最好的模型是:", gc.best_estimator_) print("每个超参数每次交叉验证的结果:", gc.cv_results_) return None
# Evaluate training accuracy for logistic regression and k-NN (k=5, k=1).
# The point of the exercise: k=1 trivially scores 100% on its own training
# data, so training accuracy alone cannot select a model.
logreg = LogisticRegression()
logreg.fit(X, y)
logreg.predict(X)
# Store the response values
yPredictions = logreg.predict(X)
# We then evaluate the training accuracy
# Classification accuracy is the proportion of correct predictions
score = metrics.accuracy_score(y, yPredictions)
print(score)
# We can then repeat this procedure for KNN with K = 5 and K = 1
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
yPredictions = knn.predict(X)
score = metrics.accuracy_score(y, yPredictions)
print(score)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
yPredictions = knn.predict(X)
score = metrics.accuracy_score(y, yPredictions)
print(score)
# We conclude the KNN with K = 1 is the best model to use with this data
# Note that obtaining 100 % accuracy with K = 1 is expected since we are testing
# the training data
# We therefore conclude that training and testing our models on the exact same data
# is not a useful procedure for deciding which models to choose
# Our goal here is to estimate how well each model is likely to perform on
# Classify a logo image with a HOG-feature model and draw the predicted
# label onto the (resized) image.
image = cv.imread(imagePath)
# NOTE: the matching except clause lies beyond this excerpt.
try:
    # Convert to Gray and Resize
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    logo = cv.resize(gray, (200, 100))
    # Calculate Histogram of Test Image
    hist = feature.hog(logo,
                       orientations=9,
                       pixels_per_cell=(10, 10),
                       cells_per_block=(2, 2),
                       transform_sqrt=True,
                       block_norm="L1")
    # Predict in model (reshape: a single sample must be 2-D for sklearn)
    predict = model.predict(hist.reshape(1, -1))[0]
    # Make pictures default Height (300 px, width scaled proportionally)
    height, width = image.shape[:2]
    reWidth = int((300 / height) * width)
    image = cv.resize(image, (reWidth, 300))
    # Write predicted label over the Image
    cv.putText(image, predict.title(), (10, 30), cv.FONT_HERSHEY_TRIPLEX, 1.2,
               (0, 255, 0), 4)
    # Get Image name and show Image
    imageName = imagePath.split("/")[-1]
    plt.imshow(image)
    plt.show()
    cv.waitKey(0)
# Terrain-classification exercise (Python 2 prints): scatter the two classes,
# then fit a 3-NN classifier and an AdaBoost classifier.
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(features_train, labels_train)
pred = neigh.predict(features_test)
from sklearn.metrics import accuracy_score
print accuracy_score(pred, labels_test)
########################
### adaboost algorithm
########################
from time import time
from sklearn.ensemble import AdaBoostClassifier
print "-:: adaboost ::------------------"
t0 = time()
adab = AdaBoostClassifier(n_estimators=100, learning_rate=1)
# NOTE(review): AdaBoost is fit on the *test* split here — presumably it
# should be features_train/labels_train like the k-NN above; confirm.
adab.fit(features_test, labels_test)
print "training time adaboost:", round(time() - t0, 3), "s"
# Cross-subject similarity: train a 1-NN condition decoder on subject s2's
# features, test it on every subject s1's samples, then symmetrise the
# resulting matrices (min for similarity, max for disparity).
similarity_matrix = np.zeros([n_subs, n_subs])
disparity_matrix = np.zeros([n_subs, n_subs])
for s2 in range(n_subs):
    neigh = KNeighborsClassifier(n_neighbors=1)
    # Collapse (conditions, samples) into one row per sample.
    sub_feat = feat[s2, :, :, :].reshape(feat.shape[1] * feat.shape[2],
                                         feat.shape[-1])
    neigh.fit(sub_feat, labels)
    for s1 in range(n_subs):
        equal_label = 0
        diff_label = 0
        for cond in range(n_conditions):
            for sample in range(n_samples):
                data = feat[s1, cond, sample, :].reshape(1, -1)
                pred = neigh.predict(data)
                if pred == cond:
                    equal_label += 1
                else:
                    diff_label += 1
        # Normaliser 2*n_samples presumably assumes n_conditions == 2 —
        # TODO confirm against the data set-up earlier in the file.
        similarity_matrix[s1, s2] = equal_label / (2 * n_samples)
        disparity_matrix[s1, s2] = diff_label / (2 * n_samples)
# Symmetrisation pass (the max(...) call continues past this excerpt).
for s1 in range(n_subs):
    for s2 in range(n_subs):
        similarity_matrix[s1, s2] = min(similarity_matrix[s1, s2],
                                        similarity_matrix[s2, s1])
        similarity_matrix[s2, s1] = similarity_matrix[s1, s2]
        disparity_matrix[s1, s2] = max(disparity_matrix[s1, s2],
"""
# Classify the point [6, 6] with plain and distance-weighted 3-NN, then set
# up the grid/colormaps for a decision-boundary plot.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
data = pd.read_csv("data.csv")
X = data.iloc[:, :-1].values
y = data.iloc[:, 2].values  # label lives in the third column
Knn = KNeighborsClassifier(n_neighbors=3)
Knn.fit(X, y)
X_pred = np.array([6, 6])
# reshape(1, -1): sklearn expects a 2-D array even for a single sample.
Y_pred = Knn.predict(X_pred.reshape(1, -1))
print("\nClass of [6,6] using KNN:- ", Y_pred[0])
#Weighted KNN
Knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
Knn.fit(X, y)
Y_pred = Knn.predict(X_pred.reshape(1, -1))
h = 0.02  # mesh step size for the decision-boundary grid
clf = Knn
cmap_light = ListedColormap(['lightgreen', 'yellow'])
cmap_bold = ListedColormap(['b', 'r'])
# calculate min, max and limits
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
from sklearn.neighbors import KNeighborsClassifier # manual make a data directory, and download sonar.all-data URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data' df = pd.read_csv('data/sonar.all-data', header=None, prefix="X") # df = pd.read_csv(URL, header=None, prefix="X") print(df.shape) print(df.columns) # df.rename(columns={'X60': 'Label'}, inplace=True) # n_neighbots=2,4 clf1 = KNeighborsClassifier(n_neighbors=6) data, labels = df.iloc[:, :-1], df.iloc[:, -1] X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3) clf1.fit(X_train, y_train) y_predict = clf1.predict(X_test) print("score=", clf1.score(X_test, y_test)) # get confusion matrix result_cm1 = confusion_matrix(y_test, y_predict) print(result_cm1) scores = cross_val_score(clf1, data, labels, cv=5, groups=labels) print(scores) from joblib import dump, load dump(clf1, "knn1.joblib") knn2 = load("knn1.joblib") y_predict2 = knn2.predict(X_test) result2 = confusion_matrix(y_predict, y_predict2) print(result2)
# Classification template: split, scale, fit a 5-NN classifier and prepare a
# decision-boundary visualisation grid. (sklearn.cross_validation is the
# pre-0.20 module name; modern code imports from sklearn.model_selection.)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Scaler fit on training data only; test data only transformed.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fitting classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
# minkowski with p=2 is the Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors = 5, metric='minkowski', p = 2)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
# Dense grid over the 2-D feature space; classify every grid point to shade
# the decision regions.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# Scatter loop continues past this excerpt.
for i, j in enumerate(np.unique(y_set)):
#Declare empty arrays for accuracy and number of neigbours accuracy = np.zeros((6, 1)) accuracy_val = np.zeros((6, 1)) neighbours = np.linspace(2, 7, 6) for x in neighbours: #KNN model training from sklearn.neighbors import KNeighborsClassifier lr = KNeighborsClassifier(n_neighbors=int(x)) #KNN Model lr = lr.fit(X_train_lda, y_train) #Classification Report from sklearn.metrics import classification_report, accuracy_score lr = lr.fit(X_train_lda, y_train) y_pred = lr.predict(X_train_lda) y_pred_test = lr.predict(X_test_lda) #print(classification_report(y_train, y_pred)) accuracy[int(x - 2), :] = accuracy_score(y_train, y_pred) accuracy_val[int(x - 2), :] = accuracy_score(y_test, y_pred_test) print(accuracy_score(y_train, y_pred)) plt.plot(neighbours, accuracy, '-r', label='Training') plt.plot(neighbours, accuracy_val, '-k', label='Validation') plt.xlabel('Number of neighbours') plt.ylabel('Accuracy') plt.title('Sensitivity study') plt.legend()
# Car-evaluation dataset: label-encode every categorical column, fit a 9-NN
# classifier and print per-sample predictions vs. actual classes.
le = preprocessing.LabelEncoder()
buying = le.fit_transform(list(data['buying']))
maint = le.fit_transform(list(data['maint']))
door = le.fit_transform(list(data['door']))
persons = le.fit_transform(list(data['persons']))
lug_boot = le.fit_transform(list(data['lug_boot']))
safety = le.fit_transform(list(data['safety']))
clss = le.fit_transform(list(data['class']))
# zip the encoded columns back into per-row feature tuples.
X = list(zip(buying, maint, door, persons, lug_boot, safety))
Y = list(clss)
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, Y, test_size=0.1)
model = KNeighborsClassifier(n_neighbors=9)
model.fit(x_train, y_train)
acc = model.score(x_test, y_test)
print(acc)
predicted = model.predict(x_test)
# Index order must match LabelEncoder's encoding of the class column —
# presumably alphabetical ('acc','good','unacc','vgood'); TODO confirm the
# hard-coded order below is correct.
names = ['unacc', 'acc', 'good', 'vgood']
for x in range(len(predicted)):
    print(
        f'Predicted: {names[predicted[x]]}, Data: {x_test[x]}, Actual: {names[y_test[x]]}')