def train_k_neighbors_classifier():
    """Train a 5-NN fruit classifier on the pooled CSV data.

    Loads ./test.csv and ./train.csv, pools and shuffles the rows,
    drops non-feature columns, holds out the last 20% as a test split,
    prints the accuracy as an integer percentage and returns the
    fitted classifier.
    """
    neigh = KNeighborsClassifier(n_neighbors=5)
    # Pool both CSVs and re-split 80/20 after shuffling.
    df = pd.concat([pd.read_csv('./test.csv'), pd.read_csv('./train.csv')])
    n1 = int(len(df) * 0.8)
    df = df.sample(frac=1)
    y = df['fruit_type']
    # Drop id/metadata columns by position — assumes a fixed CSV schema;
    # TODO confirm against the data files.
    cols = [0, 1, 2, 6, 7, 8, 10]
    df = df.drop(df.columns[cols], axis=1)
    df1 = df[:n1]
    df2 = df[n1:]
    y1 = y[:n1]
    y2 = y[n1:]
    x1 = df1.values
    x2 = df2.values
    neigh.fit(x1, y1)
    # Fixed: a kneighbors_graph() call whose result was discarded has been
    # removed — it computed the full training-set neighbor graph for nothing.
    y2p = neigh.predict(x2)
    results = pd.DataFrame({'y2': y2p, 'y': y2})
    results['score'] = results['y2'] == results['y']
    # Truncated integer percentage of correct predictions.
    score = int(sum(results['score']) / len(results) * 100)
    print('SCORE: ', score, '%')
    return neigh
def knn(id, datos, principal):
    """Run K-NN on the selected dataset and build the result context dict.

    Parameters
    ----------
    id : dataset selector, converted to int for ``pickdataset``.
    datos : raw data handed through to ``pickdataset``.
    principal : bool
        True fills the 'primary algorithm' context keys, False fills
        the 'comparison' keys.

    Returns
    -------
    dict with accuracy, step log, neighbor-graph "rules" and image name.
    """
    pasos = "Dataset cargado" + '\n'
    dataset = pickdataset(int(id), datos)
    knn = KNeighborsClassifier()
    knn.fit(dataset.data, dataset.target)
    # Predictions on the training data itself (resubstitution accuracy).
    y_pred = knn.predict(dataset.data)
    matrizPorcentaje = metrics.classification_report(dataset.target, y_pred)
    confusion = metrics.confusion_matrix(dataset.target, y_pred)
    grafo = knn.kneighbors_graph(dataset.data)
    avgReal = str(metrics.accuracy_score(dataset.target, y_pred) * 100) + '%'
    pasos += "Target: " + '\n' + str(dataset.target) + '\n'
    pasos += "Resultados: " + '\n' + str(y_pred) + '\n'
    pasos += "Matriz de porcentajes: " + '\n' + matrizPorcentaje + '\n'
    pasos += "Matriz de confución: " + '\n' + str(confusion) + '\n'
    reglas = str(grafo.toarray()) + '\n'
    # Fixed: the image was previously rendered twice (once before the
    # metrics, once here); a single call is enough.
    guardarImagenKnn(dataset)
    img = 'knn.png'
    if principal:
        context = {
            'algoritmoPrincipal': 'K-NN',
            'resultado': avgReal,
            'pasos': pasos,
            'reglas': reglas,
            'img': img
        }
    else:
        context = {
            'algoritmoComparar': 'K-NN',
            'resultado2': avgReal,
            'pasos2': pasos,
            'reglas2': reglas,
            'img2': img
        }
    return context
def startknn(self): temp = [] ff = open("knnout.txt", "w+") if self.wbr == False: from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=self.k) neigh.fit(self.tr, self.classlabels) for i in self.te: temp.append(i) print neigh.predict(temp) elif self.ngraph == False: from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=self.k) neigh.fit(self.tr, self.classlabels) for i in self.te: temp.append(i) print >> ff, neigh.predict(temp) A = neigh.kneighbors_graph(self.tr) print A else: a = train(self.tr, self.classlabels, self.k) out = open("output.txt", "w+") for i in self.te: print >> out, i, classify(a, i, distance_fn=self.dm) print "-------KNN --------Done------------"
def startknn(self): temp=[] ff=open("knnout.txt","w+") if self.wbr==False : from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=self.k) neigh.fit(self.tr,self.classlabels) for i in self.te: temp.append(i) print neigh.predict(temp) elif self.ngraph==False: from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=self.k) neigh.fit(self.tr,self.classlabels) for i in self.te: temp.append(i) print>>ff,neigh.predict(temp) A = neigh.kneighbors_graph(self.tr) print A else: a=train(self.tr,self.classlabels,self.k) out=open("output.txt","w+") for i in self.te: print>>out,i,classify(a,i,distance_fn=self.dm) print "-------KNN --------Done------------"
def fit_clf(X_train, Y_train, X_test, X_test_actual_output, k):
    # Fit a k-NN classifier and print its misclassification (error) rate
    # on the test set. Python 2 style: everything is printed, nothing
    # is returned.
    clf = KNN(n_neighbors=k)
    # .ravel() flattens the label column into the 1-D array sklearn expects.
    clf.fit(X_train, Y_train.values.ravel())
    X_test_clf_output = clf.predict(X_test)
    if k == 5:
        # Debug output for the k=5 run only: raw neighbor distances and
        # indices, plus the sparse connectivity graph of the test points.
        print clf.kneighbors(X_test)
        print clf.kneighbors_graph(X_test)
    length = len(X_test_actual_output)
    count = 0.0  # float so the division below is true division under Py2
    for i in range(length):
        if X_test_clf_output[i] != X_test_actual_output[i]:
            count += 1
    # Error rate (fraction misclassified), NOT accuracy.
    print count / length
def main(CV=False, PLOT=True):
    """Entry Point.

    Parameters
    ----------
    CV: bool
        Cross-validation flag
    PLOT: bool
        Plotting flag
    """
    # Fetch, optionally cross-validate, normalise, then fit and score a
    # K-NN classifier; optionally plot the confusion matrix and the
    # k-neighbours graph.
    raw = fetch_data()
    if CV:
        method, params = cross_validate(raw, 10)
    else:
        method, params = 'l2', {'n_neighbors': 1, 'metric': chisquare}
    data = normalise(raw, method)
    X_train, y_train = data['train']
    X_test, y_test = data['test']
    classifier = KNeighborsClassifier(**params)
    classifier.fit(X_train, y_train)
    print('ACCURACY: ', classifier.score(X_test, y_test))
    if not PLOT:
        return
    # Confusion matrix over the distinct test labels.
    y_hat = classifier.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_hat)
    plot_confusion_matrix(cnf_matrix,
                          classes=list(set(y_test)),
                          title='K-Nearest-Neighbours\nConfusion Matrix',
                          cmap=plt.cm.Greens)
    plt.savefig('data/out/knn_cnf_matrix.pdf',
                format='pdf',
                dpi=300,
                transparent=True)
    # Sparse neighbour graph of the test points, rendered to PDF.
    neighbors_matrix = classifier.kneighbors_graph(X_test)
    plot_kneighbors_graph(neighbors_matrix, title='Neighbours Graph')
    plt.savefig('data/out/knn_neighbours.pdf',
                format='pdf',
                dpi=300,
                transparent=True)
def meta_features_model_based(self, X, y):
    # Model-based meta-features derived from a fitted 4-NN classifier.
    metafeatures = dict()
    knn4 = KNeighborsClassifier(n_neighbors=4)
    knn4.fit(X, y)
    distances, indices = knn4.kneighbors(X)
    # Mean over ALL of each point's 4 nearest-neighbor distances — the
    # query points are the training points themselves, so each row also
    # includes the point itself at distance 0 (not just the 4th neighbor).
    metafeatures["knn4"] = np.mean(distances)
    # Sparse 4-NN connectivity graph; nodes/edges are computed but the
    # derived ratio below is currently disabled.
    A = knn4.kneighbors_graph(X)
    nodes = A.shape[0]
    edges = csr_matrix.getnnz(A)
    # Node to edge ratio in kNN graph Bisection
    # metafeatures["n2er"] = nodes / edges
    return metafeatures
def get_neighbors(self, A, description):
    """Compute, for every row of A, the set of its 3 nearest neighbors.

    Returns ``{description: neigh_set}`` where ``neigh_set`` holds one
    set of neighbor row-indices per row of A. Each point is its own
    nearest neighbor, so every set contains the row's own index.
    """
    neigh = KNeighborsClassifier(
        n_neighbors=3)  #n_neighbors are no. of neighbors to be considered
    # Fixed: fitting with y=None is rejected by current scikit-learn
    # releases. Only the neighbor structure is used here (never class
    # prediction), so a constant dummy label vector is supplied instead.
    neigh.fit(A, [0] * len(A))
    N = neigh.kneighbors_graph(A).todense()
    N = np.array(
        N
    )  #put neighborhood relation into matrix.if neighbor then 1 else 0.
    neigh_set = []
    for a in N:
        neighs = np.where(
            a == 1
        )[0]  # diagonal elements will be 1 as each element is neighbor of itself
        neigh_set.append(set(neighs))
    d = {description: neigh_set}
    return d
# Standardize the features (zero mean, unit variance).
df_scaled = StandardScaler()
df_scaled.fit(X)
df_TrsX = df_scaled.transform(X)

# Split into train / test sets (70/30, fixed seed).
trainX, testX, trainY, testY = train_test_split(df_TrsX,
                                                Y,
                                                test_size=0.3,
                                                random_state=1)

# Duplicate-row check (disabled, kept for reference).
'''
count = 0
frame = pd.DataFrame(df)
print(frame.shape)
IsDuplicated = frame.duplicated()
print(IsDuplicated)
frame = frame.drop_duplicated(['state'])
print(frame.shape)
'''

# KNN model: 11 distance-weighted neighbors, Euclidean metric.
neigh = KNeighborsClassifier(n_neighbors=11,
                             weights='distance',
                             algorithm='auto',
                             p=2,
                             metric='euclidean')
neigh.fit(trainX, trainY.astype('int'))
Y_pred = neigh.predict(testX)
# Fixed: predict() and predict_proba() were previously re-run with their
# results discarded; the single prediction above is all that is used.
print(metrics.accuracy_score(testY, Y_pred))
# NOTE(review): f1_score with no `average` assumes binary labels — confirm.
print(f1_score(testY, Y_pred))
print(np.unique(Y_pred, return_counts=True))
# Sparse k-neighbors graph of the training points.
print(neigh.kneighbors_graph())
def KNN():
    # Train a default k-NN classifier on sklearn's breast-cancer dataset,
    # classify the rows of the externally loaded input CSV, and render a
    # per-id diagnosis into the Tk canvas.
    # NOTE(review): depends on module-level globals ip_csv, filename,
    # labels, cwgt, tk, NW, CancerClassification and classify_report —
    # confirm they are initialised before this is called.
    c = load_breast_cancer()
    # Drop the id column in place so only feature columns feed predict().
    ip_csv.drop(['id'], axis = 1, inplace = True)
    ip_np_data = ip_csv.values
    print("---------------numpy array formatted-----------------")
    #print(ip_np_data)
    print("---------------loading module to train---------------")
    knn = KNeighborsClassifier()
    print("---------------splitting testing and training data---------------")
    # Stratified 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(c.data, c.target, stratify=c.target, test_size=0.20, random_state=42)
    print("--------------------training data------------------")
    start_time = time.time()
    knn.fit(X_train, y_train)
    end_time = time.time()
    print("training time :", end_time - start_time)
    print("---------------testing accuracy data---------------\n")
    # Accuracies expressed as percentages.
    train_acc = (knn.score(X_train, y_train)) * 100
    test_acc = (knn.score(X_test, y_test)) * 100
    print(type(train_acc))
    print(test_acc)
    # The triple-quoted block below is disabled Tk rendering of the
    # accuracy scores and confusion matrix, kept for reference.
    '''
    statement="Training data accuracy score: "
    label_training=tk.Label(text=statement)
    #label_training_acc.grid(row=3,column=1)
    cwgt.create_window(500,100, window=label_training, anchor=NW)
    label_training_acc=tk.Label(text=train_acc)
    #label_training_acc.grid(row=3,column=2)
    cwgt.create_window(700,100, window=label_training_acc, anchor=NW)
    statement="Testing data accuracy score: "
    label_testing=tk.Label(text=statement)
    #label_testing_acc.grid(row=4,column=1)
    cwgt.create_window(500,130, window=label_testing, anchor=NW)
    label_testing_acc=tk.Label(text=test_acc)
    #label_testing_acc.grid(row=4,column=2)
    cwgt.create_window(700,130, window=label_testing_acc, anchor=NW)
    matrix=metrics.confusion_matrix(y_test, knn.predict(X_test))
    label_matrix=tk.Label(text=matrix)
    statement="Confusion Matrix is: "
    label_M=tk.Label(text=statement)
    cwgt.create_window(500,160, window=label_M, anchor=NW)
    cwgt.create_window(650,160, window=label_matrix, anchor=NW)
    '''
    matrix = metrics.confusion_matrix(y_test, knn.predict(X_test))
    #label_matrix=tk.Label(text=matrix)
    statement = "Classification Report is: "
    #label_M=tk.Label(text=statement)
    #cwgt.create_window(500,210, window=label_M, anchor=NW)
    cReport = classification_report(y_test, knn.predict(X_test))
    #label_cReport=tk.Label(text=cReport)
    #cwgt.create_window(670,210, window=label_cReport, anchor=NW)
    print("---------------predicting input data---------------\n")
    result_list = knn.predict(ip_np_data)
    # Re-read only the id column for labelling the per-row results.
    ip_ids = pd.read_csv(filename, usecols=['id'])
    print("ID\t\t\tTYPE")
    diagnose_report = []
    cn = CancerClassification(0, "t")
    var = 200  # starting y-offset (px) for the rendered result labels
    for i in range(0, ip_ids.size):
        # Class 0 is reported as malignant; anything else as "begnine"
        # (sic — this misspelling is user-visible output, left untouched).
        ctype = 'malignant' if result_list[i] == 0 else 'begnine'
        #print(ctype)
        diagnose_report.append(CancerClassification(ip_ids.values[i], ctype).tostring())
    print(diagnose_report)
    for i in range(0, len(diagnose_report)):
        label = tk.Label(text=diagnose_report[i])
        #label.grid(row=var+i,column=1)
        labels.append(label)
        # Stack the labels vertically, 30px apart.
        cwgt.create_window(500, var + i * 30, window=label, anchor=NW)
    print(dir(knn))
    print(knn.kneighbors_graph())
    classify_report(matrix, cReport, train_acc=train_acc, test_acc=test_acc, algo="KNN")
# setting the parameters n_neighbors = 5 # build the model clf = KNeighborsClassifier(n_neighbors=n_neighbors) # train the model clf.fit(x_train, y_train) # looking at the methods get_params = clf.get_params() # returns the parameters of the model kneighbours = clf.kneighbors( x_test.head(1), n_neighbors=n_neighbors ) # the first array gives the distance between the new data point and the k neighbours, and the second array gives the sample number of the k neighbours kneighbours_graph = clf.kneighbors_graph( x_test.head(1), n_neighbors=n_neighbors, mode='distance' ) # returns a sparce matrix for the k neighbours for the new data points prediction_array = clf.predict(x_test) # predicted test values prediction_probability = clf.predict_proba( x_test) # the probability for each class for each test data point train_score = clf.score(x_train, y_train) # the mean auuracy of the training dataset test_score = clf.score(x_test, y_test) # the mean acccuracy for the test dataset print( 'The mean accuracy of the train dataset is: %.3f and the mean accuracy of the test dataset is: %.3f' % (train_score, test_score)) pdb.set_trace()
# Evaluate the previous model, then fit a 100-NN classifier and inspect it.
print(confusionmatrix)
print(accuracy_score(ytest, ypred))
print(accuracy_score(ytrain, ypred1))
plt.plot(ypred)
plt.show()

from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=100)
print(knc.fit(xtrain, ytrain))
ypred = (knc.predict(xtest))
ypred1 = knc.predict(xtrain)
print(ypred)
# Map the encoded predictions back to their original string labels.
print(list(le.inverse_transform(ypred)))
print(knc.predict_proba(xtest))
print(knc.score(xtrain, ytrain))
# Neighbour distances/indices and sparse graph of the training points.
print(knc.kneighbors())
print(knc.kneighbors_graph())
# NOTE(review): r2_score on class labels is unusual — accuracy/F1 are
# the conventional classification metrics; confirm this is intended.
print(r2_score(ytest, ypred))

# NCA pipeline experiments (mostly disabled below).
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis(random_state=42)
from mlxtend.preprocessing import DenseTransformer
nca_pipe = (make_pipeline((NeighborhoodComponentsAnalysis()), (KNeighborsClassifier())))
print(nca_pipe)
dense = DenseTransformer()
print(dense.fit(xtrain, ytrain))
##xtrain,ytrain=dense.transform(xtrain,ytrain)
##print(nca.fit(xtrain,ytrain))
##knc.fit(nca.transform(xtrain,ytrain))
##print(knc.score(nca.transform(xtest,ytest))
##print(nca_pipe.fit(xtrain,ytrain))
##print(nca_pipe.score(xtrain,ytrain))
# Fit the k-NN model on the modified (noisy-input) digit data and
# "clean" one test digit by predicting its denoised image.
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[4000]])
plot_digit(clean_digit)

# In[89]:

# NOTE(review): identical to the cell above — a notebook re-execution
# artifact, retained as-is.
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[4000]])
plot_digit(clean_digit)

# In[17]:

# Sparse k-neighbours connectivity graph of the training points
# (displayed interactively in the notebook).
knn_clf.kneighbors_graph()

# In[ ]:

# In[ ]:
# Load the iris dataset and fit a 3-NN classifier.
df = pd.read_csv('../../data_set/iris.data', names=names)
df.info()
X = df[names[0:-1]]  # all columns but the last are features
Y = df[names[-1]]  # the last column is the class label
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
# The bare string below documents KNeighborsClassifier's constructor
# parameters (in Chinese): n_neighbors, weights (uniform vs distance
# weighting), algorithm (brute/kdtree/balltree), leaf_size, p (p=2 gives
# Euclidean), metric, metric_params, n_jobs.
"""
n_neighbors=5, 使用多少个邻居
weights='uniform', k个样本等权重 distance 表示距离反比权重
algorithm='auto', 模型求解方式 可选为brute kdtree balltree
leaf_size=30, 当求解方式为kdtree或balltree时,树最多允许的叶子数目
p=2, 在minkowski距离中 变化为欧几里得距离公式
metric='minkowski', 距离公式计算方式
metric_params=None, 距离公式计算中参数列表
n_jobs=None,使用多少线程计算
"""
algo = KNeighborsClassifier(n_neighbors=3)
algo.fit(X_train, Y_train)
print(f'模型在训练集上的效果r:{algo.score(X_train, Y_train)}')
print(f'模型在测试集上的效果r:{algo.score(X_test, Y_test)}')
# Distance-mode sparse graph of the 3 nearest training neighbours for
# the first 10 test rows.
test1 = X_test.iloc[:10, :]
print(test1)
graph1 = algo.kneighbors_graph(test1, n_neighbors=3, mode='distance')
print(graph1)
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

################################################################################
### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# 20-NN classifier; the fit and predict phases are timed separately (Py2).
clf = KNeighborsClassifier(n_neighbors=20)
t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time() - t0, 3), "s"
t1 = time()
result = clf.predict(features_test)
print "Predict time:", round(time() - t1, 3), "s"
print('The accuracy is')
print(accuracy_score(labels_test, result))
# Sparse k-neighbours connectivity graph of the training points.
A = clf.kneighbors_graph(features_train)
# NOTE(review): the dense conversion below is computed and discarded —
# useful only in an interactive session.
A.toarray()
# prettyPicture may not be defined in every environment; skip silently.
try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
# # DBSCAN # In[ ]: from sklearn.cluster import DBSCAN # In[ ]: # check k distance for dbscan from sklearn.neighbors import KNeighborsClassifier knn = KNeighborsClassifier(n_neighbors=int(len(subset)**.5)) knn.fit(subset[colsOfInterest], [1]*len(subset)) distances = np.array(knn.kneighbors_graph(n_neighbors=int(len(subset)**.25), mode='distance').max(axis=1).todense().T)[0] distances.sort() plt.plot(distances) #plt.ylim((0, 1e6)) # In[ ]: # In[ ]: # fit dbascn dbscan = DBSCAN(eps=2e5, min_samples=1000) #scaler = RobustScaler()
# One-vs-rest ROC/AUC computed separately for each of the three classes
# (the printed messages are in Chinese: "AUC computed for class N").
fpr, tpr, threads = metrics.roc_curve(y_true[:, 0], y_score[:, 0])
auc = metrics.auc(fpr, tpr)
print("对于类别1的单独计算AUC的值:{}".format(auc))
fpr, tpr, threads = metrics.roc_curve(y_true[:, 1], y_score[:, 1])
auc = metrics.auc(fpr, tpr)
print("对于类别2的单独计算AUC的值:{}".format(auc))
fpr, tpr, threads = metrics.roc_curve(y_true[:, 2], y_score[:, 2])
auc = metrics.auc(fpr, tpr)
print("对于类别3的单独计算AUC的值:{}".format(auc))

# 9. Miscellaneous inspection of the fitted estimator `algo`.
print(y_test.ravel())
print(test_predict)
print("softmax/sigmoid函数返回的概率值:\n{}".format(algo.predict_proba(x_test)))
test1 = x_test.iloc[:10, :]
print(test1)
# The bare string below explains kneighbors_graph (in Chinese): for each
# query row it returns the n_neighbors most similar training samples;
# mode='distance' yields distances, mode='connectivity' a 0/1 graph —
# distance mode is usually preferred.
"""
kneighbors_graph:在训练数据中获取和当前传入数据最相似的n_neighbors的3个样本信息,mode如果为distance,返回的是距离;如果值为:connectivity,返回n_neighbors个连通的样本点;一般情况下,修改为distance
"""
graph1 = algo.kneighbors_graph(test1, n_neighbors=3, mode='connectivity')
print(graph1)
print("*" * 100)
# The bare string below explains kneighbors (in Chinese): it returns the
# n_neighbors nearest training samples; return_distance=True returns
# distances, False returns sample indices.
"""
直接返回给定x在训练数据中最相似的n_neighbors个样本,return_distance表示是否返回距离,设置为True表示返回距离;设置为False表示返回样本的索引值
"""
neighbors = algo.kneighbors(test1, n_neighbors=5, return_distance=False)
print(neighbors)
cvs = cross_val_score(knn, x, y, cv=10) # In[16]: ## Show cross validation scores cvs # In[17]: ## Show cross validation score mean and std print('%0.2f, %0.2f' % (cvs.mean(), cvs.std())) # In[18]: ## Check graph ("mean decrease impurity") knn.kneighbors_graph(x[0:5]).toarray() # In[19]: ## Predict y given test set predictions = knn.predict(x_test) # In[20]: ## Take a look at the confusion matrix ([TN,FN],[FP,TP]) confusion_matrix(y_test, predictions) # In[21]: ## Accuracy score print('%0.2f' % precision_score(y_test, predictions))
# Label-encode each categorical column of the car-evaluation data.
door = le.fit_transform(list(data["door"]))
persons = le.fit_transform(list(data["persons"]))
lug_boots = le.fit_transform(list(data["lug_boots"]))
safety = le.fit_transform(list(data["safety"]))
cls = le.fit_transform(list(data["class"]))

predict = "class"  # name of the target column

# zipping our x data into tuples of attributes
x = list(zip(buying, maint, door, persons, lug_boots, safety))
print(x)
y = list(cls)

# 90/10 train/test split.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.1)

# Setting up KNN model with K as 9 (the original comment said 5, but
# n_neighbors is actually 9).
model = KNeighborsClassifier(n_neighbors=9)
model.fit(x_train, y_train)
acc = model.score(x_test, y_test)
print(acc)

predictions = model.predict(x_test)
names = ['unacc', 'acc', 'good', 'vgood']

#for a in range(30):
#    print("prediction:{}, {} {}".format(names[predictions[a]], x_test[a], y_test[a]))

# Sparse 5-neighbour graph for each of the first 30 test points
# (kneighbors_graph needs a 2-D input, hence the wrapping []).
for a in range(30):
    print(model.kneighbors_graph([x_test[a]], 5))
# 75/25 train/test split, default (5-NN) classifier.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = knn.score(x_test, y_test)
# NOTE(review): the single-sample prediction below is computed and
# discarded — useful only in an interactive session.
knn.predict([[3, 5.2, 4.7, 0.4]])
# Dense k-neighbours connectivity matrix over the full feature set;
# the toarray() result is also discarded outside a notebook.
a = knn.kneighbors_graph(X)
a.toarray()
# Scatter of the first two feature columns coloured by class.
plt.figure()
ax = plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y)

#------------Logistic
# Fresh split (same seed) for the logistic-regression baseline.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
log = LogisticRegression()
log.fit(x_train, y_train)
def fit(self, X, y):
    # Characterise a binary dataset: identify minority (positive) and
    # majority (negative) classes, the between-class imbalance ratio,
    # each sample's k-NN label neighbourhood, overlap/noise markers and
    # per-class DBSCAN cluster counts (within-class imbalance).
    # Determine the positive and negative class.
    tmp_labels = []
    tmp_nums = []
    self.labels = set(y)
    # Only binary problems are supported (the message is in Chinese:
    # "number of label classes").
    if (len(self.labels)) != 2:
        raise Exception("标签类别数:", len(self.labels))
    for y_label in self.labels:
        self.label_map_num[y_label] = np.sum(y == y_label)
        tmp_labels.append(y_label)
        tmp_nums.append(np.sum(y == y_label))
    # The rarer class is treated as the positive class.
    if tmp_nums[0] < tmp_nums[1]:
        self.pos_label = tmp_labels[0]
        self.neg_label = tmp_labels[1]
    else:
        self.pos_label = tmp_labels[1]
        self.neg_label = tmp_labels[0]
    # Between-class imbalance ratio: majority count / minority count.
    self.ir = self.label_map_num[self.neg_label] / self.label_map_num[
        self.pos_label]
    # Label distribution of each sample's k nearest neighbours.
    knn = KNeighborsClassifier(n_neighbors=self.k)
    knn.fit(X, y)
    # Despite the name, this is the dense 0/1 connectivity matrix:
    # a 1 marks a k-nearest neighbour, not a distance.
    distances = knn.kneighbors_graph().todense()
    for index in range(len(y)):
        indices = np.argwhere(
            np.asarray(distances[index]).reshape(-1) == 1).reshape(-1)
        self.k_neighbors_dist.append(y[indices].tolist())
    # Mark overlap regions and noise points.
    for index in range(len(y)):
        label_set = set(self.k_neighbors_dist[index])
        # Both classes appear in this point's neighbourhood -> overlap.
        if len(label_set) == 2:
            self.overlap_marks.append(1)
        else:
            self.overlap_marks.append(0)
        # Every neighbour carries the opposite label -> likely noise.
        if len(label_set) == 1 and label_set.pop() != y[index]:
            self.abnormal_marks.append(1)
        else:
            self.abnormal_marks.append(0)
    self.overlap_marks = np.asarray(self.overlap_marks)
    self.abnormal_marks = np.asarray(self.abnormal_marks)
    # Counts of overlap/noise points broken down by class.
    self.pos_and_overlap_nums = np.sum(
        (y == self.pos_label) * (self.overlap_marks == 1))
    self.neg_and_overlap_nums = np.sum(
        (y == self.neg_label) * (self.overlap_marks == 1))
    self.pos_and_abn_nums = np.sum(
        (y == self.pos_label) * (self.abnormal_marks == 1))
    self.neg_and_abn_nums = np.sum(
        (y == self.neg_label) * (self.abnormal_marks == 1))
    # Within-class imbalance: number of DBSCAN clusters found per class.
    # The "- 1" presumably discounts the noise label (-1); NOTE(review):
    # when DBSCAN finds no noise points this undercounts by one — confirm.
    self.pos_db_scan_classes = len(
        set(DBSCAN().fit(
            X[np.argwhere(y == self.pos_label).reshape(-1)]).labels_)) - 1
    self.neg_db_scan_classes = len(
        set(DBSCAN().fit(
            X[np.argwhere(y == self.neg_label).reshape(-1)]).labels_)) - 1
# Label-encode the remaining categorical columns of the car data.
persons = le.fit_transform(list(data["persons"]))
lug_boot = le.fit_transform(list(data["lug_boot"]))
safety = le.fit_transform(list(data["safety"]))
cls = le.fit_transform(list(data["class"]))

# Separate data into inputs and output.
# Fixed: the encoded target (cls) was previously zipped into the feature
# tuples as well, leaking the label into x and making the evaluation
# meaningless — the model could read the answer from its input.
x = list(zip(buying, maint, door, persons, lug_boot, safety))
y = list(cls)

# Split between training and test (90/10).
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.1)

model = KNeighborsClassifier(n_neighbors=9)
model.fit(x_train, y_train)
predict = model.predict(x_test)

# Human-readable class names (rebinding cls is preserved from the original).
cls = ["unacc", "acc", "good", "vgood"]
for i in range(len(predict)):
    print("predicted", cls[predict[i]], "test data:", x_test[i],
          "actual_data:", cls[y_test[i]])
    # kneighbors takes a 2-D array, so a single point is wrapped in [].
    n = model.kneighbors([x_test[i]], 9, True)
    print("neighbours:", n)

# Sparse 9-neighbour graph of all test points.
A = model.kneighbors_graph(x_test, 9)

plot = "values"
plt.scatter(buying, y)
plt.xlabel(plot)
plt.ylabel("classes")
plt.show()