def fit(self, X, y):
    """Selects the best k via k-fold cross-validation, then fits KNN on all the data."""
    # Split data into n_folds
    self.kFold = KFold(n_folds=self.n_folds, random_state=self.random_state)
    self.kFold.generate_data(X, y)
    # Determine k_max, the maximum value of k given N and the number of folds
    N = X.shape[0]
    fold_size = math.floor(N / self.n_folds)
    self.k_max = (fold_size * (self.n_folds - 1)) - 1
    # Iterate over each value of k, storing the mean score across folds
    k_scores = []
    for i in range(1, self.k_max + 1):
        model = knn(i)
        fold_scores = []
        for j in range(self.n_folds):
            data = self.kFold.get_fold_data(j)
            model.fit(data['X_train'], data['y_train'])
            fold_scores.append(model.score(data['X_val'], data['y_val']))
        k_scores.append(np.mean(fold_scores))
    # Obtain best k and refit on the full data set
    self.best_k = self._get_best_k(k_scores)
    self.model = knn(self.best_k)
    self.model.fit(X, y)
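# A self-contained sketch of the same idea using scikit-learn's GridSearchCV
# in place of the custom KFold/knn classes above (an assumption: those custom
# classes mirror sklearn's k-fold and KNN behaviour).
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

X_demo, y_demo = load_iris(return_X_y=True)
search = GridSearchCV(KNeighborsClassifier(),
                      {'n_neighbors': range(1, 31)}, cv=5)
search.fit(X_demo, y_demo)
print(search.best_params_)   # the cross-validated best k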
def doit(X, k):
    x, y = loadData("train", 225)
    x = x.toarray()
    train_x = x[0:10000]
    train_y = y[0:10000]
    # note: this test slice overlaps the training slice above
    test_x = x[9000:10000]
    test_y = y[9000:10000]
    # fit a nearest-centroid model, then classify via the k nearest centroids
    model = lwp()
    model.fit(train_x, train_y)
    prediction = model.predict(test_x)
    cent = model.centroids_
    clas = model.classes_
    neigh = knn(n_neighbors=k)
    neigh.fit(cent, clas)
    # kneighbors returns (distances, indices); keep only the indices
    kn = neigh.kneighbors(X.toarray())[1]
    # correct = 0
    # wrong = 0
    # for i in range(1000):
    #     print(test_y[i], clas[kn[i]])
    #     if test_y[i] in clas[kn[i]]:
    #         correct = correct + 1
    #     else:
    #         wrong = wrong + 1
    # print(correct, wrong)
    return clas[kn]
def full_knn(iris, num_features=4):
    """Performs knn classification on the iris dataset using the given number
    of feature dimensions (default = 4), and shows the results."""
    # perform projection
    iris.data = iris.data[:, :num_features]

    # screw up scaling! (knn can be sensitive to feature scaling)
    # iris.data[:, :1] *= 100000000

    # perform train/test split
    tts = cv.train_test_split(iris.data, iris.target, train_size=TRAIN_PCT)
    train_features, test_features, train_labels, test_labels = tts

    # initialize model, perform fit
    clf = knn(n_neighbors=NUM_NBRS)
    clf.fit(train_features, train_labels)

    # get accuracy (predictions made internally)
    acc = clf.score(test_features, test_labels)

    # get confusion matrix (requires predicted labels)
    predicted_labels = clf.predict(test_features)
    cm = confusion_matrix(test_labels, predicted_labels)

    print('k = {0}'.format(NUM_NBRS))
    print('num_features = {0}'.format(num_features))
    print('accuracy = {0} %\n'.format(round(100 * acc, 2)))
    print('confusion matrix:\n', cm, '\n')
def Train_Data(self):
    feature = []
    # collect attribute names from the idf file, only on the first call
    Temp = pd.read_csv("/home/cse/Work/Dataset/idf.csv")
    for i in Temp:
        if self.check == 0:
            self.attribute.append(i)
    self.check = 1
    data = pd.read_csv("/home/cse/Work/Dataset/data.csv")
    for i in data:
        feature.append(i)
    feature = feature[:-1]  # drop the label column
    X = data[feature]
    Y = data['label']
    self.train_data, self.test_data, self.label_data, self.label_test = \
        train_test_split(X, Y, test_size=0.3, random_state=1)
    print("Training model......")
    self.model = svm.LinearSVC(random_state=0)
    self.model.fit(self.train_data, self.label_data)
    self.modelknn = knn(n_neighbors=7)
    self.modelknn.fit(self.train_data, self.label_data)
    joblib.dump(self.model, "/home/cse/Work/Dataset/MODEL.pkl")
    joblib.dump(self.modelknn, "/home/cse/Work/Dataset/MODELKNN.pkl")
    print("Finished training.")
    print()
    return
def knnOptimization(cmMetric='accuracy'):
    kvals = [i for i in range(1, 15)]
    kmodels = {}
    kpreds = {}
    kpreds_prob = {}
    cutoffgrid = np.linspace(0, 1, 100)
    numk = []
    for k in kvals:
        tknn = knn(n_neighbors=k).fit(XS, y)
        kmodels[k] = tknn
        kpreds[k] = tknn.predict(XS)
        # keep P(class 0) as a numpy array so the cutoff comparison below
        # broadcasts element-wise (a plain list would raise a TypeError)
        kpreds_prob[k] = tknn.predict_proba(XS)[:, 0]
    for k in kvals:
        tcm = [
            confusionMatrixInfo(kpreds_prob[k] < i, y, labels=[1, 0])[cmMetric]
            for i in cutoffgrid
        ]
        numk.append(max(tcm))
    # return the (1-indexed) k achieving the best metric, plus the full list
    best_k = int(np.argmax(numk)) + 1
    return best_k, numk
def sk_knn():
    train_labels = []
    train_flist = os.listdir("./digit/trainingDigits")
    train_len = len(train_flist)
    train_mat = np.zeros((train_len, 1024))
    for i, fname in enumerate(train_flist):
        flabel = int(fname.split("_")[0])
        train_mat[i, :] = mat2vector("./digit/trainingDigits/{}".format(fname))
        train_labels.append(flabel)
    knn_instance = knn(n_neighbors=3)  # TODO: n_neighbors <= 5 works best
    knn_instance.fit(train_mat, train_labels)
    test_flist = os.listdir("./digit/testDigits")  # test file list
    err_count = 0
    for fname in test_flist:
        test_label = int(fname.split("_")[0])
        test_mat = mat2vector("./digit/testDigits/{}".format(fname))
        # predict expects a 2-D array: one row per sample
        res = knn_instance.predict(np.array(test_mat).reshape(1, -1))
        if res != test_label:
            err_count += 1
            print("error: the predicted label is {}, the real label is {}".format(
                res, test_label))
            print("./digit/testDigits/{}".format(fname))
    print("the error rate is {}%".format(err_count / len(test_flist) * 100))
def knn_pca_std(n_neighbors, n_pca_components, X_train_df, X_test_df,
                y_train_df, y_test_df):
    """
    Function performs KNN with PCA using a standardized data set.
    Inputs:
        - n_neighbors - number of KNN nearest neighbors
        - n_pca_components - number of PCA components to use
        - X_train_df - dataframe containing X data for training
        - X_test_df - dataframe containing X data for testing
        - y_train_df - dataframe containing y data for training
        - y_test_df - dataframe containing y data for testing
    Returns:
        - KNN accuracy for the specified K and number of PCA components
    """
    # Standardize data
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train_df)
    X_train_std_df = scaler.transform(X_train_df)
    X_test_std_df = scaler.transform(X_test_df)

    # Conduct KNN on the PCA-transformed, standardized data
    KNN = knn(n_neighbors=n_neighbors)
    pca = PCA(n_components=n_pca_components)
    pca.fit(X_train_std_df)
    X_train_std_pca_df = pca.transform(X_train_std_df)
    X_test_std_pca_df = pca.transform(X_test_std_df)
    KNN.fit(X_train_std_pca_df, y_train_df)
    y_pred = KNN.predict(X_test_std_pca_df)
    accuracy = accuracy_score(y_test_df, y_pred)

    return accuracy
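# Hedged usage sketch for knn_pca_std, assuming the function's own imports
# (sklearn.preprocessing, sklearn.decomposition.PCA, accuracy_score, and the
# knn alias) are already in scope -- they are not shown in this snippet.
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X_w, y_w = load_wine(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X_w, y_w, test_size=0.3, random_state=0)
print(knn_pca_std(n_neighbors=5, n_pca_components=3,
                  X_train_df=X_tr, X_test_df=X_te,
                  y_train_df=y_tr, y_test_df=y_te))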
def knn_no_pca(n_neighbors, X_train_df, X_test_df, y_train_df, y_test_df):
    """
    Function performs KNN (no PCA) using a standardized data set.
    Inputs:
        - n_neighbors - number of KNN nearest neighbors
        - X_train_df - dataframe containing X data for training
        - X_test_df - dataframe containing X data for testing
        - y_train_df - dataframe containing y data for training
        - y_test_df - dataframe containing y data for testing
    Returns:
        - KNN accuracy
    """
    # Standardize data
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train_df)
    X_train_std_df = scaler.transform(X_train_df)
    X_test_std_df = scaler.transform(X_test_df)

    # Perform KNN
    KNN = knn(n_neighbors=n_neighbors)
    KNN.fit(X_train_std_df, y_train_df)
    y_pred = KNN.predict(X_test_std_df)
    accuracy = accuracy_score(y_test_df, y_pred)

    return accuracy
def predict_knn(X_train, X_test, y_train, y_test):
    clf = knn(n_neighbors=3)
    print("knn started")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    calc_accuracy("K nearest neighbours", y_test, y_pred)
    np.savetxt('submission_surf_knn.csv',
               np.c_[range(1, len(y_test) + 1), y_pred, y_test],
               delimiter=',', header='ImageId,Label,TrueLabel',
               comments='', fmt='%d')
def label_cluster(X, class_labels, center):
    clf = knn(n_neighbors=nbr, algorithm='kd_tree')
    # alternatives tried:
    # clf = tree.DecisionTreeRegressor()
    # clf = SVC(kernel='rbf', class_weight='balanced')
    clf.fit(X, class_labels)
    Y = clf.predict(center)
    return Y
def SelectModel(modelname, param):
    if modelname == "SVM":
        from sklearn.svm import LinearSVC
        model = LinearSVC(C=param)
    elif modelname == "GBDT":
        from sklearn.ensemble import GradientBoostingClassifier
        model = GradientBoostingClassifier()
    elif modelname == "RF":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier()
    elif modelname == "KNN":
        from sklearn.neighbors import KNeighborsClassifier as knn
        model = knn()
    elif modelname == "LR":
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(C=param)
    elif modelname == 'NB':
        from sklearn.naive_bayes import MultinomialNB
        model = MultinomialNB(alpha=1)
    elif modelname == "Softmax":
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(multi_class='multinomial',
                                   solver='lbfgs', C=param)
    return model
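# Usage sketch for SelectModel above (iris used purely for illustration;
# param is ignored for the KNN branch).
from sklearn.datasets import load_iris

X_i, y_i = load_iris(return_X_y=True)
model = SelectModel("KNN", param=None)
model.fit(X_i, y_i)
print(model.score(X_i, y_i))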
def SelectModel(modelname):
    if modelname == "SVM":
        model = SVC(kernel='rbf', C=16, gamma=0.0313, probability=True)
    elif modelname == "GBDT":
        model = GradientBoostingClassifier()
    elif modelname == "RF":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=500)
    elif modelname == "XGBOOST":
        from xgboost.sklearn import XGBClassifier
        model = XGBClassifier()
    elif modelname == "KNN":
        from sklearn.neighbors import KNeighborsClassifier as knn
        model = knn()
    elif modelname == "lgb":
        model = lgb.LGBMClassifier(n_estimators=500, max_depth=15,
                                   learning_rate=0.2)
    else:
        raise ValueError("unknown model name: {}".format(modelname))
    return model
def knnIrisDataSet(X, y, n):
    # define the classifier and fit it to the data
    res = 0.05
    k1 = knn(n_neighbors=n, p=2, metric='minkowski')
    # train on the data
    k1.fit(X, y)
    # define the mesh
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, res),
                           np.arange(x2_min, x2_max, res))
    # make the prediction
    Z = k1.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    # colors
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    # draw the decision surface
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap_light)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # draw the samples (one scatter call covers every class)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.show()
def knnDemo(X, y, n):
    # creates the classifier and fits it to the data
    res = 0.05
    k1 = knn(n_neighbors=n, p=2, metric='minkowski')
    k1.fit(X, y)
    # sets up the grid
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, res),
                           np.arange(x2_min, x2_max, res))
    # makes the prediction
    Z = k1.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    # creates the color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    # plots the decision surface
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap_light)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # plots the samples (one scatter call covers every class)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.show()
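# Usage sketch for knnDemo: the demo plots a 2-D decision surface, so only
# the first two iris features are passed in (assumes the numpy/matplotlib/
# knn imports used by the function above are already in scope).
from sklearn.datasets import load_iris

iris = load_iris()
knnDemo(iris.data[:, :2], iris.target, n=5)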
def sklearn_handwritingTest():
    # collect the training features
    train_mat = []
    train_labels = []
    for sub in os.listdir(train_digits):
        label = sub.split('_')[0]
        train_labels.append(label)
        vector = img2vector(os.path.join(train_digits, sub))
        train_mat.append(vector)
    train_mat = np.array(train_mat)
    neigh = knn(n_neighbors=3)
    neigh.fit(train_mat, train_labels)
    # run the test set
    error = 0.0
    total = 0.0
    for sub in os.listdir(test_digits):
        total += 1
        label = sub.split('_')[0]
        vector = img2vector(os.path.join(test_digits, sub))
        vector = np.array(vector).reshape(1, -1)  # 1d -> 2d
        pred_label = neigh.predict(vector)
        if pred_label != label:
            error += 1
    print('error rate: {}/{}'.format(error, total))
def handwritingClassTest():
    hwlabels = []
    trainingFileList = listdir("../../data/mnist/trainingDigits")
    # number of training files
    m = len(trainingFileList)
    # initialize the training matrix
    trainMat = np.zeros((m, 1024))
    # parse the training labels from the file names
    for i in range(m):
        fileNameStr = trainingFileList[i]
        classNumber = int(fileNameStr.split('_')[0])
        hwlabels.append(classNumber)
        trainMat[i, :] = img2vector("../../data/mnist/trainingDigits/" + fileNameStr)
    # build the knn classifier
    neign = knn(n_neighbors=3, algorithm='auto')
    # fit the model
    neign.fit(trainMat, hwlabels)
    # list the files under the testDigits directory
    testFileList = listdir("../../data/mnist/testDigits")
    errorCount = 0.0
    mTest = len(testFileList)
    # parse the test labels from the file names and classify each sample
    for i in range(mTest):
        fileNameStr = testFileList[i]
        classNumber = int(fileNameStr.split('_')[0])
        vectorUnderTest = img2vector("../../data/mnist/testDigits/" + fileNameStr)
        # get the prediction
        classifierResult = neign.predict(vectorUnderTest)
        print("predicted %d\ttrue label %d" % (classifierResult, classNumber))
        if classifierResult != classNumber:
            errorCount += 1
    print("misclassified %d samples\nerror rate: %f%%" % (errorCount,
                                                          errorCount / mTest * 100))
def getKNN(trainX, trainY):
    from sklearn.neighbors import KNeighborsClassifier as knn
    trainX = np.array(trainX)
    trainY = np.array(trainY)
    model = knn(n_neighbors=30, weights='distance')
    model.fit(trainX, trainY.ravel())
    return model
def knnclassify(traindata, trainlabel, target):
    """
    Uses KNN with sklearn's default k (n_neighbors=5).
    traindata and target have to be numpy arrays.
    """
    model = knn().fit(traindata, trainlabel)
    prediction = model.predict(target)
    return prediction
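# Usage sketch for knnclassify on tiny synthetic data (assumes the knn alias
# for KNeighborsClassifier is imported, as in the snippets above; note the
# default k of 5 needs at least 5 training samples).
import numpy as np

traindata = np.array([[0, 0], [0, 1], [0, 2], [5, 5], [5, 6], [6, 5]])
trainlabel = np.array([0, 0, 0, 1, 1, 1])
print(knnclassify(traindata, trainlabel, np.array([[0.9, 0.8]])))  # -> [0]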
def remove_outliers(self, X, k=20, q=0.1):
    # note: fit() is called without labels, so the knn alias here must refer
    # to an unsupervised model such as sklearn.neighbors.NearestNeighbors
    nneigh = knn(n_neighbors=k + 1)
    nneigh.fit(X)
    # distances to the k nearest neighbours, excluding each point itself
    dist = nneigh.kneighbors(X, return_distance=True)[0][:, 1:]
    dens = 1 / np.mean(dist, axis=1)
    # keep points whose local density is above the q-th quantile
    keepers = (dens >= np.quantile(dens, q))
    return keepers
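# Hedged usage sketch for remove_outliers under the NearestNeighbors
# assumption stated above. Since the method body never touches self, it is
# called unbound here purely for illustration.
import numpy as np
from sklearn.neighbors import NearestNeighbors as knn

rng = np.random.default_rng(0)
X_pts = np.vstack([rng.normal(0, 1, (200, 2)),   # dense cluster
                   rng.normal(8, 1, (5, 2))])    # a few far-away points
mask = remove_outliers(None, X_pts, k=20, q=0.1)
print(mask.sum(), "of", len(X_pts), "points kept")  # densest 90% survive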
def run_knn(train_features, train_labels, test_features, test_labels,
            iteration=None, k=None):
    clf = knn(n_neighbors=k)
    clf.fit(train_features, train_labels)
    acc = clf.score(test_features, test_labels)
    predicted_labels = clf.predict(test_features)
    cm = confusion_matrix(test_labels, predicted_labels)
    CROSS_VAL_ARRAY.append(acc)
def run_knn():
    clf = knn(n_neighbors=3)
    print("knn started")
    clf.fit(x, y)
    pred = clf.predict(x_)
    np.savetxt('submission_knn.csv',
               np.c_[range(1, len(test) + 1), pred, label_test],
               delimiter=',', header='ImageId,Label,TrueLabel',
               comments='', fmt='%d')
    calc_accuracy("K nearest neighbours", label_test, pred)
def test_data(X_test, center, y1):
    clf = knn(n_neighbors=1, algorithm='auto')
    # alternatives tried:
    # clf = RandomForestClassifier()
    # clf = SVC(kernel='rbf', class_weight='balanced')
    # clf = tree.DecisionTreeRegressor()
    clf.fit(center, y1)
    Y = clf.predict(X_test)
    return Y
def find_knn_k(max_k=max_knn):
    # Fits the knn model for k = 1..max_k and reports the highest accuracy
    # value and the k that achieves it.
    # initialize results arrays
    all_fpr, all_tpr, all_auc, all_acc = (np.zeros(max_k), np.zeros(max_k),
                                          np.zeros(max_k), np.zeros(max_k))
    # perform CV to find the best value of k
    for i in range(max_k):
        # perform train/test split
        tts = cv.train_test_split(features, labels, train_size=train_pct)
        train_features, test_features, train_labels, test_labels = tts

        # initialize model, perform fit
        kclf = knn(n_neighbors=i + 1)
        kclf.fit(train_features, train_labels)

        # get confusion matrix (requires predicted labels)
        predicted_labels = kclf.predict(test_features)
        cm = confusion_matrix(test_labels, predicted_labels)

        # calc ROC, AUC, and accuracy
        fpr, tpr, thresholds = roc_curve(test_labels, predicted_labels,
                                         pos_label=1)
        roc_auc = auc(fpr, tpr)
        acc = kclf.score(test_features, test_labels)

        # store all stats
        all_fpr[i] = fpr[1]
        all_tpr[i] = tpr[1]
        all_auc[i] = roc_auc
        all_acc[i] = acc

    print('Accuracy Matrix = \n', all_acc)
    print('\nMax accuracy = {0}'.format(max(all_acc)))
    print('\nK = {0}'.format(all_acc.argmax(axis=0) + 1))
    return all_acc, max_k, predicted_labels, test_labels
def __init__(self):
    """Constructor. Builds a KNN classifier tuned with a grid search over k."""
    super().__init__("KNN")
    grid_parameters = {'n_neighbors': range(2, 15)}
    # least populated class in y has only 3 members, so cv is set to 3
    self.knn = GridSearchCV(knn(), grid_parameters, cv=3, iid=False)
def elbow_curve(k):
    error_lst = []
    for i in k:
        # one knn instance per candidate k
        clf = knn(n_neighbors=i)
        clf.fit(train_x, train_y)
        tmp = clf.predict(test_x)
        tmp = m.accuracy_score(tmp, test_y)
        error = 1 - tmp
        error_lst.append(error)
    return error_lst
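# Usage sketch: plot the elbow curve to pick k (assumes train_x/train_y/
# test_x/test_y and the m alias for sklearn.metrics are in scope, as in the
# function above).
import matplotlib.pyplot as plt

k_range = range(1, 26)
plt.plot(k_range, elbow_curve(k_range), marker='o')
plt.xlabel('k')
plt.ylabel('test error')
plt.show()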
def ecur(k):
    error_test = []
    for i in k:
        c = knn(n_neighbors=i)
        c.fit(train_x, train_y)
        tmp = c.predict(test_x)
        tmp = metrics.accuracy_score(tmp, test_y)
        error = 1 - tmp
        error_test.append(error)
    return error_test
def eval_acc(X_train):
    global_train = []
    for jj in range(1, 101):
        clf = knn(n_neighbors=jj)
        clf.fit(X_train, Y_train)
        # training accuracy for each k from 1 to 100
        global_train.append(round(clf.score(X_train, Y_train), 2))
    for z in range(0, len(X_train_k)):
        X_train_k.iloc[z, :] = X_train_k.iloc[z, :] + global_train
    return X_train_k
def knnhelper(x, y):
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30,
                                                    random_state=42, stratify=y)
    neig = knn(n_neighbors=2, algorithm='kd_tree')
    neig = neig.fit(xtrain, ytrain)
    ans = neig.predict(xtest)
    TP, FP, TN, FN = perf_measure(ytest, ans)
def select_classify():
    return [
        naive(),
        tree(criterion="entropy"),
        knn(n_neighbors=8, weights='uniform', metric="manhattan"),
        mlp(hidden_layer_sizes=(128,), alpha=0.01, activation='tanh',
            solver='sgd', max_iter=300, learning_rate='constant',
            learning_rate_init=0.001),
    ]
from nltk.classify import SklearnClassifier

def getTrainedCLassifier(classifierType, train):
    if classifierType == "naiveBayes":
        from nltk.classify import NaiveBayesClassifier
        trainedClassifier = NaiveBayesClassifier.train(train)
    elif classifierType == "randomForest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        trainedClassifier = SklearnClassifier(rfc(n_estimators=25, n_jobs=2))
        trainedClassifier.train(train)
    elif classifierType == "knn5":
        from sklearn.neighbors import KNeighborsClassifier as knn
        trainedClassifier = SklearnClassifier(knn(n_neighbors=5))
        trainedClassifier.train(train)
    return trainedClassifier
def cal_cost_knn(x, trn, trg):
    # x is a binary feature-selection mask; round any real-valued entries
    x = list(map(int, np.round(x)))
    if sum(x) == 0:
        return np.inf, np.inf, 1
    x_index = [i for i in range(len(x)) if x[i] == 1]
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x_index, :]
    trn = np.transpose(trn)
    clf = knn(n_neighbors=nn)
    clf.fit(trn, trg)
    # resubstitution accuracy on the training data itself
    pre = clf.predict(trn)
    score = acc(pre, trg)
    error = 1 - score
    # weighted cost: classification error plus a feature-count penalty
    return ((1 - alpha) * error + alpha * (sum(x) * 1.0 / len(x)),
            error, sum(x) * 1.0 / len(x))
def predict_knn(X, y, X_train, X_test, y_train, y_test):
    clf = knn(n_neighbors=3)
    print("======= KNN =======")
    clf.fit(X_train, y_train)
    pickle.dump(clf, open('knn_trained_new.sav', 'wb'))
    y_pred = clf.predict(X_test)
    calc_accuracy("K nearest neighbours", y_test, y_pred)
    np.savetxt('submission_surf_knn.csv',
               np.c_[range(1, len(y_test) + 1), y_pred, y_test],
               delimiter=',', header='ImageId,Label,TrueLabel',
               comments='', fmt='%d')
def chooseClassification(name):
    print("Chosen classifier:", name)
    return {
        'NB': GaussianNB(),
        'ADA': adaBoost(n_estimators=50),
        'RF': rf(n_estimators=100),
        'KNN': knn(n_neighbors=15, p=1),
        'SVM': svm.SVC(kernel='rbf', probability=True),
        'BAG': BaggingClassifier(n_estimators=30),
        # other BaggingClassifier options tried:
        # base_estimator=knn(), bootstrap=True, bootstrap_features=True,
        # oob_score=True, max_features=10, max_samples=100
    }.get(name, GaussianNB())  # default: Gaussian Naive Bayes
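# Usage sketch for chooseClassification (iris used purely for illustration;
# assumes the aliases above -- adaBoost, rf, knn, svm, etc. -- are imported).
from sklearn.datasets import load_iris

X_c, y_c = load_iris(return_X_y=True)
clf = chooseClassification('KNN')
clf.fit(X_c, y_c)
print(clf.score(X_c, y_c))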
def find_features(step_num=1, num_cv=5):
    # Finds the best features for each model.
    # NOTE: recursive feature elimination only works for the Logistic
    # Regression model; KNN exposes no coefficients or feature importances,
    # so the KNN selector below stays commented out.
    model = LR()
    kclf = knn(n_neighbors=15)
    selector_LG = RFECV(model, step=step_num, cv=num_cv)
    selector_LG.fit(features, labels)
    # selector_KNN = RFECV(kclf, step=step_num, cv=num_cv)
    # selector_KNN.fit(features, labels)
    print('LG features')
    print(selector_LG.support_)
    print(selector_LG.ranking_, '\n')
def KNN(trainXY, testXY):
    # clf = knn(n_neighbors=3)
    params = {
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        'weights': ['uniform', 'distance'],
    }
    clf = knn()
    clf = grid_search.GridSearchCV(clf, params)
    # use the generic {} spec: {:s} raises for non-string objects
    print('clf = {}'.format(clf))
    clf.fit(trainXY[0], trainXY[1][:, 0])
    print('clf.best_estimator_ = {}'.format(clf.best_estimator_))
    clf = clf.best_estimator_
    prd = clf.predict(testXY[0])
    print('prd = {}'.format(prd))
    print('ans = {}'.format(testXY[1][:, 0].transpose()))
    print('accuracy = {:f}'.format(accuracy(prd, testXY[1][:, 0].transpose())))
def knnTrain(datafile, featureNum, fold=10):
    train, test = loaddata(datafile)
    row, col = train['counts'].shape
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    tuned_parameters = [{'n_neighbors': [2, 3, 4, 6, 10, 15, 18, 20, 30, 40, 50]}]
    model = knn(n_neighbors=1)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [X_train, y_train, X_test, y_test, categories, feature_names,
            featureNum, model, tuned_parameters, fold]
    clf, accuracy = cross_validation(*data)
    return accuracy
def plot_dby(iris):
    """Performs knn classification on the projected iris dataset, plots the
    results as well as the decision boundaries."""
    # project features into 2-dim space (for viz purposes)
    # NOTE "projection" just means that we're dropping the other features...
    # this is not the same thing as "feature selection" (which requires more
    # care) or "dimensionality reduction" (which requires more math)
    X = iris.data[:, :2]  # keep only the first two features
    y = iris.target

    # initialize & fit knn model
    clf = knn(n_neighbors=NUM_NBRS)
    clf.fit(X, y)

    # create x, y mesh to plot decision boundaries
    x_min = -1 + X[:, 0].min()
    y_min = -1 + X[:, 1].min()
    x_max = 1 + X[:, 0].max()
    y_max = 1 + X[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, MESH_SIZE),
                         np.arange(y_min, y_max, MESH_SIZE))

    # create predictions & reshape to fit the mesh
    # (no train vs test split -> the "test set" is every point in the 2d plane)
    preds = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    preds = preds.reshape(xx.shape)

    # plot prediction results
    pl.figure()
    pl.pcolormesh(xx, yy, preds, cmap=COLORS_1)

    # plot training examples
    pl.scatter(X[:, 0], X[:, 1], c=y, cmap=COLORS_2)

    # set additional plot parameters
    pl.xlim(xx.min(), xx.max())
    pl.ylim(yy.min(), yy.max())
    pl.title('knn classification of iris dataset (k = {0})'.format(NUM_NBRS))
    pl.show()
def sentPred(trainfile, testfile, result, report):
    traindata = np.loadtxt(trainfile)
    testdata = np.loadtxt(testfile)
    x_train = traindata[:, 1:]
    y_train = traindata[:, 0]
    y_pred_stan = traindata[:, -1]
    score_train_stan = ascore(y_train, y_pred_stan)
    rep_train_stan = prf(y_train, y_pred_stan, average=None)

    clf_lda = lda()
    clf_lda.fit(x_train, y_train)
    y_pred_lda = clf_lda.predict(x_train)
    score_train_lda = ascore(y_train, y_pred_lda)
    rep_train_lda = prf(y_train, y_pred_lda, average=None)
    test_pred_lda = clf_lda.predict(testdata)

    clf_log = log()
    clf_log.fit(x_train, y_train)
    y_pred_log = clf_log.predict(x_train)
    score_train_log = ascore(y_train, y_pred_log)
    rep_train_log = prf(y_train, y_pred_log, average=None)
    test_pred_log = clf_log.predict(testdata)

    clf_knn = knn(n_neighbors=1)
    clf_knn.fit(x_train, y_train)
    y_pred_knn = clf_knn.predict(x_train)
    score_train_knn = ascore(y_train, y_pred_knn)
    rep_train_knn = prf(y_train, y_pred_knn, average=None)
    test_pred_knn = clf_knn.predict(testdata)

    # a row of 9s separates the three models' predictions
    separator = np.array((9,))
    test_pred = np.concatenate((test_pred_lda, separator,
                                test_pred_log, separator, test_pred_knn))
    np.savetxt(result, test_pred, fmt='%i')
    np.savetxt(report,
               rep_train_stan + rep_train_lda + rep_train_log + rep_train_knn,
               fmt='%10.5f')
    # append the accuracy scores in text mode ('ab' would require bytes)
    f = open(report, 'a')
    f.write('stan: ' + str(score_train_stan) + '\n')
    f.write('lda: ' + str(score_train_lda) + '\n')
    f.write('log: ' + str(score_train_log) + '\n')
    f.write('knn: ' + str(score_train_knn) + '\n')
    f.close()
def train_and_classify(nn=5, X=records, y=labels, n_folds=3):
    kf = cv.KFold(n=len(X), n_folds=n_folds, shuffle=True)
    accs = []
    for k, (train_idxs, test_idxs) in enumerate(kf):
        # Get all train/test samples for this fold.
        # KFold yields positional indices, so use iloc rather than loc.
        print("*" * 10 + "kNN" + "*" * 10)
        print(train_idxs)
        print(test_idxs)
        train_X = X.iloc[train_idxs]
        train_y = y.iloc[train_idxs]
        test_X = X.iloc[test_idxs]
        test_y = y.iloc[test_idxs]

        # Train the model
        model = knn(n_neighbors=nn)
        model.fit(train_X, train_y)

        # Test the model
        acc = model.score(test_X, test_y)
        print(acc)
        accs.append(acc)
        pred_y = model.predict(test_X)
        cm = confusion_matrix(test_y, pred_y)
        print(cm)

        # Train the model with LR
        print("*" * 10 + "LR" + "*" * 10)
        modelLR = LR()
        modelLR.fit(train_X, train_y)

        # Test the model with LR
        accLR = modelLR.score(test_X, test_y)
        print(accLR)
        pred_y = modelLR.predict(test_X)
        cmLR = confusion_matrix(test_y, pred_y)
        print(cmLR)
#################### SEPARATING EVALUATION DATA #########################
# X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(
#     all_feature_matrix, y, test_size=0.2, random_state=0)
###############################################################################
# Classification
# Run classifier with cross-validation and plot ROC curves
folds = 10
cv = StratifiedKFold(y_all, n_folds=folds, shuffle=True)
# cv_shufflesplit = cross_validation.ShuffleSplit(len(y_all), 1, test_size=0.2,
#                                                 train_size=None, random_state=0)
# classifier = svm.SVC(kernel='linear', probability=True)
# classifier = RandomForestClassifierWithCoef(RandomForestClassifier)
classifier = knn(n_neighbors=3)

all_indexes = []
index_list = []
y_test_report = []
y_predicted_report = []
y_proba_report = []

for i, (train, test) in enumerate(cv):
    # prepare and normalize train/test matrices
    normalized_matrix_train = cl.normalise_mean_var(all_feature_matrix[train])
    normalised_matrix_test = cl.normalise_mean_var(all_feature_matrix[test])
    y_predicted2 = []
def model_rank_loo():
    # Uses a leave-one-out CV iterator and fits a knn and a logistic
    # regression model to rank performance for model selection.
    kfloo = cv.LeaveOneOut(num_recs)

    # result arrays for leave-one-out CV
    LG_fpr, LG_tpr, LG_auc, LG_acc = (np.zeros(num_recs), np.zeros(num_recs),
                                      np.zeros(num_recs), np.zeros(num_recs))
    KNN_fpr, KNN_tpr, KNN_auc, KNN_acc = (np.zeros(num_recs), np.zeros(num_recs),
                                          np.zeros(num_recs), np.zeros(num_recs))

    for i, (traini, testi) in enumerate(kfloo):
        # initialize models
        model = LR()
        kclf = knn(n_neighbors=15)

        # make sure the records don't have null values
        train_features = features.iloc[traini].dropna()
        train_labels = labels.iloc[traini]
        test_features = features.iloc[testi].dropna()
        test_labels = labels.iloc[testi]

        # perform fit
        kclf.fit(train_features, train_labels)
        results_LG = model.fit(train_features, train_labels)

        # predict the labels
        predict_LG = results_LG.predict(test_features)
        predict_KNN = kclf.predict(test_features)

        print('Index =', i, '\n')
        print('Logistic Regression Classifier Stats \n')
        print('True class')
        print(test_labels, '\n')
        print('Predicted Class')
        print(predict_LG[0])
        print('\n' * 2)
        print('+' * 80)
        print('KNN Classifier Stats \n')
        print('True class')
        print(test_labels, '\n')
        print('Predicted Class')
        print(predict_KNN[0])
        print('\n' * 2)

        # Update the results arrays with 1 if the classifier was correct;
        # the mean of each array approximates each model's accuracy.
        # test_labels is a one-row Series, so compare its scalar value.
        LG_acc[i] = 1 if test_labels.iloc[0] == predict_LG[0] else 0
        KNN_acc[i] = 1 if test_labels.iloc[0] == predict_KNN[0] else 0

        print('*' * 80, '\n', '*' * 80)

    print('\n', '@_' * 40)
    print('Logistic Regression Model accuracy on trials =\n', LG_acc, '\n')
    print('Mean LG accuracy = {0}'.format(np.mean(LG_acc)), '\n')
    print('KNN Model accuracy on trials =\n', KNN_acc, '\n')
    print('Mean KNN accuracy = {0}'.format(np.mean(KNN_acc)))
    print('\n' * 2)
imgsize = 28
td = np.array(td_df)
tc = np.array(tc_df)
tsd = np.array(tsd_df)
test_data = np.array(test_data_df)
print(test_data.shape)
test_data = np.reshape(test_data, (test_data.shape[0], imgsize, imgsize))
test_class = np.array(test_class_df)
dtsize = test_data.shape[0]

for nb in range(80, 79, -1):
    # only the last assignment takes effect: the knn model is the one used
    mdl = SVC(C=c, kernel='rbf', degree=1, tol=0.0001)
    mdl = rfc(n_estimators=100, criterion='entropy', min_samples_leaf=5,
              min_samples_split=10, max_features=8)
    mdl = knn(n_neighbors=nb)
    mdl.fit(td, tc)
    for i in range(dtsize):
        td_index = []
        for k in range(repl_fact):
            td_index.append(dtsize * k + i)
        tsd_1 = np.array(tsd[td_index, :])
        tst_class_act = test_class[i]
        tst_class_pred_df = pd.DataFrame(mdl.predict(tsd_1))
        try:
            tst_class_pred_l = list(tst_class_pred_df.mode().iloc[0])
feature_train = npzfile['x']
label_train = npzfile['y']
npzfile = np.load('feature_test_ECG_MTS.npz')
feature_test = npzfile['x']
label_test = npzfile['y']

train_num = feature_train.shape[0]
windowWidth = 0.05   # 0.01
numSymbols = 3       # 12
alphabetSize = 12    # 7
feature = np.vstack((feature_train, feature_test))
bops = bop_vec(feature, windowWidth=int(windowWidth * feature.shape[1]),
               numSymbols=numSymbols, alphabetSize=alphabetSize)
bop_train = bops[:train_num]
bop_test = bops[train_num:]

#%%
clf = knn(n_neighbors=1)
clf.fit(bop_train, label_train)
print(clf.score(bop_train, label_train))
print(clf.score(bop_test, label_test))

#%%
clf = svm.LinearSVC()
clf.fit(bop_train, label_train)
# integer division keeps cv a valid (integer) fold count
svctrain = cross_validation.cross_val_score(clf, bop_train, label_train,
                                            cv=len(label_train) * 2 // 3)
svctrain = np.mean(svctrain)
svctest = clf.score(bop_test, label_test)
print(clf.score(bop_train, label_train))
print(clf.score(bop_test, label_test))
print(svctrain, svctest)

#%%
clf = knn(n_neighbors=3)
import pandas as pn
import numpy as np
# sklearn.cross_validation was removed; model_selection is the current module
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.preprocessing import scale

data = pn.read_csv('wine.data')

fields = [str(i) for i in range(2, 15)]
X = data[fields]
Y = data['1']

kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for k in range(1, 50):
    results.append((k, np.mean(cross_val_score(estimator=knn(n_neighbors=k),
                                               cv=kf, X=X, y=Y))))
print(max(results, key=lambda x: x[1]))

X = scale(X)
results = []
for k in range(1, 50):
    results.append((k, np.mean(cross_val_score(estimator=knn(n_neighbors=k),
                                               cv=kf, X=X, y=Y))))
print(max(results, key=lambda x: x[1]))
test_labels = test['test_label']
test_edges = test['test_edges']

# Preprocessing: normalize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)

# Preprocessing: RandomizedPCA
# pca = RandomizedPCA(n_components=15)
# pca.fit(train_data)

# note: refitting the scaler on validation/test data leaks their statistics;
# the usual practice is to reuse the scaler fitted on train_data above
scaler.fit(valid_data)
valid_data = scaler.transform(valid_data)
scaler.fit(test_data)
test_data = scaler.transform(test_data)
# valid_data = pca.transform(valid_data)

clf = knn(n_neighbors=21, p=1)
clf = clf.fit(train_data, train_labels.ravel())
print(clf.score(valid_data, valid_labels.ravel()))
print(clf.score(test_data, test_labels.ravel()))

"""
for file_num in range(210, 213):  # test_files_count):  # see test results
    sp_file_names = data['sp_file_names'][file_num].strip()
    im_file_names = data['im_file_names'][file_num].strip()

    # Extract features from image files
    fe = Feature()
    fe.loadImage(im_file_names)
    fe.loadSuperpixelImage()
    test_data = fe.getFeaturesVectors()
    # edges, feat = fe.getEdges()
from sklearn.svm import SVC

iris_data = datasets.load_iris()
print(iris_data)
dt = iris_data.data
lbls = iris_data.target

# train a KNN and see how it performs; keep 50000 for training, 10000 for
# validation and 10000 for the final test
num_fold = 10
gen_k_sets = StratifiedKFold(lbls, num_fold)
ab = []
for nb2 in range(1, 31, 1):
    mdl2 = knn(n_neighbors=nb2)
    for nb in range(11, 12, 1):
        dst_mdl = nn(n_neighbors=nb)
        overall_mis = 0
        mdl = SVC(C=1.0)
        # mdl = rfc(n_estimators=100)
        # mdl = knn(n_neighbors=1)
        for train_index, test_index in gen_k_sets:
            train_data, test_data = dt[train_index], dt[test_index]
            train_class, test_class = lbls[train_index], lbls[test_index]
            tr_dts = []
            tr_clses = []
            print()
            for k in range(3):
    scaleInput = (scaleInput - mean) / std
    return scaleInput

# takes the vectors of results
def accuracy(predicted, actual):
    trues = 0
    for x in range(len(predicted)):
        if predicted[x] == actual[x]:
            trues += 1
    return trues / len(predicted)

diabetesData = pd.read_csv("diabetes.csv", header=0)
classValues = diabetesData["class"]
# print(accuracy(classValues, classValues))
del diabetesData["class"]
scaledData = doRelativeScaling(diabetesData)
# print(scaledData)

neigh = knn(n_neighbors=1)
neigh.fit(scaledData, classValues)
print(neigh.predict([[1.3, 1.6, 1.9, 0.7, 5, 2, 1, 5]]))
def model_rank(num_fold=10):
    # Fits a knn and a logistic regression model to rank performance for
    # model selection.
    kf = cv.KFold(n=num_recs, n_folds=num_fold, shuffle=True)

    # initialize result arrays
    LG_fpr, LG_tpr, LG_auc, LG_acc = (np.zeros(num_fold), np.zeros(num_fold),
                                      np.zeros(num_fold), np.zeros(num_fold))
    KNN_fpr, KNN_tpr, KNN_auc, KNN_acc = (np.zeros(num_fold), np.zeros(num_fold),
                                          np.zeros(num_fold), np.zeros(num_fold))

    for i, (traini, testi) in enumerate(kf):
        # initialize models
        model = LR()
        kclf = knn(n_neighbors=15)

        # make sure the records don't have null values
        train_features = features.iloc[traini].dropna()
        train_labels = labels.iloc[traini].dropna()
        test_features = features.iloc[testi].dropna()
        test_labels = labels.iloc[testi].dropna()

        # perform fit
        kclf.fit(train_features, train_labels)
        results_LG = model.fit(train_features, train_labels)

        # predict the labels
        predict_LG = results_LG.predict(test_features)
        predict_KNN = kclf.predict(test_features)

        # accuracy and reports for the LG model
        # NOTE: ROC analysis only works for binary classification problems
        # fpr_LG, tpr_LG, thresholds_LG = roc_curve(test_labels, predict_LG,
        #                                           pos_label=1)
        # roc_auc_LG = auc(fpr_LG, tpr_LG)
        print('Logistic Regression Classifier Stats \n')
        print('True class')
        print(test_labels, '\n')
        acc_LG = model.score(test_features, test_labels)
        print('acc =', acc_LG)
        print(confusion_matrix(test_labels, predict_LG), '\n')
        print(classification_report(test_labels, predict_LG, [1, 2, 3],
                                    target_names=targets))
        print('LG kappa =', kappa(test_labels, predict_LG))
        print('+_' * 40)
        LG_acc[i] = acc_LG

        # accuracy and reports for the KNN model
        print('KNN Classifier Stats \n')
        print('Predicted Class')
        print(predict_KNN, '\n')
        acc_KNN = kclf.score(test_features, test_labels)
        print('acc =', acc_KNN)
        print(confusion_matrix(test_labels, predict_KNN), '\n')
        print(classification_report(test_labels, predict_KNN, [1, 2, 3],
                                    target_names=targets))
        print('KNN kappa =', kappa(test_labels, predict_KNN))
        print('*' * 80)
        KNN_acc[i] = acc_KNN

    print('\n', '@_' * 40)
    print('Logistic Regression Model accuracy on trials =\n', LG_acc, '\n')
    print('Mean LG accuracy = {0}'.format(np.mean(LG_acc)), '\n')
    print('KNN Model accuracy on trials =\n', KNN_acc, '\n')
    print('Mean KNN accuracy = {0}'.format(np.mean(KNN_acc)))
# ridge is helpful if we are afraid of making our model too biased by our
# training data
# lasso is helpful if we are unsure of which features we should try to eliminate
# C is related to the amount of penalty we apply. C is the inverse of alpha.
# Alpha is the multiplier we use to apply penalty. As alpha increases, more
# penalty is added, and the model becomes more sensitive to larger coefficients.

# If we change our threshold to .90, we would be much more confident that our
# predicted survivals were true survivals. However, we would become less
# confident that our deaths were true deaths. We would reduce our false
# positive rate, but increase our false negative rate.

# KNN
# does not perform as well as LogReg, even with GridSearch
knn_clf = knn()   # bind the instance to its own name so the knn class stays usable
knn_clf.fit(x_train, y_train)
knn_clf.score(x_test, y_test)

gridknn = skgs(knn_clf, {'n_neighbors': range(1, 55)}, cv=12, scoring='accuracy')
gridknn.fit(x_train, y_train)
print(gridknn.best_estimator_)
print(gridknn.score(x_test, y_test))

# As we use more neighbors, our model becomes more biased because it becomes
# less complex (smoother). Logistic regression is usually a better choice
# than KNN because it is a more sophisticated model and it requires less
# storage to run. KNN is a good model if you are looking for something
# simple, the data set is not too large, and you want as transparent a model
# as possible.
knnpred = gridknn.predict(x_test)
print(skcm(y_test, knnpred))
# with knn, we got more true negatives, and fewer false positives,
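# A minimal sketch of the C-vs-alpha point made above: C is the inverse of
# the regularisation strength, so shrinking C adds penalty and shrinks the
# learned coefficients (synthetic data, purely for illustration).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

Xd, yd = make_classification(n_samples=200, n_features=5, random_state=0)
for C in (100.0, 1.0, 0.01):
    lr = LogisticRegression(C=C, max_iter=1000).fit(Xd, yd)
    print(C, np.abs(lr.coef_).mean())   # mean |coef| shrinks as C decreases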
print(dt.shape)
num_fold = 10
gen_k_sets = StratifiedKFold(lbls, num_fold, shuffle=True)
ab = []
overall_mis = 0
err = []
c = 1.0
# only the last assignment takes effect: the knn model is the one used
mdl = SVC(C=c, kernel='rbf', degree=1, tol=0.0001)
mdl = rfc(n_estimators=100, criterion='entropy', min_samples_leaf=5,
          min_samples_split=10, max_features=8)
mdl = knn(n_neighbors=1)

imgsize = 8
patchsize = 6
ab = []
for train_index, test_index in gen_k_sets:
    train_data, test_data = dt[train_index], dt[test_index]
    train_class, test_class = lbls[train_index], lbls[test_index]
    dtsize = train_data.shape[0]
    train_data = train_data.reshape(dtsize, imgsize, imgsize)
    c1 = train_data[:, 0:patchsize, 0:patchsize]
    '''
    a = c1[0, :, :]
    print a.shape
    print a
dgts_data = pd.read_csv("abcd.csv", index_col=0)
print(dgts_data.head())
print(dgts_data.shape)
dgts_data = np.array(dgts_data)
print(dgts_data.shape)

dgts_lbl = pd.read_csv("abcd_l.csv", index_col=0)
# print(dgts_lbl.head())
print(dgts_lbl.shape)
dgts_lbl = np.array(dgts_lbl)
print(dgts_lbl.shape)

mdl = knn()
gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.3)
for train_index, test_index in gen_k_sets:
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    # ravel() flattens the (n, 1) label array into the 1-D shape sklearn expects
    mdl.fit(train_data, train_class.ravel())
    print(mdl.score(test_data, test_class.ravel()))

clust_data = test_data
print(clust_data.shape)
pca = PCA(n_components=100)
pca.fit(clust_data)
tr_dt_p = pca.transform(clust_data)