def knnSimulate(param):
    trainSet = SimData.simulate2Group(
        n=int(param['n']),
        p=int(param['p']),
        effect=[param['effect']] * int(param['p'])
    )
    knnFit = KNeighborsClassifier(n_neighbors=int(param['k']))
    knnFit.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    testSet = SimData.simulate2Group(
        n=int(param['n']),
        p=int(param['p']),
        effect=[param['effect']] * int(param['p'])
    )
    out = OrderedDict()
    out['p'] = int(param['p'])
    out['k'] = int(param['k'])
    out['train'] = trainSet
    out['test'] = testSet
    out['resubPreds'] = knnFit.predict(trainSet['x'])
    out['resubProbs'] = knnFit.predict_proba(trainSet['x'])
    out['testPreds'] = knnFit.predict(testSet['x'])
    out['testProbs'] = knnFit.predict_proba(testSet['x'])
    out['resubTable'] = pd.crosstab(
        Series(out['resubPreds'], index=trainSet['y'].index),
        trainSet['y']
    )
    out['resubAccuracy'] = (np.sum(np.diag(out['resubTable'])) /
                            (1.0 * np.sum(np.sum(out['resubTable']))))
    out['testTable'] = pd.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y']
    )
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
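A minimal usage sketch of the function above; the param keys mirror the ones it reads, and the values shown are hypothetical. Resubstitution accuracy (scored on the training set) is typically optimistic relative to the held-out test accuracy.

param = {'n': 100, 'p': 5, 'k': 3, 'effect': 0.5}  # hypothetical settings
res = knnSimulate(param)
print(res['resubAccuracy'], res['testAccuracy'])  # resub is usually the higher of the two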
def main(output=RESULTS1B):
    """
    Using 1 nearest neighbor, predicts NYC Taxi trip times based on feature
    vectors (pickup latitude, pickup longitude, dropoff latitude, dropoff
    longitude). Tests on a subset of trip_data_1.csv.
    Uses sklearn to implement nearest neighbors.
    """
    features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
                'dropoff_longitude', 'trip_time_in_secs']

    ## Extract necessary data into pandas dataframes
    numrows = 100000
    df_train_read = pd.read_csv(TRAIN_DATA)
    df_test_read = pd.read_csv(TRIP_DATA_1, nrows=numrows)  # first 100k rows, for speed
    df_test = df_test_read[features].dropna()
    df_train = df_train_read[features].dropna()

    ## Use sklearn to run nearest neighbors
    k = 1
    clf = KNeighborsClassifier(n_neighbors=k)  # default distance metric: euclidean
    clf.fit(df_train[features[0:4]], df_train[features[-1]])
    preds = clf.predict(df_test[features[0:4]])

    ## Calculate statistics (Root Mean Squared Error, Correlation Coefficient, Mean Absolute Error)
    print "Calculating statistics"
    with open(output, "a+") as outputFile:
        outputFile.write("Ran knn with k={}".format(k) +
                         " Trained on {}. Tested on first".format(TRAIN_DATA) +
                         " {} rows of {}. Stats:".format(numrows, TRIP_DATA_1))
    calcAndLogStats(
        numpy.array(preds),
        numpy.array(df_test[features[-1]]),
        output=output)
def train():
    data = Prediction.objects.filter(predict=False)
    df = pd.DataFrame(list(data.values()))
    users = [o.user for o in data]
    df['age'] = pd.DataFrame([user.userprofile.age for user in users])
    for x in ['id', 'base_personal', 'base_general', 'predict', 'created',
              'user_id', 'training_id']:
        df = df.drop(x, axis=1)
    y = df['next_level']
    df = df.drop('next_level', axis=1)

    # neigh = svm.SVC()
    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(df, y)

    # kf = KFold(len(ft[features]), n_folds=10)
    kf = ShuffleSplit(len(df), n_iter=K, test_size=test_size, random_state=0)
    # score is accuracy here
    accuracy = cross_val_score(neigh, df, y, cv=kf)
    n_train_folds = int((1 - test_size) * K)  # slice indices must be ints
    batch = Training.objects.create(
        training_accuracy=sum(accuracy[:n_train_folds]) / K / (1 - test_size),
        sample_size=len(df.index),
        fold=K,
        subset_accuracy=json.dumps(accuracy.tolist()),
        test_accuracy=sum(accuracy[n_train_folds:]) / K / test_size)
    Prediction.objects.filter(predict=False).update(training=batch)

    if not os.path.exists('./models'):
        os.makedirs('./models')
    joblib.dump(neigh, './models/model.pkl')
def kann_classify(train_data, train_label, test_data):
    knnClf = KNeighborsClassifier(n_neighbors=5)
    knnClf.fit(train_data, ravel(train_label))
    test_label = knnClf.predict(test_data)
    save_result(test_label, 'sklearn_knn_Result.csv')
    return test_label
def _evaluate_projection(self, x, y):
    """
    kNNEvaluate - evaluate class separation in the given projection
    using a k-NN method.

    Parameters
    ----------
    x : variables to evaluate
    y : class

    Returns
    -------
    scores
    """
    if self.percent_data_used != 100:
        rand = np.random.choice(len(x), int(len(x) * self.percent_data_used / 100),
                                replace=False)
        x = x[rand]
        y = y[rand]
    neigh = KNeighborsClassifier(n_neighbors=3) if self.attr_color.is_discrete else \
        KNeighborsRegressor(n_neighbors=3)
    assert not (np.isnan(x).any(axis=None) or np.isnan(y).any(axis=None))
    neigh.fit(x, y)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        scores = cross_val_score(neigh, x, y, cv=3)
    return scores.mean()
def kppv_vecteur():
    "Interpret images as pixel vectors and classify with k nearest neighbors"
    best = np.zeros(6)
    for npix in range(50, 200, 50):
        _, data, target, _ = utils.chargementVecteursImages(mer, ailleurs, 1, -1, npix)
        X_train, X_test, Y_train, Y_test = train_test_split(
            data, target, test_size=0.3, random_state=random.seed())
        for iterations in range(250, 1000, 250):
            for n in range(2, 12, 2):
                for param in range(1, 3):
                    start_time = time.time()
                    kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                    x1 = np.array(X_train)
                    x1 = np.reshape(x1, (x1.shape[0], x1.shape[2]))
                    x2 = np.array(X_test)
                    x2 = np.reshape(x2, (x2.shape[0], x2.shape[2]))
                    kppv.fit(X=x1, y=Y_train)
                    score = kppv.score(x2, Y_test)
                    end_time = time.time()
                    if score > best[0]:
                        best[0] = score
                        best[1] = iterations
                        best[2] = n
                        best[3] = param
                        best[4] = end_time - start_time
                        best[5] = npix
    print("| K plus proches voisins | V.Pix {:4.0f} | n={:1.0f} param={:1.0f} iterations={:1.0f} | {:10.3f}ms | {:1.3f} |".format(best[5], best[2], best[3], best[1], best[4] * 1000, best[0]))
def kppv_histo():
    "Interpret images as color histograms and classify with k nearest neighbors"
    best = np.zeros(5)
    _, data, target, _ = utils.chargementHistogrammesImages(mer, ailleurs, 1, -1)
    X_train, X_test, Y_train, Y_test = train_test_split(
        data, target, test_size=0.3, random_state=random.seed())
    for iterations in range(250, 1000, 250):
        for n in range(2, 12, 2):
            for param in range(1, 3):
                start_time = time.time()
                kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                x1 = np.array(X_train)
                x2 = np.array(X_test)
                kppv.fit(X=x1, y=Y_train)
                score = kppv.score(x2, Y_test)
                end_time = time.time()
                if score > best[0]:
                    best[0] = score
                    best[1] = iterations
                    best[2] = n
                    best[3] = param
                    best[4] = end_time - start_time
    print("| K plus proches voisins | V.Histo | n={:1.0f} param={:1.0f} iterations={:1.0f} | {:10.3f}ms | {:1.3f} |".format(best[2], best[3], best[1], best[4] * 1000, best[0]))
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'],
                               weights='distance', metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
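A tiny self-contained illustration of the top-3 trick used above: argsort the probability row ascending, reverse it, and keep the first three class indices (toy numbers, not from the source data).

import numpy as np

probs = np.array([[0.1, 0.5, 0.2, 0.2]])          # one test row, four classes
print(np.argsort(probs, axis=1)[:, ::-1][:, :3])  # -> [[1 3 2]]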
def knn_accuracy(trn_data, trn_labels, tst_data, tst_labels, k_neighbors):
    knn = KNeighborsClassifier(k_neighbors)
    knn.fit(trn_data, trn_labels)
    results = knn.predict(tst_data)
    return np.sum(tst_labels == results) / float(tst_labels.size)
def BuildModel(self, data, labels):
    # Create and train the classifier.
    knc = KNeighborsClassifier(n_neighbors=self.n_neighbors,
                               algorithm=self.algorithm,
                               leaf_size=self.leaf_size,
                               metric=self.metric)
    knc.fit(data, labels)
    return knc
def analyze_image(self):
    '''
    Load the image and analyze it with KNN.
    im_file - pre-processed with histogram specification
    '''
    if self._avg_pixels.size == 0:
        self._process_annotations()
        self._get_initial_classes()

    im = self._image
    rows = im.shape[0]

    clf = KNeighborsClassifier(n_neighbors=self._n_neighbors)
    clf.fit(self._avg_pixels, self._labels)
    im_1d = im.reshape(-1, 3)

    # calculate prediction, then reshape back into an image
    prediction = clf.predict(im_1d)
    prediction = prediction.reshape(rows, -1)
    prediction[self._mask == 0] = Labels.Masked

    self.display_current(prediction)
    return prediction
def main():
    # obtain the number of features in the dataset
    with open('../data/test_lung_s3.csv', 'rb') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            num_columns = len(row)
            break
    print num_columns

    # load data
    mat = np.loadtxt('../data/test_lung_s3.csv', delimiter=',',
                     skiprows=1, usecols=range(0, num_columns))
    X = mat[:, 1:num_columns]  # data
    y = mat[:, 0]  # label
    X = X.astype(float)
    y = y.astype(float)
    n_samples, n_features = X.shape

    # using 10 fold cross validation
    cv = KFold(n_samples, n_folds=10, shuffle=True)

    # evaluation
    n_features = 100
    neigh = KNeighborsClassifier(n_neighbors=1)
    acc = 0
    for train, test in cv:
        idx = svm_backward.svm_backward(X[train], y[train], n_features)
        print idx
        X_selected = X[:, idx]
        neigh.fit(X_selected[train], y[train])
        y_predict = neigh.predict(X_selected[test])
        acc_tmp = accuracy_score(y[test], y_predict)
        print acc_tmp
        acc += acc_tmp
    print 'ACC', float(acc) / 10
def run(class_num, subsample_size, cluster_num, window_size, method='knn', n_nb=2):
    # Loads data at the patch size defined (3 means 3*3 = 9 pixels per patch)
    # and returns a dictionary containing: 'data' (one patch), 'target' (the
    # sample the patch belongs to), and 'filename' (the file it comes from).
    bofs = []
    lable = []
    filename = "%s/TRAIN_VLAD_%d_%d_%d_%d.txt" % (vlad_accee, class_num,
                                                  subsample_size, window_size,
                                                  cluster_num)
    bofs, lable = get_vlad(filename)

    # knn_init = KNeighborsClassifier()
    # parameters = {'n_neighbors': [5, 10, 15]}
    # knn = grid_search.GridSearchCV(knn_init, parameters)

    bofs_test = []
    lable_test = []
    filename = "%s/TEST_VLAD_%d_%d_%d_%d.txt" % (vlad_accee, class_num,
                                                 subsample_size, window_size,
                                                 cluster_num)
    bofs_test, lable_test = get_vlad(filename)

    start = time.time()
    if method == "knn":
        knn = KNeighborsClassifier(n_neighbors=n_nb)
        knn.fit(bofs, lable)
        predicted = knn.predict(bofs_test)
        score = knn.score(bofs_test, lable_test)
        print(time.time() - start)
        return score
def main_process():
    data_dict = parse_txt()
    x_data, y_data, places_cnt, path_int_dict = build_x_y_data(data_dict)
    print 'data counts', len(x_data), len(y_data)
    print 'zone names counts', places_cnt
    print 'path counts', len(path_int_dict)

    # start to train, change list type to numpy.array
    x_data = np.array(x_data)
    y_data = np.array(y_data)
    knn = KNeighborsClassifier()
    indices = np.random.permutation(len(x_data))
    x_train = x_data
    y_train = y_data
    x_test = x_data[indices[-TEST_DATA_ROWS:]]
    y_test = y_data[indices[-TEST_DATA_ROWS:]]
    knn.fit(x_train, y_train)  # work
    test_result = knn.predict(x_test)  # test
    proba_test_result = knn.predict_proba(x_test)

    # no duplicate values, so the dictionary can be reversed
    int_path_dict = dict(zip(path_int_dict.values(), path_int_dict.keys()))
    print 'predict result:', test_result
    print [int_path_dict[x] for x in test_result]  # test result
def train(x_train, y_train):
    # reg = LinearRegression()
    reg = KNeighborsClassifier()
    reg.fit(x_train, y_train)
    return reg
def exercise_2a():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = 1. - clf.score(X_test, y_test)
            iterator += 1
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        # accuracy_lst[k-1, 1] = accuracy_current.std()  # confidence interval 95%

    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def build_model():
    """
    request cip and skill data from DataUSA and develop predictive model
    using scikit-learn
    :return: fit model ready to accept user input to make a prediction
    """
    # request data on college majors and relevant skills
    r = requests.get(r'http://api.datausa.io/api/?show=skill&sumlevel=all')
    data_usa = r.json()
    headers = data_usa['headers']
    data = data_usa['data']
    df = pd.DataFrame(data, columns=headers)
    df.drop('value_rca', axis=1, inplace=True)

    # reshape data so that each skill becomes a single column (i.e. feature for the model)
    pivot = df.pivot_table(index='cip', columns='skill', values='value')
    pivot = pivot.reset_index()

    X = pivot.drop('cip', axis=1)  # feature matrix
    y = pivot.cip  # response

    knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
    knn.fit(X, y)
    return knn
def process_one_cell(df_cell_train, df_cell_test):
    # Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    row_ids = df_cell_test.index

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= 500.0
    df_cell_train.loc[:, 'y'] *= 1000.0
    df_cell_test.loc[:, 'x'] *= 500.0
    df_cell_test.loc[:, 'y'] *= 1000.0

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance,
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def knn(depth, joints, C, visualize=False, zScale=1.0):
    pts_world, labels = joints2skeleton(joints)
    pts_world[:, 2] *= zScale
    classifier = KNeighborsClassifier(n_neighbors=nNeighbors)
    classifier.fit(pts_world, labels)

    X = np.vstack((np.nonzero(depth)[1], np.nonzero(depth)[0]))
    X = np.vstack((X, depth[depth != 0]))
    X_world = pixel2world(X.T, C)
    X_world[:, 2] *= zScale
    predicts = classifier.predict(X_world)

    perPixelLabels = -np.ones(depth.shape)
    perPixelLabels[depth != 0] = predicts
    img = np.zeros((H, W, 3), np.uint8)
    for i in range(nJoints):
        img[perPixelLabels == i] = palette[i]

    skel = None
    if visualize is True:
        # foreground = visualizePts(world2pixel(pts_world, C), labels)
        # img[foreground != 0] = foreground[foreground != 0]
        skel = visualizePts(world2pixel(pts_world, C), labels)
    return (img, X_world, predicts, skel)
def onstartButton(self):
    cap = cv2.VideoCapture(str(self.file_name))
    if self.isfileWorking == False and self.ishasFile == True:
        self.ishasFile = False
        self.startButton.setText("Close")
        # cap = cv2.VideoCapture(str(self.file_name))
        self.isfileWorking = True

        data = spio.loadmat("openface_fea.mat")
        X = data['feature']
        id = data['id'].astype(int) - 1
        Y = id[0, :]
        name = list(set(data['names']))
        name.sort()
        print("***Train knn classifier***")
        knn = KNeighborsClassifier(n_neighbors=20, weights='distance', p=2)
        knn.fit(X, Y)

        success, frame = cap.read()
        while success and self.isfileWorking:
            start = time.time()
            success, frame = cap.read()
            if success:
                img = frame.copy()
                bb, rep = getRep(img)
                if bb is None:
                    print "Can't find any face in this picture"
                else:
                    if rep == 0:
                        print "Get rep failed..."
                    else:
                        rep = np.reshape(rep, (1, 128))
                        idx = knn.predict(rep)
                        # print("label is {} ".format(idx))
                        proba = knn.predict_proba(rep)
                        actor = name[idx]
                        self.namelineEdit.setText(actor)
                        self.timelineEdit.setText(str(round(time.time() - start, 3)))
                        self.confidencelineEdit.setText(str(round(max(proba[0]), 2)))
                        # print("Proba is {} ".format(proba))
                        draw_dlib_rects(frame, bb, actor, (0, 255, 0))
                image = QtGui.QImage(frame.data, frame.shape[1], frame.shape[0],
                                     QtGui.QImage.Format_RGB888).rgbSwapped()
                pixmap = QtGui.QPixmap.fromImage(image)
                self.showlabel.setPixmap(pixmap)
                k = cv2.waitKey(5)
            else:
                self.ishasFile = False
                self.startButton.setText("Start")
                self.isfileWorking = False
                cap.release()
                self.showlabel.clear()
def exercise_2b():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    kf = ShuffleSplit(100, train_size=0.9, test_size=0.1, random_state=0)
    # kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = 1. - clf.score(X_test, y_test)
            iterator += 1
            print mean_squared_error(y_test, clf.predict(X_test))
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        accuracy_lst[k-1, 1] = accuracy_current.var()  # *2 for a 95% confidence interval

    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
def build_classifier(images, labels):
    # Build the classifier by calling into sklearn: fit a distance-weighted
    # 3-nearest-neighbors model and return the fitted sklearn estimator.
    classifier = KNN(n_neighbors=3, weights='distance')
    classifier.fit(images, labels)
    return classifier
def xValidateKFold(n, k, iris_x, iris_y, VERBOSE):
    # K Fold cross validation
    # n = # of folds
    # k = # of nearest neighbors to check
    # iris_x = data
    # iris_y = classes
    # VERBOSE = flag to spit out more information on each fold iteration

    # Create the cross validator
    kf = cross_validation.KFold(n=len(iris_x), n_folds=n, random_state=0)
    if VERBOSE:
        print "kFold validator: " + str(kf)

    avgScore = 0.0  # function returns avg score for all runs
    # for each set of training and test data
    for train_index, test_index in kf:
        knn = KNeighborsClassifier(n_neighbors=k)                  # Create classifier
        knn.fit(iris_x[train_index], iris_y[train_index])          # Train classifier
        prediction = knn.predict(iris_x[test_index])               # Predict on test data
        score = knn.score(iris_x[test_index], iris_y[test_index])  # Evaluate success of prediction
        avgScore += score                                          # Accrue score for averaging
        if VERBOSE:
            print "\tscore for this validation round: " + str(score)

    return avgScore / float(n)  # Return average score for all iterations
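A minimal usage sketch for the helper above, using sklearn's bundled iris data; it assumes the same Python 2 / sklearn.cross_validation environment the function itself targets.

from sklearn import datasets

iris = datasets.load_iris()
avg = xValidateKFold(n=5, k=3, iris_x=iris.data, iris_y=iris.target, VERBOSE=False)
print "average 5-fold accuracy: " + str(avg)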
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    testing_file = file('test.p', 'r')
    training_file = file('train.p', 'r')
    train = pickle.load(training_file)
    test = pickle.load(testing_file)
    testing_file.close()
    training_file.close()

    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print '[INFO, time: %s] Downsampling ...' % (time.strftime('%H:%M:%S'))
    trainX = downsample_features(trainX)
    testX = downsample_features(testX)
    trainX, testX = normalize(trainX, testX)

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), '50 - Neighbors')
    clf = KNeighborsClassifier(n_neighbors=50)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),
                                                accuracy_score(testy, prediction))
def main():
    means = [[-1, -1], [1.0, 1.0]]
    variances = [np.random.rand]
    knn_models = [3, 5, 10]
    data_sizes = [10, 25, 50, 75, 100, 125, 150, 175, 200]
    points_per_class = 500

    data = dg.generate_gaussian_mixture(class_means=means,
                                        class_variances=np.eye(2),
                                        num_components=5,
                                        num_desired_points_per_class=points_per_class)
    class_0 = np.hstack((data[0], np.zeros((len(data[0]), 1))))
    class_1 = np.hstack((data[1], np.ones((len(data[0]), 1))))

    results_train = np.empty((len(knn_models), len(data_sizes)))
    results_test = np.empty((len(knn_models), len(data_sizes)))

    train_data_class_0, test_data_class_0 = split_train_test(class_0)
    train_data_class_1, test_data_class_1 = split_train_test(class_1)
    print 'train size, test size', len(train_data_class_1), len(test_data_class_1)
    train_data = np.vstack((train_data_class_0, train_data_class_1))
    test_data = np.vstack((test_data_class_0, test_data_class_1))

    for i, knn_model in enumerate(knn_models):
        kncs = KNeighborsClassifier(n_neighbors=knn_model)
        for j, data_size in enumerate(data_sizes):
            curr_train_class_0 = train_data_class_0[:data_size]
            curr_train_class_1 = train_data_class_1[:data_size]
            curr_train_data = np.vstack((curr_train_class_0, curr_train_class_1))
            kncs.fit(curr_train_data[:, :2], curr_train_data[:, -1])
            predictions_train = kncs.predict(train_data[:, :2])
            predictions_test = kncs.predict(test_data[:, :2])
            results_train[i][j] = (len(np.where(predictions_train != train_data[:, -1])[0])
                                   / float(len(train_data)))
            results_test[i][j] = (len(np.where(predictions_test != test_data[:, -1])[0])
                                  / float(len(test_data)))

    plt.plot(data_sizes, results_test[0, :], 'r')
    plt.plot(data_sizes, results_test[1, :], 'b')
    plt.plot(data_sizes, results_test[2, :], 'g')
    plt.plot(data_sizes, results_train[0, :], 'r--')
    plt.plot(data_sizes, results_train[1, :], 'b--')
    plt.plot(data_sizes, results_train[2, :], 'g--')
    plt.show()
def knnClassify(enrollment_id, trainData, trainLabel, testData):
    knnClf = KNeighborsClassifier(n_neighbors=5)  # default: k=5
    # knnClf.fit(trainData, trainLabel)
    knnClf.fit(trainData, ravel(trainLabel))  # numpy.ravel flattens the array into one row
    testLabel = knnClf.predict(testData)
    saveResult(enrollment_id, testLabel, "sklearn_knn_Result.csv")
    return testLabel
def process_one_cell(cell_train, cell_test, fw, th, n_neighbors):
    # Remove infrequent places
    cell_train = remove_infrequent_places(cell_train, th)

    # Store row_ids for test
    row_ids = cell_test[:, -1].flatten().astype(np.int32)
    cell_test = cell_test[:, :-1]

    # Preparing data
    y = cell_train[:, -1].flatten().astype(np.int64)
    X = cell_train[:, :-1]

    # Applying the classifier
    cte = 5.8
    n_neighbors = int((y.size ** 0.5) / cte)
    clf = KNeighborsClassifier(n_neighbors=n_neighbors,
                               weights=calculate_distance,
                               p=1, n_jobs=2, leaf_size=15)
    clf.fit(X, y)
    y_pred = clf.predict_proba(cell_test)
    y_pred_labels = np.argsort(y_pred, axis=1)[:, :-4:-1]
    pred_labels = clf.classes_[y_pred_labels]
    cell_pred = np.column_stack((row_ids, pred_labels)).astype(np.int64)

    return cell_pred
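A worked example of the adaptive-k heuristic above, for a hypothetical cell of 10,000 training rows; k grows with the square root of the cell size, damped by the constant cte = 5.8.

n_train = 10000
print(int((n_train ** 0.5) / 5.8))  # sqrt(10000) / 5.8 = 17.24... -> 17 neighbors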
def exercise_1():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)
    # kf = cross_validation.ShuffleSplit(1000, n_iter=25, test_size=0.1, train_size=0.9, random_state=None)
    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1, 50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append(zero_one_loss(y_test, clf.predict(X_test)))
            # error.append(clf.predict(X_test))
            # error.append(1. - clf.score(X_test, y_test))  # , accuracy_score(y_test, clf.predict(X_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
        # print error
        error_total[k-1, 0] = np.array(error).mean()
    # print error_total

    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def neighborsPrediction(train_dfs, targetLabels, fold_cv):
    scoresNeighbor = [0.0]
    n_neighbors = 0
    for i in range(1, 10):
        neighbor, instances_train, instances_test, target_train, target_test, scoresNeighborTmp = \
            testScore(train_dfs, targetLabels, fold_cv, i * 2)
        if sum(scoresNeighborTmp) / len(scoresNeighborTmp) > sum(scoresNeighbor) / len(scoresNeighbor):
            scoresNeighbor = scoresNeighborTmp
            n_neighbors = i * 2
            # print(sum(scoresNeighborTmp)/len(scoresNeighborTmp))

    neighbor = KNeighborsClassifier(n_neighbors)
    neighbor.fit(train_dfs, targetLabels)
    instances_train, instances_test, target_train, target_test = \
        cross_validation.train_test_split(train_dfs, targetLabels, test_size=0.4, random_state=0)
    predictions = neighbor.predict(instances_test)
    print("Generated KNN classifier with: {0} neighbors".format(str(n_neighbors)))
    return neighbor, instances_train, target_train, target_test, predictions, scoresNeighbor
# csvFile.close()
traindata = np.vstack(
    (pd.read_csv('wifidata0.csv', header=0).values,
     pd.read_csv('wifidata1.csv', header=0).values,
     pd.read_csv('wifidata2.csv', header=0).values,
     pd.read_csv('wifidata3.csv', header=0).values))
#    pd.read_csv('wifidata4.csv', header=0).values, pd.read_csv('wifidata5.csv', header=0).values))
print traindata.shape
trainlabel = pd.read_csv('shopdata.csv', header=0).values[:12000, :]
print trainlabel.shape
testdata = pd.read_csv('wifidata6.csv', header=0)
testlabel = pd.read_csv('shopdata.csv', header=0).values[18000:21000, :]

##########
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
clf.fit(traindata, trainlabel)
test = clf.predict(testdata.values)
accuracy = np.trace(np.dot(np.array(test), testlabel.T)) / 3000
print clf.score(traindata, trainlabel)
print clf.score(testdata, testlabel)
print accuracy
print datetime.now()

######## trainscore 1.0  testscore 0.827666666667  samples 12000  n_neighbors=5, weights='distance'
######## trainscore 1.0  testscore 0.832666666667  samples 12000  n_neighbors=3, weights='distance'
######## memory hungry; takes about 20 min
X.info()

# 3. split the data into train and test datasets. We will be predicting whether
#    or not someone votes based on the remaining features.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# 4. Fit a k-neighbors classifier on the training data. Use 4 for your number of neighbors.
#    How accurate is your model? How does it perform on the test data?

# Create KNN object
knn = KNeighborsClassifier(n_neighbors=4)

# Fit the model to the training data
knn.fit(X_train, y_train)

# Estimate whether or not a person will vote, using the training data.
y_pred = knn.predict(X_train)

best = [0]
for k in range(1, 5):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f'for k = {k}')
    print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))
    if knn.score(X_test, y_test) > best[0]:
        best = [knn.score(X_test, y_test)]
        ypred = knn.predict(X_test)
df = pd.read_csv("week4_set2.csv") X1 = df.iloc[:, 0] X2 = df.iloc[:, 1] X = np.column_stack((X1, X2)) y = df.iloc[:, 2] # --- KNN MODEL ---- knn_model = KNeighborsClassifier( n_neighbors=5, weights="uniform") X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0) knn_model.fit(X_train, y_train) predictions = knn_model.predict(X_test) calculate_confusion_matrix(y_test, predictions, "KNN") y_score = knn_model.predict_proba(X_test) plot_roc_curve(y_test, y_score[:, 1], "KNN") # --- LOGISTIC REGRESSION --- # not using polynomial features as they dont make much of a difference polynomial_features = prep.PolynomialFeatures(degree=2) # use initial q X = polynomial_features.fit_transform(X) log_model = LogisticRegression(C=1, penalty="l2", random_state=0) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0) log_model.fit(X_train, y_train) predictions = log_model.predict(X_test)
                                    y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Creating classifier
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)

# Fitting the classifier to the training set
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
from numpy.random import random

# Use 20-fold cross-validation to evaluate the classification error rate of k-NN
# over the Iris dataset in sklearn, for each of the values k = 1, 2, 4, 8, 16, 32.
# Use a KNeighborsClassifier with the appropriate parameter values.
# Plot the cross-validated error rate values as a function of k.
iris = datasets.load_iris()
X = iris.data
y = iris.target

k_range = range(1, 33)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    scores = cross_val_score(knn, X, y, cv=20)
    k_scores.append(scores.mean())

plt.plot(k_range, k_scores)
plt.title("nearest neighbor optimization")
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

# (a) Write a Python program around the randPointUnitBall(d) function that
#     generates a data set consisting of 1000 independently sampled points in
#     the d-dimensional unit ball for each dimension d = 1, 10, 100, and that
#     reports the mean Euclidean length (norm) of these examples for each d.
#     Submit your documented source code (copy the text into your writeup, and
#     attach the source file separately), as well as the results. Describe the
#     results.
# (b) For additional insight, plot a histogram of the Euclidean lengths for
#     dimensions d = 1, 10, 100. Use matplotlib.pyplot.hist and specify
#     density=True as one
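The assignment text above names a randPointUnitBall(d) function that is not shown; the sketch below supplies a hypothetical stand-in using the standard construction (a unit Gaussian direction scaled by U**(1/d) is uniform in the d-ball) so the described experiment can run end to end.

import numpy as np
import matplotlib.pyplot as plt

def randPointUnitBall(d):
    # hypothetical stand-in for the course-provided function
    v = np.random.randn(d)
    v /= np.linalg.norm(v)
    return v * np.random.rand() ** (1.0 / d)

for d in (1, 10, 100):
    norms = np.array([np.linalg.norm(randPointUnitBall(d)) for _ in range(1000)])
    print("d = %3d  mean length = %.4f" % (d, norms.mean()))
    plt.hist(norms, bins=30, density=True, alpha=0.5, label="d = %d" % d)
plt.xlabel("Euclidean length")
plt.legend()
plt.show()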
kclf = KNeighborsClassifier(n_neighbors=10)

# In[ ]:

from numpy import loadtxt
train = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/train.dat')
test = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/test.dat')
labels = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/train.labels')
sample_format = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/format.dat')

# In[ ]:

# Dimensionality Reduction
svd = TruncatedSVD(n_components=80)
x_rd = svd.fit(train).transform(train)

# In[ ]:

# Using K Nearest Neighbours to classify data
kclf = kclf.fit(train, labels)
pred = kclf.predict(test)
np.savetxt('/Users/ishaan/Desktop/predictions17.dat', pred, delimiter=',', fmt='%i')

# In[ ]:
def imagenet_knn(train_file='gs://dataset-jesus-bucket/DataSet/',
                 job_dir='gs://dataset-jesus-bucket/', **args):
    from keras.applications.vgg16 import VGG16
    import numpy as np

    file_stream = file_io.FileIO(
        "gs://data-daisy/full_gs_paths_large_size.pickle", mode='rb')
    data_frame = pickle.load(file_stream)

    vgg16_model = VGG16(weights='imagenet', include_top=True)
    vgg16_rep_layer = Model(inputs=vgg16_model.input,
                            outputs=vgg16_model.get_layer(index=21).output)
    print(vgg16_rep_layer.summary())

    x_001, y_001, normalized_check = read_data_file_io(data_frame, ['/001/'],
                                                       data_type="test")
    x_002, y_002, normalized_check = read_data_file_io(data_frame, ['/002/'],
                                                       data_type="test")
    x_001_list, y_001_list = x_001.tolist(), y_001.tolist()
    x_002_list, y_002_list = x_002.tolist(), y_002.tolist()

    list_to_randomize = []
    list_test = []
    for (x, y) in zip(x_001_list, y_001_list):
        list_to_randomize.append([x, y])
    random.shuffle(list_to_randomize)  # shuffle data used to train

    n = 10
    batch_size = len(list_to_randomize) // n
    remainder = len(list_to_randomize) - batch_size * n
    print(batch_size)

    for (x, y) in zip(x_002_list, y_002_list):
        list_to_randomize.append([x, y])

    # extract data to test (001 dataset, up to batch_size * n + remainder)
    x_001_randarr = np.array([
        item[0] for item in list_to_randomize[0:n * batch_size + remainder - 1]
    ])
    y_001_randarr = np.array([
        item[1] for item in list_to_randomize[0:n * batch_size + remainder - 1]
    ])
    x_002_list = [
        item[0] for item in list_to_randomize[n * batch_size + remainder:]
    ]  # used for ref. point
    y_002_list = [
        item[1] for item in list_to_randomize[n * batch_size + remainder:]
    ]

    clf = KNeighborsClassifier()  # create KNN object
    accuracy_list = []

    # train with dataset 2
    x_002_arr = np.array(x_002_list)
    int_output = vgg16_rep_layer.predict(x_002_arr)
    int_output = int_output.reshape(x_002_arr.shape[0], -1)
    clf.fit(int_output, np.array(y_002_list))
    init_loss = knn_accuracy(clf, x_001_randarr, y_001_randarr,
                             vgg16_rep_layer)  # test on 001
    accuracy_list.append(init_loss)

    z = 1
    for i in range(10):
        print("Fitting on batch number:", z)
        x_test_list = [
            item[0] for item in list_to_randomize[0:(i + 1) * batch_size - 1 +
                                                  remainder * (i // 9)]
        ] + [
            item[0] for item in list_to_randomize[n * batch_size + remainder:]
        ]
        y_test_list = [
            item[1] for item in list_to_randomize[0:(i + 1) * batch_size - 1 +
                                                  remainder * (i // 9)]
        ] + [
            item[1] for item in list_to_randomize[n * batch_size + remainder:]
        ]
        x = np.array(x_test_list)
        y = np.array(y_test_list)
        print(x.shape, y.shape)
        int_output = vgg16_rep_layer.predict(x)
        int_output = int_output.reshape(x.shape[0], -1)
        clf.fit(int_output, y)
        accuracy = knn_accuracy(clf, x_001_randarr, y_001_randarr, vgg16_rep_layer)
        print(accuracy)
        accuracy_list.append(accuracy)
        z += 1

    with open('gs://data-daisy/increasing_knn_acc_vgg16.pickle', 'wb+') as handle:
        pickle.dump(accuracy_list, handle)
    print(accuracy_list)
# plot_data()
print("Features: ")
print(x)
print('Labels:- ')
print(y)
print('features:-')
print(x.values)
print("Labels:-\n ")
print(y.values)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x, y)
# print("Fire Status : {}%".format(knn.predict([[9,175,218]])[0]))

forestClient.connect(hostname, 1883, 60)

def on_message(client, userdata, msg):
    t = msg.topic
    d = msg.payload.decode()
    data.append(d)
    print(msg.topic, " , ", d)
    datas.append(data)
    print(datas)
    knn.predict(datas)
    # count=0
y_pred = lda.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
end = time.time()
print("training time is:")
print(end - start)
print(classification_report(y_test, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred))

from sklearn.neighbors import KNeighborsClassifier
start = time.time()
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
end = time.time()
print("training time is:")
print(end - start)
print(classification_report(y_test, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred))

from sklearn.tree import DecisionTreeClassifier
start = time.time()
clf = DecisionTreeClassifier().fit(x_train, y_train)
print(cancer.feature_names)
print(cancer.target_names)

# reorder samples by target via pairwise swaps
for i in range(0, 569):
    for j in range(0, 569):
        if cancer.target[j] < cancer.target[i]:
            temp1 = cancer.data[j]
            cancer.data[j] = cancer.data[i]
            cancer.data[i] = temp1
            temp2 = cancer.target[j]
            cancer.target[j] = cancer.target[i]
            cancer.target[i] = temp2
print(cancer.target)

i = 0
while i < 369:
    print(cancer.data[i], "*****", cancer.target[i])
    i += 1

x = cancer.data
y = cancer.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

normal_classifier = KNeighborsClassifier()
normal_classifier.fit(x_train, y_train)
predictions1 = normal_classifier.predict(x_test)
print(accuracy_score(y_test, predictions1))

myclassifier = classifier()
myclassifier.fit(x_train, y_train)
predictions2 = myclassifier.predict(x_test)
def KNN(cls, X_train, Y_train):
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    cls.save(knn, 'KNN')
    return knn
recall_svm = cm_svm[0][0] / (cm_svm[0][0] + cm_svm[0][1])
precision_svm = cm_svm[0][0] / (cm_svm[0][0] + cm_svm[1][0])  # TP / (TP + FP)
print(recall_svm, precision_svm)
# the results of this section are Accuracy: 0.741, Recall: 0.85, Precision: 0.739

# Now we do the same test for KNN
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, leaf_size=60,
                               algorithm='brute')
knn_clf.fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)
print(y_pred_knn)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
""" Created on Tue Nov 27 14:17:10 2018 @author: jaynanda """ import pandas as pd import numpy as np train = pd.read_csv( "/Users/jaynanda/Desktop/Assignments/660/Project/Numeric Data/kids_family_numeric.csv" ) feature = pd.DataFrame(train['Genre']) train = train.drop('Genre', axis=1) from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(train, feature, test_size=0.30) from sklearn.neighbors import KNeighborsClassifier clf = KNeighborsClassifier(n_neighbors=3) clf.fit(X_train, y_train) res = clf.predict(X_test) from sklearn.metrics import accuracy_score print(accuracy_score(res, y_test))
class KNNClassifier(BaseEstimator, ClassifierMixin):
    """k nearest neighbors classifier.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    weights : str or callable, optional (default = 'uniform')
        weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : string or DistanceMetric object (default = 'minkowski')
        the distance metric to use for the tree. The default metric is
        minkowski, and with p=2 is equivalent to the standard Euclidean
        metric. See the documentation of the DistanceMetric class for a
        list of available metrics. 'dtw' and 'fast_dtw' are also available.

    p : integer, optional (default = 2)
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU
        cores. Doesn't affect :meth:`fit` method.
    """

    def __init__(self, n_neighbors=1, weights='uniform', algorithm='auto',
                 leaf_size=30, p=2, metric='minkowski', metric_params=None,
                 n_jobs=1, **kwargs):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs
        self.kwargs = kwargs

    def fit(self, X, y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target vector relative to X

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_X_y(X, y)
        if self.metric == 'dtw':
            self._clf = KNeighborsClassifier(self.n_neighbors, self.weights,
                                             self.algorithm, self.leaf_size,
                                             self.p, dtw, self.metric_params,
                                             self.n_jobs, **self.kwargs)
        elif self.metric == 'fast_dtw':
            self._clf = KNeighborsClassifier(self.n_neighbors, self.weights,
                                             self.algorithm, self.leaf_size,
                                             self.p, fast_dtw, self.metric_params,
                                             self.n_jobs, **self.kwargs)
        else:
            self._clf = KNeighborsClassifier(self.n_neighbors, self.weights,
                                             self.algorithm, self.leaf_size,
                                             self.p, self.metric, self.metric_params,
                                             self.n_jobs, **self.kwargs)
        self._clf.fit(X, y)
        return self

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        y : array-like, shape [n_samples]
            Class labels for each data sample.
        """
        check_is_fitted(self, '_clf')
        X = check_array(X)
        return self._clf.predict(X)
#obs_all = ['ID','Elevation','Aspect','Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Horizontal_Distance_To_Roadways','Hillshade_9am','Hillshade_noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points','Wilderness_Area_1','Wilderness_Area_2','Wilderness_Area_3','Wilderness_Area_4','2702','2703','2704','2705','2706','2717','3501','3502','4201','4703','4704','4744','4758','5101','5151','6101','6102','6731','7101','7102','7103','7201','7202','7700','7701','7702','7709','7710','7745','7746','7755','7756','7757','7790','8703','8707','8708','8771','8772','8776','Cover_Type']
cls = ['Cover_Type']
trainObs = train.as_matrix(obs_bin)
trainCls = train.as_matrix(cls).ravel()
testObs = test.as_matrix(obs_bin)
testCls = test.as_matrix(cls).ravel()

# ---- K Nearest Neighbor Classification
print("---- KNN ----")

# Set up a K Nearest Neighbor Classifier with the number of neighbors = 3
# and weights based on Euclidean distance
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')

# Fit the K Nearest Neighbor classifier to the training data and use the
# resulting classifier to predict the class values for the test dataset
knn.fit(trainObs, trainCls)
knn_pred = knn.predict(testObs)
print(knn_pred)

# Calculate the accuracy of the classifier.
print("KNN Accuracy:")
print((sum(testCls == knn_pred)) / len(knn_pred))

# Create a confusion matrix using Scikit-Learn confusion_matrix
knn_tab = confusion_matrix(testCls, knn_pred, labels=labs)
print(knn_tab)

# Create a classification report for the result including precision, recall,
# and f measure.
print(metrics.classification_report(testCls, knn_pred))

# Exercise 1: Now go back and experiment with different values of k. What happened?

# ---- Decision Tree Classification
# Creating X and y for the machine learning algorithm.
print(' - Creating X and y for the learning algorithm from the diabetes_dataset file')

# To change the columns considered, just alter the following array.
feature_cols = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age'
]
X = data[feature_cols]
y = data.Outcome

# Creating the predictive model for this dataset
print(' - Creating predictive model')
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)

# making predictions with the application file
print(' - Applying model and sending to the server')
data_app = pd.read_csv('diabetes_app.csv')
y_pred = neigh.predict(data_app)

# Sending predictions made with the model to the server
URL = "https://aydanomachado.com/mlclass/01_Preprocessing.php"

# TODO Replace with your key here
DEV_KEY = "Tô de ouvinte"

# JSON to be sent to the server
data = {
    'dev_key': DEV_KEY,
def machineTrain(self):
    ## Load dataset
    url = "/home/pi/Desktop/6pplData.csv"
    urlFeature = "/home/pi/Desktop/features.csv"
    urlOutput = "/home/pi/Desktop/outputs.csv"
    dataset = pandas.read_csv(url, header=None)

    global window_size
    window_size = 30
    shift_size = 30

    models = []
    models.append(('KNN', KNeighborsClassifier(n_neighbors=7)))
    knn = KNeighborsClassifier(n_neighbors=7)

    ## Split-out validation dataset
    array = dataset.values
    X = array[:, 0:12]
    Y = array[:, 12]

    ## Declaring acc and gyro arrays
    accData = numpy.empty((array.shape[0], 6))
    gyroData = numpy.empty((array.shape[0], 6))
    accData = array[:, :6]
    gyroData = array[:, 6:12]

    ## Creating the normalizer
    global normalizerAcc
    global normalizerGyro
    normalizerAcc = preprocessing.Normalizer().fit(accData)
    normalizerGyro = preprocessing.Normalizer().fit(gyroData)

    ## Normalizing the data
    # accData = normalizerAcc.transform(accData)
    # gyroData = normalizerGyro.transform(gyroData)

    global le
    le = preprocessing.LabelEncoder()
    le.fit(['nomove', 'wavehands', 'busdriver', 'frontback', 'sidestep',
            'jumping', 'jumpingjack', 'turnclap', 'squatturnclap',
            'windowcleaning', 'windowcleaner360', 'final'])

    ######### Segmentation and Feature Extraction ##############################
    # Y_encoded = le.transform(Y)
    # N = dataset.shape[0]
    # dim_X = X.shape[1]
    # K = (N // shift_size) - 15
    # segments_X = numpy.empty((K, window_size, 3*(dim_X)))
    # segments_Y = numpy.empty((K, 3*window_size))
    # segment_X = numpy.empty((window_size, 3*(dim_X)))
    # for i in range(K):
    #     segment_X[:, :6] = accData[i * shift_size : (i*shift_size) + window_size, :]
    #     segment_X[:, 6:12] = gyroData[i * shift_size : (i*shift_size) + window_size, :]
    #     segment_X[:, 12:18] = accData[i * shift_size + window_size : (i*shift_size) + 2*window_size, :]
    #     segment_X[:, 18:24] = gyroData[i * shift_size + window_size : (i*shift_size) + 2*window_size, :]
    #     segment_X[:, 24:30] = accData[i * shift_size + 2*window_size : (i*shift_size) + 3*window_size, :]
    #     segment_X[:, 30:36] = gyroData[i * shift_size + 2*window_size : (i*shift_size) + 3*window_size, :]
    #     segment_Y = Y_encoded[i * shift_size : (i * shift_size) + 3*window_size]
    #     segments_X[i] = segment_X
    #     segments_Y[i] = segment_Y
    # for i in range(K):
    #     segment_X = X[i * shift_size : (i * shift_size) + window_size, :]
    #     segment_Y = Y_encoded[i * shift_size : (i * shift_size) + window_size]
    #     segments_X[i] = segment_X
    #     segments_Y[i] = segment_Y
    # features = numpy.empty((K, 72))
    # outputs = numpy.empty((K))
    # for i in range(K):
    #     for j in range(0, features.shape[1] - 1, 2):
    #         features[i, j] = segments_X[i, :, j // 2].mean()
    #         features[i, j + 1] = segments_X[i, :, j // 2].std()
    #     outputs[i] = stats.mode(segments_Y[i])[0]
    # df = pandas.DataFrame(features)
    # df.to_csv("features.csv", header=None)
    # df = pandas.DataFrame(outputs)
    # df.to_csv("outputs.csv", header=None)
    #############################################################################

    features_csv = pandas.read_csv(urlFeature, header=None)
    features = features_csv.values
    outputs_csv = pandas.read_csv(urlOutput, header=None)
    outputs = numpy.ravel(outputs_csv.values, order="C")

    validation_size = 0.2
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        features, outputs, test_size=validation_size, random_state=seed)

    # Test options and evaluation metric
    scoring = 'accuracy'

    # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                                     cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    # Make predictions on validation dataset
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_validation)
    print("exit training")
    # print("Accuracy Score: ", accuracy_score(Y_validation, predictions), file=open('summary.txt', 'a'))
    # print("Confusion Matrix: \n", confusion_matrix(Y_validation, predictions, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), file=open('summary.txt', 'a'))
    # print("Classification Report: \n", classification_report(Y_validation, predictions), file=open('summary.txt', 'a'))
    return knn

# run = learning()
# run.machineTrain()
    SnapShots.append(tmp)

print(len(SnapShots))
SnapShots = np.concatenate(SnapShots, axis=0)  # stack along the row axis
SnapShots = SnapShots.reshape(SnapShots.shape[0], -1)  # flatten each snapshot so the data suits knn
print(SnapShots.shape)  # 40 x 30000: 30000 features (dimensions) for the nearest-neighbour calculation

labels = np.repeat(labels, 10)
labels = labels.reshape(labels.shape[0], -1)  # 40 x 1: one label per row, ready to attach to the snapshots
dataset = np.hstack((SnapShots, labels))  # join along columns: features plus label

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(dataset[:, :-1], dataset[:, -1])

cap = cv2.VideoCapture(0)
Cascadeclassifier = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")
while True:
    ref, frame = cap.read()
    if not ref:
        continue
    faces = Cascadeclassifier.detectMultiScale(frame, 1.3, 5)
def train_model(feats_csv):
    df = pd.DataFrame()
    df = pd.read_csv(feats_csv).iloc[:, 1:]
    y = np.ravel(df.iloc[:, -1:])
    X = np.array(df.iloc[:, :-1])

    ############ 15 best features selected using the ANOVA F-value score function ############
    X_new = SelectKBest(f_classif, k=15).fit_transform(X, y)
    selected_features = SelectKBest(f_classif, k=15).fit(X, y).get_support(indices=True)

    ############ KNN manhattan ############
    ##### preprocessing: data scaling #####
    min_max_scaler = MinMaxScaler()
    X_new = min_max_scaler.fit_transform(X_new)

    model = KNeighborsClassifier(n_neighbors=1, algorithm='brute',
                                 metric='manhattan', weights='uniform')
    model.fit(X_new, y)

    newdir = '../kNN_clfr'
    os.mkdir(newdir)
    joblib.dump(model, os.path.join(newdir, 'kNN.pkl'))
    return
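A minimal sketch of reusing the persisted classifier; joblib.load restores the estimator dumped above, and new_feats is a hypothetical input row that must be feature-selected and min-max scaled the same way as the training data.

import numpy as np
import joblib

model = joblib.load('../kNN_clfr/kNN.pkl')
new_feats = np.zeros((1, 15))  # hypothetical: 15 selected features, scaled to [0, 1]
print(model.predict(new_feats))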