def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []

    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english')

    # List of topics extracted from text
    # feature_names = vector.get_feature_names()
    # print feature_names

    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select features using chi-square
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)

    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
def main(output=RESULTS1B):
    """
    Using 1 nearest neighbor, predicts NYC Taxi trip times based on feature
    vectors (pickup latitude, pickup longitude, dropoff latitude, dropoff
    longitude). Tests on a subset of trip_data_1.csv.
    Uses sklearn to implement nearest neighbors.
    """
    features = ['pickup_latitude', 'pickup_longitude',
                'dropoff_latitude', 'dropoff_longitude', 'trip_time_in_secs']

    ## Extract necessary data into pandas dataframes
    numrows = 100000
    df_train_read = pd.read_csv(TRAIN_DATA)
    df_test_read = pd.read_csv(TRIP_DATA_1, nrows=numrows)  # first 100k rows, for speed
    df_test = df_test_read[features].dropna()
    df_train = df_train_read[features].dropna()

    ## Use sklearn to run nearest neighbors
    k = 1
    clf = KNeighborsClassifier(n_neighbors=k)  # default distance metric: euclidean
    clf.fit(df_train[features[0:4]], df_train[features[-1]])
    preds = clf.predict(df_test[features[0:4]])

    ## Calculate statistics (Root Mean Squared Error, Correlation Coefficient, Mean Absolute Error)
    print "Calculating statistics"
    with open(output, "a+") as outputFile:
        outputFile.write("Ran knn with k={}".format(k) + \
                         " Trained on {}. Tested on first".format(TRAIN_DATA) + \
                         " {} rows of {}. Stats:".format(numrows, TRIP_DATA_1))
    calcAndLogStats(
        numpy.array(preds),
        numpy.array(df_test[features[-1]]),
        output=output)
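# Trip time in seconds is a continuous target, so nearest-neighbour regression
# is arguably a better fit than classification for the snippet above. A
# minimal sketch, assuming the same df_train/df_test dataframes and `features`
# list; KNeighborsRegressor averages the trip times of the k nearest
# pickups/dropoffs instead of voting on discrete labels:
from sklearn.neighbors import KNeighborsRegressor

reg = KNeighborsRegressor(n_neighbors=1)
reg.fit(df_train[features[0:4]], df_train[features[-1]])
reg_preds = reg.predict(df_test[features[0:4]])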
def main():
    print("k nearest neighbours classifier!")
    X, Y, Xtest = importdata()
    print(Y.shape)
    param_grid = {
        "n_neighbors": [10, 20, 50, 100, 200],
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        "weights": ['uniform', 'distance']
    }
    knn = KNeighborsClassifier()
    Gridsearch_impl(X, Y, knn, param_grid, 5)

    # for i in range(10, 11, 5):
    #     clf = DecisionTreeClassifier(min_samples_split=i)
    #     rf = RandomForestClassifier(n_estimators=100, random_state=0, min_samples_split=i)
    #     ab = AdaBoostClassifier(rf, n_estimators=10)
    #     ab = GradientBoostingClassifier(n_estimators=100)
    #     score = cross_validation.cross_val_score(ab, X, Y, cv=3)
    #     print(score)
    #     print("average score %f" % np.mean(score))
    #     print("std %f" % np.std(score))
    #     ab.fit(X, Y)

    # Grid search typically fits clones, so fit knn itself before predicting
    # with it (otherwise predict raises NotFittedError)
    knn.fit(X, Y)
    Ytest = knn.predict(Xtest)
    output(Ytest, 'submit3.csv')
def fit(self, X, y):
    """
    Fit the model according to the given training data.

    Parameters
    ----------
    X : {array-like}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples,)
        Target vector relative to X.

    Returns
    -------
    self : object
        Returns self.
    """
    clf = KNeighborsClassifier(n_neighbors=self.n_neighbors,
                               p=self.p,
                               weights=self.weights,
                               algorithm="kd_tree")
    if self.scaler:
        X = self.scaler.transform(X)
    self.model = clf.fit(X, y)
    # self.print_coefficients()
    self.num_class = len(np.unique(y))
    return self  # the docstring promises self is returned
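# For completeness, a wrapper like the one above presumably exposes matching
# prediction methods; this is a hedged sketch (only self.scaler and self.model
# are taken from the fit method above, the rest is an assumption):
def predict(self, X):
    # Apply the same scaling used during fit, then delegate to sklearn.
    if self.scaler:
        X = self.scaler.transform(X)
    return self.model.predict(X)

def predict_proba(self, X):
    # Class-membership probabilities from the fitted k-NN model.
    if self.scaler:
        X = self.scaler.transform(X)
    return self.model.predict_proba(X)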
def _evaluate_projection(self, x, y):
    """
    kNNEvaluate - evaluate class separation in the given projection
    using a k-NN method.

    Parameters
    ----------
    x - variables to evaluate
    y - class

    Returns
    -------
    scores
    """
    if self.percent_data_used != 100:
        rand = np.random.choice(len(x), int(len(x) * self.percent_data_used / 100),
                                replace=False)
        x = x[rand]
        y = y[rand]
    neigh = KNeighborsClassifier(n_neighbors=3) if self.attr_color.is_discrete else \
        KNeighborsRegressor(n_neighbors=3)
    # check both x and y for NaNs (the original tested x twice)
    assert ~(np.isnan(x).any(axis=None) | np.isnan(y).any(axis=None))
    neigh.fit(x, y)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        scores = cross_val_score(neigh, x, y, cv=3)
    return scores.mean()
def kppv_histo():
    "Images interpreted as colour histograms and classified with k nearest neighbours"
    best = np.zeros(5)
    _, data, target, _ = utils.chargementHistogrammesImages(mer, ailleurs, 1, -1)
    X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.3,
                                                        random_state=random.seed())
    for iterations in range(250, 1000, 250):
        for n in range(2, 12, 2):
            for param in range(1, 3):
                start_time = time.time()
                kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                x1 = np.array(X_train)
                x2 = np.array(X_test)
                kppv.fit(X=x1, y=Y_train)
                score = kppv.score(x2, Y_test)
                end_time = time.time()
                if score > best[0]:
                    best[0] = score
                    best[1] = iterations
                    best[2] = n
                    best[3] = param
                    best[4] = end_time - start_time
    print("| K nearest neighbours | V.Histo | n={:1.0f} param={:1.0f} iterations={:1.0f} | {:10.3f}ms | {:1.3f} |"
          .format(best[2], best[3], best[1], best[4] * 1000, best[0]))
def kann_classify(train_data, train_label, test_data):
    knnClf = KNeighborsClassifier(n_neighbors=5)
    knnClf.fit(train_data, ravel(train_label))
    test_label = knnClf.predict(test_data)
    save_result(test_label, 'sklearn_knn_Result.csv')
    return test_label
def knn_accuracy(trn_data, trn_labels, tst_data, tst_labels, k_neighbors):
    knn = KNeighborsClassifier(k_neighbors)
    knn.fit(trn_data, trn_labels)
    results = knn.predict(tst_data)
    return np.sum(tst_labels == results) / float(tst_labels.size)
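# Sanity check: the fraction computed by knn_accuracy is exactly what
# sklearn's own score() returns. A toy demonstration (the arrays are made up):
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)
# mean accuracy computed two ways -- both are 1.0 on this toy data
assert knn.score(X, y) == np.sum(y == knn.predict(X)) / float(y.size)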
def kppv_vecteur():
    "Images interpreted as pixel vectors and classified with k nearest neighbours"
    best = np.zeros(6)
    for npix in range(50, 200, 50):
        _, data, target, _ = utils.chargementVecteursImages(mer, ailleurs, 1, -1, npix)
        X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.3,
                                                            random_state=random.seed())
        for iterations in range(250, 1000, 250):
            for n in range(2, 12, 2):
                for param in range(1, 3):
                    start_time = time.time()
                    kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                    x1 = np.array(X_train)
                    x1 = np.reshape(x1, (x1.shape[0], x1.shape[2]))
                    x2 = np.array(X_test)
                    x2 = np.reshape(x2, (x2.shape[0], x2.shape[2]))
                    kppv.fit(X=x1, y=Y_train)
                    score = kppv.score(x2, Y_test)
                    end_time = time.time()
                    if score > best[0]:
                        best[0] = score
                        best[1] = iterations
                        best[2] = n
                        best[3] = param
                        best[4] = end_time - start_time
                        best[5] = npix
    print("| K nearest neighbours | V.Pix {:4.0f} | n={:1.0f} param={:1.0f} iterations={:1.0f} | {:10.3f}ms | {:1.3f} |"
          .format(best[5], best[2], best[3], best[1], best[4] * 1000, best[0]))
def analyze_image(self):
    '''
    Load the image and analyze it with KNN.
    im_file - pre-processed with histogram specification
    '''
    if self._avg_pixels.size == 0:
        self._process_annotations()
        self._get_initial_classes()

    im = self._image
    rows = im.shape[0]

    clf = KNeighborsClassifier(n_neighbors=self._n_neighbors)
    clf.fit(self._avg_pixels, self._labels)

    im_1d = im.reshape(-1, 3)

    # calculate prediction and reshape back into an image
    prediction = clf.predict(im_1d)
    prediction = prediction.reshape(rows, -1)
    prediction[self._mask == 0] = Labels.Masked

    self.display_current(prediction)
    return prediction
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'],
                               weights='distance', metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
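# The conf dict used above is defined outside this snippet; presumably a
# module-level configuration along these lines (the value is an assumption):
conf = {'neighbours': 25}  # hypothetical neighbour count for the grid cells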
def run(class_num, subsample_size, cluster_num, window_size, method='knn', n_nb=2):
    # Loads data with the defined patch size (3 means 3*3 = 9 pixels per patch)
    # and returns a dictionary containing: 'data' (one patch), 'target' (the
    # sample this patch belongs to), and 'filename' (the file it comes from).
    bofs = []
    lable = []
    filename = "%s/TRAIN_VLAD_%d_%d_%d_%d.txt" % (vlad_accee, class_num,
                                                  subsample_size, window_size,
                                                  cluster_num)
    bofs, lable = get_vlad(filename)

    # knn_init = KNeighborsClassifier()
    # parameters = {'n_neighbors': [5, 10, 15]}
    # knn = grid_search.GridSearchCV(knn_init, parameters)

    bofs_test = []
    lable_test = []
    filename = "%s/TEST_VLAD_%d_%d_%d_%d.txt" % (vlad_accee, class_num,
                                                 subsample_size, window_size,
                                                 cluster_num)
    bofs_test, lable_test = get_vlad(filename)

    start = time.time()
    score = None  # guard: only set when method == "knn"
    if method == "knn":
        knn = KNeighborsClassifier(n_neighbors=n_nb)
        knn.fit(bofs, lable)
        predicted = knn.predict(bofs_test)
        score = knn.score(bofs_test, lable_test)
    print(time.time() - start)
    return score
def BuildModel(self, data, labels):
    # Create and train the classifier.
    knc = KNeighborsClassifier(n_neighbors=self.n_neighbors,
                               algorithm=self.algorithm,
                               leaf_size=self.leaf_size,
                               metric=self.metric)
    knc.fit(data, labels)
    return knc
def train(x_train, y_train):
    # reg = LinearRegression()
    # note: despite the name "reg", this fits a classifier, not a regressor
    reg = KNeighborsClassifier()
    reg.fit(x_train, y_train)
    return reg
def main():
    # obtain the number of features in the dataset
    with open('../data/test_lung_s3.csv', 'rb') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            num_columns = len(row)
            break
    print num_columns

    # load data
    mat = np.loadtxt('../data/test_lung_s3.csv', delimiter=',', skiprows=1,
                     usecols=range(0, num_columns))
    X = mat[:, 1:num_columns]  # data
    y = mat[:, 0]  # label
    X = X.astype(float)
    y = y.astype(float)
    n_samples, n_features = X.shape

    # using 10-fold cross validation
    cv = KFold(n_samples, n_folds=10, shuffle=True)

    # evaluation
    n_features = 100
    neigh = KNeighborsClassifier(n_neighbors=1)
    acc = 0
    for train, test in cv:
        idx = svm_backward.svm_backward(X[train], y[train], n_features)
        print idx
        X_selected = X[:, idx]
        neigh.fit(X_selected[train], y[train])
        y_predict = neigh.predict(X_selected[test])
        acc_tmp = accuracy_score(y[test], y_predict)
        print acc_tmp
        acc += acc_tmp
    print 'ACC', float(acc) / 10
def exercise_2b():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    # first argument must match the sample count (1000, not 100);
    # n_iter defaults to 10 splits, matching accuracy_current below
    kf = ShuffleSplit(1000, train_size=0.9, test_size=0.1, random_state=0)
    # kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = 1. - clf.score(X_test, y_test)
            iterator += 1
            print mean_squared_error(y_test, clf.predict(X_test))
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        accuracy_lst[k-1, 1] = accuracy_current.var()  # *2 for a 95% confidence interval
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
def main_process():
    data_dict = parse_txt()
    x_data, y_data, places_cnt, path_int_dict = build_x_y_data(data_dict)
    print 'data counts', len(x_data), len(y_data)
    print 'zone names counts', places_cnt
    print 'path counts', len(path_int_dict)

    # start to train; change list type to numpy.array
    x_data = np.array(x_data)
    y_data = np.array(y_data)
    knn = KNeighborsClassifier()
    indices = np.random.permutation(len(x_data))
    x_train = x_data
    y_train = y_data
    # note: the test rows are drawn from the training data itself
    x_test = x_data[indices[-TEST_DATA_ROWS:]]
    y_test = y_data[indices[-TEST_DATA_ROWS:]]
    knn.fit(x_train, y_train)  # train
    test_result = knn.predict(x_test)  # test
    proba_test_result = knn.predict_proba(x_test)

    # no duplicate values, so reverse this dictionary
    int_path_dict = dict(zip(path_int_dict.values(), path_int_dict.keys()))
    print 'predict result:', test_result
    print [int_path_dict[x] for x in test_result]  # test result
def exercise_1():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)
    # kf = cross_validation.ShuffleSplit(1000, n_iter=25, test_size=0.1, train_size=0.9, random_state=None)
    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1, 50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append(zero_one_loss(y_test, clf.predict(X_test)))
            # error.append(clf.predict(X_test))
            # error.append(1. - clf.score(X_test, y_test))  # or accuracy_score(y_test, clf.predict(X_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
        # print error
        error_total[k-1, 0] = np.array(error).mean()
    # print error_total
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def exercise_2a():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = 1. - clf.score(X_test, y_test)
            iterator += 1
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        # accuracy_lst[k-1, 1] = accuracy_current.std()  # 95% confidence interval
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def build_model():
    """
    Request CIP and skill data from DataUSA and develop a predictive model
    using scikit-learn.
    :return: fit model ready to accept user input to make a prediction
    """
    # request data on college majors and relevant skills
    r = requests.get(r'http://api.datausa.io/api/?show=skill&sumlevel=all')
    data_usa = r.json()
    headers = data_usa['headers']
    data = data_usa['data']
    df = pd.DataFrame(data, columns=headers)
    df.drop('value_rca', axis=1, inplace=True)

    # reshape data so that each skill becomes a single column (i.e. a feature for the model)
    pivot = df.pivot_table(index='cip', columns='skill', values='value')
    pivot = pivot.reset_index()

    X = pivot.drop('cip', axis=1)  # feature matrix
    y = pivot.cip  # response

    knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
    knn.fit(X, y)
    return knn
def process_one_cell(df_cell_train, df_cell_test):
    # Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    row_ids = df_cell_test.index

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= 500.0
    df_cell_train.loc[:, 'y'] *= 1000.0
    df_cell_test.loc[:, 'x'] *= 500.0
    df_cell_test.loc[:, 'y'] *= 1000.0

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance,
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
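# The weights=calculate_distance callable is not shown in the snippet above.
# sklearn accepts any function mapping an array of neighbour distances to an
# equally-shaped array of weights; a plausible sketch (the exact kernel the
# original used is an assumption):
def calculate_distance(distances):
    # Hypothetical inverse-square weighting: closer check-ins count more;
    # the epsilon keeps zero distances from blowing up to infinity.
    return 1.0 / (distances + 1e-6) ** 2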
def onstartButton(self):
    cap = cv2.VideoCapture(str(self.file_name))
    if self.isfileWorking == False and self.ishasFile == True:
        self.ishasFile = False
        self.startButton.setText("Close")
        # cap = cv2.VideoCapture(str(self.file_name))
        self.isfileWorking = True
        data = spio.loadmat("openface_fea.mat")
        X = data['feature']
        id = data['id'].astype(int) - 1
        Y = id[0, :]
        name = list(set(data['names']))
        name.sort()
        print("***Train knn classifier***")
        knn = KNeighborsClassifier(n_neighbors=20, weights='distance', p=2)
        knn.fit(X, Y)
        success, frame = cap.read()
        while success and self.isfileWorking:
            start = time.time()
            success, frame = cap.read()
            if success:
                img = frame.copy()
                bb, rep = getRep(img)
                if bb is None:
                    print "Can't find any face in this picture"
                else:
                    if rep == 0:  # "is 0" compared identity, not value
                        print "Get rep failed..."
                    else:
                        rep = np.reshape(rep, (1, 128))
                        idx = knn.predict(rep)
                        # print("label is {} ".format(idx))
                        proba = knn.predict_proba(rep)
                        actor = name[int(idx[0])]  # predict returns an array
                        self.namelineEdit.setText(actor)
                        self.timelineEdit.setText(str(round(time.time() - start, 3)))
                        self.confidencelineEdit.setText(str(round(max(proba[0]), 2)))
                        # print("Proba is {} ".format(proba))
                        draw_dlib_rects(frame, bb, actor, (0, 255, 0))
                image = QtGui.QImage(frame.data, frame.shape[1], frame.shape[0],
                                     QtGui.QImage.Format_RGB888).rgbSwapped()
                pixmap = QtGui.QPixmap.fromImage(image)
                self.showlabel.setPixmap(pixmap)
                k = cv2.waitKey(5)
            else:
                self.ishasFile = False
                self.startButton.setText("Start")
                self.isfileWorking = False
                cap.release()
                self.showlabel.clear()
def kNearestNeighbors(features_test, features_train, labels_test, labels_train):
    print "Using K Nearest Neighbors"

    # Set up classifier
    clf = KNeighborsClassifier(algorithm='ball_tree', n_jobs=-1, weights='distance')

    # Time the fit
    t0 = time()
    clf = clf.fit(features_train, labels_train)
    print "Training Time: ", round(time() - t0, 3), "s"

    # Reset timer for prediction
    t0 = time()
    nbrs_predict = clf.predict(features_test)
    print "Prediction Time: ", round(time() - t0, 3), "s"

    nbrs_acc = accuracy_score(nbrs_predict, labels_test)
    print "Accuracy: ", nbrs_acc
    return clf
def build_classifier(images, labels):
    # Builds the classifier by calling into sklearn and returning the fitted
    # estimator.
    classifier = KNN(n_neighbors=3, weights='distance')
    classifier.fit(images, labels)
    return classifier
def train():
    data = Prediction.objects.filter(predict=False)
    df = pd.DataFrame(list(data.values()))
    users = [o.user for o in data]
    df['age'] = pd.DataFrame(list([user.userprofile.age for user in users]))
    for x in ['id', 'base_personal', 'base_general', 'predict', 'created',
              'user_id', 'training_id']:
        df = df.drop(x, axis=1)
    y = df['next_level']
    df = df.drop('next_level', axis=1)

    # neigh = svm.SVC()
    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(df, y)

    # kf = KFold(len(ft[features]), n_folds=10)
    kf = ShuffleSplit(len(df), n_iter=K, test_size=test_size, random_state=0)
    # score is accuracy here
    accuracy = cross_val_score(neigh, df, y, cv=kf)
    n_train = int((1 - test_size) * K)  # slice indices must be integers
    batch = Training.objects.create(
        training_accuracy=sum(accuracy[:n_train]) / K / (1 - test_size),
        sample_size=len(df.index),
        fold=K,
        subset_accuracy=json.dumps(accuracy.tolist()),
        test_accuracy=sum(accuracy[n_train:]) / K / test_size)
    Prediction.objects.filter(predict=False).update(training=batch)
    if not os.path.exists('./models'):
        os.makedirs('./models')
    joblib.dump(neigh, './models/model.pkl')
def neighborsPrediction(train_dfs, targetLabels, fold_cv):
    scoresNeighbor = [0.0]
    n_neighbors = 0
    for i in range(1, 10):
        neighbor, instances_train, instances_test, target_train, target_test, \
            scoresNeighborTmp = testScore(train_dfs, targetLabels, fold_cv, i * 2)
        if sum(scoresNeighborTmp) / len(scoresNeighborTmp) > sum(scoresNeighbor) / len(scoresNeighbor):
            scoresNeighbor = scoresNeighborTmp
            n_neighbors = i * 2
        # print(sum(scoresNeighborTmp) / len(scoresNeighborTmp))
    neighbor = KNeighborsClassifier(n_neighbors)
    neighbor.fit(train_dfs, targetLabels)
    instances_train, instances_test, target_train, target_test = \
        cross_validation.train_test_split(train_dfs, targetLabels,
                                          test_size=0.4, random_state=0)
    predictions = neighbor.predict(instances_test)
    # the original message said "random forest", but this is a k-NN classifier
    print("Generated k-NN classifier with: {0} neighbors".format(str(n_neighbors)))
    return neighbor, instances_train, target_train, target_test, predictions, scoresNeighbor
def brute_force_acc_rd(features_train, labels_train, features_test, labels_test, ids):
    clf = KNeighborsClassifier(n_neighbors=100)
    clf = clf.fit(features_train, labels_train)
    # print(clf.best_estimator_)
    pred = clf.predict(features_test)
    acc = accuracy_score(labels_test, pred)
    # print pred
    print acc
    if acc > 0.8:
        print "Acc: {}".format(acc)
    if acc > 0.831:
        data_train.to_csv("data_train{}.tst".format(round(acc, 5)), "\t")
        predictions_file = open("data/canivel_knn_{}.csv".format(round(acc, 5)), "wb")
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["PassengerId", "Survived"])
        predictions_file_object.writerows(zip(ids, pred))
        predictions_file.close()
        print "!!!!!!!!!! NEW FILE !!!!!!!!!! YEA!!!!"
    return acc
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    testing_file = file('test.p', 'r')
    training_file = file('train.p', 'r')
    train = pickle.load(training_file)
    test = pickle.load(testing_file)
    testing_file.close()
    training_file.close()

    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print '[INFO, time: %s] Downsampling ...' % (time.strftime('%H:%M:%S'))
    trainX = downsample_features(trainX)
    testX = downsample_features(testX)
    trainX, testX = normalize(trainX, testX)

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), '50 - Neighbors')
    clf = KNeighborsClassifier(n_neighbors=50)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),
                                                accuracy_score(testy, prediction))
def knn(depth, joints, C, visualize=False, zScale=1.0):
    pts_world, labels = joints2skeleton(joints)
    pts_world[:, 2] *= zScale

    classifier = KNeighborsClassifier(n_neighbors=nNeighbors)
    classifier.fit(pts_world, labels)

    X = np.vstack((np.nonzero(depth)[1], np.nonzero(depth)[0]))
    X = np.vstack((X, depth[depth != 0]))
    X_world = pixel2world(X.T, C)
    X_world[:, 2] *= zScale
    predicts = classifier.predict(X_world)

    perPixelLabels = -np.ones(depth.shape)
    perPixelLabels[depth != 0] = predicts

    img = np.zeros((H, W, 3), np.uint8)
    for i in range(nJoints):
        img[perPixelLabels == i] = palette[i]

    skel = None
    if visualize is True:
        # foreground = visualizePts(world2pixel(pts_world, C), labels)
        # img[foreground != 0] = foreground[foreground != 0]
        skel = visualizePts(world2pixel(pts_world, C), labels)
    return (img, X_world, predicts, skel)
def blend_models(n_folds, train_data, train_labels, holdout, test_data, test_mode):
    """
    Function which performs the blending procedure explained below:

    Step 1) initialize classifiers to use in the blending task as the clfs
            variable (add classifiers to that variable)
            TODO: extract it out into config
    Step 2) split training data into k folds
    Step 3) for every classifier, and for every fold: train the classifier on
            the kth training fold and do the following:
            a) predict probabilities of the kth "test" fold only
            b) append predictions to holdout set "dataset_blend_holdout_j"
               for the classifier trained on that fold only
            c) append predictions to test set "dataset_blend_test_j" for the
               classifier trained on that fold only
            When all folds are finished, take the mean of the predictions
            generated by the classifier across folds for both
            dataset_blend_holdout_j and dataset_blend_test_j, and append the
            mean values to dataset_blend_holdout and dataset_blend_test.

    Args:
        n_folds: number of folds in the blender
        train_data: training data
        train_labels: true labels for training data
        holdout: holdout set
        test_data: test data set to generate final predictions on
        test_mode: debug mode (uses only one classifier in the blender)

    Returns:
        dataset_blend_train: blended training set based on the above procedure
        dataset_blend_holdout: blended holdout set based on the above procedure
        dataset_blend_test: blended test set based on the above procedure
    """
    np.random.seed(0)  # seed to shuffle the train set
    shuffle = False
    X = train_data
    y = train_labels.ravel()
    X_submission = holdout

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(cross_validation.KFold(len(y), n_folds))

    if test_mode:
        clfs = [KNeighborsClassifier(weights="uniform", n_jobs=-1)]
    else:
        clfs = [KNeighborsClassifier(weights="uniform", n_jobs=-1),
                KNeighborsClassifier(weights="distance", n_jobs=-1),
                # probability=True is required: the blending loop calls predict_proba
                SVC(probability=True),
                RandomForestClassifier(n_estimators=250, n_jobs=-1, criterion='gini'),
                RandomForestClassifier(n_estimators=250, n_jobs=-1, criterion='entropy'),
                ExtraTreesClassifier(n_estimators=250, n_jobs=-1, criterion='gini'),
                ExtraTreesClassifier(n_estimators=250, n_jobs=-1, criterion='entropy'),
                GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                           max_depth=6, n_estimators=50),
                discriminant_analysis.LinearDiscriminantAnalysis(),
                discriminant_analysis.QuadraticDiscriminantAnalysis()]
        # MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(200,), verbose=False, random_state=55),
        # AdaBoostClassifier(_ADABOOST_BASE_ESTIMATOR_, n_estimators=_ADABOOST_NUM_ESTIMATORS_, algorithm=_ADABOOST_LALGO_, learning_rate=_ADABOOST_LEARNING_RATE_)]

    print "Creating train and test sets for blending."
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_holdout = np.zeros((X_submission.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((test_data.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print "Classifier no: ", j + 1
        print clf
        dataset_blend_holdout_j = np.zeros((X_submission.shape[0], len(skf)))
        dataset_blend_test_j = np.zeros((test_data.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "====Fold", i
            X_train = X.iloc[train]
            y_train = y[train]
            X_test = X.iloc[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_holdout_j[:, i] = clf.predict_proba(X_submission)[:, 1]
            dataset_blend_test_j[:, i] = clf.predict_proba(test_data)[:, 1]
        dataset_blend_holdout[:, j] = dataset_blend_holdout_j.mean(1)
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    return (pd.DataFrame(dataset_blend_train),
            pd.DataFrame(dataset_blend_holdout),
            pd.DataFrame(dataset_blend_test))
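# A hedged usage sketch for blend_models (the variable names X, y, X_holdout,
# X_test and the level-2 model are assumptions; blend_models is defined above):
blend_train, blend_holdout, blend_test = blend_models(
    n_folds=5,
    train_data=X, train_labels=y,
    holdout=X_holdout, test_data=X_test,
    test_mode=True)  # single-classifier debug mode

from sklearn.linear_model import LogisticRegression
stacker = LogisticRegression()
stacker.fit(blend_train, y.ravel())
final_preds = stacker.predict_proba(blend_test)[:, 1]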
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from examples.load_wine import load_wine
from selexor.random_forest import RFSelector

# we will use the Wine dataset for demonstration
x_train_std, x_test_std, y_train, y_test = load_wine()

# let's create a classifier and calculate the accuracy score
knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(x_train_std, y_train)
y_pred = knn.predict(x_test_std)
print(f'Accuracy score before RFSelector: {accuracy_score(y_pred=y_pred, y_true=y_test)}')

# now, let's create an RFSelector instance and use the fit_transform and
# transform methods to fit the selector and transform the samples
rf = RFSelector(n_components=2, estimator_params={'max_depth': 3, 'n_jobs': -1})
x_train_rf = rf.fit_transform(x_train_std, y_train)
x_test_rf = rf.transform(x_test_std)

# let's fit the classifier on the new data and calculate the accuracy score again
knn.fit(x_train_rf, y_train)
y_pred = knn.predict(x_test_rf)
def knncls():
    """
    K-nearest-neighbour prediction of user check-in locations
    :return: None
    """
    # Read the data
    data = pd.read_csv("./data/FBlocation/train.csv")
    print(data.head(10))

    # Process the data
    # 1. Shrink the dataset with a bounding-box query
    data = data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75")

    # Convert the timestamp column
    time_value = pd.to_datetime(data['time'], unit='s')
    print(time_value)

    # Convert dates to a DatetimeIndex
    time_value = pd.DatetimeIndex(time_value)

    # Construct some features
    data['day'] = time_value.day
    data['hour'] = time_value.hour
    data['weekday'] = time_value.weekday

    # Drop the raw timestamp feature
    data = data.drop(['time'], axis=1)
    print(data)

    # Drop target places with fewer than n check-ins
    place_count = data.groupby('place_id').count()
    tf = place_count[place_count.row_id > 3].reset_index()
    data = data[data['place_id'].isin(tf.place_id)]

    # Split out the feature values and the target values
    y = data['place_id']
    x = data.drop(['place_id'], axis=1)

    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering (standardisation)
    std = StandardScaler()
    # Standardise the feature values of both sets
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # Run the algorithm (n_neighbors is a hyperparameter)
    knn = KNeighborsClassifier()
    # fit, predict, score
    knn.fit(x_train, y_train)
    y_predict = knn.predict(x_test)
    # print("Predicted check-in locations:", y_predict)
    # print("Accuracy:", knn.score(x_test, y_test))

    # Grid-search over some candidate parameter values
    param = {"n_neighbors": [3, 5, 10]}
    gc = GridSearchCV(knn, param_grid=param, cv=2)
    gc.fit(x_train, y_train)

    # Prediction accuracy
    print("Accuracy on the test set:", gc.score(x_test, y_test))
    print("Best cross-validation score:", gc.best_score_)
    print("Best model:", gc.best_estimator_)
    print("Cross-validation results per hyperparameter:", gc.cv_results_)
    return None
#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")  # keep axes consistent with the fast points
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(features_train, labels_train)
pred = neigh.predict(features_test)

from sklearn.metrics import accuracy_score
print accuracy_score(pred, labels_test)

########################
### adaboost algorithm
########################
from time import time
from sklearn.ensemble import AdaBoostClassifier

print "-:: adaboost ::------------------"
t0 = time()
adab = AdaBoostClassifier(n_estimators=100, learning_rate=1)
def fitness(individual, granulationBucket, trEmbeddBucket, vsEmbeddBucket,
            TRindices, VSindices, TRlabels, VSlabels):
    Q = individual[0]
    wNSub = individual[1]
    wNIns = individual[2]
    wNDel = individual[3]
    wESub = individual[4]
    wEIns = individual[5]
    wEDel = individual[6]
    tau = individual[7]
    eta = individual[8]
    Repr = Medoid

    # Setting GED
    graphDist = BMF(nodeDissimilarity, edgeDissimilarity)
    graphDist.nodeSubWeight = wNSub
    graphDist.nodeInsWeight = wNIns
    graphDist.nodeDelWeight = wNDel
    graphDist.edgeSubWeight = wESub
    graphDist.edgeInsWeight = wEIns
    graphDist.edgeDelWeight = wEDel

    # Setting granulation strategy
    granulationStrategy = BsasBinarySearch(graphDist, Repr, 0.1)
    granulationStrategy.BsasQmax = Q
    granulationStrategy.eta = eta
    granulationStrategy.symbol_thr = tau

    # Set up embedder
    embeddingStrategy = SymbolicHistogram(Dissimilarity=graphDist,
                                          isSymbolDiss=False, isParallel=False)

    # Start granulation
    granulationStrategy.granulate(granulationBucket)

    # Retrieve alphabet
    alphabet = granulationStrategy.symbols
    if alphabet:
        # Embed with current symbols
        # embeddingStrategy.getSet(trEmbeddBucket, alphabet)
        # TRembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        # TRpatternID = embeddingStrategy._embeddedIDs
        ## Debug
        embeddingStrategy.getSetDebug(trEmbeddBucket, alphabet, TRindices)
        TRembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        TRpatternID = embeddingStrategy._embeddedIDs
        print(np.all(np.asarray(TRlabels) == np.asarray(embeddingStrategy._embeddedClass)))

        # embeddingStrategy.getSet(vsEmbeddBucket, alphabet)
        # VSembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        # VSpatternID = embeddingStrategy._embeddedIDs
        embeddingStrategy.getSetDebug(vsEmbeddBucket, alphabet, VSindices)
        VSembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        VSpatternID = embeddingStrategy._embeddedIDs
        print(np.all(np.asarray(VSlabels) == np.asarray(embeddingStrategy._embeddedClass)))

        # Re-sort matrices for consistency with the dataset
        TRorderID = np.asarray([TRpatternID.index(x) for x in TRindices])
        VSorderID = np.asarray([VSpatternID.index(x) for x in VSindices])
        TRMat = TRembeddingMatrix[TRorderID, :]
        VSMat = VSembeddingMatrix[VSorderID, :]

        # DEBUG
        # x = np.all(TRMat == TRembeddingMatrix2)
        # y = np.all(VSMat == VSembeddingMatrix2)
        # print(x, y)

        classifier = KNN()
        classifier.fit(TRMat, TRlabels)
        predictedVSLabels = classifier.predict(VSMat)
        # classifier.fit(TRembeddingMatrix, TRlabels)
        # predictedVSLabels = classifier.predict(VSembeddingMatrix)

        accuracyVS = sum(predictedVSLabels == VSlabels) / len(VSlabels)
        print("Accuracy VS = {}".format(accuracyVS))

        # Minimisation problem
        indFit = 0.9 * (1 - accuracyVS) + 0.1 * (len(alphabet) / len(granulationBucket))
    else:
        print("Empty alphabet. Penalising fitness with worst fitness")
        indFit = 1

    fitness = indFit
    return fitness,
# -*- coding=utf-8 -*-
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

iris = load_iris()
print iris.DESCR

x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.25, random_state=666)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

knnc = KNeighborsClassifier()
knnc.fit(x_train, y_train)
predict = knnc.predict(x_test)

print 'The accuracy of KNN classifier is', knnc.score(x_test, y_test)
print metrics.classification_report(y_test, predict)
print metrics.confusion_matrix(y_test, predict)
     'female', 'male', 'male']

testX = [[175, 63, 43], [180, 69, 44], [162, 54, 38]]
testY = ['male', 'male', 'female']

# classification tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

# classification GaussianNB
clf2 = GaussianNB()
clf2.fit(X, Y)

# classification neighbors
clf3 = KNeighborsClassifier(n_neighbors=3, algorithm='auto')
clf3.fit(X, Y)

# classification SVC
clf4 = SVC()
clf4.fit(X, Y)

alt = input("Enter your height: ")
peso = input("Enter your weight: ")
talla = input("Enter your shoe size: ")

predicition = clf.predict([[int(alt), int(peso), int(talla)]])
points = clf.score(testX, testY)  # 190, 70, 43

print(predicition, "\nThe decision tree accuracy was:", points)
    vector = vectorizeSentence(sentence.split())
    all_features[i, :] = vector

all_features = pd.DataFrame(all_features)
all_features['label'] = full_data['dialog_act']
all_features = all_features.dropna()

X = all_features.drop('label', axis=1)
y = all_features['label']
# y = LabelBinarizer().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.15, random_state=101)

modelsToTest = [baselineModel, LogisticRegression(), SVC(kernel='linear'),
                DecisionTreeClassifier(), RandomForestClassifier(n_estimators=200),
                KNeighborsClassifier(n_neighbors=10)]
modelNames = ['Neural network', 'Logistic regression', 'Support vector machine',
              'Decision Tree', 'Random Forest', 'KNN']
performances = []
for model, name in zip(modelsToTest, modelNames):
    print('fitting model {}'.format(name))
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    fscore = 2 * precision * recall / (precision + recall)
    return tp, tn, fp, fn, acc, fscore


train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

corpus = train_df['text']
toate_cuvintele = get_corpus_vocabulary(corpus)
wd2idx, idx2wd = get_representation(toate_cuvintele, 70)

data = corpus_to_bow(corpus, wd2idx)
labels = train_df['label']
test_data = corpus_to_bow(test_df['text'], wd2idx)

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=7)

# write the predictions (75.225% Kaggle score)
clf.fit(data, labels)
preds = clf.predict(test_data)
write_prediction('352_TroianStefan_submisie1.csv', preds)

# # k-fold and confusion matrix
# predictie_medie = []
# tp = []
# tn = []
# fp = []
# fn = []
# acc = []
random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
X = X.reshape((X.shape[0], -1))

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=60000,
                                                    test_size=10000)
label_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

begin = time.time()
k = KNeighborsClassifier(n_neighbors=3, metric='euclidean', n_jobs=-1)
k.fit(X_train, y_train)
k_y_pred = k.predict(X_test)
print(classification_report(y_test, k_y_pred, target_names=label_names))
end = time.time()
print("time:", end - begin)

begin = time.time()
LR = LogisticRegression(penalty='l2', solver='saga', max_iter=50, n_jobs=-1)
LR.fit(X_train, y_train)
LR_y_pred = LR.predict(X_test)
print(classification_report(y_test, LR_y_pred, target_names=label_names))
end = time.time()
print("time:", end - begin)

# In[12]:
def cross_validate(x_train, y_train, x_test, y_test):
    # using grid search instead for now
    n_neighbors = 2
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    best_k = None
    best_score = 0
    for train_index, test_index in kf.split(x_train):
        x_train_folds, x_val_fold = x_train[train_index], x_train[test_index]
        y_train_folds, y_val_fold = y_train[train_index], y_train[test_index]
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(x_train_folds, y_train_folds)
        score = knn.score(x_val_fold, y_val_fold)
        print('K = ', n_neighbors, ':', score)
        if score > best_score:
            best_score = score  # record the score, otherwise best_k just tracks the last k
            best_k = n_neighbors
        n_neighbors += 1

    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(x_train, y_train)
    pred = knn.predict(x_test)
    final_score = knn.score(x_test, y_test)
    matrix = confusion_matrix(y_test, pred)
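# The opening comment mentions moving to grid search; a minimal equivalent
# with GridSearchCV (the k range is an assumption):
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': list(range(2, 7))}  # hypothetical k range
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
search.fit(x_train, y_train)
print('best k:', search.best_params_['n_neighbors'])
print('test accuracy:', search.score(x_test, y_test))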
from sklearn.neighbors import KNeighborsClassifier

# Generate random samples.
# multivariate_normal draws from a multivariate normal distribution.
n_points = 100
X1 = np.random.multivariate_normal([1, 50], [[1, 0], [0, 10]], n_points)
X2 = np.random.multivariate_normal([2, 50], [[1, 0], [0, 10]], n_points)
# concatenate joins multiple arrays; the default axis=0 stacks vertically,
# axis=1 stacks horizontally.
X = np.concatenate([X1, X2], axis=0)
y = np.array([0] * n_points + [1] * n_points)

# KNN training: one model per neighbourhood size
clfs = []
neighbors = [1, 3, 5, 9, 11, 13, 15, 17, 19]
for i in range(len(neighbors)):
    clfs.append(KNeighborsClassifier(n_neighbors=neighbors[i]).fit(X, y))

# Visualise the results
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# meshgrid builds coordinate matrices from the x and y ranges, returning the
# grid of x coordinates and the grid of y coordinates.
# See https://www.cnblogs.com/gengyi/p/9420559.html
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(15, 12))

for idx, clf, tt in zip(product([0, 1, 2], [0, 1, 2]), clfs,
                        ['KNN (k=%d)' % k for k in neighbors]):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
# In[ ]:

# Validation Set approach
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=42)
accuracyList = {}

clf = tree.DecisionTreeClassifier(class_weight="balanced")
gaussian = GaussianNB()
logreg = LogisticRegression(class_weight="balanced")
boost = GradientBoostingClassifier()
knn = KNeighborsClassifier(n_neighbors=3)
forest = RandomForestClassifier(n_estimators=20)

models = [clf, gaussian, logreg, boost, knn, forest]
for model in models:
    accuracyList[model] = 0

for model in models:
    modelV = model.fit(X_train, Y_train)
    Y_pred = modelV.predict(X_test)
    accuracyList[model] = round(modelV.score(X_test, Y_test) * 100, 3)

# models = [clf, gaussian, logreg, boost, knn, forest]
accResult = pd.DataFrame({
    'Model': [
sgd_score = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
sgd_score
sgd_score.mean()

# ## Saving the model

import joblib

filename = 'sgd_clf.sav'
joblib.dump(sgd_clf, filename)

# # K nearest neighbours

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=1, weights="uniform", metric="cosine")
knn_clf.fit(X_train_scaled, y_train)

knn_score = cross_val_score(knn_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
knn_score
knn_score.mean()

# ## Saving the model

import joblib

filename = 'knn_1_uniform_cosine.sav'
joblib.dump(knn_clf, filename)

# # Evaluating on test set
def train_and_eval_ML(X_train, X_test, y_train, y_test, metrics_manager, fold,
                      quick_test=False):
    """
    Train and evaluate traditional ML classifiers from scikit-learn.

    Description:
        This function trains all the models on the given feature set of X
        (data) for predicting y (target) and adds the acquired metrics to the
        MetricsManager object from the user.

    Args:
        X => pd.DataFrame object containing the data
        y => pd.Series object containing the target classifications
        feature_set => list of features in X to use for training
        metrics_manager => MetricsManager object (custom)

    Returns:
        None

    Classifier names used as keys for the manager:
        XGBoost Classifier           => xgb
        Random Forest                => rf
        Decision Tree                => dt
        k-Nearest Neighbors          => knn
        Support Vector Machine       => svm
        Logistic Regression          => lr
        Linear Discriminant Analysis => lda
        AdaBoost                     => ab
        Naive Bayes                  => nb
    """
    random_state = 100

    if quick_test:
        # Random Forest Model
        rf = RandomForestClassifier(random_state=random_state)
        model_eval(rf, 'rf', fold, X_train, X_test, y_train, y_test, metrics_manager)
        # XGBoost Classifier
        xgb = XGBClassifier()
        model_eval(xgb, 'xgb', fold, X_train, X_test, y_train, y_test, metrics_manager)
        return

    # Random Forest Model
    rf = RandomForestClassifier(random_state=random_state)
    model_eval(rf, 'rf', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # XGBoost Classifier
    xgb = XGBClassifier()
    model_eval(xgb, 'xgb', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # AdaBoost Model
    ab = AdaBoostClassifier(random_state=random_state)
    model_eval(ab, 'ab', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # Decision Tree Model
    dt = DecisionTreeClassifier(random_state=random_state)
    model_eval(dt, 'dt', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # k-Nearest Neighbors Model
    knn = KNeighborsClassifier()
    model_eval(knn, 'knn', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # Support Vector Machine Model
    svm = SVC(random_state=random_state)
    model_eval(svm, 'svm', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # Logistic Regression Model
    lr = LogisticRegression(random_state=random_state)
    model_eval(lr, 'lr', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # Linear Discriminant Analysis Model
    lda = LinearDiscriminantAnalysis()
    model_eval(lda, 'lda', fold, X_train, X_test, y_train, y_test, metrics_manager)

    # Naive Bayes Model
    nb = GaussianNB()
    model_eval(nb, 'nb', fold, X_train, X_test, y_train, y_test, metrics_manager)
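# model_eval is referenced above but not shown; a hedged sketch of what it
# plausibly does (the MetricsManager interface, here manager.add, is an
# assumption):
from sklearn.metrics import accuracy_score, f1_score

def model_eval(model, name, fold, X_train, X_test, y_train, y_test, manager):
    # Hypothetical implementation: fit, predict, and hand the fold's metrics
    # to the manager under the classifier's short key.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    manager.add(name, fold, {
        'accuracy': accuracy_score(y_test, preds),
        'f1': f1_score(y_test, preds, average='weighted'),
    })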
# Feature scaling to [0, 1]
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for i in range(4):
    print('Before scaling: max {:.3f}, min {:.3f}'
          .format(X_train.iloc[:, i].max(), X_train.iloc[:, i].min()))
    print('After scaling: max {:.3f}, min {:.3f}'
          .format(X_train_scaled[:, i].max(), X_train_scaled[:, i].min()))
    print()

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_scaled, y_train_binary)
y_pre = knn.predict(X_test_scaled)

# The plots suggest the SVM model is best at C = 10 or C = 100
svm_model = SVC(C=10)
svm_model.fit(X_train_scaled, y_train)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# accuracy
print('Accuracy: {:.3f}'.format(accuracy_score(y_test_binary, y_pre)))
# precision
print('Precision: {:.3f}'.format(precision_score(y_test_binary, y_pre)))
# recall
print('Recall: {:.3f}'.format(recall_score(y_test_binary, y_pre)))
# F1 score
def classify_knn(features_train, labels_train, n):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(features_train, labels_train)
    return neigh
            if y_train[seq[j]][0] == -1.0:
                cnt3 += 1
        if cnt1 >= cnt2 and cnt1 >= cnt3:
            label[i] = 1.0
        if cnt2 > cnt1 and cnt2 >= cnt3:
            label[i] = 0.0
        if cnt3 > cnt2 and cnt3 > cnt1:
            label[i] = -1.0
        print cnt1, cnt2, cnt3
        print label[i], y_test[i]
    acc = 0.0
    for i in range(len(x_test)):
        if label[i] == y_test[i]:
            acc += 1.0
    return acc / len(x_test)


pca = PCA(n_components=200)
filename = 'EEG.mat'
x_train, y_train, x_test, y_test = preprocess(filename)
# x_train = pca.fit_transform(x_train)
# x_test = pca.fit_transform(x_test)
# print k_nn(x_train[:5000], y_train[:5000], x_test[:1500], y_test[:1500], 200)

neighbors = KNeighborsClassifier(n_neighbors=5)
neighbors.fit(x_train, y_train)
pre = neighbors.predict(x_test)
acc = float((pre == y_test).sum()) / len(y_test)
print acc
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib.pyplot as plt

models = [
    KNeighborsClassifier(n_neighbors=5),
    SVC(gamma='auto'),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=20, random_state=0),
    AdaBoostClassifier(n_estimators=100, random_state=0),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis()
]

# Titanic honorifics; "Mlle" and "Countess" are the spellings that actually
# appear in the dataset (the original had "Mle" and "Counthess")
luckies = ["Mrs", "Miss", "Master", "Sir", "Lady", "Ms", "Mlle", "Countess"]
unluckies = ["Mr", "Don", "Rev", "Dr", "Jonkheer"]


def preprocessData(train, test):
    Y_train = train['Survived']
    train['Train'] = train.apply(lambda row: 1, axis=1)
    test['Train'] = test.apply(lambda row: 0, axis=1)
    data = pd.concat([train, test], ignore_index=True, axis=0)
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
# Predict output
gauss_pred = gaussian.predict(X_test)

# Using Logistic Regression
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(X_train, Y_train)
# Predict output
regression_pred = reg.predict(X_test)

# Using K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

k_near = KNeighborsClassifier()
k_near.fit(X_train, Y_train)
# Predict output
k_near_pred = k_near.predict(X_test)

# Using Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_train, Y_train)
# Predict output
dec_tree_pred = dec_tree.predict(X_test)

# Fitting SVC to the dataset
from sklearn.svm import SVC

regressor = SVC()
regressor.fit(X_train, Y_train)
                      C=100., probability=True, class_weight='balanced',
                      kernel='linear'))
clf_output = clf.fit(data_train, targets_train)
print(clf.score(data_test, targets_test))

# Naive Bayes
print("Naive bayes")
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(data_train, targets_train)
predictions = nb.predict(data_test)
print(metrics.accuracy_score(targets_test, predictions))

# KNN
print("KNN")
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(data_train, targets_train)
predictions = KNN.predict(data_test)
print(metrics.accuracy_score(targets_test, predictions))

# Decision Tree
print("Decision Tree")
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(data_train, targets_train)
predictions = clf.predict(data_test)
print(metrics.accuracy_score(targets_test, predictions))
def model(self):
    knn = KNeighborsClassifier(n_neighbors=1, metric='cosine')
    clf = knn.fit(self.X_train_tfidf, self.encode)
    return clf
from sklearn.neighbors import KNeighborsClassifier  # using KNN
from sklearn.datasets import load_iris

iris = load_iris()
features = iris.data
labels = iris.target

from sklearn.cross_validation import train_test_split  # train/test split helper

X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=.3)

neigh = KNeighborsClassifier()
neigh.fit(X_train, Y_train)
# clf = DecisionTreeClassifier()
# clf.fit(X_train, Y_train)
p = neigh.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy =", accuracy_score(Y_test, p))
                                max_depth=8,
                                min_samples_split=12,
                                min_samples_leaf=3,
                                min_weight_fraction_leaf=0.06,
                                max_features='auto',
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                bootstrap=True,
                                oob_score=False,
                                n_jobs=None,
                                random_state=0,
                                verbose=0,
                                warm_start=False,
                                class_weight=None)

knn = KNeighborsClassifier(n_neighbors=40, n_jobs=1)  # k-nearest neighbours

naiveB = naive_bayes.BernoulliNB(alpha=1.6, binarize=1.41, fit_prior=True,
                                 class_prior=None)  # 0.575

svm = SVC(C=1, kernel='rbf', gamma=0.001, probability=True)

LR = LogisticRegression(penalty='l2',
                        dual=False,
                        tol=0.0001,
                        C=4.0,
                        fit_intercept=True,
                        intercept_scaling=2,
                        class_weight=None,
                        random_state=None,
                        solver='liblinear',
                        max_iter=100,
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Assign column names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe
dataset = pd.read_csv(url, names=names)
print(dataset.head())

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
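# A common extension of this tutorial (not in the original) is plotting the
# error rate against k to pick a neighbourhood size; a minimal sketch using
# the scaled splits from above:
import numpy as np
import matplotlib.pyplot as plt

error = []
for k in range(1, 41):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    error.append(np.mean(knn.predict(X_test) != y_test))

plt.plot(range(1, 41), error, marker='o')
plt.xlabel('K')
plt.ylabel('Mean error')
plt.show()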
def data_trainer(df, algo, y, x1, x2=None, x3=None, x4=None, x5=None,
                 x6=None, x7=None, x8=None, x9=None, x10=None):
    # collect the non-empty feature names
    checklist = [x for x in (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10)
                 if x is not None]

    X = df[checklist]
    y = df[y]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=0)

    # training
    if algo == 'DecisionTreeRegressor':
        regressor = DecisionTreeRegressor()
    elif algo == 'DecisionTreeClassifier':
        regressor = DecisionTreeClassifier()
    elif algo == 'SVC':
        regressor = SVC(kernel='rbf', probability=True)
    elif algo == 'GaussianNB':
        regressor = GaussianNB()
    elif algo == 'RandomForestClassifier':
        regressor = RandomForestClassifier()
    elif algo == 'KNeighbors':
        regressor = KNeighborsClassifier()
    elif algo == 'MLP':
        regressor = MLPClassifier()
    else:
        raise ValueError('unknown algo: {}'.format(algo))  # avoid NameError below
    regressor.fit(X_train, y_train)

    # prediction
    y_pred = regressor.predict(X_test)
    prediction = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print(prediction)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print()

    '''
    pickle.dump(regressor, open(targetFile, 'wb'))
    print('Model saved: ', targetFile)
    print()
    loaded_model = pickle.load(open(targetFile, 'rb'))
    result = loaded_model.score(X_test, y_test)
    result = round((result * 100), 2)
    print('Confidence: ', result)
    '''
    return prediction
def assign_labels(X_total, X_pred, y_pred):
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_pred, y_pred)
    return knn.predict(X_total)
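# A typical use of assign_labels is propagating cluster labels from a small
# sample to the full set; a minimal sketch, assuming an X_total array exists
# and KMeans is run on a 10% subsample:
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
sample_idx = rng.choice(len(X_total), size=len(X_total) // 10, replace=False)
X_sample = X_total[sample_idx]

# Cluster only the sample, then let 1-NN extend those labels everywhere.
y_sample = KMeans(n_clusters=8, n_init=10).fit_predict(X_sample)
labels_full = assign_labels(X_total, X_sample, y_sample)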
def run_knn(X_train, X_test, y_train, y_test):
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
                                      test_size=0.1, random_state=84)

# Look at the size of each split
print("training data points: {}".format(len(trainLabels)))
print("validation data points: {}".format(len(valLabels)))
print("testing data points: {}".format(len(testLabels)))

# Initialise the k values for our knn classifier
kVals = range(1, 30, 2)
accuracies = []

# Loop over kVals
for k in range(1, 30, 2):
    # Train the classifier with the current value of "k"
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(trainData, trainLabels)

    # Evaluate the model and print the score
    score = model.score(valData, valLabels)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)

# Largest accuracy
i = np.argmax(accuracies)
print("k=%d achieved highest accuracy of %.2f%% on validation data" % (kVals[i],
                                                                       accuracies[i] * 100))

# Now that we have the best k value, retrain the classifier
model = KNeighborsClassifier(n_neighbors=kVals[i], algorithm='brute')
model.fit(trainData, trainLabels)
def knn(n_neighbors=5):
    knn = KNeighborsClassifier(n_neighbors, weights="uniform", n_jobs=-1)
    return knn
def train_model(feats_csv):
    df = pd.read_csv(feats_csv).iloc[:, 1:]
    y = np.ravel(df.iloc[:, -1:])
    X = np.array(df.iloc[:, :-1])

    # 15 best features selected with the ANOVA F-value score function
    X_new = SelectKBest(f_classif, k=15).fit_transform(X, y)
    selected_features = SelectKBest(f_classif, k=15).fit(X, y).get_support(indices=True)

    # KNN with Manhattan distance
    # preprocessing: data scaling
    min_max_scaler = MinMaxScaler()
    X_new = min_max_scaler.fit_transform(X_new)

    model = KNeighborsClassifier(n_neighbors=1, algorithm='brute',
                                 metric='manhattan', weights='uniform')
    model.fit(X_new, y)

    newdir = '../kNN_clfr'
    if not os.path.exists(newdir):  # os.mkdir raises if the directory already exists
        os.mkdir(newdir)
    joblib.dump(model, os.path.join(newdir, 'kNN.pkl'))
    return