def traineKNNHSV(inFHEE, inFPAS, base):
    """Train four KNN classifiers on pickled H-channel samples and save them.

    Reads ``int(qtdDataTrainning / 2)`` pickled samples from each stream,
    alternating H&E / PAS and labelling them with the module-level
    ``labelHEE`` / ``labelPAS``. Each of the 10 folds re-fits classifiers with
    k = 5, 10, 15 and 20 on the training split and prints their accuracy on
    the held-out split. The four classifiers are then pickled, in that order,
    into one output file selected by ``base``.

    NOTE(review): the persisted classifiers are the ones fitted on the last
    fold's training split, not on the full data set -- confirm this is wanted.

    :param inFHEE: open binary stream of pickled H&E samples; closed here.
    :param inFPAS: open binary stream of pickled PAS samples; closed here.
    :param base: 360, 180, 90 or 45; any other value uses the file for 360.
    """
    X = []
    y = []
    try:
        # SECURITY: pickle.load can execute arbitrary code; only feed this
        # function files produced by the project's own feature extraction.
        for _ in range(int(qtdDataTrainning / 2)):
            X.append(pickle.load(inFHEE))
            y.append(labelHEE)
            X.append(pickle.load(inFPAS))
            y.append(labelPAS)
    finally:
        # The function owns the streams: release them even if loading fails.
        inFHEE.close()
        inFPAS.close()

    X1 = np.array(X)
    y1 = np.array(y)
    classifiers = [KNeighborsClassifier(n_neighbors=n) for n in (5, 10, 15, 20)]

    kf = KFold(n_splits=10)
    print(len(y1), len(X1))
    for k, (train, test) in enumerate(kf.split(X1, y1)):
        print("TRAIN:", train, " - TEST:", test)
        for clf in classifiers:
            clf.fit(X1[train], y1[train])
        print("K ->", k)
        for clf in classifiers:
            print(clf.score(X1[test], y1[test]))

    if base == 180:
        n = "..\\trainedKNN_1HistogramColors_H.bin"
    elif base == 90:
        n = "..\\trainedKNN_2HistogramColors_H.bin"
    elif base == 45:
        n = "..\\trainedKNN_3HistogramColors_H.bin"
    else:
        # 360 and every unrecognised base share the same output file.
        n = "..\\trainedKNN_0HistogramColors_H.bin"

    with open(n, "wb") as knnFile:
        for clf in classifiers:
            print(clf.get_params())
        for clf in classifiers:
            pickle.dump(clf, knnFile)
def traineKNN(channels, numberOfShifts):
    """Train four KNN classifiers on colour-histogram samples and save them.

    Opens the H&E and PAS training files for ``channels`` /
    ``numberOfShifts``, reads ``int(qtdDataTrainning / 2)`` pickled samples
    from each (alternating, labelled with the module-level ``labelHEE`` /
    ``labelPAS``), converts them with ``linearizeTraining`` and runs a 10-fold
    split. Every fold re-fits classifiers with k = 5, 10, 15 and 20 and prints
    their accuracy. The four classifiers are pickled, in that order, to
    ``..\\trainedKNN_<numberOfShifts>HistogramColors_<channels>.bin``.

    NOTE(review): the persisted classifiers are the ones fitted on the last
    fold's training split, not on the full data set -- confirm this is wanted.

    :param channels: channel identifier used in the file names.
    :param numberOfShifts: right shift applied to 256 to obtain the base.
    """
    fHEE = "..\\Logs\\3\\Treinamento\\"+channels+"\\output"+channels+"_"+str(numberOfShifts)+"HistogramColors_H&E.bin"
    # NOTE(review): the "******" prefix looks like a redacted placeholder --
    # confirm the real directory before running.
    fPAS = "******"+channels+"\\output"+channels+"_"+str(numberOfShifts)+"HistogramColors_PAS.bin"

    X = []
    y = []
    # Both inputs are closed even when the second open or a load fails.
    with open(fHEE, 'rb') as inFHEE, open(fPAS, 'rb') as inFPAS:
        print("Training KNN to channels "+channels+" with "+str(numberOfShifts)+" shifts")
        # SECURITY: pickle.load can execute arbitrary code; only feed this
        # function files produced by the project's own feature extraction.
        for _ in range(int(qtdDataTrainning / 2)):
            X.append(pickle.load(inFHEE))
            y.append(labelHEE)
            X.append(pickle.load(inFPAS))
            y.append(labelPAS)

    base = 256 >> numberOfShifts
    X1 = linearizeTraining(base, channels, X)
    y1 = np.array(y)
    classifiers = [KNeighborsClassifier(n_neighbors=n) for n in (5, 10, 15, 20)]

    kf = KFold(n_splits=10)
    print(len(y1), len(X1))
    for k, (train, test) in enumerate(kf.split(X1, y1)):
        print("TRAIN:", train, " - TEST:", test)
        for clf in classifiers:
            clf.fit(X1[train], y1[train])
        print("K ->", k)
        for clf in classifiers:
            print(clf.score(X1[test], y1[test]))

    with open("..\\trainedKNN_"+str(numberOfShifts)+"HistogramColors_"+channels+".bin", "wb") as knnFile:
        for clf in classifiers:
            print(clf.get_params())
        for clf in classifiers:
            pickle.dump(clf, knnFile)
def kFoldValidationMethod(X_train, y_train):
    """Compare three default classifiers with 10-fold validation.

    For each fold a default ``LogisticRegression``, ``RandomForestClassifier``
    and ``KNeighborsClassifier`` is fitted on the training split and scored
    with ``accuracy_score`` on the validation split.

    :param X_train: feature table supporting positional ``.iloc`` indexing.
    :param y_train: target supporting positional ``.iloc`` indexing.
    :return: tuple ``(best_lr, best_rf, best_knn, params_lr, params_rf,
        params_knn, accuracy_table)``. The ``best_*`` values are the highest
        fold accuracy per model. The ``params_*`` values are ``get_params()``
        of the model that achieved it, or ``None`` if no fold scored above 0.
        ``accuracy_table`` is a DataFrame with one row per fold.
    """
    columns = ['Logistic Regression', 'Random Forest', 'KNN Classifier']
    factories = (LogisticRegression, RandomForestClassifier, KNeighborsClassifier)

    best_accuracy = [0, 0, 0]
    # Pre-set to None so the return cannot hit an unbound local.
    best_params = [None, None, None]
    rows = []

    kf = KFold(n_splits=10)
    for train_index, validation_index in kf.split(X_train):
        X_fit, y_fit = X_train.iloc[train_index], y_train.iloc[train_index]
        X_val, y_val = X_train.iloc[validation_index], y_train.iloc[validation_index]

        row = {}
        for i, (column, factory) in enumerate(zip(columns, factories)):
            model = factory().fit(X_fit, y_fit)
            accuracy = accuracy_score(y_val, model.predict(X_val))
            row[column] = accuracy
            if accuracy > best_accuracy[i]:
                best_accuracy[i] = accuracy
                best_params[i] = model.get_params()
        rows.append(row)

    # DataFrame.append was removed in pandas 2.0, so build the table once.
    accuracyTable = pd.DataFrame(rows, columns=columns)

    return (best_accuracy[0], best_accuracy[1], best_accuracy[2],
            best_params[0], best_params[1], best_params[2],
            accuracyTable)
def KNeighbors_Model(X_train, y_train, X_test, y_test):
    """Pick the best k in 1..3 for a KNN classifier and report on it.

    Each candidate is fitted on the training data and scored on the test
    data. The printed score, the cross-validation scores, the parameter table
    and the confusion matrix all describe the best-scoring candidate, and its
    score is returned. Previously the parameter table, the heat-map and the
    return value came from the last candidate (k=3) even when a smaller k had
    scored higher.

    :param X_train: training features.
    :param y_train: training labels.
    :param X_test: test features.
    :param y_test: test labels.
    :return: the highest test accuracy among the candidates.
    """
    max_score = 0.0
    best_model = None
    best_predictions = None
    cv_scores = None

    for n_neighbors in range(1, 4):
        model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
        score = model.score(X_test, y_test)
        # Strict '>' keeps the smallest k on ties; the None check guarantees a
        # model is selected even if every candidate scores 0.
        if best_model is None or score > max_score:
            max_score = score
            best_model = model
            best_predictions = model.predict(X_test)
            cv_scores = cross_val_score(model, X_test, y_test, cv=3)

    print(' ')
    print('===== k-Neighbors Model =====')
    print('score:', max_score)
    print('cross validation scores:', cv_scores)

    # Visualize parameters in a table.
    visualize_params(best_model.get_params())

    # Display confusion matrix.
    visualize_heatmap(y_test, best_predictions, 'k-Neighbors')

    return max_score
def getmodel(self, type):
    """Build the estimator registered under the short name ``type``.

    Supported names are 'knn', 'nearestcentroid', 'svm', 'gaussianprocess',
    'rf', 'ada' and 'mlp'. Any other name prints a message and terminates the
    process with exit status 1. If the estimator exposes a ``random_state``
    parameter it is overwritten with ``self.random_state``.

    :param type: short estimator name.
    :return: the unfitted estimator.
    """
    # Constructors are wrapped in lambdas so only the requested one runs.
    builders = {
        'knn': lambda: KNeighborsClassifier(n_neighbors=5),
        'nearestcentroid': lambda: NearestCentroid(),
        'svm': lambda: SVC(gamma='scale'),
        'gaussianprocess': lambda: GaussianProcessClassifier(),
        'rf': lambda: RandomForestClassifier(n_estimators=100, max_features=10, max_depth=5),
        'ada': lambda: AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=100, random_state=24),
        'mlp': lambda: MLPClassifier(solver='adam', hidden_layer_sizes=(28, 28), random_state=1),
    }

    build = builders.get(type)
    if build is None:
        print("Unsupported quality estimator %s" % type)
        exit(1)

    model = build()
    if 'random_state' in model.get_params():
        model.set_params(random_state=self.random_state)
    return model
def assign_to_clusters(x_train: np.ndarray, clusters: np.ndarray,
                       x_test: np.ndarray, y_true: np.ndarray) -> None:
    """
    Fits a default nearest neighbors classifier on the clustered data
    and uses it to predict a cluster for every new sample.
    Unless plotting is disabled, the result is plotted next to the clusters.

    :param x_train: the data which have been clustered.
    :param clusters: the clusters.
    :param x_test: the data to be assigned to clusters.
    :param y_true: the data class labels.
    """
    logger.log('Creating Nearest Neighbors classifier with params:')
    classifier = KNeighborsClassifier()
    params = classifier.get_params()
    logger.log(params)

    classifier.fit(x_train, clusters)
    predicted = classifier.predict(x_test)

    if PLOTTING_MODE == 'none':
        return

    # Plot data vs clusters.
    neighbors = params['n_neighbors']
    plotter.subfolder = 'classification'
    plotter.filename = 'data_vs_clusters-k={}'.format(neighbors)
    plotter.xlabel = 'first feature'
    plotter.ylabel = 'second feature'
    plotter.title = 'Classified data vs Clusters'
    plotter.scatter_classified_comparison(
        x_train, clusters, x_test, y_true, predicted,
        'Test data vs clusters',
        'Test data assigned to clusters\nk={}'.format(neighbors),
        helpers.datasets.get_gene_name)
def test_nearest_neighbor_defaults():
    """Unit test for the Nearest Neighbors classifier container.

    Checks that a default ``KNeighborsContainer`` builds a classifier with
    the same parameters as a default sklearn ``KNeighborsClassifier``, and
    that its ``evaluate`` output matches the sklearn predictions for every
    tutorial dataset."""
    datasets = generate_tutorial_data()
    reference = KNeighborsClassifier()

    # Reference predictions computed directly with sklearn.
    expected_predictions = []
    for name in datasets:
        features, labels = datasets[name]
        reference.fit(features, labels)
        expected_predictions.append(reference.predict(features))

    container = classifiers.KNeighborsContainer()

    # Default parameters must be equal.
    assert container.create_clf().get_params() == reference.get_params()

    # The evaluate function must reproduce the reference predictions.
    for expected, name in zip(expected_predictions, datasets):
        features, labels = datasets[name]
        pipeline_ds = generate_pipeline_dataset(features, labels)
        actual = container.evaluate(pipeline_ds.training_set,
                                    pipeline_ds.testing_set)
        assert len(actual) == len(expected)
        assert (actual == expected).all()
def get_knn(x_train, t_train, x_val, t_val, search=False):
    """Fit a KNN classifier and print its parameters and validation result.

    With ``search`` set, the hyper-parameters come from ``param_sel`` over a
    small grid; otherwise a fixed, previously selected set is used.

    Recorded results from an earlier run:
    KNN params: {'algorithm': 'kd_tree', 'leaf_size': 30, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}
    KNN tested at (array([0.88484087, 0.88107203, 0.89007538, 0.88628272, 0.89256545]), 0.6603707265070087, 0.9209732808442919)

    :return: the fitted classifier.
    """
    if not search:
        chosen = {
            'algorithm': 'kd_tree',
            'leaf_size': 30,
            'n_neighbors': 20,
            'p': 1,
            'weights': 'distance',
        }
    else:
        grid = {
            'n_neighbors': [1, 3, 5, 10, 20],
            'weights': ['uniform', 'distance'],
            'algorithm': ['ball_tree', 'kd_tree'],
            'p': [1, 2],
        }
        chosen = param_sel(x_train, t_train, KNeighborsClassifier(), grid)

    model = KNeighborsClassifier(**chosen)
    model.fit(x_train, t_train)
    print("KNN params:", model.get_params())
    print("KNN validated at", validate(model, x_val, t_val))
    return model
def runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    """Train, tune and evaluate a distance-weighted KNN, logging to a file.

    Steps:
      1. fit a baseline classifier with the default neighbour count, plot its
         learning curve and log its test score;
      2. try every neighbour count in ``range(2, baseline * 10)``, scoring
         each on the hold-out set;
      3. refit with the best count, report test and training scores through
         ``outputScores``, list up to ten misclassified test samples and plot
         the final learning curve.

    Everything is appended to ``knnLog25.txt``, which is now closed on every
    exit path. Prints are written as single-argument calls so the function
    runs unchanged on Python 2 and Python 3.

    :param dataTrain: object exposing ``target`` for the training set.
    :param dataTest: object exposing ``data`` and ``target`` for the test set.
    :param holdout: object exposing ``target`` for the hold-out set.
    :param train_M: training feature matrix (needs ``.shape``).
    :param test_M: test feature matrix (needs ``.shape``).
    :param hold_M: hold-out feature matrix.
    """
    with open('knnLog25.txt', 'a') as outFile:
        print('running mashable knn simulation')
        outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
        outFile.write('test==> %d, %d \n' % (test_M.shape[0], test_M.shape[1]))

        with SimpleTimer('time to train', outFile):
            clf = KNeighborsClassifier(weights='distance').fit(train_M, dataTrain.target)

        baseParams = clf.get_params(True)
        baseNeighbors = baseParams['n_neighbors']
        # The title used to be passed with a raw '%d'; fill in the real count.
        plot_learning_curve(clf, 'knn with %d neighbors' % baseNeighbors,
                            train_M, dataTrain.target, cv=5, n_jobs=4)

        baseScore = clf.score(test_M, dataTest.target)
        print('baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors))
        outFile.write('baseline score %.3f base height %d \n' % (baseScore, baseNeighbors))

        res = []
        with SimpleTimer('time to fine tune number of neighbors', outFile):
            for neighbors in range(2, baseNeighbors * 10):
                clf = KNeighborsClassifier(n_neighbors=neighbors,
                                           weights='distance').fit(train_M, dataTrain.target)
                score = clf.score(hold_M, holdout.target)
                res.append((score, neighbors))
                outFile.write('%d %.3f \n' % (neighbors, score))

        # Stable sort: among equal scores the smallest neighbour count wins.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])
        bestNeighbors = res[0][1]
        print('best number of neighbors is %d' % bestNeighbors)
        outFile.write('best number of neighbors is %d and score is %.3f\n' % (bestNeighbors, res[0][0]))

        bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
        bestClf.fit(train_M, dataTrain.target)
        predicted = bestClf.predict(test_M)
        trainPredict = bestClf.predict(train_M)

        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # The log used to repeat 'testing score' here (copy/paste slip).
        outFile.write('training score')
        outputScores(dataTrain.target, trainPredict, outFile)

        results = predicted == dataTest.target
        print(numpy.mean(results))

        wrong = [i for i, correct in enumerate(results) if not correct]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

        plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors,
                            train_M, dataTrain.target, cv=5, n_jobs=4)
def perform_KNeighborsClassifier(self):
    """Fit a default KNN on the training data and report on the test data.

    Stores the classifier parameters, the predicted test labels and the test
    accuracy in ``self.KNeighborsClassifier_result``, prints that dictionary
    with ``print_dict`` and then prints the macro-averaged F1 score.
    """
    model = KNeighborsClassifier()
    model.fit(self.data_train, self.labels_train)

    parameters = model.get_params()
    predicted = model.predict(self.data_test)
    accuracy = model.score(self.data_test, self.labels_test)

    self.KNeighborsClassifier_result = {
        "parameters": parameters,
        "labels_test_data": predicted,
        "score": accuracy,
    }
    print_dict(self.KNeighborsClassifier_result)

    print("f1_score:")
    macro_f1 = f1_score(self.labels_test,
                        self.KNeighborsClassifier_result["labels_test_data"],
                        average='macro')
    print(macro_f1)
def recommend(interactions_map, item_profiles, user_profiles, target_users):
    """Print nearest-neighbour item suggestions for the first target users.

    Groups the de-duplicated (user, item) interactions per user, cleans the
    item table (NaN -> 0, then ``prepare_row`` on every row) and fits a
    brute-force cosine KNN on it. For each target user the first interacted
    item is queried and the five nearest neighbours are printed.

    The loop deliberately stops after the third user (``u > 1``), so this is
    a smoke run rather than a full recommendation pass. Older baselines that
    can be swapped in for comparison: ``recommendTopPop`` and
    ``recommendTopPopNP`` (same arguments).

    NOTE(review): the model is fitted on full item-profile rows but queried
    with a bare item id of shape (1, 1); confirm the intended query vector.

    :param interactions_map: DataFrame with ``user_id`` and ``item_id``.
    :param item_profiles: DataFrame of item features with an ``id`` column.
    :param user_profiles: unused here.
    :param target_users: object exposing a ``user_id`` column.
    :return: ``None``.
    """
    print("Listing users interactions...")
    tic = time.time()
    print(interactions_map.head())
    interacting_items = interactions_map[['user_id', 'item_id']]
    print('--------------------------')
    print(interacting_items.head())
    listed_interactions = interacting_items.groupby('user_id').apply(
        lambda group: group.drop_duplicates())
    listed_interactions = listed_interactions[['item_id']]
    print(listed_interactions.head())
    print("Interactions listed in {:.3f} sec!".format(time.time() - tic))

    # Prepare the item table, making all fields meaningful
    print("Preparing rows...")
    tic = time.time()
    item_profiles = item_profiles.fillna(0)
    print("# NaN filled")
    item_profiles = item_profiles.apply(prepare_row, axis=1)
    print("Done in {:.3f} sec!".format(time.time() - tic))

    print("Training started...")
    neigh = KNeighborsClassifier(n_neighbors=5, metric='cosine',
                                 algorithm='brute')
    print(neigh.get_params())
    neigh.fit(item_profiles, item_profiles.id)
    print("Training completed!")

    print("Creating recommendations")
    target_users = pd.Series(target_users.user_id)
    for u, user in enumerate(target_users):
        # Series.reshape no longer exists in pandas; reshape the underlying
        # ndarray instead. X can be an array of points!
        X = listed_interactions.loc[user][:1].item_id.values.reshape(1, -1)
        print(X)
        print("User {} gets these recs: {}".format(
            user,
            neigh.kneighbors(X=X, n_neighbors=5, return_distance=False)))
        if u > 1:
            return
    print("Done!")
    return
class KNN(ClassicalModel):
    """ClassicalModel wrapper around scikit-learn's KNeighborsClassifier."""

    def __init__(self, input_size, output_size, labels, class_weights=None, **kwargs):
        """Initialise the base model and build the classifier.

        Extra keyword arguments are passed straight to
        ``KNeighborsClassifier``; ``name`` records the resulting parameters.
        """
        super().__init__(input_size, output_size, labels, class_weights)
        classifier = KNeighborsClassifier(**kwargs)
        self.model = classifier
        description = str(classifier.get_params())
        self.name = "KNN:\n" + description
def knnClassifier(X_train, X_test, y_train, y_test):
    """Fit a default KNN, then print its test F1 score and its parameters."""
    print("knn")
    classifier = KNeighborsClassifier()
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)
    score = f1_score(y_test, predictions)
    print(score)

    # Show which hyper-parameters the classifier is running with.
    print('Parameters currently in use:\n')
    current = classifier.get_params()
    print(current)
def knn():
    """Return an unfitted grid search over KNN hyper-parameters.

    The search covers ``n_neighbors`` in {3, 5, 7, 10} and both weighting
    schemes, scored by accuracy with 5-fold cross-validation. The
    ``GridSearchCV`` line used to be commented out while ``clf`` was still
    returned, so every call raised ``NameError``.

    :return: a ``GridSearchCV`` wrapping a default ``KNeighborsClassifier``.
    """
    # Local import: the top-of-file imports are not visible from this block.
    from sklearn.model_selection import GridSearchCV

    estimator = KNeighborsClassifier()
    print(estimator.get_params().keys())
    param_grid = {
        "n_neighbors": [3, 5, 7, 10],
        "weights": ["distance", "uniform"],
    }
    clf = GridSearchCV(estimator, param_grid=param_grid,
                       scoring='accuracy', cv=5)
    return clf
def flastClassification(trainData, trainLabels, testData, sigma, k, params):
    """Fit a KNN and classify test samples with a thresholded weighted vote.

    For every test sample the votes of its ``k`` nearest training samples are
    summed into ``phi`` (neighbours labelled 1) and ``psi`` (all others). A
    vote weighs ``1 / distance`` when ``params["weights"]`` is "distance"
    (infinite for a zero distance) and 1 otherwise. The sample is predicted 1
    when ``phi / (phi + psi) >= sigma``. Limit cases:

    * ``psi`` infinite (an exact match with a non-1 label)  -> 0
    * only ``phi`` infinite                                 -> 1
    * ``phi + psi == 0``                                    -> 0

    The weighting mode is now looked up once. It used to call
    ``get_params()`` for every neighbour inside the timed section, which
    inflated ``testTime``.

    :param trainData: training feature matrix.
    :param trainLabels: training labels, indexable by neighbour index.
    :param testData: test feature matrix.
    :param sigma: decision threshold on the positive vote share.
    :param k: number of neighbours.
    :param params: dict with the keys "algorithm", "metric" and "weights".
    :return: ``(trainTime, testTime, predictLabels)``, times in seconds.
    """
    # training
    t0 = time.perf_counter()
    kNN = KNeighborsClassifier(
        algorithm=params["algorithm"],
        metric=params["metric"],
        weights=params["weights"],
        n_neighbors=k,
        n_jobs=1,
    )
    kNN.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - t0

    # testing
    t0 = time.perf_counter()
    infinity = float("Inf")
    distance_weighted = kNN.get_params()["weights"] == "distance"
    predictLabels = []
    neighborDist, neighborInd = kNN.kneighbors(testData)
    for distances, indices in zip(neighborDist, neighborInd):
        phi, psi = 0, 0
        for distance, neighbor in zip(distances, indices):
            if distance_weighted:
                dInv = (1 / distance) if distance != 0 else infinity
            else:
                dInv = 1
            if trainLabels[neighbor] == 1:
                phi += dInv
            else:
                psi += dInv

        # handle limit cases for prediction
        if psi == infinity:
            # Also covers phi and psi both infinite.
            prediction = 0
        elif phi == infinity:
            prediction = 1
        elif (phi + psi) == 0:
            prediction = 0
        else:
            prediction = 1 if phi / (phi + psi) >= sigma else 0
        predictLabels.append(prediction)
    testTime = time.perf_counter() - t0

    return trainTime, testTime, predictLabels
class myKnn():
    """Small timing wrapper around a KNeighborsClassifier.

    Holds one train/test split, builds the classifier in ``setK`` and records
    how long fitting and predicting take. ``setK`` has to be called before
    ``train``, ``test`` or ``getParams``, because ``clf`` starts as ``None``.
    """

    def __init__(self, train_data, train_label, test_data, test_label):
        """Store the data split; no classifier is created yet."""
        self.train_data, self.train_label = train_data, train_label
        self.test_data, self.test_label = test_data, test_label
        self.predict_label = None
        self.train_time = 0
        self.test_time = 0
        self.clf = None

    def setK(self, k = 1):
        """Create a fresh, unfitted classifier using ``k`` neighbours."""
        self.clf = KNeighborsClassifier(n_neighbors = k)

    def train(self):
        """Fit on the training split; return the elapsed seconds."""
        print("Start train")
        started = time.time()
        self.clf.fit(self.train_data, self.train_label)
        elapsed = time.time() - started
        print("End train", elapsed)
        self.train_time = elapsed
        return self.train_time

    def test(self):
        """Predict the test split; return ``(test_label, elapsed seconds)``."""
        print("Start test")
        started = time.time()
        self.predict_label = self.clf.predict(self.test_data)
        elapsed = time.time() - started
        print("End test", elapsed)
        self.test_time = elapsed
        return self.test_label, self.test_time

    def getTestLabel(self):
        """Return the true test labels given at construction."""
        return self.test_label

    def getPredictLabel(self):
        """Return the labels predicted by the last ``test`` call, or None."""
        return self.predict_label

    def getTrainTime(self):
        """Return the duration of the last ``train`` call (0 before any)."""
        return self.train_time

    def getTestTime(self):
        """Return the duration of the last ``test`` call (0 before any)."""
        return self.test_time

    def getParams(self):
        """Return ``get_params()`` of the current classifier."""
        return self.clf.get_params()
def knn_classify(df, dep_var, n_neighbors, metric, predictors=None,
                 threshold=.5, temporal=False, start_col=None, end_col=None,
                 start_date=None, end_date=None):
    '''
    Create a nearest neighbor model using sklearn.

    Requires a pandas dataframe, the number of neighbors, the distance metric
    and the dependent variable. If no predictors are given, every column
    except the dependent variable is used.

    Training and testing data come either from ``temporal_split`` (when
    ``temporal`` is true) or from sklearn's default ``train_test_split``.
    The non-temporal path used to skip the split and failed with NameError on
    ``x_train``.

    Returns a tuple ``(y_test, y_predict, y_scores, params)``: the true test
    labels, the 0/1 predictions obtained by comparing the second probability
    column with ``threshold``, the raw ``predict_proba`` output and the
    classifier parameters.
    '''
    if temporal:
        x, x_train, x_test, y_train, y_test = temporal_split(
            df, start_col, end_col, start_date, end_date, dep_var, predictors)
    else:
        # Local import: the top-of-file imports are not visible from here.
        from sklearn.model_selection import train_test_split

        y = df[dep_var]
        if not predictors:
            x = df.drop(dep_var, axis=1)
        else:
            x = df[predictors]
        x_train, x_test, y_train, y_test = train_test_split(x, y)

    knn = KNeighborsClassifier(n_neighbors, metric=metric)
    knn.fit(x_train, y_train)

    y_scores = knn.predict_proba(x_test)
    # Column 1 is the probability of the second class in knn.classes_.
    y_predict = [1 if row[1] > threshold else 0 for row in y_scores]
    params = knn.get_params()

    return (y_test, y_predict, y_scores, params)
def knn(X, y, n_neighbors=None):
    """Fit a K-Nearest Neighbors classifier and collect evaluation metrics.

    The data is split with ``train_test_split`` defaults (75% train, 25%
    test). Log-loss, ROC curve and AUC use the predicted probabilities;
    accuracy, precision and recall use the hard 0/1 predictions.

    :param X: feature matrix.
    :param y: binary target.
    :param n_neighbors: number of neighbours. ``None`` (the default) now
        means "use scikit-learn's own default"; it used to be passed through
        to the classifier, which rejects ``None``.
    :return: ``(metrics_str, results)``: a one-line summary and a dict with
        the test split, predictions, probabilities, ROC data, log-loss, AUC
        and the classifier parameters.
    """
    # Split data into training set and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Instantiate K-Nearest Neighbors
    if n_neighbors is None:
        modl = KNeighborsClassifier()
    else:
        modl = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Fit data
    modl.fit(X_train, y_train)

    soft_yes = modl.predict_proba(X_test)
    hard_yes = modl.predict(X_test)

    # logloss and others require the probabilities that Yes or 1 is predicted
    logl = metrics.log_loss(y_test, soft_yes)
    fpr, tpr, _ = metrics.roc_curve(y_test, soft_yes[:, 1])
    auc = metrics.roc_auc_score(y_test, soft_yes[:, 1])

    # Precision and accuracy require y-predictions as (0, 1)
    accuracy = metrics.accuracy_score(y_test, hard_yes)
    precision = metrics.precision_score(y_test, hard_yes)
    recall = metrics.recall_score(y_test, hard_yes)

    metrics_str = f'KNN: Accuracy: {accuracy:.4f}. Precision: {precision:.4f}. Recall: {recall:.4f}. Log-loss: {logl:.4f}. AUC: {auc:.4f}'

    return metrics_str, {
        'Model': 'K-Nearest Neighbors',
        'X_test': X_test,  # For plotting at the end
        'y_test': y_test,  # For plotting at the end
        'hard_predictions': hard_yes,
        'prediction probs': soft_yes,
        'false pos rate': fpr,
        'true pos rate': tpr,
        'logloss': logl,
        'area under curve': auc,
        # Parameters the classifier was actually built with
        'parameters': modl.get_params(),
    }
dat = df[:, 0:64] tar1 = df[:, 64] X = dat y = tar1 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=20) clf = KNeighborsClassifier() s1 = time.time() clf.fit(X_train, y_train) e1 = time.time() t1 = e1 - s1 y_test = np.array(y_test) print(clf.get_params()) print("Training time: ", t1) plot_learning_curve(clf, 'Learning Curve for K-nn', X_train, y_train, (0, 1.01), cv=5) # plot_tree(clf.fit(X_train, y_train),filled=True) # plt.show() clf1 = KNeighborsClassifier() plot_validation_curve(X_train, y_train, clf1, 'k1') clf3 = KNeighborsClassifier() clf3 = GridSearchCV(estimator=clf3, param_grid={
def _classification():
    """Run nested cross-validated KNN classification per feature set.

    Reads the module-level ``data_dict`` (feature-set name -> feature
    matrix), ``features_ml`` (needs 'Status' and, for leave-one-subject-out,
    'Subjects'), ``kfold`` ('Stratified' or 'LeaveOneSubjectOut') and
    ``path_csv`` (only printed).

    For every feature set and every outer fold:
      * features are min-max scaled (fitted on the training split only);
      * a grid search over ``n_neighbors`` 10..49, both weightings and three
        metrics selects the hyper-parameters on the training split;
      * a KNN with those parameters is refitted and scored on the test split.

    Fixes compared with the previous version:
      * the strategy was compared with ``is`` against string literals, which
        tests identity, not equality; it now uses ``==``;
      * the metric 'minskowski' was misspelled, so a third of the grid could
        never be fitted; it is now 'minkowski';
      * an unknown strategy raises ``ValueError`` instead of failing later on
        an unbound local;
      * duplicated and never-read accumulators were removed.

    :return: DataFrame indexed by feature-set name with accuracy and macro-F1
        statistics (mean, sd, se, min, max), the best parameters and the
        per-fold accuracy, parameters, F1, precision and recall.
    :raises ValueError: if ``kfold`` is not a supported strategy.
    """
    accuracy_mean_list, accuracy_min_list, accuracy_max_list = [], [], []
    accuracy_sd_list, accuracy_se_list = [], []
    f1_mean_list, f1_min_list, f1_max_list = [], [], []
    f1_sd_list, f1_se_list = [], []

    best_params_list = []
    kfold_accuracy_for_df = []
    params_for_df = []
    kfold_f1_for_csv, kfold_precision_for_csv, kfold_recall_for_csv = [], [], []

    feature_columns = data_dict.keys()
    scaler = MinMaxScaler()

    for feature_column in feature_columns:
        X = data_dict[feature_column]
        y = features_ml['Status'].values

        kfold_accuracy_list = []
        kfold_f1_macro_list, kfold_f1_list = [], []
        kfold_precision_list, kfold_recall_list = [], []
        params_list = []
        # Per-feature-set containers for the per-class fold metrics.
        kfold_f1_for_df, kfold_precision_for_df, kfold_recall_for_df = [], [], []

        print ('\n')
        print (path_csv)
        print ('K-fold : ', kfold)
        print ('Features : ', feature_column)
        print ('\n')

        subjects = None
        if kfold == 'Stratified':
            k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
            k_fold_split = k_fold.split(X, y)
        elif kfold == 'LeaveOneSubjectOut':
            subjects = features_ml['Subjects'].values
            k_fold = LeaveOneGroupOut()
            k_fold_split = k_fold.split(X, y, subjects)
        else:
            raise ValueError('Unsupported kfold strategy: %r' % (kfold,))

        for train_index, test_index in k_fold_split:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit the scaler on the training split only, then reuse it.
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Inner grid search, run on the training split only.
            KNN = KNeighborsClassifier()
            print (KNN)
            param_grid = {
                'n_neighbors': np.arange(10, 50),
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean', 'manhattan', 'minkowski'],
            }
            gridsearch = GridSearchCV(KNN, param_grid, cv=k_fold, verbose=1, n_jobs=-1)
            # Only the leave-one-subject-out splitter needs group labels.
            groups = None if kfold == 'Stratified' else subjects[train_index]
            gridsearch.fit(X_train, y_train, groups=groups)
            optimal_params = gridsearch.best_params_

            # Refit with the selected parameters and score the outer fold.
            knn = KNeighborsClassifier(**optimal_params)
            print (knn.get_params())
            knn.fit(X_train, y_train)

            knn_accuracy = knn.score(X_test, y_test)
            y_pred = knn.predict(X_test)
            f1_ = f1_score(y_test, y_pred, average=None)
            f1_macro = f1_score(y_test, y_pred, average='macro')
            precision_ = precision_score(y_test, y_pred, average=None)
            recall_ = recall_score(y_test, y_pred, average=None)

            kfold_accuracy_list.append(knn_accuracy)
            kfold_f1_list.append(f1_)
            kfold_f1_macro_list.append(f1_macro)
            kfold_precision_list.append(precision_)
            kfold_recall_list.append(recall_)
            params_list.append(optimal_params)

        index_best_accuracy = kfold_accuracy_list.index(max(kfold_accuracy_list))
        index_worst_accuracy = kfold_accuracy_list.index(min(kfold_accuracy_list))
        index_best_f1 = kfold_f1_macro_list.index(max(kfold_f1_macro_list))
        index_worst_f1 = kfold_f1_macro_list.index(min(kfold_f1_macro_list))

        # Means, minima and maxima are percentages; sd and se stay fractions.
        accuracy_mean_list.append(np.round(np.mean(kfold_accuracy_list)*100, decimals=2))
        accuracy_sd_list.append(np.round(np.std(kfold_accuracy_list), decimals=2))
        accuracy_se_list.append(np.round(np.std(kfold_accuracy_list)/np.sqrt(len(kfold_accuracy_list)), decimals=2))
        accuracy_min_list.append(np.round(kfold_accuracy_list[index_worst_accuracy]*100, decimals=2))
        accuracy_max_list.append(np.round(kfold_accuracy_list[index_best_accuracy]*100, decimals=2))

        f1_mean_list.append(np.round(np.mean(kfold_f1_macro_list)*100, decimals=2))
        f1_sd_list.append(np.round(np.std(kfold_f1_macro_list), decimals=2))
        f1_se_list.append(np.round(np.std(kfold_f1_macro_list)/np.sqrt(len(kfold_f1_list)), decimals=2))
        f1_min_list.append(np.round(kfold_f1_macro_list[index_worst_f1]*100, decimals=2))
        f1_max_list.append(np.round(kfold_f1_macro_list[index_best_f1]*100, decimals=2))

        best_params_list.append(params_list[index_best_accuracy])
        params_for_df.append([params_list])
        kfold_accuracy_for_df.append(kfold_accuracy_list)

        for i in range(len(kfold_f1_list)):
            kfold_f1_for_df.append(list(kfold_f1_list[i]))
            kfold_precision_for_df.append([list(kfold_precision_list[i])])
            kfold_recall_for_df.append([list(kfold_recall_list[i])])
        print (f'{len(kfold_f1_for_df)}')

        kfold_f1_for_csv.append([kfold_f1_for_df])
        kfold_precision_for_csv.append([kfold_precision_for_df])
        kfold_recall_for_csv.append([kfold_recall_for_df])
        print (f'{len(kfold_f1_for_df)}')

        print ('\n')
        print ('Average of accuracy : %.2f (+/- %.2f)' % (np.mean(kfold_accuracy_list)*100, np.std(kfold_accuracy_list)))
        print ('Highest accuracy : %.2f \n' % (kfold_accuracy_list[index_best_accuracy]*100))
        print ('Average of F1-score : %.2f (+/- %.2f)' % (np.mean(kfold_f1_macro_list)*100, np.std(kfold_f1_macro_list)))
        print ('Highest F1-score : %.2f \n' % (kfold_f1_macro_list[index_best_f1]*100))
        print ('Best parameters : %s' % (params_list[index_best_accuracy]))
        print ('\n-------------------------------------------------------------\n')

    df_columns_1 = [kfold + '_acc_mean', kfold + '_acc_sd', kfold + '_acc_se',
                    kfold + '_acc_min', kfold + '_acc_max',
                    kfold + '_f1_mean', kfold + '_f1_sd', kfold + '_f1_se',
                    kfold + '_f1_min', kfold + '_f1_max',
                    kfold + '_best_params',
                    kfold + '_fold_accuracy', kfold + '_fold_best_params',
                    kfold + '_fold_f1', kfold + '_fold_precision',
                    kfold + '_fold_recall']

    print ('Writing dataframe ...')
    # One row per statistic, transposed so each feature set becomes a row.
    result_df_1 = pd.DataFrame(np.array([accuracy_mean_list, accuracy_sd_list,
                                         accuracy_se_list, accuracy_min_list,
                                         accuracy_max_list,
                                         f1_mean_list, f1_sd_list, f1_se_list,
                                         f1_min_list, f1_max_list,
                                         best_params_list,
                                         kfold_accuracy_for_df, params_for_df,
                                         kfold_f1_for_csv,
                                         kfold_precision_for_csv,
                                         kfold_recall_for_csv
                                         ], dtype=object).T,
                               columns=df_columns_1,
                               index=list(feature_columns))
    return result_df_1
print(iris_X[:2]) ## 顯示前2筆 print(iris_y) print(np.unique(iris.target)) ## 重複的值不顯示 X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3) ## 將iris_X和iris_y這2個數據集分成訓練集和測試集,其中測試集佔數據集的30% print(X_train[:2]) ## 顯示屬性的訓練集(前2筆) print(X_test[:2]) ## 顯示屬性的測試集(前2筆) print(y_train) ## 顯示分類的訓練集 print(y_test) ## 顯示分類的測試集 knn = KNeighborsClassifier() print(knn.get_params()) ## 取出之前定義的參數 knn.fit(X_train, y_train) ## 訓練模型 print(knn.predict(X_test)) ## 預測測試集的數據 print(y_test) ## 真實值 ## 2D圖 x_min, x_max = iris_X[:, 0].min() - .5, iris_X[:, 0].max() + .5 y_min, y_max = iris_X[:, 1].min() - .5, iris_X[:, 1].max() + .5 plt.figure(2, figsize=(10, 8)) # plt.clf() # Plot the training points plt.scatter(iris_X[:, 0], iris_X[:, 1], c=iris_y, cmap=plt.cm.Paired) plt.xlabel('Sepal length')
# Python 2 script: predict 'cholhighyr' from 'ytqyogyr' and 'vitany' with a
# 4-neighbour KNN, print a report and export the predictors to CSV.
# `data` is defined before this fragment.
current_data = data
# Keep only the rows where all three columns are coded 1 or 2.
for col in ['cholhighyr','ytqyogyr','vitany']:
    current_data = current_data[current_data[col].isin([1,2])] # only want values 1 and 2
# set up training/test set
X = current_data[['ytqyogyr','vitany']]
Y = current_data[['cholhighyr']]
# 15% of the rows are held out for testing; the seed makes the split repeatable.
X_train, X_test,Y_train, Y_test = train_test_split(X,Y, test_size=.15, random_state=5)
# KNN model (accuracy ~0.63, recall values 0.28/0.73
neigh_best = KNeighborsClassifier(n_neighbors=4)
neigh_best.fit(X_train, Y_train)
y_predicted = neigh_best.predict(X_test)
print classification_report(Y_test, y_predicted)
print neigh_best.get_params()
# Row counts per code for the filtered data; the labels below map code 1 to
# "no" and code 2 to "yes".
print '\nALL DATA:'
print 'data entries:', len(current_data)
print 'did yoga in the past year - no', len(X[X['ytqyogyr']==1])
print 'did yoga in the past year - yes', len(X[X['ytqyogyr']==2])
print 'took vitamins/supplements in the past year - no', len(X[X['vitany']==1])
print 'took vitamins/supplements in the past year - yes', len(X[X['vitany']==2])
print 'high cholesterol - no:', len(Y[Y['cholhighyr']==1])
print 'high cholesterol - yes:', len(Y[Y['cholhighyr']==2])
# NOTE(review): X_new is the same object as X (no copy is made), so the two
# subtractions below also modify X.
X_new = X
# Shift the 1/2 codes to 0/1 before exporting.
X_new['ytqyogyr'] = X_new['ytqyogyr']-1
X_new['vitany'] = X_new['vitany']-1
X_new.to_csv('vizdata.csv', index=False,header = ['yoga','vitamins'])
trainingSet = np.vstack((trainingSetEllipticals, trainingSetSpirals)) #using only elliptical and spiral for training np.random.shuffle(trainingSet) trainingSetLabels = trainingSet[:,12] #putting labels in separate array trainingSetLabels[trainingSetLabels == 0] = -1 #replacing all 0 with -1 to match sklearn format trainingSet = trainingSet[:, 1:11] #removing label cols from actual inputs trainingSet, testingSet, trainingSetLabels, testingSetLabels = train_test_split(trainingSet, trainingSetLabels, test_size = 0.6, random_state = 0) #fixes random_state so results reproducible startTime = time.time() print "Time before training = ", startTime clf = KNeighborsClassifier(n_neighbors = 5) #starting off with 5 neighbors for now clf = clf.fit(trainingSet, trainingSetLabels) print "Params after training:" print clf.get_params() trainingAccuracy = clf.score(trainingSet, trainingSetLabels) print "Training accuracy = ", trainingAccuracy testingAccuracy = clf.score(testingSet, testingSetLabels) print "Testing accuracy = ", testingAccuracy print "Done training and testing! Time = ", time.time() - startTime, "seconds"
def test_KNN(fn):
    """
    Tune and test a K-Nearest Neighbors model.

    Reads the ';'-separated files "input/{fn}_train_X.csv",
    "input/{fn}_train_y.csv", "input/{fn}_test_X.csv" and
    "input/{fn}_test_y.csv", tries K = 5, 10, ..., 95 and keeps the model
    with the highest accuracy on the test set. It then prints the scores,
    plots a confusion matrix (saved under "figures/") and writes a
    performance report to "reports/KNN_{fn}_report.txt".

    Arguments:
        - fn : Name of the input file (prefix shared by the four CSV files).
    """
    #Load datasets
    X_train_df = pd.read_csv("input/{}_train_X.csv".format(fn), sep=";")
    y_train_df = pd.read_csv("input/{}_train_y.csv".format(fn), sep=";")
    X_test_df = pd.read_csv("input/{}_test_X.csv".format(fn), sep=";")
    y_test_df = pd.read_csv("input/{}_test_y.csv".format(fn), sep=";")

    #Convert to numpy arrays (labels flattened to 1-D once, up front)
    X_train = X_train_df.values.astype(float)
    y_train = y_train_df.values.ravel()
    X_test = X_test_df.values.astype(float)
    y_test = y_test_df.values.ravel()

    #Scale X values: the scaler is fitted on the training data only and the
    #same statistics are applied to the test data. Re-fitting on the test set
    #would put the two sets on different scales and leak test information.
    scaler = RobustScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    #Transform non-numerical values into numericals. One encoder, fitted once
    #on every label that occurs, so train labels, test labels and the decoded
    #predictions all share the same label <-> integer mapping.
    encoder = LabelEncoder()
    encoder.fit(list(y_train) + list(y_test))
    encoded_y_train = encoder.transform(y_train)
    encoded_y_test = encoder.transform(y_test)

    #Variables to store the best values
    best_model = None
    best_acc = 0.0
    time_taken = 0

    #Test different values for K
    for K in range(5, 100, 5):
        knn = KNeighborsClassifier(n_neighbors=K)

        #Train the model
        start = time.time()
        knn.fit(X_train, encoded_y_train)
        end = time.time()

        #Accuracy of the predicted values
        acc = accuracy_score(encoded_y_test, knn.predict(X_test))
        print("\nK: {}".format(K))
        print("Acc: {}".format(acc))

        #Save model if it is the best one. The first model is always kept so
        #that best_model is a fitted estimator even if every accuracy is 0.
        if best_model is None or acc > best_acc:
            time_taken = end - start
            best_model = knn
            best_acc = acc

    #Predict using the best model
    y_pred = encoder.inverse_transform(best_model.predict(X_test))
    K = best_model.get_params()['n_neighbors']

    #Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(cm)
    print("\n")
    print(report)
    print("Scores for final, best model:\n")
    print("\nK: {}".format(K))
    print("Acc: {}".format(accuracy_score(y_test, y_pred)))

    #Find labels
    labels = sorted(y_test_df.iloc[:, 0].unique())

    #Plot confusion matrix
    plot_confusion_matrix(cm, labels, False)

    #Save the plot
    plt.savefig("figures/KNN_confusion_matrix_{}.svg".format(int(time.time())))
    #plt.show()

    #Write a .txt report file (the with-block closes the file)
    with open("reports/KNN_{}_report.txt".format(fn), "w") as f:
        f.write("REPORT FOR \"{}\"\n\n".format(fn))
        f.write("Best value for K: {}".format(K))
        f.write("\n\n\nClassification Report:\n")
        f.write(report)
        f.write("\nConfusion Matrix:\n\n")
        f.write(np.array2string(cm, separator=', '))
        f.write(
            "\n\nTime used to train the model: {} seconds".format(time_taken))
        f.write("\n\nScores for final, best model:\n")
        f.write("Accuracy: {}".format(best_acc))
len(Trees_RF))] = Trees_RF accuracy_paras[start_idx:(start_idx + len(Trees_RF))] = Accuracy_RF values_optimal[3] = Trees_RF_optimal accuracy_optimal[3] = Accuracy_RF.max() else: ##### USE DEFAULT PARAS --> NO OPTIMAL ##### clf = classifiers[idx_method] clf.fit(X_train, Y_train) if (method[idx_method] == 'SVM'): bK = clf.intercept_[0] XK = clf.support_vectors_ YK = Y_train[clf.support_] BetaK = clf.dual_coef_ / YK BetaK = BetaK[0] GammaK = clf.get_params(deep=False)['gamma'] if (g < G_Points.shape[0]): if (np.mod(g + 1, 50) == 0 or g == 0): print('Generating Point: ' + str(g)) break_flag = 1 # flag for solving boundary function (1 -- solve fail --> 0 solve successful) # Find active points while break_flag: # ---------- RANDOM POINT ---------- if (Shape == '2D_Circle'): initial_guess = RS.Random_2D_Circle( [np.sqrt(0.1 * R), np.sqrt(1.9 * R)], [0, 2 * math.pi],
def classifyPHC(): data = readFile() #data = equalizeClasses(data) features, labels = splitData(data) #determine the training and testing size in the range of 1, 1 = 100% validation_size = 0.2 #here we are splitting our data based on the validation_size into training and testing data features_train, features_validation, labels_train, labels_validation = model_selection.train_test_split( features, labels, test_size=validation_size) #normalize data in the range [-1,1] scaler = MinMaxScaler(feature_range=(-1, 1)) #fit only th training data in order to find the margin and then test to data without normalize them scaler.fit(features_train) features_train_scalar = scaler.transform(features_train) #trnasform the validation features without fitting them features_validation_scalar = scaler.transform(features_validation) #determine the pca, and determine the dimension you want to end up pca = KernelPCA(n_components=6, kernel='rbf', fit_inverse_transform=True) #fit only the features train pca.fit(features_train_scalar) #dimensionality reduction of features train features_train_pca = pca.transform(features_train_scalar) #dimensionality reduction of fatures validation features_validation_pca = pca.transform(features_validation_scalar) #reconstruct data training error reconstruct_data = pca.inverse_transform(features_train_pca) error_percentage = ( sum(sum(error_matrix)) / (len(features_train_scalar) * len(features_train_scalar[0]))) * 100 #len(features_train_scalar) = len(reconstruct_data) = 89 #len(features_train_scalar[0]) = len(reconstruct_data[0]) = 13 #len(error_matrix) = 89, which means for all the samples #len(error_matrix[0]) = 13, for every feature of every sample #we take the sum and we conlcude in an array which has the sum for every feature (error) #so we take the sum again and we divide it with the 89 samples * 13 features print 'Information loss of KernelPCA:', error_percentage, '% \n' lda = LinearDiscriminantAnalysis() lda.fit(features_train_pca, labels_train) 
features_train_pca = lda.transform(features_train_pca) features_validation_pca = lda.transform(features_validation_pca) #we can see the shapes of the array just to check print 'feature training array: ', features_train.shape, 'and label training array: ', labels_train.shape print 'feature testing array: ', features_validation.shape, 'and label testing array: ', labels_validation.shape, '\n' #take the best couple of parameters from the procedure of greedy search #paramTuning(features_train, labels_train, 5) #we initialize our model #svm = SVC(kernel='poly',C=0.001,gamma=10,degree=3,decision_function_shape='ovr') svm = KNeighborsClassifier(n_neighbors=3) #train our model with the data that we previously precessed svm.fit(features_train_pca, labels_train) #now test our model with the test data predicted_labels = svm.predict(features_validation_pca) accuracy = accuracy_score(labels_validation, predicted_labels) print 'Classification accuracy: ', accuracy * 100, '\n' #see the accuracy in training procedure predicted_labels_train = svm.predict(features_train_pca) accuracy_train = accuracy_score(labels_train, predicted_labels_train) print 'Training accuracy: ', accuracy_train * 100, '\n' #confusion matrix to illustrate the faulty classification of each class conf_matrix = confusion_matrix(labels_validation, predicted_labels) print 'Confusion matrix: \n', conf_matrix, '\n' print 'Support class 0 class 1:' #calculate the support of each class print ' ', conf_matrix[0][0] + conf_matrix[0][ 1], ' ', conf_matrix[1][0] + conf_matrix[1][1], '\n' #calculate the accuracy of each class hC = (conf_matrix[0][0] / (conf_matrix[0][0] + conf_matrix[0][1])) * 100 pC = (conf_matrix[1][1] / (conf_matrix[1][0] + conf_matrix[1][1])) * 100 #see the inside details of the classification print 'For class 0 man cases:', conf_matrix[0][ 0], 'classified correctly and', conf_matrix[0][ 1], 'missclassified,', hC, 'accuracy \n' print 'For class 1 woman cases:', conf_matrix[1][ 1], 'classified 
correctly and', conf_matrix[1][ 0], 'missclassified,', pC, 'accuracy\n' #try 5-fold cross validation scores = cross_val_score(svm, features_train_pca, labels_train, cv=5) print 'cross validation scores for 5-fold', scores, '\n' print 'parameters of the model: \n', svm.get_params(), '\n' #print 'number of samples used as support vectors',len(svm.support_vectors_),'\n' #return svm.support_vectors_ '''#plot the training features before the kpca and the lda procedure
# Confusion-matrix heat map for the earlier predictions (predictions_dtc).
plt.figure(figsize=(7, 7))
_tick_labels = ['No Disease', 'Disease']
_cm_dtc = confusion_matrix(y_test, predictions_dtc)
sns.heatmap(
    _cm_dtc,
    annot=True,
    cmap="Blues",
    square=True,
    xticklabels=_tick_labels,
    yticklabels=_tick_labels,
)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Actual", fontsize=15)
plt.show()

from sklearn.neighbors import KNeighborsClassifier

# 10-nearest-neighbours classifier: inspect parameters, fit, predict, score.
clf_knn = KNeighborsClassifier(n_neighbors=10)
clf_knn.get_params()
clf_knn.fit(X_train, y_train)
predictions_knn = clf_knn.predict(X_test)
accuracy_score(y_test, predictions_knn)

# Same heat map for the KNN predictions.
plt.figure(figsize=(7, 7))
_cm_knn = confusion_matrix(y_test, predictions_knn)
sns.heatmap(
    _cm_knn,
    annot=True,
    cmap="Blues",
    square=True,
    xticklabels=_tick_labels,
    yticklabels=_tick_labels,
)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Actual", fontsize=15)
class KNeighbors(Classifier):
    r"""K-neighbors classifier component.

    Thin adapter that delegates fitting, prediction and parameter handling
    to an internal ``KNC`` instance.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        “Neighbourhood Components Analysis”, J. Goldberger, S. Roweis, G. Hinton, R. Salakhutdinov, Advances in Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'K Neighbors Classifier'

    def __init__(self, **kwargs):
        r"""Create the wrapped classifier and declare its tunable parameters.

        Also silences a fixed set of warning categories process-wide.
        ``kwargs`` is accepted but not used here.
        """
        silenced_categories = (
            ChangedBehaviorWarning,
            ConvergenceWarning,
            DataConversionWarning,
            DataDimensionalityWarning,
            EfficiencyWarning,
            FitFailedWarning,
            NonBLASDotWarning,
            UndefinedMetricWarning,
        )
        for warning_category in silenced_categories:
            warnings.filterwarnings(action='ignore', category=warning_category)

        # Search space: candidate values for each tunable argument.
        self._params = {
            'weights': ParameterDefinition(['uniform', 'distance']),
            'algorithm': ParameterDefinition(
                ['auto', 'ball_tree', 'kd_tree', 'brute']),
        }
        self.__kn_classifier = KNC()

    def set_parameters(self, **kwargs):
        r"""Forward the given keyword arguments to the wrapped classifier.
        """
        self.__kn_classifier.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Train the wrapped classifier.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.

        Returns:
            None
        """
        self.__kn_classifier.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Return the predicted class of every sample (row) in x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        predicted = self.__kn_classifier.predict(x)
        return predicted

    def to_string(self):
        r"""Build a human-readable description of this classifier.

        Returns:
            str: User friendly representation of the object.
        """
        current_params = self.__kn_classifier.get_params()
        template = Classifier.to_string(self)
        return template.format(
            name=self.Name,
            args=self._parameters_to_string(current_params))
# Fit a 1-nearest-neighbour classifier on the first trainingSetLength samples
# and persist both the fitted model and its parameters with pickle.

y = diagnostic[:trainingSetLength, 1:]  # target values (i.e. expected output for X)
for i in range(len(y)):
    y[i] = int(y[i])
y = np.transpose(y).astype('int')
trainingSet = extractedFeatures[:trainingSetLength]

neigh = KNeighborsClassifier(n_neighbors=1)
# y[0] is the (transposed) label row: one class per sample in trainingSet.
neigh.fit(trainingSet, y[0])

# save the params to disk
neigh_params = neigh.get_params()
params_neigh = 'params_neigh.sav'

# save the model to disk; `with` closes the file even if pickling fails
# (the bare open(...) passed to pickle.dump was never closed)
filename_neigh = 'neigh_model.sav'
with open(filename_neigh, 'wb') as model_file:
    pickle.dump(neigh, model_file)
#testSet=extractedFeatures[trainingSetLength:trainingSetLength+10]
#prediction=lda.predict(testSet)
with open(params_neigh, 'wb') as params_file:
    pickle.dump(neigh_params, params_file)

#%%TEST CLASSIFICATION - Logistic Regression
excelAddress = 'C:\\Users\\theor\\Downloads\\Ground_truth_ISIC_1.xlsx'
trainingSetLength = 100
diagnostic = preProcessing(excelAddress)
# 'n_estimators': [190,200,210,240,250], # 'learning_rate': [0.1, 0.01, 0.001, 0.0001] # } # # cross_validation = StratifiedKFold(Y_train, n_folds=5) # grid_search = GridSearchCV(forest, # param_grid=parameter_grid, # cv=cross_validation) # # grid_search.fit(X_train, Y_train) # # print('Best score: {}'.format(grid_search.best_score_)) # print('Best parameters: {}'.format(grid_search.best_params_)) forest = KNeighborsClassifier() print forest.get_params().keys() parameter_grid = { 'n_neighbors' : [2,3,4,5,6,7,8], 'algorithm': ['ball_tree', 'kd_tree', 'auto', 'brute'], 'leaf_size': [10, 20, 30, 40, 50] } cross_validation = StratifiedKFold(Y_train, n_folds=5) grid_search = GridSearchCV(forest, param_grid=parameter_grid, cv=cross_validation) grid_search.fit(X_train, Y_train) print('Best score: {}'.format(grid_search.best_score_))
class KNNCombinedClassifier:
    """Combine the neighbour votes of two k-NN classifiers.

    Two KNeighborsClassifier instances are fitted separately (fit1 / fit2)
    and queried with two different embeddings of the same test documents.
    Their inverse-distance label weights are merged, with the second
    classifier's contribution scaled by ``alpha``.
    """

    def __init__(self, **kwargs):
        """Create both classifiers with identical settings.

        ``kwargs`` is forwarded to KNeighborsClassifier; ``n_jobs`` is fixed
        to 1 here, so passing ``n_jobs`` in kwargs raises a TypeError
        (duplicate keyword argument).
        """
        self._classifier1 = KNeighborsClassifier(n_jobs=1, **kwargs)
        self._classifier2 = KNeighborsClassifier(n_jobs=1, **kwargs)
        # Number of neighbours actually configured (explicit or default).
        self._nNN = self._classifier1.get_params()['n_neighbors']
        self._official_labels = None
        # Fitted flags; they are set by fit1/fit2 but not read in this class.
        self._fit1 = False
        self._fit2 = False

    def fit1(self, official_embeddings, official_labels):
        """Fit the first classifier and remember the labels.

        NOTE(review): _official_labels is later indexed with neighbour index
        arrays, so it presumably must be a numpy array -- confirm at callers.
        """
        self._classifier1.fit(official_embeddings, official_labels)
        self._official_labels = official_labels
        self._fit1 = True

    def fit2(self, official_embeddings, official_labels):
        """Fit the second classifier and remember the labels.

        Overwrites the labels stored by fit1; both calls are presumably meant
        to receive the same label array -- confirm at callers.
        """
        self._classifier2.fit(official_embeddings, official_labels)
        self._official_labels = official_labels
        self._fit2 = True

    def get_recurring_indices(self, ind1, ind2):
        """Return, per row, the indices that occur more than once in ind1[i] + ind2[i].

        ind1 and ind2 are per-document neighbour index arrays; the result is a
        list (one entry per row) of lists of recurring indices.
        """
        recurring_inds = []
        ninds = ind1.shape[0]
        for i in xrange(ninds):
            combined_indices = np.hstack((ind1[i], ind2[i]))
            unique_indices = np.unique(combined_indices)
            nunique_indices = unique_indices.shape[0]
            # The comprehension reuses the name `i`. Under Python 2 (xrange)
            # that variable leaks and overwrites the outer `i`, which is
            # harmless because the outer `i` is not read again this iteration.
            recurring_inds.append([
                unique_indices[i] for i in xrange(nunique_indices)
                if np.sum(combined_indices == unique_indices[i]) > 1
            ])
        return recurring_inds

    def predict_combined(self,
                         test_embeddings1,
                         test_embeddings2,
                         alpha=0.2,
                         pbar=False):
        """Predict ranked codes for every test document from both classifiers.

        test_embeddings1 / test_embeddings2 are the query matrices for the
        first / second classifier; rows are processed in chunks of 2000.
        ``alpha`` scales the second classifier's weights; ``pbar`` enables a
        progress_bar call per chunk.

        Returns a list [predicted_codes, prediction_weights,
        potential_full_codes]: two (ntest_docs, n_neighbors) arrays and one
        list with the recurring neighbour indices per document.
        """
        chunk_size = 2000
        ntest_docs = test_embeddings1.shape[0]
        predicted_codes = np.empty((ntest_docs, self._nNN), dtype=int)
        prediction_weights = np.empty((ntest_docs, self._nNN))
        potential_full_codes = []
        for start_index in xrange(0, ntest_docs, chunk_size):
            if pbar:
                progress_bar(start_index, ntest_docs)
            # Clamp the last chunk to the number of documents.
            stop_index = start_index + chunk_size
            if stop_index > ntest_docs:
                stop_index = ntest_docs
            NN_dists1, NN_indices1 = self._classifier1.kneighbors(
                test_embeddings1[start_index:stop_index], return_distance=True)
            NN_dists2, NN_indices2 = self._classifier2.kneighbors(
                test_embeddings2[start_index:stop_index], return_distance=True)
            probs1, class1 = self.get_assignment_probs(NN_dists1, NN_indices1)
            probs2, class2 = self.get_assignment_probs(NN_dists2, NN_indices2)
            predicted_codes[start_index:stop_index], prediction_weights[
                start_index:stop_index] = self.predict_from_weights(
                    probs1, probs2, class1, class2, alpha)
            potential_full_codes += self.get_recurring_indices(
                NN_indices1, NN_indices2)
        return [predicted_codes, prediction_weights, potential_full_codes]

    def get_assignment_probs(self, dists, indices):
        """Turn neighbour distances into per-label weights, best label first.

        For each row, every distinct label among the neighbours gets the sum
        of 1/distance over the neighbours carrying it. Returns
        (probs_total, classes), both (ndocs, nNN) float arrays; unused
        trailing slots stay 0.
        """
        ndocs, nNN = indices.shape
        probs_total = np.zeros((ndocs, nNN))
        classes = np.zeros((ndocs, nNN))
        for i in xrange(ndocs):
            pred_codes = self._official_labels[indices[i]]
            unique_codes = np.unique(pred_codes)
            nunique_pred_codes = unique_codes.shape[0]
            probs = np.zeros(nunique_pred_codes)
            for j in xrange(nunique_pred_codes):
                probs[j] = np.sum(
                    1 / dists[i, np.where(pred_codes == unique_codes[j])])
            # A zero distance gives an infinite weight: labels with an exact
            # match get weight 1 and every other label gets 0.
            if np.any(np.isinf(probs)):
                inf_index = np.where(probs == np.inf)
                probs[:] = 0
                probs[inf_index] = 1
            # Label positions ordered by descending weight.
            sorted_probs = np.argsort(probs)[::-1]
            stop_index = nNN
            if nunique_pred_codes < nNN:
                stop_index = nunique_pred_codes
            probs_total[i, :stop_index] = probs[sorted_probs[:stop_index]]
            classes[i, :stop_index] = unique_codes[sorted_probs[:stop_index]]
        return probs_total, classes

    def predict_from_weights(self, probs1, probs2, class1, class2, alpha):
        """Merge both classifiers' label weights into ranked predictions.

        The combined weight of a label is its weight in probs1 plus
        ``alpha`` times its weight in probs2. Per row, the top labels (at
        most nNN) are stored in descending weight order and the weights are
        normalised to sum to 1. Returns [predictions, sorted_weights].
        """
        ndocs, nNN = probs1.shape
        predictions = np.zeros((ndocs, nNN), dtype=int)
        sorted_weights = np.zeros((ndocs, nNN))
        for i in xrange(ndocs):
            unique_classes = np.unique(np.hstack((class1[i], class2[i])))
            nunique_classes = unique_classes.shape[0]
            combined_probs = np.zeros(nunique_classes)
            for j in xrange(nunique_classes):
                combined_probs[j] += np.sum(
                    probs1[i][np.where(class1[i] == unique_classes[j])])
                combined_probs[j] += alpha * np.sum(
                    probs2[i][np.where(class2[i] == unique_classes[j])])
            stop_index = nunique_classes
            if nunique_classes > nNN:
                stop_index = nNN
            sorted_probs = np.argsort(combined_probs)[::-1]
            sorted_weights[i, :stop_index] = combined_probs[
                sorted_probs[:stop_index]]
            # NOTE(review): divides by the row sum with no guard for a zero sum.
            sorted_weights[i] /= np.sum(sorted_weights[i])
            predictions[i, :stop_index] = unique_classes[
                sorted_probs[:stop_index]]
        return [predictions, sorted_weights]
# In[6]:

# Standardise the features. The scaler learns mean/variance from the training
# set only; the test set is transformed with those same statistics.
# (Calling fit_transform on X_test re-fitted the scaler on the test data, so
# train and test ended up on different scales.)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# ### Make Model

# In[7]:

# 11-nearest-neighbours classifier with Euclidean distance.
clf = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean')
clf.fit(X_train, y_train)
clf.get_params()

# ### Predict Test Set with that Model

# In[8]:

y_pred = clf.predict(X_test)
y_pred

# ### Check Accuracy

# In[9]:
# print(iris)
# Features and targets of the iris data set.
iris_X, iris_y = iris.data, iris.target
print(len(iris_X))
print(iris_y)

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=0.3,
)
print(y_train, y_test, sep="\n")

# Default k-nearest-neighbours classifier: fit, show parameters and accuracy.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
params = knn.get_params()
score = knn.score(X_test, y_test)
print(params, score, sep="\n")

# Predictions next to the true test labels.
y_predict = knn.predict(X_test)
labels = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
print(y_predict, y_test, sep="\n")

# Confusion matrix, printed and drawn as an image with class names on both axes.
mcm = confusion_matrix(y_test, y_predict)
# mcm=multilabel_confusion_matrix(y_test,y_predict,label=labels)
print(mcm)
plt.imshow(mcm, cmap=plt.cm.Blues)
indices = range(len(mcm))
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(indices, labels)
print_score(knn, X_train, y_train, X_test, y_test, train=False) # # Grid Search # In[32]: from sklearn.model_selection import GridSearchCV # In[33]: knn.get_params() # In[33]: params = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]} # In[34]: . grid_search_cv = GridSearchCV(KNeighborsClassifier(),
from sklearn.neighbors import KNeighborsClassifier

# Train a default KNN classifier several times, recording the training time
# and test accuracy of each run, then report the per-run values and averages.
n_runs = 10  # number of repeated trainings; everything below follows this value
tt = []  # training time of each run, in seconds
aa = []  # test accuracy of each run
for i in range(n_runs):
    t1 = time.time()
    clf_knn = KNeighborsClassifier()
    clf_knn.fit(x_train, y_train)
    t2 = time.time()
    accuracy_knn = clf_knn.score(x_test, y_test)
    t = t2 - t1
    tt.append(t)
    aa.append(accuracy_knn)

for k, (run_time, run_accuracy) in enumerate(zip(tt, aa)):
    print("the time of", k + 1, "th training is", run_time)
    print("The accuracy of ", k + 1, "th training is ", run_accuracy, "\n")

# Averages over all runs (previously hand-written ten-term sums divided by 10,
# which silently broke whenever the number of runs changed).
tt_ = sum(tt) / len(tt)
aa_ = sum(aa) / len(aa)
print("the time of training averagely is ", tt_, "\n",
      "the accuracy of training averagely is ", aa_)

# Parameters of the classifier from the last run.
paras_knn = clf_knn.get_params()
print(paras_knn)