def radiusNeighborClassifier():
    maximumValue = 0
    returnParameters = ['0', '0']
    # Sweep the radius from 100 to 1000 and compare 'uniform' vs 'distance' weighting
    # on the validation set. (range replaces the Python-2-only xrange.)
    for neighbor in range(100, 1001, 100):
        neighAutoRadius = RadiusNeighborsClassifier(radius=neighbor, weights='uniform',
                                                    algorithm='auto', p=2, metric='minkowski')
        neighAutoRadius.fit(trainData, trainLabel)
        neighDistanceRadius = RadiusNeighborsClassifier(radius=neighbor, weights='distance',
                                                        algorithm='auto', p=2, metric='minkowski')
        neighDistanceRadius.fit(trainData, trainLabel)
        scoreAuto = neighAutoRadius.score(validationData, validationLabel)
        scoreDistance = neighDistanceRadius.score(validationData, validationLabel)
        if max(scoreAuto, scoreDistance) > maximumValue:
            maximumValue = max(scoreAuto, scoreDistance)
            returnParameters[0] = str(neighbor)
            returnParameters[1] = 'distance' if scoreDistance > scoreAuto else 'uniform'
    # Retrain with the best radius/weights found on the validation set; score on test data.
    neighTest = RadiusNeighborsClassifier(radius=int(returnParameters[0]),
                                          weights=returnParameters[1],
                                          algorithm='auto', p=2, metric='minkowski')
    neighTest.fit(trainData, trainLabel)
    scoreTest = neighTest.score(testData, testLabel)
    guideToGraph['Radius Neighbor'] = scoreTest
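# The sweep above can be written more compactly with scikit-learn's ParameterGrid.
# A minimal sketch, assuming the same trainData/trainLabel and
# validationData/validationLabel variables as radiusNeighborClassifier();
# not part of the original code.
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import RadiusNeighborsClassifier

param_grid = ParameterGrid({'radius': list(range(100, 1001, 100)),
                            'weights': ['uniform', 'distance']})
best_score, best_params = 0.0, None
for params in param_grid:
    model = RadiusNeighborsClassifier(algorithm='auto', p=2, metric='minkowski', **params)
    model.fit(trainData, trainLabel)
    score = model.score(validationData, validationLabel)
    if score > best_score:
        best_score, best_params = score, params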
def rnn_model(train_input, train_target, test_input, test_target):
    r_neigh = RadiusNeighborsClassifier(radius=3.0)
    r_neigh.fit(train_input, train_target)
    # The printed labels now match the radius actually used.
    print("R-NN (r=3) accuracy for training set: %s" % (r_neigh.score(train_input, train_target)))
    print("R-NN (r=3) accuracy for testing set: %s" % (r_neigh.score(test_input, test_target)))
def knnClassifier():
    trainData, trainLabel = featureArray(conf['train']['feature_vector'])
    testData, testLabel = featureArray(conf['test']['feature_vector'])
    neigh = KNeighborsClassifier(n_neighbors=1, algorithm='auto', p=2)
    neigh.fit(trainData, trainLabel)
    print(neigh.score(testData, testLabel))
    neighRadius = RadiusNeighborsClassifier(radius=500, weights='distance',
                                            algorithm='auto', p=2, metric='minkowski')
    neighRadius.fit(trainData, trainLabel)
    print(neighRadius.score(testData, testLabel))
def KNN_diabetes_demo():
    import numpy as np
    import pandas as pd

    # Get data.
    data = pd.read_csv("D:/PythonProjects/pima-indians-diabetes.csv")
    print(data.shape)
    print("data.head(): ", data.head())  # first 5 rows

    # Split inputs and outcome.
    X = data.iloc[:, 0:8]
    Y = data.iloc[:, 8]

    # Split into training data and test data.
    from sklearn.model_selection import train_test_split
    # random_state is a seed that fixes the way of splitting.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=22)

    # Prediction.
    from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
    model1 = KNeighborsClassifier(n_neighbors=2)
    model1.fit(X_train, Y_train)
    score1 = model1.score(X_test, Y_test)

    model2 = KNeighborsClassifier(n_neighbors=2, weights='distance')
    model2.fit(X_train, Y_train)
    score2 = model2.score(X_test, Y_test)

    # RadiusNeighborsClassifier takes no n_neighbors argument; only the radius matters.
    model3 = RadiusNeighborsClassifier(radius=500.0)
    model3.fit(X_train, Y_train)
    score3 = model3.score(X_test, Y_test)

    # Compare the results of the three models.
    print(score1, score2, score3)

    # Cross validation.
    from sklearn.model_selection import cross_val_score
    result1 = cross_val_score(model1, X, Y, cv=10)
    result2 = cross_val_score(model2, X, Y, cv=10)
    result3 = cross_val_score(model3, X, Y, cv=10)
    print(result1.mean(), result2.mean(), result3.mean())
def main():
    X_train_all, t_train_all, train_all_ids = create_data_matrix(0, 3086, TRAIN_DIR)
    X_train, X_valid, t_train, t_valid = train_test_split(X_train_all, t_train_all,
                                                          test_size=0.20, random_state=37)
    X_test_all, t_test_all, test_all_ids = create_data_matrix(0, 3724, TEST_DIR)

    sv = svm.SVC(kernel='poly')
    sv.fit(X_train, t_train)
    print("SVM Score was: %f" % sv.score(X_valid, t_valid))

    # min_samples_split must be an int >= 2 in current scikit-learn.
    rf = RandomForestClassifier(n_estimators=30, min_samples_split=2, random_state=37)
    rf.fit(X_train, t_train)
    print("RandomForest Score was: %f" % rf.score(X_valid, t_valid))

    lr = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=500)
    lr.fit(X_train, t_train)
    print("LogisticRegression Score was: %f" % lr.score(X_valid, t_valid))

    clf = GaussianNB()
    clf.fit(X_train, t_train)
    print("GaussianNB Score was: %f" % clf.score(X_valid, t_valid))

    nn = KNeighborsClassifier(n_neighbors=6, weights='uniform')
    nn.fit(X_train, t_train)
    score = nn.score(X_valid, t_valid)
    print("KNeighbors Score was: %f" % score)

    # Samples with no neighbor inside the radius get outlier_label instead of raising.
    rnc = RadiusNeighborsClassifier(radius=6, outlier_label=8, p=2)
    rnc.fit(X_train, t_train)
    print("RadiusNeighbors Score was: %f" % rnc.score(X_valid, t_valid))

    # Get predictions.
    rf = RandomForestClassifier(n_estimators=30, min_samples_split=2)
    rf.fit(X_train_all, t_train_all)
    test_predictions = rf.predict(X_test_all)
    write_to_file("prediction.csv", test_all_ids, test_predictions)
rlf1 = RadiusNeighborsClassifier(radius=1.0, weights='uniform', algorithm='auto', leaf_size=30,
                                 p=2, metric='minkowski', metric_params=None, n_jobs=1)
rlf2 = RadiusNeighborsClassifier(radius=1.0, weights='uniform', algorithm='auto', leaf_size=30,
                                 p=2, metric='minkowski', outlier_label=None, metric_params=None)
rlf1.fit(trainX, trainY)
rlf2.fit(trainX, trainY)
rlf1.score(testX, testY)
rlf2.score(testX, testY)
'''
n_neighbors    number of neighbors
radius         search radius
weights        weighting scheme for neighbor votes
algorithm      nearest-neighbor search algorithm
               ball_tree  ball tree
               kd_tree    KD tree
               brute      brute-force search
               auto       chosen automatically based on X
leaf_size      leaf size when using a ball tree or KD tree
p              power parameter of the Minkowski metric (p=2 is Euclidean)
metric         distance metric used by the tree
outlier_label  label assigned to samples with no neighbor inside the radius
metric_params  extra keyword arguments for the metric
n_jobs         number of parallel jobs (compute performance)
'''
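# A self-contained sketch of how the parameters listed above fit together; the
# synthetic data and the radius value below are illustrative assumptions only.
import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier

rng = np.random.RandomState(0)
X = rng.rand(100, 2)                       # 100 random 2-D points in [0, 1)^2
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)  # simple synthetic labels

clf = RadiusNeighborsClassifier(
    radius=0.3,          # neighbors within Euclidean distance 0.3 get to vote
    weights='distance',  # closer neighbors receive larger vote weights
    algorithm='auto',    # let scikit-learn choose ball_tree / kd_tree / brute
    leaf_size=30,        # leaf size for the tree-based search structures
    p=2,                 # Minkowski power parameter; p=2 is Euclidean distance
    metric='minkowski',
    outlier_label=0,     # samples with no neighbor in the radius fall back to class 0
)
clf.fit(X, y)
print(clf.score(X, y))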
plt.show()

plate_img_lr, neg_img_lr = plate_img.T, negative_img.T
for i in np.random.randint(0, len(plate_img_lr), 5):
    show_image_prediction(plate_img_lr, i, clf)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(plate_img_lr, neg_img_lr)
print("Model accuracy: {:.2f}%".format(knn.score(plate_img_lr, neg_img_lr) * 100))

plate_img_lr, neg_img_lr = plate_img.T, negative_img.T
for i in np.random.randint(0, len(plate_img_lr), 5):
    show_image_prediction(plate_img_lr, i, knn)

from sklearn.neighbors import RadiusNeighborsClassifier
rnc = RadiusNeighborsClassifier()
rnc.fit(plate_img_lr, neg_img_lr)
print("Model accuracy: {:.2f}%".format(rnc.score(plate_img_lr, neg_img_lr) * 100))

plate_img_lr, neg_img_lr = plate_img.T, negative_img.T
for i in np.random.randint(0, len(plate_img_lr), 5):
    show_image_prediction(plate_img_lr, i, rnc)
# Drop intermediate objects that are no longer needed.
for name in ('unqLikesUIDs', 'unqLikesLIDs', 'profilesDF', 'profiles', 'profilesLSo',
             'profilesLS', 'row', 'tmpLS', 'tmpAGE', 'profsTOlikes', 'i', 'tmpIND'):
    del globals()[name]

seed = 7
myRand = np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(likesMAT, sexsARR, test_size=1500)

myRAD = float(sys.argv[1])
radNN = RadiusNeighborsClassifier(radius=myRAD)
# radNN.fit(likesMAT, sexsARR)
radNN.fit(X_train, y_train)
print("sexs, Radius neighbors: ", str(myRAD), " ", radNN.score(X_test, y_test))
# joblib.dump(radNN, "/Users/jamster/radNN-A-sexs.xz", compress=9)
# impRadNN = joblib.load("/Users/jamster/radNN-A-sexs.xz")
standard_BCE = StandardScaler()
standard_TCP = StandardScaler()
standard_UIE = StandardScaler()

# Fit each scaler once on its training split and reuse the fitted transform;
# X_test_BCE / X_test_TCP are assumed to exist alongside the y_test_* labels below.
X_train_BCE_std = standard_BCE.fit_transform(X_train_BCE)
X_train_TCP_std = standard_TCP.fit_transform(X_train_TCP)
X_test_BCE_std = standard_BCE.transform(X_test_BCE)
X_test_TCP_std = standard_TCP.transform(X_test_TCP)

PSE_BCE_t_Unit_neigh.fit(X_train_BCE_std, y_train_BCE_Unit)
PSE_BCE_t_Upgrade_neigh.fit(X_train_BCE_std, y_train_BCE_Upgrade)
PSE_TCP_t_Build_neigh.fit(X_train_TCP_std, y_train_TCP_Build)
PSE_TCP_t_Attack_neigh.fit(X_train_TCP_std, y_train_TCP_Attack)

# Score against the held-out test inputs, not the training inputs.
print('neigh - Unit Accuracy: ',
      PSE_BCE_t_Unit_neigh.score(X_test_BCE_std, y_test_BCE_Unit))
print('neigh - Upgrade Accuracy: ',
      PSE_BCE_t_Upgrade_neigh.score(X_test_BCE_std, y_test_BCE_Upgrade))
print('neigh - Build Accuracy: ',
      PSE_TCP_t_Build_neigh.score(X_test_TCP_std, y_test_TCP_Build))
print('neigh - Attack Accuracy: ',
      PSE_TCP_t_Attack_neigh.score(X_test_TCP_std, y_test_TCP_Attack))

#|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
#|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||KNeighborsClassifier
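# Bundling the scaler and the estimator in a Pipeline guarantees the scaler's
# statistics come from the training split only. A minimal sketch; the
# KNeighborsClassifier estimator and the X_test_BCE split are assumptions,
# not taken from the snippet above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())
pipe.fit(X_train_BCE, y_train_BCE_Unit)   # scaler fit + estimator fit in one call
print('pipeline - Unit Accuracy: ', pipe.score(X_test_BCE, y_test_BCE_Unit))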
import pandas as pd
import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier
import matplotlib.pyplot as plt

data = pd.read_csv('ice.csv')
x = data[['temp', 'street']]
y = data['ice']

clf = RadiusNeighborsClassifier()
clf.fit(x, y)
print(clf.score(x, y))

t = np.arange(0.0, 31.0)
plt.plot(t, y, '--', t, clf.predict(x), '-')
plt.show()
# Standardize the features and (optionally) save the scaler.
scale = StandardScaler()
x4 = scale.fit_transform(x3)
y4 = y3
way_tempt_scale = ('C:/Users/Administrator/Desktop/ali/data/4_knn/scale_2/scale_2_'
                   + mall_list[i] + '.model')
# joblib.dump(scale, way_tempt_scale)

# Split.
x5, x6, y5, y6 = train_test_split(x4, y4, test_size=0.5)

# Retrain.
# knn = KNeighborsClassifier(n_neighbors=len(y3.unique()) / 3)
knn = RadiusNeighborsClassifier(radius=j)  #####################
knn.fit(x5, y5)
score = knn.score(x6, y6)

time_now = time.strftime('%H:%M:%S', time.localtime(time.time()))
print(str(i) + '....mall_id=' + mall_list[i] + '....' + str(score) + '....' + str(len(x4)))

way_tempt_knn = ('C:/Users/Administrator/Desktop/ali/data/4_knn/scale_2/knn_'
                 + mall_list[i] + '.model')
# joblib.dump(knn, way_tempt_knn)  # save

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
parameters_append = pd.DataFrame([[i, mall_list[i], score, len(y6)]],
                                 columns=['loop_number', 'mall_id', 'correct_rate', 'member'])
parameters = pd.concat([parameters, parameters_append], ignore_index=True)
# parameters.to_csv(way_write + 'parameters.csv')
# Encode the learning-style labels as integers.
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Indefinido', 'estilo_de_aprendizagem'] = 0
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Ativo', 'estilo_de_aprendizagem'] = 1
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Teorico', 'estilo_de_aprendizagem'] = 2
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Reflexivo', 'estilo_de_aprendizagem'] = 3
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Pragmatico', 'estilo_de_aprendizagem'] = 4
datatrain = datatrain.apply(pd.to_numeric)

# DataFrame.as_matrix() was removed from pandas; use .values instead.
datatrain_array = datatrain.values
X = datatrain_array[:, :14]
y = datatrain_array[:, 14]  # 1-D label vector, as scikit-learn expects

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

neigh = RadiusNeighborsClassifier(radius=3)
neigh.fit(X_train, y_train)
precisao = neigh.score(X_test, y_test)
print("------Accuracy-------: %f" % precisao)
print(np.asarray(RN[0][0]))
# print(np.asarray(RN[1][2]))

## Weighted graph of neighbors within the given radius for the points in X
## (radius neighbors graph).
RNG = knn_model.radius_neighbors_graph(X, radius=10)
print(RNG)
print(RNG.toarray())

# ## Predict y from the X in the test data:
# print(knn_model.predict(X_test))
# ## Inspect the actual y:
# print(y_test.values.ravel())
# ## Predicted class probabilities for the test X:
# print(knn_model.predict_proba(X_test))
# ## Model accuracy:
# print("radius = 2, score:", knn_model.score(X_test, y_test))  ## 0.8333333333333334
# print("Accuracy: ", knn_model.score(X_test, y_test) * 100, "%")

## Tuning
## RadiusNeighborsClassifier parameter settings.
knn_model = RadiusNeighborsClassifier(radius=1, weights='uniform', algorithm='auto',
                                      leaf_size=30, p=2, metric='minkowski',
                                      metric_params=None, n_jobs=None)
knn_model.fit(X_train, y_train.values.ravel())
print("radius = 1, score:", knn_model.score(X_test, y_test))  # 1.0
print("Accuracy: ", knn_model.score(X_test, y_test) * 100, "%")
from sklearn.neighbors import RadiusNeighborsClassifier
# sklearn.cross_validation was removed; train_test_split now lives in model_selection.
from sklearn.model_selection import train_test_split

total_score = 0
stop = 1000
for x in range(stop):
    clf = RadiusNeighborsClassifier(radius=100.0)
    data = win.getStudents()
    data_train, data_test = train_test_split(data, test_size=0.2)
    data_train_labels = [s.spec for s in data_train]
    data_test_labels = [s.spec for s in data_test]
    data_train = [s.grades for s in data_train]
    data_test = [s.grades for s in data_test]
    clf.fit(data_train, data_train_labels)
    total_score += clf.score(data_test, data_test_labels)
total_score = total_score / stop
print('all')
print(total_score)

# One-vs-rest run per specialization.
specs = ['FK', 'FM', 'MN', 'OE']
for sp in specs:
    total_score = 0
    for x in range(stop):
        clf = RadiusNeighborsClassifier(radius=100.0)
        data = win.getStudents()
        data_train, data_test = train_test_split(data, test_size=0.2)
        data_train_labels = [s.spec if s.spec == sp else 'NOT ' + sp for s in data_train]
        data_test_labels = [s.spec if s.spec == sp else 'NOT ' + sp for s in data_test]
        data_train = [s.grades for s in data_train]
        data_test = [s.grades for s in data_test]
        clf.fit(data_train, data_train_labels)
        total_score += clf.score(data_test, data_test_labels)
    total_score = total_score / stop
    print(sp)
    print(total_score)
import numpy as np
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
import matplotlib.pyplot as plt

X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, n_classes=3, random_state=0)
# plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
# plt.show()

# If a sample has no neighbor within the given radius, it is assigned outlier_label;
# without it, prediction raises an error when such an outlier is encountered.
clf = RadiusNeighborsClassifier(radius=1, weights='distance', outlier_label=1)
clf.fit(X, y)
print('score: {}'.format(clf.score(X, y)))

# Inspect the neighbors of a query sample (distances + indices):
# print(clf.radius_neighbors(X[0, :].reshape(1, -1), return_distance=True))
# Inspect the radius-neighbors graph of a query sample
# (sparse matrix: indices + distances, or connectivity):
# print(clf.radius_neighbors_graph(X[0].reshape(1, -1), mode='distance'))

# Visualize the prediction results (decision boundary).
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Determine the bounds of the training set.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
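# The excerpt stops after computing the plot bounds. A minimal sketch of the
# usual continuation; the mesh step size h is an assumed value.
h = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)  # class per grid point

plt.pcolormesh(xx, yy, Z, cmap=cmap_light)                # colored decision regions
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=10)  # training points on top
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()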
def train():
    objects = environment["objects"]
    workspace = environment["workspace"]
    for o in objects:
        log.vprint(f"{bcolors.HEADER}{bcolors.UNDERLINE}objects/{o}.xml{bcolors.ENDC}\n")
        model_data = pd.read_csv(f"{workspace}/datasets/{o}_dataset_mut.csv")
        all_inputs = model_data[[
            "HIF-ScaleX", "HIF-ScaleY", "HIF-ScaleZ", "HIF-PositionX", "HIF-PositionZ",
            "HIS-ScaleX", "HIS-ScaleY", "HIS-ScaleZ", "HIS-PositionX", "HIS-PositionY",
            "HIS-PositionZ", "HIS-OrientationW", "HIS-OrientationZ"
        ]].values
        all_classes = model_data["Name"].values

        log.vprint(f"{bcolors.OKBLUE}Exploratory analysis{bcolors.ENDC}")
        log.vprint(all_inputs[:5], end='\n\n')
        log.vprint(f"Classes: {all_classes}", end='\n\n')

        (training_inputs, testing_inputs,
         training_classes, testing_classes) = train_test_split(all_inputs, all_classes,
                                                               train_size=0.80,
                                                               random_state=456)
        log.vprint(f"Training set: {training_inputs.shape}")
        log.vprint(f"Testing set: {testing_inputs.shape}", end='\n\n')

        ### DecisionTreeClassifier ###
        # Create the classifier.
        decision_tree_classifier = DecisionTreeClassifier()
        # Train the classifier on the training set.
        decision_tree_classifier.fit(training_inputs, training_classes)
        # Validate the classifier on the testing set using classification accuracy.
        log.vprint(f"{bcolors.OKBLUE}DecisionTreeClassifier{bcolors.ENDC}")
        log.vprint(decision_tree_classifier.score(testing_inputs, testing_classes))
        log.vprint(decision_tree_classifier.predict(testing_inputs[:1, :]), end='\n\n')

        ### RadiusNeighborsClassifier ###
        # Create the classifier.
        neigh = RadiusNeighborsClassifier(radius=2.0)
        # Train the classifier on the training set.
        neigh.fit(training_inputs, training_classes)
        # Validate the classifier on the testing set using classification accuracy.
        log.vprint(neigh.score(testing_inputs, testing_classes))
        log.vprint(neigh.predict(testing_inputs[:1, :]), end='\n\n')

        ### Model accuracies ###
        plt.title("Model accuracies")

        # DecisionTreeClassifier
        model_accuracies = []
        for repetition in range(1000):
            (training_inputs, testing_inputs,
             training_classes, testing_classes) = train_test_split(all_inputs, all_classes,
                                                                   train_size=0.75)
            decision_tree_classifier = DecisionTreeClassifier()
            decision_tree_classifier.fit(training_inputs, training_classes)
            classifier_accuracy = decision_tree_classifier.score(testing_inputs,
                                                                 testing_classes)
            model_accuracies.append(classifier_accuracy)
        sb.distplot(model_accuracies, label="DecisionTreeClassifier")

        # RadiusNeighborsClassifier
        model_accuracies = []
        for repetition in range(1000):
            (training_inputs, testing_inputs,
             training_classes, testing_classes) = train_test_split(all_inputs, all_classes,
                                                                   train_size=0.75)
            neigh = RadiusNeighborsClassifier(radius=2.0)
            neigh.fit(training_inputs, training_classes)
            classifier_accuracy = neigh.score(testing_inputs, testing_classes)
            model_accuracies.append(classifier_accuracy)
        sb.distplot(model_accuracies, label="RadiusNeighborsClassifier")

        plt.legend()
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_accuracies.pdf")
        plt.close("all")
        '''
        The model achieves 97% classification accuracy without much effort. It's
        obviously a problem that our model performs quite differently depending
        on the subset of the data it's trained on. This phenomenon is known as
        overfitting: the model learns to classify the training set so well that
        it doesn't generalize and perform well on data it hasn't seen before.
        This problem is the main reason that most data scientists perform k-fold
        cross-validation on their models: split the original data set into k
        subsets, use one of the subsets as the testing set, and use the rest of
        the subsets as the training set. This process is then repeated k times so
        that each subset is used as the testing set exactly once. 10-fold
        cross-validation is the most common choice.
        '''
        # DecisionTreeClassifier(max_depth=4)
        decision_tree_classifier = DecisionTreeClassifier()
        # cross_val_score returns a list of the scores, which we can visualize
        # to get a reasonable estimate of our classifier's performance.
        cv_scores = cross_val_score(decision_tree_classifier, all_inputs, all_classes, cv=10)
        sb.distplot(cv_scores)
        plt.title(f"Average score: {np.mean(cv_scores)}")
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_cv_scores.pdf")
        plt.close("all")

        ### Grid Search ###
        '''
        Explore a range of parameters and find the best-performing parameter
        combination. Focus the search on the best range of parameters, then
        repeat this process several times until the best parameters are
        discovered.
        '''
        decision_tree_classifier = DecisionTreeClassifier()
        parameter_grid = {
            'max_depth': [1, 2, 3, 4, 5],
            'max_features': [1, 2, 3, 4]
        }
        # StratifiedKFold may not work correctly if the least populated class in
        # y has fewer members than n_splits.
        cross_validation = StratifiedKFold(n_splits=50)
        grid_search = GridSearchCV(decision_tree_classifier,
                                   param_grid=parameter_grid,
                                   cv=cross_validation)
        grid_search.fit(all_inputs, all_classes)
        log.vprint(f"{bcolors.OKBLUE}GridSearch{bcolors.ENDC}")
        log.vprint(f" Best score: {grid_search.best_score_}")
        log.vprint(f" Best parameters: {grid_search.best_params_}", end='\n\n')

        decision_tree_classifier = grid_search.best_estimator_
        log.vprint(f"{decision_tree_classifier} (before parameter tuning)", end='\n\n')

        # Visualize the grid search to see how the parameters interact.
        grid_visualization = []
        grid_visualization.append(grid_search.cv_results_['mean_test_score'])
        grid_visualization = np.array(grid_visualization)
        grid_visualization.shape = (5, 4)
        sb.heatmap(grid_visualization, cmap='Blues')
        plt.xticks(np.arange(4) + 0.5, grid_search.param_grid['max_features'])
        plt.yticks(np.arange(5) + 0.5, grid_search.param_grid['max_depth'][::-1])
        plt.xlabel('max_features')
        plt.ylabel('max_depth')
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_grid_search.pdf")
        plt.close("all")

        ### Parameter tuning ###
        decision_tree_classifier = DecisionTreeClassifier()
        '''
        Criterion <https://quantdare.com/decision-trees-gini-vs-entropy/>
        Used to evaluate feature importance. The default is `gini`, but `entropy`
        can also be used. Based on this, the model defines the importance of each
        feature for the classification.

        Splitter
        Used to decide which feature and which threshold to split on. With
        `best`, the model takes the feature with the highest importance; with
        `random`, it takes a feature at random but with the same distribution.
        '''
        parameter_grid = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [1, 2, 3, 4, 5],
            'max_features': [1, 2, 3, 4]
        }
        cross_validation = StratifiedKFold(n_splits=10)
        grid_search = GridSearchCV(decision_tree_classifier,
                                   param_grid=parameter_grid,
                                   cv=cross_validation)
        grid_search.fit(all_inputs, all_classes)
        log.vprint(f"{bcolors.OKBLUE}Parameter Tuning{bcolors.ENDC}")
        log.vprint(f" Best score: {grid_search.best_score_}")
        log.vprint(f" Best parameters: {grid_search.best_params_}", end='\n\n')

        # Then the best classifier is taken.
        decision_tree_classifier = grid_search.best_estimator_
        log.vprint(f"{decision_tree_classifier} (after parameter tuning)", end='\n\n')

        with open(f"{workspace}/{o}_model_dtc.dot", 'w') as out_file:
            out_file = tree.export_graphviz(decision_tree_classifier, out_file=out_file)
        log.vprint(
            f"{bcolors.OKGREEN}Done! Check generated graph: {bcolors.ENDC}'./{workspace}/{o}_model_dtc.dot'.\n"
        )

        ### RandomForestClassifier ###
        random_forest_classifier = RandomForestClassifier()
        parameter_grid = {
            'n_estimators': [5, 10, 25, 50],
            'criterion': ['gini', 'entropy'],
            'max_features': [1, 2, 3, 4],
            'warm_start': [True, False]
        }
        cross_validation = StratifiedKFold(n_splits=10)
        grid_search = GridSearchCV(random_forest_classifier,
                                   param_grid=parameter_grid,
                                   cv=cross_validation)
        grid_search.fit(all_inputs, all_classes)
        log.vprint(f"{bcolors.OKBLUE}RandomForestClassifier{bcolors.ENDC}")
        log.vprint(f" Best score: {grid_search.best_score_}")
        log.vprint(f" Best parameters: {grid_search.best_params_}", end='\n\n')

        random_forest_classifier = grid_search.best_estimator_
        log.vprint(random_forest_classifier, end='\n\n')

        ### Performance visuals ###
        rf_df = pd.DataFrame({
            'accuracy': cross_val_score(random_forest_classifier, all_inputs,
                                        all_classes, cv=10),
            'classifier': ['Random Forest'] * 10
        })
        dt_df = pd.DataFrame({
            'accuracy': cross_val_score(decision_tree_classifier, all_inputs,
                                        all_classes, cv=10),
            'classifier': ['Decision Tree'] * 10
        })
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        both_df = pd.concat([rf_df, dt_df])
        sb.boxplot(x='classifier', y='accuracy', data=both_df)
        sb.stripplot(x='classifier', y='accuracy', data=both_df,
                     jitter=True, color='white')
        # plt.show()
        plt.savefig(f"{workspace}/plots/{o}_model_classifiers_performance.pdf")
        plt.close("all")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

model_1 = KNeighborsClassifier(n_neighbors=2, weights="uniform")
model_1.fit(x_train, y_train)
score_1 = model_1.score(x_test, y_test)

model_2 = KNeighborsClassifier(n_neighbors=2, weights="distance")
model_2.fit(x_train, y_train)
score_2 = model_2.score(x_test, y_test)

# RadiusNeighborsClassifier has no n_neighbors parameter; only the radius is used.
model_3 = RadiusNeighborsClassifier(radius=500.0)
model_3.fit(x_train, y_train)
score_3 = model_3.score(x_test, y_test)

print(score_1, score_2, score_3)

from sklearn.model_selection import cross_val_score
result1 = cross_val_score(model_1, X, y, cv=10)
result2 = cross_val_score(model_2, X, y, cv=10)
result3 = cross_val_score(model_3, X, y, cv=10)
print(result1.mean(), result2.mean(), result3.mean())

predict1 = model_1.predict(x_test)
print(predict1)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predict1)
print("K-Nearest Neighbour(KNN) ALGORITMASI ILE ") from sklearn.neighbors import KNeighborsClassifier score_list = [] each_list = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) for each in range(1, 15): knn = KNeighborsClassifier(n_neighbors=each) knn.fit(x_train, y_train) score_list = (knn.score(x_test, y_test)) print("KNN de {} komsu alinirsa accuracy : {} ".format(each, score_list)) #%%Radius Neighbors Classifier print("o-o--o-o-o-o-o-o-o-o-o-o--o-o-o-o-o-o-o-o-o-o-o-o-o-o-") from sklearn.neighbors import RadiusNeighborsClassifier rnn = RadiusNeighborsClassifier() rnn.fit(x_train, y_train) print("RNN Algoritmasi kullanilirrsa accuracy :{} ".format( rnn.score(x_test, y_test))) #%%Logistic Regression print("o-o--o-o-o-o-o-o-o-o-o-o--o-o-o-o-o-o-o-o-o-o-o-o-o-o-") from sklearn.linear_model import LogisticRegression lr = LogisticRegression() lr.fit(x_train, y_train) print("Logistic Regression parametreleri default ise accuracy : ", lr.score(x_test, y_test)) #%%Logistic Regression (max_iter=500,sover=saga,penalty=elasticnet) lr = LogisticRegression(max_iter=500, solver='saga', penalty='elasticnet') lr.fit(x_train, y_train) print("Logistic Regression parametreleri degistirip bakalım accuracy : ", lr.score(x_test, y_test)) #%%Logistic Regression CV(Logistic Regression Cross-Validation) from sklearn.linear_model import LogisticRegressionCV lrcv = LogisticRegressionCV()
all_data = sel.fit_transform(all_data)
train_data = all_data[:tr_samples_size]
test_data = all_data[tr_samples_size:]
tr_samples_size, feature_size = train_data.shape
te_samples_size, _ = test_data.shape
print('Train Data Samples:', tr_samples_size,
      ', Test Data Samples:', te_samples_size,
      ', Feature Size (after feature-selection):', feature_size)

# radius=2.9 ###################################################################
neigh = RadiusNeighborsClassifier(radius=2.9)
# time.clock() was removed in Python 3.8; time.perf_counter() is the replacement.
t1 = time.perf_counter()
neigh.fit(train_data, train_labels)
t2 = time.perf_counter()
CRR = neigh.score(test_data, test_labels)
t3 = time.perf_counter()
print('CRR (radius=2.9):', CRR)
print('Training Time:', t2 - t1)
print('Testing Time:', t3 - t2)

predictions = neigh.predict(test_data)
ok = 0
confusion = np.zeros((10, 10))
confidence = np.zeros((10, 10))
for i in range(test_data.shape[0]):
    confusion[test_labels[i], predictions[i]] += 1
dig = np.arange(10)
plot_confusion_matrix(confusion, dig, title='Confusion matrix radius=2.9')
#-------------------------------------------------------------------------------------------#
import numpy as np
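# plot_confusion_matrix is not defined in the excerpt above. A minimal sketch
# of a compatible helper, with the signature inferred from the call
# plot_confusion_matrix(confusion, dig, title=...); the styling is an assumption.
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix as a heatmap with per-cell counts."""
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes)
    plt.yticks(ticks, classes)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, int(cm[i, j]), ha='center', va='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()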
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target)
x_train
y_train
x_test
y_test

## With a number of neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)
knn.predict(x_test)
y_test
knn.predict(x_test[2:3][:4])
y_test[2:3][:4]

## With a radius
from sklearn.neighbors import RadiusNeighborsClassifier
knn_r = RadiusNeighborsClassifier(radius=1)
knn_r.fit(x_train, y_train)
knn_r.score(x_test, y_test)
knn_r.predict(x_test)
y_test
knn_r.predict(x_test[2:3][:4])
y_test[2:3][:4]
no_selection_performance.append(('Gradient Boosting Classifier', score, matrix))

print('K Nearest Neighbors')
kNeigh = KNeighborsClassifier(n_neighbors=3)
kNeigh.fit(X_train, y_train)
y_test_pred = kNeigh.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = kNeigh.score(X_test, y_test)
no_selection_performance.append(('K Nearest Neighbours', score, matrix))

print('Radius Nearest Neighbors')
rNeigh = RadiusNeighborsClassifier(radius=42.0)
rNeigh.fit(X_train, y_train)
y_test_pred = rNeigh.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = rNeigh.score(X_test, y_test)
no_selection_performance.append(('Radius Nearest Neighbours', score, matrix))

print('Decision Tree Classifier')
dTree = DecisionTreeClassifier(random_state=0)
dTree.fit(X_train, y_train)
y_test_pred = dTree.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = dTree.score(X_test, y_test)
no_selection_performance.append(('Decision Tree Classifier', score, matrix))

print('Bagging (with K Nearest Neighbors)')
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
bagging.fit(X_train, y_train)
def run(data, classifications, scoring_data, scoring_classifications, radius):
    # Fit a radius-neighbors model and return its accuracy on the scoring split.
    classifier = RadiusNeighborsClassifier(radius=radius)
    classifier.fit(data, classifications)
    accuracy = classifier.score(scoring_data, scoring_classifications)
    return accuracy
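# A minimal usage sketch for run(); the Iris data, the split, and the radius
# value are illustrative assumptions, not taken from the snippet above.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import RadiusNeighborsClassifier

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, random_state=0)
# A generous radius keeps every test point inside at least one training
# neighborhood, so no outlier handling is needed.
print(run(X_tr, y_tr, X_te, y_te, radius=5.0))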