def Radius_Neighbors(input_file, Output):
    """Fit a radius-neighbors classifier on a CSV file and report its scores.

    The class label is taken from the first loaded column and the features
    from the remaining loaded columns. Columns ``0 .. ncol-2`` are loaded,
    where ``ncol`` is whatever ``tools.file_col_coma`` returns (presumably
    the column count, i.e. the last column is skipped -- confirm against
    ``tools``).

    The model is fitted and evaluated on the same samples, so the scores
    are resubstitution scores, not generalisation estimates.

    Parameters
    ----------
    input_file : str
        Path of the comma-separated input file.
    Output : str
        Prefix prepended to the generated file names (metrics text file and
        confusion-matrix image).
    """
    lvltrace.lvltrace("LVLEntree dans Radius_Neighbors")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]

    # ``n_neighbors`` is not a RadiusNeighborsClassifier parameter: old
    # scikit-learn releases swallowed it through **kwargs without using it,
    # current releases reject it with a TypeError. The default radius (1.0)
    # is what was effectively used, so the stray argument is simply dropped.
    clf = RadiusNeighborsClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)

    # Compute every score once and reuse it for the console and the file.
    accuracy = metrics.accuracy_score(y, y_pred)
    precision = metrics.precision_score(y, y_pred)
    recall = metrics.recall_score(y, y_pred)
    f1 = metrics.f1_score(y, y_pred)

    banner = "#########################################################################################################\n"
    # Single-argument print calls produce the same output under the Python 2
    # print statement and the Python 3 print function.
    print(banner)
    print("Radius Neighbors Accuracy ")
    print("classification accuracy: %s" % accuracy)
    print("precision: %s" % precision)
    print("recall: %s" % recall)
    print("f1 score: %s" % f1)
    print("\n")
    print(banner)

    # NOTE(review): "Raidus" looks like a typo of "Radius", but the file name
    # is kept because other tooling may already read this path.
    results = Output + "Raidus_Neighbors_metrics.txt"
    # ``with`` closes the report even when a write fails; the handle is no
    # longer called ``file``, which shadowed the Python 2 builtin.
    with open(results, "w") as report:
        report.write("Radius Neighbors estimator accuracy\n")
        report.write("Classification Accuracy Score: %f\n" % accuracy)
        report.write("Precision Score: %f\n" % precision)
        report.write("Recall Score: %f\n" % recall)
        report.write("F1 Score: %f\n" % f1)
        report.write("\n")
        report.write("True Value, Predicted Value, Iteration\n")
        # One row per sample; the iteration counter is 1-based.
        for iteration, (truth, predicted) in enumerate(zip(y, y_pred), 1):
            report.write("%f,%f,%i\n" % (truth, predicted, iteration))

    title = "Radius Neighbors"
    save = Output + "Radius_Neighbors_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans Radius_Neighbors")
def draw(self):
    """Render the estimated floorplan into the current matplotlib figure.

    The stored fingerprints are projected with ``self.dimred``. A
    radius-neighbors classifier fitted on the projected points labels every
    node of a unit-spaced grid spanning the projection, and the grid is
    drawn as a colour mesh with the projected points scattered on top.
    Grid nodes without any neighbour within the radius get label 0.
    """
    projected = self.dimred.transform(self._fingerprints)
    xs = projected[:, 0]
    ys = projected[:, 1]

    # Unit-spaced grid over the bounding box of the projected points.
    x_axis = np.arange(xs.min(), xs.max(), 1.0)
    y_axis = np.arange(ys.min(), ys.max(), 1.0)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)

    classifier = RadiusNeighborsClassifier(radius=3.0, outlier_label=0)
    classifier.fit(projected, self._label)

    # Classify every grid node, then fold the flat result back onto the grid.
    grid_nodes = np.c_[grid_x.ravel(), grid_y.ravel()]
    grid_labels = classifier.predict(grid_nodes).reshape(grid_x.shape)

    plt.pcolormesh(grid_x, grid_y, grid_labels)
    plt.scatter(xs, ys, c=self._label, vmin=0)
def Radius_Neighbors(input_file, Output, test_size):
    """Evaluate a radius-neighbors classifier on a train/test split of a CSV.

    The class label is taken from the first loaded column and the features
    from the remaining loaded columns (columns ``0 .. ncol-2``, where
    ``ncol`` is the value returned by ``tools.file_col_coma``). The data is
    split with ``train_test_split``, the classifier is fitted on the training
    part and scored on the test part.

    Any ``ValueError`` raised while loading, fitting, predicting, scoring or
    reporting is handled by writing a hint about the radius / outliers into
    the metrics file instead of the scores.

    Parameters
    ----------
    input_file : str
        Path of the comma-separated input file.
    Output : str
        Prefix prepended to the generated file names.
    test_size : float or int
        Passed through to ``train_test_split``; also used in the plot title
        and the image file name.
    """
    lvltrace.lvltrace("LVLEntree dans radius_kneighbors split_test")
    # NOTE(review): "Raidus" looks like a typo of "Radius"; the name is kept
    # because other tooling may already read this path.
    results = Output + "Raidus_Neighbors_metrics_test.txt"
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
        X = data[:, 1:]
        y = data[:, 0]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size)
        # Same text as the Python 2 statement ``print a, b``.
        print("%s %s" % (X_train.shape, X_test.shape))

        clf = RadiusNeighborsClassifier(
            radius=0.001, weights='uniform', algorithm='auto')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Compute every score once and reuse it for the console and the file.
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)

        print("Radius Neighbors accuracy ")
        print("classification accuracy: %s" % accuracy)
        print("precision: %s" % precision)
        print("recall: %s" % recall)
        print("f1 score: %s" % f1)
        print("\n")

        # ``with`` guarantees the handle is closed before the ValueError
        # handler below reopens the same path.
        with open(results, "w") as report:
            report.write("Radius Neighbors estimator accuracy\n")
            report.write("Classification Accuracy Score: %f\n" % accuracy)
            report.write("Precision Score: %f\n" % precision)
            report.write("Recall Score: %f\n" % recall)
            report.write("F1 Score: %f\n" % f1)
            report.write("\n")
            report.write("True Value, Predicted Value, Iteration\n")
            # One row per test sample; the iteration counter is 1-based.
            for iteration, (truth, predicted) in enumerate(
                    zip(y_test, y_pred), 1):
                report.write("%f,%f,%i\n" % (truth, predicted, iteration))

        title = "Radius Neighbors %f" % test_size
        save = Output + "Radius_Neighbors_confusion_matrix" + "_%s.png" % test_size
        plot_confusion_matrix(y_test, y_pred, title, save)
    except ValueError:
        # Typically raised by predict() when a test sample has no training
        # neighbour within the radius; the hint replaces the metrics file.
        with open(results, "w") as report:
            report.write("In configuration.py file: No neighbors found for test samples, you can try using larger radius, give a label for outliers, consider or removing them from your dataset.")
    lvltrace.lvltrace("LVLSortie dans radius_kneighbors split_test")
r = 3000 # <------------------------------------ clf = RadiusNeighborsClassifier(radius=r) # radius=1.0, weights=’uniform’, algorithm=’auto’ # leaf_size=30, p=2, metric=’minkowski’, outlier_label=None # metric_params=None, n_jobs=None, **kwargs from time import process_time start = process_time() clf.fit(trainData, trainLabel) print('time of train :', process_time() - start) start = process_time() predicts = clf.predict(testData) print('time of test :', process_time() - start) from sklearn.metrics import accuracy_score print("Accuracy : ", accuracy_score(testLabel, predicts)) import numpy as np import matplotlib.pyplot as plt from sklearn.metrics import confusion_matrix def plot_confusion_matrix(y_true, y_pred, classes, normalize=False,
# Load libraries.
from sklearn import datasets
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load the iris data set and split it into features and target.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Standardise the features (StandardScaler is fitted on the full data set).
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Train a radius-based neighbors classifier using all available cores.
rnn = RadiusNeighborsClassifier(radius=0.5, n_jobs=-1)
rnn.fit(features_standardized, target)

# A single new observation (one row, four features).
new_observations = [[1, 1, 1, 1]]

# Predict and print the class of that observation.
print(rnn.predict(new_observations))
# Features and labels are the first two entries of ``dataset``.
x, y = dataset[0], dataset[1]

# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

# --- Radius-based neighbors model ---
radio = 3
model_radius = RadiusNeighborsClassifier(radius=radio)
model_radius.fit(x_train, y_train)
# NOTE(review): this model is scored on the training split while the KNN
# model below predicts on the test split -- confirm that is intended.
predict_radius = model_radius.predict(x_train)
accu_radius = accuracy_score(y_train, predict_radius)
print('La precision del modelo KNN Radius es ', round(accu_radius, 2))

# --- k-nearest-neighbors model ---
neighbors = 4
model_KNN = KNeighborsClassifier(n_neighbors=neighbors, n_jobs=2)
model_KNN.fit(x_train, y_train)
predict_KNN = model_KNN.predict(x_test)
# Each model is fitted on the training split and scored on the validation
# split with the mean absolute error between predicted and true labels.

# Random forest (fitted earlier in the file).
pred_RF = RF_model.predict(X_valid)
print('MAE RF: ', mean_absolute_error(pred_RF, Y_valid))

# k-nearest neighbors, distance-weighted votes.
KN_model = KNeighborsClassifier(
    n_neighbors=55,
    weights='distance',
    algorithm='auto',
)
KN_model.fit(X_train, Y_train)
pred_KN = KN_model.predict(X_valid)
print('MAE KN: ', mean_absolute_error(pred_KN, Y_valid))

# Radius neighbors; samples without any neighbour in range get label 1.
RN_model = RadiusNeighborsClassifier(
    radius=3.32,
    weights='distance',
    algorithm='ball_tree',
    outlier_label=1,
)
RN_model.fit(X_train, Y_train)
pred_RN = RN_model.predict(X_valid)
print('MAE RAD_N: ', mean_absolute_error(pred_RN, Y_valid))

# Gradient boosting; previously tried: learning_rate=0.028, n_estimators=375.
GB_model = GradientBoostingClassifier(learning_rate=0.0730, n_estimators=250)
Fitted_GB = GB_model.fit(X_train, Y_train)
pred_GB = GB_model.predict(X_valid)
print('MAE GB: ', mean_absolute_error(pred_GB, Y_valid))

# Cross-validated score of the gradient-boosting model on the full data.
score = cross_val_score(GB_model, X, Y).mean()
print('CROSS-VALIDATION_GB= ', score)

# Support vector machine with default settings.
SVM_model = SVC()
SVM_model.fit(X_train, Y_train)
pred_SVM = SVM_model.predict(X_valid)
hist = desc.describe(gray) # extraia o rótulo do caminho da imagem e atualiza o # rótulo e listas de dados # print(imagePath.split("/")) # use "\\" no Windows labels.append(imagePath.split("/")[-2]) # use "\\" no Windows data.append(hist) #print(labels) X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.1, random_state=0) # treinar um KNN Linear nos dados k_nn = [0.005, 0.01, 0.015, 0.02] for k in k_nn: neigh = RadiusNeighborsClassifier(radius=k, outlier_label=0.1) neigh.fit(X_train, y_train) benar = 0 jml = 0 for i in range(len(y_test)): jml += 1 hist = X_test[i] prediction = neigh.predict([hist])[0] if prediction == y_test[i]: benar += 1 akurasi = float(benar * 100 / jml) print(benar, jml, k, ": Akurasi", akurasi, "%") hasil.append([k, p[0], p[1], akurasi]) tulis_hasil(hasil, "results/{0}_rnn.csv".format(db))
if self._clasifyData.has_key(coltag): try: tag = self._clasifyData[coltag]['neigh'].predict([[screenspace_x,screenspace_y]]) tag = tag[0] self._clasifyData[coltag]['data'][tag] = [screenspace_x,screenspace_y] except ValueError: return MOCAP_ROGE_DATA return tag return MOCAP_ROGE_DATA def updateBoxesForNextFrame(self): for clotag,data in self._clasifyData.items(): centroids = [] labels = [] for tag,centroid in data['data'].items(): centroids.append(centroid) labels.append(tag) self._clasifyData[clotag]['neigh'].fit(centroids,labels) X = [[229.5, 500.5], [127.0, 497.0]]#[[0,0], [1,1], [2,2], [3,3]] y = [1, 5]#[5, 1, 3, 4] neigh = RadiusNeighborsClassifier(radius=1.0) neigh.fit(X, y) print(neigh.predict([[229.5, 500.5]]))
def knn_classifier_Radius(X_train, categories, X_test, test_categories):
    """Train a radius-neighbors classifier and print its classification report.

    The classifier is fitted on ``X_train`` / ``categories`` with the default
    radius; test samples without any neighbour in range are labelled 0. The
    report compares the predictions for ``X_test`` with ``test_categories``.
    Nothing is returned.
    """
    from sklearn.neighbors import RadiusNeighborsClassifier

    classifier = RadiusNeighborsClassifier(outlier_label=0)
    classifier.fit(X_train, categories)
    predicted_categories = classifier.predict(X_test)

    # Single-argument print calls emit the same text under the Python 2
    # print statement used by the rest of this file.
    print("\n Here is the classification report for RadiusNeighborsClassifier classifier:")
    print(metrics.classification_report(test_categories, predicted_categories))
class Model(object):
    """Text-classification system built on scikit-learn.

    For reference see: http://scikit-learn.org/stable/

    This Model class is based on the Data class. It defines training and
    test data, builds a classification model and provides evaluation
    methods.

    Parameter
    ---------
    data : Data, optional
        A data object with filled data.real_data.
    data_list : array, shape = [data1 object, data2 object, ...]
        Data objects with filled data.real_data. Only used when ``data`` is
        None. When both are None the model starts with an empty list.

    Attributes
    ----------
    clf : classifier object from the sklearn modules.
        The selected classifier, or None until set_classifier() is called.
        see reference:
        http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
    classifier_list : array, shape = [string classifier1 name, ...]
        Names of all available classification algorithms.
    __train_data_set : boolean
        True once training data has been set.
    train_data : Data
        The data object that is set as training data (None until set).
    test_data : Data
        The data object that is set as test data (None until set).
    train_targets : numpy array of shape [n_samples]
        Class labels of the training data. A sample is a textpair object,
        its class label is found in textpair.target.
    train_samples : numpy array of shape [n_samples,n_features]
        Feature values of the training data. A sample is a textpair object;
        after vectorize() its values are stored in textpair.feature_vector.
    test_targets : numpy array of shape [n_samples]
        Class labels of the test data.
    test_samples : numpy array of shape [n_samples,n_features]
        Feature values of the test data.
    """

    def __init__(self, data=None, data_list=None):
        self.clf = None
        if data is not None:
            self.data_list = [data]
        elif data_list is not None:
            self.data_list = data_list
        else:
            # Previously left undefined, which made every setter crash with
            # an AttributeError when the model was built without data.
            self.data_list = []
        self.classifier_list = [
            "svm_linear", "svm_poly", "naive_bayes", "decision_tree",
            "nearest_centroid", "k_neighbors", "radius_neighbors"
        ]
        self.__train_data_set = False
        # Start with empty data so the "nothing to train on" guards below
        # raise their dedicated exception instead of an AttributeError.
        self.train_data = None
        self.test_data = None
        self.train_samples = np.array([])
        self.train_targets = np.array([])
        self.test_samples = np.array([])
        self.test_targets = np.array([])

    def set_train_data(self, data_name):
        """Setter for training data.

        Walk through data_list and set the data object whose data.name
        equals ``data_name`` as train_data.

        Parameter
        ---------
        data_name : string
            Name of the data object that should become the training data.
        """
        data_in_list = False
        for data in self.data_list:
            if data.name == data_name:
                print(data_name + " is in model_data_list")
                self.train_data = data
                self.train_samples, self.train_targets = \
                    self.fill_feature_target(data)
                print(data_name + " is set as train_data")
                data_in_list = True
        if data_in_list:
            self.__train_data_set = True
        else:
            print(data_name + " not in model_data_list ")

    def set_test_data(self, data_name):
        """Setter for test data.

        Walk through data_list and set the data object whose data.name
        equals ``data_name`` as test_data.

        Notes
        -----
        Training data has to be set before test data, because some features
        need skeletons that have to be built before seeing the test data.
        see reference: bag_of_pos.py, bag_of_words.py, tf_idf.py

        Parameter
        ---------
        data_name : string
            Name of the data object that should become the test data.
        """
        if self.__train_data_set and self.train_data.name == data_name:
            self.test_data = self.train_data
            # Share the already vectorized arrays; without this predict()
            # had no test_samples / test_targets to look at.
            self.test_samples = self.train_samples
            self.test_targets = self.train_targets
            print("train_data and test_data from one data_set")
        elif not self.__train_data_set:
            print("please set train_data first")
        else:
            data_in_list = False
            for data in self.data_list:
                if data.name == data_name:
                    print(data_name + " is in model_data_list")
                    self.test_data = data
                    self.test_samples, self.test_targets = \
                        self.fill_feature_target(data)
                    data_in_list = True
                    print(data_name + " is set as test_data")
            if not data_in_list:
                print(data_name + " not in model_data_list ")

    def fill_feature_target(self, data):
        """Build the sample and target arrays for a data object.

        The sklearn classifiers need numpy arrays:
        class labels of shape [n_samples] and feature values of shape
        [n_samples,n_features]. Every textpair of data.real_data is
        vectorized to build them.

        Note
        ----
        When training data is already set, the features fitted on the
        training data (and its bag-of-words model, if one of the
        bag-of-words style features is used) are attached to ``data`` first,
        so test data does not have to be prepared manually in main.py.

        Parameter
        ---------
        data : Data
            Data object whose data.real_data should be vectorized.

        Returns
        -------
        (samples, targets) : tuple of numpy arrays
        """
        if self.__train_data_set:
            fitted = self.train_data.features_fit
            for feature in fitted:
                if feature in ("bag_of_words", "bag_of_pos", "tf_idf"):
                    data.bow_model = self.train_data.bow_model
            print(fitted)
            data.attach_feature_list(fitted)

        # Vectorizing is identical for training and test data.
        sample_list = []
        target_list = []
        for textpair in data.real_data.values():
            textpair.vectorize()
            target_list.append(textpair.target)
            sample_list.append(textpair.feature_vector)
        return np.array(sample_list), np.array(target_list)

    def set_classifier(self, classifier_name):
        """Setter for clf.

        Build the classifier instance that belongs to ``classifier_name``.

        Parameter
        ---------
        classifier_name : string
            One of the names in classifier_list.

        Raises
        ------
        ClassifierNotExistException
            If the name is not known.
        """
        # NOTE(review): class_weight="auto" only exists in old scikit-learn
        # releases (later ones use "balanced"); kept to match the release
        # this file targets (see the KFold call below).
        if classifier_name == "svm_linear":
            self.clf = svm.SVC(kernel="linear", class_weight="auto")
        elif classifier_name == "svm_poly":
            self.clf = svm.SVC(kernel="poly", class_weight="auto")
        elif classifier_name == "naive_bayes":
            self.clf = GaussianNB()
        elif classifier_name == "decision_tree":
            self.clf = tree.DecisionTreeClassifier()
        elif classifier_name == "nearest_centroid":
            self.clf = NearestCentroid()
        elif classifier_name == "k_neighbors":
            self.clf = KNeighborsClassifier(n_neighbors=100)
        elif classifier_name == "radius_neighbors":
            self.clf = RadiusNeighborsClassifier(radius=1.0, outlier_label=1)
        else:
            raise ClassifierNotExistException(classifier_name)

    def _fraction_count(self, fraction):
        """Return how many training samples make up ``fraction`` percent."""
        return int(round(
            (float(len(self.train_targets)) / float(100)) * float(fraction),
            0))

    def _print_baseline(self, test_targets):
        """Print label counts and the majority-class baseline of the targets."""
        null = 0
        eins = 0
        for target in test_targets:
            if target == 0:
                null += 1
            else:
                eins += 1
        # The baseline is the share of the more frequent of the two groups.
        baseline = float(max(null, eins)) / (float(null) + float(eins))
        print("Anzahl 0: %s" % null)
        print("Anzahl 1: %s" % eins)
        print("Baseline: %s" % baseline)
        print("-------------------------------")

    def train(self, fraction):
        """Train the model.

        Train the classifier with the wanted fraction of the training data.

        Parameter
        -------
        fraction : int
            A number from 0 to 100: the percentage of the training data
            (taken from the front) used for fitting the classifier.

        Raises
        ------
        NoClassifierException
            If no classifier has been set.
        EmptyFeaturesEmptyTargetsException
            If both the training samples and targets are empty.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        else:
            count = self._fraction_count(fraction)
            self.clf.fit(self.train_samples[:count],
                         self.train_targets[:count])

    def predict(self, sample):
        """Predict a given sample.

        Note
        ----
        Requires a trained (fitted) model.

        Parameters
        ----------
        sample : numpy array of shape [n_samples,n_features]

        Returns
        -------
        The prediction of self.clf.predict(sample), i.e. the predicted class
        label(s). For a textpair object it can be 0 or 1.

        Raises
        ------
        NoClassifierException
            If no classifier has been set.
        EmptyFeaturesEmptyTargetsException
            If both the test samples and targets are empty.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.test_targets.size == 0 and self.test_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        else:
            return self.clf.predict(sample)

    def evaluate_cross_validation(self, folds):
        """Evaluate the classifier through a cross-validation.

        Perform a cross-validation on the set training data and measure the
        accuracy of every fold.

        Note
        ----
        The cross-validation is performed on the training data, not on the
        test data. So set your data as training data if you want to perform
        a cross-validation.

        Parameter
        ---------
        folds : int
            Number of folds for the cross-validation.

        Returns
        -------
        accuracy_list : array, shape = [float acc score1, float acc score2, ...]
            Accuracy scores of all iterations.
        acc_mean : float
            Mean accuracy over all iterations.

        Raises
        ------
        NoClassifierException, EmptyFeaturesEmptyTargetsException
            As for train().
        FoldSizeToBigException
            If there are more folds than training samples.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        elif folds > len(self.train_samples):
            raise FoldSizeToBigException(folds, self.train_samples)
        else:
            # Old-style KFold signature (iterable of index pairs).
            kf = KFold(len(self.train_samples), n_folds=folds)
            accuracy_list = []
            for train, test in kf:
                x_train, x_test = \
                    self.train_samples[train], self.train_samples[test]
                y_train, y_test = \
                    self.train_targets[train], self.train_targets[test]
                self.clf.fit(x_train, y_train)
                accuracy_list.append(accuracy_score(
                    np.array(y_test), np.array(self.clf.predict(x_test))))
            acc_mean = sum(accuracy_list) / len(accuracy_list)
            return accuracy_list, acc_mean

    def evaluate_classification_report(self, fraction):
        """Print a detailed classification report.

        Uses the set data objects to train and test the classifier and
        prints the label counts, the majority baseline and the accuracy
        score on the shell.

        Note
        ----
        There are two scenarios :
        1. Training data and test data are the same data object (their names
           are equal). The training data is divided by ``fraction``: with 80
           the first 80 percent are used for training and the last 20 percent
           for testing; with 100 the classifier is trained and tested on the
           whole data object. No normalization is applied.
        2. Training data and test data are different data objects. Both are
           normalized; use a fraction of 100 to train on the whole training
           data.

        Parameter
        ---------
        fraction : int
            A number from 0 to 100: the percentage of the training data used
            for training the classifier.

        Raises
        ------
        NoClassifierException, EmptyFeaturesEmptyTargetsException
            As for train().
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException

        count_train = self._fraction_count(fraction)
        if self.test_data.name == self.train_data.name:
            print("train_data and test_data from one data_set")
            # With a fraction of 100 count_predict is 0 and the
            # [-count_predict:] slices below select the whole array, i.e.
            # the classifier is tested on everything it was trained on.
            count_predict = len(self.train_targets) - count_train
            print("count_train: %s" % count_train)
            print("count_predict: %s" % count_predict)
            # Summarize here because one data object is divided into
            # training and test part inside this method.
            # list() keeps the slicing valid where values() is a view.
            textpairs = list(self.train_data.real_data.values())
            print("##########train_data summarize##########")
            summarize_textpair(textpairs[:count_train])
            print("##########test_data summarize##########")
            summarize_textpair(textpairs[-count_predict:])
            train_samples = self.train_samples[:count_train]
            train_targets = self.train_targets[:count_train]
            test_samples = self.train_samples[-count_predict:]
            test_targets = self.train_targets[-count_predict:]
        else:
            # Different data objects: work with normalized values.
            norma = preprocessing.normalize(self.train_samples)
            print("count_train: %s" % count_train)
            print("count_predict: %s" % len(self.test_targets))
            train_samples = norma[:count_train]
            train_targets = self.train_targets[:count_train]
            test_samples = preprocessing.normalize(self.test_samples)
            test_targets = self.test_targets

        # Training
        self.clf.fit(train_samples, train_targets)
        # Testing
        test_targets_predicted = self.clf.predict(test_samples)
        self._print_baseline(test_targets)
        # Accuracy score of the predicted samples (the label keeps its
        # trailing space plus the separator of the original output).
        print("accuracy_score:  %s" % accuracy_score(
            test_targets, test_targets_predicted))
def detect(containers, fields, time_range, learning=True, usual_file=None, stdout=None):
    """Learn from, or look for anomalies in, the events of ``containers``.

    In learning mode the new data is added to the stored "usual" data and the
    function returns. Otherwise a radius-neighbors classifier is fitted on
    the stored usual data and every new sample that gets the outlier label
    -1 is handed to ``do_detection``.

    NOTE(review): the nesting below was reconstructed from whitespace-mangled
    source, and the function appears to stop mid-flow (``detected`` is never
    used) -- verify both against the original file.

    Parameters
    ----------
    containers : iterable
        Objects exposing ``module_data['name']``, ``database_events`` and
        ``database_info`` (both indexed by ``time_range``).
    fields :
        Passed to ``normalize``; rebound to ``usual['fields']`` later on.
    time_range :
        Key selecting the per-container data; ``'seconds'`` prints a banner.
    learning : bool
        When true, only update and save the usual data.
    usual_file :
        Not used in the visible code.
    stdout : file-like, optional
        When given, replaces ``sys.stdout`` for the whole process (it is not
        restored in the visible code).
    """
    global maxim_distances
    if stdout is not None:
        sys.stdout = stdout
    data = dict()
    knowledge = dict()
    exceptions = load_exceptions()
    if time_range == 'seconds':
        print '-' * 50
        print "SECONDS"
        print '-' * 50
    # Deep copies so later processing cannot alter the containers' own data.
    for container in containers:
        data[container.module_data['name']] = copy.deepcopy(
            container.database_events[time_range])
        knowledge[container.module_data['name']] = copy.deepcopy(
            container.database_info[time_range])
    usual = load_usual_data(time_range)
    usual, changed = normalize(data, fields, usual)
    if changed and not learning:
        save_usual_events_json(usual, time_range)
    if learning:
        add_new_data_to_cluster(data, usual, knowledge)
        save_usual_events_json(usual, time_range)
        return
    # -----------------------------------
    # NOT LEARNING ---> PREDICT
    # -----------------------------------
    maxims, averages = get_maxims_and_averages(knowledge)
    usual_to_fit = normalize_fit_input(usual['data'], usual['events'], usual['fields'], averages, maxims)
    # -----------------------------------
    # PREPARING TO PREDICT
    # -----------------------------------
    new_timestamps = eventdata_to_timestamps(data, usual)
    new_data = from_timestamps_to_data(new_timestamps)
    new_data_to_fit = normalize_fit_input(new_data, usual['events'], usual['fields'], averages, maxims)
    # Samples with no usual sample within ANOMALY_RADIUS (measured with the
    # custom ``similarity`` metric) are labelled -1.
    classifier = RadiusNeighborsClassifier(radius=ANOMALY_RADIUS, metric=similarity, outlier_label=-1)
    print "PREDICTING"
    t = time.time()
    # Cluster (and persist) when there are fewer labels than usual samples.
    if len(usual['labels']) < len(usual_to_fit):
        cluster_data(usual, usual_to_fit)
        save_usual_events_json(usual, time_range)
    classifier.fit(usual_to_fit, usual['labels'])
    labels = classifier.predict(new_data_to_fit)
    print "PREDICTION TOOK", time.time() - t, "seconds"
    print 'maxim distances:', sorted(maxim_distances, reverse=True)[:10]
    print 'NEW SAMPLES LABELS: ', labels
    # -----------------------------------
    # DONE PREDICTION
    # -----------------------------------
    events = usual['events']
    fields = usual['fields']
    # Walk the new samples from last to first; -1 marks an outlier.
    for i in range(len(new_data) - 1, -1, -1):
        if labels[i] == -1:
            detected = do_detection(new_data[i], new_timestamps, maxims, new_data_to_fit[i], events, fields, usual, exceptions)
# Build the grid search and run it with 5-fold cross-validation.
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0)
classifier.fit(features_standardized, target)

# Best neighbourhood size (k).
classifier.best_estimator_.get_params()["knn__n_neighbors"]

# The size of k has a strong effect on the performance of a KNN classifier.
# In machine learning we always try to balance bias and variance, and k
# influences that balance very visibly. If k = n (n being the number of
# observations) the bias is large and the variance small; if k = 1 the bias
# is small but the variance large. Only a k that trades bias off against
# variance gives the best KNN classifier. In this solution GridSearchCV runs
# a 5-fold cross-validation over KNN classifiers with different k values to
# find the k that produces the best classifier.

# 15.4 Creating a radius-based nearest neighbor classifier
from sklearn import datasets
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler

iris = datasets.load_iris()
features, target = iris.data, iris.target

standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Train a radius-based nearest neighbor classifier.
rnn = RadiusNeighborsClassifier(radius=.5, n_jobs=1)
rnn.fit(features_standardized, target)

# Create two observations.
new_observations = [[0.75, 0.75, 0.75, 0.75], [1, 1, 1, 1]]

# Predict the class of both observations.
rnn.predict(new_observations)

# Radius-based nearest neighbor classifiers are less common; an observation
# is classified from the classes of all observations within a radius r.
# In scikit-learn, RadiusNeighborsClassifier is very similar to
# KNeighborsClassifier, except for two parameters:
# 1) radius: the radius that decides whether an observation counts as a
#    neighbour of the target observation. Unless there is a very good reason
#    to fix it, tune it during model selection like any other hyperparameter.
# 2) outlier_label: the label given to an observation that has no other
#    observation within the radius. This is a useful way to spot outliers.
y_test = labels[272:, i] else: X_train = training y_train = labels[:172, i] X_test = sampletest y_test = labels[172:, i] posterior = np.empty([100, 72, 6]) box = np.zeros([6, 6]) for j in range(4, 5): for k in range(1, 2): accuracy = np.zeros(100) for m in range(0, 100): rnc = RadiusNeighborsClassifier(radius=j, leaf_size=k) rnc.fit(X_train, y_train) y_pred = rnc.predict(X_test) n = 0 for i in range(0, len(y_pred)): if y_pred[i] == y_test[i]: # print i, y_pred[i], y_test[i] n = n + 1 accuracy[m] = accuracy[m] + 1 box[y_test[i] - 1, y_pred[i] - 1] = box[y_test[i] - 1, y_pred[i] - 1] + 1 # posterior[m] = knc.predict_proba(X_test) print j, k, np.mean(accuracy) / 0.72, np.std(accuracy) / 0.72 # print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0 """ means = np.empty([72,6]) stds = np.empty([72,6]) grid = np.empty([6,6])
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Dimension reduction: the projection is fitted on the training split only
# and then applied to the test split.
pca = PCA(n_components=10000)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
X_train.shape
y_train = list(y_train)

# Radius neighbors classifier.
clf = RadiusNeighborsClassifier(
    radius=1.0,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

# Decision tree classifier (seeded for reproducibility).
clf = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

# k-nearest neighbors classifier.
clf = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
# Notebook-style inspection of the four splits (bare expressions).
x_train
y_train
x_test
y_test

from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

# The single test row (index 2) that is predicted individually below.
pr = x_test[2:3][:4]

# --- k-nearest neighbors ---
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)
knn.predict(x_test)
y_test
knn.predict(pr)
y_test[2:3][:4]

# --- radius neighbors ---
knn_r = RadiusNeighborsClassifier(radius=5)
knn_r.fit(x_train, y_train)
knn_r.score(x_test, y_test)
knn_r.predict(x_test)
y_test
knn_r.predict(x_test[2:3][:4])
y_test[2:3][:4]
# Load libraries.
from sklearn import datasets
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load the data.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Create the StandardScaler instance and standardise the features.
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Train the radius-based neighbors classifier on all available cores.
rnn = RadiusNeighborsClassifier(radius=.5, n_jobs=-1)
rnn.fit(features_standardized, target)

# Create a single new observation (one row, four features).
new_observations = [[1, 1, 1, 1]]

# Predict the class of that observation.
rnn.predict(new_observations)
# Inspect the neighbours of a target sample (distances + positions):
# print(clf.radius_neighbors(X[0,:].reshape(1, -1), return_distance=True))
# Inspect the neighbour graph of a target sample (sparse matrix holding
# positions + distances, or connectivity):
# print(clf.radius_neighbors_graph(X[0].reshape(1, -1), mode='distance'))

# Visualise the predictions (decision boundary).
from matplotlib.colors import ListedColormap

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Bounds of the training set, padded by 1 on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

# Build a dense grid (step 0.02) to act as test set, then predict on it.
x_axis = np.arange(x_min, x_max, 0.02)
y_axis = np.arange(y_min, y_max, 0.02)
xx, yy = np.meshgrid(x_axis, y_axis)
new_x = np.c_[xx.ravel(), yy.ravel()]
y_pred = clf.predict(new_x)

# Draw the predicted classes of the grid ...
ax = plt.subplot()
ax.pcolormesh(xx, yy, y_pred.reshape(xx.shape), cmap=cmap_light)
# ... and all training samples on top of it.
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# NOTE(review): the title mentions k and weights; confirm it still matches
# how ``clf`` was configured.
plt.title("3-Class classification (k = 15, weights = 'distance')")
plt.show()
# Create and train the Radius Neighbors Classifier on the known encodings.
clf = RadiusNeighborsClassifier(
    radius=0.5,
    weights='distance',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    outlier_label=None,
    metric_params=None,
    n_jobs=None,
)
clf.fit(encodings, names)

# Load the test image with unknown faces into a numpy array.
test_image = face_recognition.load_image_file('test_image.jpg')

# Find all the faces in the test image using the default HOG-based model.
face_locations = face_recognition.face_locations(test_image)
no = len(face_locations)
print("Number of faces detected: ", no)

# Predict all the faces in the test image using the trained classifier.
print("Found:")
# Encode every detected face once, instead of re-encoding the whole image
# on every loop iteration.
test_image_encodings = face_recognition.face_encodings(
    test_image, known_face_locations=face_locations)
for i in range(no):
    test_image_enc = test_image_encodings[i]
    try:
        # predict() expects a 2-D array of samples. Passing the bare 1-D
        # encoding raises ValueError on current scikit-learn, which the
        # handler below would misreport as "No matches Found".
        name = clf.predict([test_image_enc])
    except ValueError:
        # With outlier_label=None, predict() raises when no known face lies
        # within the radius of this encoding.
        print('No matches Found')
    else:
        print(*name)
#plt.xticks(()) #plt.yticks(()) #plt.axis([-3, 3, -3, 3]) biz['svm_pred']=(biz.expensive>clf.predict(X)).astype(int) plt.scatter(x=biz[biz.svm_pred==1].X,y=biz[biz.svm_pred==1].Y, s=20, c='g') print("Prop of expensive businesses seen as gentrifiers [%.2f]" %(biz['svm_pred'].sum()/biz.expensive.sum())) print("Prop of expensive businesses seen as gentrifiers [%.2f]" %(biz['svm_pred'].sum()/len(biz.expensive))) #biz['gentrifier']=(biz.expensive>biz.svm_pred).astype(int) ##################################################################################################################################### ################################################################ Nearest Neighbor ################################################### r=.00025 #A block is .001 and two blocks are .003; therefore, .011 scans about 8 blocks in diameter. neigh = RadiusNeighborsClassifier(radius=r) #from qGis nneighbor analysis neigh.fit(X, Y) predictions=neigh.predict(X) plt.scatter(X.iloc[:,0], X.iloc[:,1], s=30, c=Y, cmap=plt.cm.Paired); plt.title('True labels') plt.subplots_adjust(left=0, bottom=0, right=1, top=.95, wspace=0, hspace=0) plt.figure(); plt.scatter(X.iloc[:,0], X.iloc[:,1], s=30, c=predictions, cmap=plt.cm.Paired); plt.title('Predicted labels, rad=%.3f' %r) plt.subplots_adjust(left=0, bottom=0, right=1, top=.95, wspace=0, hspace=0) biz['rnn_gentrifier']=(biz.expensive>predictions).astype(int) plt.scatter(x=biz[biz.rnn_gentrifier==1].X,y=biz[biz.rnn_gentrifier==1].Y, s=20, c='g') print("Prop of expensive businesses seen as gentrifiers [%.2f]" %((biz.rnn_gentrifier.sum()/biz.expensive.sum()))) print("Prop of expensive businesses seen as gentrifiers [%.2f]" %((biz.rnn_gentrifier.sum()/len(biz.expensive)))) ##################################################################################################################################### ################################################################ SAVE RESULTS ######################################################
#filename = "serialized_y_test_" + country + ".pck"
# filepath = os.path.join(
#     here, 'persisted_models', country, filename)
#y_test = joblib.load(filepath)
#print("loading data finished")

# Radius-neighbours classifier, see
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html
# Test samples with no training sample within the radius are labelled 'Z'.
classifier_radius = RadiusNeighborsClassifier(
    radius=5,
    metric='euclidean',
    weights='distance',
    outlier_label='Z',
)
classifier_radius.fit(X_train, y_train)

y_pred_radius = classifier_radius.predict(X_test)
print("radius prediction")
print(y_pred_radius)

# Despite the heading, what follows is the confusion matrix and the
# per-class classification report.
print("Accuracy radius classifier")
print(confusion_matrix(y_test, y_pred_radius))
print(classification_report(y_test, y_pred_radius))

# joblib - save the fitted model next to this script, under
# persisted_models/<country>/
here = os.path.dirname(os.path.abspath(__file__))
filenameRadiusClassifier = "".join(
    ("serialized_radius_classifier_", country, ".pck"))
filepathRadiusClassifier = os.path.join(
    here, 'persisted_models', country, filenameRadiusClassifier)
joblib.dump(classifier_radius, filepathRadiusClassifier)
class Model(object):
    """Text-classification system built on scikit-learn classifiers.

    For reference see: http://scikit-learn.org/stable/

    The model works on Data objects: it selects the training and test
    data, builds a classifier and provides evaluation methods.

    Parameters
    ----------
    data : Data, optional
        A single data object with filled ``data.real_data``.
    data_list : list of Data, optional
        Several data objects with filled ``data.real_data``. Ignored when
        ``data`` is given. When neither is given the model starts with an
        empty list.

    Attributes
    ----------
    clf : classifier object or None
        The selected scikit-learn classifier, see
        http://scikit-learn.org/stable/supervised_learning.html
    classifier_list : list of str
        Names accepted by set_classifier().
    data_list : list of Data
        Data objects the setters can choose from.
    train_data, test_data : Data or None
        Data objects selected as training / test data.
    train_samples, test_samples : numpy array of shape [n_samples, n_features]
        Feature vectors (``textpair.feature_vector``) of the selected data.
        Empty until the corresponding setter succeeded.
    train_targets, test_targets : numpy array of shape [n_samples]
        Class labels (``textpair.target``) of the selected data.
        Empty until the corresponding setter succeeded.
    """

    def __init__(self, data=None, data_list=None):
        self.clf = None
        if data is not None:
            self.data_list = [data]
        elif data_list is not None:
            self.data_list = data_list
        else:
            # Without any data the setters should report "not in
            # model_data_list" rather than fail with AttributeError.
            self.data_list = []
        self.classifier_list = ["svm_linear", "svm_poly", "naive_bayes",
                                "decision_tree", "nearest_centroid",
                                "k_neighbors", "radius_neighbors"]
        self.__train_data_set = False
        # Start from empty arrays so that train()/predict()/evaluate_*()
        # reach their EmptyFeaturesEmptyTargetsException check instead of
        # raising AttributeError when no data has been set yet.
        self.train_data = None
        self.test_data = None
        self.train_samples = np.array([])
        self.train_targets = np.array([])
        self.test_samples = np.array([])
        self.test_targets = np.array([])

    def set_train_data(self, data_name):
        """Select the training data by name.

        Walks through data_list and uses every data object whose
        ``data.name`` equals ``data_name`` as training data (the last
        match wins).

        Parameters
        ----------
        data_name : string
            Name of the data object that should become the training data.
        """
        data_in_list = False
        for data in self.data_list:
            if data.name == data_name:
                print(data_name + " is in model_data_list")
                self.train_data = data
                self.train_samples, self.train_targets = self.fill_feature_target(data)
                print(data_name + " is set as train_data")
                data_in_list = True
        if data_in_list:
            self.__train_data_set = True
        else:
            print(data_name + " not in model_data_list ")

    def set_test_data(self, data_name):
        """Select the test data by name.

        Notes
        -----
        Training data has to be set before test data, because some
        features need skeletons that are built from the training data.
        See bag_of_pos.py, bag_of_words.py, tf_idf.py.

        Parameters
        ----------
        data_name : string
            Name of the data object that should become the test data.
        """
        if self.__train_data_set and self.train_data.name == data_name:
            self.test_data = self.train_data
            # Same data object: reuse the already vectorized arrays so
            # that predict() finds non-empty test samples/targets.
            self.test_samples = self.train_samples
            self.test_targets = self.train_targets
            print("train_data and test_data from one data_set")
        elif not self.__train_data_set:
            print("please set train_data first")
        else:
            data_in_list = False
            for data in self.data_list:
                if data.name == data_name:
                    print(data_name + " is in model_data_list")
                    self.test_data = data
                    self.test_samples, self.test_targets = self.fill_feature_target(data)
                    data_in_list = True
                    print(data_name + " is set as test_data")
            if not data_in_list:
                print(data_name + " not in model_data_list ")

    def fill_feature_target(self, data):
        """Vectorize a data object into sample and target arrays.

        Every textpair in ``data.real_data`` is vectorized; its
        ``feature_vector`` and ``target`` are collected into numpy arrays
        as required by the scikit-learn classifiers.

        Notes
        -----
        When training data is already set, the features fitted on the
        training data (and its ``bow_model`` for the bag-of-words style
        features) are attached to ``data`` first, so test data does not
        have to be prepared manually.

        Parameters
        ----------
        data : Data
            Data object whose ``real_data`` should be vectorized.

        Returns
        -------
        samples : numpy array of shape [n_samples, n_features]
        targets : numpy array of shape [n_samples]
        """
        if self.__train_data_set:
            skeleton_features = ("bag_of_words", "bag_of_pos", "tf_idf")
            for feature in self.train_data.features_fit:
                if feature in skeleton_features:
                    data.bow_model = self.train_data.bow_model
            print(self.train_data.features_fit)
            data.attach_feature_list(self.train_data.features_fit)

        sample_list = []
        target_list = []
        for textpair in data.real_data.values():
            textpair.vectorize()
            target_list.append(textpair.target)
            sample_list.append(textpair.feature_vector)
        return np.array(sample_list), np.array(target_list)

    def set_classifier(self, classifier_name):
        """Instantiate the classifier with the given name and store it in clf.

        Parameters
        ----------
        classifier_name : string
            One of the names in classifier_list.

        Raises
        ------
        ClassifierNotExistException
            If ``classifier_name`` is not a known classifier.
        """
        # NOTE(review): class_weight="auto" was replaced by "balanced" in
        # later scikit-learn releases -- confirm against the installed version.
        if classifier_name == "svm_linear":
            self.clf = svm.SVC(kernel="linear", class_weight="auto")
        elif classifier_name == "svm_poly":
            self.clf = svm.SVC(kernel="poly", class_weight="auto")
        elif classifier_name == "naive_bayes":
            self.clf = GaussianNB()
        elif classifier_name == "decision_tree":
            self.clf = tree.DecisionTreeClassifier()
        elif classifier_name == "nearest_centroid":
            self.clf = NearestCentroid()
        elif classifier_name == "k_neighbors":
            self.clf = KNeighborsClassifier(n_neighbors=100)
        elif classifier_name == "radius_neighbors":
            self.clf = RadiusNeighborsClassifier(radius=1.0, outlier_label=1)
        else:
            raise ClassifierNotExistException(classifier_name)

    def _fraction_count(self, fraction):
        """Return how many training samples make up `fraction` percent."""
        return int(round((float(len(self.train_targets)) / float(100)) * float(fraction), 0))

    @staticmethod
    def _print_baseline(targets):
        """Print the class counts and the majority-class baseline of `targets`."""
        null = 0
        eins = 0
        for target in targets:
            if target == 0:
                null += 1
            else:
                eins += 1
        total = null + eins
        # Guard against an empty target set, which would divide by zero.
        baseline = float(max(null, eins)) / float(total) if total else 0.0
        print("Anzahl 0: %s" % null)
        print("Anzahl 1: %s" % eins)
        print("Baseline: %s" % baseline)
        print("-------------------------------")

    def train(self, fraction):
        """Fit the classifier on the first `fraction` percent of the training data.

        Parameters
        ----------
        fraction : int
            Number from 0 to 100: the share of the training data used for
            fitting.

        Raises
        ------
        NoClassifierException
            If no classifier has been set.
        EmptyFeaturesEmptyTargetsException
            If both training samples and training targets are empty.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        else:
            count = self._fraction_count(fraction)
            self.clf.fit(self.train_samples[:count], self.train_targets[:count])

    def predict(self, sample):
        """Predict the class label(s) of the given sample(s).

        Requires a fitted classifier.

        Parameters
        ----------
        sample : numpy array of shape [n_samples, n_features]

        Returns
        -------
        The result of ``self.clf.predict(sample)``, i.e. the predicted class
        labels. For textpair objects a label is 0 or 1.

        Raises
        ------
        NoClassifierException
            If no classifier has been set.
        EmptyFeaturesEmptyTargetsException
            If both test samples and test targets are empty.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.test_targets.size == 0 and self.test_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        else:
            return self.clf.predict(sample)

    def evaluate_cross_validation(self, folds):
        """Run a k-fold cross-validation on the training data.

        Notes
        -----
        The cross-validation is performed on the training data, not on the
        test data. Set your data as training data to cross-validate it.

        Parameters
        ----------
        folds : int
            Number of folds.

        Returns
        -------
        accuracy_list : list of float
            Accuracy score of every fold.
        acc_mean : float
            Mean of the accuracy scores.

        Raises
        ------
        NoClassifierException
            If no classifier has been set.
        EmptyFeaturesEmptyTargetsException
            If both training samples and training targets are empty.
        FoldSizeToBigException
            If ``folds`` exceeds the number of training samples.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        elif folds > len(self.train_samples):
            raise FoldSizeToBigException(folds, self.train_samples)
        else:
            kf = KFold(len(self.train_samples), n_folds=folds)
            accuracy_list = []
            for train, test in kf:
                x_train, x_test = self.train_samples[train], self.train_samples[test]
                y_train, y_test = self.train_targets[train], self.train_targets[test]
                self.clf.fit(x_train, y_train)
                accuracy_list.append(
                    accuracy_score(np.array(y_test), np.array(self.clf.predict(x_test))))
            acc_mean = sum(accuracy_list) / len(accuracy_list)
            return accuracy_list, acc_mean

    def evaluate_classification_report(self, fraction):
        """Fit, predict and print baseline and accuracy for the set data.

        Notes
        -----
        There are two scenarios:

        1. Training and test data are the same data object (same name).
           The first `fraction` percent are used for fitting and the last
           ``len - count_train`` samples for testing. With a fraction of
           100 the model is fitted and tested on the whole data object.
           No normalization is applied.
        2. Training and test data are different data objects. The first
           `fraction` percent of the training data are used for fitting
           (use 100 to fit on all of it) and the whole test data for
           testing. Both are normalized.

        Parameters
        ----------
        fraction : int
            Number from 0 to 100: the share of the training data used for
            fitting.

        Raises
        ------
        NoClassifierException
            If no classifier has been set.
        EmptyFeaturesEmptyTargetsException
            If both training samples and training targets are empty.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException

        count_train = self._fraction_count(fraction)
        if self.test_data.name == self.train_data.name:
            print("train_data and test_data from one data_set")
            # With a 100 % fraction count_predict is 0 and [-0:] selects
            # everything, so the model is tested on the data it was
            # fitted on.
            count_predict = len(self.train_targets) - count_train
            print("count_train: %s" % count_train)
            print("count_predict: %s" % count_predict)

            # Summaries are printed here because the split into training
            # and test part is only defined in this method.
            textpairs = list(self.train_data.real_data.values())
            print("##########train_data summarize##########")
            summarize_textpair(textpairs[:count_train])
            print("##########test_data summarize##########")
            summarize_textpair(textpairs[-count_predict:])

            train_samples = self.train_samples[:count_train]
            train_targets = self.train_targets[:count_train]
            test_samples = self.train_samples[-count_predict:]
            test_targets = self.train_targets[-count_predict:]
        else:
            print("count_train: %s" % count_train)
            print("count_predict: %s" % len(self.test_targets))

            # Normalized variant; to work without normalization use
            # self.train_samples[:count_train] and self.test_samples.
            train_samples = preprocessing.normalize(self.train_samples)[:count_train]
            train_targets = self.train_targets[:count_train]
            test_samples = preprocessing.normalize(self.test_samples)
            test_targets = self.test_targets

        # Training
        self.clf.fit(train_samples, train_targets)
        # Testing
        test_targets_predicted = self.clf.predict(test_samples)

        self._print_baseline(test_targets)
        # Accuracy score of the predicted samples
        print("accuracy_score:  %s" % accuracy_score(test_targets, test_targets_predicted))
for i in range(nr_of_neighbors):
    # neighbors[1][0][i] is used as the row position of the i-th neighbour
    # in data_df (presumably the index array of a kneighbors() result --
    # confirm against the code that builds `neighbors`).
    neighbor_row = data_df.iloc[neighbors[1][0][i], :]
    print(neighbor_row)

# sort by distance
# get the first n elements, as the first n closest neighbors
#outputlist = sorted(neighbors[0], key=itemgetter(0))
#print("sorted array")
# print(outputlist)

# Radius-neighbours classifier, see
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html
classifier_radius = RadiusNeighborsClassifier(radius=5)
classifier_radius.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_radius = classifier_radius.predict(X_test)
for report_item in ("radius prediction", y_pred_radius, "Accuracy radius classifier"):
    print(report_item)
print(confusion_matrix(y_test, y_pred_radius))
print(classification_report(y_test, y_pred_radius))

# Evaluate on the separately supplied sample(s) new_X / new_y.
y_pred_radius_for_one = classifier_radius.predict(new_X)
for report_item in ("radius prediction for one", y_pred_radius_for_one,
                    "Accuracy radius classifier for one"):
    print(report_item)
print(confusion_matrix(new_y, y_pred_radius_for_one))
def par(X_tr, y_tr, X_te, r):
    """Fit a radius-neighbours classifier and predict labels for X_te.

    Parameters
    ----------
    X_tr, y_tr : training samples and their labels.
    X_te : samples to label.
    r : radius passed to RadiusNeighborsClassifier.

    Returns
    -------
    The predictions returned by the fitted classifier for X_te.
    """
    classifier = RadiusNeighborsClassifier(radius=r)
    classifier.fit(X_tr, y_tr)
    return classifier.predict(X_te)
def SequentialRadiusNeighborsClassifier(epsilon, X_train, X_test, Y_train, add, alg):
    """Label the test samples one at a time, nearest-to-training first.

    In every round the still unlabelled test sample that is closest to the
    current working training set is picked. Its neighbours within
    ``epsilon`` in the working training set decide the label:

    * no neighbour: the sample gets a brand-new label
      (``max(known labels) + 1``) and is always added to the working
      training set;
    * all neighbours share one label: that label is used;
    * mixed labels: ``alg == "srnc"`` uses the distance-weighted radius
      classifier itself; ``"svm"``, ``"LinearSVC"``, ``"dt"``, ``"rf"``,
      ``"gb"``, ``"lr"`` and ``"mlp"`` fit the corresponding classifier
      on the neighbours only. Any other value falls back to the radius
      classifier.

    Parameters
    ----------
    epsilon : float
        Radius used for the neighbourhood queries.
    X_train : array-like of shape [n_train, n_features]
    X_test : numpy array of shape [n_test, n_features]
        Indexed with integer lists, so it has to support fancy indexing.
    Y_train : array-like of shape [n_train]
        Labels; they must support ``max(...) + 1`` for new classes.
    add : int
        When 1, test samples labelled from existing neighbours are appended
        to the working training set as well.
    alg : str
        Strategy for neighbourhoods with mixed labels, see above.

    Returns
    -------
    list
        Predicted label for every test sample, in the order of ``X_test``.
    """
    X_train_temp = np.copy(X_train)
    Y_train_temp = np.copy(Y_train)
    test_size = len(X_test)
    Y_predict = [-1] * test_size
    Y_current = list(set(Y_train))
    test_index = list(range(test_size))
    epsilon_update = epsilon

    # Lazily constructed so that only the requested estimator is referenced.
    local_estimators = {
        "svm": lambda: svm.SVC(),
        "LinearSVC": lambda: LinearSVC(),
        "dt": lambda: DecisionTreeClassifier(),
        "rf": lambda: RandomForestClassifier(n_estimators=10),
        "gb": lambda: GradientBoostingClassifier(n_estimators=10),
        "lr": lambda: LogisticRegression(max_iter=10000),
        "mlp": lambda: MLPClassifier(),
    }

    for _ in range(test_size):
        # Pick the remaining test sample closest to the working training set.
        Knn_temp = NearestNeighbors(n_neighbors=1)
        Knn_temp.fit(X_train_temp)
        min_distances = Knn_temp.kneighbors(X_test[test_index])[0]
        optimal_indice = int(np.argmin(np.mean(min_distances, axis=1)))
        optimal_test = test_index[optimal_indice]
        query = X_test[optimal_test].reshape(1, -1)

        clf = RadiusNeighborsClassifier(radius=epsilon_update, weights='distance').fit(
            X_train_temp, Y_train_temp)
        predict_set = list(clf.radius_neighbors(query)[1][0])

        if len(predict_set) > 0:
            # predict_set indexes the working training set the classifier
            # was fitted on, so the neighbours must be read from
            # X_train_temp / Y_train_temp (not from unrelated X / Y arrays,
            # which go out of sync as soon as samples are appended).
            neighbor_X = X_train_temp[predict_set]
            neighbor_Y = Y_train_temp[predict_set]
            if min(neighbor_Y) == max(neighbor_Y):
                y_predict = min(neighbor_Y)
            else:
                if alg != "srnc" and alg in local_estimators:
                    clf = local_estimators[alg]().fit(neighbor_X, neighbor_Y)
                y_predict = clf.predict(query)[0]
            if add == 1:
                X_train_temp = np.append(X_train_temp, [X_test[optimal_test]], axis=0)
                Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)
        else:
            # Nothing within epsilon: open a new class for this sample.
            y_predict = max(Y_current) + 1
            Y_current.append(y_predict)
            X_train_temp = np.append(X_train_temp, [X_test[optimal_test]], axis=0)
            Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)

        Y_predict[optimal_test] = y_predict
        test_index.remove(optimal_test)
    return Y_predict