def do(train_data, train_label, test_data, test_label=None, adjust_parameters=True, k=5):
    train_data = np.array(train_data).squeeze()
    train_label = np.array(train_label).squeeze()
    test_data = np.array(test_data).squeeze()
    if test_label is not None:
        test_label = np.array(test_label).squeeze()
    if not adjust_parameters:
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=8)
        knn.fit(train_data, train_label)
        predicted_label = knn.predict(test_data)
        if test_label is not None:
            acc = accuracy_score(test_label, predicted_label)
            print('acc is', acc)
        return predicted_label
    else:
        # tuning requires test_label to score each candidate k
        max_acc = 0.0
        max_k = 0
        max_predicted = None
        for cur_k in range(1, 11):  # renamed to avoid shadowing the k parameter
            knn = KNeighborsClassifier(n_neighbors=cur_k, n_jobs=8)
            knn.fit(train_data, train_label)
            predicted_label = knn.predict(test_data)
            acc = accuracy_score(test_label, predicted_label)
            if acc > max_acc:
                max_acc = acc
                max_k = cur_k
                max_predicted = predicted_label
            print('k =', cur_k, 'acc is', acc)
        print('max acc is', max_acc, 'corresponding k is', max_k)
        return max_predicted, max_k
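A minimal way to exercise do() end to end; the synthetic dataset and the 70/30 split below are illustrative assumptions, not part of the original snippet.

# Hedged usage sketch: make_classification and the split are assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# sweep k = 1..10 and keep the best-performing value
pred, best_k = do(X_tr, y_tr, X_te, y_te, adjust_parameters=True)
# or fix k explicitly
pred = do(X_tr, y_tr, X_te, y_te, adjust_parameters=False, k=3)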
class PatchedRawModel:
    def __init__(self):
        self.baseModel = RawModel()
        self.model49 = KNeighborsClassifier(n_neighbors=10)
        self.model35 = KNeighborsClassifier(n_neighbors=10)

    def fit(self, trainExamples):
        self.baseModel.fit(trainExamples)

        X49 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [4, 9]])
        Y49 = [x.Y for x in trainExamples if x.Y in [4, 9]]
        self.model49.fit(X49, Y49)

        X35 = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples if x.Y in [3, 5]])
        Y35 = [x.Y for x in trainExamples if x.Y in [3, 5]]
        self.model35.fit(X35, Y35)

    def predict(self, examples):
        basePredictions = self.baseModel.predict(examples)

        for i, (x, y) in enumerate(zip(examples, basePredictions)):
            if y in [4, 9]:
                # predict() returns an array; take its single element
                specializedPrediction = self.model49.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction
            elif y in [3, 5]:
                specializedPrediction = self.model35.predict(reshape(x.X, (1, x.WIDTH * x.HEIGHT)))[0]
                if specializedPrediction != y:
                    basePredictions[i] = specializedPrediction

        return basePredictions
def plotDecisionBoundry(X, y, y_predicted, modelName):
    X_Train_embedded = TSNE(n_components=2).fit_transform(X)
    print(X_Train_embedded.shape)

    # create meshgrid
    resolution = 1000  # 1000x1000 background pixels
    X2d_xmin, X2d_xmax = np.min(X_Train_embedded[:, 0]), np.max(X_Train_embedded[:, 0])
    X2d_ymin, X2d_ymax = np.min(X_Train_embedded[:, 1]), np.max(X_Train_embedded[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tessellation on a resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(X_Train_embedded, y_predicted)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    # plot
    plt.contourf(xx, yy, voronoiBackground)
    plt.scatter(X_Train_embedded[:, 0], X_Train_embedded[:, 1], c=y.values.flatten())
    plt.title(modelName)
    plt.show()
class KNeighborsClassifierImpl():
    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto',
                 leaf_size=30, p=2, metric='minkowski', metric_params=None,
                 n_jobs=None):
        self._hyperparams = {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm,
            'leaf_size': leaf_size,
            'p': p,
            'metric': metric,
            'metric_params': metric_params,
            'n_jobs': n_jobs}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
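A quick usage sketch for the wrapper; it assumes SKLModel is bound to sklearn's KNeighborsClassifier, which is what the hyperparameter names suggest.

# Assumption: SKLModel aliases sklearn.neighbors.KNeighborsClassifier.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier as SKLModel

X, y = load_iris(return_X_y=True)
clf = KNeighborsClassifierImpl(n_neighbors=3).fit(X, y)
print(clf.predict(X[:5]))              # class labels for the first five rows
print(clf.predict_proba(X[:5]).shape)  # (5, 3): one probability per class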
def compute_cnn(X, y):
    """Condensed nearest neighbor. CNN removes redundant instances,
    keeping the samples near the decision boundaries."""
    classifier = KNeighborsClassifier(n_neighbors=3)
    prots_s = []
    labels_s = []

    classes = np.unique(y)
    classes_ = classes

    for cur_class in classes:
        mask = y == cur_class
        insts = X[mask]
        prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
        labels_s = labels_s + [cur_class]

    classifier.fit(prots_s, labels_s)
    for sample, label in zip(X, y):
        # predict() expects a 2-D array, so reshape the single sample
        if classifier.predict(sample.reshape(1, -1)) != [label]:
            prots_s = prots_s + [sample]
            labels_s = labels_s + [label]
            classifier.fit(prots_s, labels_s)

    X_ = np.asarray(prots_s)
    y_ = np.asarray(labels_s)
    reduction_ = 1.0 - float(len(y_)) / len(y)
    print(reduction_)
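compute_cnn can be exercised on toy data as below; make_blobs is an illustrative assumption, and three centers are used so the initial one-prototype-per-class fit has enough samples for n_neighbors=3.

# Hedged smoke test on synthetic blobs (dataset choice is an assumption).
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=200, centers=3, cluster_std=2.0, random_state=0)
compute_cnn(X, y)  # prints the achieved reduction rate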
def KNN_method(X, y):
    # shuffle=True is required when passing random_state in recent scikit-learn
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    skf.get_n_splits(X, y)
    for train_index, test_index in skf.split(X, y):
        print("Train:", train_index, "Validation:", test_index)
        trainX, testX = X[train_index], X[test_index]
        trainY, testY = y[train_index], y[test_index]

        # here starts KNN
        # how many neighbours to use in the KNeighborsClassifier
        kvalues = [1, 3, 5, 7, 9, 11, 13, 15, 19, 24, 30, 40, 50, 60, 70, 90]
        dist = ['manhattan', 'euclidean', 'chebyshev']
        results = {}
        for element in dist:
            accuracy_results = []
            for k in kvalues:
                knn = KNeighborsClassifier(n_neighbors=k, metric=element)
                knn.fit(trainX, trainY)
                predictedY = knn.predict(testX)
                accuracy_results.append(accuracy_score(testY, predictedY))
            results[element] = accuracy_results
        print("Results of model preparation for: " + str(results))

        plt.figure()
        multiple_line_chart(plt.gca(), kvalues, results, 'KNN variants', 'n',
                            'accuracy', percentage=True)
        plt.show()
class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of
        # speed (substantial improvement), accuracy, and reduced memory usage
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than
        # algorithm='kd_tree'.
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples):
        #   euclidean  0.950025
        #   manhattan  0.933533
        #   chebyshev  0.675662
        #   hamming    0.708646
        #   canberra   0.934033
        #   braycurtis 0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):
        X = self.decomposer.fit_transform(
            vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples]))
        Y = [x.Y for x in trainExamples]
        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform(
            vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples]))
        return self.model.predict(X)
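RawModel consumes example objects exposing X, Y, WIDTH and HEIGHT; the namedtuple below is a hypothetical stand-in for the project's example type, and vstack/reshape are assumed to be numpy's, as the unqualified calls suggest.

# Hedged sketch with a hypothetical Example type.
from collections import namedtuple

import numpy as np
from numpy import reshape, vstack  # names the snippet uses unqualified

Example = namedtuple("Example", ["X", "Y", "WIDTH", "HEIGHT"])  # hypothetical stand-in
rng = np.random.RandomState(0)
train = [Example(X=rng.rand(8, 8), Y=rng.randint(0, 10), WIDTH=8, HEIGHT=8)
         for _ in range(100)]
model = RawModel().fit(train)
print(model.predict(train[:5]))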
def evaluate(Xtra, ytra, Xtst, ytst, k=1, positive_label=1):
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    knn.fit(Xtra, ytra)

    y_true = ytst
    y_pred = knn.predict(Xtst)
    return evaluate_results(y_true, y_pred, positive_label=positive_label)
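A hedged call sketch: evaluate_results is defined elsewhere in this project, so this only illustrates the inputs evaluate expects, on assumed synthetic data.

# evaluate_results is assumed to exist in this project's namespace.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
Xtra, Xtst, ytra, ytst = train_test_split(X, y, test_size=0.25, random_state=0)
results = evaluate(Xtra, ytra, Xtst, ytst, k=3, positive_label=1)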
def knn(X, y, model_path):
    model = KNeighborsClassifier()
    model.fit(X, y)
    print(model)
    # predict
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
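Since knn() persists the fitted model, it can be reloaded with joblib; the path below is hypothetical.

# Hedged round-trip sketch; "knn_model.pkl" is an illustrative path.
import joblib  # sklearn.externals.joblib is deprecated; the standalone package works
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
knn(X, y, "knn_model.pkl")            # train and persist
model = joblib.load("knn_model.pkl")  # reload and reuse
print(model.predict(X[:5]))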
def plot_boundaries_decision(X, y, clf, namefile):
    """
    Plot the decision boundaries of our data.
    X : a numpy array of the data we want to plot
    y : a numpy array of the labels corresponding to our data
    clf : the model used to predict the labels of our data
    namefile : the name of the file in which we want to save the figure
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.333, random_state=42)

    # The plot of the decision boundary in the 2D representation space of the data
    clf.fit(X_train, y_train)  # was `model.fit`, but the classifier argument is named `clf`

    # create meshgrid
    resolution = 100  # 100x100 background pixels
    X2d_xmin, X2d_xmax = np.min(X[:, 0]), np.max(X[:, 0])
    X2d_ymin, X2d_ymax = np.min(X[:, 1]), np.max(X[:, 1])
    xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                         np.linspace(X2d_ymin, X2d_ymax, resolution))

    # approximate Voronoi tessellation on a resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoiBackground = voronoiBackground.reshape((resolution, resolution))

    fig = pyplot.figure()
    fig.set_size_inches(10.5, 8.5)
    ax = fig.add_subplot(211)  # small subplot to show how the legend has moved

    # plot
    ax.contourf(xx, yy, voronoiBackground)
    ax.set_title("Decision boundaries using the dimensionality reduction of multidimensional scaling")
    ax.scatter(X[:, 0], X[:, 1], c=color[y].tolist())
    label = numpy.array(["Apple", "Tomatoes"])

    # Legend
    for ind, s in enumerate(label):
        ax.scatter([], [], label=s, color=color[ind])
    pyplot.legend(scatterpoints=1, frameon=True, labelspacing=0.5,
                  bbox_to_anchor=(1.2, .4), loc='center right')
    pyplot.tight_layout()
    pyplot.savefig(namefile)
    pyplot.show()
def __plot_decision_boundaries(X, y, y_pred, resolution: int = 100, embedding=None):
    if embedding is None:
        embedding = TSNE(n_components=2, random_state=160290).fit_transform(X)
    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tessellation on a resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    mesh = hv.QuadMesh((xx, yy, voronoi_bg)).opts(cmap="viridis")
    points = hv.Scatter(
        {"x": embedding[:, 0], "y": embedding[:, 1], "pred": y_pred, "class": y},
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    errors = y_pred != y
    failed_points = hv.Scatter(
        {"x": embedding[errors, 0], "y": embedding[errors, 1]},
        kdims=["x", "y"]).opts(color="red", size=5, alpha=0.9)

    points = points.opts(color="pred", cmap="viridis", line_color="grey",
                         size=10, alpha=0.8, tools=["hover"])
    plot = mesh * points * failed_points
    plot = plot.opts(xaxis=None, yaxis=None, width=500, height=450,
                     title="Decision boundaries on TSNE")
    return plot
def get_result(self):
    # file opener
    tkinter.Tk().withdraw()
    directory = filedialog.askdirectory()
    result = self.read_emails_from_directory(directory)

    train_labels = np.zeros(1430)
    train_labels[715:1430] = 1
    # This equates to 1-715 = HAM and 716-1430 = SPAM.
    # If you change result[n] to something else,
    # make sure you change the same result
    # down in line 251 (test_matrix)
    train_matrix = self.extract_features(directory, result[0])
    # print(train_matrix)
    # print("body words:", result[0])
    # print("\n\nsubject words:", result[1])
    # print("\n\nbody phrases:", result[2])
    # print("\n\nsubject phrases:", result[3])
    print("body words:", len(result[0]))
    print("subject words:", len(result[1]))
    print("body phrases:", len(result[2]))
    print("subject phrases:", len(result[3]))

    model1 = MultinomialNB()
    model2 = LinearSVC()
    model3 = RandomForestClassifier()
    model4 = KNeighborsClassifier()
    model1.fit(train_matrix, train_labels)
    model2.fit(train_matrix, train_labels)
    model3.fit(train_matrix, train_labels)
    model4.fit(train_matrix, train_labels)

    test_dir = filedialog.askdirectory()
    # Here -----v
    test_matrix = self.extract_features(test_dir, result[0])
    test_labels = np.zeros(600)
    # This equates to 1-300 = HAM and 301-600 = SPAM
    test_labels[300:600] = 1

    result1 = model1.predict(test_matrix)
    result2 = model2.predict(test_matrix)
    result3 = model3.predict(test_matrix)
    result4 = model4.predict(test_matrix)
    print(confusion_matrix(test_labels, result1))
    print(confusion_matrix(test_labels, result2))
    print(confusion_matrix(test_labels, result3))
    print(confusion_matrix(test_labels, result4))
    return result
def build_and_test_model(classifier, X, Y, Z, param):
    accuracies = []
    ari = []
    for train, test in LeaveOneOut().split(X):
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]
        predicted = None
        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(X_train, Y_train)
            predicted = neigh.predict(X_test)
        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param, random_state=0)  # , max_depth=2
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)
        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)
        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)
        elif classifier == "RANDOM":
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]
        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))
    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
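An illustrative call under stated assumptions: Z is a second labeling that adjusted_rand_score is computed against, here simply a copy of y.

# Hedged sketch; the dataset and Z = y.copy() are assumptions.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=40, random_state=0)
Z = y.copy()
acc_mean, acc_std, ari_mean, ari_std = build_and_test_model("KNN", X, y, Z, param=3)

Note that with leave-one-out folds each ARI term is computed on a single-element labeling, so the accuracy outputs are the informative ones here.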
def compute_enn(X, y):
    """Edited nearest neighbors. ENN removes the instances near the
    boundaries, keeping the redundant samples."""
    classifier = KNeighborsClassifier(n_neighbors=3)

    classes = np.unique(y)
    classes_ = classes

    mask = np.zeros(y.size, dtype=bool)
    classifier.fit(X, y)

    for i in range(y.size):  # xrange is Python 2 only
        sample, label = X[i], y[i]
        # predict() expects a 2-D array, so reshape the single sample
        if classifier.predict(sample.reshape(1, -1)) == [label]:
            mask[i] = not mask[i]

    X_ = np.asarray(X[mask])
    y_ = np.asarray(y[mask])
    reduction_ = 1.0 - float(len(y_)) / len(y)
    print(reduction_)
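compute_enn can be smoke-tested the same way; the overlapping blobs below are an illustrative assumption.

# Hedged smoke test on synthetic blobs (dataset choice is an assumption).
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=200, centers=2, cluster_std=3.0, random_state=1)
compute_enn(X, y)  # prints the fraction of boundary instances removed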
def nd_boundary_plot(X_tst, y_predicted, model, ax, resolution=256):
    if len(X_tst.shape) != 2:
        raise ValueError("X must be an ndarray of the form [n_samples, n_features]")
    if X_tst.shape[0] < 2:
        raise ValueError("Must have at least 2 samples")
    if not hasattr(model, "classes_"):
        raise ValueError("Model has to be trained first")
    if len(model.classes_) < 2:
        raise ValueError("Classification must be at least binary")
    # done with sanity checks

    if X_tst.shape[1] == 2:  # 2 dimensions
        X = X_tst
        xmin, xmax = np.min(X[:, 0]), np.max(X[:, 0])
        ymin, ymax = np.min(X[:, 1]), np.max(X[:, 1])
        xx, yy = np.meshgrid(np.linspace(xmin, xmax, resolution),
                             np.linspace(ymin, ymax, resolution))
        if hasattr(model, "decision_function") or len(model.classes_) != 2:
            # model does not compute a posterior, or it is hard to graph
            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    else:  # lots of dimensions
        X = TSNE(n_components=2).fit_transform(X_tst)
        background_model = KNeighborsClassifier(n_neighbors=1).fit(X, y_predicted)
        xmin, xmax = np.min(X[:, 0]), np.max(X[:, 0])
        ymin, ymax = np.min(X[:, 1]), np.max(X[:, 1])
        xx, yy = np.meshgrid(np.linspace(xmin, xmax, resolution),
                             np.linspace(ymin, ymax, resolution))
        Z = background_model.predict(np.c_[xx.ravel(), yy.ravel()])

    Z = Z.reshape((resolution, resolution))
    ax.contourf(xx, yy, Z, alpha=.3)
    ax.scatter(X[:, 0], X[:, 1], c=y_predicted)
def check_accuracy_f1(self, path):
    data_after_feature_selected = []
    for i in path:
        data_after_feature_selected.append(feature[:, i])
    data_after_feature_selected = np.array(data_after_feature_selected)
    data_after_feature_selected = data_after_feature_selected.transpose()  # matrix transpose
    X_train2, X_test2, y_train2, y_test2 = train_test_split(
        data_after_feature_selected, label, test_size=0.3)
    if select_model == "SVM":
        model_svm = svm.SVC(kernel='poly', gamma=0.125, C=20)
        model_svm.fit(X_train2, y_train2)
        model_svm.get_params(deep=True)
        prediction2 = model_svm.predict(X_test2)
    elif select_model == "KNN":
        model_knn = KNeighborsClassifier(n_neighbors=1)
        model_knn.fit(X_train2, y_train2)
        model_knn.get_params(deep=True)
        prediction2 = model_knn.predict(X_test2)
    elif select_model == "RF":
        model_rf2 = RandomForestClassifier()
        model_rf2.fit(X_train2, y_train2)
        model_rf2.get_params(deep=True)
        prediction2 = model_rf2.predict(X_test2)
    elif select_model == "LR":
        model_lr2 = LogisticRegression()
        model_lr2.fit(X_train2, y_train2)
        prediction2 = model_lr2.predict(X_test2)
    elif select_model == "DT":
        model_dt = DecisionTreeClassifier()
        model_dt.fit(X_train2, y_train2)
        prediction2 = model_dt.predict(X_test2)
    return accuracy_score(y_test2, prediction2), f1_score(y_test2, prediction2, average='macro')
# model.fit(feature[train_index,:][:,top], label[train_index])
# prediction = model.predict(feature[test_index,:][:,top])
# acc, f = get_result(label[test_index], prediction)
# accuracy['DT'].append(acc)
# f1['DT'].append(f)

# model = LogisticRegression()
# model.fit(feature[train_index,:][:,top], label[train_index])
# prediction = model.predict(feature[test_index,:][:,top])
# acc, f = get_result(label[test_index], prediction)
# accuracy['LR'].append(acc)
# f1['LR'].append(f)

model = KNeighborsClassifier(n_neighbors=1)
model.fit(feature[train_index, :][:, top], label[train_index])
prediction = model.predict(feature[test_index, :][:, top])
acc, f = get_result(label[test_index], prediction)
accuracy['KNN'].append(acc)
f1['KNN'].append(f)

model = RandomForestClassifier(n_estimators=250)
model.fit(feature[train_index, :][:, top], label[train_index])
prediction = model.predict(feature[test_index, :][:, top])
acc, f = get_result(label[test_index], prediction)
accuracy['RF'].append(acc)
f1['RF'].append(f)

# model = MLPClassifier()
# model.fit(feature[train_index,:][:,top], label[train_index])
# prediction = model.predict(feature[test_index,:][:,top])
# acc, f = get_result(label[test_index], prediction)
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):
    """Mixin class for all instance reduction techniques."""

    def set_classifier(self, classifier):
        """Sets the classifier to be used in the instance reduction
        process and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier style
            (default = KNN)
        """
        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminated, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminated]
            Labels for X_.
        """
        pass

    def fit(self, X, y, reduce_data=True):
        """Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction would be performed
        """
        self.X = X
        self.y = y
        if reduce_data:
            self.reduce_data(X, y)
        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the instance
        reduction algorithm is to be performed with another classifier, this
        method should be explicitly overridden and documented.
        """
        X = check_array(X)
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")
        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X
        after a given prototype selection algorithm.

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
class KNNClassifier():
    '''
    classdocs
    '''

    def __init__(self, csv_path_train, csv_path_test, k):
        '''
        Constructor
        '''
        self.csv_path_train = csv_path_train
        self.csv_path_test = csv_path_test
        self.classifier = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')

    def create_arrays(self):
        arr_train = np.genfromtxt(self.csv_path_train, delimiter=',', skip_header=1)
        self.X_train = np.delete(arr_train, [arr_train.shape[1] - 1], axis=1)
        self.y_train = np.delete(arr_train, list(range(arr_train.shape[1] - 1)), axis=1)
        arr_test = np.genfromtxt(self.csv_path_test, delimiter=',', skip_header=1)
        self.X_test = np.delete(arr_test, [arr_test.shape[1] - 1], axis=1)
        self.y_test = np.delete(arr_test, list(range(arr_test.shape[1] - 1)), axis=1)

    def preprocess(self):
        # X_train, X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=0)
        sc = StandardScaler()
        sc.fit(self.X_train)
        self.X_train_std = sc.transform(self.X_train)
        self.X_test_std = sc.transform(self.X_test)

    def train(self):
        self.create_arrays()
        self.preprocess()
        self.classifier.fit(self.X_train_std, self.y_train.ravel())

    def test(self, f, patient_num, total_fpr, total_tpr):
        y_pred = self.classifier.predict(self.X_test_std)
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)  # was accuracy_score, which duplicated accuracy
        recall = recall_score(self.y_test, y_pred)
        print("Accuracy: %.2f" % accuracy)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        line = str(accuracy) + "," + str(precision) + "," + str(recall)
        f.write(line)
        f.write("\n")

        confmat = confusion_matrix(self.y_test, y_pred)
        fig, ax = plt.subplots(figsize=(2.5, 2.5))
        ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
        for i in range(confmat.shape[0]):
            for j in range(confmat.shape[1]):
                ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
        plt.xlabel('predicted label')
        plt.ylabel('true label')
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num + "_confmat.png")
        plt.close()

        fpr, tpr, thresholds = roc_curve(self.y_test, y_pred)
        print("fpr", fpr)
        print("tpr", tpr)
        total_fpr[1] += fpr[len(fpr) - 2]
        total_tpr[1] += tpr[len(tpr) - 2]
        print(total_fpr)
        print(total_tpr)
        roc_auc = auc(fpr, tpr)
        plt.title('ROC Curve')
        plt.plot(fpr, tpr, 'b', label='AUC = %.2F' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([-0.1, 1.2])
        plt.ylim([-0.1, 1.2])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig("D:\\Documents\\KNN\\FFT\\chb" + patient_num + "roc.png")
        plt.close()
        return total_fpr, total_tpr
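A hypothetical driver for KNNClassifier; the CSV names and the results file are placeholders, each CSV being assumed to carry a header row with the label in its last column, and test() writes figures to the hard-coded D:\ paths above.

# Hedged sketch; "train.csv"/"test.csv"/"results.csv" are illustrative names.
clf = KNNClassifier("train.csv", "test.csv", k=5)
clf.train()
with open("results.csv", "w") as f:
    total_fpr, total_tpr = clf.test(f, patient_num="01",
                                    total_fpr=[0, 0], total_tpr=[0, 0])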
class CNN(InstanceReductionMixin):
    """Condensed Nearest Neighbors.

    Each class is represented by a set of prototypes, with test samples
    classified to the class with the nearest prototype.
    The Condensed Nearest Neighbors removes the redundant instances,
    maintaining the samples in the decision boundaries.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `prototypes_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `labels_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.cnn import CNN
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> cnn = CNN()
    >>> cnn.fit(X, y)
    CNN(n_neighbors=1)
    >>> print(cnn.predict([[-0.8, -1]]))
    [1]

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    Notes
    -----
    The Condensed Nearest Neighbor is one of the first prototype selection
    techniques in the literature.

    References
    ----------
    P. E. Hart, The condensed nearest neighbor rule, IEEE Transactions on
    Information Theory 14 (1968) 515-516.
    """

    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            # predict() expects a 2-D array, so reshape the single sample
            if self.classifier.predict(sample.reshape(1, -1)) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
class SSMA(InstanceReductionMixin):
    """Steady State Memetic Algorithm.

    The Steady-State Memetic Algorithm is an evolutionary prototype
    selection algorithm. It uses a memetic algorithm in order to perform
    a local search in the code.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    alpha : float (default = 0.6)
        Parameter that ponderates the fitness function.

    max_loop : int (default = 1000)
        Number of maximum loops performed by the algorithm.

    threshold : int (default = 0)
        Threshold that regulates the substitution condition.

    chromosomes_count : int (default = 10)
        Number of chromosomes used to find the optimal solution.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.ssma import SSMA
    >>> import numpy as np
    >>> X = np.array([[i] for i in range(100)])
    >>> y = np.asarray(50 * [0] + 50 * [1])
    >>> ssma = SSMA()
    >>> ssma.fit(X, y)
    SSMA(alpha=0.6, chromosomes_count=10, max_loop=1000, threshold=0)
    >>> print(ssma.predict([[40], [60]]))
    [0 1]
    >>> print(ssma.reduction_)
    0.98

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Joaquín Derrac, Salvador García, and Francisco Herrera. Stratified
    prototype selection based on a steady-state memetic algorithm: a study
    of scalability. Memetic Computing, 2(3):183-199, 2010.
    """

    def __init__(self, n_neighbors=1, alpha=0.6, max_loop=1000, threshold=0, chromosomes_count=10):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.max_loop = max_loop
        self.threshold = threshold
        self.chromosomes_count = chromosomes_count

        self.evaluations = None
        self.chromosomes = None

        self.best_chromosome_ac = -1
        self.best_chromosome_rd = -1

        self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

    def accuracy(self, chromosome, X, y):
        mask = np.asarray(chromosome, dtype=bool)
        cX, cy = X[mask], y[mask]
        self.classifier.fit(cX, cy)
        labels = self.classifier.predict(X)
        accuracy = (labels == y).sum()
        return float(accuracy) / len(y)

    def fitness(self, chromosome, X, y):
        # TODO: add the possibility of using AUC for factor1
        ac = self.accuracy(chromosome, X, y)
        rd = 1.0 - (float(sum(chromosome)) / len(chromosome))
        return self.alpha * ac + (1.0 - self.alpha) * rd

    def fitness_gain(self, gain, n):
        return self.alpha * (float(gain) / n) + (1 - self.alpha) * (1.0 / n)

    def update_threshold(self, X, y):
        best_index = np.argmax(self.evaluations)
        chromosome = self.chromosomes[best_index]

        best_ac = self.accuracy(chromosome, X, y)
        best_rd = 1.0 - float(sum(chromosome)) / len(y)

        if best_ac <= self.best_chromosome_ac:
            self.threshold = self.threshold + 1
        if best_rd <= self.best_chromosome_rd:
            self.threshold = self.threshold - 1

        self.best_chromosome_ac = best_ac
        self.best_chromosome_rd = best_rd

    def index_nearest_neighbor(self, S, X, y):
        classifier = KNeighborsClassifier(n_neighbors=1)

        U = []
        S_mask = np.array(S, dtype=bool, copy=True)
        indexs = np.asarray(range(len(y)))[S_mask]
        X_tra, y_tra = X[S_mask], y[S_mask]

        for i in range(len(y)):
            real_indexes = np.asarray(range(len(y)))[S_mask]
            X_tra, y_tra = X[S_mask], y[S_mask]
            classifier.fit(X_tra, y_tra)
            # kneighbors expects a 2-D array, so reshape the single sample
            [[index]] = classifier.kneighbors(X[i].reshape(1, -1), return_distance=False)
            U = U + [real_indexes[index]]

        return U

    def memetic_looper(self, S, R):
        c = 0
        for i in range(len(S)):
            if S[i] == 1 and i not in R:
                c = c + 1
                if c == 2:
                    return True
        return False

    def memetic_select_j(self, S, R):
        indexs = []
        for i in range(len(S)):
            if i not in R and S[i] == 1:
                indexs.append(i)
        # if the list is empty this will raise an error
        return np.random.choice(indexs)

    def generate_population(self, X, y):
        self.chromosomes = [[np.random.choice([0, 1]) for i in range(len(y))]
                            for c in range(self.chromosomes_count)]
        self.evaluations = [self.fitness(c, X, y) for c in self.chromosomes]
        self.update_threshold(X, y)

    def select_parents(self, X, y):
        parents = []
        for i in range(2):
            samples = random.sample(self.chromosomes, 2)
            parents = parents + [samples[0] if self.fitness(samples[0], X, y) >
                                 self.fitness(samples[1], X, y) else samples[1]]
        return np.array(parents, copy=True)

    def crossover(self, parent_1, parent_2):
        size = len(parent_1)
        mask = [0] * (size // 2) + [1] * (size - size // 2)  # // for integer division on Python 3
        mask = np.asarray(mask, dtype=bool)
        np.random.shuffle(mask)

        off_1 = parent_1 * mask + parent_2 * ~mask
        off_2 = parent_2 * mask + parent_1 * ~mask

        return np.asarray([off_1, off_2])

    def mutation(self, offspring):
        for i in range(len(offspring)):
            if np.random.uniform(0, 1) < 1.0 / len(offspring):
                offspring[i] = not offspring[i]
        return offspring

    def memetic_search(self, chromosome, X, y, chromosome_fitness=None):
        S = np.array(chromosome, copy=True)
        if S.sum() == 0:
            return S, 0

        if chromosome_fitness is None:
            chromosome_fitness = self.fitness(chromosome, X, y)
        fitness_s = chromosome_fitness

        # List of visited genes in S
        R = []
        # let U = {u0, u1, ..., un} list where ui = classifier(si, S)/i
        U = self.index_nearest_neighbor(S, X, y)

        while self.memetic_looper(S, R):
            j = self.memetic_select_j(S, R)
            S[j] = 0
            gain = 0.0
            U_copy = list(U)

            mask = np.asarray(S, dtype=bool)
            X_tra, y_tra = X[mask], y[mask]
            real_idx = np.asarray(range(len(y)))[mask]

            if len(y_tra) > 0:
                for i in range(len(U)):
                    if U[i] == j:
                        self.classifier.fit(X_tra, y_tra)
                        [[idx]] = self.classifier.kneighbors(X[i].reshape(1, -1),
                                                             n_neighbors=1,
                                                             return_distance=False)
                        U[i] = real_idx[idx]

                    if y[i] == y[U_copy[i]] and y[i] != y[U[i]]:
                        gain = gain - 1.0
                    if y[i] != y[U_copy[i]] and y[i] == y[U[i]]:
                        gain = gain + 1.0

                if gain >= self.threshold:
                    n = S.sum()
                    g = self.fitness_gain(gain, n)
                    fitness_s = fitness_s + g
                    R = []
                else:
                    U = U_copy
                    S[j] = 1
                    R.append(j)

        return list(S), fitness_s

    def main_loop(self, X, y):
        self.generate_population(X, y)
        n, worse_fit_index = 0, -1

        while n < self.max_loop:
            parents = self.select_parents(X, y)
            offspring = self.crossover(parents[0], parents[1])

            offspring[0] = self.mutation(offspring[0])
            offspring[1] = self.mutation(offspring[1])

            fit_offs = [self.fitness(off, X, y) if sum(off) > 0 else -1
                        for off in offspring]

            if worse_fit_index == -1:
                worse_fit_index = np.argmin(self.evaluations)

            for i in range(len(offspring)):
                p_ls = 1.0
                if fit_offs[i] == -1:
                    p_ls = -1
                if fit_offs[i] <= self.evaluations[worse_fit_index]:
                    p_ls = 0.0625
                if np.random.uniform(0, 1) < p_ls:
                    offspring[i], fit_offs[i] = self.memetic_search(
                        offspring[i], X, y, chromosome_fitness=fit_offs[i])

            for i in range(len(offspring)):
                if fit_offs[i] > self.evaluations[worse_fit_index]:
                    self.chromosomes[worse_fit_index] = offspring[i]
                    self.evaluations[worse_fit_index] = fit_offs[i]
                    worse_fit_index = np.argmin(self.evaluations)

            n = n + 1
            if n % 10 == 0:
                self.update_threshold(X, y)

    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")
        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
class ENN(InstanceReductionMixin):
    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances in the boundaries,
    maintaining redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0], [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print(editednn.reduction_)
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493-1500, 2011.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")  # check_arrays was removed from scikit-learn

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_  # early return was missing in the original

        mask = np.zeros(y.size, dtype=bool)
        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):  # xrange is Python 2 only
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]
            # predict() expects a 2-D array, so reshape the single sample
            if self.classifier.predict(sample.reshape(1, -1)) == [label]:
                mask[i] = not mask[i]
            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
############################################################
# Visualize the evaluation results
_Fig = plt.figure()
_Fig.suptitle(t="ALGORITHM COMPARISON")
_Ax = _Fig.add_subplot(111)
plt.boxplot(x=_ALGORITHM_CMP_RESULT_LIST)
_Ax.set_xticklabels(labels=_MODELS.keys())
plt.show()

################################################################################
# Prediction starts here...
from sklearn import metrics

############################################################
# K-nearest neighbors prediction
_KNC_MODEL = KNC()
_KNC_MODEL.fit(X=_X_TRAIN, y=_Y_TRAIN)
_KNC_PREDICTIONS = _KNC_MODEL.predict(X=_X_VAL)
print(
    "KNC (K-nearest neighbors) prediction results:\n",
    # " " * 4,
    "ACCURACY_SCORE:\n",
    " " * 8,
    metrics.accuracy_score(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
    "\n",
    # " " * 4,
    "CONFUSION_MATRIX:\n",
    metrics.confusion_matrix(y_true=_Y_VAL, y_pred=_KNC_PREDICTIONS),
    "\n",
    # " " * 4,
)
def test_view_func_NN(model_classifier, model_rpn, model_inner, C):
    test_cls = 'aeroplane'
    input_train_file = 'pickle_data/train_data_Wflip_all.pickle'

    ## read the training data from pickle file or from annotations
    test_pickle = 'pickle_data/test_data_{}.pickle'.format(test_cls)
    if os.path.exists(test_pickle):
        with open(test_pickle, 'rb') as f:  # pickle files should be opened in binary mode
            all_imgs, classes_count, _ = pickle.load(f)

    class_mapping = C.class_mapping
    inv_class_mapping = {v: k for k, v in class_mapping.items()}  # iteritems() is Python 2 only
    backend = K.image_dim_ordering()
    gt_cls_num = class_mapping[test_cls]
    print('work on class {}'.format(test_cls))
    base_path = os.getcwd()

    # turn off any data augmentation at test time
    C.use_horizontal_flips = False
    C.use_vertical_flips = False
    C.rot_90 = False
    count = 0
    good_img = 0
    not_good = 0

    def format_img_size(img, C):
        """ formats the image size based on config """
        img_min_side = float(C.im_size)
        (height, width, _) = img.shape
        if width <= height:
            ratio = img_min_side / width
            new_height = int(ratio * height)
            new_width = int(img_min_side)
        else:
            ratio = img_min_side / height
            new_width = int(ratio * width)
            new_height = int(img_min_side)
        img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
        return img, ratio

    def format_img_channels(img, C):
        """ formats the image channels based on config """
        img = img[:, :, (2, 1, 0)]
        img = img.astype(np.float32)
        img[:, :, 0] -= C.img_channel_mean[0]
        img[:, :, 1] -= C.img_channel_mean[1]
        img[:, :, 2] -= C.img_channel_mean[2]
        img /= C.img_scaling_factor
        img = np.transpose(img, (2, 0, 1))
        img = np.expand_dims(img, axis=0)
        return img

    def format_img(img, C):
        """ formats an image for model prediction based on config """
        img, ratio = format_img_size(img, C)
        img = format_img_channels(img, C)
        return img, ratio

    def display_image(img):
        img1 = img[:, :, (2, 1, 0)]
        # img1 = img
        im = Image.fromarray(img1.astype('uint8'), 'RGB')
        im.show()

    # Method to transform the coordinates of the bounding box to its original size
    def get_real_coordinates(ratio, x1, y1, x2, y2):
        real_x1 = int(round(x1 // ratio))
        real_y1 = int(round(y1 // ratio))
        real_x2 = int(round(x2 // ratio))
        real_y2 = int(round(y2 // ratio))
        return (real_x1, real_y1, real_x2, real_y2)

    vnum_test = 24
    azimuth_vec = np.concatenate(
        ([0], np.linspace((360. / (vnum_test * 2)), 360. - (360. / (vnum_test * 2)), vnum_test)),
        axis=0)

    def find_interval(azimuth, azimuth_vec):
        for i in range(len(azimuth_vec)):
            if azimuth < azimuth_vec[i]:
                break
        ind = i
        if azimuth > azimuth_vec[-1]:
            ind = 1
        return ind

    class_mapping = C.class_mapping
    if 'bg' not in class_mapping:
        class_mapping['bg'] = len(class_mapping)
    class_mapping = {v: k for k, v in class_mapping.items()}
    # print(class_mapping)
    class_to_color = {class_mapping[v]: np.random.randint(0, 255, 3) for v in class_mapping}
    C.num_rois = 32

    obj_num = 0
    bbox_threshold_orig = 0.6
    th_bbox = 0.4

    ## get GT for all az for single cls
    feature_az = []
    sorted_path = input_train_file
    tmp_ind = sorted_path.index('.pickle')
    sorted_path = sorted_path[:tmp_ind] + "_sorted_Angles" + sorted_path[tmp_ind:]
    if os.path.exists(sorted_path):
        print("loading sorted data")
        with open(sorted_path, 'rb') as f:
            trip_data = pickle.load(f)

    im_file = []
    ind = []
    for ii in range(360):
        for jj in range(3):
            try:
                im_file.append(trip_data[test_cls][ii][jj])
                ind.append(ii)
            except:
                if jj == 0:
                    print('no azimuth {}'.format(ii))

    data_gen_train = data_generators.get_anchor_gt(im_file, [], C, K.image_dim_ordering(), mode='test')
    azimuth_dict = []
    inner_NN = []
    azimuths = []
    for tt in range(len(ind)):
        try:
            if tt % 100 == 0:
                print('worked on {}/{}'.format(tt, len(ind)))
                # print('im num {}'.format(good_img))
            X, Y, img_data = next(data_gen_train)

            P_rpn = model_rpn.predict_on_batch(X)

            R = roi_helpers.rpn_to_roi(P_rpn[0], P_rpn[1], C, K.image_dim_ordering(),
                                       use_regr=True, overlap_thresh=0.7, max_boxes=300)
            X2, Y1, Y2, Y_view = roi_helpers.calc_iou_new(R, img_data, C, C.class_mapping)
            pos_samples = np.where(Y1[0, :, -1] == 0)
            sel_samples = pos_samples[0].tolist()
            R = X2[0, sel_samples, :]
            for jk in range(R.shape[0] // C.num_rois + 1):
                ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois * (jk + 1), :], axis=0)
                if ROIs.shape[1] == 0:
                    break
                if jk == R.shape[0] // C.num_rois:
                    # pad R
                    curr_shape = ROIs.shape
                    target_shape = (curr_shape[0], C.num_rois, curr_shape[2])
                    ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype)
                    ROIs_padded[:, :curr_shape[1], :] = ROIs
                    ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                    ROIs = ROIs_padded

                [P_cls, P_regr, P_view] = model_classifier.predict([X, ROIs])
                iner_f = model_inner.predict([X, ROIs])
                # oo = model_classifier_only.predict([F, ROIs])
                for ii in range(len(sel_samples)):
                    if np.max(P_cls[0, ii, :]) < bbox_threshold_orig or \
                            np.argmax(P_cls[0, ii, :]) == (P_cls.shape[2] - 1):
                        continue
                    ## get class from the net
                    # cls_num = np.argmax(P_cls[0, ii, :])
                    ## use gt class
                    cls_num = gt_cls_num
                    cls_name = inv_class_mapping[cls_num]
                    cls_view = P_view[0, ii, 360 * cls_num:360 * (cls_num + 1)]
                    # azimuths[cls_name].append(np.argmax(cls_view, axis=0))
                    inner_NN.append(iner_f[0, ii, :])
                    azimuth_dict.append(img_data['bboxes'][0]['azimuth'])
        except:
            print('failed on az {}'.format(img_data['bboxes'][0]['azimuth']))

    ## calculating some mean feature map for every az
    with open('pickle_data/{}_NN.pickle'.format(C.weight_name), 'wb') as f:
        pickle.dump([inner_NN, azimuth_dict], f)
        print('saved PICKLE')

    with open('pickle_data/{}_NN.pickle'.format(C.weight_name), 'rb') as f:
        inner_NN, azimuth_dict = pickle.load(f)

    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(inner_NN, azimuth_dict)

    jj = 0
    for im_file in all_imgs:
        jj += 1
        if jj % 50 == 0:
            print(jj)
        filepath = im_file['filepath']
        img = cv2.imread(filepath)
        img_gt = np.copy(img)
        if img is None:
            not_good += 1
            continue
        else:
            good_img += 1
            # print('im num {}'.format(good_img))
        X, ratio = format_img(img, C)
        if backend == 'tf':
            X = np.transpose(X, (0, 2, 3, 1))

        # get the feature maps and output from the RPN
        Y1, Y2 = model_rpn.predict(X)
        R = roi_helpers.rpn_to_roi(Y1, Y2, C, K.image_dim_ordering(), overlap_thresh=0.7)

        # convert from (x1,y1,x2,y2) to (x,y,w,h)
        R[:, 2] -= R[:, 0]
        R[:, 3] -= R[:, 1]
        width, height = int(im_file["width"]), int(im_file["height"])
        resized_width, resized_height = data_generators.get_new_img_size(width, height, C.im_size)
        # [_, _, F] = model_rpn.predict(X)
        ROIs = []
        ## pass on all the labels in the image, some of them are not equal to test_cls
        for bbox_gt in im_file['bboxes']:
            no_bbox_flag = 1
            bbox_threshold = bbox_threshold_orig
            if not bbox_gt['class'] == test_cls:
                continue
            if bbox_gt['class'] == test_cls and bbox_threshold == bbox_threshold_orig:
                obj_num += 1
            while no_bbox_flag and bbox_threshold > th_bbox:
                cls_gt = bbox_gt['class']
                az_gt = bbox_gt['azimuth']
                el_gt = bbox_gt['elevation']
                t_gt = bbox_gt['tilt']
                if len(ROIs) == 0:
                    # apply the spatial pyramid pooling to the proposed regions
                    bboxes = {}
                    probs = {}
                    azimuths = {}
                    inner_res = {}
                    # print('obj num {}'.format(obj_num))
                    for jk in range(R.shape[0] // C.num_rois + 1):
                        ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois * (jk + 1), :], axis=0)
                        if ROIs.shape[1] == 0:
                            break
                        if jk == R.shape[0] // C.num_rois:
                            # pad R
                            curr_shape = ROIs.shape
                            target_shape = (curr_shape[0], C.num_rois, curr_shape[2])
                            ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype)
                            ROIs_padded[:, :curr_shape[1], :] = ROIs
                            ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :]
                            ROIs = ROIs_padded

                        [P_cls, P_regr, P_view] = model_classifier.predict([X, ROIs])
                        inner_out = model_inner.predict([X, ROIs])
                        # oo = model_classifier_only.predict([F, ROIs])

                        for ii in range(P_cls.shape[1]):
                            if np.max(P_cls[0, ii, :]) < bbox_threshold or \
                                    np.argmax(P_cls[0, ii, :]) == (P_cls.shape[2] - 1):
                                continue
                            ## get class from the net
                            # cls_num = np.argmax(P_cls[0, ii, :])
                            ## use gt class
                            cls_num = gt_cls_num
                            cls_name = inv_class_mapping[cls_num]
                            cls_view = P_view[0, ii, 360 * cls_num:360 * (cls_num + 1)]
                            if cls_name not in bboxes:
                                bboxes[cls_name] = []
                                probs[cls_name] = []
                                azimuths[cls_name] = []
                                inner_res[cls_name] = []
                            (x, y, w, h) = ROIs[0, ii, :]
                            try:
                                (tx, ty, tw, th) = P_regr[0, ii, 4 * cls_num:4 * (cls_num + 1)]
                                tx /= C.classifier_regr_std[0]
                                ty /= C.classifier_regr_std[1]
                                tw /= C.classifier_regr_std[2]
                                th /= C.classifier_regr_std[3]
                                x, y, w, h = roi_helpers.apply_regr(x, y, w, h, tx, ty, tw, th)
                            except:
                                pass
                            bboxes[cls_name].append([C.rpn_stride * x, C.rpn_stride * y,
                                                     C.rpn_stride * (x + w), C.rpn_stride * (y + h)])
                            probs[cls_name].append(np.max(P_cls[0, ii, :]))
                            azimuths[cls_name].append(np.argmax(cls_view, axis=0))
                            inner_res[cls_name].append(inner_out[0, ii, :])

                # cv2.rectangle(img_gt, (bbox_gt['x1'], bbox_gt['y1']), (bbox_gt['x2'], bbox_gt['y2']), (int(class_to_color[test_cls][0]), int(class_to_color[test_cls][1]), int(class_to_color[test_cls][2])), 2)
                for key in bboxes:
                    # if 1:
                    if key == test_cls and bbox_gt['class'] == test_cls:
                        bbox = np.array(bboxes[key])
                        prob = np.array(probs[key])
                        azimuth = np.array(azimuths[key])
                        inner_result = np.array(inner_res[key])
                        # img = draw_bbox(img, bbox, prob, azimuth, ratio)
                        azimuth = neigh.predict(inner_result)
                        ## get the azimuth from bboxes that have more than 'overlap_thresh' overlap with gt_bbox
                        az = []
                        overlap_thresh = 0.5
                        try:
                            while np.size(az) == 0 and overlap_thresh > 0:
                                _, prob_bbox, az = roi_helpers.overlap_with_gt(
                                    bbox, prob, azimuth, bbox_gt, ratio=ratio,
                                    overlap_thresh=overlap_thresh, max_boxes=300, use_az=True)
                                overlap_thresh -= 0.1
                            if overlap_thresh == 0:
                                print("No good Bbox was found")
                            counts = np.bincount(az)
                        except:
                            az = []
                            counts = []
                        try:
                            az_fin = np.argmax(counts)
                            true_bin = find_interval(az_gt, azimuth_vec)
                            prob_bin = find_interval(az_fin, azimuth_vec)
                            no_bbox_flag = 0
                            if true_bin == prob_bin:
                                count += 1
                            break
                        except:
                            # print('here')
                            no_bbox_flag = 1
                            bbox_threshold -= 0.1
                ## azimuth calculations
                ## display
                bbox_threshold -= 0.1

    succ = float(count) / float(obj_num) * 100.
    print('for class {} - true count is {} out of {} from {} images. {} success'.format(
        test_cls, count, obj_num, good_img, succ))
    return succ
def KFoldCrossValidation(train_and_test_indexes, X_data_frame, y_data_frame,
                         k_value=3, kcv_value=9, smote=True, debug=False):
    train_indexes = train_and_test_indexes[0]
    # print('Train Indexes:', train_indexes)
    test_indexes = train_and_test_indexes[1]
    # print('Test Indexes:', test_indexes)
    knn = KNeighborsClassifier(n_neighbors=k_value)

    # if debug:
    #     print("Train Index: ", train_index, "\n")
    #     print("Test Index: ", test_index, "\n")

    # STEP 1: split data between test and train sets
    if debug:
        print('* Starting train and test sets splitting... ', end='')
    y_data = np.ravel(y_data_frame)  # Added to solve column-vector issue
    X_train, X_test, y_train, y_test = (X_data_frame[train_indexes],
                                        X_data_frame[test_indexes],
                                        y_data[train_indexes],
                                        y_data[test_indexes])
    # print('y_data[test_indexes]:', y_data[test_indexes])
    if debug:
        print('Done!')

    # print the shapes of the new X objects
    if debug:
        print('* Display X and y objects\'s shape:')
        print('\t X_train.shape: ', X_train.shape)
        print('\t X_test.shape: ', X_test.shape)
        print('\t y_train.shape: ', y_train.shape)
        print('\t y_test.shape: ', y_test.shape)

    if smote:
        # Oversampling training data using SMOTE
        if debug:
            print('* Starting to oversample training data using SMOTE...')
            print('\t -Number of instances inside TRAIN set from each class BEFORE applying SMOTE=',
                  (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))
            print('\t -Number of instances inside TEST set from each class BEFORE applying SMOTE=',
                  (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2)))
        from imblearn.over_sampling import SMOTE
        smt = SMOTE()
        X_train, y_train = smt.fit_resample(X_train, y_train)  # fit_sample was renamed in imblearn 0.4
        if debug:
            print('\t -Instances amount from each class AFTER applying SMOTE=',
                  (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2)))
    # print('y_train:', y_train)

    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # print('y_test:', y_data[test_indexes])
    # print('y_pred=', y_pred)

    # comparing actual response values (y_test) with predicted response values (y_pred)
    this_accuracy = metrics.accuracy_score(y_test, y_pred)
    this_confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)
    return this_accuracy, this_confusion_matrix
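One way to drive KFoldCrossValidation is to hand it StratifiedKFold's index pairs; the three-class synthetic data is an assumption, smote=False keeps the sketch free of the imblearn dependency, and the function itself assumes sklearn's metrics module is imported where it is defined.

# Hedged driver sketch on assumed synthetic data.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=90, n_classes=3, n_informative=4, random_state=0)
accuracies = []
for index_pair in StratifiedKFold(n_splits=9).split(X, y):
    acc, cm = KFoldCrossValidation(index_pair, X, y, k_value=3, smote=False)
    accuracies.append(acc)
print(np.mean(accuracies))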
        out = Trainer.model.semantics(data, 1, 1, 1.0)
        n = data.shape[0]
        Xtrain[counter:counter + n] = out[0].detach().cpu().numpy().reshape(n, -1)
        Ytrain[counter:counter + n] = labels.numpy()
        counter += n

    counter = 0
    for i, (data, labels) in enumerate(testloader):
        # Feed forward data
        data = data.reshape(-1, *Trainer.input_shape).to(torch.float32).to(
            Trainer.device)
        out = Trainer.model.semantics(data, 1, 1, 1.0)
        n = data.shape[0]
        Xtest[counter:counter + n] = out[0].detach().cpu().numpy().reshape(n, -1)
        Ytest[counter:counter + n] = labels.numpy()
        counter += n
else:
    raise ValueError('Wrong dataset')

#%%
# sklearn.neighbors.classification was a private module removed in
# scikit-learn 0.24; import from the public sklearn.neighbors package.
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier()
classifier.fit(Xtrain, Ytrain)
Ypred = classifier.predict(Xtest)
print(np.mean(Ypred == Ytest))
def plot_decision_boundaries(
        X_train,
        y_train,
        y_pred_train,
        X_test,
        y_test,
        y_pred_test,
        resolution: int = 100,
        embedding=None,
):
    X = np.concatenate([X_train, X_test])
    y = np.concatenate([y_train, y_test])
    y_pred = np.concatenate([y_pred_train, y_pred_test])
    if embedding is None:
        try:
            # import inside the try block so a missing umap-learn falls
            # through to the t-SNE branch instead of raising at import time
            import umap
            embedding = umap.UMAP(n_components=2,
                                  random_state=160290).fit_transform(X)
        except ImportError:
            from sklearn.manifold import TSNE
            embedding = TSNE(n_components=2,
                             random_state=160290).fit_transform(X)
    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tessellation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    mesh = hv.QuadMesh((xx, yy, voronoi_bg)).opts(cmap="viridis", alpha=0.6)
    points_train = hv.Scatter(
        {
            "x": embedding[:len(y_train), 0],
            "y": embedding[:len(y_train), 1],
            "pred": y_pred_train,
            "class": y_train,
        },
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    points_test = hv.Scatter(
        {
            "x": embedding[len(y_train):, 0],
            "y": embedding[len(y_train):, 1],
            "pred": y_pred_test,
            "class": y_test,
        },
        kdims=["x", "y"],
        vdims=["pred", "class"],
    )
    errors = y_pred != y
    failed_points = hv.Scatter(
        {
            "x": embedding[errors, 0],
            "y": embedding[errors, 1]
        },
        kdims=["x", "y"]).opts(color="red", size=2, alpha=0.9)

    points_train = points_train.opts(color="class",
                                     cmap="viridis",
                                     line_color="grey",
                                     size=10,
                                     alpha=0.8,
                                     tools=["hover"])
    points_test = points_test.opts(
        color="class",
        cmap="viridis",
        line_color="grey",
        size=10,
        alpha=0.8,
        tools=["hover"],
        marker="square",
    )
    plot = mesh * points_train * points_test * failed_points
    plot = plot.opts(xaxis=None,
                     yaxis=None,
                     width=500,
                     height=450,
                     title="Decision boundaries")
    return plot
class ENN(InstanceReductionMixin):
    """Edited Nearest Neighbors.

    The Edited Nearest Neighbors removes the instances on the decision
    boundaries, maintaining the redundant samples.

    Parameters
    ----------
    n_neighbors : int, optional (default = 3)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    Attributes
    ----------
    `X_` : array-like, shape = [indeterminated, n_features]
        Selected prototypes.

    `y_` : array-like, shape = [indeterminated]
        Labels of the selected prototypes.

    `reduction_` : float, percentual of reduction.

    Examples
    --------
    >>> from protopy.selection.enn import ENN
    >>> import numpy as np
    >>> X = np.array([[-1, 0], [-0.8, 1], [-0.8, -1], [-0.5, 0],
    ...               [0.5, 0], [1, 0], [0.8, 1], [0.8, -1]])
    >>> y = np.array([1, 1, 1, 2, 1, 2, 2, 2])
    >>> editednn = ENN()
    >>> editednn.fit(X, y)
    ENN(n_neighbors=3)
    >>> print(editednn.predict([[-0.6, 0.6]]))
    [1]
    >>> print(editednn.reduction_)
    0.75

    See also
    --------
    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier

    References
    ----------
    Ruiqin Chang, Zheng Pei, and Chao Zhang. A modified editing k-nearest
    neighbor rule. JCP, 6(7):1493–1500, 2011.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.classifier = None

    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        # check_arrays was removed from scikit-learn; check_X_y is the
        # current equivalent.
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)
        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            # Leave sample i out, fit on the rest, and keep it only if its
            # neighbors agree with its label.
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            if self.classifier.predict(X[i].reshape(1, -1))[0] == y[i]:
                mask[i] = not mask[i]
            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
print(label_train.shape)
print(label_test.shape)

# Create and train the model.
clf = KNeighborsClassifier(n_neighbors=3).fit(data_train, label_train)  # improved version
print(clf)

# Cross-validate the model
cross_vali = model_selection.cross_val_score(clf, data_train, label_train, cv=5)
print('Accuracy of each fold: ', cross_vali)
print('Mean validation accuracy: ', cross_vali.mean())

pred = clf.predict(data_test)
print(data_test)  # 28650 0.91 0.835 ... 45834 0.37 0.975
print(pred)  # ['fat' 'fat' 'fat' ... 'fat' 'thin' 'thin']
ac_score = metrics.accuracy_score(label_test, pred)  # compare predictions with the test labels
print('Accuracy: ', ac_score)
cl_report = metrics.classification_report(label_test, pred)
print('Report: ', cl_report)

# Visualization
tbl2 = pd.read_csv("bmi.csv", index_col=2)
print(tbl2.tail(3))  # last 3 rows only
fig = plt.figure()  # start building the figure to save
class InstanceReductionMixin(InstanceReductionBase, ClassifierMixin):
    """Mixin class for all instance reduction techniques"""

    def set_classifier(self, classifier):
        """Sets the classifier to be used in the instance reduction
        process and classification.

        Parameters
        ----------
        classifier : classifier, following the KNeighborsClassifier
            style (default = KNN)
        """
        self.classifier = classifier

    def reduce_data(self, X, y):
        """Perform the instance reduction procedure on the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        X_ : array-like, shape = [indeterminated, n_features]
            Resulting training set.

        y_ : array-like, shape = [indeterminated]
            Labels for X_.
        """
        pass

    def get_prototypes(self):
        return self.X_, self.y_

    def fit(self, X, y, reduce_data=True):
        """
        Fit the InstanceReduction model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
            Note that centroid shrinking cannot be used with sparse matrices.
        y : array, shape = [n_samples]
            Target values (integers)
        reduce_data : bool, flag indicating if the reduction would be performed
        """
        self.X = X
        self.y = y
        self.labels = set(y)
        self.prototypes = None
        self.prototypes_labels = None
        self.reduction_ratio = 0.0

        if reduce_data:
            self.reduce_data(X, y)
        return self

    def predict(self, X, n_neighbors=1):
        """Perform classification on an array of test vectors X.

        The predicted class C for each sample in X is returned.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]

        Notes
        -----
        The default prediction uses KNeighborsClassifier; if the instance
        reduction algorithm is to be performed with another classifier,
        this method should be explicitly overwritten and explained in the
        documentation.
        """
        X = atleast2d_or_csr(X)
        if not hasattr(self, "X_") or self.X_ is None:
            raise AttributeError("Model has not been trained yet.")
        if not hasattr(self, "y_") or self.y_ is None:
            raise AttributeError("Model has not been trained yet.")
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test data X
        after a given prototype selection algorithm.

        Parameters
        ----------
        X : array, shape = (n_samples, n_features)
            A 2-D array representing the test points.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """
        self.classifier.fit(self.X_, self.y_)
        return self.classifier.predict_proba(X)
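# The mixin's contract is just reduce_data; the toy subclass below is a
# hypothetical illustration (not part of the library) that keeps a random
# subset of the training set, so fit/predict/get_prototypes can be exercised
# end to end. It assumes InstanceReductionBase imposes no extra constructor
# arguments.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

class RandomSubsetReduction(InstanceReductionMixin):
    """Illustrative only: keep a random fraction of the training data."""

    def __init__(self, keep=0.5, random_state=0):
        self.keep = keep
        self.random_state = random_state
        self.classifier = None

    def reduce_data(self, X, y):
        rng = np.random.RandomState(self.random_state)
        n_keep = max(1, int(self.keep * len(y)))
        idx = rng.choice(len(y), size=n_keep, replace=False)
        self.X_ = np.asarray(X)[idx]
        self.y_ = np.asarray(y)[idx]
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_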
def demo(): """ _test_knn_adwin This demo tests the KNNAdwin classifier on a file stream, which gives instances coming from a SEA generator. The test computes the performance of the KNNAdwin classifier as well as the time to create the structure and classify max_samples (10000 by default) instances. """ start = timer() logging.basicConfig(format='%(message)s', level=logging.INFO) # warnings.filterwarnings("ignore", ".*Passing 1d.*") stream = FileStream('../data/datasets/sea_big.csv', -1, 1) # stream = RandomRBFGeneratorDrift(change_speed=41.00, n_centroids=50, model_random_state=32523423, # sample_seed=5435, n_classes=2, num_att=10, num_drift_centroids=50) stream.prepare_for_use() t = OneHotToCategorical([[10, 11, 12, 13], [ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ]]) t2 = OneHotToCategorical([[10, 11, 12, 13], [ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ]]) # knn = KNN(n_neighbors=8, max_window_size=2000, leaf_size=40) knn = KNNAdwin(n_neighbors=8, leaf_size=40, max_window_size=2000) # pipe = Pipeline([('one_hot_to_categorical', t), ('KNN', knn)]) compare = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=40, metric='euclidean') # pipe2 = Pipeline([('one_hot_to_categorical', t2), ('KNN', compare)]) first = True train = 200 if train > 0: X, y = stream.next_sample(train) # pipe.partial_fit(X, y, classes=stream.target_values) # pipe.partial_fit(X, y, classes=stream.target_values) # pipe2.fit(X, y) knn.partial_fit(X, y, classes=stream.target_values) compare.fit(X, y) first = False n_samples = 0 max_samples = 10000 my_corrects = 0 compare_corrects = 0 while n_samples < max_samples: if n_samples % (max_samples / 20) == 0: logging.info('%s%%', str((n_samples // (max_samples / 20) * 5))) X, y = stream.next_sample() # my_pred = pipe.predict(X) my_pred = knn.predict(X) # my_pred = [1] if first: # pipe.partial_fit(X, y, classes=stream.target_values) # pipe.partial_fit(X, y, classes=stream.target_values) knn.partial_fit(X, y, classes=stream.target_values) first = False else: # pipe.partial_fit(X, y) knn.partial_fit(X, y) # compare_pred = pipe2.predict(X) compare_pred = compare.predict(X) if y[0] == my_pred[0]: my_corrects += 1 if y[0] == compare_pred[0]: compare_corrects += 1 n_samples += 1 end = timer() print('Evaluation time: ' + str(end - start)) print(str(n_samples) + ' samples analyzed.') print('My performance: ' + str(my_corrects / n_samples)) print('Compare performance: ' + str(compare_corrects / n_samples))
def plot_decision_boundaries(
        X_train,
        y_train,
        y_pred_train,
        X_test,
        y_test,
        y_pred_test,
        resolution: int = 100,
        embedding=None,
        figsize=(9, 8),
        cmap="viridis",
        title: str = "Decision boundaries",
        s=200,
):
    X = np.concatenate([X_train, X_test])
    y = np.concatenate([y_train, y_test])
    y_pred = np.concatenate([y_pred_train, y_pred_test])
    if embedding is None:
        try:
            # import inside the try block so a missing umap-learn falls
            # through to the t-SNE branch instead of raising at import time
            import umap
            embedding = umap.UMAP(n_components=2,
                                  random_state=160290).fit_transform(X)
        except ImportError:
            from sklearn.manifold import TSNE
            embedding = TSNE(n_components=2,
                             random_state=160290).fit_transform(X)
    x_min, x_max = safe_bounds(embedding[:, 0])
    y_min, y_max = safe_bounds(embedding[:, 1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # approximate Voronoi tessellation on resolution x resolution grid using 1-NN
    background_model = KNeighborsClassifier(n_neighbors=1).fit(
        embedding, y_pred)
    voronoi_bg = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
    voronoi_bg = voronoi_bg.reshape((resolution, resolution))

    fig, ax = plt.subplots(figsize=figsize)
    ax.pcolormesh(xx, yy, voronoi_bg, cmap=cmap, alpha=0.1)
    emb_train = embedding[:len(y_train)]
    data = pd.DataFrame({
        "x": emb_train[:, 0],
        "y": emb_train[:, 1],
        "target": y_train
    })
    data.plot.scatter(x="x",
                      y="y",
                      c="target",
                      cmap=cmap,
                      s=s,
                      colorbar=False,
                      ax=ax,
                      alpha=0.7,
                      label="train set")
    emb_test = embedding[len(y_train):]
    data = pd.DataFrame({
        "x": emb_test[:, 0],
        "y": emb_test[:, 1],
        "target": y_test
    })
    data.plot.scatter(x="x",
                      y="y",
                      c="target",
                      cmap=cmap,
                      s=s,
                      colorbar=False,
                      ax=ax,
                      alpha=0.7,
                      marker="s",
                      label="test set")
    errors = y_pred != y
    ax.scatter(embedding[errors, 0],
               embedding[errors, 1],
               c="red",
               s=50,
               alpha=0.9,
               label="errors")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel(None)
    plt.ylabel(None)
    plt.legend()
    if title is not None:
        plt.title(title, fontsize=22)
    return fig, ax
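# A quick way to exercise plot_decision_boundaries above; a sketch assuming
# safe_bounds, np, pd, plt, and KNeighborsClassifier are importable and that
# either umap-learn or scikit-learn's TSNE is available. Dataset and
# classifier are illustrative.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = load_digits(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_tr, y_tr)
fig, ax = plot_decision_boundaries(X_tr, y_tr, knn.predict(X_tr),
                                   X_te, y_te, knn.predict(X_te))
fig.savefig("decision_boundaries.png")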
def knn_score(X, y, neighbors):
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X, y)
    y_pred = knn.predict(X)  # note: evaluated on the training data itself
    print("KNN{} accuracy_score: {}".format(neighbors,
                                            metrics.accuracy_score(y, y_pred)))
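# Because knn_score fits and scores on the same data, n_neighbors=1 will
# trivially report 1.0. A hedged variant using cross-validation gives a more
# honest estimate; this is a sketch, not a drop-in replacement.
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def knn_cv_score(X, y, neighbors, cv=5):
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    scores = cross_val_score(knn, X, y, cv=cv)
    print("KNN{} cv accuracy: {:.3f} +/- {:.3f}".format(
        neighbors, scores.mean(), scores.std()))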
# sklearn.neighbors.classification was removed in scikit-learn 0.24; use the
# public package instead.
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
print(np.unique(iris_y))

np.random.seed(0)
indices = np.random.permutation(len(iris_X))
# The original double indexing (iris_X[indices[indices[:-10]]]) reused the
# same rows for train and test; split on the permutation directly instead.
iris_X_train = iris_X[indices[:-10]]
iris_y_train = iris_y[indices[:-10]]
iris_X_test = iris_X[indices[-10:]]
iris_y_test = iris_y[indices[-10:]]

knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)
print("Predicted:")
print(knn.predict(iris_X_test))
print("Actual:")
print(iris_y_test)

i = knn.predict(iris_X_test) == iris_y_test
k = 0
for j in i:
    if not j:
        k += 1
print(len(i))
print("Number of wrong predictions:")
print(k)

diabetes = datasets.load_diabetes()  # diabetes dataset
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
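# The manual error-counting loop above collapses into one NumPy expression;
# a small equivalent using the arrays already defined in the snippet:
n_errors = int(np.sum(knn.predict(iris_X_test) != iris_y_test))
print("Number of wrong predictions:", n_errors)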
# create the feature matrix
reader = csv.reader(open("reduced_features.csv", "r"), delimiter=",")
X = list(reader)
X = np.array(X)
X = X.astype(float)  # np.float was removed in NumPy 1.24; use the builtin

# create result vector
reader = csv.reader(open("target_output.csv", "r"), delimiter=",")
y = list(reader)
y = np.array(y)
y = y.astype(int)
y = y.ravel()

X_Train_embedded = TSNE(n_components=2).fit_transform(X)
print(X_Train_embedded.shape)
model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
y_predicted = model.predict(X)
# replace the above by your data and model

# create meshgrid
resolution = 1024  # 1024x1024 background pixels
X2d_xmin, X2d_xmax = np.min(X_Train_embedded[:, 0]), np.max(X_Train_embedded[:, 0])
X2d_ymin, X2d_ymax = np.min(X_Train_embedded[:, 1]), np.max(X_Train_embedded[:, 1])
xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution),
                     np.linspace(X2d_ymin, X2d_ymax, resolution))

# approximate Voronoi tessellation on resolution x resolution grid using 1-NN
background_model = KNeighborsClassifier(n_neighbors=1).fit(X_Train_embedded, y_predicted)
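# The snippet stops right after fitting the background model. A plausible
# continuation, following the same 1-NN Voronoi-background pattern used in the
# other plotting snippets in this file (matplotlib's plt assumed imported):
voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
voronoiBackground = voronoiBackground.reshape((resolution, resolution))

# plot the class regions and overlay the embedded points
plt.contourf(xx, yy, voronoiBackground)
plt.scatter(X_Train_embedded[:, 0], X_Train_embedded[:, 1], c=y)
plt.show()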
'''
# Load data.
path_weights = "resources/knn_weights.bin"
# path_train = "resources/crimes_training_ones.bin"
path_train = "resources/crimes_samples_training.bin"
# path_tests = "resources/crimes_testing_ones.bin"
path_tests = "resources/crimes_samples_testing.bin"

print("Normalizing train")
crime_train = CrimeData(path_train)
crime_train.data[:, 22:24], mean_x_y, std_x_y = z_norm_by_feature(crime_train.data[:, 22:24])
crime_train.data[:, 1:5], mean_time, std_time = z_norm_by_feature(crime_train.data[:, 1:5])
crime_train.data = np.hstack((crime_train.data[:, 0:24], crime_train.data[:, 141:241]))

print("Normalizing test")
crime_test = CrimeData(path_tests)
crime_test.data[:, 22:24] = z_norm_by_feature(crime_test.data[:, 22:24], mean_x_y, std_x_y)
crime_test.data[:, 1:5] = z_norm_by_feature(crime_test.data[:, 1:5], mean_time, std_time)
crime_test.data = np.hstack((crime_test.data[:, 0:24], crime_test.data[:, 141:241]))

n = 0.1
for i in range(1, 10):
    n *= 10
    clf = KNeighborsClassifier(n_neighbors=int(n))  # n_neighbors must be an int
    print("Fitting")
    clf.fit(crime_train.data, crime_train.y)
    print("Testing")
    preds = clf.predict(crime_test.data[0:10000])
    print(n, np.mean(crime_test.y[0:10000] == preds[0:10000]))
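# The geometric sweep above quickly reaches n_neighbors values far beyond any
# useful range. A hedged alternative using scikit-learn's GridSearchCV over a
# more conventional grid (crime_train reused from the snippet, the grid itself
# purely illustrative):
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {"n_neighbors": [1, 3, 5, 11, 21, 51, 101]}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3, n_jobs=-1)
search.fit(crime_train.data, crime_train.y)
print(search.best_params_, search.best_score_)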