from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, jaccard_score


def _nearestneighbors(*, train, test, x_predict=None, metrics, n_neighbors=5,
                      algorithm='auto', leaf_size=30, metric='minkowski', p=2,
                      metric_params=None, n_jobs=None):
    """
    k-nearest-neighbors classification. For more info visit:
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    """
    # NearestNeighbors is unsupervised and has no predict(); the supervised
    # KNeighborsClassifier is needed here (its constructor takes no `radius`
    # parameter, so that argument is dropped from the signature).
    model = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm,
                                 leaf_size=leaf_size, metric=metric, p=p,
                                 metric_params=metric_params, n_jobs=n_jobs)
    model.fit(train[0], train[1])
    model_name = 'Nearest Neighbors'
    y_hat = model.predict(test[0])
    if metrics == 'accuracy':
        accuracy = accuracy_score(test[1], y_hat)
    if metrics == 'f1':
        accuracy = f1_score(test[1], y_hat)
    if metrics == 'jaccard':
        accuracy = jaccard_score(test[1], y_hat)
    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
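# Hypothetical usage sketch for _nearestneighbors() above; the synthetic
# dataset and split below are assumptions for illustration, not part of
# the original code.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
name, acc, _ = _nearestneighbors(train=(X_tr, y_tr), test=(X_te, y_te),
                                 metrics='accuracy')
print(name, acc)  # e.g. "Nearest Neighbors 0.9..."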
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def wine_cross():
    wine = datasets.load_wine()
    x = wine.data
    y = wine.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3,
                                                        random_state=42)
    # The original built an unfitted NearestNeighbors and then scored KMeans
    # cluster IDs against class labels; for label accuracy on the wine test
    # set, a k-NN classifier is the fit-for-purpose estimator.
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('accuracy: ', accuracy_score(y_test, y_pred))
from sklearn.neighbors import LocalOutlierFactor


def fixed_outlier_detector_by_LOF(feature, outlier_fraction):
    """
    Takes training data X and returns the indices of the outliers in X,
    where outlier_fraction is the expected proportion of outliers.
    """
    # NearestNeighbors has no `contamination` parameter and no predict();
    # Local Outlier Factor is the estimator this function's name refers to.
    model = LocalOutlierFactor(contamination=outlier_fraction)
    # LOF is fitted and queried on the same data via fit_predict();
    # it returns -1 for outliers and 1 for inliers.
    y_predict = model.fit_predict(feature)
    outliers = [i for i, y in enumerate(y_predict) if y == -1]
    return outliers
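# Illustrative call to fixed_outlier_detector_by_LOF(); the synthetic
# cluster plus one injected outlier below is an assumption.
import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, size=(99, 2)), [[8.0, 8.0]]])
print(fixed_outlier_detector_by_LOF(X, outlier_fraction=0.01))
# expected output: [99] (the injected point)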
def predict(self):
    """
    Trains a k-nearest-neighbors classifier from the scikit-learn machine
    learning library (https://scikit-learn.org) on the training set, passes
    the trained model the test feature set, compares the predicted labels
    against the y_test values passed in, and returns the accuracy.
    """
    # KNeighborsClassifier is the supervised estimator with a predict()
    # method; NearestNeighbors has none.
    algorithm = KNeighborsClassifier(n_neighbors=2)
    algorithm.fit(self.X_train, self.y_train)
    y_pred = list(algorithm.predict(self.X_test))
    self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
    return self.acc
def knn(self, partitions, predictors, outcome):
    # Grid-search k to determine the optimal number of neighbors.
    # (The original also built an unused NearestNeighbors model and an
    # unused test individual; both were dead code and are dropped here.)
    results = []
    for k in range(1, 40):
        knn = KNeighborsClassifier(n_neighbors=k).fit(
            partitions['train_X'][predictors], partitions['train_y'])
        results.append({
            'k': k,
            # the validation data lives in `partitions`, not `predictors`
            'accuracy': accuracy_score(
                partitions['valid_y'],
                knn.predict(partitions['valid_X'][predictors]))
        })
    return results
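# Hypothetical shape of the inputs knn() expects: a `partitions` dict with
# train/valid frames plus a `predictors` column list. The toy data and the
# enclosing class are assumptions for illustration only.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 2)), columns=['x1', 'x2'])
y = (df['x1'] + df['x2'] > 0).astype(int)
train_X, valid_X, train_y, valid_y = train_test_split(df, y, random_state=0)
partitions = {'train_X': train_X, 'train_y': train_y,
              'valid_X': valid_X, 'valid_y': valid_y}
# results = SomeModel().knn(partitions, predictors=['x1', 'x2'], outcome=None)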
# K-NN
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

irisDataset = datasets.load_iris()
irisFeatures = irisDataset.data
irisTarget = irisDataset.target

xTrain, xTest, yTrain, yTest = train_test_split(irisFeatures, irisTarget,
                                                test_size=0.2)

from sklearn.neighbors import KNeighborsClassifier

# KNeighborsClassifier (not the unsupervised NearestNeighbors) supports
# predict(); the algorithm string is 'ball_tree', not 'ballTree'
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree')
knn.fit(xTrain, yTrain)
yPred = knn.predict(xTest)

from sklearn.metrics import confusion_matrix, f1_score

# use new names so the imported functions are not shadowed
cm = confusion_matrix(yTest, yPred)
f1 = f1_score(yTest, yPred, average='weighted')
print("Confusion Matrix: \n", cm)
print("F1-Score: ", f1)
bumpy_slow = [features_train[ii][1]
              for ii in range(0, len(features_train))
              if labels_train[ii] == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.neighbors import KNeighborsClassifier

# KNeighborsClassifier (not the unsupervised NearestNeighbors) provides
# the predict() used below
clf = KNeighborsClassifier(n_neighbors=2)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

from sklearn.metrics import accuracy_score

acc = accuracy_score(labels_test, pred)
print(acc)  # print is a function in Python 3

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
def run(tr, ts):
    # DataFrame.as_matrix() was removed from pandas; use to_numpy()
    Xtr = tr[['lat', 'lon']].to_numpy()
    Xts = ts[['lat', 'lon']].to_numpy()

    print('check outliers...')
    m = NearestNeighbors(n_neighbors=10).fit(Xtr)
    # for training points the first neighbor is the point itself, so drop
    # column 0; for test points keep the first 9 columns instead, so both
    # averages cover the same number of true neighbors
    dtr, _ = m.kneighbors(Xtr)
    dtr = np.mean(dtr[:, 1:], 1)
    dts, _ = m.kneighbors(Xts)
    dts = np.mean(dts[:, :-1], 1)
    tr_inliers = dtr < 0.02
    ts_inliers = dts < 0.02

    print('clustering all points...')
    k_all = 10
    m = KMeans(k_all)
    _Ctr = m.fit_predict(Xtr[tr_inliers])
    _Cts = m.predict(Xts[ts_inliers])
    # shift cluster IDs up by one so that outliers = cluster 0
    _Ctr += 1
    Ctr = np.zeros(len(Xtr), int)
    Ctr[tr_inliers] = _Ctr
    _Cts += 1
    Cts = np.zeros(len(Xts), int)
    Cts[ts_inliers] = _Cts
    Dtr = m.transform(Xtr)
    Dts = m.transform(Xts)
    # one hot encoding of the cluster assignments
    Ctr = np.asarray([[int(c == i) for c in Ctr] for i in range(k_all + 1)]).T
    Cts = np.asarray([[int(c == i) for c in Cts] for i in range(k_all + 1)]).T
    Xtr_ = np.c_[Ctr, Dtr]
    Xts_ = np.c_[Cts, Dts]

    print('clustering across revenue classes...')
    k_across = 3
    y = tr['y'].to_numpy()
    Dtrs = []
    Dtss = []
    for klass in range(1, 6):
        m = KMeans(k_across)
        m.fit(Xtr[np.logical_and(tr_inliers, y == klass)])
        Dtrs.append(np.amin(m.transform(Xtr), 1))
        Dtss.append(np.amin(m.transform(Xts), 1))
    Dtrs = np.asarray(Dtrs).T
    Dtss = np.asarray(Dtss).T
    Xtr_ = np.c_[Xtr_, Dtrs]
    Xts_ = np.c_[Xts_, Dtss]

    names = ['cluster-%d' % i for i in range(k_all + 1)] + \
            ['cluster-dist-%d' % i for i in range(k_all)] + \
            ['cluster-class-dist-%d' % i for i in range(1, 6)]
    return pd.DataFrame(Xtr_, columns=names), pd.DataFrame(Xts_, columns=names)
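# Standalone sketch of the mean-kNN-distance outlier rule used in run()
# above, on synthetic 2D points; the toy data is an assumption, and 0.02
# is the same threshold run() applies to lat/lon coordinates.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
pts = rng.normal(0.0, 0.005, size=(100, 2))   # one tight cluster
pts = np.vstack([pts, [[1.0, 1.0]]])          # plus one far-away point

nn = NearestNeighbors(n_neighbors=10).fit(pts)
dist, _ = nn.kneighbors(pts)
mean_dist = np.mean(dist[:, 1:], axis=1)      # drop the self-distance
print(np.where(mean_dist >= 0.02)[0])         # -> [100], the far point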
def fast_knn(X, n_clusters=5, n_neighbors=None, graph_mode='distance',
             cluster_mode='spectral', algorithm='brute', n_jobs=1,
             random_state=1234, force_sklearn=False):
    r"""k-nearest-neighbors with an optional cuML (GPU) backend.

    Arguments:
      X : `ndarray` or tuple of (X, y)
      n_neighbors : int, optional
        The top K closest datapoints you want the algorithm to return.
        Currently, this value must be < 1024. Defaults to `n_clusters`
        when not given.
      graph_mode : {'distance', 'connectivity'}, default='distance'
        This mode decides which values `kneighbors_graph` will return:
        - 'connectivity' : will return the connectivity matrix with ones
          and zeros (for 'SpectralClustering').
        - 'distance' : will return the distances between neighbors
          according to the given metric (for 'DBSCAN').
      cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans'},
        default='spectral'
        This mode decides how to generate the cluster prediction from
        the neighbors graph.
      algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to the :meth:`fit` method.
        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.
    """
    # capture all keyword arguments, then strip out the ones that are not
    # NearestNeighbors constructor parameters
    kwargs = dict(locals())
    X = kwargs.pop('X')
    force_sklearn = kwargs.pop('force_sklearn')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(force_sklearn)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors
        kwargs['n_gpus'] = kwargs['n_jobs']
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        from sklearn.neighbors import NearestNeighbors
    ## fitting
    knn = NearestNeighbors(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
    # patch the transform/predict API onto the fitted estimator
    knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
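# Hypothetical usage of fast_knn(); nn_predict, nn_transform, and the other
# patched helpers are assumed to be defined elsewhere in this module, and
# the random data is illustrative only.
import numpy as np

X = np.random.RandomState(1234).randn(500, 8)
knn = fast_knn(X, n_clusters=5, cluster_mode='spectral')
labels = knn.predict(X)   # delegates to the patched nn_predict
D = knn.transform(X)      # delegates to the patched nn_transform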
# 5. Hyperparameter optimization (abridged)
from sklearn.model_selection import GridSearchCV

params = {'n_neighbors': range(1, 10)}
mdl = KNeighborsClassifier()
grid = GridSearchCV(mdl, param_grid=params)
grid.fit(X, y)
print('Best parameters:', grid.best_params_)
print('Best score:', grid.best_score_)

mdl = grid.best_estimator_

# 6. Evaluate the model
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_)

# Related class:
# KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto',
#                      leaf_size=30, p=2, metric='minkowski',
#                      metric_params=None, n_jobs=1, **kwargs)
# n_neighbors: int, optional (default 5)
# weights: str or callable, optional (default 'uniform')
#     Weight function used in prediction. Possible values:
#     - 'uniform': uniform weights; every point in a neighborhood is
#       weighted equally.
#     - 'distance': weight points by the inverse of their distance, so
#       closer neighbors have a greater influence on the prediction.
#     - [callable]: a user-defined function that takes an array of
#       distances and returns an array of the same shape containing the
#       weights.
# algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional (default 'auto')
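# Small illustration of the `weights` options documented above; the
# inverse-distance callable mirrors what weights='distance' does, with a
# small epsilon to avoid dividing by a zero distance. The iris split is
# an assumption for illustration.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

for w in ('uniform', 'distance', lambda d: 1.0 / (d + 1e-9)):
    clf = KNeighborsClassifier(n_neighbors=5, weights=w).fit(X_tr, y_tr)
    print(w, clf.score(X_te, y_te))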
# In[45]:

# accuracy
train_X = trainNorm[['zinventorygrowth', 'zpopulationgrowth']]
train_y = trainNorm['yoygtenp']
valid_X = validNorm[['zinventorygrowth', 'zpopulationgrowth']]
valid_y = validNorm['yoygtenp']

# Train a classifier for different values of k
results = []
for k in range(1, 12):
    knn = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y)
    results.append({
        'k': k,
        'accuracy': accuracy_score(valid_y, knn.predict(valid_X))
    })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

# Retrain with the full dataset --- KNN
retail_X = retailNorm[['zinventorygrowth', 'zpopulationgrowth']]
retail_y = retailNorm['yoygtenp']
knn = KNeighborsClassifier(n_neighbors=4).fit(retail_X, retail_y)

distances, indices = knn.kneighbors(newretailNorm)
print(knn.predict(newretailNorm))
print('Distances', distances)
print('Indices', indices)
print(retailNorm.iloc[indices[0], :])
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support


def knn_predictor(x_train, y_train, x_test, y_test):
    # KNeighborsClassifier supports fit(X, y), score(), and predict();
    # the unsupervised NearestNeighbors does not
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    f1 = precision_recall_fscore_support(y_test, clf.predict(x_test),
                                         average='weighted')[2]
    print(accuracy, f1)
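# Possible invocation of knn_predictor(); the breast-cancer split below is
# an assumption for illustration.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
knn_predictor(x_train, y_train, x_test, y_test)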