class KNN(object):
    """Wrapper around scikit-learn's k-nearest-neighbours estimators.

    Depending on ``task_type`` the wrapped ``self.model`` is either a
    ``KNeighborsClassifier`` ("cla") or a ``KNeighborsRegressor`` ("reg").
    ``module_type`` selects how many parallel jobs the estimator may use.
    """

    def __init__(self, task_type="cla", module_type="performance", **params):
        """Build the underlying estimator.

        :param task_type: "cla" for classification, "reg" for regression.
        :param module_type: performance profile, one of "debug" (1 job),
            "performance" (one job per CPU core), "balance" (half the CPU
            cores, at least 1) or None (``n_jobs=None``).
        :param params: optional estimator settings: ``n_neighbors``,
            ``weights``, ``algorithm``, ``leaf_size``, ``p``, ``metric``,
            ``metric_params``.  Any ``n_jobs`` given here is overridden by
            the value derived from ``module_type``.
        """
        assert task_type in ["cla", "reg"]  # two task types
        # three performance profiles, or None for no profile
        assert module_type in ["balance", "debug", "performance", None]

        self.module_type = module_type
        if self.module_type == "debug":
            n_jobs = 1
        elif self.module_type == "performance":
            # performance profile: use every CPU core
            n_jobs = cpu_count() or 1
        elif self.module_type == "balance":
            # balanced profile: half of the cores; clamp to 1 because
            # n_jobs=0 is rejected by the parallel backend on 1-core hosts
            n_jobs = max(1, (cpu_count() or 1) // 2)
        else:
            n_jobs = None
        params["n_jobs"] = n_jobs

        self.task_type = task_type

        # weights: {"uniform", "distance", None}; "uniform" is the default.
        #   "uniform"  -> all neighbours weigh the same
        #   "distance" -> neighbours weighted by the inverse of their distance
        # algorithm: {"auto", "ball_tree", "kd_tree", "brute"}
        #   "ball_tree" -> BallTree, "kd_tree" -> KDTree,
        #   "brute"     -> brute-force search
        # p: int, default 2, power parameter of the Minkowski metric.
        #   p=1 is manhattan_distance (l1), p=2 is euclidean_distance (l2),
        #   any other p uses minkowski_distance (l_p).
        estimator_kwargs = dict(
            n_neighbors=params.get("n_neighbors", 5),
            weights=params.get("weights", 'uniform'),
            algorithm=params.get("algorithm", 'auto'),
            leaf_size=params.get("leaf_size", 30),  # leaf size of the tree
            p=params.get("p", 2),
            metric=params.get("metric", 'minkowski'),
            metric_params=params.get("metric_params", None),
            n_jobs=params.get("n_jobs", None),  # number of parallel jobs
        )
        if self.task_type == "cla":
            self.model = KNeighborsClassifier(**estimator_kwargs)
        else:
            self.model = KNeighborsRegressor(**estimator_kwargs)

    def fit(self, x, y=None):
        """Fit the wrapped estimator on ``x`` / ``y``."""
        self.model.fit(X=x, y=y)

    def get_params(self):
        """Return the wrapped estimator's parameters (``deep=True``)."""
        return self.model.get_params(deep=True)

    def set_params(self, params):
        """Set parameters on the wrapped estimator.

        :param params: dict of parameter names to values.
        """
        self.model.set_params(**params)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.model.predict(X=x)

    def predict_proba(self, x):
        """Return class probabilities for ``x``.

        :raises ValueError: if this instance wraps a regressor.
        """
        if self.task_type == "cla":
            return self.model.predict_proba(X=x)
        # The message reads "not available for regression tasks".
        raise ValueError("回归任务无法使用")

    def get_score(self, x, y, sample_weight=None):
        """Return the wrapped estimator's ``score`` on ``x`` / ``y``.

        :param sample_weight: optional per-sample weights passed through
            to the estimator; defaults to None.
        """
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def search_kneighbors(self, x=None, n_neighbors=None, return_distance=True):
        """Find the k nearest neighbours via the estimator's ``kneighbors``.

        :param x: query points passed as ``X``.
        :param n_neighbors: number of neighbours to look up.
        :param return_distance: forwarded to ``kneighbors``.
        :return: whatever ``self.model.kneighbors`` returns.
        """
        return self.model.kneighbors(X=x, n_neighbors=n_neighbors,
                                     return_distance=return_distance)

    def get_kneighbors_graph(self, x=None, n_neighbors=None, mode='connectivity'):
        """Get the nearest-neighbour graph via ``kneighbors_graph``.

        :param x: query points passed as ``X``.
        :param n_neighbors: number of neighbours per point.
        :param mode: "distance" or "connectivity".
        :return: whatever ``self.model.kneighbors_graph`` returns.
        """
        return self.model.kneighbors_graph(X=x, n_neighbors=n_neighbors,
                                           mode=mode)
# Remove the 'name' column; the remaining columns are fed to the model.
x_train = x_train_with_names.drop('name', axis=1)
x_test = x_test_with_names.drop('name', axis=1)

# parameters
n_neighbors = 5

# building the model
neigh = KNeighborsRegressor(n_neighbors=n_neighbors)

# training the model
neigh.fit(x_train, y_train)

# looking at the methods
get_params = neigh.get_params()  # returns the parameters of the model

# The first array gives the distance between the new data point and its
# k neighbours; the second array gives the sample number of those neighbours.
kneighbours = neigh.kneighbors(x_test.head(1), n_neighbors=n_neighbors)

# Returns a sparse matrix of the k neighbours for the new data point.
kneighbours_graph = neigh.kneighbors_graph(
    x_test.head(1), n_neighbors=n_neighbors, mode='distance')

prediction_array = neigh.predict(x_test)  # predicted test values

# For a regressor, score() is the coefficient of determination (R^2),
# not an accuracy, so the report below labels it as such.
train_score = neigh.score(x_train, y_train)  # R^2 on the training dataset
test_score = neigh.score(x_test, y_test)  # R^2 on the test dataset
print(
    'The R^2 score of the train dataset is: %.3f and the R^2 score of the test dataset is: %.3f'
    % (train_score, test_score))

# Presumably left in on purpose so the values computed above (which are
# never printed) can be inspected interactively -- TODO confirm.
pdb.set_trace()
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors

# --- 1. k-NN regression on a tiny 1-D dataset ---------------------------
X, y = [[0], [1], [2], [3]], [0, 0, 1, 1]
X_test = [[1.5]]

neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X, y)
print(neigh.predict(X_test))

# --- 2. Single nearest neighbour among three 3-D samples -----------------
samples = [
    [0., 0., 0.],
    [0., .5, 0.],
    [1., 1., .5],
]

neigh = NearestNeighbors(n_neighbors=1)
neigh.fit(samples)

print("Q. who’s the closest point to [1,1,1] ?")
print("A. distance, index of point, datatype = ")
print(neigh.kneighbors([[1., 1., 1.]]))

# Same fitted model, several query points at once, indices only.
X = [
    [0., 1., 0.],
    [1., 0., 1.],
]
print("Q. who’s the closest points to", X, "? (multi version)")
print("A. index list of points = ")
print(neigh.kneighbors(X, return_distance=False))

# --- 3. k-neighbours graph of a 1-D dataset ------------------------------
X = [[0], [3], [1]]

neigh = NearestNeighbors(n_neighbors=2)
neigh.fit(X)
A = neigh.kneighbors_graph(X)

print("Computes the (weighted) graph of k-Neighbors for points in X")
print("k = ", 2 - 1, " X = ", X)
print("(point 1, 2) distance")
print(A)            # graph as returned by kneighbors_graph
print(A.toarray())  # the same graph converted to a dense array