Пример #1
0
class KNN(object):
    """Thin wrapper around scikit-learn's k-nearest-neighbours estimators.

    Depending on ``task_type`` the wrapper builds either a
    ``KNeighborsClassifier`` ("cla") or a ``KNeighborsRegressor`` ("reg")
    and stores it in ``self.model``. ``module_type`` selects how many
    parallel jobs the estimator is given.
    """

    def __init__(self, task_type="cla", module_type="performance", **params):
        """Build the underlying estimator.

        :param task_type: "cla" for classification, "reg" for regression.
        :param module_type: parallelism profile; one of "debug" (1 job),
            "performance" (``cpu_count()`` jobs), "balance" (half of
            ``cpu_count()``, at least 1) or None (``n_jobs=None``).
        :param params: optional estimator settings. Recognised keys and
            their fallbacks: n_neighbors (5), weights ('uniform'),
            algorithm ('auto'), leaf_size (30), p (2),
            metric ('minkowski'), metric_params (None).
            Any "n_jobs" entry is overwritten by the ``module_type`` choice.
        """
        assert task_type in ["cla", "reg"]  # the two supported task types
        assert module_type in ["balance", "debug", "performance",
                               None]  # the three parallelism profiles, or None
        self.module_type = module_type

        if self.module_type == "debug":
            params["n_jobs"] = 1
        elif self.module_type == "performance":  # use every core
            params["n_jobs"] = cpu_count()
        elif self.module_type == "balance":  # use half of the cores
            # On a single-core host ``cpu_count() // 2`` is 0, and joblib
            # rejects n_jobs == 0, so never go below one job.
            params["n_jobs"] = max(1, cpu_count() // 2)
        else:
            params["n_jobs"] = None

        self.task_type = task_type
        # weights: {"uniform", "distance", None}; "uniform" is the default.
        #   "uniform"  - all neighbours weigh the same
        #   "distance" - neighbours are weighted by the inverse of distance
        # algorithm: {"auto", "ball_tree", "kd_tree", "brute", None}
        #   "ball_tree" - use the BallTree algorithm
        #   "kd_tree"   - use the KDTree algorithm
        #   "brute"     - use brute-force search
        # p: int, default 2; power parameter of the Minkowski metric.
        #   p=1 is manhattan_distance (l1), p=2 is euclidean_distance (l2);
        #   any other p uses minkowski_distance (l_p).
        estimator_kwargs = dict(
            n_neighbors=params.get("n_neighbors", 5),
            weights=params.get("weights", 'uniform'),
            algorithm=params.get("algorithm", 'auto'),
            leaf_size=params.get("leaf_size", 30),  # leaf size of the tree
            p=params.get("p", 2),
            metric=params.get("metric", 'minkowski'),
            metric_params=params.get("metric_params", None),
            n_jobs=params.get("n_jobs", None),  # number of parallel jobs
        )

        # Both estimators take the same keyword arguments, so only the
        # class differs between the two task types.
        if self.task_type == "cla":
            self.model = KNeighborsClassifier(**estimator_kwargs)
        else:
            self.model = KNeighborsRegressor(**estimator_kwargs)

    def fit(self, x, y=None):
        """Fit the wrapped estimator on ``x`` (and targets ``y``)."""
        self.model.fit(X=x, y=y)

    def get_params(self):
        """Return the wrapped estimator's parameters (``deep=True``)."""
        return self.model.get_params(deep=True)

    def set_params(self, params):
        """Forward the ``params`` dict to the wrapped estimator's ``set_params``."""
        self.model.set_params(**params)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.model.predict(X=x)

    def predict_proba(self, x):
        """Return class probabilities for ``x``.

        :raises ValueError: if the wrapper was built for a regression task,
            which has no probability output.
        """
        if self.task_type == "cla":
            return self.model.predict_proba(X=x)
        # The exception must actually be raised; merely constructing it
        # would make this method silently return None.
        raise ValueError("回归任务无法使用")

    def get_score(self, x, y, sample_weight=None):
        """Return the wrapped estimator's ``score`` for ``x`` / ``y``.

        :param sample_weight: optional per-sample weights, passed through
            unchanged; defaults to None.
        """
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def search_kneighbors(self,
                          x=None,
                          n_neighbors=None,
                          return_distance=True):
        """Find the k nearest neighbours via the estimator's ``kneighbors``.

        All arguments are forwarded unchanged.
        """
        return self.model.kneighbors(X=x,
                                     n_neighbors=n_neighbors,
                                     return_distance=return_distance)

    def get_kneighbors_graph(self,
                             x=None,
                             n_neighbors=None,
                             mode='connectivity'):
        """Build the nearest-neighbour graph via ``kneighbors_graph``.

        :param x: query points, forwarded as ``X``.
        :param n_neighbors: number of neighbours, forwarded unchanged.
        :param mode: "distance" or "connectivity".
        :return: whatever the wrapped estimator's ``kneighbors_graph`` returns.
        """
        return self.model.kneighbors_graph(X=x,
                                           n_neighbors=n_neighbors,
                                           mode=mode)
# Train and inspect a k-nearest-neighbours regressor.
# Relies on x_train_with_names, x_test_with_names, y_train, y_test,
# KNeighborsRegressor and pdb being defined/imported earlier in the file.

# Feature tables without the 'name' column.
x_train = x_train_with_names.drop('name', axis=1)
x_test = x_test_with_names.drop('name', axis=1)

# parameters
n_neighbors = 5
# building the model
neigh = KNeighborsRegressor(n_neighbors=n_neighbors)

# training the model
neigh.fit(x_train, y_train)

# looking at the methods
get_params = neigh.get_params()  # returns the parameters of the model
# The first array gives the distance between the new data point and the
# k neighbours; the second array gives the sample number of the k neighbours.
kneighbours = neigh.kneighbors(
    x_test.head(1), n_neighbors=n_neighbors
)
# Sparse matrix of the k neighbours for the new data point.
kneighbours_graph = neigh.kneighbors_graph(
    x_test.head(1), n_neighbors=n_neighbors, mode='distance'
)
prediction_array = neigh.predict(x_test)  # predicted test values
# For a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy.
train_score = neigh.score(x_train, y_train)  # R^2 on the training dataset
test_score = neigh.score(x_test, y_test)  # R^2 on the test dataset

print(
    'The R^2 score of the train dataset is: %.3f and the R^2 score of the test dataset is: %.3f'
    % (train_score, test_score))

# NOTE(review): interactive breakpoint left in place for inspecting the
# variables above; remove before running this non-interactively.
pdb.set_trace()
from sklearn.neighbors import KNeighborsRegressor

# Toy one-feature regression: four training points, one query point.
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
X_test = [[1.5]]

# fit() returns the estimator itself, so construction and training chain.
neigh = KNeighborsRegressor(n_neighbors=2).fit(X, y)
print(neigh.predict(X_test))

from sklearn.neighbors import NearestNeighbors

# Three reference points in 3-D; each query below asks for its single
# nearest reference point.
samples = [[0.0, 0.0, 0.0], [0.0, 0.5, 0.0], [1.0, 1.0, 0.5]]
neigh = NearestNeighbors(n_neighbors=1).fit(samples)

# Single query: kneighbors() is called with its default return_distance,
# so distances are reported alongside indices.
print("Q. who’s the closest point to [1,1,1] ?",
      "A. distance, index of point, datatype = ",
      sep="\n")
print(neigh.kneighbors([[1.0, 1.0, 1.0]]))

# Several queries at once, asking for the indices only.
X = [[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]]
print("Q. who’s the closest points to", X, "? (multi version)")
print("A. index list of points = ")
print(neigh.kneighbors(X, return_distance=False))

from sklearn.neighbors import NearestNeighbors

# Three one-feature points; build their 2-nearest-neighbour graph.
X = [[0], [3], [1]]
neigh = NearestNeighbors(n_neighbors=2).fit(X)
A = neigh.kneighbors_graph(X)

print("Computes the (weighted) graph of k-Neighbors for points in X")
print("k = ", 2 - 1, " X = ", X)
print("(point 1, 2) distance")
# Show the graph first as returned, then converted with toarray().
print(A, A.toarray(), sep="\n")