Пример #1
0
class OPTICS(Verbose):
    """OPTICS density-based cluster ordering.

    ``fit`` computes a core distance and a reachability distance for every
    input point and records the order in which the points were processed.
    ``extract`` turns that ordering into flat cluster labels for any ``eps``
    that does not exceed ``max_eps``.
    """

    def __init__(self,
                 max_eps=0.5,
                 min_samples=10,
                 metric=euclidean,
                 verbose=True):
        """Store the clustering parameters.

        :param max_eps: radius used for every neighborhood query; also the
            largest ``eps`` accepted by ``extract``.
        :param min_samples: number of neighbors (the point itself excluded)
            that must lie within ``max_eps`` for a point to be a core point.
        :param metric: callable taking two points and returning their
            distance; used when computing reachability distances.
        :param verbose: forwarded unchanged to ``Verbose.__init__``
            (presumably toggles the output of ``self.printer``).
        """
        self.max_eps = max_eps
        self.min_samples = min_samples
        self.metric = metric
        super(OPTICS, self).__init__(verbose)

    def _get_neighbors(self, point_id):
        """Return the ``max_eps``-neighborhood of a point and its core distance.

        :param point_id: index of the query point in the KD-tree.
        :returns: ``(neighbor_ids, core_distance)``; ``core_distance`` is
            ``np.inf`` when fewer than ``min_samples`` neighbors were found.
        :rtype: tuple

        """
        point = self.kd_tree[point_id]
        # NOTE(review): the indexing below assumes query_ball_point returns
        # (distance, point_id) pairs sorted by ascending distance, with the
        # query point itself first -- confirm against KDTree.
        neighbors = self.kd_tree.query_ball_point(point, self.max_eps)
        # Exclude the query point itself from its own neighborhood.
        neighbors.pop(0)

        if len(neighbors) < self.min_samples:
            core_distance = np.inf
        else:
            # Distance to the min_samples-th nearest other point.
            core_distance = neighbors[self.min_samples - 1][0]

        return [x[1] for x in neighbors], core_distance

    def _update(self, order_seeds, neighbors, point_id):
        """Relax the reachability distance of the unprocessed neighbors.

        Callers only invoke this for core points, i.e. when the stored core
        distance of ``point_id`` is finite.

        :param order_seeds: priority queue of ``[reachability, point_id]``
            entries that receives every improved neighbor.
        :param neighbors: ids of the points in the neighborhood of
            ``point_id``.
        :param point_id: id of the core point being expanded.
        :returns: None
        """
        core_distance = self.results[point_id][1]
        center = self.kd_tree[point_id]
        for neighbor in neighbors:
            record = self.results[neighbor]
            # Points that were already processed keep their reachability
            # distance ("first come, first served").
            if record[0]:
                continue
            self.printer("节点{}尚未处理,计算可达距离".format(neighbor))
            new_reachability_distance = max(
                core_distance, self.metric(center, self.kd_tree[neighbor]))

            # Only a strictly shorter reachability distance is recorded.
            if new_reachability_distance < record[2]:
                self.printer("节点{}的可达距离从{}缩短至{}".format(
                    neighbor, record[2], new_reachability_distance))
                record[2] = new_reachability_distance
                # Original author's note: insert for new entries,
                # decrease_key for existing ones (handled by Heap.push).
                order_seeds.push([new_reachability_distance, neighbor])

    def _process_point(self, point_id):
        """Mark one point as processed and store its core distance.

        The point is appended to ``results_order`` only the first time it is
        processed, so the ordering never contains duplicates.

        :param point_id: id of the point to process.
        :returns: ``(neighbor_ids, core_distance)`` as produced by
            ``_get_neighbors``.
        :rtype: tuple
        """
        neighbors, core_distance = self._get_neighbors(point_id)
        self.printer("节点{}的邻域点数量为{},核心距离为{}".format(
            point_id, len(neighbors), core_distance))
        record = self.results[point_id]
        # The processed flag is only ever set here, so it is an O(1)
        # stand-in for "point_id is already in results_order".
        first_visit = not record[0]
        record[0] = True
        record[1] = core_distance
        if first_visit:
            self.results_order.append(point_id)
        return neighbors, core_distance

    def _expand_cluste_order(self, point_id):
        """Process ``point_id`` and everything density-reachable from it.

        The start point is processed first.  If it is a core point, its
        neighbors are queued by reachability distance and processed in
        ascending order; every core point found along the way contributes
        its own neighbors to the queue.

        :param point_id: id of the point the expansion starts from.
        :returns: None

        """
        neighbors, core_distance = self._process_point(point_id)
        if not core_distance < np.inf:
            return

        self.printer("节点{}为核心点,递归处理其邻域".format(point_id))
        # Priority queue keyed by reachability distance, with the point id
        # as the handle.
        order_seeds = Heap(verbose=False)
        order_seeds.heapify([[self.results[x][2], x] for x in neighbors])
        self._update(order_seeds, neighbors, point_id)
        while not order_seeds.is_empty:
            _, current_point_id = order_seeds.pop()
            # The initial heapify may seed already-processed points, and a
            # point may sit in the queue more than once.  Processing such a
            # point again cannot lower any reachability distance, so skip
            # the redundant neighborhood query.
            if self.results[current_point_id][0]:
                continue
            neighbors, core_distance = self._process_point(current_point_id)
            if core_distance < np.inf:
                self.printer(
                    "节点{}为核心点,递归处理其邻域".format(current_point_id))
                self._update(order_seeds, neighbors, current_point_id)

    def fit(self, points):
        """Compute the OPTICS ordering of ``points``.

        Every point that has not been processed yet is used as the start of
        an expansion (see ``_expand_cluste_order``): its core distance and
        ``max_eps``-neighborhood are computed, the reachability distances of
        the neighbors are updated, and the unprocessed neighbors are then
        handled in order of increasing reachability distance.

        After the call ``self.results[i]`` holds
        ``[processed flag, core distance, reachability distance]`` for point
        ``i`` and ``self.results_order`` holds the ids in processing order;
        together they are what ``extract`` needs to build cluster labels.

        :param points: [list] input data; every element is a 1-D numpy
            array of the same length.  An empty list yields an empty result.
        :returns: the fitted estimator itself.
        :rtype: OPTICS

        """
        self.point_num = len(points)
        self.results = [[None, np.inf, np.inf] for _ in range(self.point_num)]
        self.results_order = []
        if self.point_num == 0:
            # Nothing to index or cluster; leave a consistent empty state so
            # that extract() returns an empty label array.
            self.point_size = 0
            self.kd_tree = None
            return self

        self.point_size = points[0].size
        # The points are stored in a KD-tree for the neighborhood queries.
        self.kd_tree = KDTree(self.point_size)
        self.kd_tree.create(points)

        for point_id in range(self.point_num):
            # Expand from every point not reached by an earlier expansion.
            if not self.results[point_id][0]:
                self._expand_cluste_order(point_id)
        return self

    def extract(self, eps):
        """Derive flat cluster labels from the fitted ordering.

        The points are scanned in the order recorded by ``fit`` and each
        one is labelled by comparing its distances against ``eps``:

        1. reachability distance not greater than ``eps``: the point joins
           the current cluster;
        2. reachability distance greater than ``eps``:
           2-1. core distance smaller than ``eps``: the point starts a new
                cluster;
           2-2. otherwise the point is noise (label 0).

        :param eps: extraction radius; must not exceed ``max_eps``.
        :returns: one label per input point, indexed by point id; clusters
            are numbered from 1 and 0 marks noise.
        :rtype: numpy.ndarray of int64
        :raises ValueError: if ``eps`` is greater than ``max_eps``.

        """
        if eps > self.max_eps:
            raise ValueError("eps参数不能大于{},当前值为{}".format(self.max_eps, eps))
        # Every label starts as 0, which doubles as the noise label.
        labels = np.zeros(self.point_num, dtype=np.int64)
        cluster_id = 0
        for point_id in self.results_order:
            _, core_distance, reachability_distance = self.results[point_id]
            if reachability_distance > eps:
                if core_distance < eps:
                    # Not reachable from the current cluster, but dense
                    # enough to open a new one.
                    cluster_id += 1
                    labels[point_id] = cluster_id
                # Otherwise the point is noise and keeps label 0.
            else:
                # Reachable: the point belongs to the cluster that is
                # current at this position of the ordering.
                labels[point_id] = cluster_id

        return labels