class OPTICS(Verbose):
    """OPTICS cluster ordering with DBSCAN-style label extraction.

    ``fit`` walks the data set and records, for every point, its core
    distance and reachability distance together with the order in which
    the points were visited.  ``extract`` then turns that ordering into
    flat cluster labels for a given ``eps``.

    State created by ``fit``:

    * ``results`` -- one ``[processed, core_distance,
      reachability_distance]`` list per point, indexed by point id.
    * ``results_order`` -- point ids in the order they were processed.
    * ``kd_tree`` -- spatial index used for neighbourhood queries and for
      looking points up by id.
    """

    def __init__(self, max_eps=0.5, min_samples=10, metric=euclidean,
                 verbose=True):
        """Store the clustering parameters.

        :param max_eps: radius used for every neighbourhood query; also the
            upper bound accepted by ``extract``.
        :param min_samples: number of neighbours (the point itself
            excluded) required for a point to be a core point.
        :param metric: callable ``metric(a, b)`` returning the distance
            between two points.
        :param verbose: forwarded to the ``Verbose`` base class.
        """
        self.max_eps = max_eps
        self.min_samples = min_samples
        self.metric = metric
        super(OPTICS, self).__init__(verbose)

    def _get_neighbors(self, point_id):
        """Return the eps-neighbourhood of a point and its core distance.

        NOTE(review): this assumes ``kd_tree.query_ball_point`` returns
        ``[distance, point_id]`` entries sorted by ascending distance --
        that is how the entries are indexed below; confirm against KDTree.

        :param point_id: id of the point to query.
        :returns: ``(neighbor_ids, core_distance)``; ``core_distance`` is
            ``np.inf`` when fewer than ``min_samples`` neighbours lie
            within ``max_eps``.
        :rtype: tuple
        """
        point = self.kd_tree[point_id]
        hits = self.kd_tree.query_ball_point(point, self.max_eps)
        # Exclude the query point itself.  It is removed by id rather than
        # by position so that an exact duplicate of the point (also at
        # distance 0) is never dropped in its place.
        neighbors = [hit for hit in hits if hit[1] != point_id]
        if len(neighbors) < self.min_samples:
            core_distance = np.inf
        else:
            # Distance to the min_samples-th nearest neighbour.
            core_distance = neighbors[self.min_samples - 1][0]
        return [hit[1] for hit in neighbors], core_distance

    def _update(self, order_seeds, neighbors, point_id):
        """Relax the reachability distance of unprocessed neighbours.

        Callers only invoke this for core points, i.e. when the stored
        core distance of ``point_id`` is finite.

        :param order_seeds: priority queue of ``[reachability, point_id]``.
        :param neighbors: ids of the points in the eps-neighbourhood.
        :param point_id: id of the core point being expanded.
        """
        core_distance = self.results[point_id][1]
        for neighbor in neighbors:
            # A point that has already been processed keeps the
            # reachability distance it had at that time ("first come,
            # first served").
            if self.results[neighbor][0]:
                continue
            self.printer("节点{}尚未处理,计算可达距离".format(neighbor))
            new_reachability_distance = max(
                core_distance,
                self.metric(self.kd_tree[point_id], self.kd_tree[neighbor]))
            # Only ever shorten the stored reachability distance.
            if new_reachability_distance < self.results[neighbor][2]:
                self.printer("节点{}的可达距离从{}缩短至{}".format(
                    neighbor, self.results[neighbor][2],
                    new_reachability_distance))
                self.results[neighbor][2] = new_reachability_distance
                # Per the original author's note, push inserts unseen ids
                # and performs decrease_key for ids already queued.
                order_seeds.push([new_reachability_distance, neighbor])

    def _process_point(self, point_id):
        """Mark a point as processed and record its core distance.

        :param point_id: id of the point to process.
        :returns: ``(neighbor_ids, core_distance)`` from ``_get_neighbors``.
        :rtype: tuple
        """
        neighbors, core_distance = self._get_neighbors(point_id)
        self.printer("节点{}的邻域点数量为{},核心距离为{}".format(
            point_id, len(neighbors), core_distance))
        record = self.results[point_id]
        # The processed flag is set exactly where the id is appended to
        # results_order, so it answers "already recorded?" in O(1) instead
        # of scanning the list.
        first_visit = not record[0]
        record[0] = True
        record[1] = core_distance
        if first_visit:
            self.results_order.append(point_id)
        return neighbors, core_distance

    def _expand_cluste_order(self, point_id):
        """Process ``point_id`` and everything density-reachable from it.

        The start point is processed first.  If it is a core point, its
        neighbours are queued by reachability distance and processed in
        ascending order; every core point met on the way contributes its
        own neighbours to the queue.

        :param point_id: id of the point to start from.
        """
        neighbors, core_distance = self._process_point(point_id)
        if not (core_distance < np.inf):
            return

        self.printer("节点{}为核心点,递归处理其邻域".format(point_id))
        # order_seeds is a heap keyed by reachability distance with the
        # point id as handle.
        order_seeds = Heap(verbose=False)
        order_seeds.heapify([[self.results[x][2], x] for x in neighbors])
        self._update(order_seeds, neighbors, point_id)

        while not order_seeds.is_empty:
            _, current_point_id = order_seeds.pop()
            neighbors, core_distance = self._process_point(current_point_id)
            if core_distance < np.inf:
                # Report the point being expanded now, not the start point.
                self.printer("节点{}为核心点,递归处理其邻域".format(
                    current_point_id))
                self._update(order_seeds, neighbors, current_point_id)

    def fit(self, points):
        """Compute the cluster ordering for ``points``.

        Every point that has not been processed yet is used as the start
        of ``_expand_cluste_order``.  Afterwards ``results`` holds
        ``[processed, core_distance, reachability_distance]`` per point and
        ``results_order`` holds the processing order; together they are the
        input of ``extract``.

        :param points: sequence of fixed-length 1-D numpy arrays.
        :returns: ``self``.
        :rtype: OPTICS
        """
        self.point_num = len(points)
        self.results = [[None, np.inf, np.inf]
                        for _ in range(self.point_num)]
        self.results_order = []

        if self.point_num == 0:
            # Nothing to index or expand; leave empty results so that
            # extract() returns an empty label array.
            self.point_size = 0
            self.kd_tree = None
            return self

        self.point_size = points[0].size
        # The points live in a kd-tree, which serves both the radius
        # queries and the lookup of a point by id.
        self.kd_tree = KDTree(self.point_size)
        self.kd_tree.create(points)

        for point_id in range(self.point_num):
            if not self.results[point_id][0]:
                self._expand_cluste_order(point_id)
        return self

    def extract(self, eps):
        """Derive flat cluster labels from the computed ordering.

        Points are scanned in the order ``fit`` processed them:

        1. reachability distance ``<= eps``: the point gets the current
           cluster id.
        2. reachability distance ``> eps``:

           * core distance ``< eps``: the point opens a new cluster;
           * otherwise the point is noise and gets label ``0``.

        Cluster ids start at ``1``.  The current cluster id is ``0`` until
        the first cluster is opened, so a point matching rule 1 before
        that also ends up with label ``0``.

        :param eps: extraction radius; must not exceed ``max_eps``.
        :returns: one integer label per point, indexed by point id.
        :rtype: numpy.ndarray
        :raises ValueError: if ``eps`` is larger than ``max_eps``.
        """
        if eps > self.max_eps:
            raise ValueError("eps参数不能大于{},当前值为{}".format(self.max_eps, eps))

        labels = np.zeros(self.point_num, dtype=np.int64)
        cluster_id = 0
        for point_id in self.results_order:
            _, core_distance, reachability_distance = self.results[point_id]
            if reachability_distance > eps:
                if core_distance < eps:
                    # Not reachable from the previous cluster but dense
                    # enough to start its own.
                    cluster_id += 1
                    labels[point_id] = cluster_id
                else:
                    labels[point_id] = 0
            else:
                labels[point_id] = cluster_id
        return labels