def cluster(self, data, keys, save_path): self._clust_len = len(self._cluster) - 1 self._basic_cluster(data, keys, self._cluster_tree, 0) _file.op_file(file_path=save_path, data=self._cluster_tree, model='json', method='save') """
def search_tencent(self, dts, root_path, branchs=2, candidate=1, distance=3):
    # Search the clustered files for the entries closest to `dts`.
    #
    # The root index read from `root_path` supplies a list of centre points
    # ('center_point') and a mapping 'fileN' -> path ('festival'). The nearest
    # centres are selected and each one's file is searched with
    # self.similirity; the merged hits are returned best-first.
    #
    # dts       -- query point, compared with data.point_distance
    # root_path -- path of the root index file
    # branchs   -- either an int (number of root centres to visit) or a
    #              sequence whose first item is that number and whose
    #              remaining items are forwarded to self.similirity.
    #              The caller's sequence is never modified.
    # candidate -- maximum number of hits to return
    # distance  -- forwarded to self.similirity
    #
    # Returns a list of hits sorted ascending by their first element,
    # truncated to `candidate` entries.
    root = _file.op_file(root_path, method='read')
    centers = root['center_point']

    # The declared default is a plain int, which the indexing/pop logic could
    # not handle. With an int there are no deeper per-level settings, so
    # similirity is left to apply its own default.
    if isinstance(branchs, int):
        fan_out, deeper = branchs, None
    else:
        fan_out, deeper = branchs[0], list(branchs[1:])

    # NOTE: `data` here is the module-level name, not a parameter.
    gaps = [data.point_distance(dts, center) for center in centers]

    # Rank centre indices by distance. Ranking indices (instead of using the
    # distance as a dict key) keeps centres that are equally far away; ties
    # resolve to the lower index because sorted() is stable.
    nearest = sorted(range(len(centers)), key=gaps.__getitem__)

    all_res = []
    for rank in range(min(len(centers), fan_out)):
        path = root['festival']['file' + str(nearest[rank])]
        if deeper is None:
            found = self.similirity(
                dts, path, candidate=candidate, distance=distance)
        else:
            # A fresh copy per call so one search cannot affect the next.
            found = self.similirity(
                dts, path, list(deeper), candidate, distance)
        all_res.extend(found)

    ranked = sorted(all_res, key=lambda hit: hit[0])
    return ranked[:candidate]
def take9_file(self, root_path):
    # Cluster the tree files data/tencent/tree3.json .. tree8.json, writing
    # each result to data/tencent/tc_tree<N>.json.
    #
    # root_path -- not used; every path is hard-coded. Kept so existing
    #              callers continue to work.
    #
    # Returns None.
    #
    # Changes from the earlier version: the leftover `print(ord)` (which only
    # printed the repr of the builtin) is gone, the pre-filled bookkeeping
    # dict that was never read is replaced by a local, and the tree is reset
    # by plain rebinding, so a missing instance attribute no longer raises.
    for number in range(3, 9):
        source = 'data/tencent/tree' + str(number) + '.json'
        target = 'data/tencent/tc_tree' + str(number) + '.json'

        loaded = _file.op_file(source, method='read')
        self.cluster(list(loaded.values()), list(loaded.keys()), target)

        # Drop the loaded file before the next one is read.
        del loaded

        # Start the next file from an empty tree.
        self._cluster_tree = {
            "position": 'root',
            "festival": [],
            "center_point": None
        }
def tencent(self, data, words, clusters=None):
    # Split `data` into k-means clusters and write one JSON file per cluster
    # plus a root index file.
    #
    # data     -- samples given to KMeans; data[i] is stored under words[i]
    # words    -- key for each sample, indexed in parallel with `data`
    # clusters -- list whose first item is the number of clusters. That item
    #             is popped from a caller-supplied list, as before. When
    #             omitted, 5 clusters are used.
    #
    # Returns None. Writes data/tree<N>.json for each cluster and
    # data/root.json holding the centres and the 'fileN' -> path mapping.

    # A fresh list per call: the old shared default `[5]` was emptied by the
    # pop below, so a second call relying on the default raised IndexError.
    if clusters is None:
        clusters = [5]
    one = clusters.pop(0)

    km = KMeans(init="k-means++", n_clusters=one)
    km.fit_predict(data)

    _kmeans_tree = {
        "position": "root",
        "center_point": [list(center) for center in km.cluster_centers_],
        "festival": {},
    }

    # Split all samples by cluster label into per-cluster dictionaries.
    class_data = {
        'file' + str(j): {} for j in range(len(km.cluster_centers_))
    }
    for row, label in enumerate(km.labels_):
        class_data['file' + str(label)][words[row]] = data[row]

    # Save each cluster to its own file.
    for idx in range(one):
        key = 'file' + str(idx)
        save_path = 'data/tree' + str(idx) + '.json'
        _kmeans_tree['festival'][key] = save_path
        _file.op_file(file_path=save_path, data=class_data[key],
                      model='json', method='save')
        # Release the cluster once it has been saved.
        del class_data[key]

    # Save the root lookup file.
    _file.op_file(file_path='data/root.json', data=_kmeans_tree,
                  model='json', method='save')
def similirity(self, data, file_path, branchs=None, candidate=3, distance=15):
    # Search the tree stored at `file_path` for entries close to `data`.
    #
    # data      -- query passed to self.search_tree
    # file_path -- JSON tree file to load
    # branchs   -- stored as self._search_branch; [2, 2] when omitted
    # candidate -- maximum number of hits to return
    # distance  -- stored as self._max_dist
    #
    # Returns the contents of self._search_result sorted ascending by each
    # item's first element, truncated to `candidate` entries.

    # Build the default per call. The previous shared default list was stored
    # on the instance, so any in-place change made through
    # self._search_branch would have leaked into every later call.
    if branchs is None:
        branchs = [2, 2]

    self._max_dist = distance
    self._search_branch = branchs
    self._search_result = []

    tree = _file.op_file(file_path, model='json', method='read')
    # search_tree is presumably what fills self._search_result -- it is not
    # visible here, confirm.
    self.search_tree(data, tree)

    ranked = sorted(self._search_result, key=lambda hit: hit[0])
    # Slicing already caps at the list length, so no explicit bound is needed.
    return ranked[:candidate]