def getXmlData(self, save=False, relative=True):
    """
    Load id and data from the xml file; optionally cache them as .npy files.
    :return: id, data
    """
    if relative:
        path = os.path.join(Properties.getRootPath() + Properties.getXmlLocation() + self.name + ".xml")
    else:
        path = self.name
    images = parse(path)
    id = []
    data = []
    for node in images.getElementsByTagName(self.tag):
        idNode = node.getElementsByTagName("id")[0].childNodes[0].data
        id.append(idNode)
        dataNode = node.getElementsByTagName("data")[0].childNodes[0].data
        dataNode = dataNode[1:-1].split(',')
        data.append(dataNode)
    id = np.asarray(id)
    id = Series(id)
    # np.float was removed in NumPy 1.24; use the builtin float instead.
    data = np.asarray(list(map(conv, data)), dtype=float)
    if save:
        cache_dir = Properties.getRootPath() + Properties.getDefaultDataFold() + "/cache/" + self.name
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        # The original concatenated getRootPath() twice here, producing an invalid path.
        np.save(Properties.getRootPath() + "/data/cache/" + self.name + "/id.npy", id)
        np.save(Properties.getRootPath() + "/data/cache/" + self.name + "/data.npy", data)
    return id, data
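# `conv` is referenced above (and `_conv` below) but not defined in this file.
# A minimal module-level sketch of what it plausibly does -- turning a list of
# numeric strings into floats -- follows; the real helper may differ, so treat
# this signature and body as an assumption.
def conv(row):
    # Strip whitespace from each token and convert it to float.
    return [float(v.strip()) for v in row]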
def get_threshold():
    from context.resource_manager import Properties
    from view import shape_view
    from view import plot_utils
    from cluster import density_cluster
    id = np.load(Properties.getRootPath() + "/data/cache/id.npy")
    data = np.load(Properties.getRootPath() + "/data/cache/data.npy")
    image_size = round(math.sqrt(float(data[0].shape[0])))
    data = density_cluster.binary_array(data)
    import multiprocessing
    threshold = DataFrame([], columns=['H', 'd_c', 'cluster'])
    N = 20
    pool = multiprocessing.Pool(processes=N)
    log.info("init " + str(N) + " workers")
    # Note: worker processes receive a pickled copy of `threshold`, so mutations
    # made inside multi_processing_cluster are not visible here; keep the
    # AsyncResult handles so return values can be gathered after join().
    results = []
    for i in range(N):
        results.append(pool.apply_async(density_cluster.multi_processing_cluster, (N, i, threshold, id, data)))
    pool.close()
    pool.join()
    log.debug(threshold)
    csv_dir = Properties.getDefaultDataFold() + "/csv"
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    threshold.to_csv(csv_dir + "/threshold.csv")
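# A minimal sketch of how the workers' partial results could be merged,
# assuming multi_processing_cluster returns a DataFrame with
# ['H', 'd_c', 'cluster'] rows. That return type is an assumption, not
# confirmed by this file; this helper is hypothetical and not in the original.
def _gather_threshold(results):
    import pandas
    # .get() re-raises any worker exception and returns the worker's value.
    return pandas.concat([r.get() for r in results], ignore_index=True)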
def save(name='default'):
    """
    Save the id and data parsed from the xml file as .npy caches.
    :return:
    """
    from context.resource_manager import Properties
    from pandas import DataFrame, Series
    path = os.path.join(Properties.getXmlLocation() + name + ".xml")
    from xml.dom.minidom import parse, parseString
    images = parse(path)
    id = []
    data = []
    for node in images.getElementsByTagName("Image"):
        idNode = node.getElementsByTagName("id")[0].childNodes[0].data
        id.append(idNode)
        dataNode = node.getElementsByTagName("data")[0].childNodes[0].data
        dataNode = dataNode[1:-1].split(',')
        data.append(dataNode)
    id = np.asarray(id)
    id = Series(id)
    data = np.asarray(list(map(conv, data)), dtype=float)
    cache_dir = Properties.getDefaultDataFold() + "/cache/" + name
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    np.save(Properties.getRootPath() + "/data/cache/" + name + "/id.npy", id)
    np.save(Properties.getRootPath() + "/data/cache/" + name + "/data.npy", data)
def get_threshold(name='default'):
    """
    Compute the information entropy for the given dataset.
    :return:
    """
    from context.resource_manager import Properties
    from view import shape_view
    from view import plot_utils
    from cluster import density_cluster_demo
    id = np.load(Properties.getRootPath() + "/data/cache/" + name + "/id.npy")
    data = np.load(Properties.getRootPath() + "/data/cache/" + name + "/data.npy")
    threshold = density_cluster_demo.cluster(id, data, dataset=name)
    # Resolve the timestamped directory once so the mkdir check and the csv
    # write cannot disagree.
    csv_dir = Properties.getDefaultDataFold() + "/csv/" + name + "/" + resource_manager.Properties.name_str_static()
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    threshold.to_csv(csv_dir + "/threshold.csv")
def get_data_from_xml(path=Properties.getImageXmlResource()):
    """
    Parse the xml file and extract the corresponding id and data as the basis
    for computation.
    :param path:
    :return: list, numpy array; the list is used as an index to locate each
             value, the array is used for computation.
    """
    log.info("starting running compute_distance_from_xml function.")
    from context.resource_manager import Properties
    from pandas import DataFrame, Series
    # Note: the path argument is currently overridden here.
    path = os.path.join(Properties.getRootPath(), Properties.getImageXmlResource())
    from xml.dom.minidom import parse, parseString
    images = parse(path)
    id = []
    data = []
    for node in images.getElementsByTagName("Image"):
        idNode = node.getElementsByTagName("id")[0].childNodes[0].data
        id.append(idNode)
        dataNode = node.getElementsByTagName("data")[0].childNodes[0].data
        dataNode = dataNode[1:-1].split(',')
        data.append(dataNode)
    id = np.asarray(id).tolist()
    data = np.asarray(list(map(_conv, data)), dtype=float)
    return id, data
def __init__(self, type):
    self.configParse = config_parser_extender.CapitalCaseConfigParser()
    self.type = type
    if type == 'D':
        self.configParse.read(Properties.getRootPath() + 'conf' + resource_manager.getSeparator() + 'directory.ini')
    elif type == 'F':
        self.configParse.read(Properties.getRootPath() + 'conf' + resource_manager.getSeparator() + 'document.ini')
def save_dataframe_csv(threshold=DataFrame(), name="default", relative=True):
    # Resolve the timestamped directory once so the mkdir check and the csv
    # write cannot disagree.
    csv_dir = Properties.getDefaultDataFold() + "/csv/" + name + "/" + resource_manager.Properties.name_str_static()
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    threshold.to_csv(csv_dir + "/threshold.csv")
def save(self):
    if self.type == 'D':
        # Context manager closes the file handle even if write() raises.
        with open(Properties.getRootPath() + 'conf' + resource_manager.getSeparator() + 'directory.ini', 'w+') as f:
            self.configParse.write(f)
    elif self.type == 'F':
        with open(Properties.getRootPath() + 'conf' + resource_manager.getSeparator() + 'document.ini', 'w+') as f:
            self.configParse.write(f)
def compute_distance(data=np.array([])):
    """
    :param data:
    :return: the pairwise distance matrix as a numpy array, one row per point
    """
    if data.size <= 0:
        # The original raised before logging, which made the log call unreachable.
        log.critical("cluster need the data, and there has no data in numpy array collection.")
        raise Exception("cluster need the data, and there has no data in numpy array collection.")
    else:
        log.info("start running compute distance. data count:" + str(data.shape[0]) + " points to compute.")
        row = data.shape[0]
        result = np.zeros((row, row))
        for i in range(row):
            # The matrix is symmetric, so only the upper triangle is computed.
            for j in range(i + 1, row):
                k = compute_point_distance(data[i], data[j])
                result[i][j] = k
                result[j][i] = k
            result[i][i] = np.nan
        np.save(Properties.getRootPath() + "/data/cache/distance/data.npy", result)
        return result
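# compute_point_distance is called above but not defined in this file. A
# minimal Euclidean-distance sketch follows; the project's actual metric may
# differ, so treat this implementation as an assumption.
def compute_point_distance(a, b):
    # Euclidean distance between two feature vectors.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))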
def __init__(self, dataset):
    self.dataset = dataset
    try:
        self.recorder_csv = pandas.read_csv(
            Properties.getDefaultDataFold() + "/csv/recorder_csv_" + self.dataset + ".csv",
            index_col=0)
    except Exception:
        # Fall back to an empty frame when the csv is missing or unreadable.
        self.recorder_csv = DataFrame([], columns=['id', 'start', 'end', 'd_c', 'max_distance_c',
                                                   'dataset', 'pile_size', 'H', 'note'])
def save():
    from context.resource_manager import Properties
    from pandas import DataFrame, Series
    path = os.path.join(Properties.getRootPath(), Properties.getImageXmlResource())
    from xml.dom.minidom import parse, parseString
    images = parse(path)
    id = []
    data = []
    for node in images.getElementsByTagName("Image"):
        idNode = node.getElementsByTagName("id")[0].childNodes[0].data
        id.append(idNode)
        dataNode = node.getElementsByTagName("data")[0].childNodes[0].data
        dataNode = dataNode[1:-1].split(',')
        data.append(dataNode)
    id = Series(np.asarray(id))
    data = np.asarray(list(map(conv, data)), dtype=float)
    np.save(Properties.getRootPath() + "/data/cache/id.npy", id)
    np.save(Properties.getRootPath() + "/data/cache/data.npy", data)
def loadCacheData(self, name='default', tag="Image", relative=True):
    """
    Return the cached xml data, rebuilding the cache from xml when it is missing.
    :param name:
    :param tag:
    :param relative:
    :return:
    """
    if relative:
        path = os.path.join(Properties.getRootPath() + "/data/cache/" + name)
    else:
        path = name
    try:
        id = np.load(path + "/id.npy")
        data = np.load(path + "/data.npy")
    except IOError:
        # getXmlData takes no name/tag arguments; it reads self.name and self.tag.
        return self.getXmlData(save=True)
    return id, data
def record_expriment(name='path', save_name='default'):
    from context.resource_manager import Properties
    from context import resource_manager
    from view import shape_view
    record_img_path = resource_manager.Properties.getDefaultDataFold() + "result/temp/" + name + "/" + save_name + "/"
    record_csv_path = Properties.getDefaultDataFold() + "/csv/" + name + "/" + save_name + "/"
    path = resource_manager.Properties.getDefaultDataFold() + "/result/" + name + "/" + save_name
    if not os.path.exists(record_csv_path):
        os.makedirs(record_csv_path)
    threshold = pandas.read_csv(record_csv_path + "threshold.csv")
    save_plot(name, threshold, save_name)
    log.debug(threshold['cluster'].sort_values(ascending=False))
    shutil.copytree(record_img_path, path)
    shutil.copy(record_csv_path + "threshold.csv", path)
    # logging's warn() is deprecated; use warning().
    log.warning("finished")
def setValue(self, row, columns, value):
    # DataFrame.set_value was removed in pandas 1.0; .loc supports the same
    # "setting with enlargement" for row labels that do not exist yet.
    self.recorder_csv.loc[row, columns] = value
    self.recorder_csv.loc[row, 'end'] = Properties.name_str_FULL()
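# A tiny self-contained demonstration of the enlargement behaviour relied on
# above (hypothetical labels and values, not part of the recorder itself):
def _demo_enlargement():
    import pandas
    df = pandas.DataFrame(columns=['d_c', 'H'])
    df.loc['run#0', 'd_c'] = 0.5  # creates the 'run#0' row
    df.loc['run#0', 'H'] = 1.2    # fills another cell in the same row
    return df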
def save_show_cluster(index_id, data, distance_c, pile_id, dataset="/", level="INFO", level_info='scatter figure'):
    from view import plot_utils
    from context import resource_manager
    path = resource_manager.Properties.getDefaultDataFold() + "result" + resource_manager.getSeparator() + "temp/" + dataset + "/" + resource_manager.Properties.name_str_static() + "/"
    level_path = resource_manager.Properties.getDefaultDataFold() + "result" + resource_manager.getSeparator() + "temp/" + level + "/" + resource_manager.Properties.name_str_static() + "/" + str(distance_c) + "/"
    if not os.path.exists(path[:path.rfind('/')]):
        os.makedirs(path[:path.rfind('/')])
    if not os.path.exists(level_path[:level_path.rfind('/')]):
        os.makedirs(level_path[:level_path.rfind('/')])
    pile_id = pile_id.sort_values('size', ascending=False)
    x = []
    y = []
    label = []
    i = 1
    for m in range(len(pile_id)):
        l = pile_id.iloc[m]['pile']
        size = pile_id.iloc[m]['size']
        # Use positional indexing consistently; the frame was just re-sorted,
        # so .loc[m] would look up the old label, not the m-th row.
        if not pile_id.iloc[m]['outlier']:
            for node in l:
                index = index_id[node]
                x.append(data[index][0])
                y.append(data[index][1])
                label.append(i)
            i = i + 1
        else:
            # Outliers share label 0 so they plot in a single colour.
            for node in l:
                index = index_id[node]
                x.append(data[index][0])
                y.append(data[index][1])
                label.append(0)
    # Identity comparison (`is`) on string literals is unreliable; use ==.
    if level == "SEE":
        plot_utils.plot_scatter_diagram(None, x=x, y=y, x_label='x', y_label='y', title=level_info, label=label)
    if level == "DEBUG":
        plot_utils.save_all_scatter_diagram(None, x=x, y=y, x_label='x', y_label='y', title=level_info, label=label,
                                            path=level_path + resource_manager.Properties.name_str_FULL() + ".png")
    else:
        plot_utils.save_scatter_diagram(None, x=x, y=y, x_label='x', y_label='y', title='scatter figure', label=label,
                                        path=path + str(distance_c) + ".png")
        plot_utils.save_all_scatter_diagram(None, x=x, y=y, x_label='x', y_label='y', title='scatter figure', label=label,
                                            path=path + str(distance_c) + ".png")
    log.debug("\n" + str(pile_id))
    # Build the csv path before the try block so the except clause can use it.
    p = Properties.getDefaultDataFold() + "/csv/" + dataset + "/" + resource_manager.Properties.name_str_static() + "/" + str(distance_c) + ".csv"
    try:
        pile_id.to_csv(p)
    except IOError:
        # Create the parent directory on demand, then retry.
        if not os.path.exists(p[:p.rfind('/')]):
            os.makedirs(p[:p.rfind('/')])
        pile_id.to_csv(p)
def load_data():
    id = np.load(Properties.getRootPath() + "/data/cache/flame/id.npy")
    data = np.load(Properties.getRootPath() + "/data/cache/flame/data.npy")
    return id, data
def ent_dc_step_by_step(id_index, index_id, data, threshold, distance, distance_c, dataset='none'):
    """
    :param id_index:
    :param index_id:
    :param data:
    :param threshold:
    :param distance:
    :param distance_c:
    :return:
    """
    # TODO modify the clustering without outlier
    i = 0
    N = int(index_id.shape[0])
    max_distance_c = max_distance(distance, distance_c)
    learning_rate = 0
    gradient = 0.00001
    jarge_now = 0
    jarge_pre = 5
    pre = 65535
    # Step size: standard deviation of each point's nearest-neighbour distance.
    temp = distance.copy()
    temp[np.isnan(temp)] = 0
    stand = np.std(temp)
    temp = distance.copy()
    temp[np.isnan(temp)] = stand
    temp = temp.min(axis=0)
    next_distance_c = np.std(temp)
    clusterRecorder = ClusterRecorder(dataset)
    cr_i = str(Properties.name_str_static() + "#" + str(i))
    clusterRecorder.setValue(str(cr_i), 'id', Properties.name_str_static())
    clusterRecorder.setValue(str(cr_i), 'start', Properties.name_str_HMS())
    clusterRecorder.setValue(str(cr_i), 'd_c', distance_c)
    clusterRecorder.setValue(str(cr_i), 'max_distance_c', max_distance_c)
    clusterRecorder.setValue(str(cr_i), 'dataset', dataset)
    clusterRecorder.setValue(str(cr_i), 'pile_size', N)
    clusterRecorder.setValue(str(cr_i), 'H', 65535)
    clusterRecorder.setValue(str(cr_i), 'note', 'total running time of the whole algorithm')
    if learning_rate != 0:
        distance_c = distance_c + learning_rate
    log.debug("init the first max distance_c:" + str(max_distance_c) + " distance shape:" + str(distance.shape))
    start_time = Properties.name_str_HMS()
    while max_distance_c >= distance_c:
        i = i + 1
        last_time = Properties.name_str_HMS()
        # Track each pile's members and its member count.
        pile_id = DataFrame([], columns=['p_id', 'pile', 'size', 'outlier'])
        data_id = DataFrame([], columns=['i_id', 'j_id', 'rho', 'delta', 'gamma', 'i', 'j', 'pile'],
                            index=id_index.values)
        pile_id = pile_function(pile_id, id_index, index_id, data_id, distance, distance_c, next_distance_c)
        pile_size = pile_id['size']
        pile = pile_id.shape[0] - np.sum(pile_id['outlier'])
        e = _calc_ent(pile_size.values / N)
        merge = list([e, distance_c, pile])
        threshold = add_row(threshold, merge)
        jarge_now = pre - e
        if jarge_now > 0:
            # The entropy dropped: record this step and save the cluster plot.
            cr_j = str(Properties.name_str_static() + "#" + str(i))
            clusterRecorder.setValue(str(cr_j), 'id', Properties.name_str_static())
            clusterRecorder.setValue(str(cr_j), 'start', last_time)
            clusterRecorder.setValue(str(cr_j), 'd_c', distance_c)
            clusterRecorder.setValue(str(cr_j), 'max_distance_c', max_distance_c)
            clusterRecorder.setValue(str(cr_j), 'dataset', dataset)
            clusterRecorder.setValue(str(cr_j), 'pile_size', len(pile_id))
            clusterRecorder.setValue(str(cr_j), 'H', e)
            clusterRecorder.setValue(str(cr_j), 'note', 'a new entropy drop was found')
            clusterRecorder.setValue(str(cr_j), 'end', Properties.name_str_HMS())
            save_show_cluster(index_id, data, distance_c, pile_id)
            # This second record reuses the same cr_j key, so it overwrites the
            # record above, replacing 'start' with the time of the previous drop.
            cr_j = str(Properties.name_str_static() + "#" + str(i))
            clusterRecorder.setValue(str(cr_j), 'id', Properties.name_str_static())
            clusterRecorder.setValue(str(cr_j), 'start', start_time)
            clusterRecorder.setValue(str(cr_j), 'd_c', distance_c)
            clusterRecorder.setValue(str(cr_j), 'max_distance_c', max_distance_c)
            clusterRecorder.setValue(str(cr_j), 'dataset', dataset)
            clusterRecorder.setValue(str(cr_j), 'pile_size', len(pile_id))
            clusterRecorder.setValue(str(cr_j), 'H', e)
            clusterRecorder.setValue(str(cr_j), 'note', 'a new entropy drop was found')
            clusterRecorder.setValue(str(cr_j), 'end', Properties.name_str_HMS())
            save_show_cluster(index_id, data, distance_c, pile_id)
            start_time = Properties.name_str_HMS()
        pre = e
        distance_c = distance_c + next_distance_c
        if e == 0:
            log.debug("e is 0.")
            break
        log.info(str(i) + " time, finished the next_distance_c about: " + str(next_distance_c) +
                 " distance_c:" + str(distance_c) + " next-learning_rate:" + str(learning_rate) +
                 " H:" + str(e))
    clusterRecorder.setValue(str(cr_i), 'end', Properties.name_str_HMS())
    clusterRecorder.save()
    log.debug(threshold)
    return threshold
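# _calc_ent and add_row are used above but not defined in this file. Minimal
# sketches follow, assuming _calc_ent computes the Shannon entropy of a
# probability vector and add_row appends one ['H', 'd_c', 'cluster'] row to the
# threshold DataFrame; both signatures are assumptions.
def _calc_ent(p):
    # Shannon entropy H = -sum(p * log2(p)), ignoring zero-probability bins.
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return float(-np.sum(p * np.log2(p)))


def add_row(frame, row):
    # Append one row (a list of values matching frame's columns) and return
    # the frame; relies on a default RangeIndex.
    frame.loc[len(frame)] = row
    return frame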
    rho_id = Series(rho, index=id)
    log.critical(rho)
"""
from cluster import density_cluster
from pandas import Series, DataFrame
from context.resource_manager import Properties
from view import shape_view

name = 'path'
distance_c = 12.3972318748
m = '3_44'
pile = 0
id = np.load(Properties.getRootPath() + "/data/cache/" + name + "/id.npy")
data = np.load(Properties.getRootPath() + "/data/cache/" + name + "/data.npy")
id_index = Series(id.tolist())
index_id = Series(id_index.index, index=id_index.values)
distance = density_cluster.compute_distance(data)
pile_id = DataFrame([], columns=['pile', 'size'])
rho_id = density_cluster.rho_function(index_id, distance, distance_c=distance_c)
rho_id = Series(rho_id, index=index_id.index)
rho_id = rho_id.sort_values(ascending=False)
log.debug(rho_id)
pile = ['3_44']
pile_max = 14
pile = density_cluster.pile_brother(index_id, id_index, distance, distance_c, pile, pile_max)
log.debug("pile info:")
def save_show_cluster(index_id, data, distance_c, pile_id):
    from view import plot_utils
    from context import resource_manager
    sep = resource_manager.getSeparator()
    # Resolve the timestamped output directory once so the mkdir check and the
    # file path cannot disagree.
    out_dir = (resource_manager.Properties.getDefaultDataFold() + "result" + sep + "temp" + sep
               + resource_manager.Properties.name_str_static() + sep)
    path = out_dir + str(distance_c) + ".png"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    pile_id = pile_id.sort_values('size', ascending=False)
    x = []
    y = []
    label = []
    i = 1
    for m in range(len(pile_id)):
        l = pile_id.iloc[m]['pile']
        size = pile_id.iloc[m]['size']
        if size >= 1 and i < 15:
            for node in l:
                index = index_id[node]
                x.append(data[index][0])
                y.append(data[index][1])
                label.append(i)
            i = i + 1
        else:
            for node in l:
                index = index_id[node]
                x.append(data[index][0])
                y.append(data[index][1])
                label.append(0)
    plot_utils.save_scatter_diagram(None, x=x, y=y, x_label='x', y_label='y', title='scatter figure',
                                    label=label, path=path)
    log.debug(pile_id)
    p = (Properties.getDefaultDataFold() + "/csv/" + resource_manager.Properties.name_str_static()
         + "/" + str(distance_c) + ".csv")
    try:
        pile_id.to_csv(p)
    except IOError:
        # Check for the parent directory (not the file itself) before creating it.
        parent = p[:p.rfind('/')]
        if not os.path.exists(parent):
            os.makedirs(parent)
        pile_id.to_csv(p)
def reload(self):
    # The original tested the builtin `type` instead of self.type, so neither
    # branch could ever run.
    if self.type == 'D':
        self.configParse.read(Properties.getRootPath() + 'conf' + resource_manager.getSeparator() + 'directory.ini')
    elif self.type == 'F':
        self.configParse.read(Properties.getRootPath() + 'conf' + resource_manager.getSeparator() + 'document.ini')
def save(self):
    self.recorder_csv.to_csv(Properties.getDefaultDataFold() + "/csv/recorder_csv_" + self.dataset + ".csv")
def removeConfigFiles(cls):
    for ini in ('directory.ini', 'document.ini'):
        ini_path = Properties.getRootPath() + 'conf' + resource_manager.getSeparator() + ini
        if os.path.isfile(ini_path):
            os.remove(ini_path)
def ent_dc_step_by_step(id_index, index_id, data, threshold, distance, distance_c, dataset='none'):
    """
    :param id_index:
    :param index_id:
    :param data:
    :param threshold:
    :param distance:
    :param distance_c:
    :return:
    """
    # TODO modify the clustering without outlier
    i = 0
    level = "INFO"
    N = int(index_id.shape[0])
    max_distance_c = max_distance(distance, distance_c)
    learning_rate = 0
    gradient = 0.00001
    jarge_now = 0
    jarge_pre = 5
    pre = 65535
    # Step size: standard deviation of each point's nearest-neighbour distance.
    temp = distance.copy()
    temp[np.isnan(temp)] = 0
    stand = np.std(temp)
    temp = distance.copy()
    temp[np.isnan(temp)] = stand
    temp = temp.min(axis=0)
    next_distance_c = np.std(temp)
    clusterRecorder = ClusterRecorder(dataset)
    cr_i = str(Properties.name_str_static() + "#" + str(i))
    clusterRecorder.setValue(str(cr_i), 'id', Properties.name_str_static())
    clusterRecorder.setValue(str(cr_i), 'start', Properties.name_str_HMS())
    clusterRecorder.setValue(str(cr_i), 'd_c', distance_c)
    clusterRecorder.setValue(str(cr_i), 'max_distance_c', max_distance_c)
    clusterRecorder.setValue(str(cr_i), 'dataset', dataset)
    clusterRecorder.setValue(str(cr_i), 'pile_size', N)
    clusterRecorder.setValue(str(cr_i), 'H', 65535)
    clusterRecorder.setValue(str(cr_i), 'note', 'total running time of the whole algorithm')
    if learning_rate != 0:
        distance_c = distance_c + learning_rate
    start_time = Properties.name_str_HMS()
    while max_distance_c >= distance_c:
        i = i + 1
        last_time = Properties.name_str_HMS()
        # Track each pile's members and its member count.
        pile_id = DataFrame([], columns=['p_id', 'pile', 'size', 'outlier'])
        data_id = DataFrame([], columns=['i_id', 'j_id', 'rho', 'delta', 'gamma', 'i', 'j', 'pile'],
                            index=id_index.values)
        pile_id = pile_function(pile_id, id_index, index_id, data, distance, distance_c, next_distance_c, dataset)
        pile_size = pile_id['size']
        pile = pile_id.shape[0] - np.sum(pile_id['outlier'])
        # Pool all outlier piles into a single bin before computing the entropy.
        e = []
        e_outlier = 0
        pile_id = pile_id.sort_values('size', ascending=False)
        pile_id = pile_id.reset_index(drop=True)
        # The original reused `i` as the loop variable here, clobbering the
        # outer iteration counter used in the recorder keys.
        for k in range(0, len(pile_id)):
            if not pile_id.loc[k, 'outlier']:
                e.append(pile_id.loc[k, 'size'])
            else:
                e_outlier += pile_id.loc[k, 'size']
        if e_outlier > 0:
            e.append(e_outlier)
        ee = np.array(e)
        e = _calc_ent(ee / N)
        merge = list([e, distance_c, pile])
        threshold = add_row(threshold, merge)
        jarge_now = pre - e
        if jarge_now > 0:
            # The entropy dropped: record this step and save the cluster plot.
            cr_j = str(Properties.name_str_static() + "#" + str(i))
            clusterRecorder.setValue(str(cr_j), 'id', Properties.name_str_static())
            clusterRecorder.setValue(str(cr_j), 'start', last_time)
            clusterRecorder.setValue(str(cr_j), 'd_c', distance_c)
            clusterRecorder.setValue(str(cr_j), 'max_distance_c', max_distance_c)
            clusterRecorder.setValue(str(cr_j), 'dataset', dataset)
            clusterRecorder.setValue(str(cr_j), 'pile_size', len(pile_id))
            clusterRecorder.setValue(str(cr_j), 'H', e)
            clusterRecorder.setValue(str(cr_j), 'note', 'a new entropy drop was found')
            clusterRecorder.setValue(str(cr_j), 'end', Properties.name_str_HMS())
            save_show_cluster(index_id, data, distance_c, pile_id, dataset)
            # This second record reuses the same cr_j key, so it overwrites the
            # record above, replacing 'start' with the time of the previous drop.
            cr_j = str(Properties.name_str_static() + "#" + str(i))
            clusterRecorder.setValue(str(cr_j), 'id', Properties.name_str_static())
            clusterRecorder.setValue(str(cr_j), 'start', start_time)
            clusterRecorder.setValue(str(cr_j), 'd_c', distance_c)
            clusterRecorder.setValue(str(cr_j), 'max_distance_c', max_distance_c)
            clusterRecorder.setValue(str(cr_j), 'dataset', dataset)
            clusterRecorder.setValue(str(cr_j), 'pile_size', len(pile_id))
            clusterRecorder.setValue(str(cr_j), 'H', e)
            clusterRecorder.setValue(str(cr_j), 'note', 'a new entropy drop was found')
            clusterRecorder.setValue(str(cr_j), 'end', Properties.name_str_HMS())
            save_show_cluster(index_id, data, distance_c, pile_id, dataset)
            start_time = Properties.name_str_HMS()
            clusterRecorder.save()
            # Identity comparison (`is`) on string literals is unreliable; use ==.
            if level == "DEBUG":
                pile_id = pile_function(pile_id, id_index, index_id, data, distance,
                                        distance_c - next_distance_c, next_distance_c, dataset, level="DEBUG")
                save_show_cluster(index_id, data, distance_c, pile_id, dataset, level="DEBUG")
        pre = e
        distance_c = distance_c + next_distance_c
        if e == 0:
            log.debug("e is 0.")
            break
        log.info(str(i) + " time, finished the next_distance_c about: " + str(next_distance_c) +
                 " distance_c:" + str(distance_c) + " next-learning_rate:" + str(learning_rate) +
                 " H:" + str(e))
    clusterRecorder.setValue(str(cr_i), 'end', Properties.name_str_HMS())
    clusterRecorder.save()
    log.debug(threshold)
    return threshold
    pool.join()
    log.debug(threshold)
    if not os.path.exists(Properties.getDefaultDataFold() + "/csv"):
        os.makedirs(Properties.getDefaultDataFold() + "/csv")
    threshold.to_csv(Properties.getDefaultDataFold() + "/csv/threshold.csv")


if __name__ == '__main__':
    get_threshold()
    from context.resource_manager import Properties
    from view import shape_view
    from view import plot_utils
    from cluster import density_cluster
    threshold = pandas.read_csv(Properties.getDefaultDataFold() + "/csv/threshold.csv")
    d_c = np.asarray(threshold['d_c'].values)
    log.debug(d_c)
    log.critical(type(d_c))
    plot_utils.plot_scatter_diagram(None, x=d_c, y=threshold['H'].values,
                                    x_label='delta', y_label='H',
                                    title='threshold scatter figure')
"""
delta_index = Series(id, index=id, dtype=np.float)
i = 0
order_id = Series(result[:, 0], index=id_index.values)