def ent_dc_step_by_step(id_index, index_id, data, threshold, distance, distance_c, dataset='none'):
    """
    Sweep the cut-off distance upwards and record the pile entropy of every step.

    Starting at ``distance_c``, the cut-off grows by a fixed step until it
    exceeds ``max_distance(distance, distance_c)`` or the entropy reaches 0.
    For each cut-off the piles are rebuilt with ``pile_function``; the sizes of
    the non-outlier piles plus one combined outlier bucket are divided by the
    number of points and passed to ``_calc_ent``. One row
    ``[entropy, distance_c, non-outlier pile count]`` is appended to
    ``threshold`` per step. Whenever the entropy is lower than in the previous
    step, the step is written to the ``ClusterRecorder`` and
    ``save_show_cluster`` is called.

    NOTE(review): another definition with this name is visible later in the
    source; if both live in one module the later one shadows this one --
    confirm which is intended.

    :param id_index: forwarded to ``pile_function``
    :param index_id: ``index_id.shape[0]`` is used as the number of points;
        also forwarded to ``pile_function`` and ``save_show_cluster``
    :param data: forwarded to ``pile_function`` and ``save_show_cluster``
    :param threshold: accumulator that receives one row per step via ``add_row``
    :param distance: distances; must support ``copy()``, boolean-mask
        assignment and ``min(axis=0)`` (presumably a 2-D numpy array -- confirm)
    :param distance_c: initial cut-off distance
    :param dataset: dataset name forwarded to ``ClusterRecorder``,
        ``pile_function`` and ``save_show_cluster``
    :return: ``threshold`` including the rows added during the sweep
    """
    # TODO modify the clustering without outlier
    level = "INFO"
    N = int(index_id.shape[0])
    max_distance_c = max_distance(distance, distance_c)
    learning_rate = 0
    # Entropy of the previous step; the large sentinel lets the first step
    # normally count as a decrease.
    pre = 65535

    # Step size: standard deviation of the per-column minimum distances, where
    # NaN entries are replaced by the standard deviation of the whole matrix
    # (itself computed with NaN treated as 0).
    temp = distance.copy()
    temp[np.isnan(temp)] = 0
    stand = np.std(temp)
    temp = distance.copy()
    temp[np.isnan(temp)] = stand
    next_distance_c = np.std(temp.min(axis=0))

    clusterRecorder = ClusterRecorder(dataset)

    def step_key(step_no):
        # Recorder row key: "<static name>#<step number>".
        return str(Properties.name_str_static() + "#" + str(step_no))

    def record_step(key, started, cutoff, pile_count, entropy):
        # Write one complete per-step row; 'end' is stamped at call time.
        clusterRecorder.setValue(key, 'id', Properties.name_str_static())
        clusterRecorder.setValue(key, 'start', started)
        clusterRecorder.setValue(key, 'd_c', cutoff)
        clusterRecorder.setValue(key, 'max_distance_c', max_distance_c)
        clusterRecorder.setValue(key, 'dataset', dataset)
        clusterRecorder.setValue(key, 'pile_size', pile_count)
        clusterRecorder.setValue(key, 'H', entropy)
        clusterRecorder.setValue(key, 'note', '发现新下降时间')
        clusterRecorder.setValue(key, 'end', Properties.name_str_HMS())

    # Row "#0" describes the whole run; its 'end' is filled in after the sweep.
    cr_i = step_key(0)
    clusterRecorder.setValue(cr_i, 'id', Properties.name_str_static())
    clusterRecorder.setValue(cr_i, 'start', Properties.name_str_HMS())
    clusterRecorder.setValue(cr_i, 'd_c', distance_c)
    clusterRecorder.setValue(cr_i, 'max_distance_c', max_distance_c)
    clusterRecorder.setValue(cr_i, 'dataset', dataset)
    clusterRecorder.setValue(cr_i, 'pile_size', N)
    clusterRecorder.setValue(cr_i, 'H', 65535)
    clusterRecorder.setValue(cr_i, 'note', '整个算法运行时间')

    if learning_rate != 0:
        distance_c = distance_c + learning_rate
    start_time = Properties.name_str_HMS()

    # Dedicated step counter. The per-pile loop below uses its own variables so
    # the counter is no longer overwritten (previously recorder keys and the
    # progress log carried the last pile index instead of the step number).
    step = 0
    while max_distance_c >= distance_c:
        step += 1
        last_time = Properties.name_str_HMS()

        # Rebuild the piles (members and sizes) for the current cut-off.
        pile_id = DataFrame([], columns=['p_id', 'pile', 'size', 'outlier'])
        pile_id = pile_function(pile_id, id_index, index_id, data, distance, distance_c,
                                next_distance_c, dataset)
        pile = pile_id.shape[0] - np.sum(pile_id['outlier'])

        pile_id = pile_id.sort_values('size', ascending=False)
        pile_id = pile_id.reset_index(drop=True)

        # Non-outlier piles contribute their own size; all outlier piles are
        # merged into a single bucket appended last.
        sizes = []
        outlier_total = 0
        for size, is_outlier in zip(pile_id['size'], pile_id['outlier']):
            if is_outlier:
                outlier_total += size
            else:
                sizes.append(size)
        if outlier_total > 0:
            sizes.append(outlier_total)
        e = _calc_ent(np.array(sizes) / N)

        threshold = add_row(threshold, [e, distance_c, pile])

        if pre - e > 0:
            # Entropy dropped relative to the previous step: record it, first
            # timed from the start of this iteration ...
            cr_j = step_key(step)
            record_step(cr_j, last_time, distance_c, len(pile_id), e)
            save_show_cluster(index_id, data, distance_c, pile_id, dataset)
            # ... then the same key is rewritten, timed from the previous
            # recorded decrease. NOTE(review): both writes and both
            # save_show_cluster calls are kept as in the original; confirm
            # whether the first pair is still needed.
            cr_j = step_key(step)
            record_step(cr_j, start_time, distance_c, len(pile_id), e)
            save_show_cluster(index_id, data, distance_c, pile_id, dataset)
            start_time = Properties.name_str_HMS()
            clusterRecorder.save()
            # Compare by value, not identity; unreachable while level is "INFO".
            if level == "DEBUG":
                pile_id = pile_function(pile_id, id_index, index_id, data, distance,
                                        distance_c - next_distance_c, next_distance_c, dataset,
                                        level="DEBUG")
                save_show_cluster(index_id, data, distance_c, pile_id, dataset, level="DEBUG")

        pre = e
        distance_c = distance_c + next_distance_c
        if e == 0:
            log.debug("e is 0.")
            break
        log.info("%s time, finished the next_distance_c about: %s distance_c:%s"
                 " next-learning_rate:%s H:%s"
                 % (step, next_distance_c, distance_c, learning_rate, e))

    clusterRecorder.setValue(cr_i, 'end', Properties.name_str_HMS())
    clusterRecorder.save()
    log.debug(threshold)
    return threshold
def ent_dc_step_by_step(id_index, index_id, data, threshold, distance, distance_c, dataset='none'):
    """
    Increase the cut-off distance step by step and log the pile entropy each time.

    The cut-off starts at ``distance_c`` and is raised by a fixed step while it
    does not exceed ``max_distance(distance, distance_c)``; the sweep also
    stops as soon as the entropy equals 0. At every cut-off the piles are
    rebuilt by ``pile_function``, the pile sizes divided by the number of
    points are passed to ``_calc_ent``, and the row
    ``[entropy, distance_c, non-outlier pile count]`` is appended to
    ``threshold`` through ``add_row``. A step whose entropy is lower than the
    previous step's is written to the ``ClusterRecorder`` and handed to
    ``save_show_cluster``.

    :param id_index: ``id_index.values`` indexes the per-step scratch frame;
        also forwarded to ``pile_function``
    :param index_id: ``index_id.shape[0]`` is the number of points; also
        forwarded to ``pile_function`` and ``save_show_cluster``
    :param data: forwarded to ``save_show_cluster``
    :param threshold: accumulator extended by one row per step
    :param distance: distances; must support ``copy()``, boolean-mask
        assignment, ``min(axis=0)`` and ``shape``
    :param distance_c: initial cut-off distance
    :param dataset: dataset name given to ``ClusterRecorder`` and stored in
        its rows
    :return: ``threshold`` including the rows added during the sweep
    """
    # TODO modify the clustering without outlier
    n_points = int(index_id.shape[0])
    cutoff_limit = max_distance(distance, distance_c)
    learning_rate = 0
    previous_entropy = 65535

    # Step size: std of the column-wise minima, with NaN replaced by the std
    # of the NaN-zeroed matrix.
    zero_filled = distance.copy()
    zero_filled[np.isnan(zero_filled)] = 0
    overall_std = np.std(zero_filled)
    std_filled = distance.copy()
    std_filled[np.isnan(std_filled)] = overall_std
    column_minima = std_filled.min(axis=0)
    step_size = np.std(column_minima)

    recorder = ClusterRecorder(dataset)

    def write_descent(step_no, started, cutoff, piles, entropy):
        # One full recorder row for a step whose entropy decreased.
        row = str(Properties.name_str_static() + "#" + str(step_no))
        recorder.setValue(row, 'id', Properties.name_str_static())
        recorder.setValue(row, 'start', started)
        recorder.setValue(row, 'd_c', cutoff)
        recorder.setValue(row, 'max_distance_c', cutoff_limit)
        recorder.setValue(row, 'dataset', dataset)
        recorder.setValue(row, 'pile_size', len(piles))
        recorder.setValue(row, 'H', entropy)
        recorder.setValue(row, 'note', '发现新下降时间')
        recorder.setValue(row, 'end', Properties.name_str_HMS())

    # Row "#0" covers the whole run; its 'end' is written after the loop.
    run_row = str(Properties.name_str_static() + "#" + str(0))
    recorder.setValue(run_row, 'id', Properties.name_str_static())
    recorder.setValue(run_row, 'start', Properties.name_str_HMS())
    recorder.setValue(run_row, 'd_c', distance_c)
    recorder.setValue(run_row, 'max_distance_c', cutoff_limit)
    recorder.setValue(run_row, 'dataset', dataset)
    recorder.setValue(run_row, 'pile_size', n_points)
    recorder.setValue(run_row, 'H', 65535)
    recorder.setValue(run_row, 'note', '整个算法运行时间')

    if learning_rate != 0:
        distance_c = distance_c + learning_rate
    log.debug("init the first max distance_c:%s distance shape:%s"
              % (cutoff_limit, distance.shape))

    since_last_descent = Properties.name_str_HMS()
    step = 0
    while cutoff_limit >= distance_c:
        step += 1
        iteration_started = Properties.name_str_HMS()

        # Fresh containers for the piles (members and sizes) of this cut-off.
        piles = DataFrame([], columns=['p_id', 'pile', 'size', 'outlier'])
        scratch = DataFrame(
            [],
            columns=['i_id', 'j_id', 'rho', 'delta', 'gamma', 'i', 'j', 'pile'],
            index=id_index.values,
        )
        piles = pile_function(piles, id_index, index_id, scratch, distance, distance_c, step_size)

        sizes = piles['size']
        regular_piles = piles.shape[0] - np.sum(piles['outlier'])
        entropy = _calc_ent(sizes.values / n_points)
        threshold = add_row(threshold, [entropy, distance_c, regular_piles])

        if previous_entropy - entropy > 0:
            # Recorded twice under the same key, as in the original: first
            # timed from this iteration's start, then from the last descent.
            write_descent(step, iteration_started, distance_c, piles, entropy)
            save_show_cluster(index_id, data, distance_c, piles)
            write_descent(step, since_last_descent, distance_c, piles, entropy)
            save_show_cluster(index_id, data, distance_c, piles)
            since_last_descent = Properties.name_str_HMS()

        previous_entropy = entropy
        distance_c = distance_c + step_size
        if entropy == 0:
            log.debug("e is 0.")
            break
        log.info("%s time, finished the next_distance_c about: %s distance_c:%s"
                 " next-learning_rate:%s H:%s"
                 % (step, step_size, distance_c, learning_rate, entropy))

    recorder.setValue(run_row, 'end', Properties.name_str_HMS())
    recorder.save()
    log.debug(threshold)
    return threshold