def get_poi(data): result = [] if len(data) > 500: compressed_data = Utility.data_preprocessing(data) num_limit = 51 dist_2_array = Utility.calculate_dist_2_array(compressed_data) dist_2_array_copy = [] for arr in dist_2_array: # 备份起来,最后的优化处理会用到 arr_list = [] for ele in arr: arr_list.append(ele) dist_2_array_copy.append(arr_list) partial_dist_list = Utility.calculate_partial_dist(dist_2_array, num_limit) Utility.normalize_dist_array(dist_2_array, partial_dist_list) optimal_sigma = 0.3 dfc.normalize_point_velocity(data) velocity_list = [point.velocity for point in compressed_data] potential_list = Utility.calculate_potential_value(dist_2_array, velocity_list, optimal_sigma, num_limit) temp_potential_list = [] for i in range(len(potential_list)): temp_potential_list.append(potential_list[i]) potential_threshold = dfc.calculate_potential_threshold(temp_potential_list) dfc.refresh_dist_array(compressed_data, dist_2_array) density_dist_list = dfc.calculate_density_distance(dist_2_array, potential_list) dfc.add_attributes(compressed_data, potential_list, density_dist_list) temp_distance = [] for i in range(len(density_dist_list)): temp_distance.append(density_dist_list[i]) dist_threshold = dfc.calculate_dist_threshold(temp_distance) centre_potential = [] centre_dist = [] centre_index_list = [] if potential_threshold > 0 and dist_threshold > 0: for i in range(len(density_dist_list)): if potential_list[i] > potential_threshold and density_dist_list[i] > dist_threshold: centre_potential.append(potential_list[i]) centre_dist.append(density_dist_list[i]) centre_index_list.append(i) else: print 'there are something wrong with the threshold' # stop_point_list = dfc.get_stop_position(compressed_data, centre_index_list) centre_index_merge = dfc.merge_stop_position(dist_2_array_copy, centre_index_list, potential_list) new_centre_index_list = dfc.refine_stop_position(dist_2_array_copy, centre_index_merge, compressed_data) new_stop_point_list = dfc.get_stop_position(compressed_data, 
new_centre_index_list) result = new_stop_point_list return result
def compare_efficiency(data_dir, txt_save_file, png_save_dir): if not os.path.exists(data_dir): os.makedirs(data_dir) if not os.path.exists(png_save_dir): os.makedirs(png_save_dir) f_w = open(txt_save_file, 'w') velocity_sigma = 0.5 num_limit = 71 optimal_sigma = 0.5 for parent, dirName, file_names in os.walk(data_dir): for file_name in file_names: print file_name + '\n' begin_time = time.time() #记录开始时间 content = file_name + ',' data = Utility.read_geolife_data_file(data_dir + '\\' + file_name) compressed_data = Utility.data_preprocessing(data) partial_dist_list = Utility.caculate_adjacent_dist_list( compressed_data, num_limit) Utility.normalize_adjacent_dist_list(partial_dist_list) stability_list = Utility.calculate_stability( compressed_data, num_limit) potential_list = Utility.calculate_density_value( partial_dist_list, stability_list, optimal_sigma, velocity_sigma) # 备份 temp_potential_list = [] for i in range(len(potential_list)): temp_potential_list.append(potential_list[i]) # 画出potential的统计图 采用备份数据,函数里面需要排序 potential_threshold = dfc.calculate_potential_threshold( temp_potential_list) density_dist_list = range(len(compressed_data)) dfc.add_attributes(compressed_data, potential_list, density_dist_list) # 完善了数据的potential和dist 属性 clusters_dic = dbscan_with_datafield( compressed_data, potential_threshold, num_limit) # 返回的clusters 是字典型的 clusters = [] for id, points in clusters_dic.iteritems(): if len(points) > 0: clusters.append(points) clusters.sort(key=lambda pp: pp[0].time) merged_clusters1 = Utility.merge_clusters(clusters) merged_clusters = Utility.clusters_refinement(merged_clusters1) # ***************保存图片***************** x = [point.lon for point in compressed_data] y = [point.lat for point in compressed_data] pl.plot(x, y, 'g') colors = ['or', 'ob', 'og', 'oy', 'ok', 'oc'] color_idx = 0 for points in merged_clusters: lon = [point.lon for point in points] lat = [point.lat for point in points] pl.plot(lon, lat, colors[color_idx % len(colors)]) color_idx += 1 
pl.title(file_name + '_' + str(len(merged_clusters)) + '_' + 'clusters') png_file = png_save_dir + '\\' + file_name + '_' + 'nap_' + \ str(num_limit) + 'distSigma_' + str(optimal_sigma) + '.png' pl.savefig(png_file) pl.close() end_time = time.time() time_gap = end_time - begin_time point_number = len(compressed_data) content += str(point_number) + ',' content += str(time_gap) + ',\n' f_w.write(content) f_w.close()
def parameter_choose(data_dir, dist_sigmas): f_w = open( u'D:\Python\PaperPy\data_filed_cluster\experience_result\\new experiment\\NewDBSCAN' u'\compare_experiment\\new_DBSCAN\parameter_compare\\parameter_compare.txt', 'w') velocity_sigma = 0.5 for parent, dirName, file_names in os.walk(data_dir): for file_name in file_names: stop_lists = [] #用来暂时存放结果的,根据nap来存放,一个nap值对应一条线 f_w.write(file_name + '\n') naps = [21, 51, 71, 101, 151] n = 0 for num_limit in naps: f_w.write('*' * 10 + '\n') stop_list = [] for optimal_sigma in dist_sigmas: f_w.write('nap= ' + str(num_limit) + ' dist_sigma=' + str(optimal_sigma) + '\n') print 'n' n += 1 data = Utility.read_geolife_data_file(data_dir + '\\' + file_name) compressed_data = Utility.data_preprocessing(data) partial_dist_list = Utility.caculate_adjacent_dist_list( compressed_data, num_limit) Utility.normalize_adjacent_dist_list(partial_dist_list) stability_list = Utility.calculate_stability( compressed_data, num_limit) potential_list = Utility.calculate_density_value( partial_dist_list, stability_list, optimal_sigma, velocity_sigma) # 备份 temp_potential_list = [] for i in range(len(potential_list)): temp_potential_list.append(potential_list[i]) # 画出potential的统计图 采用备份数据,函数里面需要排序 potential_threshold = dfc.calculate_potential_threshold( temp_potential_list) density_dist_list = range(len(compressed_data)) dfc.add_attributes( compressed_data, potential_list, density_dist_list) # 完善了数据的potential和dist 属性 clusters_dic = dbscan_with_datafield( compressed_data, potential_threshold, num_limit) # 返回的clusters 是字典型的 clusters = [] for id, points in clusters_dic.iteritems(): if len(points) > 0: clusters.append(points) clusters.sort(key=lambda pp: pp[0].time) merged_clusters1 = Utility.merge_clusters(clusters) # merged_clusters1 = clusters # 不采用合并 merged_clusters = Utility.clusters_refinement( merged_clusters1) count = len(merged_clusters) stop_list.append(count) f_w.write('count=' + str(count) + '\n') stop_lists.append(stop_list) for s_list in 
stop_lists: pl.plot(dist_sigmas, s_list) pl.xlabel('dist_sigma') pl.xlabel('the number of stops') pl.show() f_w.close()
def parameter_group_experiment(data_dir, txt_save_dir, png_save_dir):
    """Run the clustering for each (nap, dist_sigma) pair over all files.

    For every combination, writes a summary txt plus a per-cluster detail
    txt into txt_save_dir, and a cluster plot per trajectory into
    png_save_dir.
    """
    if not os.path.exists(txt_save_dir):
        os.makedirs(txt_save_dir)
    if not os.path.exists(png_save_dir):
        os.makedirs(png_save_dir)
    naps = [51]
    distance_sigmas = [0.3]
    for nap in naps:
        for dist_sigma in distance_sigmas:
            wf_name = txt_save_dir + '\\' + str(nap) + '_' + str(
                dist_sigma) + '.txt'
            wf_name2 = txt_save_dir + '\\' + str(nap) + '_' + str(
                dist_sigma) + '_cluster_info.txt'
            # Bug fix: with-statement so both result files are closed even
            # when an exception interrupts the experiment.
            with open(wf_name, 'w') as f_w, open(wf_name2, 'w') as f_w2:
                head_line = 'nap=' + str(nap) + '; distance_sigma=' + str(
                    dist_sigma) + '\n'
                f_w.write(head_line)
                for parent, dirName, file_names in os.walk(data_dir):
                    for file_name in file_names:
                        f_name = data_dir + '\\' + file_name
                        f_info = 'file_name: ' + file_name + '\n'
                        f_w.write(f_info)
                        # data = Utility.read_geolife_data_file(f_name)
                        data = Utility.read_own_data_file(f_name)
                        compressed_data = Utility.data_preprocessing(data)
                        partial_dist_list = Utility.caculate_adjacent_dist_list(
                            compressed_data, nap)
                        Utility.normalize_adjacent_dist_list(partial_dist_list)
                        Utility.caculate_velocity(compressed_data)
                        velocity_sigma = 0.5
                        # (removed a dead, commented-out velocity-smoothing
                        # experiment block here; see dbscan_process for the
                        # live version of that code)
                        stability_list = Utility.calculate_stability(
                            compressed_data, nap)
                        potential_list = Utility.calculate_density_value(
                            partial_dist_list, stability_list, dist_sigma, velocity_sigma)
                        # Copy first: calculate_potential_threshold sorts its argument.
                        temp_potential_list = list(potential_list)
                        potential_threshold = dfc.calculate_potential_threshold(
                            temp_potential_list)
                        # Placeholder distances; only the potential matters here.
                        density_dist_list = range(len(compressed_data))
                        dfc.add_attributes(
                            compressed_data, potential_list, density_dist_list)
                        clusters_dic = dbscan_with_datafield(
                            compressed_data, potential_threshold, nap)
                        # Keep only non-empty clusters, ordered by start time.
                        clusters = [points for cid, points in clusters_dic.iteritems()
                                    if len(points) > 0]
                        clusters.sort(key=lambda pp: pp[0].time)
                        merged_clusters1 = Utility.merge_clusters(clusters)
                        merged_clusters = Utility.clusters_refinement(
                            merged_clusters1)
                        # ----- save the figure -----
                        x = [point.lon for point in compressed_data]
                        y = [point.lat for point in compressed_data]
                        pl.plot(x, y, 'g')
                        colors = ['or', 'ob', 'og', 'oy', 'ok', 'oc']
                        color_idx = 0
                        for points in merged_clusters:
                            lon = [point.lon for point in points]
                            lat = [point.lat for point in points]
                            pl.plot(lon, lat, colors[color_idx % len(colors)])
                            color_idx += 1
                        pl.title(file_name + '_' + str(len(merged_clusters)) +
                                 '_' + 'clusters')
                        png_file = png_save_dir + '\\' + file_name + '_' + 'nap_' + \
                            str(nap) + 'distSigma_' + str(dist_sigma) + '.png'
                        pl.savefig(png_file)
                        pl.close()
                        # ----- write cluster summaries -----
                        cluster_info = 'cluster count= ' + str(
                            len(merged_clusters)) + '\n'
                        f_w.write(cluster_info)
                        f_w2.write(str(len(merged_clusters)) + '\n')
                        for cluster in merged_clusters:
                            f_w2.write(str(len(cluster)) + '\n')
                            cluster.sort(key=lambda pp: pp.time)
                            length = len(cluster)
                            # Summary file gets the first/last point of each
                            # cluster; the detail file gets every point.
                            first_line = str(cluster[0].lon) + ',' + str(
                                cluster[0].lat) + ',' + cluster[0].time_str
                            end_line = str(cluster[length - 1].lon) + ',' + \
                                str(cluster[length - 1].lat) + \
                                ',' + cluster[length - 1].time_str + '\n'
                            f_w.write(first_line)
                            f_w.write(end_line)
                            for point in cluster:
                                p_content = str(point.lon) + ',' + str(
                                    point.lat) + ',\n'
                                f_w2.write(p_content)
def dbscan_process(data, velocity_sigma, nap, sigma1):
    """Cluster a raw trajectory with density-field DBSCAN.

    data -- raw trajectory points; velocity_sigma / sigma1 -- kernel widths
    used by the density computation; nap -- neighbourhood size (number of
    adjacent points). Returns the non-empty clusters, sorted by the
    timestamp of each cluster's first point.
    """
    num_limit = nap
    optimal_sigma = sigma1
    # (removed the no-op 'velocity_sigma = velocity_sigma' self-assignment
    # and the unused lon/lat plotting lists from the original)
    compressed_data = Utility.data_preprocessing(data)
    # Computing the adjacent distances also fills in the point velocities.
    partial_dist_list = Utility.caculate_adjacent_dist_list(
        compressed_data, num_limit)
    Utility.normalize_adjacent_dist_list(partial_dist_list)
    Utility.caculate_velocity(compressed_data)
    # ---- velocity smoothing; NOTE(review): velocity_list is currently
    # unused because the density below is computed from stability_list.
    # Kept for the commented-out velocity-based variant. ----
    original_velocity_list = [point.velocity for point in compressed_data]
    kernel = Utility.generate_gaus_kernel(4)
    smoothed_velocity_list = Utility.calculate_conv(original_velocity_list, kernel)
    velocity_list = dfc.normalize_velocity(smoothed_velocity_list)
    stability_list = Utility.calculate_stability(compressed_data, num_limit)
    # potential_list = Utility.calculate_density_value(partial_dist_list, velocity_list, optimal_sigma, velocity_sigma)
    potential_list = Utility.calculate_density_value(
        partial_dist_list, stability_list, optimal_sigma, velocity_sigma)
    # Copy first: calculate_potential_threshold sorts its argument.
    temp_potential_list = list(potential_list)
    potential_threshold = dfc.calculate_potential_threshold(
        temp_potential_list)
    # Placeholder distances; only the potential matters for DBSCAN here.
    density_dist_list = range(len(compressed_data))
    dfc.add_attributes(compressed_data, potential_list, density_dist_list)
    clusters_dic = dbscan_with_datafield(compressed_data, potential_threshold,
                                         num_limit)
    # Keep only non-empty clusters, ordered by their first timestamp.
    clusters = [points for cid, points in clusters_dic.iteritems()
                if len(points) > 0]
    clusters.sort(key=lambda pp: pp[0].time)
    return clusters
def main(file_name):
    """Full stop-point pipeline with interactive plots and console output.

    Reads one trajectory file, computes density/distance features, picks
    cluster centres by thresholding both, then merges them into stop
    positions, showing diagnostic plots along the way.

    NOTE(review): a second def main() later in this file shadows this one,
    so this version is unreachable through the name 'main'.
    """
    start_time = time.time()
    show_track.show_track(file_name)
    # data = Utility.read_data_file(file_name)  # switch the reader when the data format changes
    # data = Utility.read_geolife_data_file(file_name)
    data = Utility.read_own_data_file(file_name)
    compressed_data = Utility.data_preprocessing(data)
    # Plot the preprocessed trajectory.
    pl.figure(figsize=Utility.figure_size)
    x = [point.lon for point in compressed_data]
    y = [point.lat for point in compressed_data]
    l1 = pl.plot(x, y, 'og')
    pl.setp(l1, markersize=3)
    pl.xlabel("longitude")
    pl.ylabel("latitude")
    pl.show()
    # Compute the distance matrix; this also yields the point velocities.
    num_limit = 101
    dist_2_array = Utility.calculate_dist_2_array(compressed_data)
    dist_2_array_copy = []
    for arr in dist_2_array:  # back up; the final refinement steps need the raw matrix
        arr_list = []
        for ele in arr:
            arr_list.append(ele)
        dist_2_array_copy.append(arr_list)
    # Normalize the distance matrix.
    partial_dist_list = Utility.calculate_partial_dist(dist_2_array, num_limit)
    Utility.normalize_dist_array(dist_2_array, partial_dist_list)
    optimal_sigma = 0.3
    velocity_sigma = 0.5
    print '*' * 20 + 'the optimal sigma is '
    print optimal_sigma
    dfc.normalize_point_velocity(compressed_data)  # velocity normalization (also smooths)
    velocity_list = [point.velocity for point in compressed_data]
    # potential_list = calculate_optimal_potential(dist_2_array, velocity_list, num_limit)  # plots the sigma figure
    potential_list = Utility.calculate_potential_value(dist_2_array,
                                                       velocity_list,
                                                       optimal_sigma,
                                                       velocity_sigma,
                                                       num_limit)
    # Back up the potentials (the threshold function sorts its argument).
    temp_potential_list = []
    for i in range(len(potential_list)):
        temp_potential_list.append(potential_list[i])
    # Normalization (disabled).
    # max_potential = max(potential_list)
    # for i in range(len(potential_list)):
    #     potential_list[i] /= max_potential
    # Plot the unnormalized density sequence (unsorted).
    xx = [float(i) / len(compressed_data) for i in range(len(compressed_data))]
    pl.figure(figsize=Utility.figure_size)
    pl.plot(xx, potential_list, linewidth=Utility.line_width)
    pl.xlabel('x')
    pl.ylabel('density')
    pl.title('density sequence before smooth')
    pl.show()
    # Potential histogram/threshold — uses the backup, since it is sorted inside.
    potential_threshold = dfc.calculate_potential_threshold(
        temp_potential_list)
    # Refresh the distance matrix to include time distance (disabled).
    # dfc.refresh_dist_array(compressed_data, dist_2_array)
    density_dist_list = dfc.calculate_density_distance(
        dist_2_array, potential_list)
    # Plot the distance curve.
    xx = range(len(compressed_data))
    pl.plot(xx, density_dist_list)
    pl.title('distance sequence')
    pl.show()
    dfc.add_attributes(compressed_data, potential_list,
                       density_dist_list)  # fills in each point's potential and dist attributes
    temp_distance = []
    for i in range(len(density_dist_list)):
        temp_distance.append(density_dist_list[i])
    dist_threshold = dfc.calculate_dist_threshold(temp_distance)
    print 'potential threshold'
    print potential_threshold
    print 'distance threshold'
    print dist_threshold
    centre_potential = []
    centre_dist = []
    centre_index_list = []  # holds the cluster-centre indices directly
    if potential_threshold > 0 and dist_threshold > 0:
        # Centres: high density AND far from any denser point.
        for i in range(len(density_dist_list)):
            if potential_list[i] > potential_threshold and density_dist_list[
                    i] > dist_threshold:
                centre_potential.append(potential_list[i])
                centre_dist.append(density_dist_list[i])
                centre_index_list.append(i)
        # pl.plot(centre_potential, centre_dist, 'or', label='centre_point')
        print 'centre potential'
        print centre_potential
        print 'centre_dist'
        print centre_dist
    else:
        print 'there are something wrong with the threshold'
    # Decision graph: potential vs distance, centres highlighted.
    pl.plot(potential_list, density_dist_list, 'ob')
    pl.plot(centre_potential, centre_dist, 'oy')
    pl.xlabel('density')
    pl.ylabel('distance')
    pl.show()
    # result_index_list = dfc.result_improvement(compressed_data, num_limit, centre_index_list)
    # dfc.result_show(data, compressed_data, result_index_list)
    stop_point_list = dfc.get_stop_position(compressed_data, centre_index_list)
    dfc.result_show(data, stop_point_list)
    for point in stop_point_list:
        print str(point.lon) + ', ' + str(point.lat) + ', ' + \
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(point.time / float(1000)))
    print '*' * 20 + 'after merge....'
    centre_index_merge = dfc.merge_stop_position(dist_2_array_copy,
                                                 centre_index_list,
                                                 potential_list)
    # new_centre_index_list = dfc.refine_stop_position(dist_2_array_copy, centre_index_merge, compressed_data)
    new_stop_point_list = dfc.get_stop_position(compressed_data,
                                                centre_index_merge)
    # Decision graph after refinement (disabled).
    # new_centre_potential = []
    # new_centre_dist = []
    # for index in new_centre_index_list:
    #     new_centre_potential.append(potential_list[index])
    #     new_centre_dist.append(density_dist_list[index])
    # plt.figure(figsize=Utility.figure_size)
    # pl.plot(potential_list, density_dist_list, 'ob')
    # pl.plot(new_centre_potential, new_centre_dist, 'oy')
    # pl.xlabel('density')
    # pl.ylabel('distance')
    # pl.show()
    end_time = time.time()
    print u'程序总共运行时间:%f秒' % (end_time - start_time)
    dfc.result_show(data, new_stop_point_list)
    for point in new_stop_point_list:
        print str(point.lon) + ', ' + str(point.lat) + ', ' + \
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(point.time / float(1000)))
def main(file_name):
    """Alternative pipeline: threshold points by density only, then split
    the selected points into clusters at spatial/temporal gaps and plot.

    NOTE(review): this redefines main() above — only this version is
    reachable through the name 'main'; consider renaming one of them.
    """
    start_time = time.time()
    show_track.show_track(file_name)
    data = Utility.read_data_file(file_name)  # switch the reader when the data format changes
    # data = Utility.read_geolife_data_file(file_name)
    compressed_data = Utility.data_preprocessing(data)
    num_limit = 51
    dist_2_array = Utility.calculate_dist_2_array(compressed_data)
    # (removed the unused dist_2_array backup, max_partial_dist and
    # cluster_lon/cluster_lat locals — none were read afterwards)
    partial_dist_list = Utility.calculate_partial_dist(dist_2_array, num_limit)
    Utility.normalize_dist_array(dist_2_array, partial_dist_list)
    optimal_sigma = 0.3
    velocity_sigma = 0.5
    dfc.normalize_point_velocity(compressed_data)  # velocity normalization
    velocity_list = [point.velocity for point in compressed_data]
    potential_list = Utility.calculate_potential_value(dist_2_array,
                                                       velocity_list,
                                                       optimal_sigma,
                                                       velocity_sigma,
                                                       num_limit)
    # Copy first: calculate_potential_threshold sorts its argument.
    temp_potential_list = list(potential_list)
    potential_threshold = dfc.calculate_potential_threshold(
        temp_potential_list)
    # Refresh the distance matrix to include time distance, then compute
    # each point's distance feature.
    dfc.refresh_dist_array(compressed_data, dist_2_array)
    density_dist_list = dfc.calculate_density_distance(
        dist_2_array, potential_list)
    dfc.add_attributes(compressed_data, potential_list,
                       density_dist_list)  # fills in potential and dist attributes
    temp_distance = list(density_dist_list)
    dist_threshold = dfc.calculate_dist_threshold(temp_distance)
    # ---- direct clustering (2016.10.27) ----
    cluster_points = dfc.get_cluster_points(compressed_data,
                                            potential_threshold)
    # Split the dense points into clusters: start a new cluster whenever two
    # consecutive points are more than 800 m apart or more than 30 minutes
    # apart (times are in milliseconds).
    cluster_list = []
    cluster = []
    for i in range(len(cluster_points)):
        cluster.append(cluster_points[i])
        if i < len(cluster_points) - 1:
            distance = Utility.distance_calculate(cluster_points[i],
                                                  cluster_points[i + 1])
            time_interval = cluster_points[i + 1].time - cluster_points[i].time
            if distance > 800 or time_interval > 30 * 60 * 1000:
                # Bug fix: close the current cluster at the gap so that point
                # i+1 starts the next cluster. The old code appended the
                # boundary point to the previous cluster, dropped the last
                # point entirely, and contained a leftover debug print.
                cluster_list.append(cluster)
                cluster = []
    if cluster:
        cluster_list.append(cluster)
    # Plot all clusters, cycling through the colors.
    pl.figure(figsize=Utility.figure_size)
    pl.xlabel("longitude")
    pl.ylabel("latitude")
    colors = ['or', 'ok', 'ob']
    for i in range(len(cluster_list)):
        lon = [p.lon for p in cluster_list[i]]
        lat = [p.lat for p in cluster_list[i]]
        pl.plot(lon, lat, colors[i % len(colors)])
    pl.show()