# Imports required by the functions below. Helpers such as
# clustering_method_call, filter_sim_map, HAC_average, dict_to_mat,
# sim_dist_convert, average_dist_SI and compute_new_dist_SI are assumed to be
# defined elsewhere in this module/package.
import datetime
from copy import deepcopy
from itertools import cycle, islice

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import (adjusted_rand_score, completeness_score,
                             homogeneity_score, v_measure_score)


def compare_multi(all_methods, all_datasets, output_name,
                  measure=adjusted_rand_score):
    """Shows the results of multiple methods on multiple datasets in a grid.

    This is an adaptation of a snippet from the scikit-learn library:
    https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html

    Args:
        all_methods: A list of tuples, each of which corresponds to a
            clustering method. Each tuple contains 3 elements: a name that
            will be shown in the plots for the method, the function object
            for the clustering method, and a dictionary that will be passed
            to the clustering method as the hyper-parameters.
        all_datasets: A list of tuples, each of which corresponds to a
            dataset function. Each tuple contains 3 elements: the function
            object for the dataset, a dictionary which will be passed as the
            hyper-parameters for the dataset function, and a dictionary that
            contains the hyper-parameters for the clustering function (this
            will override the pre-defined values). The data generated by the
            dataset functions should be 2D; higher-dimensional data are
            projected to 2D with PCA for plotting.
        output_name: The name of the file (containing the plot) that will be
            saved. The suffix of the image file has to be included
            (e.g. '.png').

    Example:
        all_methods = [('K_means', K_means,
                        {'required_format': 'data_vectors'})]
        all_datasets = [(noisy_circle, {}, {'input_format': 'data_vectors'})]
    """
    plt.figure(figsize=(len(all_methods) * 2, len(all_datasets) * 2))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)
    plot_num = 1
    for i_dataset, dataset_tuple in enumerate(all_datasets):
        dataset_method = dataset_tuple[0]
        dataset_params = dataset_tuple[1]
        dataset_clustering_method_overriding_params = dataset_tuple[2]
        X, y = dataset_method(dataset_params)
        prev_coords = None
        for name, algorithm, hyper_params in all_methods:
            # Work on a copy so per-dataset overrides do not leak into the
            # shared hyper-parameter dictionaries of later datasets.
            hyper_params = dict(hyper_params)
            hyper_params.update(dataset_clustering_method_overriding_params)
            hyper_params.update({'y': y, 'coords_given': prev_coords})
            output_dict = clustering_method_call(X, algorithm,
                                                 hyper_params=hyper_params)
            coords = output_dict['coords']
            if coords is not None:
                prev_coords = coords
            else:
                coords = prev_coords
            y_pred = output_dict['y_pred']
            plt.subplot(len(all_datasets), len(all_methods), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)
            colors = np.array(list(islice(cycle(
                ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
                 '#984ea3', '#999999', '#e41a1c', '#dede00', '#BB8FCE',
                 '#F7DC6F', '#117A65', '#CA6F1E', '#979A9A', '#34495E',
                 '#9A7D0A', '#0B5345', '#641E16']),
                int(max(y_pred) + 1))))
            # Add black for outliers (if any); label -1 indexes the last entry.
            colors = np.append(colors, ["#000000"])
            # If the points reside in a space with more than 2 dimensions,
            # PCA is used to project them down for plotting.
            if coords.shape[1] > 2:
                pca = PCA(n_components=2)
                coords_2d = pca.fit_transform(coords)
            else:
                coords_2d = coords
            plt.scatter(coords_2d[:, 0], coords_2d[:, 1], s=10,
                        color=colors[y_pred])
            plt.xticks(())
            plt.yticks(())
            if len(np.unique(y)) > 1:
                ari_score = measure(y, y_pred)
                plt.text(.99, .01, ('%.2f' % ari_score).lstrip('0'),
                         transform=plt.gca().transAxes, size=15,
                         horizontalalignment='right')
            plot_num += 1
    plt.savefig(output_name, dpi=400)
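
# A minimal usage sketch (not part of the original module): `blobs` is a
# hypothetical dataset function written here for illustration; `K_means` is
# the method wrapper already referenced in the docstring example above, and
# the tuple layouts follow the Args documentation.
def _demo_compare_multi():
    from sklearn.datasets import make_blobs

    def blobs(params):
        # Dataset functions receive one params dict and return (X, y).
        return make_blobs(n_samples=params.get('n_samples', 300),
                          centers=params.get('centers', 3),
                          random_state=0)

    all_methods = [('K_means', K_means, {'required_format': 'data_vectors'})]
    all_datasets = [(blobs, {'centers': 3}, {'input_format': 'data_vectors'})]
    compare_multi(all_methods, all_datasets, 'compare_multi_demo.png')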
def compare_timestamps(entitys, output_name, fixed_window_size=None,
                       update_clusters=False, batch_portion=0.5,
                       incremental_dist_threshold=0.5,
                       batch_dist_threshold=0.9):
    """Compares incremental clustering against batch HAC over sliding start
    dates and plots the V-measure, averaged over entities, per start-day
    offset."""
    all_v_result = []
    sessions_count = []
    print(entitys.keys())
    for p_id in entitys:
        v_result = []
        x = []
        single_data = entitys[p_id]
        sim_map = single_data['similarity_graph']
        labels = single_data['id_to_cluster_map']
        print('number of sessions: ', len(labels))
        sessions_count.append(len(labels))
        time_stamps_orig = single_data['time_stamps']
        # Timestamps may be POSIX seconds or anything np.datetime64 accepts
        # directly (e.g. ISO-8601 strings).
        try:
            time_stamps = [np.datetime64(
                datetime.datetime.utcfromtimestamp(elem))
                for elem in time_stamps_orig]
        except (TypeError, ValueError, OSError):
            time_stamps = [np.datetime64(elem) for elem in time_stamps_orig]
        one_day = np.timedelta64(86400000000, 'us')  # one day in microseconds
        period = 1 + int((time_stamps[-1] - time_stamps[0]) / one_day)
        batch_period = np.timedelta64(
            int(np.ceil(period * batch_portion)) * 86400000000, 'us')
        for start_date in range(int(np.ceil(period * batch_portion))):
            first_day = time_stamps[0] + start_date * one_day
            first_idx = 0
            last_idx = len(time_stamps)
            for i in range(len(time_stamps)):
                if time_stamps[i] < first_day:
                    first_idx += 1
                if time_stamps[i] > first_day + batch_period:
                    # First index outside the window (was `last_idx += i`,
                    # which over-counted past the end of the data).
                    last_idx = i
                    break
            sim_map_tmp = filter_sim_map(sim_map, last_idx - 1)
            sim_map_new = filter_sim_map(sim_map_tmp, first_idx,
                                         min_case=True)
            inc_params = {'dist_threshold': incremental_dist_threshold,
                          'data_size': last_idx - first_idx,
                          'update_clusters': update_clusters,
                          'window_size': None}
            y_pred_inc = incremental_average_SI(sim_map_new,
                                                hyper_params=inc_params)
            batch_params = {'input_format': 'similarity_dict',
                            'required_format': 'dist_mat',
                            # Use the parameter instead of a hard-coded 0.9.
                            'distance_threshold': batch_dist_threshold,
                            'n_clusters': None,
                            'data_size': last_idx - first_idx}
            output_dict = clustering_method_call(sim_map_new, HAC_average,
                                                 batch_params)
            y_pred_batch = output_dict['y_pred']
            v_result.append(v_measure_score(y_pred_batch, y_pred_inc))
            x.append(start_date)
        all_v_result.append(v_result)
    # Assumes all entities span a comparable number of start days, so the
    # per-entity score lists have equal length.
    inc_result = np.average(all_v_result, axis=0)
    plt.plot(x, inc_result)
    plt.savefig(output_name)
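
# Shape sketch for `entitys` (inferred from the accesses above; the field
# names are exactly the ones this function reads, while the keys and toy
# values are hypothetical). The similarity graph is assumed to be keyed by
# (i, j) session-index pairs, which is what `filter_sim_map` and
# `dict_to_mat` consume elsewhere in this module.
def _demo_compare_timestamps():
    entitys = {
        'entity_0': {
            'similarity_graph': {(0, 1): 0.9, (0, 2): 0.2, (1, 2): 0.3},
            'id_to_cluster_map': {0: 0, 1: 0, 2: 1},
            'time_stamps': ['2020-01-01', '2020-01-05', '2020-01-20'],
        },
    }
    compare_timestamps(entitys, 'compare_timestamps_demo.png')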
def warm_start_inc_avg_SI(sim_map, hyper_params={}):
    """Runs batch HAC on the first portion of the stream, then continues
    incrementally from the resulting clusters."""
    params = {'batch_portion': 0.5,
              'data_size': None,
              'dist_threshold': 0.1,
              'batch_threshold': 0.9,
              'time_stamps': None,
              'window_size': 15,
              'batch_params': {},
              'inc_params': {}}
    params.update(hyper_params)
    dist_threshold = params['dist_threshold']
    batch_threshold = params['batch_threshold']
    all_data_size = params['data_size']
    window_size = params['window_size']
    time_stamps_orig = params['time_stamps']
    batch_portion = params['batch_portion']
    if time_stamps_orig is not None:
        # Timestamps may be POSIX seconds or np.datetime64-compatible values.
        try:
            time_stamps = [np.datetime64(
                datetime.datetime.utcfromtimestamp(elem))
                for elem in time_stamps_orig]
        except (TypeError, ValueError, OSError):
            time_stamps = [np.datetime64(elem) for elem in time_stamps_orig]
        one_day = np.timedelta64(86400000000, 'us')  # one day in microseconds
        period = 1 + int((time_stamps[-1] - time_stamps[0]) / one_day)
        batch_period = np.timedelta64(
            int(np.ceil(period * batch_portion)) * 86400000000, 'us')
        # The batch covers every element inside the initial time window.
        batch_size = 0
        for i in range(len(time_stamps)):
            if time_stamps[i] < time_stamps[0] + batch_period:
                batch_size += 1
            else:
                break
    else:
        time_stamps = time_stamps_orig
        batch_size = int(np.ceil(all_data_size * batch_portion))
    batch_sim_map = filter_sim_map(sim_map, batch_size - 1)
    batch_params = {'input_format': 'similarity_dict',
                    'required_format': 'dist_mat',
                    'distance_threshold': batch_threshold,
                    'n_clusters': None,
                    'data_size': batch_size,
                    # 'top_eigenvals': 2,
                    # 'embedding_method': get_coords_mds_stress,
                    # 'embedding_hyper_params': {'eps': 0.0000001,
                    #                            'max_iter': 1000}
                    }
    batch_params.update(params['batch_params'])
    output_dict = clustering_method_call(batch_sim_map, HAC_average,
                                         batch_params)
    y_pred_batch = output_dict['y_pred']
    clusters_initial_tmp = output_dict['cluster_to_id_map'].values()
    clusters_initial = [set(cluster) for cluster in clusters_initial_tmp]
    inc_params = {'dist_threshold': dist_threshold,
                  'update_clusters': False,
                  'clusters': clusters_initial,
                  'data_size': all_data_size,
                  'time_stamps': time_stamps,
                  'window_size': window_size}
    inc_params.update(params['inc_params'])
    y_label_inc = incremental_average_SI(sim_map, hyper_params=inc_params)
    return y_label_inc, y_pred_batch
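
# Usage sketch (illustrative): cluster the first half of a tiny 4-point
# stream in batch mode, then continue incrementally over the full stream.
# The (i, j)-keyed similarity dict is an assumption consistent with the
# sketches above.
def _demo_warm_start_inc_avg_SI():
    sim_map = {(0, 1): 0.95, (0, 2): 0.10, (0, 3): 0.10,
               (1, 2): 0.15, (1, 3): 0.20, (2, 3): 0.90}
    y_inc, y_batch = warm_start_inc_avg_SI(
        sim_map, hyper_params={'data_size': 4,
                               'dist_threshold': 0.3,
                               'window_size': None})
    print(y_inc, y_batch)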
def compare_incremental_multi(entitys, output_name, warm_start=False,
                              weighted=False, time_stamps_flag=False,
                              fixed_window_size=None, batch_portion=0.5,
                              update_clusters=False, second_half_batch=False,
                              linkage='centroid',
                              incremental_no_first_half=False,
                              batch_dist_threshold=0.9):
    """Sweeps the incremental distance threshold and plots the averaged
    V-measure (with error bars) on the second half of each entity's
    stream."""
    all_inc_result = []
    all_inc_c = []
    all_inc_h = []
    sessions_count = []
    # Collect the best score of each entity (initialised once, outside the
    # entity loop, so the histogram below covers all entities).
    best_scores = []
    for p_id in range(len(entitys)):
        single_data = entitys[p_id]
        sim_map = single_data['similarity_graph']
        labels = single_data['id_to_cluster_map']
        print('number of sessions: ', len(labels))
        sessions_count.append(len(labels))
        y = [labels[key] for key in sorted(labels.keys())]
        if time_stamps_flag:
            time_stamps_orig = single_data['time_stamps']
            try:
                time_stamps = [np.datetime64(
                    datetime.datetime.utcfromtimestamp(elem))
                    for elem in time_stamps_orig]
            except (TypeError, ValueError, OSError):
                time_stamps = [np.datetime64(elem)
                               for elem in time_stamps_orig]
            one_day = np.timedelta64(86400000000, 'us')
            period = 1 + int((time_stamps[-1] - time_stamps[0]) / one_day)
            batch_period = np.timedelta64(
                int(np.ceil(period * batch_portion)) * 86400000000, 'us')
            batch_size = 0
            for i in range(len(time_stamps)):
                if time_stamps[i] < time_stamps[0] + batch_period:
                    batch_size += 1
                else:
                    break
        inc_result = []
        inc_c = []
        inc_h = []
        x = []
        for dist in np.arange(0.00, 1.01, 0.05):
            if not warm_start:
                if time_stamps_flag:
                    inc_params = {'dist_threshold': dist,
                                  'data_size': len(labels),
                                  'time_stamps': time_stamps,
                                  'window_size': fixed_window_size,
                                  'linkage': linkage,
                                  'update_clusters': update_clusters}
                else:
                    inc_params = {'dist_threshold': dist,
                                  'data_size': len(labels),
                                  'linkage': linkage,
                                  'update_clusters': update_clusters}
                    # batch_size = len(y)//2
                    # Keep the time-based split computed above when
                    # timestamps are used; otherwise split by portion.
                    batch_size = int(np.ceil(len(y) * batch_portion))
                if incremental_no_first_half:
                    inc_params.update({'data_size': len(labels) - batch_size})
                    new_sim_map = filter_sim_map(sim_map, batch_size,
                                                 min_case=True)
                    y_label_inc = incremental_average_SI(
                        new_sim_map, hyper_params=inc_params)
                else:
                    y_label_inc = incremental_average_SI(
                        sim_map, hyper_params=inc_params)
                    y_label_inc = y_label_inc[batch_size:]
            else:
                if time_stamps_flag:
                    warm_start_params = {
                        'dist_threshold': dist,
                        'data_size': len(labels),
                        'batch_portion': batch_portion,
                        'time_stamps': time_stamps,
                        'window_size': fixed_window_size,
                        'inc_params': {'update_clusters': update_clusters,
                                       'linkage': linkage},
                        'batch_params': {
                            'distance_threshold': batch_dist_threshold}}
                else:
                    warm_start_params = {
                        'dist_threshold': dist,
                        'data_size': len(labels),
                        'batch_portion': batch_portion,
                        'inc_params': {'update_clusters': update_clusters,
                                       'linkage': linkage},
                        'batch_params': {
                            'distance_threshold': batch_dist_threshold}}
                y_label_inc, y_pred_batch = warm_start_inc_avg_SI(
                    sim_map, hyper_params=warm_start_params)
                batch_size = len(y_pred_batch)
                y_label_inc = y_label_inc[batch_size:]
            y_half = y[batch_size:]
            if second_half_batch:
                # Score against a batch clustering of the second half
                # instead of the ground-truth labels.
                second_sim_map = filter_sim_map(sim_map, batch_size,
                                                min_case=True)
                batch_params = {'input_format': 'similarity_dict',
                                'required_format': 'dist_mat',
                                'data_size': len(y_half),
                                'n_clusters': None,
                                'distance_threshold': batch_dist_threshold}
                output_dict = clustering_method_call(
                    second_sim_map, HAC_average, hyper_params=batch_params)
                y_half = output_dict['y_pred']
            inc_result.append(v_measure_score(y_half, y_label_inc))
            inc_c.append(completeness_score(y_half, y_label_inc))
            inc_h.append(homogeneity_score(y_half, y_label_inc))
            x.append(dist)
            print('second half: ', len(y_label_inc))
        best_scores.append(np.max(inc_result))
        all_inc_result.append(inc_result)
        all_inc_c.append(inc_c)
        all_inc_h.append(inc_h)
    if weighted:
        inc_result = np.average(all_inc_result, axis=0,
                                weights=sessions_count)
        inc_c = np.average(all_inc_c, axis=0, weights=sessions_count)
        inc_h = np.average(all_inc_h, axis=0, weights=sessions_count)
    else:
        inc_result = np.average(all_inc_result, axis=0)
        inc_c = np.average(all_inc_c, axis=0)
        inc_h = np.average(all_inc_h, axis=0)
    inc_result_err = np.std(all_inc_result, axis=0)
    inc_c_err = np.std(all_inc_c, axis=0)
    inc_h_err = np.std(all_inc_h, axis=0)
    inc_result_err_lower = inc_result - inc_result_err
    inc_result_err_upper = inc_result + inc_result_err
    inc_result_err_tup = list(zip(inc_result_err_lower, inc_result_err_upper))
    inc_result_err_tup = np.array(inc_result_err_tup).T
    # print('err tup: ', inc_result_err_tup)
    # print(list(y_pred_batch))
    # print(y[:len(y_pred_batch)])
    # print(v_measure_score(y[:len(y_pred_batch)], y_pred_batch))
    # print(purity(y[:len(y_pred_batch)], y_pred_batch))
    # Reversing x turns the distance-threshold sweep (0.0 .. 1.0) into a
    # similarity-threshold axis (similarity = 1 - distance), matching the
    # x-label below.
    x.reverse()
    plt.errorbar(x, inc_result, yerr=inc_result_err, fmt='-o')
    title = ""
    if warm_start:
        title += 'Warm start, '
    else:
        title += 'Cold start, '
    if update_clusters:
        title += 'with merging clusters, '
    else:
        title += 'no merging clusters, '
    if fixed_window_size is not None:
        title += str(fixed_window_size) + ' day window'
    else:
        title += 'no limited window'
    plt.title(title)
    plt.xlabel('Similarity Threshold')
    plt.ylabel('Score')
    # plt.legend()
    plt.ylim(0.0, 1.0)
    plt.xlim(0.0, 1.0)
    plt.savefig(output_name + '.png')
    plt.clf()
    plt.hist(best_scores)
    # plt.savefig(output_name + '_best_hist.png')
    plt.clf()
    return x, inc_result, inc_c, inc_h
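
# Usage sketch (illustrative): `entitys` here is a list with the same field
# layout shown in the compare_timestamps sketch above. The call sweeps the
# distance threshold from 0.0 to 1.0 in 0.05 steps and writes the averaged
# V-measure curve to '<output_name>.png'.
def _demo_compare_incremental_multi(entitys):
    x, v, c, h = compare_incremental_multi(entitys, 'incremental_sweep',
                                           warm_start=True,
                                           batch_portion=0.5,
                                           update_clusters=False)
    # x is the similarity-threshold axis; v, c, h are the averaged V-measure,
    # completeness, and homogeneity scores.
    return x, v, c, h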
def incremental_average_SI(sim_map, hyper_params={}):
    """Incrementally assigns each new element to the closest existing cluster
    (by the chosen linkage) or opens a new one; optionally outdates elements
    outside a time window and merges clusters that fall below the
    threshold."""
    params = {'dist_threshold': 0.5,
              'update_clusters': False,
              'clusters': [set([0])],
              'time_stamps': None,
              'window_size': 15,
              'data_size': None,
              'linkage': 'centroid',
              'max_sim_value': 1.0,
              'only_final_clusters': False,
              'normalize_flag': False,
              'keep_timestamp_results': {'active': False,
                                         'v_scores': [],
                                         'batch_params': {},
                                         'complete_batch': [],
                                         'v_scores_complete': [],
                                         'size_list': [],
                                         'h_scores': [],
                                         'h_scores_complete': [],
                                         'c_scores': [],
                                         'c_scores_complete': [],
                                         'v_scores_limited': []},
              'dist_mat': False}
    params.update(hyper_params)
    data_size = params['data_size']
    max_sim_value = params['max_sim_value']
    normalize_flag = params['normalize_flag']
    linkage = params['linkage']
    keep_timestamp_results = params['keep_timestamp_results']
    if params['dist_mat']:
        # The first argument is already a precomputed distance matrix.
        dist_mat = sim_map
    else:
        sim_mat = dict_to_mat(sim_map, data_size=data_size,
                              max_val=max_sim_value)[0]
        dist_mat = sim_dist_convert(sim_mat, normalize_flag=normalize_flag)
    dist_threshold = params['dist_threshold']
    update_clusters = params['update_clusters']
    clusters = deepcopy(params['clusters'])
    time_stamps_orig = params['time_stamps']
    window_size = params['window_size']
    windowed_flag = False
    if time_stamps_orig is not None and window_size is not None:
        windowed_flag = True
        try:
            time_stamps = [np.datetime64(
                datetime.datetime.utcfromtimestamp(elem))
                for elem in time_stamps_orig]
        except (TypeError, ValueError, OSError):
            time_stamps = [np.datetime64(elem) for elem in time_stamps_orig]
    outdated_set = set()
    already_clustered = set()
    for cluster in clusters:
        already_clustered |= cluster
    first_window_move_flag = True
    first_window_move_idx = 0
    outdate_index = 0  # position after the last outdated element
    one_day = np.timedelta64(86400000000, 'us')  # one day in microseconds
    for i in range(len(dist_mat)):
        if i in already_clustered:
            continue
        if windowed_flag:
            # Outdate every element that has fallen out of the time window.
            while ((time_stamps[i] - time_stamps[outdate_index]) / one_day
                   >= window_size):
                outdated_set.add(outdate_index)
                outdate_index += 1
                if first_window_move_flag:
                    first_window_move_flag = False
                    first_window_move_idx = i
        best_cluster = -1
        min_dist = float("inf")
        for idx, cluster in enumerate(clusters):
            new_dist = average_dist_SI(dist_mat, cluster, i,
                                       outdated_set=outdated_set,
                                       linkage=linkage)
            if new_dist < min_dist:
                min_dist = new_dist
                best_cluster = idx
        update_necessary = False
        if min_dist < dist_threshold:
            # Assign to the closest cluster (was `changed_cluster = idx`,
            # which pointed at the last cluster visited, not the best one).
            changed_cluster = best_cluster
            clusters[best_cluster].add(i)
            update_necessary = True
        else:
            changed_cluster = len(clusters)
            clusters.append(set([i]))
        if update_clusters and update_necessary:
            # Keep merging clusters that fall below the threshold until no
            # further merge is possible.
            remaining_flag = True
            while remaining_flag:
                remaining_flag = False
                for c in range(len(clusters)):
                    if c == changed_cluster:
                        continue
                    cc_dist = compute_new_dist_SI(dist_mat,
                                                  clusters[changed_cluster],
                                                  clusters[c],
                                                  outdated_set=outdated_set,
                                                  linkage=linkage)
                    if cc_dist < dist_threshold:
                        remaining_flag = True
                        clusters[changed_cluster] = (
                            clusters[changed_cluster] | clusters[c])
                        clusters = clusters[:c] + clusters[c + 1:]
                        if c < changed_cluster:
                            changed_cluster -= 1
                        break
        if keep_timestamp_results['active']:
            keep_timestamp_results['first_window_move_idx'] = \
                first_window_move_idx
            tmp_y_inc = [-1 for _ in range(i + 1)]
            for idx, cluster in enumerate(clusters):
                for elem in list(cluster):
                    tmp_y_inc[elem] = idx
            tmp_y_inc = tmp_y_inc[outdate_index:]
            tmp_sim_map = filter_sim_map(sim_map, i)
            # Batch baseline over all points seen so far (no outdating):
            keep_timestamp_results['batch_params']['data_size'] = i + 1
            keep_timestamp_results['size_list'].append(len(tmp_y_inc))
            tmp_batch_output = clustering_method_call(
                tmp_sim_map, HAC_average,
                hyper_params=keep_timestamp_results['batch_params'])
            tmp_y_batch = tmp_batch_output['y_pred']
            tmp_y_batch = tmp_y_batch[-len(tmp_y_inc):]
            keep_timestamp_results['v_scores'].append(
                v_measure_score(tmp_y_batch, tmp_y_inc))
            keep_timestamp_results['h_scores'].append(
                homogeneity_score(tmp_y_batch, tmp_y_inc))
            keep_timestamp_results['c_scores'].append(
                completeness_score(tmp_y_batch, tmp_y_inc))
            # Batch baseline restricted to the current window (with outdating):
            tmp_sim_map = filter_sim_map(tmp_sim_map, outdate_index,
                                         min_case=True)
            keep_timestamp_results['batch_params']['data_size'] = \
                len(tmp_y_inc)
            tmp_batch_output = clustering_method_call(
                tmp_sim_map, HAC_average,
                hyper_params=keep_timestamp_results['batch_params'])
            tmp_y_batch = tmp_batch_output['y_pred']
            keep_timestamp_results['v_scores_limited'].append(
                v_measure_score(tmp_y_batch, tmp_y_inc))
            complete_batch = keep_timestamp_results['complete_batch']
            if len(complete_batch) > 0:
                keep_timestamp_results['v_scores_complete'].append(
                    v_measure_score(complete_batch[outdate_index:i + 1],
                                    tmp_y_inc))
                keep_timestamp_results['h_scores_complete'].append(
                    homogeneity_score(complete_batch[outdate_index:i + 1],
                                      tmp_y_inc))
                keep_timestamp_results['c_scores_complete'].append(
                    completeness_score(complete_batch[outdate_index:i + 1],
                                       tmp_y_inc))
    first_index = 0
    if params['only_final_clusters']:
        first_index = outdate_index
    y = [-1 for i in range(len(dist_mat))]
    for idx, cluster in enumerate(clusters):
        for elem in list(cluster):
            if elem < first_index:
                continue
            y[elem] = idx
    return y[first_index:]
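
# Toy example (illustrative): with 'dist_mat': True the first argument is
# consumed directly as a precomputed distance matrix. Given the block
# structure below, points 0-1 and 2-3 should end up in separate clusters
# (e.g. [0, 0, 1, 1]), assuming `average_dist_SI` returns the distance
# between a point and a cluster under the default linkage.
def _demo_incremental_average_SI():
    dist_mat = np.array([[0.00, 0.10, 0.90, 0.80],
                         [0.10, 0.00, 0.85, 0.90],
                         [0.90, 0.85, 0.00, 0.10],
                         [0.80, 0.90, 0.10, 0.00]])
    y = incremental_average_SI(dist_mat,
                               hyper_params={'dist_threshold': 0.5,
                                             'dist_mat': True,
                                             'data_size': 4})
    print(y)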