def compare_datasets(datasets1, datasets2, ds_name_1="ds1", ds_name_2="ds2"):
    # combined_datasets = [{**ds1, **ds2} for ds1, ds2 in zip(datasets_mean, datasets_codebook)]
    for embedding_name, _ in EMBEDDING_FOLDERS.items():
        metrics_differences = [
            compute_metrics_difference(ds1, ds2, embedding_name)
            for ds1, ds2 in zip(datasets1, datasets2)
        ]
        make_datasets_report(metrics_differences, 'purity', ds_name_1, ds_name_2)
        make_datasets_report(metrics_differences, 'adjusted_mutual_info', ds_name_1, ds_name_2)
        make_datasets_report(metrics_differences, 'adjusted_rand', ds_name_1, ds_name_2)
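# compute_metrics_difference is defined elsewhere in the project. A minimal
# sketch of the behavior compare_datasets assumes (per-metric deltas between
# two variants of the same dataset) could look like the following; the name
# and exact keys returned are illustrative, not the project's actual code:
def compute_metrics_difference_sketch(ds1, ds2, embedding_name):
    m1 = ds1['evaluation_metrics_{}'.format(embedding_name)]
    m2 = ds2['evaluation_metrics_{}'.format(embedding_name)]
    return {
        'name': ds1['name'],
        'purity': m1['purity'] - m2['purity'],
        'adjusted_mutual_info': (m1['adjusted_mutual_info']
                                 - m2['adjusted_mutual_info']),
        'adjusted_rand': m1['adjusted_rand'] - m2['adjusted_rand'],
    }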
def write_stats_clusterings_xls(datasets, file_name='clustering_stats.xlsx'):
    header_1 = ['']
    header_2 = ['']
    stats_data = []
    for idx, dataset in enumerate(datasets):
        dataset_name = dataset['name']
        data_row = [dataset_name]
        for embedding_name, _ in EMBEDDING_FOLDERS.items():
            evaluation_metrics = dataset['evaluation_metrics_{}'.format(
                embedding_name)]
            purity = evaluation_metrics['purity']
            adj_mi = evaluation_metrics['adjusted_mutual_info']
            adj_rand = evaluation_metrics['adjusted_rand']
            avg_clust_cohesiveness = evaluation_metrics[
                'average_cluster_cohesiveness']
            avg_semantic_cohesiveness = evaluation_metrics[
                'average_semantic_cohesiveness']
            num_clusters = max(dataset['labels_{}'.format(embedding_name)]) + 1
            data_row += [
                purity, adj_mi, adj_rand, avg_clust_cohesiveness,
                avg_semantic_cohesiveness, num_clusters
            ]
            if idx == 0:  # add these headers only once per embedding
                header_1 += [embedding_name, '', '', '', '', '']
                header_2 += [
                    'Purity', 'Adj. MI', 'Adj. Rand', 'Avg Clust Cohesiveness',
                    'Avg Semantic Cohesiveness', '# clusters'
                ]
        stats_data.append(data_row)
    data = [header_1, header_2] + stats_data
    write_2d_array_to_xls(data, file_name)
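# write_2d_array_to_xls is imported from elsewhere in the project. A minimal
# openpyxl-based sketch of the assumed behavior (each inner list becomes one
# spreadsheet row) is given below for reference; the real implementation may
# differ:
def write_2d_array_to_xls_sketch(data, file_name):
    from openpyxl import Workbook
    wb = Workbook()
    ws = wb.active
    for row in data:
        ws.append(row)  # one list of cell values per row
    wb.save(file_name)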
# NOTE: this version is superseded by the redefinition further below,
# which merges the new entries with previously saved info.
def add_clustering_info_for_web_visu(datasets):
    clustering_info = {
        'datasets': [dataset['name'] for dataset in datasets],
        'features': [embedding_name
                     for embedding_name, _ in EMBEDDING_FOLDERS.items()]
    }
    json.dump(clustering_info, open('web-visu/clustering_info.json', 'w'))
def remove_failed_embeddings(datasets):
    """
    Check all embedding folders to list missing files.
    Remove the corresponding sounds from the datasets.
    """
    all_ids = json.load(open('all_sound_ids.json'))
    missing_embeddings = []

    # parse all embedding folders and see which files are missing
    for embedding_folder in EMBEDDING_FOLDERS.values():
        if embedding_folder:
            embeddings_files = set(os.listdir(embedding_folder))
            for s_id in all_ids:
                if '{0}.npy'.format(s_id) not in embeddings_files:
                    missing_embeddings.append(s_id)
    missing_embeddings = set(missing_embeddings)

    # remove those sounds from the datasets
    for dataset in datasets:
        for s_id in missing_embeddings:
            try:
                dataset['sound_ids'].remove(s_id)
            except ValueError:  # sound not in this dataset
                pass
        for _, obj in dataset['dataset'].items():
            for s_id in missing_embeddings:
                try:
                    obj.remove(s_id)
                except ValueError:
                    pass
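# The membership test above builds the directory listing into a set so each
# lookup is O(1). The same check isolated into a helper (hypothetical name,
# assuming embeddings are stored as '<sound_id>.npy'):
def find_missing_embeddings(all_ids, embedding_folder):
    import os
    present = set(os.listdir(embedding_folder))
    return {s_id for s_id in all_ids
            if '{0}.npy'.format(s_id) not in present}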
def local_codebook_encoding_main(codebook_sizes, datasets, ontology_by_id):
    for codebook_size in codebook_sizes:
        for dataset in datasets:
            sound_ids, true_labels, label_ids, label_names = create_label_vector(
                dataset['dataset'], ontology_by_id)
            for embedding_name, embedding_folder in EMBEDDING_FOLDERS.items():
                features = load_features(sound_ids, embedding_folder)
                if not have_enough_data_to_cluster(features, codebook_size):
                    print('\nNot enough data to process codebook for {} words, '
                          'in dataset: {}\n\n'.format(codebook_size,
                                                      dataset['name']))
                    continue  # skip this embedding, keep the dataset intact
                codebook = generate_codebooks(features, codebook_size)
                encoded_audios = encode_frames(features, codebook)
                histograms = create_histograms(
                    codebook,
                    encoded_audios)  # maybe not really needed, only for plotting
                similarity_matrix = compute_similarity_matrix_from_tfidf(
                    encoded_audios)
                cluster_labels, eval_metrics, graph = cluster_dataset(
                    true_labels, similarity_matrix)
                log_clustering(dataset['name'], eval_metrics,
                               'with local codebook of size {}'.format(codebook_size))

                dataset['sound_ids'] = sound_ids  # order of the sounds
                dataset['labels'] = true_labels  # idx
                dataset['label_ids'] = label_ids  # audioset id
                dataset['label_names'] = label_names  # name
                dataset['X_{}'.format(embedding_name)] = features
                dataset['codebook_{}'.format(embedding_name)] = codebook
                dataset['encoded_{}'.format(embedding_name)] = encoded_audios
                dataset['histograms_{}'.format(embedding_name)] = histograms
                dataset['labels_{}'.format(embedding_name)] = cluster_labels
                dataset['evaluation_metrics_{}'.format(
                    embedding_name)] = eval_metrics

                web_visu_dataset_name = '-'.join(
                    [dataset['name'], 'local_codebook', str(codebook_size)])
                dataset['web_visu_dataset_name'] = web_visu_dataset_name
                save_graph_json_for_web_visu(graph, sound_ids,
                                             web_visu_dataset_name,
                                             embedding_name)
                # save_graph_json_for_web_visu(graph, sound_ids, dataset['name'], embedding_name)

        filename_to_save = 'files/{}_codebook_clusters.file'.format(codebook_size)
        save_as_binary(datasets, filename_to_save)
        add_clustering_info_for_web_visu(datasets)
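# have_enough_data_to_cluster is defined elsewhere. One plausible reading,
# given that the codebook is built by clustering frames, is that there must
# be at least as many frames as codebook entries; this sketch is an
# assumption, not the project's actual check:
def have_enough_data_to_cluster_sketch(features, codebook_size):
    total_frames = sum(len(f) for f in features)  # frames across all sounds
    return total_frames >= codebook_size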
def get_all_frames(datasets, ontology_by_id):
    all_frames = []
    for _, embedding_folder in EMBEDDING_FOLDERS.items():
        for dataset in datasets:
            sound_ids, _, _, _ = create_label_vector(dataset['dataset'],
                                                     ontology_by_id)
            features = load_features(sound_ids, embedding_folder)
            all_frames += features
    return all_frames
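# Example usage: get_all_frames pools frames across all datasets and
# embeddings, presumably to train a single global codebook as a counterpart
# to the per-dataset codebooks above (illustrative call, not from the
# original source):
# all_frames = get_all_frames(datasets, ontology_by_id)
# global_codebook = generate_codebooks(all_frames, codebook_size)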
def add_clustering_info_for_web_visu(datasets):
    previous_info = load_json('web-visu/clustering_info.json')
    clustering_info = {
        'datasets': [dataset['web_visu_dataset_name'] for dataset in datasets],
        'features': [embedding_name
                     for embedding_name, _ in EMBEDDING_FOLDERS.items()]
    }
    if len(previous_info) > 0:
        updated_info = {
            'datasets': previous_info['datasets'] + clustering_info['datasets'],
            'features': clustering_info['features'],
        }
    else:
        updated_info = clustering_info
    json.dump(updated_info, open('web-visu/clustering_info.json', 'w'))
def add_sound_metadata_to_graph(dataset, metadata):
    for embedding_name, _ in EMBEDDING_FOLDERS.items():
        graph = json.load(
            open('web-visu/json/{}-{}.json'.format(dataset['name'],
                                                   embedding_name)))
        for node in graph['nodes']:
            try:
                node.update(metadata[str(node['sound_id'])])
            except KeyError:  # no metadata available for this sound
                pass
        json.dump(
            graph,
            open('web-visu/json/{}-{}.json'.format(dataset['name'],
                                                   embedding_name), 'w'))
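# Example usage, assuming a JSON file mapping sound ids (as strings, which
# is how the lookup above indexes metadata) to metadata dicts; the file
# name is illustrative:
# metadata = json.load(open('sound_metadata.json'))
# for dataset in datasets:
#     add_sound_metadata_to_graph(dataset, metadata)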
def statistical_agg_main(datasets, ontology_by_id):
    # ========= original pipeline =========
    for dataset in datasets:
        sound_ids, true_labels, label_ids, label_names = create_label_vector(
            dataset['dataset'], ontology_by_id)
        for embedding_name, embedding_folder in EMBEDDING_FOLDERS.items():
            features = load_features(sound_ids, embedding_folder)
            # dataset = create_merged_features(dataset)  # ==> needs openl3?
            features_mean = statistical_aggregation_features(features)
            similarity_matrix = compute_similarity_matrix(features_mean)
            cluster_labels, eval_metrics, graph = cluster_dataset(
                true_labels, similarity_matrix)
            log_clustering(dataset['name'], eval_metrics, 'with original setup')

            dataset['sound_ids'] = sound_ids  # order of the sounds
            dataset['labels'] = true_labels  # idx
            dataset['label_ids'] = label_ids  # audioset id
            dataset['label_names'] = label_names  # name
            dataset['X_mean_{}'.format(embedding_name)] = features_mean
            dataset['X_{}'.format(embedding_name)] = features
            dataset['labels_{}'.format(embedding_name)] = cluster_labels
            dataset['evaluation_metrics_{}'.format(
                embedding_name)] = eval_metrics

            # web-visu
            web_visu_dataset_name = '-'.join([dataset['name'], 'mean'])
            dataset['web_visu_dataset_name'] = web_visu_dataset_name
            save_graph_json_for_web_visu(graph, sound_ids,
                                         web_visu_dataset_name, embedding_name)

    filename_to_save = 'files/mean_computed_datasets_clusters.file'
    save_as_binary(datasets, filename_to_save)
    add_clustering_info_for_web_visu(datasets)
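# statistical_aggregation_features and compute_similarity_matrix are defined
# elsewhere. Two minimal sketches consistent with how they are used here:
# the aggregation mirrors the frame-mean used in load_features below, and
# the similarity is one common choice (cosine). Both are assumptions, not
# the project's actual implementations:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def statistical_aggregation_features_sketch(features):
    # mean over the time axis of each (num_frames x num_dims) matrix
    return [np.mean(f, axis=0) for f in features]

def compute_similarity_matrix_sketch(features_mean):
    X = np.vstack(features_mean)  # one row per sound
    return cosine_similarity(X)  # (num_sounds x num_sounds)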
def load_features(dataset):
    """ Returns the given dataset with embedding features added. """
    sound_ids = dataset['sound_ids']
    for embedding_name, embedding_folder in EMBEDDING_FOLDERS.items():
        if embedding_folder:
            embedding_files = [
                embedding_folder + '{0}.npy'.format(sound_id)
                for sound_id in sound_ids
            ]
            features = [np.load(f) for f in embedding_files]
            if embedding_name == 'mfcc':
                X = features  # keep frame-level MFCCs
            else:
                X = [np.mean(f, axis=0) for f in features]  # mean over frames
            dataset['X_{}'.format(embedding_name)] = X
            # X = [return_gmm(f) for f in features]
            # dataset['X_gmm_{}'.format(embedding_name)] = X
    return dataset
def cluster_dataset(dataset):
    """
    Apply clustering on the given dataset for the different features.
    Saves results in the dataset dict. Displays evaluation results.
    """
    print('\n')
    print('{} dataset:\n'.format(dataset['name']))
    for embedding_name, _ in EMBEDDING_FOLDERS.items():
        X = dataset['X_{}'.format(embedding_name)]
        true_labels = dataset['labels']

        # knn graph clustering
        labels, graph_json = cluster(X)
        # kmeans clustering
        # labels, graph_json = cluster_kmeans(X, num_clusters=max(dataset['labels'])+1)
        # agglomerative clustering
        # labels, graph_json = cluster_agglomerative(X, num_clusters=max(dataset['labels'])+1)
        # spectral clustering
        # labels, graph_json = cluster_spectral(X)
        # DBSCAN clustering  # NOT WORKING
        # labels, graph_json = cluster_DBSCAN(X)
        # MeanShift clustering
        # labels, graph_json = cluster_MeanShift(X)

        dataset['labels_{}'.format(embedding_name)] = labels

        purity, adjusted_mutual_info, adjusted_rand, cluster_cohesiveness, \
            semantic_cohesiveness, all_semantic_cohesiveness = evaluate(
                labels, true_labels)

        # Associate semantic cohesiveness to label name
        semantic_cohesiveness_per_category = {
            label_name: s_cohesiveness
            for label_name, s_cohesiveness in zip(dataset['label_names'],
                                                  all_semantic_cohesiveness)
        }

        dataset['evaluation_metrics_{}'.format(embedding_name)] = {
            'purity': purity,
            'adjusted_mutual_info': adjusted_mutual_info,
            'adjusted_rand': adjusted_rand,
            'average_cluster_cohesiveness': cluster_cohesiveness,
            'average_semantic_cohesiveness': semantic_cohesiveness,
            'semantic_cohesiveness_per_category': semantic_cohesiveness_per_category
        }

        print('{} embeddings'.format(embedding_name))
        print(dataset['evaluation_metrics_{}'.format(embedding_name)])

        # save clustered graph as json file
        for node in graph_json['nodes']:
            node.update({'sound_id': dataset['sound_ids'][node['id']]})
        json.dump(
            graph_json,
            open('web-visu/json/{}-{}.json'.format(dataset['name'],
                                                   embedding_name), 'w'))
    return dataset
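# evaluate is defined elsewhere. The three standard scores it returns can be
# computed with scikit-learn as sketched below; the cluster/semantic
# cohesiveness measures are project-specific and omitted here:
import numpy as np
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.metrics.cluster import contingency_matrix

def clustering_scores_sketch(labels, true_labels):
    cm = contingency_matrix(true_labels, labels)  # classes x clusters counts
    purity = np.sum(np.amax(cm, axis=0)) / np.sum(cm)
    ami = adjusted_mutual_info_score(true_labels, labels)
    ari = adjusted_rand_score(true_labels, labels)
    return purity, ami, ari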