def compare_datasets(datasets1, datasets2, ds_name_1="ds1", ds_name_2="ds2"):
    """Report per-embedding metric differences between two sets of datasets."""
    for embedding_name, _ in EMBEDDING_FOLDERS.items():
        metrics_differences = [compute_metrics_difference(ds1, ds2, embedding_name) for ds1, ds2 in zip(datasets1, datasets2)]
        make_datasets_report(metrics_differences, 'purity', ds_name_1, ds_name_2)
        make_datasets_report(metrics_differences, 'adjusted_mutual_info', ds_name_1, ds_name_2)
        make_datasets_report(metrics_differences, 'adjusted_rand', ds_name_1, ds_name_2)
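
# compute_metrics_difference and make_datasets_report are repo helpers defined
# elsewhere; the sketch below is a hypothetical minimal version of the former
# (hence the _sketch suffix, to avoid clashing with the real helper): it is
# assumed to subtract the evaluation metrics of two runs of the same dataset
# for a given embedding.
def compute_metrics_difference_sketch(ds1, ds2, embedding_name):
    metrics_1 = ds1['evaluation_metrics_{}'.format(embedding_name)]
    metrics_2 = ds2['evaluation_metrics_{}'.format(embedding_name)]
    return {
        'name': ds1['name'],
        'purity': metrics_1['purity'] - metrics_2['purity'],
        'adjusted_mutual_info': (metrics_1['adjusted_mutual_info'] -
                                 metrics_2['adjusted_mutual_info']),
        'adjusted_rand': metrics_1['adjusted_rand'] - metrics_2['adjusted_rand'],
    }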
def write_stats_clusterings_xls(datasets, file_name='clustering_stats.xlsx'):
    header_1 = ['']
    header_2 = ['']
    stats_data = []
    for idx, dataset in enumerate(datasets):
        dataset_name = dataset['name']
        data_row = [dataset_name]
        for embedding_name, _ in EMBEDDING_FOLDERS.items():
            evaluation_metrics = dataset['evaluation_metrics_{}'.format(
                embedding_name)]
            purity = evaluation_metrics['purity']
            adj_mi = evaluation_metrics['adjusted_mutual_info']
            adj_rand = evaluation_metrics['adjusted_rand']
            avg_clust_cohesiveness = evaluation_metrics[
                'average_cluster_cohesiveness']
            avg_semantic_cohesiveness = evaluation_metrics[
                'average_semantic_cohesiveness']
            num_clusters = max(dataset['labels_{}'.format(embedding_name)]) + 1

            data_row += [
                purity, adj_mi, adj_rand, avg_clust_cohesiveness,
                avg_semantic_cohesiveness, num_clusters
            ]

            if idx == 0:  # add these headers only once per embedding
                header_1 += [embedding_name, '', '', '', '', '']
                header_2 += [
                    'Purity', 'Adj. MI', 'Adj. Rand', 'Avg Clust Cohesiveness',
                    'Avg Semantic Cohesiveness', '# clusters'
                ]
        stats_data.append(data_row)
    data = [header_1, header_2] + stats_data
    write_2d_array_to_xls(data, file_name)
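
# write_2d_array_to_xls is a repo helper defined elsewhere; a minimal sketch of
# what it is assumed to do, using openpyxl (hypothetical _sketch suffix to
# avoid clashing with the real helper):
def write_2d_array_to_xls_sketch(data, file_name):
    from openpyxl import Workbook
    workbook = Workbook()
    worksheet = workbook.active
    for row in data:
        worksheet.append(row)  # one list of cells per spreadsheet row
    workbook.save(file_name)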
def remove_failed_embeddings(datasets):
    """
    Check all embedding folders for missing feature files and remove the
    corresponding sounds from the datasets.

    """
    with open('all_sound_ids.json') as f:
        all_ids = json.load(f)
    missing_embeddings = []

    # parse all embedding folders and see which files are missing
    for embedding_folder in EMBEDDING_FOLDERS.values():
        if embedding_folder:
            embeddings_files = os.listdir(embedding_folder)
            for s_id in all_ids:
                if '{0}.npy'.format(s_id) not in embeddings_files:
                    missing_embeddings.append(s_id)

    missing_embeddings = set(missing_embeddings)

    # remove the affected sounds from the datasets
    for dataset in datasets:
        for s_id in missing_embeddings:
            try:
                dataset['sound_ids'].remove(s_id)
            except ValueError:  # sound not present in this dataset
                pass
        for _, obj in dataset['dataset'].items():
            for s_id in missing_embeddings:
                try:
                    obj.remove(s_id)
                except ValueError:  # sound not present under this label
                    pass
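
# For reference, the dataset dicts handled here are assumed to have at least
# the following shape (hypothetical values, for illustration only):
#     {
#         'name': 'some-dataset',
#         'sound_ids': [1234, 5678, ...],
#         'dataset': {'/m/09x0r': [1234, ...], ...},  # AudioSet label id -> sound ids
#     }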
def local_codebook_encoding_main(codebook_sizes, datasets, ontology_by_id):
    for codebook_size in codebook_sizes:
        for dataset in datasets:
            sound_ids, true_labels, label_ids, label_names = create_label_vector(
                dataset['dataset'], ontology_by_id)
            for embedding_name, embedding_folder in EMBEDDING_FOLDERS.items():
                features = load_features(sound_ids, embedding_folder)

                if not have_enough_data_to_cluster(features, codebook_size):
                    # skip this embedding; rebinding dataset to None here
                    # would break the remaining iterations over this dataset
                    print('\nNot enough data to process a codebook of {} words '
                          'in dataset: {}\n'.format(codebook_size,
                                                    dataset['name']))
                else:
                    codebook = generate_codebooks(features, codebook_size)
                    encoded_audios = encode_frames(features, codebook)
                    histograms = create_histograms(
                        codebook, encoded_audios
                    )  # maybe not really needed, only for plotting

                    similarity_matrix = compute_similarity_matrix_from_tfidf(
                        encoded_audios)
                    cluster_labels, eval_metrics, graph = cluster_dataset(
                        true_labels, similarity_matrix)

                    log_clustering(
                        dataset['name'], eval_metrics,
                        "with local codebook of size " + str(codebook_size))

                    dataset['sound_ids'] = sound_ids  # order of the sounds
                    dataset['labels'] = true_labels  # idx
                    dataset['label_ids'] = label_ids  # audioset id
                    dataset['label_names'] = label_names  # name
                    dataset['X_{}'.format(embedding_name)] = features
                    dataset['codebook_{}'.format(embedding_name)] = codebook
                    dataset['encoded_{}'.format(
                        embedding_name)] = encoded_audios
                    dataset['histograms_{}'.format(
                        embedding_name)] = histograms
                    dataset['labels_{}'.format(
                        embedding_name)] = cluster_labels
                    dataset['evaluation_metrics_{}'.format(
                        embedding_name)] = eval_metrics

                    web_visu_dataset_name = '-'.join([
                        dataset['name'], 'local_codebook',
                        str(codebook_size)
                    ])
                    dataset['web_visu_dataset_name'] = web_visu_dataset_name
                    save_graph_json_for_web_visu(graph, sound_ids,
                                                 web_visu_dataset_name,
                                                 embedding_name)

        filename_to_save = "files/" + str(
            codebook_size) + "_codebook_clusters.file"
        save_as_binary(datasets, filename_to_save)
        add_clustering_info_for_web_visu(datasets)
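
# generate_codebooks and encode_frames are implemented elsewhere in the repo;
# the sketches below show the standard bag-of-features recipe they are assumed
# to follow (hypothetical, using scikit-learn k-means over the stacked frames):
def generate_codebook_sketch(features, codebook_size):
    from sklearn.cluster import KMeans
    all_frames = np.vstack(features)  # (total_num_frames, frame_dim)
    kmeans = KMeans(n_clusters=codebook_size).fit(all_frames)
    return kmeans.cluster_centers_  # one codebook "word" per row

def encode_frames_sketch(features, codebook):
    from scipy.spatial.distance import cdist
    # assign each frame of each sound to its nearest codebook word
    return [np.argmin(cdist(f, codebook), axis=1) for f in features]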
def get_all_frames(datasets, ontology_by_id):
    """Collect the frame-level features of all sounds across all datasets."""
    all_frames = []
    for _, embedding_folder in EMBEDDING_FOLDERS.items():
        for dataset in datasets:
            sound_ids, _, _, _ = create_label_vector(dataset['dataset'],
                                                     ontology_by_id)
            features = load_features(sound_ids, embedding_folder)
            all_frames += features  # one feature array per sound
    return all_frames
def add_clustering_info_for_web_visu(datasets):
    """Merge the current datasets/features info into the web visu config."""
    previous_info = load_json('web-visu/clustering_info.json')
    clustering_info = {
        'datasets': [dataset['web_visu_dataset_name'] for dataset in datasets],
        'features': list(EMBEDDING_FOLDERS.keys()),
    }

    if previous_info:
        updated_info = {
            'datasets':
            previous_info['datasets'] + clustering_info['datasets'],
            'features': clustering_info['features'],
        }
    else:
        updated_info = clustering_info

    with open('web-visu/clustering_info.json', 'w') as f:
        json.dump(updated_info, f)
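
# load_json is a repo helper assumed to return an empty dict when the file
# does not exist yet (the merge above relies on that); a minimal sketch:
def load_json_sketch(path):
    try:
        with open(path) as f:
            return json.load(f)
    except (IOError, ValueError):
        return {}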
def add_sound_metadata_to_graph(dataset, metadata):
    for embedding_name, _ in EMBEDDING_FOLDERS.items():
        graph_file = 'web-visu/json/{}-{}.json'.format(dataset['name'],
                                                       embedding_name)
        with open(graph_file) as f:
            graph = json.load(f)
        for node in graph['nodes']:
            try:
                node.update(metadata[str(node['sound_id'])])
            except KeyError:  # no metadata available for this sound
                pass

        with open(graph_file, 'w') as f:
            json.dump(graph, f)
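
# metadata is assumed to map sound ids (as strings) to dicts of extra node
# attributes, e.g. (hypothetical): {'1234': {'name': 'dog-bark.wav', 'tags': [...]}}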
def statistical_agg_main(datasets, ontology_by_id):
    #========= original pipelines =======================================
    for dataset in datasets:
        sound_ids, true_labels, label_ids, label_names = create_label_vector(
            dataset['dataset'], ontology_by_id)
        for embedding_name, embedding_folder in EMBEDDING_FOLDERS.items():
            features = load_features(sound_ids, embedding_folder)
            #dataset = create_merged_features(dataset) #==> needs openl3?
            features_mean = statistical_aggregation_features(features)
            similarity_matrix = compute_similarity_matrix(features_mean)
            cluster_labels, eval_metrics, graph = cluster_dataset(
                true_labels, similarity_matrix)

            log_clustering(dataset['name'], eval_metrics,
                           "with original setup")

            dataset['sound_ids'] = sound_ids  # order of the sounds
            dataset['labels'] = true_labels  # idx
            dataset['label_ids'] = label_ids  # audioset id
            dataset['label_names'] = label_names  # name

            dataset['X_mean_{}'.format(embedding_name)] = features_mean
            dataset['X_{}'.format(embedding_name)] = features
            dataset['labels_{}'.format(embedding_name)] = cluster_labels
            dataset['evaluation_metrics_{}'.format(
                embedding_name)] = eval_metrics

            # web-visu

            web_visu_dataset_name = '-'.join([dataset['name'], 'mean'])
            dataset['web_visu_dataset_name'] = web_visu_dataset_name
            save_graph_json_for_web_visu(graph, sound_ids,
                                         web_visu_dataset_name, embedding_name)

    filename_to_save = "files/mean_computed_datasets_clusters.file"
    save_as_binary(datasets, filename_to_save)
    add_clustering_info_for_web_visu(datasets)
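
# statistical_aggregation_features and compute_similarity_matrix live
# elsewhere in the repo; hypothetical minimal sketches of what they are
# assumed to do: summarise the frames of each sound with statistics, then
# take pairwise cosine similarities between the summary vectors.
def statistical_aggregation_features_sketch(features):
    # mean over the time axis; std or higher moments could be concatenated too
    return [np.mean(f, axis=0) for f in features]

def compute_similarity_matrix_sketch(features_mean):
    from sklearn.metrics.pairwise import cosine_similarity
    return cosine_similarity(np.vstack(features_mean))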
def load_features(dataset):
    """
    Returns the given dataset extended with the embedding features of its
    sounds.

    """
    sound_ids = dataset['sound_ids']
    for embedding_name, embedding_folder in EMBEDDING_FOLDERS.items():
        if embedding_folder:
            embedding_files = [
                embedding_folder + '{0}.npy'.format(sound_id)
                for sound_id in sound_ids
            ]

            features = [np.load(f) for f in embedding_files]
            if embedding_name == 'mfcc':
                X = features
            else:
                X = [np.mean(f, axis=0) for f in features]

            dataset['X_{}'.format(embedding_name)] = X
            # X = [return_gmm(f) for f in features]
            # dataset['X_gmm_{}'.format(embedding_name)] = X

    return dataset
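
# After load_features(dataset), each dataset['X_<embedding>'] is a list
# aligned with dataset['sound_ids']: MFCC entries keep their per-frame
# (num_frames, num_coeffs) shape, while the other embeddings are mean-pooled
# into a single vector per sound.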
def cluster_dataset(dataset):
    """
    Apply clustering on the given dataset for the different features.
    Saves results in the dataset dict.
    Displays evaluation results.

    """
    print('\n')
    print('{} dataset:\n'.format(dataset['name']))
    for embedding_name, _ in EMBEDDING_FOLDERS.items():
        X = dataset['X_{}'.format(embedding_name)]
        true_labels = dataset['labels']

        # knn graph clustering
        labels, graph_json = cluster(X)

        # kmeans clustering
        # labels, graph_json = cluster_kmeans(X, num_clusters=max(dataset['labels'])+1)

        # agglomerative clustering
        # labels, graph_json = cluster_agglomerative(X, num_clusters=max(dataset['labels'])+1)

        # spectral clustering
        # labels, graph_json = cluster_spectral(X)

        # DBSCAN clustering
        # NOT WORKING
        # labels, graph_json = cluster_DBSCAN(X)

        # MeanShift clustering
        # labels, graph_json = cluster_MeanShift(X)

        dataset['labels_{}'.format(embedding_name)] = labels
        (purity, adjusted_mutual_info, adjusted_rand, cluster_cohesiveness,
         semantic_cohesiveness, all_semantic_cohesiveness) = evaluate(
             labels, true_labels)

        # Associate semantic cohesiveness to label name
        semantic_cohesiveness_per_category = {
            label_name: s_cohesiveness
            for label_name, s_cohesiveness in zip(dataset['label_names'],
                                                  all_semantic_cohesiveness)
        }

        dataset['evaluation_metrics_{}'.format(embedding_name)] = {
            'purity':
            purity,
            'adjusted_mutual_info':
            adjusted_mutual_info,
            'adjusted_rand':
            adjusted_rand,
            'average_cluster_cohesiveness':
            cluster_cohesiveness,
            'average_semantic_cohesiveness':
            semantic_cohesiveness,
            'semantic_cohesiveness_per_category':
            semantic_cohesiveness_per_category
        }

        print('{} embeddings'.format(embedding_name))
        print(dataset['evaluation_metrics_{}'.format(embedding_name)])

        # save the clustered graph as a json file for the web visualisation
        for node in graph_json['nodes']:
            node.update({'sound_id': dataset['sound_ids'][node['id']]})
        with open('web-visu/json/{}-{}.json'.format(dataset['name'],
                                                    embedding_name), 'w') as f:
            json.dump(graph_json, f)
    return dataset
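
# evaluate is defined elsewhere; purity, adjusted MI and adjusted Rand are
# standard metrics and assumed to be computed roughly as in this sketch (the
# two cohesiveness measures are project-specific and omitted here):
def evaluate_standard_metrics_sketch(labels, true_labels):
    from sklearn import metrics
    from sklearn.metrics.cluster import contingency_matrix
    contingency = contingency_matrix(true_labels, labels)
    # purity: fraction of sounds assigned to their cluster's majority class
    purity = np.sum(np.amax(contingency, axis=0)) / np.sum(contingency)
    adj_mi = metrics.adjusted_mutual_info_score(true_labels, labels)
    adj_rand = metrics.adjusted_rand_score(true_labels, labels)
    return purity, adj_mi, adj_rand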