def main():
    # conversations = [("B", "A"), ("B", "M"), ("C", "E"), ("C", "F"), ("D", "G"), ("D", "L"), ("E", "I")]
    # conversations = [("E", "J"), ("F", "E"), ("F", "J"), ("G", "B"), ("G", "L")]
    conversations = [[]]
    for convo in conversations:
        convo_reduction_start = time.time()
        # names= data_handling.read_names(["case_study"])
        print("Conversation: ", convo)
        names = data_handling.read_names(["geco", data_handling.get_conversation_directory(convo)])
        samples, targets, zero_length_indices = data_handling.dataset(names)
        names = np.delete(names, zero_length_indices)

        print("original shapes, samples: {} \ttargets: {} \tnames: {}".format(np.shape(samples), np.shape(targets), np.shape(names)))
        samples = preprocessing.scale_select(samples)

        # data = np.hstack((samples, np.reshape(targets, (len(targets), 1))))
        # print("Data shape: {}".format(np.shape(data)))
        # unique, counts = np.unique(targets, return_counts = True)
        # print("Class breakdown: {}".format(dict(zip(unique, counts))))

        # data = preprocessing.balance_classes(data)
        # print("Balanced data shape: {}".format(np.shape(data)))
        # samples = data[:, :-1]
        # targets = data[:, -1]

        if True:
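            # t-SNE hyperparameter grid; the early_exaggeration values are
            # multiples of sklearn's default of 12.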
            perplexities = [30]
            learning_rates = np.arange(300, 700, 100)
            # learning_rates = [200]
            exaggerations = [72, 84, 96]
            for perplexity in perplexities:
                for exaggeration in exaggerations:
                    for l_r in learning_rates:
                        print("p: {}, lr:{}, exag: {}".format(perplexity, l_r, exaggeration))
                        start = time.time()
                        # tsne = manifold.TSNE(n_components = 2, init='pca', perplexity=perplexity, early_exaggeration = exaggeration, 
                        #                     learning_rate = l_r, n_iter = 1000,
                        #                     random_state=data_handling.RANDOM_SEED, verbose = 1)

                        # reduced_samples = tsne.fit_transform(samples)

                        reduced_samples, tsne = preprocessing.tsne_reduction(samples, perplexity, l_r = l_r, ex = exaggeration,
                                                iterations=1000, verbosity=1)

                        end = time.time()
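                        # Cast numpy scalars so the entry is JSON-serializable.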
                        log_entry = {
                            "perplexity" : perplexity,
                            "learning_rate" : int(l_r),
                            "early_exaggeration" : int(exaggeration),
                            "final_error" : str(tsne.kl_divergence_)
                        }
                        logger.store_log_entry(log_entry, "tSNE_viz_log.json")

                        print("reduced samples shape: ", np.shape(reduced_samples))
                        print("Reduction took: {}\n".format(end-start))
                        # cluster_scatter_2D(reduced_samples, targets, tsne)
                        scatter_2D(reduced_samples, tsne)

                        print()
        convo_reduction_end = time.time()
        print("Conversation explored in: {}\n".format(convo_reduction_end-convo_reduction_start))
        
        if False:
            tsne = manifold.TSNE(n_components = 2, init='pca', perplexity=5, early_exaggeration = 36, 
                                        learning_rate = 800, n_iter = 5000,
                                        random_state=data_handling.RANDOM_SEED, verbose = 1)

            reduced_samples = tsne.fit_transform(samples)


            # Log the parameters actually used above; the loop variables from
            # the earlier grid may hold stale values here.
            log_entry = {
                "perplexity" : 5,
                "learning_rate" : 800,
                "early_exaggeration" : 36,
                "final_error" : str(tsne.kl_divergence_)
            }
            logger.store_log_entry(log_entry, "tSNE_viz_log.json")

            print("reduced samples shape: ", np.shape(reduced_samples))
            cluster_scatter_2D(reduced_samples, targets, tsne)

            print()
def main():
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)
    # Samples Preprocessing
    samples = data_handling.scale_select(samples)

    # Class balancing
    # data = data_handling.balance_classes(data)
    # print("Balanced data shape: {}".format(np.shape(data)))
    # samples = data[:, :-1]
    # targets = data[:, -1]
    # unique, counts = np.unique(targets, return_counts=True)
    # print("Class breakdown: {}\n".format(dict(zip(unique, counts))))

    #output_file
    #   data_handling.output_files_labels_case_study(names, targets, data_root2)
    unique, counts = np.unique(targets, return_counts=True)
    print("The case study data are broken into classes like so: {}".format(
        dict(zip(unique, counts))))

    # Cross validation

    print("Perplexity: {}".format(PERPLEXITY))
    #Reduction
    #reduced_samples = data_handling.tsne_reduction(samples, PERPLEXITY, l_r=LEARNING_RATE)
    #print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
    ########
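    # Append targets as the final column so features and labels stay aligned.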
    data = np.hstack((samples, np.reshape(targets, (len(targets), 1))))
    #data = np.hstack((reduced_samples, np.reshape(targets, (len(targets), 1))))
    #data = data_handling.balance_classes(data)

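    # class_weight='balanced' re-weights C inversely to class frequencies,
    # compensating for the imbalance reported above.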
    clf = SVC(C=1, class_weight='balanced', verbose=0, probability=True)
    #cv = ShuffleSplit(n_splits=1, test_size=0.8, random_state=0)
    # Alternative: score on the t-SNE-reduced samples instead, e.g.
    # cross_val_score(clf, reduced_samples, targets, cv=5, scoring=scoring)
    for scoring in ("accuracy", "precision_macro", "recall_macro"):
        scores = cross_val_score(clf,
                                 data[:, :-1],
                                 data[:, -1],
                                 cv=5,
                                 scoring=scoring)
        print("Cross validation ({}) scores".format(scoring))
        print(scores)
        print("average:")
        print(np.average(scores))
        print("std:")
        print(np.std(scores))

    # Split in train and test
    train, test, train_targets, test_targets = train_test_split(
        samples, targets, test_size=0.1, random_state=1337)
    #train, test, train_targets, test_targets = train_test_split(reduced_samples, targets, test_size=0.1, random_state=1337)
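    # Note: this split is not stratified, so class proportions can differ
    # between train and test.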
    print("shape train: {}".format(np.shape(train)))
    print("shape test: {}".format(np.shape(test)))

    clf = SVC(C=1, class_weight='balanced', verbose=0, probability=True)
    #clf.fit(train[:, :-1], train[:, -1])
    clf.fit(train, train_targets)

    # #### Try on Keynote#########
    print("Train confusion matrix=")
    predictions_train = clf.predict(train)
    cnf_train = confusion_matrix(train_targets,
                                 predictions_train,
                                 labels=["silences", "breathing", "clicks"])
    fig = plt.figure()
    ax1 = fig.add_subplot(121)
    plot_confusion_matrix(cnf_train,
                          classes=["silences", "breathing", "clicks"],
                          normalize=True,
                          title='Train confusion matrix normalized')
    print("Test confusion matrix=")
    #predictions_test= clf.predict(test[:, :-1])
    predictions_test = clf.predict(test)
    cnf_test = confusion_matrix(test_targets,
                                predictions_test,
                                labels=["silences", "breathing", "clicks"])

    ax2 = fig.add_subplot(122)
    plot_confusion_matrix(cnf_test,
                          classes=["silences", "breathing", "clicks"],
                          normalize=True,
                          title='Test confusion matrix normalized')

    # print("confusion matrix=")
    # predictions = clf.predict(data[:, :-1])
    # cnf = confusion_matrix(data[:, -1], predictions, labels=["silences", "breathing", "clicks"])
    # #fig = plt.figure()
    # ax3 = fig.add_subplot(122)
    # plot_confusion_matrix(cnf, classes=["silences", "breathing", "clicks"], normalize=True,
    #                       title=' confusion matrix normalized')
    plt.show()
def main():
    conversation = ["B", "A"]
    # conversation = []
    names = data_handling.read_names(["geco", data_handling.get_conversation_directory(conversation)])
    samples, _, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)
    speakers = data_handling.get_speakers(names)

    print("Original shapes, samples: {} \t names : {} \t speakers : {}".format(np.shape(samples), np.shape(names), np.shape(speakers)))
    
    # Samples Preprocessing
    samples = preprocessing.scale_select(samples)
    print("After feature selection shapes, samples: {}".format(np.shape(samples)))
    # ###########

    # Reduction
    print("Reduction")
    reduced_samples, tsne_params = tSNE_IO.load_or_reduce(samples, PERPLEXITY, LEARNING_RATE, EXAGGERATION, visually_best=VISUALLY_BEST)
    print("After dimensionality reduction shapes, samples: {}".format(np.shape(reduced_samples)))
    # ########

    
    if True:
        # KMeans fit
        print()
        predictions = fit_kmeans(reduced_samples)
        plotter.plot_clusters(reduced_samples, predictions, "KMeans GECO unsupervised", speakers, VISUALLY_BEST)
        results_IO.clusters_to_file(names, reduced_samples, predictions, "KMeans", VISUALLY_BEST)

        # AgglomerativeClustering
        print()
        predictions = fit_AgglomerativeClustering(reduced_samples)
        plotter.plot_clusters(reduced_samples, predictions, "Agglomerative Clustering GECO unsupervised", speakers, VISUALLY_BEST)
        results_IO.clusters_to_file(names, reduced_samples, predictions, "Agglomerative_Clustering", VISUALLY_BEST)

        # DBSCAN fit
        # Earlier candidate values: eps = 3.1, min_samples = 35
        print()
        EPS = 2.8
        MIN_SAMPLES = 50
        predictions = fit_DBSCAN(reduced_samples, eps = EPS, min_samples = MIN_SAMPLES)
        
        unique, counts = np.unique(predictions, return_counts = True)
        print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))

        create_log_entry(conversation, EPS, MIN_SAMPLES, tsne_params)
        plotter.plot_clusters(reduced_samples, predictions, "DBSCAN GECO unsupervised", speakers, VISUALLY_BEST)
        results_IO.clusters_to_file(names, reduced_samples, predictions, "DBSCAN", VISUALLY_BEST)

    # DBSCAN exploration
    if False:
        print()
        for min_samples in np.arange(45, 60, 3):
            for eps in np.arange(2.6, 3.5, .05):
                print("mins_samples : {}, eps: {}".format(min_samples, eps))
                dbscan = DBSCAN(eps = eps, min_samples = min_samples)
                dbscan.fit(reduced_samples)

                predictions = dbscan.labels_
                
                print("Predictions shape: {}".format(np.shape(predictions)))
                unique, counts = np.unique(predictions, return_counts = True)
                print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))
                # plotter.plot_clusters(reduced_samples, predictions, "DBSCAN Keynote unsupervised", type = "predicted")

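                # DBSCAN labels noise as -1; remap it to the next unused
                # cluster id so noise can be binned and plotted as its own group.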
                for i, prediction in enumerate(predictions):
                    if prediction == -1:
                        predictions[i] = unique[-1] + 1
                
                clustered = [[] for _ in unique]
                for i in range(len(reduced_samples)):
                    clustered[int(predictions[i])].append(reduced_samples[i])

                # colors = [plt.cm.Spectral(c) for c in unique]
                labels = ["label_{}".format(i+1) for i in unique]

                for i, cluster in enumerate(clustered):
                    cluster = np.array(cluster)
                    plt.scatter(cluster[:, 0], cluster[:, 1], label = labels[i], cmap = "RdBu", marker = ".", linewidths = 0)

                plt.legend()
                plt.title("DBSCAN clustering for eps:{}, min_samples:{}".format(eps, min_samples))
                plt.show()
                print()
def main():
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)

    print("Original shapes, samples: {} \t targets : {} \t names: {}".format(
        np.shape(samples), np.shape(targets), np.shape(names)))
    unique, counts = np.unique(targets, return_counts=True)
    print("Class breakdown: {}".format(dict(zip(unique, counts))))

    # Samples Preprocessing
    samples = preprocessing.scale_select(samples)
    print("Feature selection shapes, samples: {} \t targets : {}\n".format(
        np.shape(samples), np.shape(targets)))
    # ###########

    # Dimensionality reduction and classification exploration
    if False:
        dimensionalities = [100, 150, 170, 200]
        perplexities = [35, 40, 45, 50]
        learning_rates = [500, 800]
        for dimensionality in dimensionalities:
            for perplexity in perplexities:
                for l_r in learning_rates:
                    print(
                        "Dimensionality: {} \t Perplexity: {} \t Learning rate : {}"
                        .format(dimensionality, perplexity, l_r))

                    # Reduction: forward the grid values (previously they were
                    # ignored, so every iteration reduced with the same settings;
                    # `dimensionality` is only logged, not applied here).
                    print("Reduction")
                    reduced_samples = tSNE_IO.load_or_reduce(samples, perplexity, l_r, EXAGGERATION)
                    print("Reduced samples shape: {}".format(
                        np.shape(reduced_samples)))
                    # ########

                    # Class balancing
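                    # Caution: balance_classes overwrites targets and names,
                    # so later grid iterations operate on the already-balanced subset.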
                    data = np.hstack(
                        (reduced_samples, np.reshape(targets,
                                                     (len(targets), 1))))
                    data, names = preprocessing.balance_classes(
                        data, names,
                        np.shape(reduced_samples)[1])
                    print("Balanced data shape: {}".format(np.shape(data)))
                    reduced_samples = data[:, :-1]
                    targets = data[:, -1]
                    unique, counts = np.unique(targets, return_counts=True)
                    print("Class breakdown: {}\n".format(
                        dict(zip(unique, counts))))
                    # ###########

                    # Splitting
                    # Split in train and test
                    train, test = preprocessing.split(reduced_samples, targets,
                                                      0.15)
                    # #########
                    for c in np.arange(.1, .9, .1):
                        # SVM fitting
                        clf = SVC(C=c,
                                  class_weight='balanced',
                                  verbose=0,
                                  probability=True)

                        clf.fit(train[:, :-1], train[:, -1])
                        score = clf.score(test[:, :-1], test[:, -1])

                        print(
                            "SVC score of fit on case study data: {}, with C: {}"
                            .format(score,
                                    clf.get_params()["C"]))

                        parameters = {
                            "algo": "SVC",
                            "kernel": clf.get_params()["kernel"],
                            "dataset": "keynote",
                            "tsne_perplexity": perplexity,
                            "tsne_learning_rate": l_r,
                            "tnse_dimensionality": dimensionality,
                            "svc_c": c,
                            "score": score,
                        }

                        logger.store_log_entry(
                            parameters,
                            "keynote_supervised_exploration_log.json")
            print()

    # Assume dimensionality reduction, classification exploration
    if True:
        # Reduction
        print("Reduction")
        reduced_samples = tSNE_IO.load_or_reduce(samples, PERPLEXITY,
                                                 LEARNING_RATE, EXAGGERATION)
        print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
        # ########

        # Class balancing
        data = np.hstack(
            (reduced_samples, np.reshape(targets, (len(targets), 1))))
        data, names = preprocessing.balance_classes(
            data, names,
            np.shape(reduced_samples)[1])
        print("Balanced data shape: {}".format(np.shape(data)))
        reduced_samples = data[:, :-1]
        targets = data[:, -1]
        unique, counts = np.unique(targets, return_counts=True)
        print("Class breakdown: {}\n".format(dict(zip(unique, counts))))
        # ###########

        # Split in train and test
        train, test = preprocessing.split(reduced_samples, targets, 0.15)
        # #########

        for c in np.arange(.1, 1, .1):
            # SVM fitting
            clf = SVC(C=c,
                      class_weight='balanced',
                      verbose=0,
                      probability=True)

            clf.fit(train[:, :-1], train[:, -1])
            score = clf.score(test[:, :-1], test[:, -1])

            print("SVC score of fit on case study data: {}, with C: {}".format(
                score,
                clf.get_params()["C"]))

            parameters = {
                "algo": "SVC",
                "kernel": clf.get_params()["kernel"],
                "dataset": "keynote",
                "tsne_perplexity": PERPLEXITY,
                "tsne_learning_rate": LEARNING_RATE,
                "tnse_dimensionality": 2,
                "svc_c": c,
                "score": score,
            }

            logger.store_log_entry(parameters,
                                   "keynote_supervised_exploration_log.json")
def main():
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)


    print("Original shapes, samples: {} \t targets : {} \t names: {}".format(np.shape(samples), np.shape(targets), np.shape(names)))
    unique, counts = np.unique(targets, return_counts = True)
    print("Class breakdown: {}".format(dict(zip(unique, counts))))

    # Samples Preprocessing
    samples = preprocessing.scale_select(samples)
    print("Feature selection shapes, samples: {} \t targets : {}".format(np.shape(samples), np.shape(targets)))
    # ###########

    # tSNE Reduction
    print("Reduction")
    reduced_samples = tSNE_IO.load_or_reduce(samples, PERPLEXITY, LEARNING_RATE, EXAGGERATION)
    print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
    # ########

    plot_clusters(reduced_samples, targets, "Keynote unbalanced actual", type = "actual")

    # Class balancing
    data = np.hstack((reduced_samples, np.reshape(targets, (len(targets), 1))))
    data, names = preprocessing.balance_classes(data, names, np.shape(reduced_samples)[1])
    print("Balanced data shape: {}".format(np.shape(data)))
    reduced_samples = data[:, :-1]
    targets = data[:, -1]
    # ###########


    plot_clusters(reduced_samples, targets, "Keynote balanced actual", type = "actual")
    
    # KMeans fit
    if False:
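        # n_clusters=3 matches the three annotated classes
        # (silences, breathing, clicks).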
        kmeans = KMeans(n_clusters = 3)

        print("Kmeans fitting")
        kmeans.fit(reduced_samples)
        predictions = kmeans.labels_

        print("Predictions shape: {}".format(np.shape(predictions)))

        plot_clusters(reduced_samples, predictions, "KMeans Keynote unsupervised", type = "predicted")

    # AgglomerativeClustering
    if False:
        print()
        aggro = AgglomerativeClustering(n_clusters = 3)
        
        print("AgglomerativeClustering fitting")
        aggro.fit(reduced_samples)
        predictions = aggro.labels_

        plot_clusters(reduced_samples, predictions, "Agglomerative Clustering Keynote unsupervised", type = "predicted")

    # DBSCAN exploration
    if False:
        print()
        for min_samples in np.arange(20, 50, 5):
            for eps in np.arange(4, 10, .5):
                print("mins_samples : {}, eps: {}".format(min_samples, eps))
                dbscan = DBSCAN(eps = eps, min_samples = min_samples)
                print("DBSCAN fitting")
                dbscan.fit(reduced_samples)

                predictions = dbscan.labels_
                
                print("Predictions shape: {}".format(np.shape(predictions)))
                unique, counts = np.unique(predictions, return_counts = True)
                print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))
                # plot_clusters(reduced_samples, predictions, "DBSCAN Keynote unsupervised", type = "predicted")

                for i, prediction in enumerate(predictions):
                    if prediction == -1:
                        predictions[i] = unique[-1] + 1
                
                clustered = [[] for _ in unique]
                for i in range(len(reduced_samples)):
                    clustered[int(predictions[i])].append(reduced_samples[i])

                # colors = [plt.cm.Spectral(c) for c in unique]
                labels = ["label_{}".format(i+1) for i in unique]

                for i, cluster in enumerate(clustered):
                    cluster = np.array(cluster)
                    plt.scatter(cluster[:, 0], cluster[:, 1], label = labels[i], cmap = "RdBu")

                plt.legend()
                plt.title("DBSCAN clustering for eps:{}, min_samples:{}".format(eps, min_samples))
                plt.show()
                print()


    # DBSCAN fit
    if True:
        print()
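        # eps sets the neighbourhood radius and min_samples the density
        # threshold; these values appear hand-picked after exploration runs
        # like the loop above.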
        dbscan = DBSCAN(eps = 3.5, min_samples = 60)
        print("DBSCAN fitting")
        dbscan.fit(reduced_samples)

        predictions = dbscan.labels_
        
        print("Predictions shape: {}".format(np.shape(predictions)))
        unique, counts = np.unique(predictions, return_counts = True)
        print("The predictions are broken down as: {}".format(dict(zip(unique, counts))))

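        # As above, fold the noise label (-1) into a new cluster id for plotting.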
        for i, prediction in enumerate(predictions):
            if prediction == -1:
                predictions[i] = unique[-1] + 1

        plot_clusters(reduced_samples, predictions, "DBSCAN Keynote unsupervised", type = "predicted")
        extract_to_file(names, reduced_samples, predictions)