    def cluster_tsne(self, fVectors, n_components, perplexity):
        tsne = t_sne(n_components=n_components,
                     perplexity=perplexity,
                     metric='cosine')
        #tsne = t_sne(n_components=n_components, perplexity=perplexity, metric=self.hierarchcal_distance)
        embedding = tsne.fit_transform(fVectors)
        return embedding
    def clusterBasedOnTime(self):
        codeEditsPerSession = self.dataProxy.getCodeTreesPerSession()

        #codeEditsPerSession = self.dataProxy.getRunEvents()
        timeDeltasPerSession = self.convertSessionsToTimeDeltas(codeEditsPerSession)
        # Generate session labels
        dateLabelMap = {}
        for index, datesForSession in enumerate(self.sessionDates):
            for date in datesForSession:
                dateLabelMap[date] = list(self.colorMapSessions.keys())[index]
        colorLabels = [self.colorMapSessions[dateLabelMap[session["_id"]["timestamp"]]]
                       if session["_id"]["timestamp"] in dateLabelMap
                       else self.colorMapSessions["other"]
                       for session in codeEditsPerSession]
        # Convert to time bins
        maxTimeDelta = max(map(max, timeDeltasPerSession))
        binnedTimes = list(map(lambda x: self.convertTimeDeltaListToBins(x, maxTimeDelta, 100), timeDeltasPerSession))
        # Convert to a numpy array (every session is binned into the same 100 bins, so rows are equal length)
        binnedTimesPerSession = np.array(binnedTimes)
        tsne = t_sne(n_components=2, perplexity=8, metric='euclidean')
        embedding = tsne.fit_transform(binnedTimesPerSession)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.scatter(embedding[:, 0], embedding[:, 1], s=100, c=colorLabels)

        plt.show()
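Throughout these examples, t_sne is called as a constructor; presumably it is an alias for scikit-learn's TSNE estimator. A minimal sketch of the assumed import and a basic call, on random data:

import numpy as np
from sklearn.manifold import TSNE as t_sne  # assumption: the alias these examples rely on

X = np.random.rand(100, 20)  # 100 samples, 20 features
tsne = t_sne(n_components=2, perplexity=30, metric='cosine')
embedding = tsne.fit_transform(X)  # shape (100, 2)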
Example #3
def learning_rates(folder,
                   modification='',
                   create=False,
                   learning_rates=np.arange(5, 1000, 5)):
    """
    Function used to generate or load t-SNE transformations with a range of different learning rates. 
    Parameters
    -------------
    folder: The name of the folder the pickles should be put in / are in
    modification: the modification done to the dataset, a string used in the name of the corresponding pickles. 
    learning_rates: the learning rates rates you want to create transformations of
    create: true is you want to create the transformations, false if you want to load them
    pkl: true if you want to make a pickle for every value of learning_rate, otherwise false 
    
    Output
    -------------
    l_Z: A ilst of the t-SNE transformations
    learning_rates: a vector with the corresponding values of learning rate
    l_times: a vector with the corresponding values of computational time
    l_kl_divergence: a vector with the corresponding values kl divergence
    l_differences: a vector with the corresponding values of difference in 2d distance
    """
    if create:
        l_Z = []
        l_times = np.zeros(len(learning_rates))
        l_kl_divergence = np.zeros(len(learning_rates))
        X = pickle.load(open(folder + "/X" + modification + ".pkl", "rb"))
        for i, l in enumerate(learning_rates):
            tsne = t_sne(learning_rate=l, random_state=123)
            start_time = time.time()
            l_Z.append(tsne.fit_transform(X))
            l_times[i] = time.time() - start_time
            l_kl_divergence[i] = tsne.kl_divergence_
        pickle.dump(l_Z,
                    open(folder + "/l_Z_tsne" + modification + ".pkl", "wb"))
        pickle.dump(
            learning_rates,
            open(folder + "/learning_rates" + modification + ".pkl", "wb"))
        pickle.dump(l_times,
                    open(folder + "/l_times" + modification + ".pkl", "wb"))
        pickle.dump(
            l_kl_divergence,
            open(folder + "/l_kl_divergence" + modification + ".pkl", "wb"))
    else:
        l_Z = pickle.load(
            open(folder + "/l_Z_tsne" + modification + ".pkl", "rb"))
        learning_rates = pickle.load(
            open(folder + "/learning_rates" + modification + ".pkl", "rb"))
        l_times = pickle.load(
            open(folder + "/l_times" + modification + ".pkl", "rb"))
        l_kl_divergence = pickle.load(
            open(folder + "/l_kl_divergence" + modification + ".pkl", "rb"))
    X_2d_tsne = pickle.load(
        open(folder + "/X_2d" + modification + ".pkl", "rb"))
    l_differences = HL.get_differences(X_2d_tsne, l_Z)
    return l_Z, learning_rates, l_times, l_kl_divergence, l_differences
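A usage sketch for the create/load pattern above; the folder layout ("results", with X.pkl and X_2d.pkl already present) and the HL helper module are assumptions taken from the surrounding code:

# First run: compute the sweep and pickle the results
l_Z, lrs, l_times, l_kl, l_diff = learning_rates("results", create=True)
# Later runs: load the cached pickles instead of recomputing
l_Z, lrs, l_times, l_kl, l_diff = learning_rates("results", create=False)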
Example #4
def early_exaggeration(folder,
                       modification='',
                       create=False,
                       early_exaggeration=np.arange(1, 80, 1)):
    """
    Function used to generate or load t-SNE transformations with a range of different early exaggeration rates. 
    Parameters
    -------------
    folder: The name of the folder the pickles should be put in / are in
    modification: the modification done to the dataset, a string used in the name of the corresponding pickles. 
    early_exaggeration: the early exaggeration rates you want to create transformations of
    create: true is you want to create the transformations, false if you want to load them
    pkl true if you want to make a pickle for every value of early_exaggeration, otherwise false 
    
    Output
    -------------
    e_Z: A ilst of the t-SNE transformations
    early_exaggeration: a vector with the corresponding values of early_exaggeration
    e_times: a vector with the corresponding values of computational time
    e_kl_divergence: a vector with the corresponding values kl divergence
    e_differences: a vector with the corresponding values of difference in 2d distance
    """
    if create:
        e_Z = []
        e_times = np.zeros(len(early_exaggeration))
        e_kl_divergence = np.zeros(len(early_exaggeration))
        X = pickle.load(open(folder + "/X" + modification + ".pkl", "rb"))
        for i, e in enumerate(early_exaggeration):
            tsne = t_sne(early_exaggeration=e, random_state=123)
            start_time = time.time()
            e_Z.append(tsne.fit_transform(X))
            e_times[i] = time.time() - start_time
            e_kl_divergence[i] = tsne.kl_divergence_
        pickle.dump(e_Z,
                    open(folder + "/e_Z_tsne" + modification + ".pkl", "wb"))
        pickle.dump(
            early_exaggeration,
            open(folder + "/early_exaggeration" + modification + ".pkl", "wb"))
        pickle.dump(e_times,
                    open(folder + "/e_times" + modification + ".pkl", "wb"))
        pickle.dump(
            e_kl_divergence,
            open(folder + "/e_kl_divergence" + modification + ".pkl", "wb"))
    else:
        e_Z = pickle.load(
            open(folder + "/e_Z_tsne" + modification + ".pkl", "rb"))
        early_exaggeration = pickle.load(
            open(folder + "/early_exaggeration" + modification + ".pkl", "rb"))
        e_times = pickle.load(
            open(folder + "/e_times" + modification + ".pkl", "rb"))
        e_kl_divergence = pickle.load(
            open(folder + "/e_kl_divergence" + modification + ".pkl", "rb"))
    X_2d_tsne = pickle.load(
        open(folder + "/X_2d" + modification + ".pkl", "rb"))
    e_differences = HL.get_differences(X_2d_tsne, e_Z)
    return e_Z, early_exaggeration, e_times, e_kl_divergence, e_differences
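The returned vectors line up index-for-index, so the effect of the parameter can be inspected directly; a sketch assuming matplotlib.pyplot is imported as plt, as in the other examples:

e_Z, ee, e_times, e_kl, e_diff = early_exaggeration("results", create=False)
fig, ax = plt.subplots()
ax.plot(ee, e_kl, '.-')  # KL divergence as a function of early exaggeration
ax.set_xlabel('early_exaggeration')
ax.set_ylabel('KL divergence')
plt.show()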
Example #5
    def calculateSessionStats(self, eventsPerSession):
        stats = list(map(self.caclulateEventHistogramForSession, eventsPerSession))
        '''changeVsRuns = list(map(lambda hist: [hist["changedWorkspace"] if "changedWorkspace" in hist else 0, 
                                              hist["runClicked"] if "runClicked" in hist else 0], stats))'''
        '''changeVsRuns = list(map(lambda hist: [hist["changedWorkspace"] if "changedWorkspace" in hist else 0,
                                              hist["simStart"] if "simStart" in hist else 0], stats))'''
        changeVsRuns = list(map(lambda hist: [hist.get("changedWorkspace", 0),
                                              hist.get("runClicked", 0) + hist.get("simStart", 0)], stats))
        print(changeVsRuns)
        changeVsRuns = np.array([np.array(elem) for elem in changeVsRuns])

        eventNames = ["blocklyBlockCreate", "blocklyBlockDelete", "blocklyBlockMove", "blocklyChange", "changedWorkspace", "runClicked", "simStart"]
        barColors = ["red", "green", "blue", "yellow", "orange", "purple", "brown"]

        # calculate regression line; assumes polyfit comes from numpy.polynomial.polynomial,
        # which returns coefficients in ascending order (intercept q, then slope m)
        q, m = polyfit(changeVsRuns[:, 0], changeVsRuns[:, 1], 1)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(changeVsRuns[:, 0], changeVsRuns[:, 1], '.')
        ax.plot(changeVsRuns[:, 0], q + m * changeVsRuns[:, 0], '-')
        #ax.scatter(changeVsRuns[:, 0], changeVsRuns[:, 1])
        ax.set_xlabel('#changedWorkspace')
        ax.set_ylabel('#runClicked')
        plt.show()

        fig, ax = plt.subplots(10, 13, sharey=True)

        freqtables = []

        for i in range(10):
            for j in range(13):
                index = i*13 + j
                freq_table = list(map(lambda name: stats[index][name] if name in stats[index] else 0, eventNames))
                freqtables.append(freq_table)
                x = range(len(freq_table))
                ax[i, j].bar(x, freq_table, color=barColors)

        custom_lines = list(map(lambda color: Line2D([0], [0], color=color, lw=4), barColors))
        fig.legend(custom_lines, eventNames)

        plt.show()

        freqtables = np.array([np.array(elem) for elem in freqtables])

        tsne = t_sne(n_components=2, perplexity=10, metric='euclidean')
        embedding = tsne.fit_transform(freqtables)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.scatter(embedding[:, 0], embedding[:, 1], s=100)

        plt.show()
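caclulateEventHistogramForSession is not shown in this example; it presumably maps one session's events to a dict-like histogram of event-type counts. A hypothetical equivalent using collections.Counter (the event["type"] key is an assumption):

from collections import Counter

def caclulateEventHistogramForSession(self, events):
    # Hypothetical: count how often each event type occurs in one session
    return Counter(event["type"] for event in events)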
Example #6
def threshold(folder,
              modification='',
              create=False,
              threshold=np.logspace(-14, -1, 50)):
    """
    Function used to generate or load t-SNE transformations with a range of different thresholds (tol/min_grad_norm). 
    Parameters
    -------------
    folder: The name of the folder the pickles should be put in / are in
    modification: the modification done to the dataset, a string used in the name of the corresponding pickles. 
    threshold: the thresholds you want to create transformations of
    create: true is you want to create the transformations, false if you want to load them
    pkl: true if you want to make a pickle for every value of threshold, otherwise false 
    
    Output
    -------------
    t_Z: A ilst of the t-SNE transformations
    learning_rates: a vector with the corresponding values of learning rate
    t_times: a vector with the corresponding values of computational time
    t_kl_divergence: a vector with the corresponding values kl divergence
    t_differences: a vector with the corresponding values of difference in 2d distance
    """
    if create:
        t_Z = []
        t_times = np.zeros(len(threshold))
        t_kl_divergence = np.zeros(len(threshold))
        X = pickle.load(open(folder + "/X" + modification + ".pkl", "rb"))
        for i, t in enumerate(threshold):
            tsne = t_sne(min_grad_norm=t, random_state=123)
            start_time = time.time()
            t_Z.append(tsne.fit_transform(X))
            t_times[i] = time.time() - start_time
            t_kl_divergence[i] = tsne.kl_divergence_
        pickle.dump(t_Z,
                    open(folder + "/t_Z_tsne" + modification + ".pkl", "wb"))
        pickle.dump(threshold,
                    open(folder + "/threshold" + modification + ".pkl", "wb"))
        pickle.dump(t_times,
                    open(folder + "/t_times" + modification + ".pkl", "wb"))
        pickle.dump(
            t_kl_divergence,
            open(folder + "/t_kl_divergence" + modification + ".pkl", "wb"))
    else:
        t_Z = pickle.load(
            open(folder + "/t_Z_tsne" + modification + ".pkl", "rb"))
        threshold = pickle.load(
            open(folder + "/threshold" + modification + ".pkl", "rb"))
        t_times = pickle.load(
            open(folder + "/t_times" + modification + ".pkl", "rb"))
        t_kl_divergence = pickle.load(
            open(folder + "/t_kl_divergence" + modification + ".pkl", "rb"))
    X_2d_tsne = pickle.load(
        open(folder + "/X_2d" + modification + ".pkl", "rb"))
    t_differences = HL.get_differences(X_2d_tsne, t_Z)
    return t_Z, threshold, t_times, t_kl_divergence, t_differences
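Since the thresholds come from np.logspace, a log-scaled axis is the natural way to inspect this sweep; a sketch under the same matplotlib assumption as above:

t_Z, thr, t_times, t_kl, t_diff = threshold("results", create=False)
fig, ax = plt.subplots()
ax.semilogx(thr, t_kl, '.-')  # the thresholds span 13 decades, so use a log x-axis
ax.set_xlabel('min_grad_norm')
ax.set_ylabel('KL divergence')
plt.show()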
Example #7
    def cluster_tsne(self,
                     affinity_matrix,
                     color_labels,
                     title="TSNE",
                     n_components=2,
                     perplexity=30):
        tsne = t_sne(n_components=n_components,
                     metric="precomputed",
                     perplexity=perplexity)
        embedding = tsne.fit_transform(affinity_matrix)
        return embedding
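With metric="precomputed", fit_transform expects a square distance matrix rather than raw feature vectors. A sketch of preparing one with scikit-learn's pairwise_distances (the analyzer variable and its feature_vectors are hypothetical; note that recent scikit-learn versions also require init="random" when the metric is precomputed):

from sklearn.metrics import pairwise_distances

dmat = pairwise_distances(feature_vectors, metric="cosine")  # square (n, n) matrix
embedding = analyzer.cluster_tsne(dmat, color_labels, perplexity=30)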
Example #8
def main():
    spike_nums_dur = load_data()
    spike_nums = build_spike_nums_and_peak_nums(spike_nums_dur)[0]
    nb_activation = firing_feature(spike_nums, 50)
    # print(f"nb_activation size is {nb_activation.shape}")
    corr_matrix = get_pearson_correlation_matrix(nb_activation)
    # svm = sns.heatmap(corr_matrix)
    # fig = svm.get_figure()
    # save_formats = ["pdf"]
    # if isinstance(save_formats, str):
    #     save_formats = [save_formats]
    #
    # path_results = "D:/Robin/data_hne/data/p41/p41_19_04_30_a000"
    # for save_format in save_formats:
    #     fig.savefig(f'{path_results}/test_batta'
    #                 f'.{save_format}',
    #                 format=f"{save_format}",
    #                 facecolor=fig.get_facecolor())

    ## DO HDBSCAN CLUSTERING ON CORRELATION MATRIX ( ACTIVATION FEATURE) ##

    clusterer = hdbscan.HDBSCAN(algorithm='best',
                                alpha=1.0,
                                approx_min_span_tree=True,
                                gen_min_span_tree=False,
                                leaf_size=40,
                                metric='precomputed',
                                min_cluster_size=3,
                                min_samples=None,
                                p=None)
    # metric='precomputed' euclidean
    clusterer.fit(corr_matrix)

    labels = clusterer.labels_
    # print(f"labels.shape: {labels.shape}")
    print(f"N clusters hdbscan: {labels.max()+1}")
    print(f"labels: {labels}")
    print(f"With no clusters hdbscan: {len(np.where(labels == -1)[0])}")
    n_clusters = 0
    if labels.max() + 1 > 0:
        n_clusters = labels.max() + 1

    if n_clusters > 0:
        n_epoch_by_cluster = [
            len(np.where(labels == x)[0]) for x in np.arange(n_clusters)
        ]
        print(
            f"Number of epochs by clusters hdbscan: {' '.join(map(str, n_epoch_by_cluster))}"
        )

    corr_matrix_order = np.copy(corr_matrix)
    labels_indices_sorted = np.argsort(labels)
    corr_matrix_order = corr_matrix_order[labels_indices_sorted, :]
    corr_matrix_order = corr_matrix_order[:, labels_indices_sorted]

    mean_corr_values = np.zeros(n_clusters)
    for i in np.arange(n_clusters):  # iterate over all clusters (the original range missed the last one)
        tmp = corr_matrix_order[np.where(labels == i)[0], :]
        tmp = tmp[:, np.where(labels == i)[0]]
        mean_corr_values[i] = np.mean(tmp)
    print(f" {mean_corr_values}")
    # print(f"{np.where(mean_corr_values>0.6)}")
    # print(f"{np.max(mean_corr_values[np.where(n_epoch_by_cluster>5)])}")
    # print(f"{np.where(labels==7)}")
    # print(f"{tmp}")

    # Generate figure: correlation matrix ordered by cluster
    svm = sns.heatmap(corr_matrix_order)
    svm.set_yticklabels(labels_indices_sorted)
    svm.set_xticklabels(labels_indices_sorted)
    fig = svm.get_figure()

    save_formats = ["pdf"]
    if isinstance(save_formats, str):
        save_formats = [save_formats]

    path_results = "D:/Robin/data_hne/data/p41/p41_19_04_30_a000/clawson_battaglia_paper"
    for save_format in save_formats:
        fig.savefig(f'{path_results}/test_hdbscan'
                    f'.{save_format}',
                    format=f"{save_format}",
                    facecolor=fig.get_facecolor())

    ## DO T-SNE CLUSTERING ON CORRELATION MATRIX  ##

    tsne = t_sne(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(corr_matrix)

    # first figure: plot t-sne without color
    df_subset = pd.DataFrame()
    df_subset['tsne-2d-one'] = tsne_results[:, 0]
    df_subset['tsne-2d-two'] = tsne_results[:, 1]
    df_subset['color'] = labels
    plt.figure(figsize=(16, 10))
    svm = sns.scatterplot(x="tsne-2d-one",
                          y="tsne-2d-two",
                          data=df_subset,
                          legend="full",
                          alpha=1)
    fig = svm.get_figure()

    path_results = "D:/Robin/data_hne/data/p41/p41_19_04_30_a000/clawson_battaglia_paper"
    for save_format in save_formats:
        fig.savefig(f'{path_results}/tsne_cluster'
                    f'.{save_format}',
                    format=f"{save_format}",
                    facecolor=fig.get_facecolor())
    plt.close()

    # second figure: plot t-sne with color from previous hdbscan result
    df_subset = pd.DataFrame()
    df_subset['tsne-2d-one'] = tsne_results[:, 0]
    df_subset['tsne-2d-two'] = tsne_results[:, 1]
    df_subset['color'] = labels

    plt.figure(figsize=(16, 10))
    svm = sns.scatterplot(x="tsne-2d-one",
                          y="tsne-2d-two",
                          hue="color",
                          palette=sns.color_palette("hls",
                                                    labels.max() + 2),
                          data=df_subset,
                          legend="full",
                          alpha=1)
    fig = svm.get_figure()

    path_results = "D:/Robin/data_hne/data/p41/p41_19_04_30_a000/clawson_battaglia_paper"
    for save_format in save_formats:
        fig.savefig(
            f'{path_results}/tsne_colors_from_previous_hdbscan_clustering'
            f'.{save_format}',
            format=f"{save_format}",
            facecolor=fig.get_facecolor())
    plt.close()

    # DO CLUSTERING ON T-SNE RESULTS TO COLOR THE T-SNE FIGURE ##

    clusterer = hdbscan.HDBSCAN(algorithm='best',
                                alpha=1.0,
                                approx_min_span_tree=True,
                                gen_min_span_tree=False,
                                leaf_size=40,
                                metric='euclidean',
                                min_cluster_size=3,
                                min_samples=None,
                                p=None)
    clusterer.fit(tsne_results)
    labels_hdbscan_on_tsne = clusterer.labels_
    print(
        f"N clusters hdbscan on t-sne results: {labels_hdbscan_on_tsne.max()+1}"
    )
    # print(f"labels: {labels_hdbscan_on_tsne}")

    df_subset = pd.DataFrame()
    df_subset['tsne-2d-one'] = tsne_results[:, 0]
    df_subset['tsne-2d-two'] = tsne_results[:, 1]
    df_subset['color'] = labels_hdbscan_on_tsne

    plt.figure(figsize=(16, 10))
    svm = sns.scatterplot(x="tsne-2d-one",
                          y="tsne-2d-two",
                          hue="color",
                          palette=sns.color_palette(
                              "hls",
                              labels_hdbscan_on_tsne.max() + 2),
                          data=df_subset,
                          legend="full",
                          alpha=1)
    # plt.show()
    fig = svm.get_figure()

    path_results = "D:/Robin/data_hne/data/p41/p41_19_04_30_a000/clawson_battaglia_paper"
    for save_format in save_formats:
        fig.savefig(
            f'{path_results}/tsne_colors_from_post_tsne_clustering'
            f'.{save_format}',
            format=f"{save_format}",
            facecolor=fig.get_facecolor())
    plt.close()
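get_pearson_correlation_matrix is not defined in this example; a hypothetical version built on numpy's corrcoef, which is all the rest of main() needs (a square matrix with one row/column per cell):

def get_pearson_correlation_matrix(features):
    # Hypothetical: rows are cells, columns are time bins of the activation feature
    return np.corrcoef(features)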
Example #9
def spotdist_function(ms, param):

    ###################################################################################################################
    # CHOOSE METHOD TO USE #
    method_homemade = True  # Use this one to run on traces, but be careful: risk of memory error
    method_battaglia = False  # Runs faster for raster_dur / raster, but the EMD is not normalized

    # DECIDE ON WHICH DATA TO WORK
    data_to_use = "traces"
    possible_data_to_use = [
        "raster_dur", "raster", "traces", "artificial_raster"
    ]
    if data_to_use not in possible_data_to_use:
        data_to_use = "raster_dur"
        raise Exception(
            "Cannot run SpotDist on this data; use raster_dur by default")

    # DECIDE EPOCH LENGTH  ONLY FOR NON ARTIFICIAL DATA
    len_epoch = 250

    # If you want to work on artificial data
    random_pattern_order = True
    known_pattern_order = False  # This option is obsolete, do not use
    use_one_shuffle_per_pattern = True
    do_general_shuffling_on_full_raster = False
    fuse_raster_with_noise = True

    # SET SAVING PATH
    path_results = param.path_results
    time_str = param.time_str

    ###################################################################################################################

    ################################
    # IF WORK ON ARTIFICIAL DATA   #
    ################################
    if data_to_use == "artificial_raster":
        # DEFINE RASTER #
        n_cells = 50
        len_pattern = 100
        if random_pattern_order:
            n_epochs = 100  # Use a value divisible by 4
            n_frames = len_pattern * n_epochs
        if known_pattern_order:
            n_epochs = 12  # Do not change, only 12 epochs are generated
            n_frames = len_pattern * n_epochs

        art_raster_dur = np.zeros((n_cells, n_frames), dtype="int8")
        art_raster_dur_noise = np.zeros((n_cells, n_frames), dtype="int8")
        rand_art_raster_dur = np.zeros((n_cells, n_frames), dtype="int8")
        art_raster_dur_pattern_shuffle = np.zeros((n_cells, n_frames),
                                                  dtype="int8")
        noise_matrix = np.zeros((n_cells, n_frames), dtype="int8")
        rand_art_raster_dur_noise = np.zeros((n_cells, n_frames), dtype="int8")

        n_epochs = n_frames // len_pattern
        # to make things easy for now, the number of frames should be divisible by the length of epochs
        if (n_frames % len_pattern) != 0:
            raise Exception(
                f"number of frames {n_frames} not divisible by {len_pattern}")

        ############################################
        # CREATE PATTERNS ASSEMBLIES AND SEQUENCES #
        ############################################

        # create pattern#1 = sequence in order
        pattern1 = np.zeros((n_cells, len_pattern))
        for i in range(n_cells):
            pattern1[i, i] = 1
            pattern1[i, i + 50] = 1
        # create pattern#1 shuffle = sequence in a shuffle order
        pattern1_shuffle = np.copy(pattern1)
        np.random.shuffle(pattern1_shuffle)

        # create pattern#2 = assemblies in order
        pattern2 = np.zeros((n_cells, len_pattern))
        pattern2[13:26, 2:4] = 1
        pattern2[0:13, 14:16] = 1
        pattern2[39:50, 26:28] = 1
        pattern2[26:39, 38:40] = 1
        pattern2[13:26, 50:52] = 1
        pattern2[39:50, 62:64] = 1
        pattern2[26:39, 74:76] = 1
        pattern2[0:13, 86:88] = 1
        # create pattern#2 shuffle = assemblies in shuffle order
        pattern2_shuffle = np.copy(pattern2)
        np.random.shuffle(pattern2_shuffle)

        # create pattern#3 = sequence together with noise
        pattern3 = np.zeros((n_cells, len_pattern))
        n_cells_in_sequence = 40
        noisy_cells = n_cells - n_cells_in_sequence
        for i in range(n_cells_in_sequence):
            pattern3[i, i:i + 2] = 1
            pattern3[i, 20 + i:i + 22] = 1
        pattern3[n_cells_in_sequence:n_cells, :] = generate_poisson_pattern(
            noisy_cells, len_pattern, 10, 50, 1, 2)
        # create pattern#3 shuffle
        pattern3_shuffle = np.copy(pattern3)
        np.random.shuffle(pattern3_shuffle)

        # create pattern#4 = assemblies together with noise
        pattern4 = np.zeros((n_cells, len_pattern))
        cells_in_assemblies = 41
        cells_with_noise = n_cells - cells_in_assemblies
        pattern4[11:22, 2:4] = 1
        pattern4[0:11, 14:16] = 1
        pattern4[36:41, 26:28] = 1
        pattern4[22:36, 38:40] = 1
        pattern4[11:22, 50:52] = 1
        pattern4[36:41, 62:64] = 1
        pattern4[22:36, 74:76] = 1
        pattern4[0:11, 86:88] = 1
        pattern4[41:50, :] = generate_poisson_pattern(cells_with_noise,
                                                      len_pattern, 10, 50, 1,
                                                      2)
        # create pattern#2 shuffle = assemblies in shuffle order
        pattern4_shuffle = np.copy(pattern4)
        np.random.shuffle(pattern4_shuffle)

        #########################################
        # USE PATTERNS ASSEMBLIES AND SEQUENCES #
        #########################################

        if known_pattern_order:
            # CREATE ARTIFICIAL RASTER FROM KNOWN COMBINATION OF PATTERN
            art_raster_dur[:, 0:100] = pattern1
            art_raster_dur[:, 100:200] = generate_poisson_pattern(
                n_cells, len_pattern, 20, 50, 1, 2)
            art_raster_dur[:, 200:300] = pattern2
            art_raster_dur[:, 300:400] = pattern1
            art_raster_dur[:, 400:500] = generate_poisson_pattern(
                n_cells, len_pattern, 10, 50, 1, 2)
            art_raster_dur[:, 500:600] = generate_poisson_pattern(
                n_cells, len_pattern, 10, 50, 1, 2)
            art_raster_dur[:, 600:700] = pattern1
            art_raster_dur[:, 700:800] = pattern2
            art_raster_dur[:, 800:900] = generate_poisson_pattern(
                n_cells, len_pattern, 10, 50, 1, 2)
            art_raster_dur[:, 900:1000] = generate_poisson_pattern(
                n_cells, len_pattern, 10, 50, 1, 2)
            art_raster_dur[:, 1000:1100] = pattern2
            art_raster_dur[:, 1100:1200] = generate_poisson_pattern(
                n_cells, len_pattern, 10, 50, 1, 2)

        if random_pattern_order:
            # CREATE ARTIFICIAL RASTER COMBINATION OF THESE ASSEMBLIES SEQUENCES PLUS NOISE
            # Half of the epochs are noise patterns; the other half is divided equally between the patterns
            n_patterns = 2
            n_epochs_noise = n_epochs // 2
            n_epochs_pattern = n_epochs - n_epochs_noise
            # n_epochs_pattern = int(n_epochs_pattern)
            n_epochs_pattern1 = n_epochs_pattern // n_patterns
            n_epochs_pattern2 = n_epochs_pattern // n_patterns

            pattern_id = np.zeros(n_epochs)
            pattern_id[0:n_epochs_noise] = 0
            pattern_id[n_epochs_noise:(n_epochs_noise + n_epochs_pattern1)] = 1
            pattern_id[(n_epochs_noise +
                        n_epochs_pattern1):(n_epochs_noise +
                                            n_epochs_pattern1 +
                                            n_epochs_pattern2)] = 2
            np.random.shuffle(pattern_id)

            for i in range(n_epochs):
                if pattern_id[i] == 0:
                    art_raster_dur[:,
                                   np.arange((i * len_pattern),
                                             (i * len_pattern) + len_pattern
                                             )] = generate_poisson_pattern(
                                                 n_cells, len_pattern, 10, 50,
                                                 1, 2)
                    art_raster_dur_pattern_shuffle[:,
                                                   np.arange(
                                                       (i * len_pattern),
                                                       (i * len_pattern) +
                                                       len_pattern
                                                   )] = generate_poisson_pattern(
                                                       n_cells, len_pattern,
                                                       10, 50, 1, 2)
                if pattern_id[i] == 1:
                    art_raster_dur[:,
                                   np.arange((i * len_pattern),
                                             (i * len_pattern) +
                                             len_pattern)] = pattern3
                    art_raster_dur_pattern_shuffle[:,
                                                   np.arange(
                                                       (i * len_pattern),
                                                       (i * len_pattern) +
                                                       len_pattern
                                                   )] = pattern3_shuffle
                if pattern_id[i] == 2:
                    art_raster_dur[:,
                                   np.arange((i * len_pattern),
                                             (i * len_pattern) +
                                             len_pattern)] = pattern4
                    art_raster_dur_pattern_shuffle[:,
                                                   np.arange(
                                                       (i * len_pattern),
                                                       (i * len_pattern) +
                                                       len_pattern
                                                   )] = pattern4_shuffle

        if use_one_shuffle_per_pattern is False:
            rand_art_raster_dur = np.copy(art_raster_dur)
        elif use_one_shuffle_per_pattern is True:
            rand_art_raster_dur = np.copy(art_raster_dur_pattern_shuffle)

        if do_general_shuffling_on_full_raster is True:
            np.random.shuffle(rand_art_raster_dur)

        rand_art_raster_dur = rand_art_raster_dur.astype(int)  # astype returns a copy, so assign it back

        # CREATE ARTIFICIAL RASTER COMBINATION OF NOISE ONLY
        for i in np.arange(n_epochs):
            tmp_patt_noise = np.zeros((n_cells, len_pattern))
            patt_num_noise = np.random.randint(3)
            n_cells_to_clear = np.random.randint(np.round(n_cells / 5),
                                                 n_cells)
            cell_to_clear_indices = np.random.randint(0,
                                                      n_cells,
                                                      size=n_cells_to_clear)
            # print(f"pattern number is {patt_num}")
            if patt_num_noise == 0:
                tmp_patt_noise = generate_poisson_pattern(
                    n_cells, len_pattern, 10, 40, 1, 2)
                tmp_patt_noise[cell_to_clear_indices, :] = 0
            if patt_num_noise == 1:
                tmp_patt_noise = generate_poisson_pattern(
                    n_cells, len_pattern, 25, 40, 1, 2)
                tmp_patt_noise[cell_to_clear_indices, :] = 0
            if patt_num_noise == 2:
                tmp_patt_noise = generate_poisson_pattern(
                    n_cells, len_pattern, 30, 40, 1, 2)
                tmp_patt_noise[cell_to_clear_indices, :] = 0
            noise_matrix[:,
                         np.arange(
                             (i * len_pattern),
                             (i * len_pattern) + len_pattern)] = tmp_patt_noise

        if fuse_raster_with_noise is True:
            art_raster_dur_noise = np.copy(art_raster_dur)
            art_raster_dur_noise[np.where(noise_matrix == 1)] = 1
            rand_art_raster_dur_noise = np.copy(rand_art_raster_dur)
            rand_art_raster_dur_noise[np.where(noise_matrix == 1)] = 1
            rand_art_raster_dur = rand_art_raster_dur_noise

        data = rand_art_raster_dur

        # PLOT ALL THESE RASTER #
        # noise-only pattern
        plot_spikes_raster(
            spike_nums=noise_matrix,
            param=None,
            file_name=f"poisson_noise_raster",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            path_results=path_results,
            save_formats=["pdf", "png"])

        # Artificial raster dur ordered
        plot_spikes_raster(
            spike_nums=art_raster_dur,
            param=None,
            file_name=f"ordered_raster_with_patterns",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            path_results=path_results,
            save_formats=["pdf", "png"])

        # Artificial raster dur with intra pattern shuffle of cell order
        plot_spikes_raster(
            spike_nums=art_raster_dur_pattern_shuffle,
            param=None,
            file_name=f"raster_with_one_shuffle_per_pattern",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            path_results=path_results,
            save_formats=["pdf", "png"])

        # Add an additional shuffle of the order of all cells
        plot_spikes_raster(
            spike_nums=rand_art_raster_dur,
            param=None,
            file_name=f"raster_with_patterns_full_shuffle",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            path_results=path_results,
            save_formats=["pdf", "png"])

        # Artificial raster dur ordered with random noise
        plot_spikes_raster(
            spike_nums=art_raster_dur_noise,
            param=None,
            file_name=f"ordered_raster_with_patterns_and_noise",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            path_results=path_results,
            save_formats=["pdf", "png"])

        # Artificial raster dur shuffled with random noise
        plot_spikes_raster(
            spike_nums=rand_art_raster_dur_noise,
            param=None,
            file_name=f"raster_with_patterns_full_shuffle_and_noise",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            path_results=path_results,
            save_formats=["pdf", "png"])

    #################################
    # WORK ON REAL DATA: RASTER_DUR #
    #################################
    if data_to_use == "raster_dur":
        print(f"Loading raster_dur")
        spike_nums_dur = load_data_rasterdur(ms)  # automatic way
        # spike_nums_dur = spike_nums_dur[:20, :2500]  # TO TEST THE CODE
        n_cells, n_frames = spike_nums_dur.shape
        print(f"spike_nums_dur has {n_cells} cells and {n_frames} frames")
        data = spike_nums_dur

    #############################
    # WORK ON REAL DATA: RASTER #
    #############################
    if data_to_use == "raster":
        print(f"Loading raster")
        spike_nums = load_data_raster(ms)  # automatic way
        # spike_nums = spike_nums[:20, :2500]  # TO TEST THE CODE
        n_cells, n_frames = spike_nums.shape
        print(f"spike_nums has {n_cells} cells and {n_frames} frames")
        data = spike_nums

    #############################
    # WORK ON REAL DATA: TRACES #
    #############################
    if data_to_use == "traces":
        print(f"Loading traces")
        traces = load_data_traces(ms)  # automatic way
        traces = traces[:100, :10000]  # TO TEST THE CODE
        n_cells, n_frames = traces.shape
        print(f"traces has {n_cells} cells and {n_frames} frames")
        data = traces

    #####################
    # COMPUTE DISTANCES #
    #####################
    n_epochs = n_frames // len_epoch
    # to make things easy for now, the number of frames should be divisible by the length of epochs
    if (n_frames % len_epoch) != 0:
        raise Exception(
            f"number of frames {n_frames} not divisible by {len_epoch}")
    if method_battaglia:
        method = "battaglia"
        distances = SPOT_Dist_Battaglia(data, len_epoch=len_epoch)[0]
    if method_homemade:
        method = "homemade"
        distances = SPOT_Dist_JD_RD(data,
                                    len_epoch=len_epoch,
                                    distance_metric="EMD_Battaglia")

    # Plot Distance matrix
    # ax = sns.heatmap(distances, annot=True)
    ax = sns.heatmap(distances)
    fig = ax.get_figure()

    save_formats = ["pdf", "png"]
    if isinstance(save_formats, str):
        save_formats = [save_formats]

    for save_format in save_formats:
        fig.savefig(
            f'{path_results}/{ms.description}_distances_matrix_{method}_SPOTDist_on_{data_to_use}_with_{len_epoch}_frame_epochs'
            f'.{save_format}',
            format=f"{save_format}",
            facecolor=fig.get_facecolor())
    plt.close()

    ##################
    ### CLUSTERING ###
    ##################

    # HDBSCAN is supposed to be blind to Inf values, so replace missing values by np.Inf for clustering
    distances[np.where(np.isnan(distances))] = np.Inf

    # DO HDBSCAN ON DISTANCES MATRIX - CONSIDER PRECOMPUTED DISTANCES
    clusterer = hdbscan.HDBSCAN(algorithm='best',
                                alpha=1.0,
                                approx_min_span_tree=True,
                                gen_min_span_tree=False,
                                leaf_size=40,
                                metric='precomputed',
                                min_cluster_size=2,
                                min_samples=None,
                                p=None)
    # metric='precomputed' euclidean

    clusterer.fit(distances)

    labels = clusterer.labels_
    # print(f"labels.shape: {labels.shape}")
    print(f"N clusters hdbscan: {labels.max()+1}")
    print(f"labels: {labels}")
    print(f"With no clusters hdbscan: {len(np.where(labels == -1)[0])}")
    n_clusters = 0
    if labels.max() + 1 > 0:
        n_clusters = labels.max() + 1

    if n_clusters > 0:
        n_epoch_by_cluster = [[len(np.where(labels == x)[0])]
                              for x in np.arange(n_clusters)]
        print(
            f"Number of epochs by clusters hdbscan: {' '.join(map(str, n_epoch_by_cluster))}"
        )

    distances_order = np.copy(distances)
    labels_indices_sorted = np.argsort(labels)
    distances_order = distances_order[labels_indices_sorted, :]
    distances_order = distances_order[:, labels_indices_sorted]

    # Generate figure: dissimilarity matrice ordered by cluster
    # Replace Inf values by NaN for better visualization
    distances_order[np.where(np.isinf(distances_order))] = np.nan
    # svm = sns.heatmap(distances_order, annot=True)  # if you want the value
    svm = sns.heatmap(distances_order)
    svm.set_yticklabels(labels_indices_sorted)
    svm.set_xticklabels(labels_indices_sorted)
    fig = svm.get_figure()
    # plt.show()
    save_formats = ["pdf", "png"]
    if isinstance(save_formats, str):
        save_formats = [save_formats]

    for save_format in save_formats:
        fig.savefig(
            f'{path_results}/distances_matrix_hdbscan_ordered'
            f'.{save_format}',
            format=f"{save_format}",
            facecolor=fig.get_facecolor())
    plt.close()

    coords = []
    color = []
    for i in range(n_epochs):
        coords.append([[i * len_epoch, i * len_epoch + len_epoch]])
        color.append(
            cm.nipy_spectral(
                float(labels[i] + 2) / (len(np.unique(labels)) + 2)))
    if data_to_use == "artificial_raster":
        plot_spikes_raster(
            spike_nums=art_raster_dur_noise,
            param=None,
            file_name=f"raster_with_patterns_colored",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            span_area_coords=coords,
            span_area_colors=color,
            path_results=path_results,
            save_formats=["pdf", "png"])
    if data_to_use == "raster_dur" or data_to_use == "raster":
        plot_spikes_raster(
            spike_nums=data,
            param=None,
            file_name=f"raster_with_patterns_colored",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            span_area_coords=coords,
            span_area_colors=color,
            path_results=path_results,
            save_formats=["pdf", "png"])

    # IF NO NaN DO T-SNE CLUSTERING ON DISTANCES VALUES - EUCLIDEAN DISTANCES # todo: find a way to do t-SNE anyway
    missing_values = np.isnan(distances)
    inf_values = np.isinf(distances)

    do_tsne_clustering = not (np.any(missing_values) or np.any(inf_values))
    print(f"do tsne clustering is {do_tsne_clustering}")

    if do_tsne_clustering is True:
        tsne = t_sne(n_components=2, verbose=1, perplexity=40, n_iter=300)
        tsne_results = tsne.fit_transform(distances)

        # first figure: plot t-sne without color
        df_subset = pd.DataFrame()
        df_subset['tsne-2d-one'] = tsne_results[:, 0]
        df_subset['tsne-2d-two'] = tsne_results[:, 1]
        df_subset['color'] = labels
        plt.figure(figsize=(16, 10))
        svm = sns.scatterplot(x="tsne-2d-one",
                              y="tsne-2d-two",
                              data=df_subset,
                              legend="full",
                              alpha=1)
        fig = svm.get_figure()

        for save_format in save_formats:
            fig.savefig(f'{path_results}/tsne_cluster'
                        f'.{save_format}',
                        format=f"{save_format}",
                        facecolor=fig.get_facecolor())
        plt.close()

        # second figure: plot t-sne with color from previous hdbscan result
        df_subset = pd.DataFrame()
        df_subset['tsne-2d-one'] = tsne_results[:, 0]
        df_subset['tsne-2d-two'] = tsne_results[:, 1]
        df_subset['color'] = labels

        plt.figure(figsize=(16, 10))
        svm = sns.scatterplot(x="tsne-2d-one",
                              y="tsne-2d-two",
                              hue="color",
                              palette=sns.color_palette(
                                  "hls", len(np.unique(labels))),
                              data=df_subset,
                              legend="full",
                              alpha=1)
        fig = svm.get_figure()

        for save_format in save_formats:
            fig.savefig(
                f'{path_results}/tsne_colors_from_previous_hdbscan_clustering'
                f'.{save_format}',
                format=f"{save_format}",
                facecolor=fig.get_facecolor())
        plt.close()

        # DO CLUSTERING ON T-SNE RESULTS TO COLOR THE T-SNE FIGURE ##

        clusterer = hdbscan.HDBSCAN(algorithm='best',
                                    alpha=1.0,
                                    approx_min_span_tree=True,
                                    gen_min_span_tree=False,
                                    leaf_size=40,
                                    metric='euclidean',
                                    min_cluster_size=3,
                                    min_samples=None,
                                    p=None)
        clusterer.fit(tsne_results)
        labels_hdbscan_on_tsne = clusterer.labels_
        print(
            f"N clusters hdbscan on t-sne results: {labels_hdbscan_on_tsne.max()+1}"
        )
        # print(f"labels: {labels_hdbscan_on_tsne}")

        df_subset = pd.DataFrame()
        df_subset['tsne-2d-one'] = tsne_results[:, 0]
        df_subset['tsne-2d-two'] = tsne_results[:, 1]
        df_subset['color'] = labels_hdbscan_on_tsne

        plt.figure(figsize=(16, 10))
        svm = sns.scatterplot(x="tsne-2d-one",
                              y="tsne-2d-two",
                              hue="color",
                              palette=sns.color_palette(
                                  "hls",
                                  len(np.unique(labels_hdbscan_on_tsne))),
                              data=df_subset,
                              legend="full",
                              alpha=1)
        # plt.show()
        fig = svm.get_figure()

        for save_format in save_formats:
            fig.savefig(
                f'{path_results}/tsne_colors_from_post_tsne_clustering'
                f'.{save_format}',
                format=f"{save_format}",
                facecolor=fig.get_facecolor())
        plt.close()

    ###############################################
    ######## RETRIEVE GOOD ORDER OF RASTER ########
    ###############################################

    # Get the number of "true" clusters; epochs with label = -1 belong to no cluster
    if labels.max() + 1 > 0:
        n_clusters = labels.max() + 1
    if n_clusters > 0:
        n_epoch_by_cluster = [[len(np.where(labels == x)[0])]
                              for x in np.arange(n_clusters)]

    # Keep all the epochs belongings to the same clusters
    kept_epochs = []
    for i in range(n_clusters):
        kept_epochs.append(np.where(labels == i))
    # print(f"Epochs kept to represent the clusters are {kept_epochs}")

    # Get 1 raster per cluster corresponding to the sum of all rasters from the epochs of the cluster
    patterns_raster = np.zeros((n_cells, len_epoch, n_clusters))
    for i in range(n_clusters):
        raster_cluster_i = np.zeros((n_cells, len_epoch))
        epochs_in_cluster_i = kept_epochs[i]
        epochs_in_cluster_i = epochs_in_cluster_i[0]
        n_epoch_in_cluster_i = n_epoch_by_cluster[i]
        n_epoch_in_cluster_i = n_epoch_in_cluster_i[0]
        # print(f" Epochs in cluster {i} are {epochs_in_cluster_i}")
        # print(f" Cluster {i} contain {n_epoch_in_cluster_i} epochs")
        for j in range(n_epoch_in_cluster_i):
            start_epoch = epochs_in_cluster_i[j] * len_epoch
            end_epoch = epochs_in_cluster_i[j] * len_epoch + len_epoch
            # print(f" Epoch {j} of cluster {i} starts at {start_epoch} ends at {end_epoch}")
            raster_cluster_i = raster_cluster_i + data[:,
                                                       start_epoch:end_epoch]
        raster_cluster_i = np.true_divide(raster_cluster_i,
                                          n_epoch_in_cluster_i)
        patterns_raster[:, :, i] = raster_cluster_i

    for i in range(n_clusters):
        pattern_i = patterns_raster[:, :, i]
        # Plot this raster
        plot_spikes_raster(
            spike_nums=pattern_i,
            param=None,
            file_name=f"raster_pattern_{i}",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            plot_with_amplitude=True,
            path_results=path_results,
            save_formats=["pdf", "png"])

    for i in range(n_clusters):
        pattern_i = patterns_raster[:, :, i]
        max_values_vector = np.amax(pattern_i, axis=1)
        order = np.argsort(-max_values_vector)
        sorted_pattern_i = np.copy(pattern_i)
        sorted_pattern_i = sorted_pattern_i[order, :]
        # Plot this raster
        plot_spikes_raster(
            spike_nums=sorted_pattern_i,
            param=None,
            file_name=f"raster_ordered_pattern_{i}",
            # y_ticks_labels=np.arange(n_cells),
            # y_ticks_labels_size=2,
            save_raster=True,
            show_raster=False,
            without_activity_sum=True,
            plot_with_amplitude=True,
            path_results=path_results,
            save_formats=["pdf", "png"])
Example #10
def myTSNE(X, y):
    t1 = perf_counter()  # time.clock() was removed in Python 3.8; use time.perf_counter()
    # manifold.t_sne is a module, not the estimator; the class is manifold.TSNE.
    # n_components=4 requires method='exact', since Barnes-Hut supports at most 3 components.
    clf = manifold.TSNE(n_components=4, init='pca', random_state=0, method='exact')
    newRep = clf.fit_transform(X)
    t2 = perf_counter()
    return t2 - t1
Example #11
    def analyze(self,
                dataset_id,
                f_analyzer,
                s_analyzer,
                log_id="log",
                save_results=False,
                embedding_dims=2,
                func_method="hadamard",
                func_incremental=False,
                clustering_method="t-sne"):
        # Get functional embedding and distance matrix
        f_embedding, f_code_trees, f_labels, f_steplabels, f_vectors = f_analyzer.analyze(
            dataset_id,
            log_id=log_id,
            method=func_method,
            incremental=func_incremental,
            save_results=False,
            embedding_dims=embedding_dims,
            clustering_method=clustering_method)
        f_dmat = pairwise_distances(f_vectors, metric="cosine")
        f_dmat = np.divide(f_dmat, scipy.linalg.norm(f_dmat))
        # plot distance matrix
        plt.title("functional distance matrix")
        plt.imshow(f_dmat, cmap='gist_ncar', interpolation='none')
        plt.show()
        # get structural embedding and distance
        s_embedding, s_dmat = s_analyzer.analyze(dataset_id,
                                                 log_id=log_id,
                                                 save_results=False,
                                                 embedding_dims=embedding_dims)
        s_dmat = np.divide(s_dmat, scipy.linalg.norm(s_dmat))
        # plot distance matrix
        plt.title("structural distance matrix")
        plt.imshow(s_dmat, cmap='gist_ncar', interpolation='none')
        plt.show()

        # Combine functional and structural distance matrix
        #dmat = np.divide(np.add(f_dmat, np.multiply(s_dmat, 1)), 2)
        #f_dmat = np.log(np.add(f_dmat, 1))
        #dmat = np.minimum(np.divide(f_dmat, scipy.linalg.norm(f_dmat)), s_dmat)
        #dmat = np.divide(np.add(np.log(np.add(np.multiply(f_dmat, 100), 1)), np.log(np.add(np.multiply(s_dmat, 100), 1))), 2)
        #dmat = np.divide(dmat, scipy.linalg.norm(dmat))  #normalize
        min_contrib = 100
        divisor = np.add(s_dmat, f_dmat)
        #f_weights = np.nan_to_num(np.divide(np.add(s_dmat, np.divide(divisor, min_contrib)), np.add(divisor, np.divide(divisor, min_contrib*2))))
        f_weights = np.nan_to_num(np.divide(s_dmat, divisor))
        #s_weights = np.nan_to_num(np.divide(np.add(f_dmat, np.divide(divisor, min_contrib)), np.add(divisor, np.divide(divisor, min_contrib*2))))
        s_weights = np.nan_to_num(np.divide(f_dmat, divisor))
        dmat = np.add(np.multiply(f_dmat, f_weights),
                      np.multiply(s_dmat, s_weights))

        plt.title("combined dmat")
        plt.imshow(dmat, cmap='gist_ncar', interpolation='none')
        plt.show()

        if clustering_method == "t-sne":
            # Cluster using t-SNE
            tsne = t_sne(n_components=embedding_dims,
                         metric="precomputed",
                         perplexity=10)
            embedding = tsne.fit_transform(dmat)

        elif clustering_method == "umap":
            reducer = umap.UMAP(metric="precomputed",
                                min_dist=0.99,
                                n_neighbors=100)
            embedding = reducer.fit_transform(dmat)

        if save_results:
            utils = AnalysisUtils()
            utils.save_experiment(self.experimentId, embedding, f_code_trees,
                                  f_labels, f_steplabels)

        print("done")
Example #12
def perplexity(folder=None,
               modification='',
               per=np.arange(2, 150, 2),
               create=False,
               pkl=True,
               X=None,
               X_2d_tsne=None):
    """
    Function used to generate or load t-SNE transformations with a range of different perplexities. 
    Parameters
    -------------
    folder: The name of the folder the pickles should be put in / are in
    modification: the modification done to the dataset, a string used in the name of the corresponding pickles. 
    per: the perplexities you want to create transformations of
    create: true is you want to create the transformations, false if you want to load them
    pkl: true if you want to make a pickle for every value of per, otherwise false
    X: the data you want to transform
    X_2d_tsne: the original 2D version of X
    
    Output
    -------------
    p_Z: A ilst of the t-SNE transformations
    per: a vector with the corresponding values of perplexity
    p_times: a vector with the corresponding values of computational time
    p_kl_divergence: a vector with the corresponding values kl divergence
    p_differences: a vector with the corresponding values of difference in 2d distance
    
    """
    if create:
        p_Z = []
        p_times = np.zeros(len(per))
        p_kl_divergence = np.zeros(len(per))
        if X is None:
            X = pickle.load(open(folder + "/X" + modification + ".pkl", "rb"))
        for i, p in enumerate(per):
            tsne = t_sne(perplexity=p, random_state=123)
            start_time = time.time()
            p_Z.append(tsne.fit_transform(X))
            p_times[i] = time.time() - start_time
            p_kl_divergence[i] = tsne.kl_divergence_
        if pkl:
            pickle.dump(
                p_Z, open(folder + "/p_Z_tsne" + modification + ".pkl", "wb"))
            pickle.dump(per, open(folder + "/per" + modification + ".pkl",
                                  "wb"))
            pickle.dump(
                p_times, open(folder + "/p_times" + modification + ".pkl",
                              "wb"))
            pickle.dump(
                p_kl_divergence,
                open(folder + "/p_kl_divergence" + modification + ".pkl",
                     "wb"))
    else:
        p_Z = pickle.load(
            open(folder + "/p_Z_tsne" + modification + ".pkl", "rb"))
        per = pickle.load(open(folder + "/per" + modification + ".pkl", "rb"))
        p_times = pickle.load(
            open(folder + "/p_times" + modification + ".pkl", "rb"))
        p_kl_divergence = pickle.load(
            open(folder + "/p_kl_divergence" + modification + ".pkl", "rb"))
    if X_2d_tsne is None:
        X_2d_tsne = pickle.load(
            open(folder + "/X_2d" + modification + ".pkl", "rb"))
    p_differences = HL.get_differences(X_2d_tsne, p_Z)
    return p_Z, per, p_times, p_kl_divergence, p_differences
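Unlike the other sweep helpers above, this one can run entirely in memory through the X and X_2d_tsne arguments, skipping the pickle files; a sketch with random 2-D data (so X itself can serve as the 2-D reference), still assuming the HL helper module is importable:

import numpy as np

X = np.random.rand(200, 2)
p_Z, per_values, p_times, p_kl, p_diff = perplexity(
    per=np.arange(5, 50, 5), create=True, pkl=False, X=X, X_2d_tsne=X)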