Пример #1
0
    def investigate_similarity_threshold2(self):
        """
        Inspect the distribution of chunk-to-chunk distances.

        Splits each representation's frame into ten 600-row chunks,
        collects the ``most_dist`` score returned by ``get_mean_cdist``
        for every unordered chunk pair, plots the score distribution and
        prints its 95th percentile as a candidate similarity threshold.
        """
        for name, representation in self._repr_core.reprs.items():
            distances = []
            frame = representation.repr_df.copy()
            starts = range(0, 6000, 600)
            for a, start_a in enumerate(starts):
                for b, start_b in enumerate(starts):
                    # unordered pairs only: skip the diagonal and mirrored pairs
                    if b <= a:
                        continue
                    chunk_pair = [frame[start_a:start_a + 600],
                                  frame[start_b:start_b + 600]]
                    most_dist, sim_array = get_mean_cdist(chunk_pair)
                    distances.append(most_dist)

        # NOTE(review): this runs after the loop above, so only the last
        # representation's distances are plotted — confirm that is intended.
        df = pd.DataFrame(data=distances)

        sns.displot(df, stat='probability', binwidth=0.001)
        plt.show()
        print(df.quantile(q=0.95))
Пример #2
0
 def investigate_similarity_threshold_w_noise(self):
     """
     Plot how the distance scores react to increasing noise levels.

     For each representation, perturbs every cell with probability
     ``noise_lvl`` (adding uniform noise in [-1000, 1000)) for noise
     levels 0.0 .. 0.9, computes ``get_mean_cdist`` between the clean and
     noisy frames, and line-plots both returned scores against the noise
     probability.
     """
     sim_dicts = []
     for name, representation in self._repr_core.reprs.items():
         frame = representation.repr_df.copy()
         clean = frame.drop(['species'], axis=1)
         for noise_lvl in np.arange(0, 1, 0.1):
             # each cell is corrupted independently with probability noise_lvl
             noisy = clean.applymap(
                 lambda x: x + np.random.uniform(-1000, 1000)
                 if np.random.uniform() < noise_lvl else x)
             most_dist, sim_array = get_mean_cdist([clean, noisy])
             sim_dicts.append({
                 'Noise Probability': noise_lvl,
                 'Cosine_Distance': sim_array[0][1],
                 '5% most distant': most_dist,
             })
     df = pd.DataFrame(sim_dicts)
     sns.lineplot(data=df,
                  x='Noise Probability',
                  y='Cosine_Distance',
                  label='mean')
     sns.lineplot(data=df,
                  x='Noise Probability',
                  y='5% most distant',
                  label='5% most distant')
     plt.legend()
     plt.show()
Пример #3
0
def test_sampling_methods(method):
    """
    Time how long it takes to draw mutually dissimilar 10% samples.

    For each representation, repeatedly draws 10% samples of the frame and
    keeps a candidate only once it differs from every previously kept
    sample according to ``method``; records per-draw wall time and the
    scores of rejected candidates, then scatter-plots draw index vs time.

    :param method: permutation_pdist, KS_pdist
    :return:
    """
    print("~test_sampling_methods!")
    start = time()
    core = ThesisCore(in_package=True)
    core.create_reprs(kmer=3)
    for name, representation in core._repr_core.reprs.items():
        attempts = []
        time_list = []
        ind_list = []
        pool = representation.repr_df.copy()
        num_of_datasets = 300
        accepted = [pool.sample(frac=0.1)]
        for i in range(num_of_datasets - 1):
            loop_start = time()
            found = False
            while not found:
                candidate = pool.sample(frac=0.1)
                found = True
                for earlier in accepted:
                    if method == 'permutation_pdist':
                        score = get_permutation_of_pdists(earlier, candidate)
                        rejected = score > 0.05
                    elif method == 'KS_pdist':
                        score = get_KS_of_pdists(earlier, candidate)
                        rejected = score > 0.05
                    elif method == 'mean_cdist':
                        _, score = get_mean_cdist(earlier, candidate)
                        rejected = score < 0.9997031552913787
                        if rejected:
                            print(score)
                    else:
                        # unknown method: nothing to compare, accept as-is
                        continue
                    if rejected:
                        attempts.append(score)
                        found = False
                        break
            accepted.append(candidate)
            time_list.append(time() - loop_start)
            ind_list.append(i)
        print('Failed attempts', len(attempts))
        end = time()
        print(f"~test_sampling_methods ended!It took {end - start} seconds!")
        plt.scatter(ind_list, time_list)
        plt.show()
Пример #4
0
 def print_dataset_similarity(self):
     """
     Compute similarities for every representation and report timing.

     Raises numpy's print threshold so full arrays could be printed, runs
     ``get_mean_cdist`` on each representation's ``df_list`` and prints
     the elapsed wall time.
     """
     print("Similarities started!")
     np.set_printoptions(threshold=sys.maxsize)
     start = time()
     for name, representation in self._repr_core.reprs.items():
         sims = get_mean_cdist(representation.df_list)
     end = time()
     print(f"~Similarities ended! It took {end - start} seconds!")
Пример #5
0
 def investigate_similarity_threshold(self):
     """
     Compare each representation's dataset with a random one.

     Builds a uniform random integer frame of the same shape and columns
     as the real features and feeds both frames to ``get_mean_cdist``.
     :return:
     """
     # NOTE(review): the accumulators below and the computed distances are
     # never used or returned — this block looks truncated; confirm against
     # the full source before relying on it.
     sim_dicts = []
     for name, repr in self._repr_core.reprs.items():
         sim_dict = {}
         temp_df = repr.repr_df.copy()
         # exclude the 'species' column so only feature columns are compared
         first_sample = temp_df.drop(['species'], axis=1)
         # random baseline in [-1000, 1000) matching the features' shape
         random_df = pd.DataFrame(np.random.randint(
             -1000, 1000, size=first_sample.shape),
                                  columns=first_sample.columns)
         dfs = [first_sample, random_df]
         most_dist, sim = get_mean_cdist(dfs)
Пример #6
0
def investigate_similarity_threshold(repr1, repr2):
    """
    Compare the 'FCGSR' representations of two repr containers.

    :param repr1: object exposing ``reprs['FCGSR'].repr_df`` (a DataFrame)
    :param repr2: same contract as ``repr1``
    :return: ``sim_array[0][1]``, the pairwise score between the two
        frames as computed by ``get_mean_cdist``
    """
    dfs = [repr1.reprs['FCGSR'].repr_df.copy(),
           repr2.reprs['FCGSR'].repr_df.copy()]
    _, sim_array = get_mean_cdist(dfs)
    # NOTE(review): no figure is created in this function, so this show()
    # appears to be a leftover from removed plotting code — confirm and drop.
    plt.show()
    return sim_array[0][1]
Пример #7
0
def test_distances_w_random(noise_lvl=0):
    """
    Compare intra-dataset distances against distances to random data.

    For each representation: splits the features into ten 600-row subsets
    (optionally noise-corrupted), builds a matching random frame per
    subset, then for every subset compares its distances to the other
    subsets (m1) with its distances to the random frames (m2) using a
    paired t-test when both samples pass the Shapiro-Wilk normality test,
    and the Wilcoxon signed-rank test otherwise.

    :param noise_lvl: per-cell probability of adding uniform noise in
        [-1000, 1000) — the same range the random frames are drawn from
    :return: tuple ``(prevalent_result, threshold)`` — the majority test
        verdict across subsets and the largest mean intersubset distance
        among subsets sharing that verdict (for the last representation
        iterated; raises NameError if there are no representations)
    """
    core = ThesisCore(in_package=True)
    core.create_reprs(kmer=3)
    for name, representation in core._repr_core.reprs.items():
        # create 10 subsets and 10 random frames of the same size
        subsets = []
        randoms = []
        test_results = []
        temp_df = representation.repr_df.copy().drop(['species'], axis=1)
        for i in range(0, 6000, 600):
            subset = temp_df[i:i + 600]
            # apply noise if available: each cell corrupted with prob noise_lvl
            noisy_subset = subset.applymap(lambda x: x + np.random.uniform(
                -1000, 1000) if np.random.uniform() < noise_lvl else x)
            subsets.append(noisy_subset)
            randoms.append(
                pd.DataFrame(np.random.randint(-1000, 1000, size=subset.shape),
                             columns=subset.columns))
        for i in range(len(subsets)):
            m1 = []  # distances: subset i vs every other subset
            m2 = []  # distances: subset i vs every random frame
            for j in range(len(subsets)):
                if i == j:
                    continue
                _, intersubset_dist = get_mean_cdist(subsets[i], subsets[j])
                m1.append(intersubset_dist)
                _, subrandom_dist = get_mean_cdist(subsets[i], randoms[j])
                m2.append(subrandom_dist)
            # normality test picks between the parametric and rank test below
            shapiro_test_1 = shapiro(m1)
            shapiro_test_2 = shapiro(m2)
            result = 'cannot reject' if shapiro_test_1.pvalue > 0.05 and shapiro_test_2.pvalue > 0.05 else 'reject'
            if result == 'cannot reject':
                # paired t-test
                ttest = ttest_rel(m1, m2)
                test_result = 'cannot reject' if ttest.pvalue > 0.05 else 'reject'
                print(
                    'For subset', i, 'We', test_result,
                    'the H0 that intersubset dists and '
                    'the dists with the random have identical '
                    'average values, with pvalue', ttest.pvalue)
            else:
                print('cannot perform paired t-test')
                # wilcoxon for non normal dependent samples
                wtest = wilcoxon(m1, m2)
                test_result = 'cannot reject' if wtest.pvalue > 0.05 else 'reject'
                print(
                    'For subset', i, 'We', test_result,
                    'the H0 that intersubset dists and '
                    'the dists with the random have equal '
                    'median values, with pvalue', wtest.pvalue)
            test_results.append((test_result, np.mean(m1)))

        # majority verdict; threshold = largest mean intersubset distance
        # among subsets that share it
        outcomes = [outcome for outcome, _ in test_results]
        prevalent_result = max(set(outcomes), key=outcomes.count)
        threshold = max(mean for outcome, mean in test_results
                        if outcome == prevalent_result)

    return prevalent_result, threshold