def investigate_similarity_threshold2(self):
    """Estimate a similarity threshold from pairwise subset distances.

    For every representation, the representation dataframe is split into
    10 consecutive 600-row chunks; the "5% most distant" statistic from
    ``get_mean_cdist`` is collected for every unordered pair of chunks,
    the resulting distribution is plotted, and its 95th percentile is
    printed as a candidate similarity threshold.
    """
    for name, representation in self._repr_core.reprs.items():
        temp_df = representation.repr_df.copy()
        distances = []
        # Every unordered pair of chunks (indj > indi avoids self-pairs
        # and mirrored pairs).
        for indi, i in enumerate(range(0, 6000, 600)):
            for indj, j in enumerate(range(0, 6000, 600)):
                if indj <= indi:
                    continue
                dfs = [temp_df[i:i + 600], temp_df[j:j + 600]]
                most_dist, _ = get_mean_cdist(dfs)
                distances.append(most_dist)
        df = pd.DataFrame(data=distances)
        sns.displot(df, stat='probability', binwidth=0.001)
        plt.show()
        # 95th percentile of the distance distribution: the threshold
        # candidate for "these two samples are different datasets".
        print(df.quantile(q=0.95))
def investigate_similarity_threshold_w_noise(self):
    """Plot how cosine distance degrades as uniform noise is injected.

    For each representation, the feature-only dataframe is compared
    against noisy copies of itself.  At noise level ``p`` each cell is
    independently perturbed, with probability ``p``, by a uniform value
    in [-1000, 1000).  The mean cosine distance and the "5% most
    distant" statistic are then plotted against the noise probability.
    """
    sim_dicts = []
    for name, representation in self._repr_core.reprs.items():
        temp_df = representation.repr_df.copy()
        first_sample = temp_df.drop(['species'], axis=1)
        for noise_lvl in np.arange(0, 1, 0.1):
            # Perturb each cell independently with probability noise_lvl.
            noise_sample = first_sample.applymap(
                lambda x: x + np.random.uniform(-1000, 1000)
                if np.random.uniform() < noise_lvl else x)
            most_dist, sim_array = get_mean_cdist([first_sample, noise_sample])
            sim_dicts.append({
                'Noise Probability': noise_lvl,
                'Cosine_Distance': sim_array[0][1],
                '5% most distant': most_dist,
            })
    df = pd.DataFrame(sim_dicts)
    sns.lineplot(data=df, x='Noise Probability', y='Cosine_Distance',
                 label='mean')
    sns.lineplot(data=df, x='Noise Probability', y='5% most distant',
                 label='5% most distant')
    plt.legend()
    plt.show()
def test_sampling_methods(method, alpha=0.05, cdist_threshold=0.9997031552913787):
    """Benchmark rejection-sampling of mutually dissimilar subsets.

    Repeatedly draws 10% samples of each representation dataframe and
    keeps a candidate only if it differs enough from every previously
    kept sample according to ``method``.  The time spent accepting each
    sample is plotted at the end.

    :param method: one of 'permutation_pdist', 'KS_pdist', 'mean_cdist'
    :param alpha: p-value above which two samples are considered too
        similar (permutation / KS methods); default 0.05
    :param cdist_threshold: mean cosine distance below which two samples
        are considered too similar ('mean_cdist' method); the default is
        the empirically derived threshold from the original code
    :return: None
    """
    print("~test_sampling_methods!")
    start = time()
    core = ThesisCore(in_package=True)
    core.create_reprs(kmer=3)
    for name, representation in core._repr_core.reprs.items():
        attempts = []    # scores of rejected candidate samples
        time_list = []   # seconds spent accepting each kept sample
        ind_list = []
        feature_df_copy = representation.repr_df.copy()
        num_of_datasets = 300
        # Seed the pool with one unconditional sample.
        dfs = [feature_df_copy.sample(frac=0.1)]
        for i in range(num_of_datasets - 1):
            loop_start = time()
            found = False
            # Rejection sampling: redraw until the candidate is
            # sufficiently different from every kept sample.
            while not found:
                next_sample = feature_df_copy.sample(frac=0.1)
                found = True
                for dfr in dfs:
                    if method == 'permutation_pdist':
                        p_value = get_permutation_of_pdists(dfr, next_sample)
                        if p_value > alpha:
                            attempts.append(p_value)
                            found = False
                            break
                    elif method == 'KS_pdist':
                        p_value = get_KS_of_pdists(dfr, next_sample)
                        if p_value > alpha:
                            attempts.append(p_value)
                            found = False
                            break
                    elif method == 'mean_cdist':
                        _, dist = get_mean_cdist(dfr, next_sample)
                        if dist < cdist_threshold:
                            print(dist)
                            attempts.append(dist)
                            found = False
                            break
            dfs.append(next_sample)
            time_list.append(time() - loop_start)
            ind_list.append(i)
        print('Failed attempts', len(attempts))
    end = time()
    print(f"~test_sampling_methods ended!It took {end - start} seconds!")
    plt.scatter(ind_list, time_list)
    plt.show()
def print_dataset_similarity(self):
    """Time the mean-cosine-distance computation for every representation.

    Numpy printing is switched to unabridged output so the full
    similarity matrix is visible if printing is re-enabled.
    """
    print("Similarities started!")
    # Show full arrays instead of numpy's truncated repr.
    np.set_printoptions(threshold=sys.maxsize)
    start = time()
    for name, representation in self._repr_core.reprs.items():
        # The result is currently only computed for timing purposes;
        # re-enable printing below to inspect the matrix.
        sims = get_mean_cdist(representation.df_list)
        # print(sims)
    end = time()
    print(f"~Similarities ended! It took {end - start} seconds!")
def investigate_similarity_threshold(self):
    """Compare each representation dataset against a same-shaped random one.

    For every representation the feature-only dataframe is compared with
    a uniformly random integer dataframe of identical shape, providing a
    per-representation baseline for "completely dissimilar" data.

    :return: dict mapping representation name to a
        (5% most-distant statistic, similarity array) tuple.  The
        original discarded these values; they are now collected and
        returned so callers can inspect the baseline.
    """
    results = {}
    for name, representation in self._repr_core.reprs.items():
        temp_df = representation.repr_df.copy()
        first_sample = temp_df.drop(['species'], axis=1)
        # Random baseline drawn from the same value range as the data.
        random_df = pd.DataFrame(
            np.random.randint(-1000, 1000, size=first_sample.shape),
            columns=first_sample.columns)
        most_dist, sim = get_mean_cdist([first_sample, random_df])
        results[name] = (most_dist, sim)
    return results
def investigate_similarity_threshold(repr1, repr2):
    """Return the mean cosine distance between two 'FCGSR' representations.

    :param repr1: object exposing ``reprs['FCGSR'].repr_df``
    :param repr2: second such object
    :return: mean cosine distance between the two representation
        dataframes (``sim_array[0][1]``)
    """
    dfs = [repr1.reprs['FCGSR'].repr_df.copy(),
           repr2.reprs['FCGSR'].repr_df.copy()]
    _, sim_array = get_mean_cdist(dfs)
    # NOTE(review): the original called plt.show() here without building
    # any figure in this function; that stray call has been removed.
    return sim_array[0][1]
def test_distances_w_random(noise_lvl=0):
    """Compare intra-dataset distances with dataset-vs-random distances.

    For each representation, ten 600-row subsets and ten same-shaped
    random datasets are built.  For each subset the list of distances to
    the other subsets (m1) is statistically compared with the list of
    distances to the random datasets (m2): a paired t-test when both
    lists pass Shapiro normality, otherwise a Wilcoxon signed-rank test.

    :param noise_lvl: probability of perturbing each cell with a uniform
        value in the range of the random dataset
    :return: tuple (prevalent test verdict across subsets, max mean
        intersubset distance among subsets with that verdict)
    """
    core = ThesisCore(in_package=True)
    core.create_reprs(kmer=3)
    for name, representation in core._repr_core.reprs.items():
        # Build 10 consecutive 600-row subsets plus 10 random datasets
        # of the same shape.
        subsets = []
        randoms = []
        test_results = []
        temp_df = representation.repr_df.copy().drop(['species'], axis=1)
        for i in range(0, 6000, 600):
            subset = temp_df[i:i + 600]
            # Optionally perturb each cell with probability noise_lvl.
            noisy_subset = subset.applymap(
                lambda x: x + np.random.uniform(-1000, 1000)
                if np.random.uniform() < noise_lvl else x)
            subsets.append(noisy_subset)
            randoms.append(pd.DataFrame(
                np.random.randint(-1000, 1000, size=subset.shape),
                columns=subset.columns))
        for i in range(len(subsets)):
            m1 = []  # subset-vs-other-subset distances
            m2 = []  # subset-vs-random distances
            for j in range(len(subsets)):
                if i == j:
                    continue
                _, intersubset_dist = get_mean_cdist(subsets[i], subsets[j])
                m1.append(intersubset_dist)
                _, subrandom_dist = get_mean_cdist(subsets[i], randoms[j])
                m2.append(subrandom_dist)
            # Normality check: the paired t-test is only appropriate
            # when both distance samples look normal.
            shapiro_test_1 = shapiro(m1)
            shapiro_test_2 = shapiro(m2)
            normal = (shapiro_test_1.pvalue > 0.05
                      and shapiro_test_2.pvalue > 0.05)
            if normal:
                # Paired t-test on the dependent distance samples.
                ttest = ttest_rel(m1, m2)
                test_result = 'cannot reject' if ttest.pvalue > 0.05 else 'reject'
                print(
                    'For subset', i, 'We', test_result,
                    'the H0 that intersubset dists and '
                    'the dists with the random have identical '
                    'average values, with pvalue', ttest.pvalue)
            else:
                print('cannot perform paired t-test')
                # Wilcoxon signed-rank test for non-normal dependent samples.
                wtest = wilcoxon(m1, m2)
                test_result = 'cannot reject' if wtest.pvalue > 0.05 else 'reject'
                print(
                    'For subset', i, 'We', test_result,
                    'the H0 that intersubset dists and '
                    'the dists with the random have equal '
                    'median values, with pvalue', wtest.pvalue)
            test_results.append((test_result, np.mean(m1)))
        # Majority verdict across subsets, and the largest mean
        # intersubset distance among subsets with that verdict — a
        # candidate similarity threshold.
        verdicts = [x for x, _ in test_results]
        prevalent_result = max(set(verdicts), key=verdicts.count)
        threshold = max(y for x, y in test_results if x == prevalent_result)
        # NOTE(review): returns after the first representation, matching
        # the original control flow.
        return prevalent_result, threshold