def test_call_unequal_group_sizes(self):
    """Check PERMANOVA on unequal group sizes, with and without relabeling.

    Both the original grouping and the relabeled grouping are expected to
    produce the same result series.
    """
    expected = pd.Series(
        data=['PERMANOVA', 'pseudo-F', 6, 3, 0.578848, 0.645, 999],
        index=self.exp_index)

    # Reseed before every call so each run draws the same permutations.
    for grouping in (self.grouping_unequal,
                     self.grouping_unequal_relabeled):
        np.random.seed(0)
        observed = permanova(self.dm_unequal, grouping)
        self.assert_series_equal(observed, expected)
def test_call_unequal_group_sizes(self):
    """Run PERMANOVA on the unequal-size fixture with two groupings.

    The plain grouping and its relabeled counterpart must both match the
    same expected series.
    """
    expected = pd.Series(
        data=['PERMANOVA', 'pseudo-F', 6, 3, 0.578848, 0.645, 999],
        index=self.exp_index,
    )

    def seeded_run(grouping):
        # Fixed seed so the permutation p-value is reproducible.
        np.random.seed(0)
        return permanova(self.dm_unequal, grouping)

    self.assert_series_equal(seeded_run(self.grouping_unequal), expected)
    self.assert_series_equal(
        seeded_run(self.grouping_unequal_relabeled), expected)
def test_call_no_ties(self):
    """Check the seeded PERMANOVA result for the no-ties distance matrix."""
    expected_values = ["PERMANOVA", "pseudo-F", 4, 2, 4.4, 0.332, 999]
    expected = pd.Series(
        data=expected_values,
        index=self.exp_index,
        name="PERMANOVA results",
    )

    # Fixed seed so the permutation p-value is reproducible.
    np.random.seed(0)
    observed = permanova(self.dm_no_ties, self.grouping_equal)

    self.assert_series_equal(observed, expected)
def test_call_ties(self):
    """Check PERMANOVA on the tied distance matrix.

    Ensures the same results are obtained when the method is rerun with
    the same inputs, and when it is given either a grouping vector or a
    data frame with equivalent groupings.
    """
    expected = pd.Series(
        data=['PERMANOVA', 'pseudo-F', 4, 2, 2.0, 0.671, 999],
        index=self.exp_index)

    call_variants = (
        # Grouping supplied as a vector.
        lambda: permanova(self.dm_ties, self.grouping_equal),
        # Grouping supplied as a data frame column.
        lambda: permanova(self.dm_ties, self.df, column='Group'),
    )

    for run in call_variants:
        # Each variant is executed twice from the same seed.
        for _ in range(2):
            np.random.seed(0)
            observed = run()
            self.assert_series_equal(observed, expected)
def test_call_no_ties(self):
    """Compare a seeded PERMANOVA run on the no-ties fixture to its
    expected, named result series."""
    # Fixed seed so the permutation p-value is reproducible.
    np.random.seed(0)
    observed = permanova(self.dm_no_ties, self.grouping_equal)

    expected = pd.Series(
        name='PERMANOVA results',
        data=['PERMANOVA', 'pseudo-F', 4, 2, 4.4, 0.332, 999],
        index=self.exp_index)
    self.assert_series_equal(observed, expected)
def permanovaResult(args, current_wd, retrospect_dir, output_file_tag,
                    notebook_name, suppress, silence, neglect,
                    permutations=999, seed=0):
    """Run PERMANOVA, print and record the result, and save the parameters.

    Parameters
    ----------
    args, current_wd, suppress, silence
        Forwarded unchanged to ``PermanovaArgs``.
    retrospect_dir : str
        Directory that receives the parameter CSV.
    output_file_tag : str
        Prefix of the parameter CSV file name.
    notebook_name, neglect
        Forwarded unchanged to ``UpdateNoteBook``.
    permutations : int, optional
        Number of permutations passed to ``permanova`` (default 999, the
        value that was previously hard-coded).
    seed : int, optional
        Seed given to ``numpy.random.seed`` before the test (default 0, the
        value that was previously hard-coded).

    Returns
    -------
    None
        The result is printed, handed to the notebook updater, and the
        individual/cluster table is written to
        ``<retrospect_dir>/<output_file_tag>_retrospect_permanova_parameter.csv``.
    """
    # Example invocation:
    # python3 -m emmer.bake -m 'Permanova' -i emmer/data/bake_data_dir_6/filtered_infoRich__PCA_coordinates.csv
    permanova_args = PermanovaArgs(args=args, current_wd=current_wd,
                                   suppress=suppress, silence=silence)

    ## conduct PERMANOVA
    # Seed the global NumPy RNG so the permutation p-value is reproducible.
    numpy.random.seed(seed)
    result = permanova(permanova_args.dist_matrix, permanova_args.cluster,
                       permutations=permutations)
    print(result)

    # The notebook is told which seed was used, as a string
    # (the default yields '0', as before).
    UpdateNoteBook(notebook_name=notebook_name,
                   neglect=neglect).updatePermanovaResult(
        set_seed=str(seed),
        set_cluster=permanova_args.cluster,
        test_result=result)

    parameter_df = pandas.DataFrame({
        'individual': permanova_args.individual,
        'cluster': permanova_args.cluster
    })
    output_file_name = os.path.join(
        retrospect_dir,
        output_file_tag + '_retrospect_permanova_parameter.csv')
    parameter_df.to_csv(output_file_name)
def permanova_permdisp(self):
    """Print PERMDISP and then PERMANOVA results for ``self.dist_df``.

    The group of each sample is the part of its label in ``self.dist_df``
    that precedes the first underscore. PERMDISP is run with 999
    permutations and PERMANOVA with 9999.
    """
    print('running permdisp\n\n')

    # Both tests use the same inputs, so build them once instead of
    # constructing an identical distance matrix and grouping per call.
    distance_matrix = DistanceMatrix(self.dist_df)
    grouping = [label.split('_')[0] for label in self.dist_df]

    print(permdisp(distance_matrix=distance_matrix,
                   grouping=grouping,
                   permutations=999))

    print('running permanova\n\n')
    print(permanova(distance_matrix=distance_matrix,
                    grouping=grouping,
                    permutations=9999))
def testPer(self, dist, group):
    """Print this object's PERMANOVA output next to the library result.

    Prints elements 0 and 2 of ``self.permanova(dist, group)``, then the
    result of ``permanova`` run on the same distances with 999
    permutations, using the positions ``0..len(group)-1`` as sample ids.
    """
    own_result = self.permanova(dist, group)
    for position in (0, 2):
        print(own_result[position])

    sample_ids = range(len(group))
    reference_matrix = DistanceMatrix(dist, sample_ids)
    reference_result = permanova(reference_matrix, group,
                                 column=None, permutations=999)
    print(reference_result)
def _beta(permutations, data, xvalues, yvalues):
    """Run PERMANOVA between two groups and collect their distances.

    Parameters
    ----------
    permutations : int
        Number of permutations passed to ``permanova``.
    data
        Object offering ``filter(ids)``; presumably a scikit-bio
        ``DistanceMatrix`` -- confirm against callers.
    xvalues, yvalues : pd.Series
        Group labels indexed by sample id; both carry the column name.

    Returns
    -------
    tuple
        ``(p-value, test statistic, x distances, y distances)``, where the
        distance lists hold the non-NaN values of ``data`` restricted to
        the ids of each group.
    """
    combined_ids = list(xvalues.index.values) + list(yvalues.index.values)
    data_test = data.filter(combined_ids)

    grouping_frame = pd.concat([xvalues, yvalues]).to_frame()
    permanova_result = permanova(
        distance_matrix=data_test,
        # either x or y can supply the column name because they are the same
        column=xvalues.name,
        grouping=grouping_frame,
        permutations=permutations,
    ).to_dict()

    def _group_distances(values):
        # Distances among the samples of a single group, NaNs dropped.
        subset = data_test.filter(values.index.values)
        return list(subset.to_series().dropna().values)

    xvals = _group_distances(xvalues)
    yvals = _group_distances(yvalues)
    return (permanova_result['p-value'],
            permanova_result['test statistic'],
            xvals,
            yvals)
def get_permanova_ranked_list(x, y, feature_list, label_set):
    """Compute a per-feature PERMANOVA p-value table.

    For every entry of ``feature_list`` the matching column of the
    transposed ``x`` is turned into a city-block distance matrix and tested
    against the grouping ``y``. P-values are then adjusted with the
    Benjamini-Hochberg procedure (``method="fdr_bh"``).

    Parameters
    ----------
    x : pd.DataFrame
        Feature table; it is transposed before columns are read, so it is
        presumably laid out features x samples -- confirm against callers.
    y
        Grouping passed to ``permanova``.
    feature_list : sequence
        Feature names; used as the index of the returned frame.
    label_set
        Not used by this function.

    Returns
    -------
    pd.DataFrame
        Indexed by ``feature_list`` with columns ``"p-value"`` and
        ``"Adj p-value"``.
    """
    x = x.transpose().values

    def _feature_p_value(position):
        column = x[:, position]
        distances = pairwise_distances(column.reshape(-1, 1),
                                       column.reshape(-1, 1),
                                       metric="cityblock")
        outcome = permanova(DistanceMatrix(data=distances), y)
        return outcome.loc["p-value"]

    values = [_feature_p_value(f) for f in range(len(feature_list))]
    fdr_values = multipletests(values, method="fdr_bh")[1]

    columns = {
        "p-value": np.array(values).reshape(-1),
        "Adj p-value": np.array(fdr_values).reshape(-1),
    }
    return pd.DataFrame(index=feature_list, data=columns)
def pseudoF_permanova(points, labels):
    """Run PERMANOVA on a distance matrix and print/return the result.

    Statistical significance is assessed via a permutation test. The
    assignment of objects to groups (grouping) is randomly permuted a
    number of times (controlled via permutations). A pseudo-F statistic is
    computed for each permutation and the p-value is the proportion of
    permuted pseudo-F statistics that are equal to or greater than the
    original (unpermuted) pseudo-F statistic.

    Parameters
    ----------
    points : pd.DataFrame or array-like
        Passed directly to ``skbio.DistanceMatrix`` after conversion to an
        array, so it is used as a matrix of pairwise distances.
        NOTE(review): the previous docstring described this as an
        ``[N, p]`` array of points with distances computed by sklearn, but
        no distance computation happens here -- confirm what callers pass.
    labels : np.array
        Group label of each object, in the same order as ``points``.

    Returns
    -------
    The result object returned by ``permanova``.
    """
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``np.asarray``
    # works for data frames and plain arrays alike.
    distances = skbio.DistanceMatrix(np.asarray(points))
    pseudo_f = permanova(distances, labels)
    print(pseudo_f)
    return pseudo_f
# Scatter plot of the embedding, colored by taxon, with one text label
# placed at the median position of every taxon.
sns.set(font_scale=1.5, style="ticks")
g = sns.FacetGrid(tsne, hue="taxa", height=10, aspect=16 / 10)
gm = g.map(plt.scatter, "x", "y", alpha=0.25)
means = tsne.groupby(taxa).agg("median").reset_index()
texts = means.apply(
    lambda df: plt.text(df.x, df.y, df.taxa, alpha=0.65), axis=1
)
# Nudge the labels apart and connect them to their anchor with arrows.
texts = adjust_text(
    texts,
    force_text=(0.02, 0.5),
    arrowprops=dict(arrowstyle="-|>", alpha=0.5, color="k"),
)
plt.savefig("figures/individual_media.png", dpi=200)
plt.close()

# Some statistics about metabolite usage
# indicator matrix 0 = metabolite not consumed, 1 = metabolite consumed
binary = mat.where(mat < -1e-6, 0).where(mat > -1e-6, 1)

# Jaccard distances = 1 - percent overlap
J = pdist(binary, "jaccard")
print("Jaccard distances:", pd.Series(J).describe(), sep="\n")

# euclidean distances
E = pdist(mat, "euclidean")

# Test whether genus explains a good amount of that variation
p = permanova(DistanceMatrix(E), taxa)

# Read the result by label: integer keys on a string-labelled Series are
# deprecated positional lookups in pandas.
f_stat = p["test statistic"]
n_groups = p["number of groups"]
n_samples = p["sample size"]

# pseudo-F = (SS_among / (k - 1)) / (SS_within / (N - k)), therefore
# SS_among / SS_within = F * (k - 1) / (N - k) and
# R2 = SS_among / SS_total = 1 - 1 / (1 + SS_among / SS_within).
r2 = 1 - 1 / (1 + f_stat * (n_groups - 1) / (n_samples - n_groups))
p["R2"] = r2
print("PERMANOVA on euclidean distances:", p, sep="\n")
def test_call_no_permutations(self):
    """With ``permutations=0`` the p-value is expected to be NaN."""
    observed = permanova(
        self.dm_no_ties, self.grouping_equal, permutations=0)

    expected_values = ['PERMANOVA', 'pseudo-F', 4, 2, 4.4, np.nan, 0]
    expected = pd.Series(data=expected_values, index=self.exp_index)
    self.assert_series_equal(observed, expected)
def test_call_no_ties(self):
    """Check the seeded PERMANOVA result for the no-ties distance matrix."""
    expected = pd.Series(
        data=['PERMANOVA', 'pseudo-F', 4, 2, 4.4, 0.332, 999],
        index=self.exp_index,
    )

    # Fixed seed so the permutation p-value is reproducible.
    np.random.seed(0)
    self.assert_series_equal(
        permanova(self.dm_no_ties, self.grouping_equal), expected)
clr_res = clr_inv(np.dot(np.dot(U, s), V.T)) # use just kl_div here because already closed kl_clr = entropy(closure(basetmp_sub).T, clr_res.T).mean() results[(rank_, power_, depth_, 'rclr', 'KL-Div')] = [kl_clr] # test KL without rclr X_spn = np.array(subtmp_sub.copy()).astype(float) X_spn[X_spn == 0] = np.nan U_, s_, V_ = OptSpace(iteration=1000).fit_transform(X_spn) res_raw = np.dot(np.dot(U_, s_), V_.T) res_raw[res_raw <= 0] = 1 kl_raw = entropy(closure(basetmp_sub).T, closure(res_raw).T).mean() results[(rank_, power_, depth_, 'Raw Counts', 'KL-Div')] = [kl_raw] # f-stat resfclr = permanova(DistanceMatrix(distance.cdist(U, U)), meta['group'])['test statistic'] rawfres = permanova(DistanceMatrix(distance.cdist(U_, U_)), meta['group'])['test statistic'] results[(rank_, power_, depth_, 'rclr', 'F-Statistic')] = [resfclr] results[(rank_, power_, depth_, 'Raw Counts', 'F-Statistic')] = [rawfres] # KNN for U_tmp, method in zip([U, U_], ['rclr', 'Raw Counts']): pcoa_tmp = pcoa(DistanceMatrix(distance.cdist(U_tmp, U_tmp))).samples pcoa_tmp.index = subtmp_sub.index # split X_train, X_test, y_train, y_test = train_test_split( pcoa_tmp, meta['group'].ravel(),
# Collect the PERMANOVA test statistic of every method, per dataset and
# per (fold, sample-count) subset.
perm_res = {}
perm_res_tmp = {}
for dataset_, subs in distances.items():
    perm_res[dataset_] = {}
    perm_res_tmp[dataset_] = {}
    for (fold_, Nsamp_), methods_ in subs.items():
        subset_key = (fold_, Nsamp_)
        meta_ = meta[dataset_][subset_key]['metadata']
        # Skip subsets whose metadata has fewer rows than requested.
        if len(meta_.index) < Nsamp_:
            continue
        perm_res[dataset_][subset_key] = {}
        perm_res_tmp[dataset_][subset_key] = {}
        for method, dist_tmp in methods_.items():
            perm_res[dataset_][subset_key][method] = {}
            dist_tmp = DistanceMatrix(dist_tmp)
            factor_ = case_study[dataset_]['factor']
            grouping_ = meta[dataset_][subset_key]['metadata'][factor_].values
            perm_tmp = permanova(dist_tmp, grouping_)
            perm_res[dataset_][subset_key][method]['test statistic'] = \
                perm_tmp['test statistic']
        # One column per method for this subset.
        perm_res_tmp[dataset_][subset_key] = pd.DataFrame(
            perm_res[dataset_][subset_key])
    both_perm_res[dataset_] = pd.concat(perm_res_tmp[dataset_])

# run classification
import warnings
warnings.simplefilter('ignore')  # for PCoA warnings
from skbio.stats.ordination import pcoa
from sklearn import metrics
from sklearn.cluster import KMeans
sample_id = each_sample_split[0] sample_group = each_sample_split[1] sample_id_list.append(sample_id) sample_group_list.append(sample_group) # read in data as dataframe df = pd.read_csv(infile_data, sep='\t') # get list of list from dataframe lol_data_in = [] for col_id in sample_id_list: column_num_list = (df[col_id].values).tolist() lol_data_in.append(column_num_list) # calculate distance matrix dist_arrary = pairwise_distances(lol_data_in, lol_data_in, metric=distance_metric) # add sample id to distance matrix dist_matrix = DistanceMatrix(dist_arrary, sample_id_list) # perform anosim test anosim_test = anosim(dist_matrix, sample_group_list, permutations=999) print(anosim_test) print() # perform permanova test permanova_test = permanova(dist_matrix, sample_group_list, permutations=999) print(permanova_test)
text= "ATTENTION: At least 1 of your eigenvalues is negative, potentially leading to problems! You may want to choose another metric for distance calculation or apply data transformation on the distance matrix (e.g. square root) to get rid of this problem." ) eig_dm = pd.DataFrame(pc.eigvals, columns=["Eigenvalue"]) eig_dm["Explained"] = pc.proportion_explained eig_dm["Summed_explanation"] = pc.proportion_explained.cumsum() if metric == "minkowski": eig_dm.to_csv("eigenvalues_" + mname + "_p" + str(p) + ".txt", sep="\t") else: eig_dm.to_csv("eigenvalues_" + mname + ".txt", sep="\t") #Statistics anos = anosim(div, map_DF, column=var, permutations=999) perm = permanova(div, map_DF, column=var, permutations=999) if metric == "minkowski": stat_file = "statistics_" + mname + "_p" + str(p) + "_" + var + ".txt" else: stat_file = "statistics_" + mname + "_" + var + ".txt" with open(stat_file, "w") as st: st.write("ANOSIM\tPermutations: 999\n\n") st.write("R\t" + str(anos["test statistic"]) + "\n") st.write("p-value\t" + str(anos["p-value"]) + "\n\n") st.write("PERMANOVA\tPermutations: 999\n\n") st.write("F\t" + str(perm["test statistic"]) + "\n") st.write("p-value\t" + str(perm["p-value"]) + "\n\n") end = time.time()
def test_call_no_permutations(self):
    """Without permutations the expected p-value is NaN and the
    permutation count is 0."""
    expected = pd.Series(
        name="PERMANOVA results",
        data=["PERMANOVA", "pseudo-F", 4, 2, 4.4, np.nan, 0],
        index=self.exp_index,
    )

    observed = permanova(
        self.dm_no_ties,
        self.grouping_equal,
        permutations=0,
    )
    self.assert_series_equal(observed, expected)
# Build one numeric vector per column of `rows`, skipping the first row
# and the first column (presumably headers/labels -- confirm upstream).
for a in range(1, len(rows[0])):
    this_sample = [float(row[a]) for row in rows[1:]]
    samples.append(this_sample)

"""
only_samples = ['LR', 'SR']
new_samples, new_names = [], []
for a in range(len(sample_names)):
    for b in range(len(only_samples)):
        if sample_names[a] == only_samples[b]:
            new_samples.append(samples[a])
            new_names.append(sample_names[a])
samples = new_samples
sample_names = new_names
print(len(samples), len(sample_names))
"""

# Distances between the samples, then three group tests on them, each
# with 999 permutations.
sam_dm = dm.from_iterable(samples, metric=braycurtis)

pdisp = permdisp(
    sam_dm, sample_names, column=None, test='median', permutations=999)
print(pdisp)

asim = anosim(sam_dm, sample_names, column=None, permutations=999)
print(asim)

perm = permanova(sam_dm, sample_names, column=None, permutations=999)
print(perm)