def diversity(df_sv_list):
    """Compute richness, Shannon diversity and Bray-Curtis distances with skbio.

    Every frame in ``df_sv_list`` is read from its second column onwards and
    transposed, so that rows are samples and columns are SVs.

    The function relies on two module-level names: ``allsamples`` (index of
    the two alpha-diversity frames) and ``df_sv_list_names`` (column label
    used for the i-th input frame).

    Returns a tuple ``(richness, shannon, bc_dm_list)``: two DataFrames
    indexed by ``allsamples`` with one column per input frame, and a list
    holding one square Bray-Curtis DataFrame per input frame.
    """

    def _append_metric(target, metric, counts, sample_ids, position):
        # Left-merge on the index so samples missing from this frame stay NaN.
        values = pd.DataFrame(alpha_diversity(metric, counts, sample_ids))
        merged = target.merge(values, how="left",
                              left_index=True, right_index=True)
        # The unnamed Series arrives as column 0; give it the frame's label.
        return merged.rename(columns={0: df_sv_list_names[position]})

    richness = pd.DataFrame(index=allsamples)
    shannon = pd.DataFrame(index=allsamples)
    bc_dm_list = []

    for position, sv_table in enumerate(df_sv_list):
        # columns are the SVs and rows are the samples
        counts = sv_table.iloc[:, 1:].T.values
        # ids have the same order as the rows of ``counts``
        sample_ids = sv_table.columns[1:]

        richness = _append_metric(richness, "observed_otus", counts,
                                  sample_ids, position)
        shannon = _append_metric(shannon, "shannon", counts,
                                 sample_ids, position)

        # Bray-Curtis distance matrix, copied into a labelled frame.
        distances = beta_diversity("braycurtis", counts, sample_ids)
        labelled = pd.DataFrame(index=distances.ids, columns=distances.ids)
        labelled.iloc[:, :] = distances.data
        bc_dm_list.append(labelled)

    return richness, shannon, bc_dm_list
def test_optimized(self):
    """Requesting 'faith_pd' by name must match passing the function itself.

    The string form selects the optimized implementation, the callable form
    the unoptimized one; both have to produce the same series.
    """
    phylo_args = dict(tree=self.tree1, otu_ids=self.oids1)
    by_name = alpha_diversity('faith_pd', self.table1, **phylo_args)
    by_callable = alpha_diversity(faith_pd, self.table1, **phylo_args)
    assert_series_almost_equal(by_name, by_callable)
def test_single_count_vector(self):
    """A single 1-D count vector is accepted and yields one value."""
    richness = alpha_diversity('observed_otus', np.array([1, 0, 2]))
    assert_series_almost_equal(richness, pd.Series([2]))

    phylo = alpha_diversity('faith_pd', np.array([1, 3, 0, 1, 0]),
                            tree=self.tree1, otu_ids=self.oids1)
    self.assertAlmostEqual(phylo[0], 4.5)
def test_observed_otus(self):
    """'observed_otus' works by name and as a callable, on both tables."""
    # Expected values were hand-calculated.
    want = pd.Series([3, 3, 3, 3], index=self.sids1)

    # First the metric name, then the function passed instead of the string.
    for metric in ('observed_otus', observed_otus):
        got = alpha_diversity(metric, self.table1, self.sids1)
        assert_series_almost_equal(got, want)

    # Alternative input table.
    want = pd.Series([2, 1, 0], index=self.sids2)
    got = alpha_diversity('observed_otus', self.table2, self.sids2)
    assert_series_almost_equal(got, want)
def test_input_types(self):
    """A plain list and a numpy array of the same counts give equal results."""
    raw_counts = (1, 3, 0, 1, 0)

    from_list = alpha_diversity('observed_otus', list(raw_counts))
    from_array = alpha_diversity('observed_otus', np.array(list(raw_counts)))
    self.assertAlmostEqual(from_list[0], 3)
    assert_series_almost_equal(from_list, from_array)

    from_list = alpha_diversity('faith_pd', list(raw_counts),
                                tree=self.tree1, otu_ids=self.oids1)
    from_array = alpha_diversity('faith_pd', np.array(list(raw_counts)),
                                 tree=self.tree1, otu_ids=self.oids1)
    self.assertAlmostEqual(from_list[0], 4.5)
    assert_series_almost_equal(from_list, from_array)
def compute_alphas(biom, tree=None,
                   metrics=('chao1', 'faith_pd', 'observed_otus')):
    """Compute several alpha diversity metrics for one table.

    Parameters
    ----------
    biom : table-like
        Object exposing ``.T``, ``.columns`` and ``.index``. The transposed
        table is handed to ``alpha_diversity`` as the count matrix,
        ``biom.columns`` as the sample ids and ``biom.index`` as the OTU ids.
    tree : optional
        Tree forwarded to ``alpha_diversity`` for the ``'faith_pd'`` metric
        only; ignored for every other metric.
    metrics : iterable of str, optional
        Metric names to compute. The default is an immutable tuple so that
        the shared default value can never be modified between calls.

    Returns
    -------
    dict
        Maps each metric name to the value returned by ``alpha_diversity``.
    """
    # Loop-invariant inputs: convert the table once instead of per metric.
    counts = np.asarray(biom.T)
    sample_ids = biom.columns

    alphas = {}
    for metric in metrics:
        extra = {}
        if metric == 'faith_pd':
            # Only the phylogenetic metric receives the tree and OTU ids.
            extra = {'otu_ids': biom.index, 'tree': tree}
        alphas[metric] = alpha_diversity(metric, counts=counts,
                                         ids=sample_ids, **extra)
    return alphas
def diversity_ana(metric, subsample, ids, **kwargs):
    """Run ``alpha_diversity`` for ``metric`` on ``subsample``.

    Parameters
    ----------
    metric : str
        Name of the alpha diversity metric.
    subsample
        Count data forwarded unchanged to ``alpha_diversity``.
    ids
        Sample ids forwarded as the ``ids`` keyword.
    **kwargs
        ``otu_ids`` and ``tree`` are required when ``metric`` is
        ``'faith_pd'`` (a missing key raises ``KeyError``); otherwise unused.

    Returns
    -------
    The value returned by ``alpha_diversity``.

    Raises
    ------
    SystemExit
        With status 1 when ``metric`` is not one of the three explicitly
        handled names and ``alpha_diversity`` fails; the names from
        ``get_alpha_diversity_metrics()`` are printed first.
    """
    if metric == 'faith_pd':
        return alpha_diversity('faith_pd', subsample, ids=ids,
                               otu_ids=kwargs['otu_ids'],
                               tree=kwargs['tree'])

    if metric in ('shannon', 'observed_otus'):
        # Known metrics: let any error propagate to the caller untouched.
        return alpha_diversity(metric, subsample, ids=ids)

    try:
        return alpha_diversity(metric, subsample, ids=ids)
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C and interpreter
        # shutdown are never mistaken for an unknown metric.
        # Single-argument print() behaves the same on Python 2 and 3.
        print('Metric you can use is listed below: \n'
              + '\n'.join(get_alpha_diversity_metrics()))
        # Non-zero status: this is a failure path; does not rely on the
        # ``exit`` helper injected by the ``site`` module.
        raise SystemExit(1)
def alpha(table: biom.Table):
    """Compute every metric in ``ALPHA_DIVERSITY_METHODS`` for a table.

    :param table: BIOM table; it is passed through ``get_biom_table`` before
        any data is read from it.
    :return: the result of ``_format_alpha_results_to_json`` applied to the
        aggregated per-metric results and the id-to-metadata mapping.
    :raises ValueError: if ``table.is_empty()`` is true.
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    table = get_biom_table(table)

    # Dense float matrix, transposed before being handed to alpha_diversity.
    counts = table.matrix_data.toarray().astype(float).T
    sample_ids = table.ids(axis='sample')
    sample_metadata = dict(zip(table.ids(), table.metadata()))

    def _named_result(metric):
        # Label each series with its metric name.
        series = alpha_diversity(metric=metric, counts=counts, ids=sample_ids)
        series.name = metric
        return series

    alpha_diversities = [_named_result(metric)
                         for metric in ALPHA_DIVERSITY_METHODS]

    aggregated = aggregate_results(alpha_diversities, sample_ids)
    return _format_alpha_results_to_json(aggregated, sample_metadata)
def __dapply__(self, experiment):
    """Compute ``self.distance_metric`` alpha diversity for an experiment.

    ``experiment.data_df`` is transposed before the call, its columns are
    used as sample ids and its index as OTU ids. The metric is first tried
    without ``otu_ids``; if ``alpha_diversity`` rejects the call because
    ``otu_ids`` is required, it is retried with them. Any other
    ``ValueError`` is re-raised unchanged.

    Returns a one-row DataFrame (row label ``self.distance_metric``) obtained
    by transposing the resulting series.
    """
    otu_ids = experiment.data_df.index
    sample_ids = experiment.data_df.columns
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same ndarray and exists in every pandas release.
    matrix = experiment.data_df.T.values

    try:
        alpha = alpha_diversity(self.distance_metric, counts=matrix,
                                ids=sample_ids, **self.kwargs)
    except ValueError as e:
        otu_ids_err_msg = "``otu_ids`` is required for phylogenetic diversity metrics."
        if str(e) != otu_ids_err_msg:
            # Bare ``raise`` keeps the original traceback intact.
            raise
        # Phylogenetic metric: retry with the OTU ids supplied.
        alpha = alpha_diversity(self.distance_metric, counts=matrix,
                                ids=sample_ids, otu_ids=otu_ids,
                                **self.kwargs)

    return alpha.to_frame(name=self.distance_metric).transpose()
def compute_alpha_diversity(self):
    """Compute and cache alpha diversity values

    This data is computed for the full dataset, not for a specific
    subsampling. Therefore once it is computed, we can later subsample
    from these vectors directly.

    The result is stored in ``self._alpha_diversity_values``: a copy of the
    trajectory and gradient columns of ``self._original_mf`` with one extra
    column per metric in ``self._alpha_metrics``. Nothing is returned.

    See Also
    --------
    Sculptor.compute_beta_diversity
    """
    # Explicit copy so that adding metric columns never touches the
    # original mapping-file frame.
    features = self._original_mf[[self.trajectory, self.gradient]].copy()

    # ``np.int`` was a plain alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin gives the identical dtype on every version.
    X = self._original_bt.matrix_data.toarray().astype(int).T
    sample_ids = self._original_bt.ids('sample')

    for metric in self._alpha_metrics:
        if metric == 'faith_pd':
            # The phylogenetic metric also needs the tree and the
            # observation ids.
            kws = {
                'tree': self.tree,
                'otu_ids': self._original_bt.ids('observation'),
            }
        else:
            kws = {}

        features[metric] = alpha_diversity(metric, X, sample_ids, **kws)

    self._alpha_diversity_values = features
def compute_alpha_diversity(table, metric, **kwargs):
    """Compute one alpha diversity metric for every sample of a table.

    Parameters
    ----------
    table: biom.table.Table object
        BIOM table
    metric: str
        alpha diversity metric
    kwargs: dict, optional
        Metric-specific parameters, forwarded to ``alpha_diversity``

    Returns
    -------
    results: pd.Series
        alpha diversity per sample, with ``name`` set to ``metric``

    Notes
    -----
    ``alpha_diversity`` is always called with ``validate=False``.
    """
    sample_ids = table.ids(axis='sample')
    # Cast the sparse matrix to int, transpose it, then densify.
    dense_counts = table.matrix_data.astype(int).T.toarray()

    results = alpha_diversity(
        metric=metric,
        counts=dense_counts,
        ids=sample_ids,
        validate=False,
        **kwargs
    )
    results.name = metric
    return results
def compute_alpha(self, metric="shannon"):
    """Return ``metric`` alpha diversity of ``self.otu_df`` as a pandas Series.

    ``self.sample_ids`` is passed to ``alpha_diversity`` as the ids.
    """
    # NOTE(review): for 'shannon' every 0 in the table is replaced by 1
    # before the metric is computed - confirm this is intended.
    counts = self.otu_df.replace(0, 1) if metric == 'shannon' else self.otu_df
    return pd.Series(alpha_diversity(metric, counts, self.sample_ids))
def test_faith_pd(self):
    """faith_pd through alpha_diversity equals calling faith_pd per row."""
    # Primary fixtures first, then the alternative input table and tree.
    fixtures = (
        (self.table1, self.tree1, self.oids1),
        (self.table2, self.tree2, self.oids2),
    )
    for table, tree, oids in fixtures:
        expected = pd.Series(
            [faith_pd(row, tree=tree, otu_ids=oids) for row in table])
        actual = alpha_diversity('faith_pd', table, tree=tree, otu_ids=oids)
        assert_series_almost_equal(actual, expected)
def __dapply__(self, experiment):
    """Compute ``self.distance_metric`` alpha diversity for an experiment.

    ``experiment.data_df`` is transposed before the call, its columns are
    used as sample ids and its index as OTU ids. The metric is first tried
    without ``otu_ids``; if ``alpha_diversity`` rejects the call because
    ``otu_ids`` is required, it is retried with them. Any other
    ``ValueError`` is re-raised unchanged.

    Returns a one-row DataFrame (row label ``self.distance_metric``) obtained
    by transposing the resulting series.
    """
    otu_ids = experiment.data_df.index
    sample_ids = experiment.data_df.columns
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same ndarray and exists in every pandas release.
    matrix = experiment.data_df.T.values

    try:
        alpha = alpha_diversity(self.distance_metric, counts=matrix,
                                ids=sample_ids, **self.kwargs)
    except ValueError as e:
        otu_ids_err_msg = "``otu_ids`` is required for phylogenetic diversity metrics."
        if str(e) != otu_ids_err_msg:
            # Bare ``raise`` keeps the original traceback intact.
            raise
        # Phylogenetic metric: retry with the OTU ids supplied.
        alpha = alpha_diversity(self.distance_metric, counts=matrix,
                                ids=sample_ids, otu_ids=otu_ids,
                                **self.kwargs)

    return alpha.to_frame(name=self.distance_metric).transpose()
def test_empty(self):
    """Empty count inputs give zero diversity, one value per vector."""
    # In order: an empty vector, an array holding one empty vector, and an
    # array holding two empty vectors.
    empty_inputs = ([], [[]], [[], []])

    for raw, zeros in zip(empty_inputs, ([0], [0], [0, 0])):
        actual = alpha_diversity('observed_otus',
                                 np.array(raw, dtype=np.int64))
        assert_series_almost_equal(actual, pd.Series(zeros))

    # Same three shapes for the phylogenetic metric, with no OTU ids.
    for raw, zeros in zip(empty_inputs, ([0.], [0.], [0., 0.])):
        actual = alpha_diversity('faith_pd',
                                 np.array(raw, dtype=np.int64),
                                 tree=self.tree1, otu_ids=[])
        assert_series_almost_equal(actual, pd.Series(zeros))
#df1.to_csv("test2.tsv", sep="\t", header=1) dfdgo = np.loadtxt("tpm_np.tsv") dfd1 = dfd datad = dfdgo print(datad) #ids = df1.index.tolist() idsd = list(dfd1.index.values) print(idsd) ##################### # Diversity metrics # ##################### adiv_obs_otusd = alpha_diversity('observed_otus', datad, idsd) adiv_faith_pdd = alpha_diversity('faith_pd', datad, ids=idsd, otu_ids=dfd1.columns, tree=tree, validate=True) #bc_dm = beta_diversity("braycurtis", data, ids, validate=False) wu_dmd = beta_diversity("weighted_unifrac", datad, idsd, tree=tree, otu_ids=dfd1.columns,
def alpha_div(dataframe, list_s, fam_code, methode):
    """Compute an alpha diversity metric and return it in two forms.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose values are passed to ``alpha_diversity`` as the counts.
    list_s
        Sample ids, forwarded as the ``ids`` keyword.
    fam_code
        Label of the single column of the returned DataFrame.
    methode
        Metric forwarded as the first argument of ``alpha_diversity``.

    Returns
    -------
    tuple
        ``(df_results, results)``: a one-column DataFrame built from the
        result with missing values replaced by 0, and the unmodified value
        returned by ``alpha_diversity``.
    """
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same ndarray and exists in every pandas release.
    df_matrix = dataframe.values
    results = alpha_diversity(methode, df_matrix, ids=list_s, validate=True)
    df_results = pd.DataFrame(results, columns=[fam_code]).fillna(0)
    return df_results, results
from skbio import TreeNode
from skbio.diversity import alpha_diversity
from biom import load_table
import numpy as np
import time
import sys

# Command-line script: argv[1] is the table to load, argv[2] the tree file.
# It computes faith_pd for the table, writes the result to a fixed path and
# prints how long the computation plus the write took.
print('Running-faith_pd')

args = sys.argv
table = load_table(args[1])
otu_ids = table.ids('observation')
# One row per id returned by table.ids().
counts = np.asarray([table.data(sample_id) for sample_id in table.ids()])
tree = TreeNode.read(args[2])

# Loading the table and the tree is deliberately outside the timed section.
t0 = time.time()
actual = alpha_diversity('faith_pd', counts, tree=tree, otu_ids=otu_ids)
actual.to_csv(
    '/home/garmstro/faith_pd/large-data/redbiom-fetch/redbiom-fetch-faith-pd.txt'
)
t1 = time.time()

elapsed = t1 - t0
print('Python time-{}'.format(elapsed))
def run_fast_faith_shear(counts, otu_ids, tree):
    """Return 'fast_faith_pd' values (computed with ``shear=True``) as a list."""
    per_sample = alpha_diversity('fast_faith_pd', counts,
                                 tree=tree, otu_ids=otu_ids, shear=True)
    return list(per_sample.values)
ymtx = np.empty( ( num_repeats, len( xvals ) ), dtype=int ) for i in range( num_repeats ): ymtx[i] = np.asarray( [ subsample( si, n ) for n in xvals ], dtype=int ) yvals = ymtx.mean(0) def errfn(p, n, y): return ( ( ( p[0] * n / (p[1] + n ) ) - y ) ** 2 ).sum() #return ( ( p[0] * ( 1. - np.exp( n / p[1] ) ) - y ) ** 2 ).sum() params_guess = ( n_otu, int( round( n_otu / 2 ) ) ) print >>sys.stderr, yvals print >>sys.stderr, xvals mparams = fmin_powell( errfn, params_guess, ftol=1e-5, args=(xvals, yvals), disp = False ) ef = 2 sv = "%.2f %.2f %.2f" % ( mparams[0], mparams[1], math.sqrt( errfn( mparams, xvals, yvals) / len( xvals ) ) ) else: v = skdiv.alpha_diversity( slist[ k ], si ).values[0] except ( ValueError, TypeError, ZeroDivisionError ) as err: estr = str( err ) ef = 1 #except TypeError as err: # ef = 1 #except ZeroDivisionError as err: # ef = 1 if ef == 0: sv = "%.2f" % v if v == float( int( v ) ): sv = "%d" % int( v ) print "<td>" + sv elif ef == 1: print "<td>??" print >>sys.stderr, estr
TransForm.columns = TransForm.iloc[0] TransForm = pd.DataFrame.drop(TransForm, 'formula_isotopefree', axis=0) TransForm = TransForm.fillna(0) TransForm = TransForm.reset_index() new = TransForm['index'].str.split('(\d+)([A-Za-z]+)', n=3, expand=True) new['SiteName'] = new[0] + new[1] new = new.rename(columns={2: 'Position', 3: 'Depth'}) ids = pd.DataFrame.drop(new, [0, 1], axis=1) OTU_equivalent = pd.DataFrame.drop(TransForm, 'index', axis=1) Formulae = TransForm.reset_index() Formulae = pd.DataFrame.drop(Formulae, 'index', axis=1) for col in ids: adiv_obs_otus = alpha_diversity('observed_otus', OTU_equivalent, ids[col]) alpha = adiv_obs_otus.reset_index() alpha = alpha.rename(columns={0: 'Count'}) alpha['Rank'] = alpha['Count'].rank() alpha.plot(x='Rank', y=['Count'], kind='bar') plt.show() #bc_dm = beta_diversity("braycurtis", OTU_equivalent, TransForm['index']) #wu_pc = pcoa(bc_dm) #fig = wu_pc.plot(new ,'Position') my_dpi = 96 plt.figure(figsize=(480 / my_dpi, 480 / my_dpi), dpi=my_dpi) # Keep the 'species' column appart + make it numeric for coloring ids['Depth'] = pd.Categorical(ids['Depth'])
#df1.to_csv("test2.tsv", sep="\t", header=1) dfsgo = np.loadtxt("tpm_np.tsv") dfs1 = dfs datas = dfsgo print(datas) #ids = df1.index.tolist() idss = list(dfs1.index.values) print(idss) ##################### # Diversity metrics # ##################### adiv_obs_otuss = alpha_diversity('observed_otus', datas, idss) adiv_faith_pds = alpha_diversity('faith_pd', datas, ids=idss, otu_ids=dfs1.columns, tree=tree, validate=False) #bc_dm = beta_diversity("braycurtis", data, ids, validate=False) wu_dms = beta_diversity("weighted_unifrac", datas, idss, tree=tree, otu_ids=dfs1.columns,
var1 = np.array(normal).var() var2 = np.array(surgery).var() n1, n2 = len(normal), len(surgery) diff = float(mean_surgery - mean_normal) pooled_var = ((n2 * var2) + (n1 * var1)) / (n1 + n2) odds_score = float(0) effect_size = diff / sqrt(pooled_var) calc_dict['effect size'] = effect_size return calc_dict normal = [] surgery = [] #p_value = 0.0 results_index = dict( alpha_diversity(str(index), sample_matrix, ids=list_sample, validate=True)) calcres = stat_diff(results_index, normal, surgery) y = str(index) + '_index' column_name = ['Sample', y] df_final1 = pd.DataFrame(results_index.items(), columns=column_name) df_final1.to_csv(outtitle1, header=True, index=True, sep='\t') list_stage = [] list_stage_alpha = df_final1.index.values df_calc = pd.DataFrame.from_dict(calcres, orient='index') df_calc.to_csv(out_calc, header=True, index=True, sep='\t') for x in list_stage_alpha: stage_x = (df_final1.loc[x, 'Sample']) stage = stage_x.split('.')[1] list_stage.append(stage)
def test_no_ids(self):
    """alpha_diversity works when no sample ids are supplied."""
    actual = alpha_diversity('observed_otus', self.table1)
    # Expected values were hand-calculated.
    assert_series_almost_equal(actual, pd.Series([3, 3, 3, 3]))
#df1.to_csv("test2.tsv", sep="\t", header=1) dftgo = np.loadtxt("tpm_np.tsv") dft1 = dft datat = dftgo print(datat) #ids = df1.index.tolist() idst = list(dft1.index.values) print(idst) ##################### # Diversity metrics # ##################### adiv_obs_otust = alpha_diversity('observed_otus', datat, idst) adiv_faith_pdt = alpha_diversity('faith_pd', datat, ids=idst, otu_ids=dft1.columns, tree=tree, validate=False) #bc_dm = beta_diversity("braycurtis", data, ids, validate=False) wu_dmt = beta_diversity("weighted_unifrac", datat, idst, tree=tree, otu_ids=dft1.columns,
# Parse the tab-separated table: the first line is skipped, the second line
# is the header, and every following line describes one OTU.
with open(table) as inf:
    inf.readline()
    # Header fields without the first and the last one are the sample names.
    names = inf.readline().split("\t")[1:-1]
    for line in inf:
        # Same column slice as the header. Values go through float() first,
        # so strings such as "3.0" are accepted and then truncated to int.
        data.append([int(float(x)) for x in line.split("\t")[1:-1]])
        # The first field of the line is the OTU name.
        otus.append(line.split("\t")[0])
# NOTE(review): underscores become spaces, presumably so the names match the
# tip names of the tree - confirm.
otus = [x.replace("_", " ") for x in otus]
# OTUs in rows, samples in columns; transposed below for alpha_diversity.
df = pd.DataFrame(data, index=otus, columns=names)
if metric == "faith_pd":
    # Only this metric reads the tree file and passes tree and OTU ids.
    tree = TreeNode.read(tree_file)
    div = alpha_diversity(metric, df.T, ids=names, otu_ids=otus, tree=tree, validate=False)
else:
    div = alpha_diversity(metric, df.T, names)
# One-column frame labelled with the metric name, written as TSV.
div_df = pd.DataFrame(div, columns=[metric])
div_df.to_csv("alphadiversity_" + mname + ".txt", sep="\t")
# Bar plot of the transposed result frame, saved to ``figname``.
sns.set_style("ticks", {"ytick.major.size": "2.0"})
ax = sns.barplot(data=div_df.T, color=col)
sns.despine(right=True)
plt.ylabel(label)
plt.savefig(figname, dpi=dpi)
def run_faith(counts, otu_ids, tree):
    """Return the 'faith_pd' value of every row of ``counts`` as a list."""
    faith_values = alpha_diversity('faith_pd', counts,
                                   tree=tree, otu_ids=otu_ids)
    return [value for value in faith_values.values]
asv_df, tax_df, taxlevel='ASVg' ) #### change ASVs hash to ASVg label for readability in downstream analysis !!!!!! tax_df = tax_df.reset_index().set_index('ASVg') ## same for taxonomy tax_df = tax_df.loc[asv_df.columns] ## include only ASVs in the CountTable asv_df.to_csv(tmpDir + 'CountTable.tsv', sep='\t', index_label='SampleID' ) ## use this table in other notebooks for this analysis tax_df.to_csv(tmpDir + 'TaxonomyTable.tsv', sep='\t', index_label="ASVg") ## export taxonomy table prepare_counts_for_ecol_inference(asv_df).to_csv( tmpDir + 'comm.tsv', sep='\t') ### change the ASV names for R meta_df = meta_df.reindex( asv_df.index) ## include only samples in the CountTable """ Add alpha-diversity indexes to the metatable """ ### Update the meta table with alpha diversities for each sample. meta_df = meta_df.assign(AlphaShannon=div.alpha_diversity( "shannon", asv_df.astype(int).values, asv_df.index, base=np.e)) meta_df = meta_df.assign( AlphaSimpson=div.alpha_diversity("simpson", asv_df.astype(int).values, asv_df.index)) meta_df = meta_df.assign( AlphaChao=div.alpha_diversity("chao1", asv_df.astype(int).values, asv_df.index)) #meta_df = meta_df.assign(Richness = asv_df.astype(bool).sum(axis=1)) meta_df = meta_df.assign( Richness=div.alpha_diversity('observed_otus', asv_df.astype(int).values, asv_df.index)) meta_df['Eveness'] = meta_df.AlphaShannon / (np.log(meta_df.Richness)) ### Update the meta table that was exported earlier meta_df.to_csv(tmpDir + 'MetaTable.tsv', sep='\t', index_label='SampleID') print(color.BOLD + color.BLUE + "\nClustering analysis...." + color.END)