def __init__(self, metadata_file='', otu_file='', taxonomy_file='', ssu_type='',
             nmds_filter=0, diversity_filter=0, metadata_col='',
             query_control='', query_treatment='', metadata_axis_label=''):
    """Store the run configuration on the instance and call ``prepData``.

    Every argument is copied verbatim onto an attribute of the same name.
    In addition ``self.r`` is set to a fresh ``RAdapter`` and ``self.level``
    is fixed to ``"phylum"``.

    Original author's note: data_nmds has one more sample than the
    data_diversity which has ~ 1000 reads.
    """
    # The adapter is created first, exactly as before, so a failure here
    # leaves no configuration attributes behind.
    self.r = RAdapter()
    self.level = "phylum"

    # Marker type and input files.
    self.ssu_type = ssu_type
    self.metadata_file = metadata_file
    self.otu_file = otu_file
    self.taxonomy_file = taxonomy_file

    # Count thresholds.
    self.nmds_filter = nmds_filter
    self.diversity_filter = diversity_filter

    # Metadata column, sample-group queries and axis label.
    self.metadata_col = metadata_col
    self.query_control = query_control
    self.query_treatment = query_treatment
    self.metadata_axis_label = metadata_axis_label

    # Must stay last: prepData reads the attributes assigned above.
    self.prepData()
class Main:
    """Run the table and plot steps for one marker type ("16S" or "18S").

    ``run`` iterates over a list of analysis types produced by ``prepData``:
    the marker itself, the taxonomic level (``self.level``) and a fixed list
    of taxon names. For each one it creates an output directory, writes the
    comparison/correlation tables and then draws the plots.

    The plotting methods called from ``buildPlots`` (``plotIndex``,
    ``plotStackedBar``, ``plotNMDS``, ``plotRarefaction``,
    ``plotCommunityResponse``) are not defined in this part of the class.
    """

    # Threshold handed to ``filterByTotalOTUCount`` for single-taxon analyses
    # (the same argument position that receives nmds_filter/diversity_filter
    # for the whole-marker analyses).
    TAXON_COUNT_FILTER = 50

    def __init__(self, metadata_file='', otu_file='', taxonomy_file='', ssu_type='',
                 nmds_filter=0, diversity_filter=0, metadata_col='',
                 query_control='', query_treatment='', metadata_axis_label=''):
        """Store the run configuration and validate it via ``prepData``.

        Args:
            metadata_file, otu_file, taxonomy_file: input paths, later wrapped
                into ``Input`` tuples by ``prepData``.
            ssu_type: must be "16S" or "18S"; anything else raises
                ``ValueError`` from ``prepData``.
            nmds_filter, diversity_filter: thresholds passed to
                ``filterByTotalOTUCount``.
            metadata_col: metadata column used for correlations and plots.
            query_control, query_treatment: expressions passed to
                ``DataFrame.query`` on the metadata to pick the two groups.
            metadata_axis_label: label forwarded to ``plotIndex``.

        Original author's note: data_nmds has one more sample than the
        data_diversity which has ~ 1000 reads.
        """
        self.r = RAdapter()
        self.level = "phylum"
        self.ssu_type = ssu_type
        self.metadata_file = metadata_file
        self.otu_file = otu_file
        self.taxonomy_file = taxonomy_file
        self.nmds_filter = nmds_filter
        self.diversity_filter = diversity_filter
        self.metadata_col = metadata_col
        self.query_control = query_control
        self.query_treatment = query_treatment
        self.metadata_axis_label = metadata_axis_label
        # The result is discarded on purpose; the call is kept so that an
        # unsupported ssu_type is reported at construction time, as before.
        self.prepData()

    def run(self):
        """Load the data into the adapter and run every analysis type."""
        metadata, otu, taxonomy, analyses = self.prepData()
        self.r.initData(metadata, otu, taxonomy)
        for analysis_type in analyses:
            self.analyze(analysis_type, self.metadata_col, self.query_control,
                         self.query_treatment, self.metadata_axis_label)

    def prepData(self):
        """Build the input descriptors and the list of analyses to run.

        Returns:
            A tuple ``(metadata, otu, taxonomy, analyses)`` where the first
            three are ``Input(file, index, function)`` named tuples and
            ``analyses`` is ``[ssu_type, level] + <taxon names>``.

        Raises:
            ValueError: if ``self.ssu_type`` is neither "16S" nor "18S".
        """
        Input = namedtuple('Input', 'file index function')
        if self.ssu_type == '16S':
            sub_taxa = ['Proteobacteria', 'Cyanobacteria',
                        'Candidate division OD1', 'BD1_5',
                        'Actinobacteria', 'Chloroflexi',
                        'Bacteroidetes', 'Planctomycetes']
            f_metadata, f_otu, f_taxonomy = f_metadata_16S, f_otu_16S, f_taxonomy_16S
        elif self.ssu_type == "18S":
            sub_taxa = ['Metazoa', 'Fungi', 'Rhizaria', 'Alveolata',
                        'Ichthyosporea', 'Rhodophyta', 'Viridiplantae',
                        'Cryptophyta', 'Haptophyceae', 'stramenopiles',
                        'Euglenozoa']
            f_metadata, f_otu, f_taxonomy = f_metadata_18S, f_otu_18S, f_taxonomy_18S
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on f_metadata.
            raise ValueError(
                "ssu_type must be '16S' or '18S', got %r" % (self.ssu_type,))

        metadata = Input(self.metadata_file, "sample short_name", f_metadata)
        otu = Input(self.otu_file, 0, f_otu)
        taxonomy = Input(self.taxonomy_file, 0, f_taxonomy)
        analyses = [self.ssu_type, self.level] + sub_taxa
        return (metadata, otu, taxonomy, analyses)

    def analyze(self, analysis_type, metadata_type, filter_controls, filter_treatments, xlabel):
        """Run one analysis: output directory, tables, then plots.

        Table building is best-effort: a failure is reported on stdout and
        the plots are still attempted.
        """
        self.analysis_type = analysis_type
        self.buildDirs(analysis_type)
        try:
            self.buildTables(filter_controls, filter_treatments, analysis_type, metadata_type)
        except Exception as e:
            self._reportFailure('buildTables', analysis_type, e)
        self.buildPlots(analysis_type, metadata_type, xlabel)

    def buildDirs(self, analysis_type):
        """Recreate ``results_<ssu_type>/<analysis_type>/tables`` from scratch.

        Any existing directory for this analysis is deleted first. Sets
        ``self.analysis_dir`` and ``self.dirTables``.
        """
        self.analysis_dir = os.path.join('results_%s' % self.ssu_type, analysis_type)
        if os.path.exists(self.analysis_dir):
            shutil.rmtree(self.analysis_dir)
        self.dirTables = os.path.join(self.analysis_dir, 'tables')
        os.makedirs(self.dirTables)

    def toRelative(self, df):
        """Return ``df`` with every row divided by that row's sum."""
        return df.div(df.sum(axis=1), axis=0)

    def _reportFailure(self, step, analysis_type, error):
        """Print a one-line note about a best-effort step that failed."""
        print("[%s] %s failed: %s" % (analysis_type, step, error))

    def _rarefy(self, df):
        """Apply the 'rrarefy' function of the 'vegan' package via the adapter.

        The depth argument is the smallest row total of ``df``.
        Presumably this subsamples every sample to that depth -- confirm
        against ``RAdapter.transformDFByRFunction``.
        """
        return self.r.transformDFByRFunction(df, 'rrarefy', 'vegan', min(df.T.sum()))

    def buildTables(self, filterControls, filterTreatments, analysis_type, metadata_type):
        """Write the group-comparison and correlation tables for one analysis.

        Args:
            filterControls, filterTreatments: ``DataFrame.query`` expressions
                evaluated on the metadata to select the two sample groups.
            analysis_type: "16S"/"18S", ``self.level``, or a taxon name.
            metadata_type: metadata column correlated against the abundances.

        Side effects: sets ``self.comp`` and ``self.corr`` and writes
        ``otu_comparison.xlsx`` / ``otu_correlation.xlsx`` into
        ``self.dirTables``; the level analysis also writes two TSV files.
        """
        if analysis_type == "16S" or analysis_type == "18S":
            # Whole-marker analysis: filter, then rarefy.
            df_regression, _ = self.r.filterByTotalOTUCount(
                self.r.df_otu, self.diversity_filter)
            df_regression = self._rarefy(df_regression)
        elif analysis_type == self.level:
            # Level analysis: grouped counts converted to relative abundance.
            df_regression, groups = self.r.getAbundanceByLevel(
                self.r.df_taxonomy, self.r.df_otu, self.level)
            groups.to_csv(
                os.path.join(self.analysis_dir, 'tables',
                             '%s_phylum_otu_richness.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_regression.to_csv(
                os.path.join(self.analysis_dir, 'tables',
                             '%s_phylum_otu_absolute_counts.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_regression = self.toRelative(df_regression)
        else:
            # Single-taxon analysis: restrict to the taxon, filter, rarefy.
            df_otu = self.r.getAbundanceByTaxon(
                self.r.df_taxonomy, self.r.df_otu, self.level, analysis_type)
            df_regression, _ = self.r.filterByTotalOTUCount(
                df_otu, self.TAXON_COUNT_FILTER)
            df_regression = self._rarefy(df_regression)

        # The comparison uses the same table as the regression in every case.
        df_comparison = df_regression.copy()
        metadata = self.r.df_metadata.ix[df_regression.index]

        ### comparison: controls vs. treatments
        controls = metadata.query(filterControls).index
        treatments = metadata.query(filterTreatments).index
        g1 = df_comparison.ix[controls]
        g2 = df_comparison.ix[treatments]
        result = mannWhitneyUTest(g1.T, g2.T)
        significant = result[result['fdr'] <= 0.1]

        ### regression against the chosen metadata column
        md = metadata[metadata_type]
        correlation = spearmanCorr(df_regression.T, md.T)

        if analysis_type == self.level:
            self.comp = significant
            self.corr = correlation
        else:
            # Other analyses get the taxonomy columns joined on, minus the
            # two top ranks.
            dropped = ['domain', 'superkingdom']
            self.comp = significant.join(self.r.df_taxonomy).drop(dropped, axis=1)
            self.corr = correlation.join(self.r.df_taxonomy).drop(dropped, axis=1)

        self.comp.to_excel(os.path.join(self.dirTables, 'otu_comparison.xlsx'),
                           index_label="OTU")
        self.corr.to_excel(os.path.join(self.dirTables, 'otu_correlation.xlsx'),
                           index_label="OTU")

    def buildPlots(self, analysis_type, metadata_type, xlabel):
        """Prepare the plot inputs for one analysis and draw every plot.

        Data preparation for the whole-marker and level analyses is not
        guarded and may raise. Each individual plot is best-effort: a failure
        is printed and the remaining plots are still attempted.
        """
        if analysis_type == "16S" or analysis_type == "18S":
            ### nmds data
            df_nmds, _ = self.r.filterByTotalOTUCount(self.r.df_otu, self.nmds_filter)
            df_nmds = self._rarefy(df_nmds)
            ### diversity data
            df_diversity, _ = self.r.filterByTotalOTUCount(
                self.r.df_otu, self.diversity_filter)
            df_diversity = self._rarefy(df_diversity)
            ### community response data
            df_community = df_nmds.copy()
        elif analysis_type == self.level:
            ### nmds data: relative abundance
            df_nmds, groups = self.r.getAbundanceByLevel(
                self.r.df_taxonomy, self.r.df_otu, self.level)
            # NOTE: these two files go to analysis_dir itself, not to the
            # 'tables' sub-directory used by buildTables.
            groups.to_csv(
                os.path.join(self.analysis_dir,
                             '%s_phylum_otu_richness.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_nmds, _ = self.r.filterByTotalOTUCount(df_nmds, self.nmds_filter)
            df_nmds.to_csv(
                os.path.join(self.analysis_dir,
                             '%s_phylum_otu_absolute_counts.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_nmds = self.toRelative(df_nmds)
            ### diversity data: relative abundance
            df_diversity = df_nmds.copy()
            ### community response data
            df_community = df_nmds.copy()
        else:
            ### nmds data
            df_otu = self.r.getAbundanceByTaxon(
                self.r.df_taxonomy, self.r.df_otu, self.level, analysis_type)
            df_nmds, _ = self.r.filterByTotalOTUCount(df_otu, self.TAXON_COUNT_FILTER)
            df_nmds = self.toRelative(df_nmds)
            ### diversity data; on failure the filtered, un-rarefied table is kept
            df_diversity, _ = self.r.filterByTotalOTUCount(
                df_otu, self.TAXON_COUNT_FILTER)
            try:
                df_diversity = self._rarefy(df_diversity)
            except Exception as e:
                self._reportFailure('rarefaction of diversity data', analysis_type, e)
            ### community response data
            df_community = df_nmds.copy()

        if not analysis_type == self.level:
            try:
                self.plotIndex(df_diversity, self.r.df_metadata[metadata_type], xlabel)
            except Exception as e:
                self._reportFailure('plotIndex', analysis_type, e)
        else:
            # Level analysis: stacked bar of relative abundance, samples
            # ordered by the metadata column, columns by descending mean.
            try:
                df_abundance = df_nmds
                df_abundance = df_abundance.join(
                    self.r.df_metadata[metadata_type]).sort(metadata_type)
                df_abundance.index = [ind.replace("alex", "sample")
                                      for ind in df_abundance.index]
                df_abundance.index = (df_abundance.index + " "
                                      + df_abundance[metadata_type].apply(str))
                df_abundance = df_abundance.drop(metadata_type, axis=1)
                col_order = df_abundance.mean().order(ascending=False).index
                df_abundance = df_abundance[col_order]
                df_abundance.to_csv(
                    os.path.join(self.analysis_dir, 'tables',
                                 "%s_phylum_relative_abundance.tsv" % self.ssu_type),
                    sep="\t", index_label="Name")
                self.plotStackedBar(df_abundance, analysis_type)
            except Exception as e:
                print(e)

        try:
            self.plotNMDS(df_nmds, self.r.df_metadata[metadata_type], analysis_type)
        except Exception as e:
            self._reportFailure('plotNMDS', analysis_type, e)
        try:
            self.plotRarefaction(df_nmds)
        except Exception as e:
            self._reportFailure('plotRarefaction', analysis_type, e)
        try:
            self.plotCommunityResponse(df_community, self.r.df_metadata[metadata_type])
        except Exception as e:
            self._reportFailure('plotCommunityResponse', analysis_type, e)