示例#1
0
文件: main.py 项目: ecotox/balder
 def __init__(self, metadata_file='', otu_file='', taxonomy_file='',
              ssu_type='', nmds_filter=0, diversity_filter=0,
              metadata_col='', query_control='', query_treatment='',
              metadata_axis_label=''):
     """Create the R adapter, record the run configuration and call prepData().

     Every argument is stored unchanged on the instance attribute of the
     same name; ``self.level`` is fixed to ``"phylum"``.

     Author's note: data_nmds has one more sample than the data_diversity
     which has ~ 1000 reads.
     """
     self.r = RAdapter()

     # File arguments.
     self.metadata_file = metadata_file
     self.otu_file = otu_file
     self.taxonomy_file = taxonomy_file

     # Filter arguments.
     self.nmds_filter = nmds_filter
     self.diversity_filter = diversity_filter

     # Metadata column, control/treatment queries and axis label.
     self.metadata_col = metadata_col
     self.query_control = query_control
     self.query_treatment = query_treatment
     self.metadata_axis_label = metadata_axis_label

     # Analysis selectors.
     self.ssu_type = ssu_type
     self.level = "phylum"

     # The return value is not used here; the call only runs prepData once
     # at construction time.
     self.prepData()
示例#2
0
文件: main.py 项目: ecotox/balder
class Main:
    
    def __init__(self, metadata_file='', otu_file='', taxonomy_file='',
                 ssu_type='', nmds_filter=0, diversity_filter=0,
                 metadata_col='', query_control='', query_treatment='',
                 metadata_axis_label=''):
        """Create the R adapter, record the run configuration and call prepData().

        Every argument is stored unchanged on the instance attribute of the
        same name; ``self.level`` is fixed to ``"phylum"``.

        Author's note: data_nmds has one more sample than the data_diversity
        which has ~ 1000 reads.
        """
        self.r = RAdapter()

        # Paths handed to prepData() when it builds the Input tuples.
        self.metadata_file = metadata_file
        self.otu_file = otu_file
        self.taxonomy_file = taxonomy_file

        # Values passed to filterByTotalOTUCount() by the table/plot builders.
        self.nmds_filter = nmds_filter
        self.diversity_filter = diversity_filter

        # Forwarded to analyze() by run() for every analysis type.
        self.metadata_col = metadata_col
        self.query_control = query_control
        self.query_treatment = query_treatment
        self.metadata_axis_label = metadata_axis_label

        # Analysis selectors: prepData() branches on ssu_type ('16S'/'18S').
        self.ssu_type = ssu_type
        self.level = "phylum"

        # The return value is not used here; run() calls prepData() again
        # and consumes the result. With an unsupported ssu_type this call
        # raises during construction.
        self.prepData()


    def run(self):
        """Load the prepared inputs into the R adapter and run each analysis.

        prepData() supplies the three input descriptors and the list of
        analysis types; every analysis type is passed to analyze() together
        with the metadata column, control/treatment queries and axis label
        stored on the instance.
        """
        metadata_input, otu_input, taxonomy_input, analysis_types = self.prepData()
        self.r.initData(metadata_input, otu_input, taxonomy_input)

        for current_type in analysis_types:
            self.analyze(
                current_type,
                self.metadata_col,
                self.query_control,
                self.query_treatment,
                self.metadata_axis_label,
            )

    def prepData(self):
        Input=namedtuple('Input','file index function')
        #level= "phylum"
        
        if self.ssu_type == '16S':
            sub_taxa= photosynthetic_bacteria=['Proteobacteria','Cyanobacteria', 'Candidate division OD1','BD1_5', 'Actinobacteria','Chloroflexi', 'Bacteroidetes','Planctomycetes']
            f_metadata, f_otu, f_taxonomy=   f_metadata_16S, f_otu_16S, f_taxonomy_16S        
        
        elif self.ssu_type == "18S":
            sub_taxa= ['Metazoa','Fungi','Rhizaria','Alveolata','Ichthyosporea','Rhodophyta','Viridiplantae','Cryptophyta','Haptophyceae', 'stramenopiles','Euglenozoa']
            f_metadata, f_otu, f_taxonomy=   f_metadata_18S, f_otu_18S, f_taxonomy_18S        

        metadata= Input(self.metadata_file,"sample short_name",f_metadata)
        otu= Input(self.otu_file, 0, f_otu)
        taxonomy= Input(self.taxonomy_file, 0, f_taxonomy)
        analyses= [self.ssu_type, self.level]+sub_taxa
        return (metadata, otu, taxonomy, analyses)




    def analyze(self, analysis_type, metadata_type, filter_controls, filter_treatments, xlabel):
        self.analysis_type= analysis_type
        self.buildDirs(analysis_type)
        try:
            self.buildTables(filter_controls, filter_treatments, analysis_type, metadata_type)
        except:
            pass
        self.buildPlots(analysis_type, metadata_type, xlabel)


    def buildDirs(self, analysis_type):
        self.analysis_dir= os.path.join('results_%s' % self.ssu_type, analysis_type) 
        
        if os.path.exists(self.analysis_dir):
            shutil.rmtree(self.analysis_dir)
        
        self.dirTables= os.path.join(self.analysis_dir, 'tables')
        os.makedirs(os.path.join(self.dirTables))
       

    def toRelative(self, df):
        df= df.T.apply(lambda x:x/x.sum()).T
        return df


    def buildTables(self, filterControls, filterTreatments, analysis_type, metadata_type):
        """Write the control-vs-treatment comparison and the metadata correlation tables.

        The abundance table that is analysed depends on ``analysis_type``:

        * ``"16S"`` / ``"18S"``: ``self.r.df_otu`` filtered through
          ``filterByTotalOTUCount`` with ``self.diversity_filter`` and then
          passed to the R function ``rrarefy`` (package ``vegan``) with the
          smallest row total as its argument.
        * ``self.level``: the table returned by ``getAbundanceByLevel``,
          converted to row-relative values. The richness and absolute-count
          TSV files are written before the conversion.
        * anything else: treated as a taxon name; the table returned by
          ``getAbundanceByTaxon`` is filtered with a fixed threshold of 50
          and passed to ``rrarefy`` as above.

        Args:
            filterControls: ``DataFrame.query`` expression selecting the
                control rows of the metadata.
            filterTreatments: ``DataFrame.query`` expression selecting the
                treatment rows of the metadata.
            analysis_type: which of the three cases above to run.
            metadata_type: metadata column correlated against the table.

        Side effects:
            Sets ``self.comp`` (rows of the Mann-Whitney result with
            ``fdr <= 0.1``) and ``self.corr`` (Spearman correlation result)
            and writes them to ``otu_comparison.xlsx`` and
            ``otu_correlation.xlsx`` in ``self.dirTables``.
        """
        adapter = self.r

        if analysis_type in ("16S", "18S"):
            df_regression, _ = adapter.filterByTotalOTUCount(adapter.df_otu, self.diversity_filter)
            # df.T.sum() gives one total per row; the minimum is the rrarefy argument.
            df_regression = adapter.transformDFByRFunction(
                df_regression, 'rrarefy', 'vegan', min(df_regression.T.sum()))

        elif analysis_type == self.level:
            df_regression, groups = adapter.getAbundanceByLevel(
                adapter.df_taxonomy, adapter.df_otu, self.level)
            groups.to_csv(
                os.path.join(self.analysis_dir, 'tables', '%s_phylum_otu_richness.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_regression.to_csv(
                os.path.join(self.analysis_dir, 'tables', '%s_phylum_otu_absolute_counts.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_regression = self.toRelative(df_regression)

        else:
            df_otu = adapter.getAbundanceByTaxon(
                adapter.df_taxonomy, adapter.df_otu, self.level, analysis_type)
            df_regression, _ = adapter.filterByTotalOTUCount(df_otu, 50)
            df_regression = adapter.transformDFByRFunction(
                df_regression, 'rrarefy', 'vegan', min(df_regression.T.sum()))

        # Same for every branch: the comparison runs on a copy of the
        # regression table, and the metadata is restricted to its rows.
        # .loc is the label-based replacement for the removed .ix indexer.
        df_comparison = df_regression.copy()
        metadata = adapter.df_metadata.loc[df_regression.index]

        controls = metadata.query(filterControls).index
        treatments = metadata.query(filterTreatments).index

        g1 = df_comparison.loc[controls]
        g2 = df_comparison.loc[treatments]

        result = mannWhitneyUTest(g1.T, g2.T)
        significant = result[result['fdr'] <= 0.1]

        md = metadata[metadata_type]

        if analysis_type == self.level:
            self.comp = significant
            self.corr = spearmanCorr(df_regression.T, md.T)
        else:
            # OTU-level results are joined with the taxonomy table, minus
            # its 'domain' and 'superkingdom' columns.
            dropped = ['domain', 'superkingdom']
            self.comp = significant.join(adapter.df_taxonomy).drop(dropped, axis=1)
            self.corr = spearmanCorr(df_regression.T, md.T).join(adapter.df_taxonomy).drop(dropped, axis=1)

        self.comp.to_excel(os.path.join(self.dirTables, 'otu_comparison.xlsx'), index_label="OTU")
        self.corr.to_excel(os.path.join(self.dirTables, 'otu_correlation.xlsx'), index_label="OTU")


    
    def buildPlots(self, analysis_type, metadata_type, xlabel):
        """Prepare the abundance tables for one analysis and draw its plots.

        Table preparation depends on ``analysis_type``:

        * ``"16S"`` / ``"18S"``: ``self.r.df_otu`` is filtered with
          ``self.nmds_filter`` (NMDS table) and with
          ``self.diversity_filter`` (diversity table); both are passed to
          the R function ``rrarefy`` (package ``vegan``).
        * ``self.level``: the table from ``getAbundanceByLevel`` is filtered
          with ``self.nmds_filter`` and converted to row-relative values;
          richness and absolute-count TSV files are written to
          ``self.analysis_dir``.
        * anything else: treated as a taxon name; the table from
          ``getAbundanceByTaxon`` is filtered with a fixed threshold of 50.
          The NMDS table is made row-relative, the diversity table is
          passed to ``rrarefy`` when that succeeds.

        Plots: ``plotIndex`` for every analysis except ``self.level``, which
        instead writes a relative-abundance TSV and calls
        ``plotStackedBar``; then ``plotNMDS``, ``plotRarefaction`` and
        ``plotCommunityResponse``. Every plotting step is best effort: a
        failure is printed and the remaining steps still run.

        Args:
            analysis_type: which of the three cases above to run.
            metadata_type: metadata column handed to the plotting methods.
            xlabel: axis label passed to ``plotIndex``.
        """
        adapter = self.r

        if analysis_type in ("16S", "18S"):
            df_nmds, _ = adapter.filterByTotalOTUCount(adapter.df_otu, self.nmds_filter)
            df_nmds = adapter.transformDFByRFunction(
                df_nmds, 'rrarefy', 'vegan', min(df_nmds.T.sum()))

            df_diversity, _ = adapter.filterByTotalOTUCount(adapter.df_otu, self.diversity_filter)
            df_diversity = adapter.transformDFByRFunction(
                df_diversity, 'rrarefy', 'vegan', min(df_diversity.T.sum()))

        elif analysis_type == self.level:
            df_nmds, groups = adapter.getAbundanceByLevel(
                adapter.df_taxonomy, adapter.df_otu, self.level)
            groups.to_csv(
                os.path.join(self.analysis_dir, '%s_phylum_otu_richness.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_nmds, _ = adapter.filterByTotalOTUCount(df_nmds, self.nmds_filter)

            df_nmds.to_csv(
                os.path.join(self.analysis_dir, '%s_phylum_otu_absolute_counts.tsv' % self.ssu_type),
                sep="\t", index_label="Name")
            df_nmds = self.toRelative(df_nmds)
            # No diversity table here: plotIndex is skipped for this analysis.

        else:
            df_otu = adapter.getAbundanceByTaxon(
                adapter.df_taxonomy, adapter.df_otu, self.level, analysis_type)

            df_nmds, _ = adapter.filterByTotalOTUCount(df_otu, 50)
            df_nmds = self.toRelative(df_nmds)

            df_diversity, _ = adapter.filterByTotalOTUCount(df_otu, 50)
            try:
                df_diversity = adapter.transformDFByRFunction(
                    df_diversity, 'rrarefy', 'vegan', min(df_diversity.T.sum()))
            except Exception as err:
                # Best effort: fall back to the filtered, un-rarefied table.
                print("rrarefy failed for %s: %s" % (analysis_type, err))

        # Every branch uses a copy of the NMDS table for the community response plot.
        df_community = df_nmds.copy()

        if analysis_type != self.level:
            try:
                self.plotIndex(df_diversity, adapter.df_metadata[metadata_type], xlabel)
            except Exception as err:
                print("plotIndex failed for %s: %s" % (analysis_type, err))

        else:
            try:
                # NOTE(review): DataFrame.sort and Series.order exist only in
                # older pandas releases (removed in 0.20). They are kept
                # because sort_values is missing before 0.17; switch both to
                # sort_values when the pandas requirement is raised.
                df_abundance = df_nmds.join(adapter.df_metadata[metadata_type]).sort(metadata_type)
                # Relabel each row as "<sample name> <metadata value>".
                df_abundance.index = [ind.replace("alex", "sample") for ind in df_abundance.index]
                df_abundance.index = df_abundance.index + " " + df_abundance[metadata_type].apply(str)
                df_abundance = df_abundance.drop(metadata_type, axis=1)

                # Columns ordered by decreasing mean.
                col_order = df_abundance.mean().order(ascending=False).index
                df_abundance = df_abundance[col_order]
                df_abundance.to_csv(
                    os.path.join(self.analysis_dir, 'tables', "%s_phylum_relative_abundance.tsv" % self.ssu_type),
                    sep="\t", index_label="Name")
                self.plotStackedBar(df_abundance, analysis_type)
            except Exception as err:
                print("plotStackedBar failed for %s: %s" % (analysis_type, err))

        try:
            self.plotNMDS(df_nmds, adapter.df_metadata[metadata_type], analysis_type)
        except Exception as err:
            print("plotNMDS failed for %s: %s" % (analysis_type, err))

        try:
            self.plotRarefaction(df_nmds)
        except Exception as err:
            print("plotRarefaction failed for %s: %s" % (analysis_type, err))

        try:
            self.plotCommunityResponse(df_community, adapter.df_metadata[metadata_type])
        except Exception as err:
            print("plotCommunityResponse failed for %s: %s" % (analysis_type, err))