Example #1
 def __init__(self, S, reaction_ids, compound_ids,
              fluxes=None, name=None):
     """Initialize the stoichiometric model.
     
     Args:
         S: the stoichiometric matrix.
            Compounds are on the rows, reactions on the columns.
         reaction_ids: the ids/names of the reactions (columns).
         compound_ids: the ids/names of the compounds (rows).
         fluxes: the list of relative fluxes through all reactions.
                 If not supplied, assumed to be 1.0 for all reactions.
         name: a string name for this model.
     """
     self.kegg = Kegg.getInstance()
     self.S = S
     self.reaction_ids = reaction_ids
     self.compound_ids = compound_ids
     self.Nr = len(self.reaction_ids)
     self.Nc = len(self.compound_ids)
     self.name = name
     self.slug_name = util.slugify(self.name)
     
     if fluxes is None:
         # Default to a relative flux of 1.0 through every reaction.
         self.fluxes = np.ones((1, self.Nr))
     else:
         self.fluxes = np.array(fluxes)
     
     expected_Nc, expected_Nr = self.S.shape
     if self.Nr != expected_Nr:
         raise ValueError('Number of columns does not match number of reactions')
     if self.Nc != expected_Nc:
         raise ValueError('Number of rows does not match number of compounds')
     
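For context, a minimal construction sketch for the initializer above. The enclosing class name is not shown in the snippet, so `StoichiometricModel` is assumed here, as is `import numpy as np`; the `Kegg` and `util` modules from the surrounding project are assumed importable.

import numpy as np

# Toy network: 3 compounds (rows A, B, C) and 2 reactions (columns A->B, B->C).
S = np.array([[-1,  0],
              [ 1, -1],
              [ 0,  1]])
model = StoichiometricModel(S,
                            reaction_ids=['R1', 'R2'],
                            compound_ids=['A', 'B', 'C'],
                            name='toy pathway')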
Example #2
    def __init__(self, label):
        self.label = label
        self.conditions_to_raw_culture_levels = {}
        self.conditions_to_raw_reporter_levels = {}
        self.conditions_to_raw_times = {}
        self.conditions_to_culture_levels = {}
        self.conditions_to_reporter_levels = {}
        self.conditions_to_times = {}
        self.conditions_to_activities = {}
        self.conditions_to_smooth_activities = {}

        self.slug_label = util.slugify(self.label)
        self.raw_levels_fname = '%s_raw_levels.png' % self.slug_label
        self.levels_fname = '%s_levels.png' % self.slug_label
        self.vs_bg_fname = '%s_vs_bg.png' % self.slug_label
        self.activity_fname = '%s_activity.png' % self.slug_label
Example #3
 def __init__(self, label):
     self.label = label
     self.conditions_to_raw_culture_levels = {}
     self.conditions_to_raw_reporter_levels = {}
     self.conditions_to_raw_times = {}
     self.conditions_to_culture_levels = {}
     self.conditions_to_reporter_levels = {}
     self.conditions_to_times = {}
     self.conditions_to_activities = {}
     self.conditions_to_smooth_activities = {}
     
     self.slug_label = util.slugify(self.label)
     self.raw_levels_fname = '%s_raw_levels.png' % self.slug_label
     self.levels_fname = '%s_levels.png' % self.slug_label
     self.vs_bg_fname = '%s_vs_bg.png' % self.slug_label
     self.activity_fname = '%s_activity.png' % self.slug_label
Example #4
    def __init__(
        self,
        model,
        thermodynamic_data,
        metabolite_concentration_bounds,
        optimization_status=OptimizationStatus.Successful(),
        optimal_value=None,
        optimal_ln_metabolite_concentrations=None,
    ):
        self.model = model
        self.thermo = thermodynamic_data
        self.bounds = metabolite_concentration_bounds
        self.S = model.GetStoichiometricMatrix()
        self.Ncompounds, self.Nreactions = self.S.shape
        self.status = optimization_status
        self.opt_val = optimal_value
        self.ln_concentrations = optimal_ln_metabolite_concentrations

        self.dGr0_tag = np.array(thermodynamic_data.GetDGrTagZero_ForModel(self.model))
        self.dGr0_tag_list = list(self.dGr0_tag.flatten())
        self.compound_ids = self.model.GetCompoundIDs()
        self.reaction_ids = self.model.GetReactionIDs()
        self.fluxes = self.model.GetFluxes()

        self.slug_name = util.slugify(model.name)
        self.pathway_graph_filename = "%s_graph.svg" % self.slug_name
        self.thermo_profile_filename = "%s_thermo_profile.png" % self.slug_name
        self.conc_profile_filename = "%s_conc_profile.png" % self.slug_name
        self.kegg = Kegg.getInstance()

        self.concentrations = None
        self.dGr_tag = None
        self.dGr_tag_list = None
        self.dGr_bio = None
        self.dGr_bio_list = None

        if self.ln_concentrations is not None and self.dGr0_tag is not None:
            self.concentrations = np.exp(self.ln_concentrations)
            conc_correction = RT * self.ln_concentrations * self.S
            self.dGr_tag = np.array(self.dGr0_tag + conc_correction)
            self.dGr_tag_list = list(self.dGr_tag.flatten())

            bio_concs = self.bounds.GetBoundsWithDefault(self.compound_ids, default=1e-3)
            bio_correction = RT * np.dot(np.log(bio_concs), self.S)
            self.dGr_bio = np.array(self.dGr0_tag + bio_correction)
            self.dGr_bio_list = list(self.dGr_bio.flatten())
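For reference, the block above applies the per-reaction concentration correction dGr' = dGr'0 + RT * ln(c) . S. Note that `self.ln_concentrations * self.S` is a matrix product only if `ln_concentrations` is an `np.matrix` (1 x Ncompounds), which this codebase appears to rely on; with plain ndarrays the same correction needs an explicit dot product. A self-contained sketch with made-up numbers (not the library's code):

import numpy as np

R = 8.314e-3            # kJ/(mol K); the snippet's RT constant is assumed to be R*T
RT = R * 298.15

# Toy system: 2 compounds, 1 reaction (A -> B); all values invented.
S = np.array([[-1.0],
              [ 1.0]])
dGr0_tag = np.array([5.0])                    # standard transformed dG, kJ/mol
ln_conc = np.log(np.array([1e-3, 1e-4]))      # ln of metabolite concentrations (M)

dGr_tag = dGr0_tag + RT * np.dot(ln_conc, S)  # ndarray equivalent of the correction
print dGr_tag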
Example #5
    def __init__(self,
                 model,
                 thermodynamic_data,
                 metabolite_concentration_bounds,
                 optimization_status=OptimizationStatus.Successful(),
                 optimal_value=None,
                 optimal_ln_metabolite_concentrations=None):
        self.model = model
        self.thermo = thermodynamic_data
        self.bounds = metabolite_concentration_bounds
        self.S = model.GetStoichiometricMatrix()
        self.Ncompounds, self.Nreactions = self.S.shape
        self.status = optimization_status
        self.opt_val = optimal_value
        self.ln_concentrations = optimal_ln_metabolite_concentrations

        self.dGr0_tag = np.array(
            thermodynamic_data.GetDGrTagZero_ForModel(self.model))
        self.dGr0_tag_list = list(self.dGr0_tag.flatten())
        self.compound_ids = self.model.GetCompoundIDs()
        self.reaction_ids = self.model.GetReactionIDs()
        self.fluxes = self.model.GetFluxes()

        self.slug_name = util.slugify(model.name)
        self.pathway_graph_filename = '%s_graph.svg' % self.slug_name
        self.thermo_profile_filename = '%s_thermo_profile.png' % self.slug_name
        self.conc_profile_filename = '%s_conc_profile.png' % self.slug_name
        self.kegg = Kegg.getInstance()

        self.concentrations = None
        self.dGr_tag = None
        self.dGr_tag_list = None
        self.dGr_bio = None
        self.dGr_bio_list = None

        if (self.ln_concentrations is not None and self.dGr0_tag is not None):
            self.concentrations = np.exp(self.ln_concentrations)
            conc_correction = RT * self.ln_concentrations * self.S
            self.dGr_tag = np.array(self.dGr0_tag + conc_correction)
            self.dGr_tag_list = list(self.dGr_tag.flatten())

            bio_concs = self.bounds.GetBoundsWithDefault(self.compound_ids,
                                                         default=1e-3)
            bio_correction = RT * np.dot(np.log(bio_concs), self.S)
            self.dGr_bio = np.array(self.dGr0_tag + bio_correction)
            self.dGr_bio_list = list(self.dGr_bio.flatten())
Example #6
    def MakePerPlateFigures(self, dirname):
        fnames = []
        for condition, plates in self.plates.iteritems():
            for plate in plates:
                
                labels = plate.filtered_labels
                order = np.argsort(labels)
                smooth_activity = plate.smooth_filtered_activities
                max_activity = plate.filtered_max_activities
                left_mat = np.diag(1/max_activity)
                scaled_activity = np.dot(left_mat, smooth_activity)

                scaled_culture = plate.scaled_culture_levels
                max_culture_idx = np.argmin(np.abs(scaled_culture - 1.0), 1)
                for i, j in enumerate(max_culture_idx):
                    scaled_culture[i,j+1:] = 100
                
                Nr, _ = scaled_activity.shape
                od_fracs = np.arange(0.0, 1.0, 0.001)
                activity_per_od_frac = np.zeros((Nr, len(od_fracs)))
                for j, frac in enumerate(od_fracs):
                    abs_min = np.abs(scaled_culture - frac)
                    idxs = np.argmin(abs_min, 1)
                    for i in order:
                        idx = idxs[i]
                        activity_per_od_frac[i,j] = scaled_activity[i, idx]
                
                pylab.figure()
                pylab.title(condition)
                pylab.imshow(activity_per_od_frac, aspect='auto')
                
                condition_plate_name = util.slugify(condition)
                fname = '%s.png' % condition_plate_name
                pylab.savefig(path.join(dirname, fname),
                              format='png')
                
                fnames.append(fname)
            
        return fnames
Example #7
    def MakePerPlateFigures(self, dirname):
        fnames = []
        for condition, plates in self.plates.iteritems():
            for plate in plates:

                labels = plate.filtered_labels
                order = np.argsort(labels)
                smooth_activity = plate.smooth_filtered_activities
                max_activity = plate.filtered_max_activities
                left_mat = np.diag(1 / max_activity)
                scaled_activity = np.dot(left_mat, smooth_activity)

                scaled_culture = plate.scaled_culture_levels
                max_culture_idx = np.argmin(np.abs(scaled_culture - 1.0), 1)
                for i, j in enumerate(max_culture_idx):
                    scaled_culture[i, j + 1:] = 100

                Nr, _ = scaled_activity.shape
                od_fracs = np.arange(0.0, 1.0, 0.001)
                activity_per_od_frac = np.zeros((Nr, len(od_fracs)))
                for j, frac in enumerate(od_fracs):
                    abs_min = np.abs(scaled_culture - frac)
                    idxs = np.argmin(abs_min, 1)
                    for i in order:
                        idx = idxs[i]
                        activity_per_od_frac[i, j] = scaled_activity[i, idx]

                pylab.figure()
                pylab.title(condition)
                pylab.imshow(activity_per_od_frac, aspect='auto')

                condition_plate_name = util.slugify(condition)
                fname = '%s.png' % condition_plate_name
                pylab.savefig(path.join(dirname, fname), format='png')

                fnames.append(fname)

        return fnames
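The nested loops above look up, for every OD fraction, the time index whose scaled culture level is closest to that fraction, then read off the smoothed activity at that index. A vectorized sketch of that lookup on plain ndarrays, with toy shapes rather than the plate objects used above:

import numpy as np

def resample_by_culture_fraction(scaled_activity, scaled_culture, od_fracs):
    # rows x timepoints x fractions distance cube
    dists = np.abs(scaled_culture[:, :, None] - od_fracs[None, None, :])
    nearest_idx = np.argmin(dists, axis=1)             # rows x fractions
    row_idx = np.arange(scaled_activity.shape[0])[:, None]
    return scaled_activity[row_idx, nearest_idx]

activity = np.random.rand(3, 5)                        # 3 wells, 5 timepoints
culture = np.sort(np.random.rand(3, 5), axis=1)        # monotone culture levels
fracs = np.arange(0.0, 1.0, 0.25)
print resample_by_culture_fraction(activity, culture, fracs).shape   # (3, 4)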
Example #8
def Main():
    options, _ = MakeOpts().parse_args(sys.argv)
    assert options.tree_filename and options.tree_format
    assert options.pathways_filename and options.genome_db_filename
    assert options.output_filename

    print 'Reading tree from', options.tree_filename
    tree = dendropy.Tree()
    tree.read_from_path(options.tree_filename,
                        options.tree_format,
                        extract_comment_metadata=True)

    leaves = tree.leaf_nodes()
    print 'Tree has %d leaf nodes' % len(leaves)

    pathways = pathway.LoadPathways(options.pathways_filename)
    db = genome_db.GenomeDB(options.genome_db_filename)

    pathway_names = [util.slugify(p.name) for p in pathways]
    org_2_pathways = {}
    for path in pathways:
        org_counts = Counter()
        for enz_set in path.enzyme_sets:
            orgs_w_enz = set()
            for ec in enz_set:
                orgs_w_enz.update(list(db.OrganismsForEC(ec)))
            org_counts += Counter(orgs_w_enz)

        orgs_w_pathway = [
            o for o, c in org_counts.iteritems() if c == len(path.enzyme_sets)
        ]
        orgs_w_pathway = filter(None, map(db.KEGG2NCBI, orgs_w_pathway))

        for org in orgs_w_pathway:
            org_2_pathways.setdefault(org, set()).add(util.slugify(path.name))

    # Find the organisms that have pathway tags.
    all_labels = set([l.taxon.label for l in leaves])
    pathway_orgs = set(org_2_pathways.keys())
    intersect = all_labels.intersection(pathway_orgs)
    print len(intersect), 'pathway orgs found'
    print len(pathway_orgs) - len(intersect), 'pathway orgs not found'

    # Find organisms that are heterotrophs
    if options.only_heterotrophs:
        print 'Pruning non-heterotrophs'
        q = db.db.Execute(
            'SELECT ncbi_taxon_id, energy_category from organisms')
        ncbi_to_keep = set()
        for row in q:
            ncbi_id, energy_cat = row
            if energy_cat and energy_cat.lower() == 'organic':
                ncbi_to_keep.add(ncbi_id.strip())
        tree.retain_taxa_with_labels(ncbi_to_keep)

        leaves = tree.leaf_nodes()
        print 'Tree now contains', len(leaves), 'leaves'

    lengths = []
    for e in tree.leaf_edge_iter():
        lengths.append(e.length)
    lengths = pylab.array(lengths)
    below_thresh = pylab.find(lengths < options.edge_length_threshold).size
    pct_below = 100.0 * float(below_thresh) / float(len(lengths))

    print 'Median length', pylab.median(lengths)
    print 'Mean length', pylab.mean(lengths)
    print pct_below, '% below threshold'

    print 'Pruning leaves'
    for l in tree.leaf_nodes():
        label = l.taxon.label
        pathways = org_2_pathways.get(label, set())
        l.pathways = pathways
        for pname in pathways:
            setattr(l, pname, True)
            l.annotate(pname)

        l.count = 1
        l.annotate('count')

    while True:
        changed = False
        for e in tree.leaf_edge_iter():
            if (e.tail_node is not None
                    and e.length < options.edge_length_threshold):
                changed |= MaybeMergeChildren(e.tail_node)

        if not changed:
            break

    tree.write_to_path(options.output_filename,
                       'nexus',
                       suppress_annotations=True)

    leaves = tree.leaf_nodes()
    print 'Tree now has %d leaf nodes' % len(leaves)

    colormap = {
        'upper_emp_unique': '#008837',
        'upper_ed_unique': '#7b3294',
        'pts': '#868800',
        'glucokinase': '#002288'
    }
    default_color = '#c0c3c7'

    name_2_count = {}
    path_vectors = {}

    for name in pathway_names:
        fname = util.slugify(name) + '.csv'
        f = open(fname, 'w')
        w = csv.writer(f)
        v = []
        for leaf in leaves:
            taxon_id = leaf.taxon.label
            pathways = leaf.pathways
            d = leaf.annotations()
            value = d.get(name, (False, None))[0]
            color = default_color
            if value is True:
                color = colormap[name]
                name_2_count[name] = name_2_count.get(name, 0) + 1
                v.append(1)
            else:
                v.append(0)
            w.writerow([taxon_id, color, value])
        path_vectors[name] = pylab.array(v)
        f.close()

    pts_vector = path_vectors['pts']
    glk_vector = path_vectors['glucokinase']
    ed_vector = path_vectors['upper_ed_unique']
    emp_vector = path_vectors['upper_emp_unique']
    ed_only = np.logical_and(ed_vector, np.logical_not(emp_vector))
    emp_only = np.logical_and(emp_vector, np.logical_not(ed_vector))

    print 'ED, EMP'
    r, p_val = stats.pearsonr(ed_vector, emp_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'ED, PTS'
    r, p_val = stats.pearsonr(ed_vector, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'EMP, PTS'
    r, p_val = stats.pearsonr(emp_vector, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'ED, GLK'
    r, p_val = stats.pearsonr(ed_vector, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'EMP, GLK'
    r, p_val = stats.pearsonr(emp_vector, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'ED only, GLK'
    r, p_val = stats.pearsonr(ed_only, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'ED only, PTS'
    r, p_val = stats.pearsonr(ed_only, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'EMP only, GLK'
    r, p_val = stats.pearsonr(emp_only, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'EMP only, PTS'
    r, p_val = stats.pearsonr(emp_only, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    nleaves = len(leaves)
    for name, count in name_2_count.iteritems():
        pct = 100.0 * float(count) / float(nleaves)
        print '%.2f%% (%d of %d) have pathway %s' % (pct, count, nleaves,
                                                     str(name))

    v_or_accumulator = pylab.zeros(nleaves)
    v_and_accumulator = pylab.ones(nleaves)
    for v in path_vectors.values():
        v_or_accumulator = np.logical_or(v_or_accumulator, v)
        v_and_accumulator = np.logical_and(v_and_accumulator, v)
    total_w_any = pylab.where(v_or_accumulator == True)[0].size
    total_w_all = pylab.where(v_and_accumulator == True)[0].size
    any_pct = 100.0 * float(total_w_any) / float(nleaves)
    all_pct = 100.0 * float(total_w_all) / float(nleaves)
    print '%.2f%% (%d of %d) have some pathway' % (any_pct, total_w_any,
                                                   nleaves)
    print '%.2f%% (%d of %d) have all pathways' % (all_pct, total_w_all,
                                                   nleaves)
    fname = 'Node_Counts.csv'
    f = open(fname, 'w')
    w = csv.writer(f)
    for leaf in leaves:
        taxon = leaf.taxon
        taxon_id = taxon.label
        w.writerow([taxon_id, leaf.count])
    f.close()
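The correlation report in the Main() above repeats the same three prints for every pair of presence vectors. A small helper with identical output (a sketch using `scipy.stats.pearsonr`, which the script already calls):

from scipy import stats

def report_correlation(label, x, y):
    # Pearson R between two 0/1 presence vectors, plus R^2 and the p-value.
    r, p_val = stats.pearsonr(x, y)
    print label
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

# e.g. report_correlation('ED, EMP', ed_vector, emp_vector)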
Example #9
def Main():
    options, _ = MakeOpts().parse_args(sys.argv)
    assert options.tree_filename and options.tree_format
    assert options.pathways_filename and options.genome_db_filename
    assert options.output_filename
    
    print 'Reading tree from', options.tree_filename
    tree = dendropy.Tree()
    tree.read_from_path(options.tree_filename, options.tree_format,
                        extract_comment_metadata=True)
    
    leaves = tree.leaf_nodes()
    print 'Tree has %d leaf nodes' % len(leaves)
    
    pathways = pathway.LoadPathways(options.pathways_filename)
    db = genome_db.GenomeDB(options.genome_db_filename)
    
    pathway_names = [util.slugify(p.name) for p in pathways]
    org_2_pathways = {}
    for path in pathways:
        org_counts = Counter()
        for enz_set in path.enzyme_sets:
            orgs_w_enz = set()
            for ec in enz_set:
                orgs_w_enz.update(list(db.OrganismsForEC(ec)))                
            org_counts += Counter(orgs_w_enz)
            
        orgs_w_pathway = [o for o,c in org_counts.iteritems()
                          if c == len(path.enzyme_sets)]
        orgs_w_pathway = filter(None, map(db.KEGG2NCBI, orgs_w_pathway))
        
        for org in orgs_w_pathway:
            org_2_pathways.setdefault(org, set()).add(util.slugify(path.name))

    print 'Finding oxygen requirements'
    org_2_oxy_req = {}
    for row in db.SelectOrganismFields(['ncbi_taxon_id', 'broad_oxygen_requirement']):
        ncbi_id, oxy_req = row
        org_2_oxy_req[ncbi_id] = oxy_req
    observed_oxygen_reqs = set(org_2_oxy_req.values())
    print 'Observed oxygen requirements', observed_oxygen_reqs
    

    # Find the organisms that have pathway tags.         
    all_labels = set([l.taxon.label for l in leaves])
    pathway_orgs = set(org_2_pathways.keys())
    intersect = all_labels.intersection(pathway_orgs)
    print len(intersect), 'pathway orgs found'
    print len(pathway_orgs) - len(intersect), 'pathway orgs not found'
    
    # Find organisms that are heterotrophs
    if options.only_heterotrophs:
        print 'Pruning non-heterotrophs'
        q = db.db.Execute('SELECT ncbi_taxon_id, energy_category from organisms')
        ncbi_to_keep = set()
        for row in q:
            ncbi_id, energy_cat = row
            if energy_cat and energy_cat.lower() == 'organic':
                ncbi_to_keep.add(ncbi_id.strip())
        tree.retain_taxa_with_labels(ncbi_to_keep)
    
        leaves = tree.leaf_nodes()
        print 'Tree now contains', len(leaves), 'leaves'
    
    lengths = []
    for e in tree.leaf_edge_iter():
        lengths.append(e.length)
    lengths = pylab.array(lengths)
    below_thresh = pylab.find(lengths < options.edge_length_threshold).size
    pct_below = 100.0 * float(below_thresh) / float(len(lengths))
    
    print 'Median length', pylab.median(lengths)
    print 'Mean length', pylab.mean(lengths)
    print pct_below, '% below threshold'
    
    print 'Pruning leaves'            
    for l in tree.leaf_nodes():
        label = l.taxon.label
        pathways = org_2_pathways.get(label, set())
        l.pathways = pathways
        for pname in pathways:
            setattr(l, pname, True)
            l.annotate(pname)
        
        oxy_req = org_2_oxy_req.get(label, None)
        l.oxygen_req = {oxy_req: 1}
        l.annotate('oxygen_req')
        
        l.count = 1
        l.annotate('count')

    while True:
        changed = False
        for e in tree.leaf_edge_iter():
            if (e.tail_node is not None and
                e.length < options.edge_length_threshold):
                changed |= MaybeMergeChildren(e.tail_node)
        
        if not changed:
            break
        
    tree.write_to_path(options.output_filename, 'nexus',
                       suppress_annotations=True)

    leaves = tree.leaf_nodes()
    print 'Tree now has %d leaf nodes' % len(leaves)

    colormap = {'upper_emp_unique': '#008837',
                'upper_ed_unique': '#7b3294',
                'nonp_ed_unique': '#868800'}
    default_color = '#c0c3c7'
    
    name_2_count = {}
    path_vectors = {}
    
    for name in pathway_names:
        fname = util.slugify(name) + '.csv'
        f = open(fname, 'w')
        w = csv.writer(f)
        v = []
        for leaf in leaves:
            taxon_id = leaf.taxon.label
            pathways = leaf.pathways
            d = leaf.annotations()
            value = d.get(name, (False, None))[0]
            color = default_color
            if value is True:
                color = colormap.get(name, default_color)
                name_2_count[name] = name_2_count.get(name, 0) + 1
                v.append(1)
            else:
                v.append(0)
            w.writerow([taxon_id, color, value])
        path_vectors[name] = pylab.array(v)    
        f.close()
    
    ed_vector  = path_vectors['upper_ed_unique']
    emp_vector = path_vectors['upper_emp_unique']
    r, p_val = stats.pearsonr(ed_vector, emp_vector)
    print 'Pearson correlation coefficient (r)', r
    print 'R^2', r**2
    print 'p-value', p_val


    mat_shape = (len(observed_oxygen_reqs), 4)
    count_mat = np.matrix(np.zeros(mat_shape))
    prob_mat  = np.matrix(np.zeros(mat_shape))
    oxy_req_idx_map = dict((i, k) for i, k in enumerate(sorted(observed_oxygen_reqs)))
    column_labels = {0: 'None',
                     1: 'EMP Only',
                     2: 'ED Only',
                     3: 'Both'}
    
    for leaf_idx, leaf in enumerate(leaves):
        d = leaf.annotations()
        oxygen_reqs = d.get('oxygen_req', (None, None))[0]
        total_count = float(sum(oxygen_reqs.values()))
        
        ed_presence  = ed_vector[leaf_idx]
        emp_presence = emp_vector[leaf_idx]
        for i, oxy_req in oxy_req_idx_map.iteritems():
            oxy_frac = oxygen_reqs.get(oxy_req, 0) / total_count
            if ed_presence and emp_presence:
                count_mat[i, 3] += oxy_frac
            elif ed_presence:
                count_mat[i, 2] += oxy_frac
            elif emp_presence:
                count_mat[i, 1] += oxy_frac
            else:
                count_mat[i, 0] += oxy_frac
    
    total_samples = float(np.sum(count_mat))
    ed_samples    = float(np.sum(ed_vector))
    emp_samples   = float(np.sum(emp_vector))
    ed_prob       = ed_samples / total_samples
    emp_prob      = emp_samples / total_samples
    for i, oxy_req in oxy_req_idx_map.iteritems():
        oxy_req_count  = float(np.sum(count_mat[i,:]))
        oxy_req_prob   = oxy_req_count / total_samples
        # Probability of neither pathway
        prob_mat[i, 0] = (1-ed_prob) * (1-emp_prob) * oxy_req_prob
        # Probability of EMP only
        prob_mat[i, 1] = (1-ed_prob) * (emp_prob) * oxy_req_prob
        # Probability of ED only
        prob_mat[i, 2] = (ed_prob) * (1-emp_prob) * oxy_req_prob
        # Probability of both
        prob_mat[i, 3] = (ed_prob) * (emp_prob) * oxy_req_prob
    
    p_vals = mystats.CalcPValues(count_mat, prob_mat)
    print 'Counts'
    print count_mat
    print 'Probabilities assuming random sampling'
    print prob_mat
    print 'Mappings'
    print oxy_req_idx_map
    print column_labels
    print 'P-values'
    print p_vals
    
    # Plot the p-values
    pylab.figure()
    xs = sorted(column_labels.keys())
    xticks = [column_labels[i] for i in xs]
    ys = sorted(oxy_req_idx_map.keys())
    yticks = [oxy_req_idx_map[j] for j in ys]
    
    sigs = p_vals < 0.05
    super_sigs = p_vals < 0.001
    rows, cols = sigs.shape
    for i in xrange(rows):
        for j in xrange(cols):
            if super_sigs[i,j]:
                print oxy_req_idx_map[i], 'x', column_labels[j],
                print '**', p_vals[i,j]
                pylab.text(j, i, '**', color='w')
            elif sigs[i,j]:
                print oxy_req_idx_map[i], 'x', column_labels[j],
                print '*', p_vals[i,j]
                pylab.text(j, i, '*', color='w')
    
    
    pylab.imshow(p_vals, interpolation='nearest')
    pylab.xticks(xs, xticks)
    pylab.yticks(ys, yticks)
    pylab.colorbar()
    
    
    # Plot the bar plot breakdown.
    
    restricted_counts = np.matrix(np.zeros((3,3)))
    allowed_genotypes = ['ED Only', 'Both', 'EMP Only']
    allowed_phenotypes = ['anaerobe', 'facultative', 'aerobe']
    
    for j, genotype in column_labels.iteritems():
        if genotype not in allowed_genotypes:
            continue
        
        new_j = allowed_genotypes.index(genotype)
        
        for i, phenotype in oxy_req_idx_map.iteritems():
            if phenotype not in allowed_phenotypes:
                continue
            
            new_i = allowed_phenotypes.index(phenotype)
            restricted_counts[new_i, new_j] = count_mat[i,j]
    
    pcts_matrix = restricted_counts / np.sum(restricted_counts, 1)
    print 'Phenotypes (rows)'
    print allowed_phenotypes
    print 'Genotypes (cols)'
    print allowed_genotypes
    print 'Counts of interesting categories'
    print restricted_counts
    print 'PCTs including interesting categories'
    print pcts_matrix * 100.0
    print 'Effective number of organisms', np.sum(restricted_counts)
    
    colors = ['#37DD6F', '#FF5D40', '#4186D3']
    pylab.figure()
    current_bottom = pylab.zeros(3)
    rows, cols = pcts_matrix.shape
    for j in xrange(cols):
        heights = np.array(pcts_matrix[:,j].flat)
        xs = np.arange(heights.size)
        pylab.bar(xs, heights, bottom=current_bottom,
                  color=colors[j], edgecolor='w',
                  label=allowed_genotypes[j],
                  align='center')
        current_bottom += heights
    xs = pylab.arange(3)  # bars use align='center', so ticks sit at the bar centers
    pylab.xticks(xs, allowed_phenotypes)
    pylab.legend()
    pylab.show()
    
    
    colormap = {'Organic': '#ff0000',
                'Inorganic': '#00ff00',
                'aerobe': '#2861e4',
                'anaerobe': '#e44228',
                'facultative': '#e4c328'}
    fname = 'trophism.csv'
    f = open(fname, 'w')
    w = csv.writer(f)
    for l in leaves:
        label = l.taxon.label
        cat = db.NCBI2EnergyCategory(label)
        color = colormap.get(cat, default_color)
        w.writerow([label, color, cat])
    f.close()

    fname = 'oxy_req.csv'
    f = open(fname, 'w')
    w = csv.writer(f)
    for l in leaves:
        label = l.taxon.label
        cat = db.NCBI2BroadOxygenReq(label)
        color = colormap.get(cat, default_color)
        w.writerow([label, color, cat])
    f.close()
    
    nleaves = len(leaves)
    for name, count in name_2_count.iteritems():
        pct = 100.0 * float(count) / float(nleaves)
        print '%.2f%% (%d of %d) have pathway %s' % (pct, count, nleaves,
                                                    str(name))
    
    v_or_accumulator  = pylab.zeros(nleaves)
    v_and_accumulator = pylab.ones(nleaves)
    for v in path_vectors.values():
        v_or_accumulator  = np.logical_or(v_or_accumulator, v)
        v_and_accumulator = np.logical_and(v_and_accumulator, v)
    total_w_any = pylab.where(v_or_accumulator == True)[0].size
    total_w_all = pylab.where(v_and_accumulator == True)[0].size
    any_pct = 100.0 * float(total_w_any) / float(nleaves)
    all_pct = 100.0 * float(total_w_all) / float(nleaves)
    print '%.2f%% (%d of %d) have some pathway' % (any_pct,
                                                   total_w_any,
                                                   nleaves)
    print '%.2f%% (%d of %d) have all pathways' % (all_pct,
                                                   total_w_all,
                                                   nleaves)
    fname = 'Node_Counts.csv'
    f = open(fname, 'w')
    w = csv.writer(f)
    for leaf in leaves:
        taxon = leaf.taxon
        taxon_id = taxon.label
        w.writerow([taxon_id, leaf.count]) 
    f.close()
Example #10
def Main():
    options, _ = MakeOpts().parse_args(sys.argv)
    assert options.tree_filename and options.tree_format
    assert options.pathways_filename and options.genome_db_filename
    assert options.output_filename
    
    print 'Reading tree from', options.tree_filename
    tree = dendropy.Tree()
    tree.read_from_path(options.tree_filename, options.tree_format,
                        extract_comment_metadata=True)
    
    leaves = tree.leaf_nodes()
    print 'Tree has %d leaf nodes' % len(leaves)
    
    pathways = pathway.LoadPathways(options.pathways_filename)
    db = genome_db.GenomeDB(options.genome_db_filename)
    
    pathway_names = [util.slugify(p.name) for p in pathways]
    org_2_pathways = {}
    for path in pathways:
        org_counts = Counter()
        for enz_set in path.enzyme_sets:
            orgs_w_enz = set()
            for ec in enz_set:
                orgs_w_enz.update(list(db.OrganismsForEC(ec)))                
            org_counts += Counter(orgs_w_enz)
            
        orgs_w_pathway = [o for o,c in org_counts.iteritems()
                          if c == len(path.enzyme_sets)]
        orgs_w_pathway = filter(None, map(db.KEGG2NCBI, orgs_w_pathway))
        
        for org in orgs_w_pathway:
            org_2_pathways.setdefault(org, set()).add(util.slugify(path.name))

    # Find the organisms that have pathway tags.         
    all_labels = set([l.taxon.label for l in leaves])
    pathway_orgs = set(org_2_pathways.keys())
    intersect = all_labels.intersection(pathway_orgs)
    print len(intersect), 'pathway orgs found'
    print len(pathway_orgs) - len(intersect), 'pathway orgs not found'
    
    # Find organisms that are heterotrophs
    if options.only_heterotrophs:
        print 'Pruning non-heterotrophs'
        q = db.db.Execute('SELECT ncbi_taxon_id, energy_category from organisms')
        ncbi_to_keep = set()
        for row in q:
            ncbi_id, energy_cat = row
            if energy_cat and energy_cat.lower() == 'organic':
                ncbi_to_keep.add(ncbi_id.strip())
        tree.retain_taxa_with_labels(ncbi_to_keep)
    
        leaves = tree.leaf_nodes()
        print 'Tree now contains', len(leaves), 'leaves'
    
    lengths = []
    for e in tree.leaf_edge_iter():
        lengths.append(e.length)
    lengths = pylab.array(lengths)
    below_thresh = pylab.find(lengths < options.edge_length_threshold).size
    pct_below = 100.0 * float(below_thresh) / float(len(lengths))
    
    print 'Median length', pylab.median(lengths)
    print 'Mean length', pylab.mean(lengths)
    print pct_below, '% below threshold'
    
    print 'Pruning leaves'            
    for l in tree.leaf_nodes():
        label = l.taxon.label
        pathways = org_2_pathways.get(label, set())
        l.pathways = pathways
        for pname in pathways:
            setattr(l, pname, True)
            l.annotate(pname)
        
        l.count = 1
        l.annotate('count')

    while True:
        changed = False
        for e in tree.leaf_edge_iter():
            if (e.tail_node is not None and
                e.length < options.edge_length_threshold):
                changed |= MaybeMergeChildren(e.tail_node)
        
        if not changed:
            break
        
    tree.write_to_path(options.output_filename, 'nexus',
                       suppress_annotations=True)

    leaves = tree.leaf_nodes()
    print 'Tree now has %d leaf nodes' % len(leaves)

    colormap = {'upper_emp_unique': '#008837',
                'upper_ed_unique': '#7b3294',
                'pts': '#868800',
                'glucokinase': '#002288'}
    default_color = '#c0c3c7'
    
    name_2_count = {}
    path_vectors = {}
    
    for name in pathway_names:
        fname = util.slugify(name) + '.csv'
        f = open(fname, 'w')
        w = csv.writer(f)
        v = []
        for leaf in leaves:
            taxon_id = leaf.taxon.label
            pathways = leaf.pathways
            d = leaf.annotations()
            value = d.get(name, (False, None))[0]
            color = default_color
            if value is True:
                color = colormap[name]
                name_2_count[name] = name_2_count.get(name, 0) + 1
                v.append(1)
            else:
                v.append(0)
            w.writerow([taxon_id, color, value])
        path_vectors[name] = pylab.array(v)    
        f.close()
    
    pts_vector  = path_vectors['pts']
    glk_vector  = path_vectors['glucokinase']
    ed_vector = path_vectors['upper_ed_unique']
    emp_vector = path_vectors['upper_emp_unique']
    ed_only = np.logical_and(ed_vector, 
                             np.logical_not(emp_vector))
    emp_only = np.logical_and(emp_vector, 
                              np.logical_not(ed_vector))
    
    
    print 'ED, EMP'
    r, p_val = stats.pearsonr(ed_vector, emp_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val
    
    print 'ED, PTS'
    r, p_val = stats.pearsonr(ed_vector, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val
    
    print 'EMP, PTS'
    r, p_val = stats.pearsonr(emp_vector, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'ED, GLK'
    r, p_val = stats.pearsonr(ed_vector, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'EMP, GLK'
    r, p_val = stats.pearsonr(emp_vector, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'ED only, GLK'
    r, p_val = stats.pearsonr(ed_only, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'ED only, PTS'
    r, p_val = stats.pearsonr(ed_only, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val
    
    print 'EMP only, GLK'
    r, p_val = stats.pearsonr(emp_only, glk_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val

    print 'EMP only, PTS'
    r, p_val = stats.pearsonr(emp_only, pts_vector)
    print 'R', r
    print 'R^2', r**2
    print 'p-value', p_val
    
    nleaves = len(leaves)
    for name, count in name_2_count.iteritems():
        pct = 100.0 * float(count) / float(nleaves)
        print '%.2f%% (%d of %d) have pathway %s' % (pct, count, nleaves,
                                                    str(name))
    
    v_or_accumulator  = pylab.zeros(nleaves)
    v_and_accumulator = pylab.ones(nleaves)
    for v in path_vectors.values():
        v_or_accumulator  = np.logical_or(v_or_accumulator, v)
        v_and_accumulator = np.logical_and(v_and_accumulator, v)
    total_w_any = pylab.where(v_or_accumulator == True)[0].size
    total_w_all = pylab.where(v_and_accumulator == True)[0].size
    any_pct = 100.0 * float(total_w_any) / float(nleaves)
    all_pct = 100.0 * float(total_w_all) / float(nleaves)
    print '%.2f%% (%d of %d) have some pathway' % (any_pct,
                                                   total_w_any,
                                                   nleaves)
    print '%.2f%% (%d of %d) have all pathways' % (all_pct,
                                                   total_w_all,
                                                   nleaves)
    fname = 'Node_Counts.csv'
    f = open(fname, 'w')
    w = csv.writer(f)
    for leaf in leaves:
        taxon = leaf.taxon
        taxon_id = taxon.label
        w.writerow([taxon_id, leaf.count]) 
    f.close()
Example #11
class GenomeDB(object):

    OXY_REQ = 'Oxygen Requirement'
    CSV_HEADERS = [
        'Genome Name', 'Strain', 'Genome Status', 'KEGG ID', 'NCBI Taxon ID',
        'Project ID', 'RefSeq Project ID', 'Super Kingdom', 'Genus',
        'Gram Stain', 'Shape', 'Motility', 'Pathogenic in', 'Genome Size',
        'GC Content', 'Salinity', 'Temperature Range', 'Habitat', OXY_REQ,
        'Energy Source', 'Energy Category', 'Metabolism', 'Sequencing Center'
    ]
    CSV_HEADER_MAPPING = dict((util.slugify(h), h) for h in CSV_HEADERS)
    CSV_HEADER_MAPPING['phylogenetic_group'] = 'Group'
    CSV_HEADER_MAPPING['phylogenetic_order'] = 'Order'
    ORG_TABLE_HEADERS = map(util.slugify,
                            CSV_HEADERS) + ['broad_oxygen_requirement']
    ENZ_TABLE_HEADERS = ['organism', 'EC']

    def __init__(self, db_filename):
        self.db = database.SqliteDatabase(db_filename)

    def _InitTables(self):
        self.db.CreateTable('organisms',
                            self.ORG_TABLE_HEADERS,
                            drop_if_exists=False)
        self.db.CreateTable('organism_enzymes',
                            self.ENZ_TABLE_HEADERS,
                            drop_if_exists=False)

    def OrganismsForEC(self, ec):
        q = self.db.Execute(
            "SELECT organism FROM organism_enzymes WHERE EC='%s'" % ec)
        for i in q:
            yield i[0]

    def KEGG2NCBI(self, kegg_id):
        q = self.db.Execute(
            "SELECT ncbi_taxon_id FROM organisms WHERE kegg_id='%s'" % kegg_id)
        q = list(q)
        if not q:
            return None
        return q[0][0]

    def NCBI2EnergyCategory(self, ncbi_taxon):
        q = self.db.Execute(
            "SELECT energy_category FROM organisms WHERE ncbi_taxon_id='%s'" %
            ncbi_taxon)
        q = list(q)
        if not q:
            return None
        return q[0][0]

    def NCBI2BroadOxygenReq(self, ncbi_taxon):
        q = self.db.Execute(
            "SELECT broad_oxygen_requirement FROM organisms WHERE ncbi_taxon_id='%s'"
            % ncbi_taxon)
        q = list(q)
        if not q:
            return None
        return q[0][0]

    def KEGG2BroadOxygenReq(self, kegg_id):
        q = self.db.Execute(
            "SELECT broad_oxygen_requirement FROM organisms WHERE kegg_id='%s'"
            % kegg_id)
        q = list(q)
        if not q:
            return None
        return q[0][0]

    def KEGG2EnergyCategory(self, kegg_id):
        q = self.db.Execute(
            "SELECT energy_category FROM organisms WHERE kegg_id='%s'" %
            kegg_id)
        q = list(q)
        if not q:
            return None
        return q[0][0]

    def KEGG2EnergySource(self, kegg_id):
        q = self.db.Execute(
            "SELECT energy_source FROM organisms WHERE kegg_id='%s'" % kegg_id)
        q = list(q)
        if not q:
            return None
        return q[0][0]

    def KEGG2Metabolism(self, kegg_id):
        q = self.db.Execute(
            "SELECT metabolism FROM organisms WHERE kegg_id='%s'" % kegg_id)
        q = list(q)
        if not q:
            return None
        return q[0][0]

    def SelectOrganismFields(self, fields):
        query = 'SELECT %s FROM organisms' % ', '.join(fields)
        return self.db.Execute(query)

    @staticmethod
    def GetBroadyOxyReq(req_str):
        if not req_str:
            return None

        l_req = req_str.lower()
        if 'anaer' in l_req:
            return 'anaerobe'
        if 'facult' in l_req:
            return 'facultative'
        if 'microaero' in l_req:
            return 'microaerophile'
        if 'aerob' in l_req:
            return 'aerobe'

        raise ValueError("Could not parse oxygen requirement: %s" % req_str)

    def Populate(self, filename):
        """Populates the database from files."""
        self._InitTables()

        f = open(filename)
        r = csv.DictReader(f)

        for row in r:
            insert_row = []
            for table_header in self.ORG_TABLE_HEADERS:
                if table_header not in self.CSV_HEADER_MAPPING:
                    insert_row.append(None)
                    continue

                csv_header = self.CSV_HEADER_MAPPING[table_header]
                val = row.get(csv_header, None)
                if val and val.strip():
                    insert_row.append(val)
                else:
                    insert_row.append(None)

            oxy_req = row.get(self.OXY_REQ, None)
            broad_req = self.GetBroadyOxyReq(oxy_req)
            insert_row[-1] = broad_req

            self.db.Insert('organisms', insert_row)
        f.close()

        k = Kegg.getInstance(loadFromAPI=False)
        enzyme_map = k.ec2enzyme_map
        for ec, enzyme in enzyme_map.iteritems():
            for org in enzyme.genes.keys():
                self.db.Insert('organism_enzymes', [org.lower(), ec])
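The lookup methods above interpolate identifiers straight into the SQL text. The `database.SqliteDatabase` wrapper's API is not shown here, so whether it supports parameter binding is unknown; purely as an illustration, the same lookup written with the standard `sqlite3` module and a `?` placeholder:

import sqlite3

def kegg_to_ncbi(conn, kegg_id):
    # Return the NCBI taxon id for a KEGG organism id, or None if absent.
    cur = conn.execute(
        'SELECT ncbi_taxon_id FROM organisms WHERE kegg_id = ?', (kegg_id,))
    row = cur.fetchone()
    return row[0] if row else None

# conn = sqlite3.connect('genomes.sqlite')   # hypothetical database file
# print kegg_to_ncbi(conn, 'eco')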