Exemplo n.º 1
0
def main(otu_table, mapping_data, categories, output_dir, \
    samples_to_plot = None, legend = False, xaxis = True):
    """Creates stacked bar plots for an otu table
    INPUTS:
        otu_table -- an open OTU table

        mapping_data -- a tab delimited string containing the mapping data 
                    passed from the mapping file.

        categories -- a dictionary keying a mapping category to the 
                    corresponding sample IDs and taxonomy for a collapsed 
                    biom table

        output_dir -- the location of the directory where output files should be
                    saved. If this directory does not exist, it will be created.

        samples_to_plot -- a list of sample ids to plot. If no value is passed, 
                    then all samples in the biom table are analyzed.

    OUTPUTS:
        A pdf of stacked taxonomy will be generated for each sample and saved 
        in the output directory. These will follow the file name format 
        Figure_4_<SAMPLEID>.pdf
    """
    # Sets constants
    LEVEL = 2
    FILEPREFIX = 'Figure_4_'
    MICHAEL_POLLAN = '000007108.1075657'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7
    
    # Loads the mapping file
    map_dict = map_to_2D_dict(mapping_data)
    
    (common_taxa, whole_sample_ids, whole_summary) = \
        summarize_human_taxa(otu_table, LEVEL)

    # Converts final taxa to a clean list
    common_phyla = []
    for taxon in common_taxa: 
        common_phyla.append(taxon[1].strip(' p__').strip('[').strip(']'))
    common_taxa = common_phyla
   
    # Checks that the correct sample ids are plotted
    if samples_to_plot == None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    # Identifies Michael Pollan's pre-ABX sample
    mp_sample_pos = whole_sample_ids.index(MICHAEL_POLLAN)
    mp_sample_taxa = whole_summary[:,mp_sample_pos]

    # Loads the category dictionary
    categories = load_category_files(category_fp, LEVEL)

    # Generates a figure for each sample
    for idx, sample_id in enumerate(whole_sample_ids):
        if sample_id in sample_ids:
            # Preallocates a numpy array for the plotting data
            tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))        
            meta_data = map_dict[sample_id] 
            cat_list = ['You', 'Average', 'Similar Diet', ' Similar BMI', 
                        'Same Gender', 'Similar Age', 
                        'Michael Pollan', '']

            #cat_list.append('Your Fecal Sample')
            #cat_list.append('Average Fecal Samples')
        
            tax_array[:,0] = whole_summary[:,idx]
            tax_array[:,1] = mean(whole_summary, 1)
        
            cat_watch = 2
            # Identifies the appropriate metadata categories
            for cat in categories:                      
                # Pulls metadata for the sample and category
                mapping_key = meta_data[cat]
                # Pulls taxonomic summary and group descriptions for the category
                tax_summary = categories[cat]['Taxa Summary']
                group_descriptions = categories[cat]['Groups']               
                # Amends plotting tables
                try:
                    mapping_col = group_descriptions.index(mapping_key)
                except:
                    raise ValueError, 'The %s cannot be found in %s.' \
                    % (mapping_key, cat)
                tax_array[:,cat_watch] = tax_summary[:,mapping_col]

                cat_watch = cat_watch + 1

            tax_array[:,-1] = mp_sample_taxa
            # Plots the data
            filename = pjoin(output_dir, '%s%s.pdf' \
                % (FILEPREFIX, sample_id))
            plot_american_gut(tax_array, filename)
    def test_summarize_human_taxa(self):
       # Defines the known values
        known_ids = ['00010', '00100', '00200', '00111', '00112', '00211']

        known_taxa = [(u'k__Bacteria', u' p__Firmicutes'),
                      (u'k__Bacteria', u' p__Bacteroidetes'),
                      (u'k__Bacteria', u' p__Proteobacteria'),
                      (u'k__Bacteria', u' p__Actinobacteria'),
                      (u'k__Bacteria', u' p__Verrucomicrobia'),
                      (u'k__Bacteria', u' p__Tenericutes'),
                      (u'k__Bacteria', u' p__Cyanobacteria'),
                      (u'k__Bacteria', u' p__Fusobacteria'),
                      (u'k__Bacteria', u' p__Other')]
        
        table_known = array([[0.5000, 0.2500, 0.5500, 0.3000, 0.45000, 0.1000],
                             [0.3000, 0.5000, 0.3500, 0.6200, 0.45000, 0.7500],
                             [0.0000, 0.0000, 0.0000, 0.0000, 0.00000, 0.0000],
                             [0.0800, 0.0000, 0.0000, 0.0030, 0.00000, 0.0000],
                             [0.0500, 0.0100, 0.0020, 0.0010, 0.00000, 0.0000],
                             [0.0200, 0.0800, 0.0500, 0.0020, 0.05000, 0.0000],
                             [0.0010, 0.0200, 0.0000, 0.0000, 0.00010, 0.0000],
                             [0.0080, 0.0000, 0.0020, 0.0000, 0.00000, 0.0500],
                             [0.0210, 0.1100, 0.0460, 0.0240, 0.01990, 0.0000]])

        # Creates an otu table for testing which corresponds with the 
        sample_ids = ['00010', '00100', '00200', '00111', '00112', '00211']

        observation_ids = ['1001', '2001', '2002', '2003', '3001', '3003', 
                           '4001', '5001', '6001', '7001', '8001', '9001', 
                           '9002', '9003']

        observation_md = [{'taxonomy': (u'k__Bacteria', u' p__Bacteroidetes',
                                        u'c__Bacteroidia')},
                          {'taxonomy': (u'k__Bacteria', u' p__Firmicutes',
                                        u'c__Clostridia')},
                          {'taxonomy': (u'k__Bacteria', u' p__Firmicutes',
                                        u'c__Erysipelotrichi')},
                          {'taxonomy': (u'k__Bacteria', u' p__Firmicutes',
                                        u'c__Bacilli')},
                          {'taxonomy': (u'k__Bacteria', u' p__Proteobacteria',
                                        u'c__Alphaproteobacteria')},
                          {'taxonomy': (u'k__Bacteria', u' p__Proteobacteria', 
                                        u'c__Gammaproteobacteria')},
                          {'taxonomy': (u'k__Bacteria', u' p__Tenericutes', 
                                        u'c__Mollicutes')},
                          {'taxonomy': (u'k__Bacteria', u' p__Actinobacteria', 
                                        u'c__Coriobacteriia')},
                          {'taxonomy': (u'k__Bacteria', u' p__Verrucomicrobia', 
                                        u'c__Verrucomicrobiae')},
                          {'taxonomy': (u'k__Bacteria', u' p__Cyanobacteria', 
                                        u'c__4C0d-2')},
                          {'taxonomy': (u'k__Bacteria', u' p__Fusobacteria', 
                                        u'c__Fusobacteriia')},
                          {'taxonomy': (u'k__Bacteria', u' p__TM7', 
                                        u'c__TM7-2')},
                          {'taxonomy': (u'k__Bacteria', u' p__Acidobacteria', 
                                        u'c__Chloracidobacteria')},
                          {'taxonomy': (u'k__Bacteria', u' p__', u'c__')}]

        data = array([[ 1691,  3004, 18606,  6914,  1314, 22843],
                      [ 2019,  1091,  8163,  1112,   738,  2362],
                      [   67,     4,  2835,   310,    85,   161],
                      [  731,   407, 18240,  1924,   492,   522],
                      [    8,     1,     0,    53,     8,   275],
                      [  105,   179,     0,   504,    79,  2771],
                      [  451,     0,     0,    33,     0,     0],
                      [  282,    60,   106,    11,     0,     0],
                      [  113,   481,  2658,    22,   146,     0],
                      [    6,   120,     0,     0,     0,     0],
                      [   45,     0,   106,     0,     0,  1523],
                      [   39,   341,  1761,   139,    18,     0],
                      [   21,   268,   153,     8,    15,     0],
                      [   59,    51,   531,   120,    25,     0]])                     

        otu_table = table_factory(data = data, sample_ids = sample_ids, \
                                  observation_ids = observation_ids, \
                                  observation_metadata = observation_md, \
                                  constructor = SparseOTUTable)

        # Tests that the table is summarized correctly
        (test_taxa, test_ids, test_table) = summarize_human_taxa(otu_table, 2)

        # Checks that all the outputs are correct
        self.assertEqual(test_ids, known_ids)
        self.assertEqual(test_taxa, known_taxa)
        self.assertEqual(test_table.all(), table_known.all())