def test_summarize_common_categories(self):
        """Compares summarize_common_categories output to known values

        The metadata category 'Billy_Joel_Song' is first expected to raise a
        ValueError. The function is then called without a metadata category
        and its three outputs (ids, table, categories) are compared with
        hard-coded references.
        """
        # Reference sample ids
        expected_ids = ('00010', '00100', '00200', '00111', '00112', '00211')

        # Reference categories: every phylum is paired with the same kingdom
        phylum_names = (u'p__Firmicutes',
                        u'p__Bacteroidetes',
                        u'p__Proteobacteria',
                        u'p__Actinobacteria',
                        u'p__Verrucomicrobia',
                        u'p__Tenericutes',
                        u'p__Cyanobacteria',
                        u'p__Fusobacteria',
                        u'p__Other')
        expected_cats = [(u'k__Bacteria', phylum) for phylum in phylum_names]

        # Reference table: one row per category, one column per sample id
        expected_table = array([
            [0.49973390, 0.25004162, 0.55001035,
             0.30008969, 0.45034247, 0.09997702],
            [0.29998226, 0.50008324, 0.35000658,
             0.62008969, 0.45000000, 0.75000821],
            [0.02004612, 0.02996504, 0.00000000,
             0.04995516, 0.02979452, 0.10000985],
            [0.05002661, 0.00998835, 0.00199402,
             0.00098655, 0.00000000, 0.00000000],
            [0.02004612, 0.08007325, 0.05000094,
             0.00197309, 0.05000000, 0.00000000],
            [0.08000710, 0.00000000, 0.00000000,
             0.00295964, 0.00000000, 0.00000000],
            [0.00106440, 0.01997669, 0.00000000,
             0.00000000, 0.00000000, 0.00000000],
            [0.00798297, 0.00000000, 0.00199402,
             0.00000000, 0.00000000, 0.05000492],
            [0.02111052, 0.10987182, 0.04599409,
             0.02394619, 0.01986301, 0.00000000],
        ])

        # An unusable metadata category must be rejected with a ValueError
        with self.assertRaises(ValueError):
            summarize_common_categories(
                biom_table=self.otu_table,
                level=2,
                common_categories=self.common_cats,
                metadata_category='Billy_Joel_Song')

        # Runs the function under test; exactly three outputs are expected
        observed = summarize_common_categories(
            biom_table=self.otu_table,
            level=2,
            common_categories=self.common_cats)
        observed_ids, observed_table, observed_cats = observed

        # Compares each output with its reference
        self.assertEqual(tuple(observed_ids), expected_ids)
        assert_almost_equal(observed_table, expected_table, decimal=4)
        self.assertEqual(observed_cats, expected_cats)
# Example #2
# 0
    def test_summarize_common_categories(self):
        """Checks that summarize_common_categories is sane

        First checks that a ValueError is raised for the metadata category
        'Billy_Joel_Song', then compares the three outputs of a call without
        a metadata category (sample ids, summary table, common categories)
        against known values.
        """
        # Defines the known values
        known_ids = ('00010', '00100', '00200', '00111', '00112', '00211')

        table_known = array([[0.49973390, 0.25004162, 0.55001035,
                              0.30008969, 0.45034247, 0.09997702],
                             [0.29998226, 0.50008324, 0.35000658,
                              0.62008969, 0.45000000, 0.75000821],
                             [0.02004612, 0.02996504, 0.00000000,
                              0.04995516, 0.02979452, 0.10000985],
                             [0.05002661, 0.00998835, 0.00199402,
                              0.00098655, 0.00000000, 0.00000000],
                             [0.02004612, 0.08007325, 0.05000094,
                              0.00197309, 0.05000000, 0.00000000],
                             [0.08000710, 0.00000000, 0.00000000,
                              0.00295964, 0.00000000, 0.00000000],
                             [0.00106440, 0.01997669, 0.00000000,
                              0.00000000, 0.00000000, 0.00000000],
                             [0.00798297, 0.00000000, 0.00199402,
                              0.00000000, 0.00000000, 0.05000492],
                             [0.02111052, 0.10987182, 0.04599409,
                              0.02394619, 0.01986301, 0.00000000]])

        known_common_cats = [(u'k__Bacteria', u'p__Firmicutes'),
                             (u'k__Bacteria', u'p__Bacteroidetes'),
                             (u'k__Bacteria', u'p__Proteobacteria'),
                             (u'k__Bacteria', u'p__Actinobacteria'),
                             (u'k__Bacteria', u'p__Verrucomicrobia'),
                             (u'k__Bacteria', u'p__Tenericutes'),
                             (u'k__Bacteria', u'p__Cyanobacteria'),
                             (u'k__Bacteria', u'p__Fusobacteria'),
                             (u'k__Bacteria', u'p__Other')]

        # Checks that appropriate errors are raised when the wrong type of
        # argument is passed.
        with self.assertRaises(ValueError):
            summarize_common_categories(biom_table=self.otu_table,
                                        level=2,
                                        common_categories=self.common_cats,
                                        metadata_category='Billy_Joel_Song')

        # Calculates the test values
        test_ids, test_table, test_common_cats = \
            summarize_common_categories(biom_table=self.otu_table,
                                        level=2,
                                        common_categories=self.common_cats)

        # Checks that all the outputs are correct. The ids are converted to a
        # tuple before comparison: comparing an array-like of ids directly
        # with a tuple is elementwise, which assertEqual cannot evaluate as a
        # single truth value.
        self.assertEqual(tuple(test_ids), known_ids)
        assert_almost_equal(test_table, table_known, decimal=4)
        self.assertEqual(test_common_cats, known_common_cats)
def main(otu_table,
         mapping_data,
         cat_tables,
         output_dir,
         sample_type='fecal',
         samples_to_plot=None,
         legend=False,
         xaxis=True,
         debug=False):
    """Creates a stacked taxonomy bar plot for a single sample

    INPUTS:
        otu_table -- an open OTU table. It is summarized at the phylum level
                    with summarize_common_categories.

        mapping_data -- a tab delimited string containing the mapping data
                    passed from the mapping file. It is parsed with
                    map_to_2D_dict and looked up by sample id.

        cat_tables -- a dictionary keying a mapping category to the
                    corresponding biom table. It is passed to
                    parse_category_files as raw_tables.

        output_dir -- the directory where the output file is saved. No
                    directory is created inside this function.

        sample_type -- one of 'fecal', 'skin' or 'oral'. Selects the
                    reference (Michael Pollan) sample id, the bar labels and
                    the mapping categories that are plotted.

        samples_to_plot -- a list of sample ids to plot. If no value is passed,
                    all samples in the biom table are selected. At most one
                    sample may be selected.

        legend -- currently unused; render_barchart is always called with
                    legend=False.

        xaxis -- currently unused; render_barchart is always called with
                    x_axis=False.

        debug -- when true, column 2 of the summary table is used as Michael
                    Pollan's sample instead of looking up his sample id.

    OUTPUTS:
        A pdf of stacked taxonomy is generated for the selected sample and
        saved in the output directory as figure4.pdf

    RAISES:
        ValueError -- if sample_type is not supported, if more than one
                    sample is selected, or if a sample's metadata value for a
                    category is not among that category's groups.
    """

    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098], [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804], [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961], [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471], [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])

    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05], [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]

    # Columns of the plot that are filled directly rather than from a
    # category table
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. cat_list holds the bar labels and order
    # holds the matching data source for each bar, position by position.
    if sample_type == 'fecal':
        michael_pollan = '10317.000007108'
        cat_list = [
            'You', 'Average', 'Similar Diet', ' Similar BMI', 'Same Gender',
            'Similar Age', 'Michael Pollan'
        ]
        order = [
            'Sample', 'Average', 'DIET_TYPE', 'BMI_CAT', 'SEX', 'AGE_CAT', 'MP'
        ]

    elif sample_type == 'skin':
        michael_pollan = '10317.000007113'
        cat_list = [
            'You', 'Average', 'Similar Cosmetic Use', 'Same Dominant Hand',
            'Same Gender', 'Same Age', 'Michael Pollan'
        ]
        order = [
            'Sample', 'Average', 'COSMETICS_FREQUENCY', 'DOMINANT_HAND', 'SEX',
            'AGE_CAT', 'MP'
        ]

    elif sample_type == 'oral':
        michael_pollan = '10317.000007109'
        cat_list = [
            'You', 'Average', 'Similar Diet', 'Flossing Frequency',
            'Same Gender', 'Same Age', 'Michael Pollan'
        ]
        order = [
            'Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY', 'SEX',
            'AGE_CAT', 'MP'
        ]

    else:
        raise ValueError('%s is not a supported sample type.' % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary summarized with the common
    # categories. A sliced copy of COMMON_TAXA is passed, presumably so the
    # callee cannot modify the constant -- TODO confirm.
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names. The 'p__'
    # prefix is removed explicitly: str.strip(' p__') treats its argument as
    # a set of characters and would also remove a leading or trailing 'p' or
    # '_' that belongs to the name itself.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip(' ')
        if phylum.startswith('p__'):
            phylum = phylum[len('p__'):]
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    if len(sample_ids) > 1:
        # TODO: make the rest of the code reflect this...
        raise ValueError("SCRIPT NO LONGER SUPPORTS MULTIPLE SAMPLES")

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for the selected sample
    for sample_pos, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]
        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the sample
        # the second column is the average and the last column is Michael
        # Pollan
        tax_array[:, 0] = whole_summary[:, sample_pos]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index gets its own name so it does not shadow the sample position.
        for col, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue
            # Gets the sample metadata
            mapping_key = meta_data[cat]
            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()
            # Finds the group the sample belongs to. Only the lookup failure
            # is translated; other errors propagate unchanged.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.' %
                                 (mapping_key, cat))
            tax_array[:, col] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, 'figure4.pdf')

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)
def main(otu_table, mapping_data, cat_tables, output_dir, sample_type='fecal',
         samples_to_plot=None, legend=False, xaxis=True, debug=False):
    """Creates stacked bar plots for an otu table

    INPUTS:
        otu_table -- an open OTU table. It is summarized at the phylum level
                    with summarize_common_categories.

        mapping_data -- a tab delimited string containing the mapping data
                    passed from the mapping file. It is parsed with
                    map_to_2D_dict and looked up by sample id.

        cat_tables -- a dictionary keying a mapping category to the
                    corresponding biom table. It is passed to
                    parse_category_files as raw_tables.

        output_dir -- the directory where output files are saved. No
                    directory is created inside this function.

        sample_type -- one of 'fecal', 'skin' or 'oral'. Selects the
                    reference (Michael Pollan) sample id, the bar labels and
                    the mapping categories that are plotted.

        samples_to_plot -- a list of sample ids to plot. If no value is passed,
                    then all samples in the biom table are analyzed.

        legend -- currently unused; render_barchart is always called with
                    legend=False.

        xaxis -- currently unused; render_barchart is always called with
                    x_axis=False.

        debug -- when true, column 2 of the summary table is used as Michael
                    Pollan's sample instead of looking up his sample id.

    OUTPUTS:
        A pdf of stacked taxonomy will be generated for each sample and saved
        in the output directory. These will follow the file name format
        Figure_4_<SAMPLEID>.pdf

    RAISES:
        ValueError -- if sample_type is not supported, or if a sample's
                    metadata value for a category is not among that
                    category's groups.
    """

    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7

    # Sets up file name constants
    FILEPREFIX = 'Figure_4_'
    FILE_END = '.pdf'

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098],
                      [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804],
                      [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961],
                      [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471],
                      [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])

    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05],
                       [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]

    # Columns of the plot that are filled directly rather than from a
    # category table
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. cat_list holds the bar labels and order
    # holds the matching data source for each bar, position by position.
    if sample_type == 'fecal':
        michael_pollan = '000007108.1075657'
        cat_list = ['You', 'Average', 'Similar Diet', ' Similar BMI',
                    'Same Gender', 'Similar Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'BMI_CATEGORY', 'SEX',
                 'AGE_CATEGORY', 'MP']

    elif sample_type == 'skin':
        michael_pollan = '7113.1075702'
        cat_list = ['You', 'Average', 'Similar Cosmetic Use',
                    'Same Dominant Hand', 'Same Gender', 'Same Age',
                    'Michael Pollan']
        order = ['Sample', 'Average', 'COSMETICS_FREQUENCY',
                 'DOMINANT_HAND', 'SEX', 'AGE_CATEGORY', 'MP']

    elif sample_type == 'oral':
        michael_pollan = '7109.1075688'
        cat_list = ['You', 'Average', 'Similar Diet', 'Flossing Frequency',
                    'Same Gender', 'Same Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY',
                 'SEX', 'AGE_CATEGORY', 'MP']

    else:
        raise ValueError('%s is not a supported sample type.' % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary summarized with the common
    # categories. A sliced copy of COMMON_TAXA is passed, presumably so the
    # callee cannot modify the constant -- TODO confirm.
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names. The 'p__'
    # prefix is removed explicitly: str.strip(' p__') treats its argument as
    # a set of characters and would also remove a leading or trailing 'p' or
    # '_' that belongs to the name itself.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip(' ')
        if phylum.startswith('p__'):
            phylum = phylum[len('p__'):]
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for each sample
    for sample_pos, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]
        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the sample
        # the second column is the average and the last column is Michael
        # Pollan
        tax_array[:, 0] = whole_summary[:, sample_pos]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index gets its own name so it does not shadow the sample position.
        for col, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue
            # Gets the sample metadata
            mapping_key = meta_data[cat]
            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()
            # Finds the group the sample belongs to. Only the lookup failure
            # is translated; other errors propagate unchanged.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, col] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, '%s%s%s'
                         % (FILEPREFIX, sample_id, FILE_END))

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)