def test_summarize_common_categories(self):
    """Verifies the ids, table and groups from summarize_common_categories"""
    # Expected sample ids
    expected_ids = ('00010', '00100', '00200', '00111', '00112', '00211')

    # Expected summary table: one row per group in expected_groups and one
    # column per id in expected_ids
    expected_table = array([
        [0.49973390, 0.25004162, 0.55001035,
         0.30008969, 0.45034247, 0.09997702],
        [0.29998226, 0.50008324, 0.35000658,
         0.62008969, 0.45000000, 0.75000821],
        [0.02004612, 0.02996504, 0.00000000,
         0.04995516, 0.02979452, 0.10000985],
        [0.05002661, 0.00998835, 0.00199402,
         0.00098655, 0.00000000, 0.00000000],
        [0.02004612, 0.08007325, 0.05000094,
         0.00197309, 0.05000000, 0.00000000],
        [0.08000710, 0.00000000, 0.00000000,
         0.00295964, 0.00000000, 0.00000000],
        [0.00106440, 0.01997669, 0.00000000,
         0.00000000, 0.00000000, 0.00000000],
        [0.00798297, 0.00000000, 0.00199402,
         0.00000000, 0.00000000, 0.05000492],
        [0.02111052, 0.10987182, 0.04599409,
         0.02394619, 0.01986301, 0.00000000],
    ])

    # Expected groups: every entry shares the bacterial kingdom and the
    # final entry is the catch-all group
    phyla = (u'p__Firmicutes', u'p__Bacteroidetes', u'p__Proteobacteria',
             u'p__Actinobacteria', u'p__Verrucomicrobia', u'p__Tenericutes',
             u'p__Cyanobacteria', u'p__Fusobacteria', u'p__Other')
    expected_groups = [(u'k__Bacteria', phylum) for phylum in phyla]

    # A metadata category the table does not carry is expected to be
    # rejected with a ValueError
    with self.assertRaises(ValueError):
        summarize_common_categories(biom_table=self.otu_table,
                                    level=2,
                                    common_categories=self.common_cats,
                                    metadata_category='Billy_Joel_Song')

    # Runs the summary with the default metadata category
    observed_ids, observed_table, observed_groups = \
        summarize_common_categories(biom_table=self.otu_table,
                                    level=2,
                                    common_categories=self.common_cats)

    # Compares each of the three outputs against its expectation; the ids
    # are converted to a tuple so the comparison ignores the container type
    self.assertEqual(tuple(observed_ids), expected_ids)
    assert_almost_equal(observed_table, expected_table, decimal=4)
    self.assertEqual(observed_groups, expected_groups)
def test_summarize_common_categories(self):
    """Checks that summarize_common_categories is sane

    The test covers two things: an unknown metadata category must raise a
    ValueError, and a call with the default metadata category must return
    the expected sample ids, summary table and list of common categories.
    """
    # Defines the known values
    known_ids = ('00010', '00100', '00200', '00111', '00112', '00211')
    # One row per entry in known_common_cats, one column per known id
    table_known = array([[0.49973390, 0.25004162, 0.55001035,
                          0.30008969, 0.45034247, 0.09997702],
                         [0.29998226, 0.50008324, 0.35000658,
                          0.62008969, 0.45000000, 0.75000821],
                         [0.02004612, 0.02996504, 0.00000000,
                          0.04995516, 0.02979452, 0.10000985],
                         [0.05002661, 0.00998835, 0.00199402,
                          0.00098655, 0.00000000, 0.00000000],
                         [0.02004612, 0.08007325, 0.05000094,
                          0.00197309, 0.05000000, 0.00000000],
                         [0.08000710, 0.00000000, 0.00000000,
                          0.00295964, 0.00000000, 0.00000000],
                         [0.00106440, 0.01997669, 0.00000000,
                          0.00000000, 0.00000000, 0.00000000],
                         [0.00798297, 0.00000000, 0.00199402,
                          0.00000000, 0.00000000, 0.05000492],
                         [0.02111052, 0.10987182, 0.04599409,
                          0.02394619, 0.01986301, 0.00000000]])
    known_common_cats = [(u'k__Bacteria', u'p__Firmicutes'),
                         (u'k__Bacteria', u'p__Bacteroidetes'),
                         (u'k__Bacteria', u'p__Proteobacteria'),
                         (u'k__Bacteria', u'p__Actinobacteria'),
                         (u'k__Bacteria', u'p__Verrucomicrobia'),
                         (u'k__Bacteria', u'p__Tenericutes'),
                         (u'k__Bacteria', u'p__Cyanobacteria'),
                         (u'k__Bacteria', u'p__Fusobacteria'),
                         (u'k__Bacteria', u'p__Other')]

    # Checks that appropriate errors are raised when an unknown metadata
    # category is passed.
    with self.assertRaises(ValueError):
        summarize_common_categories(biom_table=self.otu_table,
                                    level=2,
                                    common_categories=self.common_cats,
                                    metadata_category='Billy_Joel_Song')

    # Calculates the test values
    test_ids, test_table, test_common_cats = \
        summarize_common_categories(biom_table=self.otu_table,
                                    level=2,
                                    common_categories=self.common_cats)

    # Checks that all the outputs are correct. The ids are normalized to a
    # tuple first: comparing an array-like container directly against a
    # tuple with assertEqual is type-sensitive (and ambiguous for arrays),
    # while only the id values and their order matter here.
    self.assertEqual(tuple(test_ids), known_ids)
    assert_almost_equal(test_table, table_known, decimal=4)
    self.assertEqual(test_common_cats, known_common_cats)
def main(otu_table, mapping_data, cat_tables, output_dir, sample_type='fecal',
         samples_to_plot=None, legend=False, xaxis=True, debug=False):
    """Creates a stacked bar plot for a single sample in an otu table

    INPUTS:
        otu_table -- an open OTU table

        mapping_data -- a tab delimited string containing the mapping data
                    passed from the mapping file.

        cat_tables -- a dictionary keying a mapping category to the
                    corresponding biom table

        output_dir -- the location of the directory where the output file
                    should be saved. The directory is not created here.

        sample_type -- 'fecal', 'skin' or 'oral'. Selects the id of the
                    Michael Pollan reference sample, the bar labels and the
                    metadata categories used for the comparison bars.

        samples_to_plot -- a list of sample ids to plot. If no value is
                    passed, then all samples in the biom table are
                    selected. At most one sample may be selected.

        legend, xaxis -- accepted for interface compatibility but currently
                    ignored: the plot is always rendered with legend=False
                    and x_axis=False.

        debug -- ignore properly handling Michael Pollan's sample. The
                    sample in position 2 of the summary is used instead of
                    looking the reference id up.

    OUTPUTS:
        render_barchart is asked to write the stacked taxonomy plot to
        <output_dir>/figure4.pdf

    RAISES:
        ValueError -- if sample_type is not supported, if more than one
                    sample is selected, if the reference sample id is not in
                    the table (and debug is off), or if a sample's metadata
                    value is missing from the groups of a category.
    """
    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098],
                      [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804],
                      [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961],
                      [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471],
                      [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])
    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05],
                       [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]
    # Entries of `order` that are filled with preset columns rather than
    # looked up in the category tables
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. `cat_list` holds the bar labels and
    # `order` the matching data source for each bar, position by position.
    if sample_type == 'fecal':
        michael_pollan = '10317.000007108'
        cat_list = ['You', 'Average', 'Similar Diet', ' Similar BMI',
                    'Same Gender', 'Similar Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'BMI_CAT', 'SEX',
                 'AGE_CAT', 'MP']
    elif sample_type == 'skin':
        michael_pollan = '10317.000007113'
        cat_list = ['You', 'Average', 'Similar Cosmetic Use',
                    'Same Dominant Hand', 'Same Gender', 'Same Age',
                    'Michael Pollan']
        order = ['Sample', 'Average', 'COSMETICS_FREQUENCY',
                 'DOMINANT_HAND', 'SEX', 'AGE_CAT', 'MP']
    elif sample_type == 'oral':
        michael_pollan = '10317.000007109'
        cat_list = ['You', 'Average', 'Similar Diet', 'Flossing Frequency',
                    'Same Gender', 'Same Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY',
                 'SEX', 'AGE_CAT', 'MP']
    else:
        raise ValueError('%s is not a supported sample type.' % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary summarized with the common
    # categories. A copy of COMMON_TAXA is passed, as before, so the helper
    # cannot alter the constant.
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names.
    # str.strip treats its argument as a set of characters, so the 'p__'
    # prefix is removed explicitly; stripping ' p__' could also eat leading
    # or trailing 'p' and '_' characters that belong to the name.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip(' ')
        if phylum.startswith('p__'):
            phylum = phylum[len('p__'):]
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    if len(sample_ids) > 1:
        # TODO: make the rest of the code reflect this...
        raise ValueError("SCRIPT NO LONGER SUPPORTS MULTIPLE SAMPLES")

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for each selected sample
    for sample_pos, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]

        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the
        # sample, the second column is the average and the last column is
        # Michael Pollan
        tax_array[:, 0] = whole_summary[:, sample_pos]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index has its own name so it never shadows the sample position.
        for column, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue

            # Gets the sample metadata
            mapping_key = meta_data[cat]

            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()

            # Finds the group the sample belongs to. Only the failed lookup
            # is translated; other exceptions propagate untouched.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, column] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, 'figure4.pdf')

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)
def main(otu_table, mapping_data, cat_tables, output_dir, sample_type='fecal',
         samples_to_plot=None, legend=False, xaxis=True, debug=False):
    """Creates stacked bar plots for an otu table

    INPUTS:
        otu_table -- an open OTU table

        mapping_data -- a tab delimited string containing the mapping data
                    passed from the mapping file.

        cat_tables -- a dictionary keying a mapping category to the
                    corresponding biom table

        output_dir -- the location of the directory where output files
                    should be saved. The directory is not created here.

        sample_type -- 'fecal', 'skin' or 'oral'. Selects the id of the
                    Michael Pollan reference sample, the bar labels and the
                    metadata categories used for the comparison bars.

        samples_to_plot -- a list of sample ids to plot. If no value is
                    passed, then all samples in the biom table are analyzed.

        legend, xaxis -- accepted for interface compatibility but currently
                    ignored: every plot is rendered with legend=False and
                    x_axis=False.

        debug -- ignore properly handling Michael Pollan's sample. The
                    sample in position 2 of the summary is used instead of
                    looking the reference id up.

    OUTPUTS:
        render_barchart is asked to write one stacked taxonomy plot per
        selected sample into the output directory. The files follow the
        file name format Figure_4_<SAMPLEID>.pdf

    RAISES:
        ValueError -- if sample_type is not supported, if the reference
                    sample id is not in the table (and debug is off), or if
                    a sample's metadata value is missing from the groups of
                    a category.
    """
    # Sets constants for analyzing the data
    LEVEL = 2
    CATEGORY = 'taxonomy'
    NUM_TAXA = 9
    NUM_CATS_TO_PLOT = 7

    # Sets up file name constants
    FILEPREFIX = 'Figure_4_'
    FILE_END = '.pdf'

    # Sets up plotting constants
    COLORMAP = array([[0.8353, 0.2421, 0.3098],
                      [0.9569, 0.4275, 0.2627],
                      [0.9922, 0.6824, 0.3804],
                      [0.9961, 0.8784, 0.5351],
                      [0.9020, 0.9608, 0.5961],
                      [0.6706, 0.8667, 0.6431],
                      [0.4000, 0.7608, 0.6471],
                      [0.1961, 0.5333, 0.7412],
                      [0.3333, 0.3333, 0.3333]])
    FIG_DIMS = (4.44444, 3.33333)
    AXIS_DIMS = array([[0.05, 0.05],
                       [0.95, 0.95]])

    # Common taxa are designated before processing to remain constant.
    COMMON_TAXA = [(u'k__Bacteria', u'p__Firmicutes'),
                   (u'k__Bacteria', u'p__Bacteroidetes'),
                   (u'k__Bacteria', u'p__Proteobacteria'),
                   (u'k__Bacteria', u'p__Actinobacteria'),
                   (u'k__Bacteria', u'p__Verrucomicrobia'),
                   (u'k__Bacteria', u'p__Tenericutes'),
                   (u'k__Bacteria', u'p__Cyanobacteria'),
                   (u'k__Bacteria', u'p__Fusobacteria')]
    # Entries of `order` that are filled with preset columns rather than
    # looked up in the category tables
    SKIPSET = set(('Sample', 'Average', 'MP'))

    # Names categories being plotted. `cat_list` holds the bar labels and
    # `order` the matching data source for each bar, position by position.
    if sample_type == 'fecal':
        michael_pollan = '000007108.1075657'
        cat_list = ['You', 'Average', 'Similar Diet', ' Similar BMI',
                    'Same Gender', 'Similar Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'BMI_CATEGORY', 'SEX',
                 'AGE_CATEGORY', 'MP']
    elif sample_type == 'skin':
        michael_pollan = '7113.1075702'
        cat_list = ['You', 'Average', 'Similar Cosmetic Use',
                    'Same Dominant Hand', 'Same Gender', 'Same Age',
                    'Michael Pollan']
        order = ['Sample', 'Average', 'COSMETICS_FREQUENCY',
                 'DOMINANT_HAND', 'SEX', 'AGE_CATEGORY', 'MP']
    elif sample_type == 'oral':
        michael_pollan = '7109.1075688'
        cat_list = ['You', 'Average', 'Similar Diet', 'Flossing Frequency',
                    'Same Gender', 'Same Age', 'Michael Pollan']
        order = ['Sample', 'Average', 'DIET_TYPE', 'FLOSSING_FREQUENCY',
                 'SEX', 'AGE_CATEGORY', 'MP']
    else:
        raise ValueError('%s is not a supported sample type.' % sample_type)

    # Gets the mapping file
    map_dict = map_to_2D_dict(mapping_data)

    # Generates the category file dictionary summarized with the common
    # categories. A copy of COMMON_TAXA is passed, as before, so the helper
    # cannot alter the constant.
    categories = parse_category_files(raw_tables=cat_tables,
                                      common_groups=COMMON_TAXA[:8],
                                      level=LEVEL,
                                      metadata=CATEGORY)

    # Summarizes taxonomy for the whole table
    (whole_sample_ids, whole_summary, new_common_taxa) = \
        summarize_common_categories(biom_table=otu_table,
                                    level=LEVEL,
                                    common_categories=COMMON_TAXA[:8],
                                    metadata_category=CATEGORY)

    # Converts the final taxa to a clean list of phylum names.
    # str.strip treats its argument as a set of characters, so the 'p__'
    # prefix is removed explicitly; stripping ' p__' could also eat leading
    # or trailing 'p' and '_' characters that belong to the name.
    common_phyla = []
    for taxon in new_common_taxa:
        phylum = taxon[1].strip(' ')
        if phylum.startswith('p__'):
            phylum = phylum[len('p__'):]
        common_phyla.append(phylum.strip('[').strip(']'))
    new_common_taxa = common_phyla

    # Checks that the correct sample ids are plotted
    if samples_to_plot is None:
        sample_ids = whole_sample_ids
    else:
        sample_ids = samples_to_plot

    # Identifies Michael Pollan's pre-ABX sample
    if debug:
        mp_sample_pos = 2
    else:
        mp_sample_pos = whole_sample_ids.tolist().index(michael_pollan)
    mp_sample_taxa = whole_summary[:, mp_sample_pos]

    # Gets the table average
    table_average = mean(whole_summary, 1)

    # Generates a figure for each selected sample
    for sample_pos, sample_id in enumerate(whole_sample_ids):
        if sample_id not in sample_ids:
            continue

        meta_data = map_dict[sample_id]

        # Preallocates a numpy array to hold the data
        tax_array = zeros((NUM_TAXA, NUM_CATS_TO_PLOT))

        # Adds preset values to the array so the first column is the
        # sample, the second column is the average and the last column is
        # Michael Pollan
        tax_array[:, 0] = whole_summary[:, sample_pos]
        tax_array[:, 1] = table_average
        tax_array[:, -1] = mp_sample_taxa

        # Adds the categories to the table in the listed order. The column
        # index has its own name so it never shadows the sample position.
        for column, cat in enumerate(order):
            # Skips over the preset columns
            if cat in SKIPSET:
                continue

            # Gets the sample metadata
            mapping_key = meta_data[cat]

            # Pulls taxonomic summary and group descriptions
            tax_summary = categories[cat]['Summary']
            group_descriptions = categories[cat]['Groups'].tolist()

            # Finds the group the sample belongs to. Only the failed lookup
            # is translated; other exceptions propagate untouched.
            try:
                mapping_col = group_descriptions.index(mapping_key)
            except ValueError:
                raise ValueError('The %s cannot be found in %s.'
                                 % (mapping_key, cat))
            tax_array[:, column] = tax_summary[:, mapping_col]

        # Sets up the file to save the data
        filename = pjoin(output_dir, '%s%s%s'
                         % (FILEPREFIX, sample_id, FILE_END))

        # Plots the data
        render_barchart(data_table=tax_array,
                        x_axis=False,
                        group_names=new_common_taxa,
                        legend=False,
                        sample_names=cat_list,
                        y_axis=False,
                        axis_dims=AXIS_DIMS,
                        fig_dims=FIG_DIMS,
                        file_out=filename,
                        show_edge=False,
                        colors=COLORMAP)