def main():
    """Partition metagenome contributions by OTU and write them as a TSV file.

    Parses the command line described by ``script_info``, loads the OTU table
    (``opts.input_otu_table``) and the count table (``opts.input_count_table``,
    read through gzip when its extension is ``.gz``), passes both to
    ``partition_metagenome_contributions`` and writes the returned rows,
    tab-separated and one per line, to ``opts.output_metagenome_table``.

    Every file opened here is closed explicitly instead of being left to the
    garbage collector, so the output is flushed on any interpreter.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def _report(*parts):
        # Progress messages are only shown with --verbose.  Joining the parts
        # with one space reproduces what the Python 2 ``print a, b`` statement
        # emitted, while the single-argument print(...) form parses on both
        # Python 2 and Python 3.
        if opts.verbose:
            print(" ".join(str(part) for part in parts))

    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        _report("Limiting output to only functions:", limit_to_functions)
    else:
        limit_to_functions = []

    _report("Loading otu table: ", opts.input_otu_table)
    with open(opts.input_otu_table, 'U') as otu_table_fh:
        otu_table = parse_biom_table(otu_table_fh)

    ext = path.splitext(opts.input_count_table)[1]
    _report("Loading count table: ", opts.input_count_table)
    if ext == '.gz':
        genome_table_fh = gzip.open(opts.input_count_table, 'rb')
    else:
        genome_table_fh = open(opts.input_count_table, 'U')
    # try/finally rather than ``with`` so the gzip handle is closed even on
    # interpreters where GzipFile is not a context manager.
    try:
        genome_table = parse_biom_table(genome_table_fh)
    finally:
        genome_table_fh.close()

    _report("Predicting the metagenome...")
    partitioned_metagenomes = partition_metagenome_contributions(
        otu_table, genome_table, limit_to_functions=limit_to_functions)

    # One output line per returned row, fields separated by tabs.
    output_text = "\n".join(
        ["\t".join(map(str, i)) for i in partitioned_metagenomes])

    _report("Writing results to output file: ", opts.output_metagenome_table)
    make_output_dir_for_file(opts.output_metagenome_table)
    with open(opts.output_metagenome_table, 'w') as output_fh:
        output_fh.write(output_text)
def test_partition_metagenome_contributions_with_taxonomy(self):
    """partition_metagenome_contributions output matches the taxonomy fixture"""
    obs = partition_metagenome_contributions(
        self.otu_table_with_taxonomy, self.genome_table1)

    # Render the observed rows as tab-separated lines.
    obs_lines = []
    for record in obs:
        obs_lines.append("\t".join([str(field) for field in record]))
    obs_text = "\n".join(obs_lines)

    # Tokenise the expected fixture on whitespace, one list per line.
    fixture = self.predicted_gene_partition_table_with_taxonomy
    exp_text_list = []
    for line in fixture.split('\n'):
        exp_text_list.append([str(token) for token in line.split()])

    # BIOM adds spaces to metadata fields (not sure why), so prefix columns
    # 9 through 14 of every non-header row with a space to match.
    for row in exp_text_list[1:]:
        for column in range(9, 15):
            row[column] = ' ' + row[column]

    exp_text = "\n".join(["\t".join(row) for row in exp_text_list])
    self.assertEqual(obs_text, exp_text)
def test_partition_metagenome_contributions_with_taxonomy(self):
    """partition_metagenome_contributions output matches the taxonomy fixture"""
    obs = partition_metagenome_contributions(
        self.otu_table_with_taxonomy, self.genome_table1)

    # Render the observed rows as tab-separated lines.
    obs_lines = []
    for record in obs:
        obs_lines.append("\t".join([str(field) for field in record]))
    obs_text = "\n".join(obs_lines)

    # Tokenise the expected fixture on whitespace, one list per line.
    fixture = self.predicted_gene_partition_table_with_taxonomy
    exp_text_list = []
    for line in fixture.split('\n'):
        exp_text_list.append([str(token) for token in line.split()])

    # BIOM adds spaces to metadata fields (not sure why), so prefix columns
    # 9 through 14 of every non-header row with a space to match.
    for row in exp_text_list[1:]:
        for column in range(9, 15):
            row[column] = ' ' + row[column]

    exp_text = "\n".join(["\t".join(row) for row in exp_text_list])
    self.assertEqual(obs_text, exp_text)
def test_partition_metagenome_contributions(self):
    """partition_metagenome_contributions functions with valid input"""
    # Reference data used by this test.
    #
    # OTU table:
    #   #OTU ID   Sample1 Sample2 Sample3 Sample4
    #   GG_OTU_1  1.0     2.0     3.0     5.0
    #   GG_OTU_2  5.0     1.0     0.0     2.0
    #   GG_OTU_3  0.0     0.0     1.0     4.0
    #
    # Genome table:
    #   #OTU ID   GG_OTU_1 GG_OTU_3 GG_OTU_2
    #   f1        1.0      2.0      3.0
    #   f2        0.0      1.0      0.0
    #   f3        0.0      0.0      1.0
    #
    # Table predict metagenomes should produce from those two:
    #   #OTU ID   Sample1 Sample2 Sample3 Sample4
    #   f1        16.0    5.0     5.0     19.0
    #   f2        0.0     0.0     1.0     4.0
    #   f3        5.0     1.0     0.0     2.0

    # The prediction result is not inspected below; the call is kept so the
    # test still fails if predict_metagenomes raises on these fixtures.
    metagenome_table = predict_metagenomes(self.otu_table1, self.genome_table1)

    # The contributions are expected to be broken down by OTU.
    obs = partition_metagenome_contributions(
        self.otu_table1, self.genome_table1)
    data_rows = obs[1:]  # everything after the header row

    obs_text = "\n".join(
        ["\t".join([str(field) for field in record]) for record in obs])
    exp_lines = []
    for line in self.predicted_gene_partition_table.split('\n'):
        exp_lines.append("\t".join([str(token) for token in line.split()]))
    exp_text = "\n".join(exp_lines)

    # The percent over all samples (last field) must never exceed the
    # percent within the current sample (second to last field).
    for record in data_rows:
        self.assertTrue(record[-1] <= record[-2])

    def summed_count(gene_id, sample_id):
        # Total of the count field over all rows for one gene in one sample.
        return sum([record[5] for record in data_rows
                    if (record[0] == gene_id and record[1] == sample_id)])

    # The summed contributions must equal the metagenome table value.
    for gene_id, expected_total in (("f1", 16.0), ("f2", 0.0), ("f3", 5.0)):
        self.assertFloatEqual(summed_count(gene_id, "Sample1"), expected_total)

    for record in data_rows:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = record

        no_contribution_cases = (
            # Only GG_OTU_3 has f2, so no other genome may contribute to it
            gene == "f2" and OTU != "GG_OTU_3",
            # Ditto for GG_OTU_2 and f3
            gene == "f3" and OTU != "GG_OTU_2",
            # OTUs absent from a sample don't contribute
            sample == "Sample1" and OTU == "GG_OTU_3",
        )
        for applies in no_contribution_cases:
            if applies:
                self.assertFloatEqual(count, 0.0)
                self.assertFloatEqual(percent, 0.0)
                self.assertFloatEqual(percent_all, 0.0)

    # Sanity checks done: compare against the hand-checked result.
    self.assertEqual(obs_text, exp_text)

    # Check that "limit to functions" only returns rows for that function.
    obs_limited = partition_metagenome_contributions(
        self.otu_table1, self.genome_table1, limit_to_functions=["f2"])
    for record in obs_limited[1:]:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = record
        self.assertEqual(gene, "f2")
def test_partition_metagenome_contributions(self):
    """partition_metagenome_contributions functions with valid input"""
    # Reference data used by this test.
    #
    # OTU table:
    #   #OTU ID   Sample1 Sample2 Sample3 Sample4
    #   GG_OTU_1  1.0     2.0     3.0     5.0
    #   GG_OTU_2  5.0     1.0     0.0     2.0
    #   GG_OTU_3  0.0     0.0     1.0     4.0
    #
    # Genome table:
    #   #OTU ID   GG_OTU_1 GG_OTU_3 GG_OTU_2
    #   f1        1.0      2.0      3.0
    #   f2        0.0      1.0      0.0
    #   f3        0.0      0.0      1.0
    #
    # Table predict metagenomes should produce from those two:
    #   #OTU ID   Sample1 Sample2 Sample3 Sample4
    #   f1        16.0    5.0     5.0     19.0
    #   f2        0.0     0.0     1.0     4.0
    #   f3        5.0     1.0     0.0     2.0

    # The prediction result is not inspected below; the call is kept so the
    # test still fails if predict_metagenomes raises on these fixtures.
    metagenome_table = predict_metagenomes(self.otu_table1, self.genome_table1)

    # The contributions are expected to be broken down by OTU.
    obs = partition_metagenome_contributions(
        self.otu_table1, self.genome_table1)
    data_rows = obs[1:]  # everything after the header row

    obs_text = "\n".join(
        ["\t".join([str(field) for field in record]) for record in obs])
    exp_lines = []
    for line in self.predicted_gene_partition_table.split('\n'):
        exp_lines.append("\t".join([str(token) for token in line.split()]))
    exp_text = "\n".join(exp_lines)

    # The percent over all samples (last field) must never exceed the
    # percent within the current sample (second to last field).
    for record in data_rows:
        self.assertTrue(record[-1] <= record[-2])

    def summed_count(gene_id, sample_id):
        # Total of the count field over all rows for one gene in one sample.
        return sum([record[5] for record in data_rows
                    if (record[0] == gene_id and record[1] == sample_id)])

    # The summed contributions must equal the metagenome table value.
    for gene_id, expected_total in (("f1", 16.0), ("f2", 0.0), ("f3", 5.0)):
        self.assertFloatEqual(summed_count(gene_id, "Sample1"), expected_total)

    for record in data_rows:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = record

        no_contribution_cases = (
            # Only GG_OTU_3 has f2, so no other genome may contribute to it
            gene == "f2" and OTU != "GG_OTU_3",
            # Ditto for GG_OTU_2 and f3
            gene == "f3" and OTU != "GG_OTU_2",
            # OTUs absent from a sample don't contribute
            sample == "Sample1" and OTU == "GG_OTU_3",
        )
        for applies in no_contribution_cases:
            if applies:
                self.assertFloatEqual(count, 0.0)
                self.assertFloatEqual(percent, 0.0)
                self.assertFloatEqual(percent_all, 0.0)

    # Sanity checks done: compare against the hand-checked result.
    self.assertEqual(obs_text, exp_text)

    # Check that "limit to functions" only returns rows for that function.
    obs_limited = partition_metagenome_contributions(
        self.otu_table1, self.genome_table1, limit_to_functions=["f2"])
    for record in obs_limited[1:]:
        (gene, sample, OTU, gene_count_per_genome,
         otu_abundance_in_sample, count, percent, percent_all) = record
        self.assertEqual(gene, "f2")
def main():
    """Partition metagenome contributions by OTU and write them as a TSV file.

    Steps, in order:

    1. Parse the command line described by ``script_info``.
    2. Load the OTU table from ``opts.input_otu_table`` and collect its
       observation ids.
    3. Pick the count/trait table: ``opts.input_count_table`` when given,
       otherwise ``<type_of_prediction>_<gg_version>_precalculated.tab.gz``
       under the PICRUSt ``picrust/data`` directory.  It is read through gzip
       when its extension is ``.gz``.
    4. Load it either as BIOM (subset or full) or via
       ``convert_precalc_to_biom``, depending on the options.
    5. Optionally restrict to functional categories (``|``-separated); this
       selects the metadata key from ``opts.type_of_prediction``.
    6. Call ``partition_metagenome_contributions`` and write the returned rows,
       tab-separated and one per line, to ``opts.output_fp``.

    Raises:
        SystemExit: when functional categories are requested together with
            ``type_of_prediction == "rfam"``.

    The count-table handle and the output file are closed explicitly instead
    of being left to the garbage collector.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def _report(*parts):
        # Progress messages are only shown with --verbose.  Joining the parts
        # with one space reproduces what the Python 2 ``print a, b`` statement
        # emitted, while the single-argument print(...) form parses on both
        # Python 2 and Python 3.
        if opts.verbose:
            print(" ".join(str(part) for part in parts))

    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        _report("Limiting output to only functions:", limit_to_functions)
    else:
        limit_to_functions = []

    _report("Loading otu table: ", opts.input_otu_table)
    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if opts.input_count_table is None:
        # precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            [opts.type_of_prediction, opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(
            get_picrust_project_dir(), 'picrust', 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_table

    _report("Loading trait table: ", input_count_table)
    ext = path.splitext(input_count_table)[1]
    _report("Loading count table: ", input_count_table)

    if ext == '.gz':
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')

    # In the genome/trait table genomes are the samples and
    # genes are the observations.
    # try/finally rather than ``with`` so the gzip handle is closed even on
    # interpreters where GzipFile is not a context manager.
    try:
        if opts.load_precalc_file_in_biom:
            if not opts.suppress_subset_loading:
                # Use the OTU table information to load only the entries of
                # the count table that correspond to relevant OTUs.
                _report("Loading traits for %i organisms from the trait table"
                        % len(ids_to_load))
                genome_table = load_subset_from_biom_str(
                    genome_table_fh.read(), ids_to_load, axis='samples')
            else:
                _report("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
                # NOTE(review): load_table is given an open handle here while
                # the OTU table above is loaded from a path -- confirm that
                # load_table supports file objects.
                genome_table = load_table(genome_table_fh)
        else:
            genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load)
    finally:
        genome_table_fh.close()

    ok_functional_categories = None
    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        _report("Limiting to functional categories: %s"
                % (str(ok_functional_categories)))

        # Either KEGG_Pathways or COG_Category needs to be assigned to
        # metadata_key to limit to functional categories (not needed for
        # individual functions).
        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            # Raise SystemExit directly: the bare exit() helper is injected by
            # the site module and is not guaranteed to exist in every run mode.
            raise SystemExit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")

    partitioned_metagenomes = partition_metagenome_contributions(
        otu_table, genome_table,
        limit_to_functions=limit_to_functions,
        limit_to_functional_categories=ok_functional_categories,
        metadata_key=metadata_type)

    # One output line per returned row, fields separated by tabs.
    output_text = "\n".join(
        ["\t".join(map(str, i)) for i in partitioned_metagenomes])

    _report("Writing results to output file: ", opts.output_fp)
    make_output_dir_for_file(opts.output_fp)
    with open(opts.output_fp, 'w') as output_fh:
        output_fh.write(output_text)
def main():
    """Partition metagenome contributions by OTU and write them as a TSV file.

    Steps, in order:

    1. Parse the command line described by ``script_info``.
    2. Load the OTU table from ``opts.input_otu_table`` and take its
       ``ObservationIds``.
    3. Pick the count/trait table: ``opts.input_count_table`` when given,
       otherwise ``<type_of_prediction>_<gg_version>_precalculated.tab.gz``
       under the PICRUSt ``picrust/data`` directory.  It is read through gzip
       when its extension is ``.gz``.
    4. Load it either as BIOM (subset or full) or via
       ``convert_precalc_to_biom``, depending on the options.
    5. Call ``partition_metagenome_contributions`` and write the returned rows,
       tab-separated and one per line, to ``opts.output_fp``.

    Every file opened here is closed explicitly instead of being left to the
    garbage collector.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def _report(*parts):
        # Progress messages are only shown with --verbose.  Joining the parts
        # with one space reproduces what the Python 2 ``print a, b`` statement
        # emitted, while the single-argument print(...) form parses on both
        # Python 2 and Python 3.
        if opts.verbose:
            print(" ".join(str(part) for part in parts))

    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        _report("Limiting output to only functions:", limit_to_functions)
    else:
        limit_to_functions = []

    _report("Loading otu table: ", opts.input_otu_table)
    with open(opts.input_otu_table, 'U') as otu_table_fh:
        otu_table = parse_biom_table(otu_table_fh)
    ids_to_load = otu_table.ObservationIds

    if opts.input_count_table is None:
        # precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            [opts.type_of_prediction, opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(
            get_picrust_project_dir(), 'picrust', 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_table

    _report("Loading trait table: ", input_count_table)
    ext = path.splitext(input_count_table)[1]
    _report("Loading count table: ", input_count_table)

    if ext == '.gz':
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')

    # In the genome/trait table genomes are the samples and
    # genes are the observations.
    # try/finally rather than ``with`` so the gzip handle is closed even on
    # interpreters where GzipFile is not a context manager.
    try:
        if opts.load_precalc_file_in_biom:
            if not opts.suppress_subset_loading:
                # Use the OTU table information to load only the entries of
                # the count table that correspond to relevant OTUs.
                _report("Loading traits for %i organisms from the trait table"
                        % len(ids_to_load))
                genome_table = load_subset_from_biom_str(
                    genome_table_fh.read(), ids_to_load, axis='samples')
            else:
                _report("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
                genome_table = parse_biom_table(genome_table_fh.read())
        else:
            genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load)
    finally:
        genome_table_fh.close()

    partitioned_metagenomes = partition_metagenome_contributions(
        otu_table, genome_table, limit_to_functions=limit_to_functions)

    # One output line per returned row, fields separated by tabs.
    output_text = "\n".join(
        ["\t".join(map(str, i)) for i in partitioned_metagenomes])

    _report("Writing results to output file: ", opts.output_fp)
    make_output_dir_for_file(opts.output_fp)
    with open(opts.output_fp, 'w') as output_fh:
        output_fh.write(output_text)
def main():
    """Partition metagenome contributions by OTU and write them as a TSV file.

    Steps, in order:

    1. Parse the command line described by ``script_info``.
    2. Load the OTU table from ``opts.input_otu_table`` and collect its
       observation ids.
    3. Pick the count/trait table: ``opts.input_count_table`` when given,
       otherwise ``<type_of_prediction>_<gg_version>_precalculated.tab.gz``
       under the PICRUSt ``picrust/data`` directory.  It is read through gzip
       when its extension is ``.gz``.
    4. Load it either as BIOM (subset or full) or via
       ``convert_precalc_to_biom``, depending on the options.
    5. Optionally restrict to functional categories (``|``-separated); this
       selects the metadata key from ``opts.type_of_prediction``.
    6. Call ``partition_metagenome_contributions`` and write the returned rows,
       tab-separated and one per line, to ``opts.output_fp``.

    Raises:
        SystemExit: when functional categories are requested together with
            ``type_of_prediction == "rfam"``.

    The count-table handle and the output file are closed explicitly instead
    of being left to the garbage collector.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def _report(*parts):
        # Progress messages are only shown with --verbose.  Joining the parts
        # with one space reproduces what the Python 2 ``print a, b`` statement
        # emitted, while the single-argument print(...) form parses on both
        # Python 2 and Python 3.
        if opts.verbose:
            print(" ".join(str(part) for part in parts))

    if opts.limit_to_function:
        limit_to_functions = opts.limit_to_function.split(',')
        _report("Limiting output to only functions:", limit_to_functions)
    else:
        limit_to_functions = []

    _report("Loading otu table: ", opts.input_otu_table)
    otu_table = load_table(opts.input_otu_table)
    ids_to_load = otu_table.ids(axis='observation')

    if opts.input_count_table is None:
        # precalc file has specific name (e.g. ko_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(
            [opts.type_of_prediction, opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(
            get_picrust_project_dir(), 'picrust', 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_table

    _report("Loading trait table: ", input_count_table)
    ext = path.splitext(input_count_table)[1]
    _report("Loading count table: ", input_count_table)

    if ext == '.gz':
        genome_table_fh = gzip.open(input_count_table, 'rb')
    else:
        genome_table_fh = open(input_count_table, 'U')

    # In the genome/trait table genomes are the samples and
    # genes are the observations.
    # try/finally rather than ``with`` so the gzip handle is closed even on
    # interpreters where GzipFile is not a context manager.
    try:
        if opts.load_precalc_file_in_biom:
            if not opts.suppress_subset_loading:
                # Use the OTU table information to load only the entries of
                # the count table that correspond to relevant OTUs.
                _report("Loading traits for %i organisms from the trait table"
                        % len(ids_to_load))
                genome_table = load_subset_from_biom_str(
                    genome_table_fh.read(), ids_to_load, axis='samples')
            else:
                _report("Loading *full* count table because --suppress_subset_loading was passed. This may result in high memory usage")
                # NOTE(review): load_table is given an open handle here while
                # the OTU table above is loaded from a path -- confirm that
                # load_table supports file objects.
                genome_table = load_table(genome_table_fh)
        else:
            genome_table = convert_precalc_to_biom(genome_table_fh, ids_to_load)
    finally:
        genome_table_fh.close()

    ok_functional_categories = None
    metadata_type = None
    if opts.limit_to_functional_categories:
        ok_functional_categories = opts.limit_to_functional_categories.split("|")
        _report("Limiting to functional categories: %s"
                % (str(ok_functional_categories)))

        # Either KEGG_Pathways or COG_Category needs to be assigned to
        # metadata_key to limit to functional categories (not needed for
        # individual functions).
        if opts.type_of_prediction == "ko":
            metadata_type = "KEGG_Pathways"
        elif opts.type_of_prediction == "cog":
            metadata_type = "COG_Category"
        elif opts.type_of_prediction == "rfam":
            # Raise SystemExit directly: the bare exit() helper is injected by
            # the site module and is not guaranteed to exist in every run mode.
            raise SystemExit("Stopping program: when type of prediction is set to rfam you can only limit to individual functions (-l) rather than to functional categories (-f)")

    partitioned_metagenomes = partition_metagenome_contributions(
        otu_table, genome_table,
        limit_to_functions=limit_to_functions,
        limit_to_functional_categories=ok_functional_categories,
        metadata_key=metadata_type)

    # One output line per returned row, fields separated by tabs.
    output_text = "\n".join(
        ["\t".join(map(str, i)) for i in partitioned_metagenomes])

    _report("Writing results to output file: ", opts.output_fp)
    make_output_dir_for_file(opts.output_fp)
    with open(opts.output_fp, 'w') as output_fh:
        output_fh.write(output_text)