def partition_metagenome_contributions(otu_table,genome_table):
    """Break every gene count down by the OTU and sample it comes from

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes

    Returns a list of lists.  The first row is the header:
      Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
      CountContributedByOTU, ContributionPercentOfSample,
      ContributionPercentOfAllSamples
    followed by one row for every (gene, sample, OTU) combination.

    The two "Percent" columns hold fractions (contribution / total);
    they are not multiplied by 100.
    """
    otu_data, genome_data, overlapping_ids = \
        extract_otu_and_genome_data(otu_table, genome_table)

    header = ["Gene", "Sample", "OTU", "GeneCountPerGenome",
              "OTUAbundanceInSample", "CountContributedByOTU",
              "ContributionPercentOfSample",
              "ContributionPercentOfAllSamples"]
    table = [header]

    # Totals of zero are replaced by this floor so the divisions below
    # never divide by zero.
    floor_total = 1e-5
    # Column holding CountContributedByOTU in every data row.
    contribution_col = 5

    for gene_pos, gene_id in enumerate(genome_table.ObservationIds):
        rows_for_gene = []
        for sample_pos, sample_id in enumerate(otu_table.SampleIds):
            # One row per OTU for this gene in this sample.
            rows_for_sample = []
            for otu_pos, otu_id in enumerate(overlapping_ids):
                copies = genome_data[otu_pos][gene_pos]
                abundance = otu_data[otu_pos][sample_pos]
                rows_for_sample.append(
                    [gene_id, sample_id, otu_id,
                     copies, abundance, copies * abundance])

            # Share of this sample's total count for the gene.
            sample_total = max(
                floor_total,
                sum(float(entry[contribution_col])
                    for entry in rows_for_sample))
            for entry in rows_for_sample:
                entry.append(float(entry[contribution_col]) / sample_total)
            rows_for_gene.extend(rows_for_sample)

        # Share of the gene's total count over all samples.
        gene_total = max(
            floor_total,
            sum(float(entry[contribution_col]) for entry in rows_for_gene))
        for entry in rows_for_gene:
            entry.append(float(entry[contribution_col]) / gene_total)
        table.extend(rows_for_gene)

    return table
def partition_metagenome_contributions(otu_table, genome_table,
                                       limit_to_functions=[],
                                       remove_zero_rows=True, verbose=True):
    """Return the contribution of each OTU to each function, per sample

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes
    limit_to_functions -- a list of function ids to include.  If empty,
      include all function ids.  Ids are compared as strings.  The
      default list is only read, never modified.
    remove_zero_rows -- if True, leave out rows whose contribution is
      exactly zero
    verbose -- if True, print a message when filtering by function id

    Returns a list of lists.  The first row is the header:
      Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
      CountContributedByOTU, ContributionPercentOfSample,
      ContributionPercentOfAllSamples
    followed by one row per (gene, sample, OTU) combination that was kept.

    The two "Percent" columns hold fractions (contribution / total);
    they are not multiplied by 100.

    Raises ValueError if limit_to_functions removes every function from
    the genome table.
    """
    if limit_to_functions:
        if verbose:
            # Single-argument call form: prints the same text as the old
            # print statement and is also valid syntax on Python 3.
            print("Filtering the genome table to include only "
                  "user-specified functions: %s" % (limit_to_functions,))
        ok_ids = frozenset(map(str, limit_to_functions))

        def filter_by_set(vals, gene_id, metadata):
            return str(gene_id) in ok_ids

        genome_table = genome_table.filterObservations(filter_by_set)

        if genome_table.isEmpty():
            raise ValueError(
                "User filtering by functions (%s) removed all results from the genome table"
                % (str(limit_to_functions)))

    otu_data, genome_data, overlapping_ids = extract_otu_and_genome_data(
        otu_table, genome_table)

    result = [["Gene", "Sample", "OTU", "GeneCountPerGenome",
               "OTUAbundanceInSample", "CountContributedByOTU",
               "ContributionPercentOfSample",
               "ContributionPercentOfAllSamples"]]

    # TODO refactor as array operations for speed
    # Zero-valued total counts will be set to epsilon so the divisions
    # below never divide by zero.
    epsilon = 1e-5
    # Column holding CountContributedByOTU in every data row.
    count_idx = 5

    for j, gene_id in enumerate(genome_table.ObservationIds):
        all_gene_rows = []
        for k, sample_id in enumerate(otu_table.SampleIds):
            # Raw counts for this gene in this sample, one row per OTU
            sample_gene_rows = []
            for i, otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution = otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    # skip zero contributions
                    continue
                sample_gene_rows.append([gene_id, sample_id, otu_id,
                                         otu_gene_count, otu_abundance,
                                         contribution])

            # Fraction of the sample's total count for this gene
            total_counts = max(
                epsilon,
                sum([float(row[count_idx]) for row in sample_gene_rows]))
            for row in sample_gene_rows:
                row.append(float(row[count_idx]) / total_counts)
            all_gene_rows.extend(sample_gene_rows)

        # Fraction of the gene's total count over all samples
        total_counts = max(
            epsilon,
            sum([float(row[count_idx]) for row in all_gene_rows]))
        for row in all_gene_rows:
            row.append(float(row[count_idx]) / total_counts)
        result.extend(all_gene_rows)

    return result
def partition_metagenome_contributions(
    otu_table,
    genome_table,
    limit_to_functions=[],
    limit_to_functional_categories=[],
    metadata_key="KEGG_Pathways",
    remove_zero_rows=True,
    verbose=True,
):
    """Return the contribution of each OTU to each function, per sample

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes
    limit_to_functions -- a list of function ids to include.  If empty,
      include all function ids.  Ids are compared as strings.
    limit_to_functional_categories -- if provided, limit by functional
      category.  For example, this can be used to limit output by KEGG
      functional categories.  The values are handed to
      make_pathway_filter_fn together with metadata_key.
    metadata_key -- observation metadata key handed to
      make_pathway_filter_fn
    remove_zero_rows -- if True, leave out rows whose contribution is
      exactly zero
    verbose -- if True, print a message when filtering by function id

    The default lists are only read, never modified, and the caller's
    genome_table is not modified by the filtering.

    Returns a list of lists.  The first row is the header:
      Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
      CountContributedByOTU, ContributionPercentOfSample,
      ContributionPercentOfAllSamples, Kingdom, Phylum, Class, Order,
      Family, Genus, Species
    followed by one row per (gene, sample, OTU) combination that was kept.
    The taxonomy entries are appended to a row only when the OTU has
    'taxonomy' observation metadata; otherwise the row ends after
    ContributionPercentOfAllSamples.

    The two "Percent" columns hold fractions (contribution / total);
    they are not multiplied by 100.

    Raises ValueError if either filter removes every function from the
    genome table.
    """
    if limit_to_functions:
        if verbose:
            # Single-argument call form: prints the same text as the old
            # print statement and is also valid syntax on Python 3.
            print(
                "Filtering the genome table to include only "
                "user-specified functions: %s" % (limit_to_functions,)
            )
        ok_ids = frozenset(map(str, limit_to_functions))

        def filter_by_set(vals, gene_id, metadata):
            return str(gene_id) in ok_ids

        # inplace=False: biom's Table.filter works in place by default,
        # which would strip the functions from the caller's table too.
        # The category filter below already asks for a copy.
        genome_table = genome_table.filter(
            filter_by_set, axis="observation", inplace=False
        )

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functions (%s) removed all results from the genome table"
                % (str(limit_to_functions))
            )

    if limit_to_functional_categories:
        fn_cat_filter = make_pathway_filter_fn(
            ok_values=frozenset(map(str, limit_to_functional_categories)),
            metadata_key=metadata_key,
        )
        genome_table = genome_table.filter(
            fn_cat_filter, axis="observation", inplace=False
        )

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functional categories (%s) removed all results from the genome table"
                % (str(limit_to_functional_categories))
            )

    otu_data, genome_data, overlapping_ids = extract_otu_and_genome_data(
        otu_table, genome_table
    )

    result = [
        [
            "Gene",
            "Sample",
            "OTU",
            "GeneCountPerGenome",
            "OTUAbundanceInSample",
            "CountContributedByOTU",
            "ContributionPercentOfSample",
            "ContributionPercentOfAllSamples",
            "Kingdom",
            "Phylum",
            "Class",
            "Order",
            "Family",
            "Genus",
            "Species",
        ]
    ]

    # Zero-valued total counts will be set to epsilon so the divisions
    # below never divide by zero.
    epsilon = 1e-5
    # Column holding CountContributedByOTU in every data row.
    count_idx = 5
    # Column holding the OTU id in every data row.
    otu_index = 2
    # The OTU table is not changed below, so fetch its metadata once.
    o_md = otu_table.metadata(axis="observation")

    for j, gene_id in enumerate(genome_table.ids(axis="observation")):
        all_gene_rows = []
        for k, sample_id in enumerate(otu_table.ids()):
            # Raw counts for this gene in this sample, one row per OTU
            sample_gene_rows = []
            for i, otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution = otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    # skip zero contributions
                    continue
                sample_gene_rows.append(
                    [gene_id, sample_id, otu_id, otu_gene_count,
                     otu_abundance, contribution]
                )

            # Fraction of the sample's total count for this gene
            total_counts = max(
                epsilon,
                sum([float(row[count_idx]) for row in sample_gene_rows]),
            )
            for row in sample_gene_rows:
                row.append(float(row[count_idx]) / total_counts)
            all_gene_rows.extend(sample_gene_rows)

        # Fraction of the gene's total count over all samples
        total_counts = max(
            epsilon, sum([float(row[count_idx]) for row in all_gene_rows])
        )
        for row in all_gene_rows:
            row.append(float(row[count_idx]) / total_counts)
            # add taxonomy information for each OTU
            obs_index = otu_table.index(row[otu_index], "observation")
            if o_md is not None and "taxonomy" in o_md[obs_index]:
                row.extend(o_md[obs_index]["taxonomy"])
        result.extend(all_gene_rows)

    return result
def partition_metagenome_contributions(otu_table,genome_table, limit_to_functions=[], remove_zero_rows=True,verbose=True):
    """Return the contribution of each OTU to each function, per sample

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes
    limit_to_functions -- a list of function ids to include.  If empty,
      include all function ids.  Ids are compared as strings.  The
      default list is only read, never modified.
    remove_zero_rows -- if True, leave out rows whose contribution is
      exactly zero
    verbose -- if True, print a message when filtering by function id

    Returns a list of lists.  The first row is the header:
      Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
      CountContributedByOTU, ContributionPercentOfSample,
      ContributionPercentOfAllSamples
    followed by one row per (gene, sample, OTU) combination that was kept.

    The two "Percent" columns hold fractions (contribution / total);
    they are not multiplied by 100.

    Raises ValueError if limit_to_functions removes every function from
    the genome table.
    """
    if limit_to_functions:
        if verbose:
            # Single-argument call form: prints the same text as the old
            # print statement and is also valid syntax on Python 3.
            print("Filtering the genome table to include only user-specified functions: %s" % (limit_to_functions,))
        ok_ids = frozenset(map(str,limit_to_functions))

        def filter_by_set(vals,gene_id,metadata):
            return str(gene_id) in ok_ids

        genome_table = genome_table.filterObservations(filter_by_set)

        if genome_table.isEmpty():
            raise ValueError("User filtering by functions (%s) removed all results from the genome table"%(str(limit_to_functions)))

    otu_data,genome_data,overlapping_ids = extract_otu_and_genome_data(otu_table,genome_table)

    result = [["Gene","Sample","OTU","GeneCountPerGenome",
               "OTUAbundanceInSample","CountContributedByOTU",
               "ContributionPercentOfSample","ContributionPercentOfAllSamples"]]

    #TODO refactor as array operations for speed
    #Zero-valued total counts will be set to epsilon so the divisions
    #below never divide by zero
    epsilon = 1e-5
    #Column holding CountContributedByOTU in every data row
    count_idx = 5

    for j,gene_id in enumerate(genome_table.ObservationIds):
        all_gene_rows = []
        for k,sample_id in enumerate(otu_table.SampleIds):
            #Raw counts for this gene in this sample, one row per OTU
            sample_gene_rows = []
            for i,otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution = otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    #skip zero contributions
                    continue
                sample_gene_rows.append([gene_id,sample_id,otu_id,otu_gene_count,otu_abundance,contribution])

            #Fraction of the sample's total count for this gene
            total_counts = max(epsilon,sum([float(row[count_idx]) for row in sample_gene_rows]))
            for row in sample_gene_rows:
                row.append(float(row[count_idx])/total_counts)
            all_gene_rows.extend(sample_gene_rows)

        #Fraction of the gene's total count over all samples
        total_counts = max(epsilon,sum([float(row[count_idx]) for row in all_gene_rows]))
        for row in all_gene_rows:
            row.append(float(row[count_idx])/total_counts)
        result.extend(all_gene_rows)

    return result
def partition_metagenome_contributions(otu_table,
                                       genome_table,
                                       limit_to_functions=[],
                                       limit_to_functional_categories=[],
                                       metadata_key='KEGG_Pathways',
                                       remove_zero_rows=True,
                                       verbose=True):
    """Return the contribution of each OTU to each function, per sample

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes
    limit_to_functions -- a list of function ids to include.  If empty,
      include all function ids.  Ids are compared as strings.
    limit_to_functional_categories -- if provided, limit by functional
      category.  For example, this can be used to limit output by KEGG
      functional categories.  The values are handed to
      make_pathway_filter_fn together with metadata_key.
    metadata_key -- observation metadata key handed to
      make_pathway_filter_fn
    remove_zero_rows -- if True, leave out rows whose contribution is
      exactly zero
    verbose -- if True, print a message when filtering by function id

    The default lists are only read, never modified, and the caller's
    genome_table is not modified by the filtering.

    Returns a list of lists.  The first row is the header:
      Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
      CountContributedByOTU, ContributionPercentOfSample,
      ContributionPercentOfAllSamples, Kingdom, Phylum, Class, Order,
      Family, Genus, Species
    followed by one row per (gene, sample, OTU) combination that was kept.
    The taxonomy entries are appended to a row only when the OTU has
    'taxonomy' observation metadata; otherwise the row ends after
    ContributionPercentOfAllSamples.

    The two "Percent" columns hold fractions (contribution / total);
    they are not multiplied by 100.

    Raises ValueError if either filter removes every function from the
    genome table.
    """
    if limit_to_functions:
        if verbose:
            #Single-argument call form: prints the same text as the old
            #print statement and is also valid syntax on Python 3.
            print("Filtering the genome table to include only "
                  "user-specified functions: %s" % (limit_to_functions, ))
        ok_ids = frozenset(map(str, limit_to_functions))

        def filter_by_set(vals, gene_id, metadata):
            return str(gene_id) in ok_ids

        #inplace=False: biom's Table.filter works in place by default,
        #which would strip the functions from the caller's table too.
        #The category filter below already asks for a copy.
        genome_table = genome_table.filter(filter_by_set,
                                           axis='observation',
                                           inplace=False)

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functions (%s) removed all results from the genome table"
                % (str(limit_to_functions)))

    if limit_to_functional_categories:
        fn_cat_filter = make_pathway_filter_fn(
            ok_values=frozenset(map(str, limit_to_functional_categories)),
            metadata_key=metadata_key)
        genome_table = genome_table.filter(fn_cat_filter,
                                           axis='observation',
                                           inplace=False)

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functional categories (%s) removed all results from the genome table"
                % (str(limit_to_functional_categories)))

    otu_data, genome_data, overlapping_ids = extract_otu_and_genome_data(
        otu_table, genome_table)

    result = [["Gene", "Sample", "OTU", "GeneCountPerGenome",
               "OTUAbundanceInSample", "CountContributedByOTU",
               "ContributionPercentOfSample",
               "ContributionPercentOfAllSamples",
               "Kingdom", "Phylum", "Class", "Order", "Family", "Genus",
               "Species"]]

    #Zero-valued total counts will be set to epsilon so the divisions
    #below never divide by zero
    epsilon = 1e-5
    #Column holding CountContributedByOTU in every data row
    count_idx = 5
    #Column holding the OTU id in every data row
    otu_index = 2
    #The OTU table is not changed below, so fetch its metadata once
    o_md = otu_table.metadata(axis='observation')

    for j, gene_id in enumerate(genome_table.ids(axis='observation')):
        all_gene_rows = []
        for k, sample_id in enumerate(otu_table.ids()):
            #Raw counts for this gene in this sample, one row per OTU
            sample_gene_rows = []
            for i, otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution = otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    #skip zero contributions
                    continue
                sample_gene_rows.append([
                    gene_id, sample_id, otu_id, otu_gene_count,
                    otu_abundance, contribution
                ])

            #Fraction of the sample's total count for this gene
            total_counts = max(
                epsilon,
                sum([float(row[count_idx]) for row in sample_gene_rows]))
            for row in sample_gene_rows:
                row.append(float(row[count_idx]) / total_counts)
            all_gene_rows.extend(sample_gene_rows)

        #Fraction of the gene's total count over all samples
        total_counts = max(
            epsilon, sum([float(row[count_idx]) for row in all_gene_rows]))
        for row in all_gene_rows:
            row.append(float(row[count_idx]) / total_counts)
            #add taxonomy information for each OTU
            obs_index = otu_table.index(row[otu_index], 'observation')
            if o_md is not None and 'taxonomy' in o_md[obs_index]:
                row.extend(o_md[obs_index]['taxonomy'])
        result.extend(all_gene_rows)

    return result