示例#1
0
def partition_metagenome_contributions(otu_table,genome_table):
    """Tabulate how much each OTU contributes to each gene in each sample.

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes

    Returns a list of lists.  The first entry is the header row:
    Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
    CountContributedByOTU, ContributionPercentOfSample,
    ContributionPercentOfAllSamples

    Every following row describes one (gene, sample, OTU) combination.
    CountContributedByOTU is the gene count times the OTU abundance.
    The two "Percent" columns are fractions (not multiplied by 100) of
    that count relative to the gene's total within the sample, and to
    the gene's total across all samples, respectively.
    """
    otu_data, genome_data, overlapping_ids = extract_otu_and_genome_data(
        otu_table, genome_table)

    table = [[
        "Gene",
        "Sample",
        "OTU",
        "GeneCountPerGenome",
        "OTUAbundanceInSample",
        "CountContributedByOTU",
        "ContributionPercentOfSample",
        "ContributionPercentOfAllSamples",
    ]]

    #Totals are floored at this value so an all-zero gene never divides by zero
    min_total = 1e-5

    for gene_pos, gene_id in enumerate(genome_table.ObservationIds):
        rows_for_gene = []
        for sample_pos, sample_id in enumerate(otu_table.SampleIds):
            #One row per OTU: raw gene count, abundance and their product
            rows_for_sample = []
            for otu_pos, otu_id in enumerate(overlapping_ids):
                copies = genome_data[otu_pos][gene_pos]
                abundance = otu_data[otu_pos][sample_pos]
                rows_for_sample.append(
                    [gene_id, sample_id, otu_id, copies, abundance,
                     copies * abundance])

            #Fraction of this gene's count in this sample owed to each OTU;
            #the product is still the last column at this point
            sample_total = max(
                min_total, sum(float(r[-1]) for r in rows_for_sample))
            for r in rows_for_sample:
                r.append(float(r[-1]) / sample_total)
            rows_for_gene += rows_for_sample

        #The per-sample fraction was appended above, so the product is now
        #the next-to-last column
        gene_total = max(min_total, sum(float(r[-2]) for r in rows_for_gene))
        for r in rows_for_gene:
            r.append(float(r[-2]) / gene_total)
        table += rows_for_gene

    return table
def partition_metagenome_contributions(otu_table,
                                       genome_table,
                                       limit_to_functions=[],
                                       remove_zero_rows=True,
                                       verbose=True):
    """Return a list of the contribution of each organism to each function, per sample
    (rewritten version using numpy)
    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes
    limit_to_functions -- a list of function ids to include.  If empty, include all function ids
    Output table as a list of lists with header
    Function\tOrganism\tSample\tCounts\tpercent_of_sample
    """

    if limit_to_functions:
        if verbose:
            print "Filtering the genome table to include only user-specified functions:", limit_to_functions
        ok_ids = frozenset(map(str, limit_to_functions))

        filter_by_set = lambda vals, gene_id, metadata: str(gene_id) in ok_ids
        #filter_by_set = lambda vals,gene_id,metadata: gene_id in ok_ids

        #if verbose:
        #print dir(genome_table)
        #    print "Valid function ids:",genome_table.ObservationIds
        genome_table = genome_table.filterObservations(filter_by_set)

        if genome_table.isEmpty():
            raise ValueError(
                "User filtering by functions (%s) removed all results from the genome table"
                % (str(limit_to_functions)))

    otu_data, genome_data, overlapping_ids = extract_otu_and_genome_data(
        otu_table, genome_table)
    #We have a list of data with abundances and gene copy numbers
    lines = []
    result = [["Gene","Sample","OTU","GeneCountPerGenome",\
            "OTUAbundanceInSample","CountContributedByOTU",\
            "ContributionPercentOfSample","ContributionPercentOfAllSamples"]]

    #TODO refactor as array operations for speed

    #Zero-valued total counts will be set to epsilon
    epsilon = 1e-5

    for j, gene_id in enumerate(genome_table.ObservationIds):
        all_gene_rows = []
        for k, sample_id in enumerate(otu_table.SampleIds):
            #Add raw counts for the gene in this sample to a list
            sample_gene_rows = []
            for i, otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution = otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    #skip zero contributions
                    continue
                sample_gene_rows.append([
                    gene_id, sample_id, otu_id, otu_gene_count, otu_abundance,
                    contribution
                ])
            #Now get the percentage of each genes contribution to the sample overall
            total_counts = max(
                epsilon, sum([float(row[-1]) for row in sample_gene_rows]))

            for row in sample_gene_rows:
                percent_of_sample = float(row[-1]) / total_counts
                row.append(percent_of_sample)
            all_gene_rows.extend(sample_gene_rows)

        count_idx = -2  #Counts are now in the next to last position in each row
        total_counts = max(
            epsilon, sum([float(row[count_idx]) for row in all_gene_rows]))

        for row in all_gene_rows:
            percent_of_sample = float(row[count_idx]) / total_counts
            row.append(percent_of_sample)
        lines.extend(all_gene_rows)
    result.extend(lines)

    return result
def partition_metagenome_contributions(
    otu_table,
    genome_table,
    limit_to_functions=[],
    limit_to_functional_categories=[],
    metadata_key="KEGG_Pathways",
    remove_zero_rows=True,
    verbose=True,
):
    """Return the contribution of each OTU to each gene function, per sample.

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes.
      It is not modified; filtering works on copies.
    limit_to_functions -- a list of function ids to include.
      If empty, include all function ids.  Ids are compared as strings.
    limit_to_functional_categories -- if provided, limit by functional
      category.  For example, this can be used to limit output by KEGG
      functional categories.
    metadata_key -- observation metadata key handed to
      make_pathway_filter_fn when filtering by functional category
    remove_zero_rows -- if True, leave out rows whose contributed count is zero
    verbose -- if True, print a message when filtering by function id

    Returns a list of lists.  The first entry is the header row:
    Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
    CountContributedByOTU, ContributionPercentOfSample,
    ContributionPercentOfAllSamples, Kingdom, Phylum, Class, Order,
    Family, Genus, Species

    The two "Percent" columns are fractions (not multiplied by 100) of the
    contributed count relative to the gene's total within the sample, and
    to the gene's total across all samples, respectively.  The taxonomy
    values are appended to a row only when the OTU table carries
    observation metadata with a "taxonomy" entry for that OTU; otherwise
    the row ends after ContributionPercentOfAllSamples.

    Raises ValueError if either filter leaves the genome table empty.
    """

    # NOTE: the list defaults in the signature are only read, never mutated.

    if limit_to_functions:
        if verbose:
            # A single pre-formatted argument prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print(
                "Filtering the genome table to include only user-specified functions: %s"
                % (limit_to_functions,)
            )
        ok_ids = frozenset(map(str, limit_to_functions))

        filter_by_set = lambda vals, gene_id, metadata: str(gene_id) in ok_ids
        # inplace=False: biom's Table.filter otherwise filters the table it
        # is called on, which would silently strip rows from the caller's
        # genome_table.  This also matches the category filter below.
        genome_table = genome_table.filter(filter_by_set, axis="observation", inplace=False)

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functions (%s) removed all results from the genome table" % (str(limit_to_functions))
            )

    if limit_to_functional_categories:
        fn_cat_filter = make_pathway_filter_fn(
            ok_values=frozenset(map(str, limit_to_functional_categories)), metadata_key=metadata_key
        )
        genome_table = genome_table.filter(fn_cat_filter, axis="observation", inplace=False)

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functional categories (%s) removed all results from the genome table"
                % (str(limit_to_functional_categories))
            )

    # otu_data[i][k] and genome_data[i][j] are indexed by the position i of
    # an OTU in overlapping_ids, and by sample position k / gene position j
    otu_data, genome_data, overlapping_ids = extract_otu_and_genome_data(otu_table, genome_table)

    result = [
        [
            "Gene",
            "Sample",
            "OTU",
            "GeneCountPerGenome",
            "OTUAbundanceInSample",
            "CountContributedByOTU",
            "ContributionPercentOfSample",
            "ContributionPercentOfAllSamples",
            "Kingdom",
            "Phylum",
            "Class",
            "Order",
            "Family",
            "Genus",
            "Species",
        ]
    ]

    # Zero-valued total counts will be set to epsilon
    epsilon = 1e-5

    otu_index = 2  # position of otu ids in each row
    count_idx = -2  # position of counts once the per-sample fraction is appended

    # Observation metadata does not depend on the gene, so fetch it once
    o_md = otu_table.metadata(axis="observation")

    for j, gene_id in enumerate(genome_table.ids(axis="observation")):
        all_gene_rows = []
        for k, sample_id in enumerate(otu_table.ids()):
            # Add raw counts for the gene in this sample to a list
            sample_gene_rows = []
            for i, otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution = otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    # skip zero contributions
                    continue
                sample_gene_rows.append([gene_id, sample_id, otu_id, otu_gene_count, otu_abundance, contribution])

            # Fraction of this gene's count in this sample owed to each OTU
            # (the count is still the last column here)
            sample_total = max(epsilon, sum(float(row[-1]) for row in sample_gene_rows))
            for row in sample_gene_rows:
                row.append(float(row[-1]) / sample_total)
            all_gene_rows.extend(sample_gene_rows)

        # Fraction of this gene's count over all samples owed to each row
        gene_total = max(epsilon, sum(float(row[count_idx]) for row in all_gene_rows))

        for row in all_gene_rows:
            row.append(float(row[count_idx]) / gene_total)

            # add taxonomy information for each OTU, when available
            if o_md is None:
                continue
            obs_index = otu_table.index(row[otu_index], "observation")
            if "taxonomy" in o_md[obs_index]:
                row.extend(o_md[obs_index]["taxonomy"])

        result.extend(all_gene_rows)

    return result
def partition_metagenome_contributions(otu_table,genome_table, limit_to_functions=[], remove_zero_rows=True,verbose=True):
    """Return a list of the contribution of each organism to each function, per sample
    (rewritten version using numpy)
    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes
    limit_to_functions -- a list of function ids to include.  If empty, include all function ids
    Output table as a list of lists with header
    Function\tOrganism\tSample\tCounts\tpercent_of_sample
    """
    
    if limit_to_functions:
        if verbose:
            print "Filtering the genome table to include only user-specified functions:",limit_to_functions
        ok_ids = frozenset(map(str,limit_to_functions))
        
        filter_by_set = lambda vals,gene_id,metadata: str(gene_id) in ok_ids
        #filter_by_set = lambda vals,gene_id,metadata: gene_id in ok_ids
        
        #if verbose:
            #print dir(genome_table)
        #    print "Valid function ids:",genome_table.ObservationIds
        genome_table = genome_table.filterObservations(filter_by_set)
        
        if genome_table.isEmpty():
            raise ValueError("User filtering by functions (%s) removed all results from the genome table"%(str(limit_to_functions)))

    otu_data,genome_data,overlapping_ids = extract_otu_and_genome_data(otu_table,genome_table)
    #We have a list of data with abundances and gene copy numbers
    lines=[]
    result = [["Gene","Sample","OTU","GeneCountPerGenome",\
            "OTUAbundanceInSample","CountContributedByOTU",\
            "ContributionPercentOfSample","ContributionPercentOfAllSamples"]]

    #TODO refactor as array operations for speed

    #Zero-valued total counts will be set to epsilon 
    epsilon = 1e-5

    for j,gene_id in enumerate(genome_table.ObservationIds):
        all_gene_rows = []
        for k,sample_id in enumerate(otu_table.SampleIds):
            #Add raw counts for the gene in this sample to a list
            sample_gene_rows = []
            for i,otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution =  otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    #skip zero contributions
                    continue
                sample_gene_rows.append([gene_id,sample_id,otu_id,otu_gene_count,otu_abundance,contribution])
            #Now get the percentage of each genes contribution to the sample overall
            total_counts =max(epsilon,sum([float(row[-1]) for row in sample_gene_rows]))

            for row in sample_gene_rows:
                percent_of_sample = float(row[-1])/total_counts
                row.append(percent_of_sample)
            all_gene_rows.extend(sample_gene_rows)
        
        count_idx = -2 #Counts are now in the next to last position in each row
        total_counts =max(epsilon,sum([float(row[count_idx]) for row in all_gene_rows]))

        for row in all_gene_rows:
            percent_of_sample = float(row[count_idx])/total_counts
            row.append(percent_of_sample)
        lines.extend(all_gene_rows)
    result.extend(lines)

    return result
示例#5
0
def partition_metagenome_contributions(otu_table,
                                       genome_table,
                                       limit_to_functions=[],
                                       limit_to_functional_categories=[],
                                       metadata_key='KEGG_Pathways',
                                       remove_zero_rows=True,
                                       verbose=True):
    """Return the contribution of each OTU to each gene function, per sample.

    otu_table -- the BIOM Table object for the OTU table
    genome_table -- the BIOM Table object for the predicted genomes.
      It is not modified; filtering works on copies.
    limit_to_functions -- a list of function ids to include.
      If empty, include all function ids.  Ids are compared as strings.
    limit_to_functional_categories -- if provided, limit by functional
      category.  For example, this can be used to limit output by KEGG
      functional categories.
    metadata_key -- observation metadata key handed to
      make_pathway_filter_fn when filtering by functional category
    remove_zero_rows -- if True, leave out rows whose contributed count is zero
    verbose -- if True, print a message when filtering by function id

    Returns a list of lists.  The first entry is the header row:
    Gene, Sample, OTU, GeneCountPerGenome, OTUAbundanceInSample,
    CountContributedByOTU, ContributionPercentOfSample,
    ContributionPercentOfAllSamples, Kingdom, Phylum, Class, Order,
    Family, Genus, Species

    The two "Percent" columns are fractions (not multiplied by 100) of the
    contributed count relative to the gene's total within the sample, and
    to the gene's total across all samples, respectively.  The taxonomy
    values are appended to a row only when the OTU table carries
    observation metadata with a 'taxonomy' entry for that OTU; otherwise
    the row ends after ContributionPercentOfAllSamples.

    Raises ValueError if either filter leaves the genome table empty.
    """

    #NOTE: the list defaults in the signature are only read, never mutated.

    if limit_to_functions:
        if verbose:
            #A single pre-formatted argument prints the same text whether
            #print is a statement (Python 2) or a function (Python 3).
            print(
                "Filtering the genome table to include only user-specified functions: %s"
                % (limit_to_functions, ))
        ok_ids = frozenset(map(str, limit_to_functions))

        filter_by_set = lambda vals, gene_id, metadata: str(gene_id) in ok_ids
        #inplace=False: biom's Table.filter otherwise filters the table it
        #is called on, which would silently strip rows from the caller's
        #genome_table.  This also matches the category filter below.
        genome_table = genome_table.filter(filter_by_set,
                                           axis='observation',
                                           inplace=False)

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functions (%s) removed all results from the genome table"
                % (str(limit_to_functions)))

    if limit_to_functional_categories:
        fn_cat_filter = make_pathway_filter_fn(
            ok_values=frozenset(map(str, limit_to_functional_categories)),
            metadata_key=metadata_key)
        genome_table = genome_table.filter(fn_cat_filter,
                                           axis='observation',
                                           inplace=False)

        if genome_table.is_empty():
            raise ValueError(
                "User filtering by functional categories (%s) removed all results from the genome table"
                % (str(limit_to_functional_categories)))

    #otu_data[i][k] and genome_data[i][j] are indexed by the position i of
    #an OTU in overlapping_ids, and by sample position k / gene position j
    otu_data, genome_data, overlapping_ids = extract_otu_and_genome_data(
        otu_table, genome_table)

    result = [[
        "Gene", "Sample", "OTU", "GeneCountPerGenome",
        "OTUAbundanceInSample", "CountContributedByOTU",
        "ContributionPercentOfSample", "ContributionPercentOfAllSamples",
        "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"
    ]]

    #Zero-valued total counts will be set to epsilon
    epsilon = 1e-5

    otu_index = 2  #position of otu ids in each row
    count_idx = -2  #position of counts once the per-sample fraction is appended

    #Observation metadata does not depend on the gene, so fetch it once
    o_md = otu_table.metadata(axis='observation')

    for j, gene_id in enumerate(genome_table.ids(axis='observation')):
        all_gene_rows = []
        for k, sample_id in enumerate(otu_table.ids()):
            #Add raw counts for the gene in this sample to a list
            sample_gene_rows = []
            for i, otu_id in enumerate(overlapping_ids):
                otu_gene_count = genome_data[i][j]
                otu_abundance = otu_data[i][k]
                contribution = otu_gene_count * otu_abundance
                if remove_zero_rows and contribution == 0.0:
                    #skip zero contributions
                    continue
                sample_gene_rows.append([
                    gene_id, sample_id, otu_id, otu_gene_count, otu_abundance,
                    contribution
                ])

            #Fraction of this gene's count in this sample owed to each OTU
            #(the count is still the last column here)
            sample_total = max(
                epsilon, sum(float(row[-1]) for row in sample_gene_rows))
            for row in sample_gene_rows:
                row.append(float(row[-1]) / sample_total)
            all_gene_rows.extend(sample_gene_rows)

        #Fraction of this gene's count over all samples owed to each row
        gene_total = max(
            epsilon, sum(float(row[count_idx]) for row in all_gene_rows))

        for row in all_gene_rows:
            row.append(float(row[count_idx]) / gene_total)

            #add taxonomy information for each OTU, when available
            if o_md is None:
                continue
            obs_index = otu_table.index(row[otu_index], 'observation')
            if 'taxonomy' in o_md[obs_index]:
                row.extend(o_md[obs_index]['taxonomy'])

        result.extend(all_gene_rows)

    return result