Example #1
 def write_biom(self, sample_names, read_taxonomies, biom_file_io):
     '''Write the OTU info to a biom IO output stream
     
     Parameters
     ----------
     sample_names: list of str
         names of each sample (sample_ids for biom)
     read_taxonomies: list of dicts, as per _iterate_otu_table_rows()
     biom_file_io: io
         open writeable stream to write biom contents to
         
     Returns True if successful, else False'''
     counts = []
     observ_metadata = []
     otu_ids = []
     for otu_id, tax, count in self._iterate_otu_table_rows(read_taxonomies):
         if len(count) != len(sample_names):
             raise Exception("Programming error: mismatched sample names and counts")
         counts.append(count)
         observ_metadata.append({'taxonomy': tax})
         otu_ids.append(str(otu_id))
     if len(counts) == 0:
         logging.info("Not writing BIOM file since no sequences were assigned taxonomy")
         return True
     table = Table(np.array(counts),
                   otu_ids, sample_names, observ_metadata,
                   [{}]*len(sample_names), table_id='GraftM Taxonomy Count Table')
     try:
         table.to_hdf5(biom_file_io, 'GraftM graft')
         return True
     except RuntimeError as e:
         logging.warning("Error writing BIOM output, file not written. The specific error was: %s" % e)
         return False
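Usage sketch: to_hdf5 needs an open, writeable h5py handle, which is what biom_file_io is expected to be above; the counts, IDs, and output path below are made up.

import numpy as np
import h5py
from biom.table import Table

# build a tiny table shaped like the one assembled above
counts = np.array([[4, 0], [2, 7]])
table = Table(counts, ['OTU_1', 'OTU_2'], ['sample_a', 'sample_b'],
              [{'taxonomy': ['Bacteria']}, {'taxonomy': ['Archaea']}],
              [{}, {}], table_id='GraftM Taxonomy Count Table')
with h5py.File('counts.biom', 'w') as fh:  # hypothetical output path
    table.to_hdf5(fh, 'GraftM graft')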
Example #2
def load_BIOM(table, informat='json', v=1):
    """
    load a BIOM table from BIOM format. Default format is 'json'.
    """
    from biom.table import Table
    import json
    import sys
    
    informats = ['json','tsv']
    if informat not in informats:
        raise ValueError("Please specify a valid BIOM input format. Currently we support: '%s'." % "', '".join(informats))
    if v:
        print("\nSpecified BIOM input format '%s' - ok!" % informat)
    
    if informat == 'json':
        with open(table) as data_file:
            data = json.load(data_file)
        t = Table.from_json(data)

    elif informat == 'tsv':
        tsv = open(table)
        func = lambda x : x
        t = Table.from_tsv(tsv, obs_mapping=None, sample_mapping=None, process_func=func)
        tsv.close()
        
    return t
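Usage sketch, assuming a JSON-formatted table at the hypothetical path 'otu_table.biom':

t = load_BIOM('otu_table.biom', informat='json', v=0)
print(t.shape)  # (number of observations, number of samples)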
Example #3
    def setUp(self):
        """define some top-level data"""

        self.otu_table_values = array([[0, 0, 9, 5, 3, 1], [1, 5, 4, 0, 3, 2], [2, 3, 1, 1, 2, 5]])
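        # NOTE: the dict literal below is an unused expression; it is a sparse
        # {(row, col): value} view of the dense array above, kept only for reference.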
        {
            (0, 2): 9.0,
            (0, 3): 5.0,
            (0, 4): 3.0,
            (0, 5): 1.0,
            (1, 0): 1.0,
            (1, 1): 5.0,
            (1, 2): 4.0,
            (1, 4): 3.0,
            (1, 5): 2.0,
            (2, 0): 2.0,
            (2, 1): 3.0,
            (2, 2): 1.0,
            (2, 3): 1.0,
            (2, 4): 2.0,
            (2, 5): 5.0,
        }
        self.otu_table = Table(
            self.otu_table_values,
            ["OTU1", "OTU2", "OTU3"],
            ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"],
            [{"taxonomy": ["Bacteria"]}, {"taxonomy": ["Archaea"]}, {"taxonomy": ["Streptococcus"]}],
            [None, None, None, None, None, None],
        )
        self.otu_table_f = Table(
            self.otu_table_values,
            ["OTU1", "OTU2", "OTU3"],
            ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"],
            [
                {"taxonomy": ["1A", "1B", "1C", "Bacteria"]},
                {"taxonomy": ["2A", "2B", "2C", "Archaea"]},
                {"taxonomy": ["3A", "3B", "3C", "Streptococcus"]},
            ],
            [None, None, None, None, None, None],
        )

        self.full_lineages = [
            ["1A", "1B", "1C", "Bacteria"],
            ["2A", "2B", "2C", "Archaea"],
            ["3A", "3B", "3C", "Streptococcus"],
        ]
        self.metadata = [
            [
                ["Sample1", "NA", "A"],
                ["Sample2", "NA", "B"],
                ["Sample3", "NA", "A"],
                ["Sample4", "NA", "B"],
                ["Sample5", "NA", "A"],
                ["Sample6", "NA", "B"],
            ],
            ["SampleID", "CAT1", "CAT2"],
            [],
        ]
        self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
        fh, self.tmp_heatmap_fpath = mkstemp(prefix="test_heatmap_", suffix=".pdf")
        close(fh)
Example #4
 def test_tree_filter_table_none(self):
     rooted_nwk = io.StringIO("(O1:4.5,(O2:4,(a:1,b:1):2):0.5);")
     tree = skbio.TreeNode.read(rooted_nwk)
     table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                   ['O1', 'O2'],
                   ['S1', 'S2', 'S3'])
     actual = filter_table(table, tree)
     expected = table.filter(['O1', 'O2'], axis='observation')
     self.assertEqual(actual, expected)
Example #5
def parse_biom_table(fp, input_is_dense=False):
    try:
        return Table.from_hdf5(fp)
    except Exception:
        # not an HDF5 source; fall back to the JSON parsers below
        pass

    if hasattr(fp, 'read'):
        return Table.from_json(json.load(fp), input_is_dense=input_is_dense)
    elif isinstance(fp, list):
        return Table.from_json(json.loads(''.join(fp)),
                               input_is_dense=input_is_dense)
    else:
        return Table.from_json(json.loads(fp), input_is_dense=input_is_dense)
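Usage sketch, assuming the module imports json and biom.Table; biom.util.biom_open transparently opens plain, gzipped, and HDF5 files, so either branch above can fire. The path is hypothetical.

from biom.util import biom_open

with biom_open('table.biom') as fh:  # hypothetical input path
    table = parse_biom_table(fh)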
Example #6
def BIOM_return_clipped_taxonomy(taxlevel, BIOM):
    """
    Returns a BIOM table for which the taxonomy has been clipped at a certain level
    """
    
    from biom.table import Table
    import numpy as np
    
    return_OTUs = {}
    levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'unassigned']
    clip_level=''
    to_drop=[]
    
    if taxlevel not in levels:
        raise KeyError("The taxonomic level you are trying to search: '%s', is not valid" % taxlevel)
    
    clip_level = int(levels.index(taxlevel))+1
    #check if the first OTU has 'taxonomy' metadata attached, if yes assume all others have too and resume
    if 'taxonomy' not in BIOM.metadata(axis='observation')[0]:
        raise KeyError('The BIOM table you are trying to screen does not have taxonomy metadata attached to it')
    else:
        print("Found taxonomy metadata with OTUs - ok!")
        
    sample_ids = BIOM.ids(axis='sample')
    observation_ids = BIOM.ids(axis='observation')
    data_to_biom = []
    sample_metadata = BIOM.metadata(axis='sample')
    observation_metadata = BIOM.metadata(axis='observation')
    
    for OTU in observation_ids:
        orig=BIOM.data(OTU, axis='observation')
        data_to_biom.append(orig)
        
    data = np.asarray(data_to_biom)
    
    for i in range(len(observation_metadata)):
        if len(observation_metadata[i]['taxonomy']) > clip_level:
            observation_metadata[i]['taxonomy'] = observation_metadata[i]['taxonomy'][:clip_level]
        if 'unknown' in observation_metadata[i]['taxonomy'][-1]:
            print("fishy: %s" % observation_metadata[i]['taxonomy'])
            to_drop.append(observation_ids[i])
#        print observation_metadata[i]['taxonomy']
        
    #construct adjusted table
    outtable = Table(data, observation_ids, sample_ids, table_id='OTU table', sample_metadata=sample_metadata, observation_metadata=observation_metadata)
    
    if to_drop:
        outtable.filter(to_drop, invert=True, axis='observation',inplace=True)
        
    
    return outtable
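Usage sketch, assuming a BIOM file (hypothetical path) whose observations carry 'taxonomy' metadata:

from biom import load_table

table = load_table('otus_with_taxonomy.biom')  # hypothetical input path
clipped = BIOM_return_clipped_taxonomy('family', table)
print(clipped.metadata(axis='observation')[0]['taxonomy'])  # clipped to at most 5 ranks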
Example #7
def BIOM_tsv_to_R_transpose(in_tsv, out_csv):
    """
    Parse a biom table in tsv format and transpose it for input into R
    """
    
    from biom import Table
    
    tsv = open(in_tsv)
    #in_tsv = open('COI-trim30min100-merge-c3-id97-OTU-taxonomy.kraken.tsv')
    func = lambda x : x
    intable = Table.from_tsv(tsv,obs_mapping=None, sample_mapping=None, process_func=func)
    outtable = intable.transpose()
    out=open("transposed.tsv","w")
    out.write(outtable.to_tsv(header_key=None, header_value=None))
    out.close()

    #refine
    intable = open('transposed.tsv','r')
    temp = next(intable)  # skip the leading comment line that to_tsv writes

    out=''
    for line in intable:
        if line.startswith('#'):
            if line.strip().endswith('taxonomy'):
                print("Removing taxonomy")
                line = ",".join(line.strip().split("\t")[:-1]).replace('#OTU ID','Sample').replace('\t',',')+'\n'
            line = line.replace('#OTU ID','Sample').replace('\t',',')
            out+=line
        else:
            line = line.replace('\t',',')
            out+=line

    outtable = open(out_csv,'w')
    outtable.write(out)
    outtable.close()
Example #8
    def setUp(self):
        self.otu_table_vals = array([[1, 0, 2, 4],
                                     [1, 2, 0, 1],
                                     [0, 1, 1, 0],
                                     [1, 2, 1, 0]])

        self.otu_table = Table(self.otu_table_vals,
                                       ['0', '1', '2', '3'],
                                       ['s1', 's2', 's3', 's4'],
                                       [{"taxonomy": ["Root", "Bacteria", "Actinobacteria", "Actinobacteria", "Coriobacteridae", "Coriobacteriales", "Coriobacterineae", "Coriobacteriaceae"]},
                                        {"taxonomy": ["Root",
                                                      "Bacteria",
                                                      "Firmicutes",
                                                      "\"Clostridia\""]},
                                        {"taxonomy": ["Root",
                                                      "Bacteria",
                                                      "Firmicutes",
                                                      "\"Clostridia\""]},
                                        {"taxonomy": ["Root", "Bacteria"]}],
                                        None,)

        self.mapping = """#SampleID\tBarcodeSequence\tTreatment\tDescription
#Test mapping file
s1\tAAAA\tControl\tControl mouse, I.D. 354
s2\tGGGG\tControl\tControl mouse, I.D. 355
s3\tCCCC\tExp\tDisease mouse, I.D. 356
s4\tTTTT\tExp\tDisease mouse, I.D. 357""".split('\n')
Example #9
    def run(self, **kwargs):
        json_table_str = kwargs['json_table_str']
        hdf5_biom = kwargs['hdf5_table']
        axis = kwargs['axis']
        ids = kwargs['ids']

        if axis not in self.Axes:
            raise CommandError("Invalid axis '%s'. Must be either %s." % (
                axis,
                ' or '.join(map(lambda e: "'%s'" % e, self.Axes))))

        if hdf5_biom is None and json_table_str is None:
            raise CommandError("Must specify an input table")
        elif hdf5_biom is not None and json_table_str is not None:
            raise CommandError("Can only specify one input table")

        if json_table_str is not None:
            idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
            new_data = direct_slice_data(json_table_str, idxs, axis)

            # multiple walks over the string. bad form, but easy right now
            # ...should add a yield_and_ignore parser or something.
            def subset_generator():
                yield "{"
                yield direct_parse_key(json_table_str, "id")
                yield ","
                yield direct_parse_key(json_table_str, "format")
                yield ","
                yield direct_parse_key(json_table_str, "format_url")
                yield ","
                yield direct_parse_key(json_table_str, "type")
                yield ","
                yield direct_parse_key(json_table_str, "generated_by")
                yield ","
                yield direct_parse_key(json_table_str, "date")
                yield ","
                yield direct_parse_key(json_table_str, "matrix_type")
                yield ","
                yield direct_parse_key(json_table_str, "matrix_element_type")
                yield ","
                yield new_data
                yield ","
                yield new_axis_md
                yield ","

                if axis == "observation":
                    yield direct_parse_key(json_table_str, "columns")
                else:
                    yield direct_parse_key(json_table_str, "rows")
                yield "}"

            format_ = 'json'
            table = subset_generator()
        else:
            with biom_open(hdf5_biom) as f:
                table = Table.from_hdf5(f, ids=ids, axis=axis)
            format_ = 'hdf5'

        return {'subsetted_table': (table, format_)}
Example #10
def filter_BIOM_by_per_sample_read_prop(BIOM, min_prop=0.01):
    """
    Filter OTU table by minimum read proportion per sample
    """

    import numpy as np
    from biom.table import Table

    print("\nFiltering at level: %s %%\n" % (min_prop*100))
    
#    print "input table:\n"
#    print BIOM
#    print "\n"
    
    sample_ids = BIOM.ids(axis='sample')
    observation_ids = BIOM.ids(axis='observation')
    data_to_biom = []
    sample_metadata = BIOM.metadata(axis='sample')
    observation_metadata = BIOM.metadata(axis='observation')
    sums = BIOM.sum(axis='sample')

    for OTU in observation_ids:
        orig=BIOM.data(OTU, axis='observation')
        for i in range(len(orig)):
            if int(orig[i]) != 0 and int(orig[i]) < sums[i] * min_prop:
                orig[i] = 0.0
        data_to_biom.append(orig)
    
    data = np.asarray(data_to_biom)

    #construct adjusted table
    table = Table(data, observation_ids, sample_ids, table_id='OTU table', sample_metadata=sample_metadata, observation_metadata=observation_metadata)

    #Filter OTUs with sum = '0'
    to_exclude = []
    observation_sums = table.sum(axis='observation')
    for i in range(len(observation_sums)):
        if int(observation_sums[i]) == 0:
            to_exclude.append(observation_ids[i])
    
    print("Removing %i OTUs for lack of support\n" % len(to_exclude))
    table.filter(to_exclude, invert=True, axis='observation',inplace=True)
    
#    print table
    return table
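Usage sketch (hypothetical input path): counts below min_prop of a sample's total are zeroed, then unsupported OTUs are dropped.

from biom import load_table

table = load_table('otu_table.biom')  # hypothetical input path
filtered = filter_BIOM_by_per_sample_read_prop(table, min_prop=0.005)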
Example #11
def main(table_loc, otu_list, collapsed_name, output_file, classic=False):
    table = load_table(table_loc)
    f = open(otu_list)
    otus = f.read().strip().split()
    otus = set(otus) & set(table.ids(axis="observation"))
    table1 = table.filter(otus, axis="observation", inplace=False)
    table2 = table.filter(otus, axis="observation", invert=True, inplace=False)
    sums1 = table1.sum(axis='sample')
    sums2 = table2.sum(axis='sample')
    new_table = Table(numpy.array([sums1,sums2]), [collapsed_name, "not_"+collapsed_name], table.ids(axis="sample"), type="otu table")
    
    if classic:
        # print to tab delimited biom table
        open(output_file, 'w').write(new_table.to_tsv())
    else:
        # print biom table
        new_table.to_json("predict_reactions.py", open(output_file, 'w'))
Example #12
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    # Builds a BIOM format OTU table from an OTU table in classic dense format
    # (sample IDs in the first row, OTU IDs in the first column). For some
    # reason, the 'biom convert' command fails to recognize some OTU tables,
    # and therefore the classic2biom method (above) fails.
    with open(OTU_table_classic,'r') as fidin:
        otu_table_data = fidin.readlines()
        firstrow = otu_table_data[0].split('\t')
        sample_labels = firstrow[1:]
        sample_labels[-1] = sample_labels[-1].rstrip('\n')
        OTU_labels = [otu_table_data[i].split('\t')[0] for i in range(1,len(otu_table_data))]
        nOTUs = len(OTU_labels)
        nSamples = len(sample_labels)
        # Load OTU table row major order
        OTU_table_data = np.zeros((nOTUs, nSamples))
        for i in range(1,nOTUs+1):
            OTU_table_data[i-1,:] = otu_table_data[i].split('\t')[1:]
        # Write in BIOM format
        t = Table(OTU_table_data, OTU_labels, sample_labels, observ_metadata=None, sample_metadata=None, table_id=dataset_ID)
        with biom_open(OTU_table_biom, 'w') as f:
            t.to_hdf5(f, "Generated by processing layer", compress=False)
Example #13
    def setUp(self):
        """define some top-level data"""

        self.otu_table_values = array([[0, 0, 9, 5, 3, 1],
                                       [1, 5, 4, 0, 3, 2],
                                       [2, 3, 1, 1, 2, 5]])
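        # NOTE: the dict literal below is an unused expression; it is a sparse
        # {(row, col): value} view of the dense array above, kept only for reference.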
        {(0, 2): 9.0, (0, 3): 5.0, (0, 4): 3.0, (0, 5): 1.0,
         (1, 0): 1.0, (1, 1): 5.0, (1, 2): 4.0, (1, 4): 3.0, (1, 5): 2.0,
         (2, 0): 2.0, (2, 1): 3.0, (2, 2): 1.0, (2, 3): 1.0, (2, 4): 2.0, (2, 5): 5.0}
        self.otu_table = Table(self.otu_table_values,
                                       ['OTU1', 'OTU2', 'OTU3'],
                                       ['Sample1', 'Sample2', 'Sample3',
                                        'Sample4', 'Sample5', 'Sample6'],
                                       [{"taxonomy": ['Bacteria']},
                                        {"taxonomy": ['Archaea']},
                                        {"taxonomy": ['Streptococcus']}],
                                        [None, None, None, None, None, None])
        self.otu_table_f = Table(self.otu_table_values,
                                         ['OTU1', 'OTU2', 'OTU3'],
                                         ['Sample1', 'Sample2', 'Sample3',
                                          'Sample4', 'Sample5', 'Sample6'],
                                         [{"taxonomy": ['1A', '1B', '1C', 'Bacteria']},
                                          {"taxonomy":
                                           ['2A', '2B', '2C', 'Archaea']},
                                          {"taxonomy": ['3A', '3B', '3C', 'Streptococcus']}],
                                          [None, None, None, None, None, None])

        self.full_lineages = [['1A', '1B', '1C', 'Bacteria'],
                              ['2A', '2B', '2C', 'Archaea'],
                              ['3A', '3B', '3C', 'Streptococcus']]
        self.metadata = [[['Sample1', 'NA', 'A'],
                          ['Sample2', 'NA', 'B'],
                          ['Sample3', 'NA', 'A'],
                          ['Sample4', 'NA', 'B'],
                          ['Sample5', 'NA', 'A'],
                          ['Sample6', 'NA', 'B']],
                         ['SampleID', 'CAT1', 'CAT2'], []]
        self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
        fh, self.tmp_heatmap_fpath = mkstemp(prefix='test_heatmap_',
                                            suffix='.pdf')
        close(fh)
Example #14
def convert_table_to_biom(table_f, sample_mapping, obs_mapping,
                          process_func, **kwargs):
    """Convert a contingency table to a biom table

    sample_mapping : dict of {'sample_id':metadata} or None
    obs_mapping : dict of {'obs_id':metadata} or None
    process_func: a function to transform observation metadata
    dtype : type of table data
    """
    otu_table = Table.from_tsv(table_f, obs_mapping, sample_mapping,
                               process_func, **kwargs)
    return otu_table.to_json(generatedby())
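For reference, a minimal sketch of the underlying Table.from_tsv call on a classic tab-separated table (hypothetical file); the identity process_func leaves observation metadata untouched.

from biom.table import Table

with open('classic_otu_table.tsv') as fh:  # hypothetical input path
    table = Table.from_tsv(fh, None, None, lambda x: x)
print(table.ids(axis='observation'))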
Example #15
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","

            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return table, format_
Example #16
    def test_aitchison(self):
        t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
        actual = beta(table=t, metric='aitchison')
        expected = skbio.DistanceMatrix([[0.0000000, 0.4901290, 0.6935510],
                                         [0.4901290, 0.0000000, 0.2034219],
                                         [0.6935510, 0.2034219, 0.0000000]],
                                        ids=['S1', 'S2', 'S3'])

        self.assertEqual(actual.ids, expected.ids)
        for id1 in actual.ids:
            for id2 in actual.ids:
                npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
Example #17
def simulate_correls(corr_stren=(.99, .99), std=(1, 1, 1, 2, 2), means=(100, 100, 100, 100, 100), size=30,
                     noncors=10, noncors_mean=100, noncors_std=100):
    """
    Generates a correlation matrix with a diagonal of stds based on the input parameters and fills the rest of the
    matrix with uncorrelated values, all with the same mean and standard deviation. The output has a triangle of
    three correlated observations and a correlated pair; all other observations are uncorrelated. Covariance is
    obtained from correlation via cov(X,Y) = cor(X,Y)*sd(X)*sd(Y).

    Parameters
    ----------
    corr_stren: tuple of length 2, correlations in triangle and in pair
    std: tuple of length 5, standard deviations of each observation
    means: tuple of length 5, mean of each observation
    size: number of samples to generate from the multivariate normal distribution
    noncors: number of uncorrelated values
    noncors_mean: mean of uncorrelated values
    noncors_std: standard deviation of uncorrelated values

    Returns
    -------
    table: a biom table with (size) samples and (5+noncors) observations
    """
    cor = [[std[0], corr_stren[0], corr_stren[0], 0., 0.],  # define the correlation matrix for the triangle and pair
           [corr_stren[0], std[1], corr_stren[0], 0., 0.],
           [corr_stren[0], corr_stren[0], std[2], 0., 0.],
           [0., 0., 0., std[3], corr_stren[1]],
           [0., 0., 0., corr_stren[1], std[4]]]
    cor = np.array(cor)
    cov = np.zeros(np.array(cor.shape) + noncors)  # generate empty covariance matrix to be filled
    for i in range(cor.shape[0]):  # fill in all but diagonal of covariance matrix, first 5
        for j in range(i + 1, cor.shape[0]):
            curr_cov = cor[i, j] * cor[i, i] * cor[j, j]
            cov[i, j] = curr_cov
            cov[j, i] = curr_cov
    for i in range(cor.shape[0]):  # fill diagonal of covariance matrix, first 5
        cov[i, i] = np.square(cor[i, i])
    means = list(means)
    for i in range(cor.shape[0], cov.shape[0]):  # fill diagonal of covariance, 6 to end and populate mean list
        cov[i, i] = noncors_std
        means.append(noncors_mean)

    # fill the count table
    counts = multivariate_normal(means, cov, size).T

    counts = np.round(counts)

    observ_ids = ["Observ_" + str(i) for i in range(cov.shape[0])]
    sample_ids = ["Sample_" + str(i) for i in range(size)]
    table = Table(counts, observ_ids, sample_ids)

    return table
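Usage sketch, assuming the module-level imports (numpy as np, numpy.random.multivariate_normal, biom.Table) that the function relies on:

table = simulate_correls(size=50, noncors=20)
print(table.shape)      # (25, 50): 5 structured observations plus 20 uncorrelated ones
print(table.ids()[:3])  # first few sample ids: Sample_0, Sample_1, Sample_2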
Example #18
    def test_feature_metadata(self):
        # no filtering
        df = pd.DataFrame({'SequencedGenome': ['yes', 'yes']},
                          index=pd.Index(['O1', 'O2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_features(table, metadata=metadata)
        expected = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                         ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter one
        df = pd.DataFrame({'SequencedGenome': ['yes']},
                          index=pd.Index(['O1'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_features(table, metadata=metadata)
        expected = Table(np.array([[1, 3]]), ['O1'], ['S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter all
        df = pd.DataFrame({}, index=pd.Index(['foo'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_features(table, metadata=metadata)
        self.assertTrue(actual.is_empty())

        # exclude one
        df = pd.DataFrame({'SequencedGenome': ['yes']},
                          index=pd.Index(['O1'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_features(table, metadata=metadata, exclude_ids=True)
        expected = Table(np.array([[1, 1, 2]]), ['O2'], ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)

        # exclude all
        df = pd.DataFrame({'SequencedGenome': ['yes', 'yes']},
                          index=pd.Index(['O1', 'O2'], name='id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_features(table, metadata=metadata, exclude_ids=True)
        self.assertTrue(actual.is_empty())
Example #19
    def test_where(self):
        # no filtering
        df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                          index=pd.Index(['O1', 'O2'], name='feature-id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        where = "SequencedGenome='yes' OR SequencedGenome='no'"
        actual = filter_features(table, metadata=metadata, where=where)
        expected = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                         ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter one
        df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                          index=pd.Index(['O1', 'O2'], name='feature-id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        where = "SequencedGenome='yes'"
        actual = filter_features(table, metadata=metadata, where=where)
        expected = Table(np.array([[1, 3]]), ['O1'], ['S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter all
        df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                          index=pd.Index(['O1', 'O2'], name='feature-id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        where = "SequencedGenome='yes' AND SequencedGenome='no'"
        actual = filter_features(table, metadata=metadata, where=where)
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)

        # filter one -> exclude one
        df = pd.DataFrame({'SequencedGenome': ['yes', 'no']},
                          index=pd.Index(['O1', 'O2'], name='feature-id'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        where = "SequencedGenome='yes'"
        actual = filter_features(table,
                                 exclude_ids=True,
                                 metadata=metadata,
                                 where=where)
        expected = Table(np.array([[1, 1, 2]]), ['O2'], ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)
Example #20
    def test_invalid_args(self):
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        with self.assertRaisesRegex(ValueError, "No filtering"):
            filter_samples(table)

        with self.assertRaisesRegex(ValueError,
                                    "'where' is specified."):
            filter_samples(table, where="Subject='subject-1'")

        with self.assertRaisesRegex(ValueError,
                                    "'exclude_ids' is True."):
            filter_samples(table, exclude_ids=True)
Example #21
    def test_non_phylogenetic(self):
        t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
        actual = beta_diversity('braycurtis', t)
        # expected computed with scipy.spatial.distance.braycurtis
        expected = skbio.DistanceMatrix([[0.0000000, 0.3333333, 0.6666667],
                                         [0.3333333, 0.0000000, 0.4285714],
                                         [0.6666667, 0.4285714, 0.0000000]],
                                        ids=['S1', 'S2', 'S3'])

        self.assertEqual(actual.ids, expected.ids)
        for id1 in actual.ids:
            for id2 in actual.ids:
                npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
Example #22
def fastspar_correlation(table: Table,
                         verbose: bool = False,
                         nprocs=1) -> pd.DataFrame:
    # TODO: multiprocess support
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        table.to_dataframe().to_dense().to_csv(path.join(
            temp, 'otu_table.tsv'),
                                               sep='\t',
                                               index_label='#OTU ID')
        if verbose:
            stdout = None
        else:
            stdout = subprocess.DEVNULL
        subprocess.run([
            'fastspar', '-c',
            path.join(temp, 'otu_table.tsv'), '-r',
            path.join(temp, 'correl_table.tsv'), '-a',
            path.join(temp, 'covar_table.tsv'), '-t',
            str(nprocs)
        ],
                       stdout=stdout)
        cor = pd.read_table(path.join(temp, 'correl_table.tsv'), index_col=0)
        return df_to_correls(cor)
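Usage sketch, assuming the fastspar binary is on PATH and that df_to_correls is defined in the surrounding module; the input path is hypothetical.

from biom import load_table

table = load_table('otu_table.biom')  # hypothetical input path
correls = fastspar_correlation(table, verbose=True, nprocs=4)
print(correls.head())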
Example #23
def generate_biom_file(res_df, o, tg_rank, sampleid):
    """
    output result in biom format
    """
    import sys
    import numpy as np
    import biom
    from biom.table import Table
    if biom.__version__ < '2.1.7':  # note: plain string comparison, adequate for 2.1.x
        sys.exit("[ERROR] Biom library requires v2.1.7 or above.\n")

    target_df = pd.DataFrame()
    target_idx = (res_df['LEVEL']==tg_rank)
    target_df = res_df.loc[target_idx, ['ABUNDANCE','TAXID']]
    target_df['LINEAGE'] = target_df['TAXID'].apply(lambda x: gt.taxid2lineage(x, True, True)).str.split('|')

    sample_ids = [sampleid]
    data = np.array(target_df['ABUNDANCE']).reshape(len(target_df), 1)
    observ_ids = target_df['TAXID']
    observ_metadata = [{'taxonomy': x} for x in target_df['LINEAGE'].tolist()]
    biom_table = Table(data, observ_ids, sample_ids, observ_metadata, table_id='GOTTCHA2')
    biom_table.to_json('GOTTCHA2', direct_io=o)

    return True
Example #24
    def test_rarefy_to_files2(self):
        """rarefy_to_files should write valid files with some metadata on otus

        """
        maker = RarefactionMaker(self.otu_table_meta_fp, 0, 1, 1, 1)
        maker.rarefy_to_files(self.rare_dir,
                              include_full=True,
                              include_lineages=False)

        fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
        with biom_open(fname, 'U') as biom_file:
            otu_table = Table.from_hdf5(biom_file)

        self.assertItemsEqual(otu_table.ids(), self.otu_table.ids()[:2])
Example #25
def make_modules_on_correlations(correlation_table: pd.DataFrame, feature_table: Table, min_r: float=.35) -> \
                                     (Table, nx.Graph, pd.Series):
    modules = ma.make_modules_naive(correlation_table, min_r=min_r)
    modules_rev = {asv: module for module, asvs in modules.items() for asv in asvs}
    for asv in feature_table.ids(axis='observation'):
        if asv not in modules_rev:
            modules_rev[asv] = None
    module_membership = pd.Series(modules_rev)
    coll_table = ma.collapse_modules(feature_table, modules)
    metadata = get_metadata_from_table(feature_table)
    metadata = ma.add_modules_to_metadata(modules, metadata)
    correlation_table_filtered = filter_correls(correlation_table, conet=True, min_r=min_r)
    net = correls_to_net(correlation_table_filtered, metadata=metadata)
    return coll_table, net, module_membership
Example #26
    def test_beta_jensenshannon(self):
        t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
        actual = beta(table=t, metric='jensenshannon')
        # expected computed with scipy.spatial.distance.jensenshannon
        expected = skbio.DistanceMatrix([[0.0000000, 0.4645014, 0.52379239],
                                         [0.4645014, 0.0000000, 0.07112939],
                                         [0.52379239, 0.07112939, 0.0000000]],
                                        ids=['S1', 'S2', 'S3'])

        self.assertEqual(actual.ids, expected.ids)
        for id1 in actual.ids:
            for id2 in actual.ids:
                npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
Example #27
    def test_beta_canberra_adkins(self):
        t = Table(np.array([[0, 0], [0, 1], [1, 2]]),
                  ['O1', 'O2', 'O3'],
                  ['S1', 'S2'])
        d = (1. / 2.) * sum([abs(0. - 1.) / (0. + 1.),
                             abs(1. - 2.) / (1. + 2.)])
        expected = skbio.DistanceMatrix(np.array([[0.0, d], [d, 0.0]]),
                                        ids=['S1', 'S2'])
        actual = beta(table=t, metric='canberra_adkins')

        self.assertEqual(actual.ids, expected.ids)
        for id1 in actual.ids:
            for id2 in actual.ids:
                npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
Example #28
 def setUp(self):
     THIS_DIR = os.path.dirname(os.path.abspath(__file__))
     tablefp = Table({}, [], [])
     self.emptyfeatures = tablefp
     goodtable = os.path.join(THIS_DIR, 'data/features_formated.biom')
     self.features = load_table(goodtable)
     goodtable = os.path.join(THIS_DIR, 'data/features2_formated.biom')
     ms2_match = os.path.join(THIS_DIR, 'data/ms2_match.txt')
     self.ms2_match = pd.read_csv(ms2_match, sep='\t', index_col=0)
     self.features2 = load_table(goodtable)
     self.goodcsi = qiime2.Artifact.load(os.path.join(THIS_DIR,
                                                      'data/csiFolder.qza'))
     self.goodcsi2 = qiime2.Artifact.load(os.path.join(
                                          THIS_DIR, 'data/csiFolder2.qza'))
Example #29
def calculate_correlations(table: Table, corr_method: str='spearman',
                           p_adjustment_method: str='fdr_bh') -> pd.DataFrame:
    # TODO: multiprocess this
    corr_method_fun = correl_methods[corr_method]
    correls = pd.DataFrame(index=['r', 'p'])
    for (val_i, id_i, _), (val_j, id_j, _) in table.iter_pairwise(axis='observation'):
        r, p = corr_method_fun(val_i, val_j)
        correls[id_i, id_j] = r, p
    correls = correls.transpose()
    correls.index = pd.MultiIndex.from_tuples(correls.index)  # Turn tuple index into actual multiindex
    if p_adjustment_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p, method=p_adjustment_method)
    correls = correls.sort_values('p')
    return correls
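Usage sketch, assuming the module-level correl_methods mapping (e.g. {'spearman': scipy.stats.spearmanr}) and the p_adjust helper that this function relies on:

import numpy as np
from biom.table import Table

t = Table(np.arange(12, dtype=float).reshape(4, 3),
          ['O1', 'O2', 'O3', 'O4'], ['S1', 'S2', 'S3'])
correls = calculate_correlations(t, corr_method='spearman')
print(correls.head())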
Example #30
    def test_combine_id_and_frequency_filters(self):
        # no filtering
        df = pd.DataFrame(
            {
                'Subject': ['subject-1', 'subject-1', 'subject-2'],
                'SampleType': ['gut', 'tongue', 'gut']
            },
            index=pd.Index(['S1', 'S2', 'S3'], name='#SampleID'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        where = "Subject='subject-1' OR Subject='subject-2'"
        actual = filter_samples(table,
                                metadata=metadata,
                                where=where,
                                min_frequency=1)
        expected = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                         ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)

        # id and frequency filters active
        df = pd.DataFrame(
            {
                'Subject': ['subject-1', 'subject-1', 'subject-2'],
                'SampleType': ['gut', 'tongue', 'gut']
            },
            index=pd.Index(['S1', 'S2', 'S3'], name='#SampleID'))
        metadata = qiime2.Metadata(df)
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        where = "Subject='subject-1'"
        actual = filter_samples(table,
                                metadata=metadata,
                                where=where,
                                min_frequency=2)
        expected = Table(np.array([[1], [1]]), ['O1', 'O2'], ['S2'])
        self.assertEqual(actual, expected)
Example #31
 def write_biom(self, sample_names, read_taxonomies, biom_file_io):
     '''Write the OTU info to a biom IO output stream
     
     Parameters
     ----------
     sample_names: list of str
         names of each sample (sample_ids for biom)
     read_taxonomies: list of dicts, as per _iterate_otu_table_rows()
     biom_file_io: io
         open writeable stream to write biom contents to
         
     Returns True if successful, else False'''
     counts = []
     observ_metadata = []
     otu_ids = []
     for otu_id, tax, count in self._iterate_otu_table_rows(
             read_taxonomies):
         if len(count) != len(sample_names):
             raise Exception(
                 "Programming error: mismatched sample names and counts")
         counts.append(count)
         observ_metadata.append({'taxonomy': tax})
         otu_ids.append(str(otu_id))
     table = Table(np.array(counts),
                   otu_ids,
                   sample_names,
                   observ_metadata, [{}] * len(sample_names),
                   table_id='GraftM Taxonomy Count Table')
     try:
         table.to_hdf5(biom_file_io, 'GraftM graft')
         return True
     except RuntimeError as e:
         logging.warning(
             "Error writing BIOM output, file not written. The specific error was: %s"
             % e)
         return False
Example #32
 def setUp(self):
     rooted_nwk = io.StringIO("((A:0.1, B:0.2)C:0.3, D:0.4, E:0.5)root;")
     self.tree = skbio.TreeNode.read(rooted_nwk)
     self.metadata = Metadata(
         pd.DataFrame(
             data=np.array([['Bacteria', '1'], ['Archea', '1']],
                           dtype=object),
             index=pd.Index(['A', 'D'], name='Feature ID'),
             columns=['kingdom', 'keep'],
         ))
     self.table = Table(data=np.array([[0, 1, 2], [2, 2, 2]]),
                        observation_ids=['A', 'D'],
                        sample_ids=['S1', 'S2', 'S3'])
     self.filtered_tree = self.tree.copy().shear(['A', 'D'])
     self.filtered_tree.prune()
Example #33
    def test_phylogenetic(self):
        t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
        tree = skbio.TreeNode.read(
            io.StringIO('((O1:0.25, O2:0.50):0.25, O3:0.75)root;'))
        actual = beta_diversity('unweighted_unifrac', t, phylogeny=tree)
        # expected computed with skbio.diversity.beta_diversity
        expected = skbio.DistanceMatrix(
            [[0.00, 0.25, 0.25], [0.25, 0.00, 0.00], [0.25, 0.00, 0.00]],
            ids=['S1', 'S2', 'S3'])

        self.assertEqual(actual.ids, expected.ids)
        for id1 in actual.ids:
            for id2 in actual.ids:
                npt.assert_almost_equal(actual[id1, id2], expected[id1, id2])
Example #34
 def setUp(self):
     THIS_DIR = os.path.dirname(os.path.abspath(__file__))
     tablefp = Table({}, [], [])
     self.emptyfeatures = tablefp
     goodtable = os.path.join(THIS_DIR, 'data/features_formated.biom')
     self.features = load_table(goodtable)
     goodtable = os.path.join(THIS_DIR, 'data/features2_formated.biom')
     self.features2 = load_table(goodtable)
     self.goodcsi = qiime2.Artifact.load(
         os.path.join(THIS_DIR, 'data/csiFolder.qza'))
     goodcsi = self.goodcsi.view(CSIDirFmt)
     self.collated = collate_fingerprint(goodcsi)
     self.goodcsi2 = qiime2.Artifact.load(
         os.path.join(THIS_DIR, 'data/csiFolder2.qza'))
     goodcsi = self.goodcsi2.view(CSIDirFmt)
     self.collated2 = collate_fingerprint(goodcsi)
Example #35
    def test_rarefy_to_files(self):
        """rarefy_to_files should write valid files

        """
        maker = RarefactionMaker(self.otu_table_fp, 0, 1, 1, 1)
        maker.rarefy_to_files(
            self.rare_dir,
            include_full=True,
            include_lineages=False)

        fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
        with biom_open(fname, 'U') as biom_file:
            otu_table = Table.from_hdf5(biom_file)

        self.assertItemsEqual(
            otu_table.sample_ids,
            self.otu_table.sample_ids[:2])
Example #36
    def test_filter_empty_features(self):
        # no filtering
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=42,
                                filter_empty_features=False)
        expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                         ['O1', 'O2'],
                         ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter one
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=4,
                                filter_empty_features=False)
        expected = Table(np.array([[0, 1], [1, 1]]),
                         ['O1', 'O2'],
                         ['S1', 'S2'])
        self.assertEqual(actual, expected)

        # filter two
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=1,
                                filter_empty_features=False)
        expected = Table(np.array([[0], [1]]),
                         ['O1', 'O2'],
                         ['S1'])
        self.assertEqual(actual, expected)

        # filter all
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=0,
                                filter_empty_features=False)
        expected = Table(np.array([[], []]), ['O1', 'O2'], [])
        self.assertEqual(actual, expected)
Example #37
def collapse_modules(table, modules):
    """collapse created modules in a biom table, members of multiple modules will be added to the smallest module"""
    table = table.copy()
    module_array = np.zeros((len(modules), table.shape[1]))

    seen = set()
    for module_, otus in modules.items():
        module_number = int(module_.split('_')[-1])
        seen = seen | set(otus)
        # sum everything in the module
        module_array[module_number] = np.sum([table.data(feature, axis="observation") for feature in otus], axis=0)

    table.filter(seen, axis='observation', invert=True)

    # make new table
    new_table_matrix = np.concatenate((table.matrix_data.toarray(), module_array))
    new_table_obs = list(table.ids(axis='observation')) + list(modules.keys())
    return Table(new_table_matrix, new_table_obs, table.ids())
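Usage sketch, assuming the module-level numpy/biom imports: module names must look like 'module_<int>', since module numbers are parsed with int(module_.split('_')[-1]).

import numpy as np
from biom.table import Table

table = Table(np.array([[1., 2.], [3., 4.], [5., 6.]]),
              ['O1', 'O2', 'O3'], ['S1', 'S2'])
modules = {'module_0': ['O1', 'O2']}
collapsed = collapse_modules(table, modules)
print(collapsed.ids(axis='observation'))  # ['O3' 'module_0']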
Example #38
def calculate_correlations(
        table: Table,
        corr_method=spearmanr,
        p_adjustment_method: str = 'fdr_bh') -> pd.DataFrame:
    # TODO: multiprocess this
    index = list()
    data = list()
    for (val_i, id_i, _), (val_j, id_j,
                           _) in table.iter_pairwise(axis='observation'):
        r, p = corr_method(val_i, val_j)
        index.append((id_i, id_j))
        data.append((r, p))
    correls = pd.DataFrame(data, index=index, columns=['r', 'p'])
    correls.index = pd.MultiIndex.from_tuples(
        correls.index)  # Turn tuple index into actual multiindex
    if p_adjustment_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p, method=p_adjustment_method)
    return correls
Example #39
    def setUp(self):

        self.qiime_config = load_qiime_config()
        self.tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'

        self.otu_table_data = np.array([[2, 1, 0],
                                        [0, 5, 0],
                                        [0, 3, 0],
                                        [1, 2, 0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{'domain': 'Archaea'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'}]

        self.otu_table = Table(self.otu_table_data,
                               self.taxon_names,
                               self.sample_names,
                               observation_metadata=[{}, {}, {}, {}],
                               sample_metadata=[{}, {}, {}])

        self.otu_table_meta = Table(self.otu_table_data,
                                    self.taxon_names, self.sample_names,
                                    observation_metadata=self.otu_metadata)

        fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
        close(fd)
        fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                             prefix='test_rarefaction',
                                             suffix='.biom')
        close(fd)

        self.rare_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='test_rarefaction_dir', suffix='')

        write_biom_table(self.otu_table, self.otu_table_fp)
        write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

        self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]
Example #40
    def test_make_otu_table_taxonomy(self):
        """make_otu_table should work with taxonomy"""
        otu_map_lines = """0	ABC_0	DEF_1
1	ABC_1
x	GHI_2	GHI_3	GHI_77
z	DEF_3	XYZ_1""".split('\n')
        taxonomy = {'0': ['Bacteria', 'Firmicutes'],
                    'x': ['Bacteria', 'Bacteroidetes']}
        obs = make_otu_table(otu_map_lines, taxonomy)

        data = [[1, 1, 0, 0], [1, 0, 0, 0], [0, 0, 3, 0], [0, 1, 0, 1]]
        obs_md = [{'taxonomy': ['Bacteria', 'Firmicutes']},
                  {'taxonomy': ['None']},
                  {'taxonomy': ['Bacteria', 'Bacteroidetes']},
                  {'taxonomy': ['None']}]
        exp = Table(data, ['0', '1', 'x', 'z'], ['ABC', 'DEF', 'GHI', 'XYZ'],
                    observation_metadata=obs_md, input_is_dense=True)

        self.assertEqual(obs, exp)
Example #41
    def test_parallel_beta(self):
        t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'],
                  ['S1', 'S2', 'S3'])
        parallel = beta(table=t, metric='braycurtis', n_jobs=-1)
        single_thread = beta(table=t, metric='braycurtis', n_jobs=1)
        # expected computed with scipy.spatial.distance.braycurtis
        expected = skbio.DistanceMatrix([[0.0000000, 0.3333333, 0.6666667],
                                         [0.3333333, 0.0000000, 0.4285714],
                                         [0.6666667, 0.4285714, 0.0000000]],
                                        ids=['S1', 'S2', 'S3'])

        self.assertEqual(parallel.ids, expected.ids)
        self.assertEqual(single_thread.ids, expected.ids)
        for id1 in parallel.ids:
            for id2 in parallel.ids:
                npt.assert_almost_equal(parallel[id1, id2], expected[id1, id2])
        for id1 in single_thread.ids:
            for id2 in single_thread.ids:
                npt.assert_almost_equal(single_thread[id1, id2], expected[id1,
                                                                          id2])
Example #42
 def test_write_biom_table(self):
     """Test functionality of write_biom_table().
     """
     table_exp = Table(np.array([[1., 1., 1., 0., 0.],
                                 [1., 0., 0., 0., 0.],
                                 [0., 0., 1., 0., 1.],
                                 [0., 0., 0., 1., 0.],
                                 [0., 0., 0., 1., 0.],
                                 [0., 0., 1., 0., 0.]]),
                       ["k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium",
                        "k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus",
                        "k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia",
                        "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Mobiluncus",
                        "k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Stenotrophomonas",
                        "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium"],
                       ["s1", "s2", "s3", "s4", "s5"])
     self.biom_output_fp = join(self.working_dir, "test_output_biom")
     write_biom_table(table_exp, self.biom_output_fp)
     table_obs = load_table(self.biom_output_fp)
     self.assertEqual(table_obs, table_exp)
Example #43
    def test_max_frequency(self):
        # no filtering
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=42)
        expected = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                         ['O1', 'O2'],
                         ['S1', 'S2', 'S3'])
        self.assertEqual(actual, expected)

        # filter one
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=4)
        expected = Table(np.array([[0, 1], [1, 1]]),
                         ['O1', 'O2'],
                         ['S1', 'S2'])
        self.assertEqual(actual, expected)

        # filter two
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=1)
        expected = Table(np.array([[1]]),
                         ['O2'],
                         ['S1'])
        self.assertEqual(actual, expected)

        # filter all
        table = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                      ['O1', 'O2'],
                      ['S1', 'S2', 'S3'])
        actual = filter_samples(table, max_frequency=0)
        expected = Table(np.array([]), [], [])
        self.assertEqual(actual, expected)
Example #44
def collapse_modules(table, modules, prefix="module"):
    """collapse created modules in a biom table, members of multiple modules will be added to the smallest module"""
    table = table.copy()
    module_array = np.zeros((len(modules), table.shape[1]))

    seen = set()
    for i, module_ in enumerate(modules):
        seen = seen | module_
        # sum everything in the module
        module_array[i] = np.sum(
            [table.data(feature, axis="observation") for feature in module_],
            axis=0)

    table.filter(seen, axis='observation', invert=True)

    # make new table
    new_table_matrix = np.concatenate(
        (table.matrix_data.toarray(), module_array))
    new_table_obs = list(table.ids(axis='observation')) + [
        '_'.join([prefix, str(i)]) for i in range(len(modules))
    ]
    return Table(new_table_matrix, new_table_obs, table.ids())
Example #45
def biom_table2():
    arr = np.array([[250, 0, 100, 446, 75], [0, 0, 1, 1, 2], [2, 2, 2, 2, 2],
                    [100, 100, 500, 1, 1000], [500, 5, 0, 50, 100]])
    obs_ids = ["otu_%s" % i for i in range(5)]
    samp_ids = ["samp_%s" % i for i in range(5)]
    obs_meta = [{
        'taxonomy':
        'k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus; s__'
    }, {
        'taxonomy':
        'k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Paenibacillaceae; g__Paenibacillus; s__'
    }, {
        'taxonomy':
        'k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Methylophilales; f__Methylophilaceae; g__; s__'
    }, {
        'taxonomy':
        'k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__[Ruminococcus]; s__'
    }, {
        'taxonomy':
        'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Microbacteriaceae; g__; s__'
    }]
    return Table(arr, obs_ids, samp_ids, observation_metadata=obs_meta)
Example #46
def calculate_correlations(table: Table, corr_method=spearmanr, p_adjust_method: str = 'fdr_bh', nprocs=1) -> \
        pd.DataFrame:
    if nprocs > multiprocessing.cpu_count():
        warnings.warn(
            "nprocs greater than CPU count, using all available CPUs")
        nprocs = multiprocessing.cpu_count()

    pool = multiprocessing.Pool(nprocs)
    cor = partial(calculate_correlation, corr_method=corr_method)
    results = pool.map(
        cor,
        pairwise_iter_wo_metadata(table.iter_pairwise(axis='observation')))
    index = [i[0] for i in results]
    data = [i[1] for i in results]
    pool.close()
    pool.join()
    correls = pd.DataFrame(data, index=index, columns=['r', 'p'])
    # Turn tuple index into actual multiindex, now guaranteeing that correls index is sorted
    correls.index = pd.MultiIndex.from_tuples(
        [sorted(i) for i in correls.index])
    if p_adjust_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p, method=p_adjust_method)
    return correls
Example #47
def generate_biom_table(seqs_fp, uc_fp, delim='_'):
    """Generate BIOM table and representative FASTA set

    Parameters
    ----------
    seqs_fp: string
        file path to deblurred sequences
    uc_fp: string
        file path to dereplicated sequences map (.uc format)
    delim: string, optional
        delimiter for splitting sample and sequence IDs in sequence label
        default: '_'

    Returns
    -------
    deblur_clusters: dictionary
        dictionary of clusters including dereplicated sequence labels
    Table: biom.table
        an instance of a BIOM table
    """
    # parse clusters in dereplicated sequences map (.uc format)
    with open(uc_fp, 'U') as uc_f:
        derep_clusters, failures, seeds = clusters_from_uc_file(uc_f)
    # parse clusters in deblur file, set observation ID to be the sequence
    deblur_clusters = parse_deblur_output(seqs_fp, derep_clusters)
    # create sparse dictionary of observation and sample ID counts
    data, otu_ids, sample_ids = generate_biom_data(deblur_clusters, delim)
    # build BIOM table
    return deblur_clusters, Table(data,
                                  otu_ids,
                                  sample_ids,
                                  observation_metadata=None,
                                  sample_metadata=None,
                                  table_id=None,
                                  generated_by="deblur",
                                  create_date=datetime.now().isoformat())
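A hedged invocation sketch; both file paths are placeholders for outputs produced earlier in the deblur workflow, not files shipped with it:

# 'deblurred.fasta' and 'derep.uc' are hypothetical paths.
clusters, table = generate_biom_table('deblurred.fasta', 'derep.uc', delim='_')
print(len(clusters))  # number of deblur clusters
print(table.shape)    # observations x samples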
Example #48
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    lower_percentage = opts.lower_percentage
    upper_percentage = opts.upper_percentage
    otu_table_fp = opts.otu_table_fp
    otu_table = load_table(otu_table_fp)
    delimiter = opts.delimiter
    mapping_fp = opts.mapping
    md_as_string = opts.md_as_string
    md_identifier = opts.md_identifier
    levels = opts.level.split(',')
    suppress_classic_table_output = opts.suppress_classic_table_output
    suppress_biom_table_output = opts.suppress_biom_table_output

    if upper_percentage is not None and lower_percentage is not None:
        raise ValueError(
            "upper_percentage and lower_percentage are mutually exclusive")

    if (upper_percentage is not None or lower_percentage is not None) and \
            mapping_fp:
        raise ValueError("upper_percentage and lower_percentage cannot be "
                         "used with a mapping file")

    if upper_percentage is not None and \
            (upper_percentage < 0 or upper_percentage > 1.0):
        raise ValueError('upper_percentage should be between 0.0 and 1.0')

    if lower_percentage is not None and \
            (lower_percentage < 0 or lower_percentage > 1.0):
        raise ValueError('lower_percentage should be between 0.0 and 1.0')

    if mapping_fp:
        mapping_file = open(mapping_fp, 'U')
        mapping, header, comments = parse_mapping_file(mapping_file)

        # use the input Mapping file for producing the output filenames
        map_dir_path, map_fname = split(mapping_fp)
        map_basename, map_fname_ext = splitext(map_fname)
    else:
        if suppress_classic_table_output and suppress_biom_table_output:
            option_parser.error("Both classic and BIOM output formats were "
                                "suppressed.")

    if not opts.absolute_abundance:
        otu_table = otu_table.norm(axis='sample', inplace=False)

    # use an output directory to allow for multiple outputs
    if opts.output_dir:
        create_dir(opts.output_dir, False)
        output_dir_path = opts.output_dir
    else:
        output_dir_path = './'

    # use the input OTU table to produce the output filenames
    dir_path, fname = split(otu_table_fp)
    basename, fname_ext = splitext(fname)

    # Iterate over the levels and generate a summarized taxonomy for each
    for level in levels:
        if mapping_fp:
            # define output filename
            output_fname = join(output_dir_path,
                                map_basename + '_L%s.txt' % (level))

            summary, tax_order = add_summary_mapping(otu_table,
                                                     mapping,
                                                     int(level),
                                                     md_as_string,
                                                     md_identifier)

            write_add_taxa_summary_mapping(summary, tax_order, mapping,
                                           header, output_fname, delimiter)
        else:
            # define the output filename. The extension will be added to the
            # end depending on the output format
            output_fname = join(output_dir_path, basename + '_L%s' % level)

            summary, header = make_summary(otu_table,
                                           int(level),
                                           upper_percentage,
                                           lower_percentage,
                                           md_as_string,
                                           md_identifier)

            sample_ids = header[1:]

            observation_ids = []
            data = []
            for row in summary:
                # Join taxonomic levels to create an observation ID.
                observation_ids.append(delimiter.join(row[0]))
                data.append(row[1:])

            table = Table(np.asarray(data), observation_ids, sample_ids)
            if opts.transposed_output:
                table = table.transpose()

            if not suppress_classic_table_output:
                with open(output_fname + '.txt', 'w') as outfile:
                    outfile.write(table.to_tsv())

            if not suppress_biom_table_output:
                write_biom_table(table, output_fname + '.biom')
Example #49
0
def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in the filepath `fp`

    Parameters
    ----------
    fp : file-like, list, or str
        File-like object, list of lines, or JSON string storing the BIOM
        table
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at fp

    Raises
    ------
    UnknownAxisError
        If `axis` is neither 'sample' nor 'observation'.

    Notes
    -----
    Subsetting the BIOM table is only supported along a single axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        raise UnknownAxisError(axis)

    try:
        return Table.from_hdf5(fp, ids=ids, axis=axis)
    except ValueError:
        pass
    except RuntimeError:
        pass
    if hasattr(fp, 'read'):
        old_pos = fp.tell()
        # Read in characters until first non-whitespace
        # If it is a {, then this is (most likely) JSON
        c = fp.read(1)
        while c.isspace():
            c = fp.read(1)
        if c == '{':
            fp.seek(old_pos)
            t = Table.from_json(json.load(fp, object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        else:
            fp.seek(old_pos)
            t = Table.from_tsv(fp, None, None, lambda x: x)
    elif isinstance(fp, list):
        try:
            t = Table.from_json(json.loads(''.join(fp),
                                           object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(fp, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(fp, object_pairs_hook=OrderedDict),
                            input_is_dense=input_is_dense)

    def subset_ids(data, id_, md):
        return id_ in ids

    def gt_zero(vals, id_, md):
        return np.any(vals)

    if ids is not None:
        t.filter(subset_ids, axis=axis)
        axis = 'observation' if axis == 'sample' else 'sample'
        t.filter(gt_zero, axis=axis)

    return t
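The final fallback above is Table.from_tsv with an identity process function. A small sketch of that branch in isolation, following the shape of biom's own from_tsv doctest:

from io import StringIO
from biom.table import Table

classic = StringIO(u"OTU ID\tS1\tS2\n"
                   u"OTU_1\t3\t0\n"
                   u"OTU_2\t1\t2")
t = Table.from_tsv(classic, None, None, lambda x: x)
print(t.ids(axis='observation'))  # the two observation ids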
Example #50
0
class TopLevelTests(TestCase):

    """Tests of top-level functions"""

    def setUp(self):
        self.otu_table_vals = array([[1, 0, 2, 4],
                                     [1, 2, 0, 1],
                                     [0, 1, 1, 0],
                                     [1, 2, 1, 0]])

        self.otu_table = Table(self.otu_table_vals,
                                       ['0', '1', '2', '3'],
                                       ['s1', 's2', 's3', 's4'],
                                       [{"taxonomy": ["Root", "Bacteria", "Actinobacteria", "Actinobacteria", "Coriobacteridae", "Coriobacteriales", "Coriobacterineae", "Coriobacteriaceae"]},
                                        {"taxonomy": ["Root",
                                                      "Bacteria",
                                                      "Firmicutes",
                                                      "\"Clostridia\""]},
                                        {"taxonomy": ["Root",
                                                      "Bacteria",
                                                      "Firmicutes",
                                                      "\"Clostridia\""]},
                                        {"taxonomy": ["Root", "Bacteria"]}],
                                        None,)

        self.mapping = """#SampleID\tBarcodeSequence\tTreatment\tDescription
#Test mapping file
s1\tAAAA\tControl\tControl mouse, I.D. 354
s2\tGGGG\tControl\tControl mouse, I.D. 355
s3\tCCCC\tExp\tDisease mouse, I.D. 356
s4\tTTTT\tExp\tDisease mouse, I.D. 357""".split('\n')

    def test_sum_counts_by_consensus(self):
        """should sum otu counts by consensus"""
        #otu_table = parse_otu_table(self.otu_table)
        #otu_table = parse_biom_table(self.otu_table)
        obs_result, obs_mapping = sum_counts_by_consensus(self.otu_table, 3)
        exp_result = {(
            'Root', 'Bacteria', 'Actinobacteria'): array([1, 0, 2, 4]),
            ('Root', 'Bacteria', 'Firmicutes'): array([1, 3, 1, 1]),
            ('Root', 'Bacteria', 'Other'): array([1, 2, 1, 0])}
        exp_mapping = {'s1': 0, 's2': 1, 's3': 2, 's4': 3}
        self.assertItemsEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

        obs_result, obs_mapping = sum_counts_by_consensus(self.otu_table, 2)
        exp_result = {('Root', 'Bacteria'): array([3, 5, 4, 5])}
        exp_mapping = {'s1': 0, 's2': 1, 's3': 2, 's4': 3}
        self.assertItemsEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

        obs_result, obs_mapping = sum_counts_by_consensus(self.otu_table, 4)
        exp_result = {('Root', 'Bacteria', 'Actinobacteria', 'Actinobacteria'):
                      array([1, 0, 2, 4]),
                      ('Root', 'Bacteria', 'Firmicutes', '"Clostridia"'):
                      array([1, 3, 1, 1]),
                      ('Root', 'Bacteria', 'Other', 'Other'): array([1, 2, 1, 0])}
        exp_mapping = {'s1': 0, 's2': 1, 's3': 2, 's4': 3}
        self.assertItemsEqual(obs_result, exp_result)
        self.assertEqual(obs_mapping, exp_mapping)

    def test_make_new_summary_file(self):
        """make_new_summary_file works
        """
        lower_percentage, upper_percentage = None, None
        #otu_table = parse_otu_table(self.otu_table, int)
        #otu_table = parse_biom_table(self.otu_table)
        summary, header = make_summary(
            self.otu_table, 3, upper_percentage, lower_percentage)
        self.assertEqual(header, ['Taxon', 's1', 's2', 's3', 's4'])
        self.assertEqual(
            summary, [[('Root', 'Bacteria', 'Actinobacteria'), 1, 0, 2, 4],
                      [('Root', 'Bacteria', 'Firmicutes'),
                       1, 3, 1, 1],
                      [('Root', 'Bacteria', 'Other'), 1, 2, 1, 0]])

        # test that works with relative abundances
        #otu_table = parse_otu_table(self.otu_table, float)
        #otu_table = parse_biom_table(self.otu_table, float)
        #otu_table = convert_otu_table_relative(otu_table)
        otu_table = self.otu_table.norm(axis='sample', inplace=False)
        summary, header = make_summary(
            otu_table, 3, upper_percentage, lower_percentage)
        self.assertEqual(header, ['Taxon', 's1', 's2', 's3', 's4'])
        self.assertEqual(summary[0][0], ('Root', 'Bacteria', 'Actinobacteria'))
        assert_almost_equal(summary[0][1:], [1.0 / 3, 0.0, 0.5, 0.8])
        self.assertEqual(summary[1][0], ('Root', 'Bacteria', 'Firmicutes'))
        assert_almost_equal(summary[1][1:], [1.0 / 3, 0.6, 0.25, 0.2])
        self.assertEqual(summary[2][0], ('Root', 'Bacteria', 'Other'))
        assert_almost_equal(summary[2][1:], [1.0 / 3, 0.4, 0.25, 0.0])

        ##
        # testing lower trimming
        lower_percentage, upper_percentage = 0.3, None
        summary, header = make_summary(
            otu_table, 3, upper_percentage, lower_percentage)
        self.assertEqual(summary[0][0], ('Root', 'Bacteria', 'Other'))
        assert_almost_equal(summary[0][1:], [1.0 / 3, 0.4, 0.25, 0.0])

        ##
        # testing upper trimming
        lower_percentage, upper_percentage = None, 0.4
        summary, header = make_summary(
            otu_table, 3, upper_percentage, lower_percentage)
        self.assertEqual(summary[0][0], ('Root', 'Bacteria', 'Actinobacteria'))
        assert_almost_equal(summary[0][1:], [1.0 / 3, 0.0, 0.5, 0.8])

    def test_add_summary_category_mapping(self):
        """make_new_summary_file works
        """
        #otu_table = parse_otu_table(self.otu_table, int)
        #otu_table = parse_biom_table(self.otu_table)
        mapping, header, comments = parse_mapping_file(self.mapping)
        summary, taxon_order = add_summary_mapping(self.otu_table, mapping, 3)
        self.assertEqual(taxon_order, [('Root', 'Bacteria', 'Actinobacteria'),
                                       ('Root', 'Bacteria', 'Firmicutes'),
                                       ('Root', 'Bacteria', 'Other')])
        self.assertEqual(summary, {'s1': [1, 1, 1],
                                   's2': [0, 3, 2],
                                   's3': [2, 1, 1],
                                   's4': [4, 1, 0]})
Example #51
0
        ind_taxonomy.append('%s%s' % (syn[levels[index]], taxon[0]['ScientificName']))

    Taxonomy[taxon[0]['ScientificName']]['taxonomy'] = ind_taxonomy

for taxon in observ_ids:
    observation_metadata.append(Taxonomy[taxon])

table = Table(data, observ_ids, sample_id, observation_metadata, sample_metadata, table_id='Example Table')
print table

out=open(args.prefix+".biom","w")
table.to_json('pplacer converted by jplace_to_biom.py v.'+VERSION, direct_io=out)
out.close()

out=open(args.prefix+".tsv","w")
out.write(table.to_tsv(header_key='taxonomy', header_value='taxonomy'))
out.close()

print "\n##### DONE! #####\n"
Example #52
0
def gibbs(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2, beta,
          source_rarefaction_depth, sink_rarefaction_depth,
          restarts, draws_per_restart, burnin, delay, cluster_start_delay,
          source_sink_column, source_column_value, sink_column_value,
          source_category_column):
    '''Gibbs sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the mapping file and biom table and remove samples which are not
    # shared.
    o = open(mapping_fp, 'U')
    sample_metadata_lines = o.readlines()
    o.close()

    sample_metadata, biom_table = \
        _cli_sync_biom_and_sample_metadata(
            parse_mapping_file(sample_metadata_lines),
            load_table(table_fp))

    # If biom table has fractional counts, it can produce problems in indexing
    # later on.
    biom_table.transform(lambda data, id, metadata: np.ceil(data))

    # If biom table has sample metadata, there will be pickling errors when
    # submitting multiple jobs. We remove the metadata by making a copy of the
    # table without metadata.
    biom_table = Table(biom_table._data.toarray(),
                       biom_table.ids(axis='observation'),
                       biom_table.ids(axis='sample'))

    # Parse the mapping file and options to get the samples requested for
    # sources and sinks.
    source_samples, sink_samples = sinks_and_sources(
        sample_metadata, column_header=source_sink_column,
        source_value=source_column_value, sink_value=sink_column_value)

    # If we have no source samples, neither normal operation nor LOO will
    # work, and we would likely get strange errors.
    if len(source_samples) == 0:
        raise ValueError('Mapping file or biom table passed contain no '
                         '`source` samples.')

    # Prepare the 'sources' matrix by collapsing the `source_samples` by their
    # metadata values.
    sources_envs, sources_data = collapse_sources(source_samples,
                                                  sample_metadata,
                                                  source_category_column,
                                                  biom_table, sort=True)

    # Rarefy data if requested.
    sources_data, biom_table = \
        subsample_sources_sinks(sources_data, sink_samples, biom_table,
                                source_rarefaction_depth,
                                sink_rarefaction_depth)

    # Build a function that requires only a single parameter -- sample -- to
    # enable parallel processing if requested.
    if loo:
        f = partial(_cli_loo_runner, source_category=source_category_column,
                    alpha1=alpha1, alpha2=alpha2, beta=beta,
                    restarts=restarts, draws_per_restart=draws_per_restart,
                    burnin=burnin, delay=delay,
                    sample_metadata=sample_metadata,
                    sources_data=sources_data, sources_envs=sources_envs,
                    biom_table=biom_table, output_dir=output_dir)
        sample_iter = source_samples
    else:
        f = partial(_cli_sink_source_prediction_runner, alpha1=alpha1,
                    alpha2=alpha2, beta=beta, restarts=restarts,
                    draws_per_restart=draws_per_restart, burnin=burnin,
                    delay=delay, sources_data=sources_data,
                    biom_table=biom_table, output_dir=output_dir)
        sample_iter = sink_samples

    if jobs > 1:
        # Launch the ipcluster and wait for it to come up.
        subprocess.Popen('ipcluster start -n %s --quiet' % jobs, shell=True)
        time.sleep(cluster_start_delay)
        c = Client()
        c[:].map(f, sample_iter, block=True)
        # Shut the cluster down. Answer taken from SO:
        # http://stackoverflow.com/questions/30930157/stopping-ipcluster-engines-ipython-parallel
        c.shutdown(hub=True)
    else:
        for sample in sample_iter:
            f(sample)

    # Format results for output.
    samples = []
    samples_data = []
    for sample_fp in glob.glob(os.path.join(output_dir, '*')):
        samples.append(sample_fp.strip().split('/')[-1].split('.txt')[0])
        samples_data.append(np.loadtxt(sample_fp, delimiter='\t'))
    mp, mps = _cli_collate_results(samples, samples_data, sources_envs)

    o = open(os.path.join(output_dir, 'mixing_proportions.txt'), 'w')
    o.writelines(mp)
    o.close()
    o = open(os.path.join(output_dir, 'mixing_proportions_stds.txt'), 'w')
    o.writelines(mps)
    o.close()
Example #53
0
class TopLevelTests(TestCase):

    """Tests of top-level functions"""

    def setUp(self):
        """define some top-level data"""

        self.otu_table_values = array([[0, 0, 9, 5, 3, 1], [1, 5, 4, 0, 3, 2], [2, 3, 1, 1, 2, 5]])
        {
            (0, 2): 9.0,
            (0, 3): 5.0,
            (0, 4): 3.0,
            (0, 5): 1.0,
            (1, 0): 1.0,
            (1, 1): 5.0,
            (1, 2): 4.0,
            (1, 4): 3.0,
            (1, 5): 2.0,
            (2, 0): 2.0,
            (2, 1): 3.0,
            (2, 2): 1.0,
            (2, 3): 1.0,
            (2, 4): 2.0,
            (2, 5): 5.0,
        }
        self.otu_table = Table(
            self.otu_table_values,
            ["OTU1", "OTU2", "OTU3"],
            ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"],
            [{"taxonomy": ["Bacteria"]}, {"taxonomy": ["Archaea"]}, {"taxonomy": ["Streptococcus"]}],
            [None, None, None, None, None, None],
        )
        self.otu_table_f = Table(
            self.otu_table_values,
            ["OTU1", "OTU2", "OTU3"],
            ["Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"],
            [
                {"taxonomy": ["1A", "1B", "1C", "Bacteria"]},
                {"taxonomy": ["2A", "2B", "2C", "Archaea"]},
                {"taxonomy": ["3A", "3B", "3C", "Streptococcus"]},
            ],
            [None, None, None, None, None, None],
        )

        self.full_lineages = [
            ["1A", "1B", "1C", "Bacteria"],
            ["2A", "2B", "2C", "Archaea"],
            ["3A", "3B", "3C", "Streptococcus"],
        ]
        self.metadata = [
            [
                ["Sample1", "NA", "A"],
                ["Sample2", "NA", "B"],
                ["Sample3", "NA", "A"],
                ["Sample4", "NA", "B"],
                ["Sample5", "NA", "A"],
                ["Sample6", "NA", "B"],
            ],
            ["SampleID", "CAT1", "CAT2"],
            [],
        ]
        self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
        fh, self.tmp_heatmap_fpath = mkstemp(prefix="test_heatmap_", suffix=".pdf")
        close(fh)

    def test_extract_metadata_column(self):
        """Extracts correct column from mapping file"""
        obs = extract_metadata_column(self.otu_table.sample_ids, self.metadata, category="CAT2")
        exp = ["A", "B", "A", "B", "A", "B"]
        self.assertEqual(obs, exp)

    def test_get_order_from_categories(self):
        """Sample indices should be clustered within each category"""
        category_labels = ["A", "B", "A", "B", "A", "B"]
        obs = get_order_from_categories(self.otu_table, category_labels)
        group_string = "".join([category_labels[i] for i in obs])
        self.assertTrue("AAABBB" == group_string or group_string == "BBBAAA")

    def test_get_order_from_tree(self):
        obs = get_order_from_tree(self.otu_table.observation_ids, self.tree_text)
        exp = [2, 0, 1]
        assert_almost_equal(obs, exp)

    def test_make_otu_labels(self):
        lineages = []
        for val, id, meta in self.otu_table.iter(axis="observation"):
            lineages.append([v for v in meta["taxonomy"]])
        obs = make_otu_labels(self.otu_table.observation_ids, lineages, n_levels=1)
        exp = ["Bacteria (OTU1)", "Archaea (OTU2)", "Streptococcus (OTU3)"]
        self.assertEqual(obs, exp)

        full_lineages = []
        for val, id, meta in self.otu_table_f.iter(axis="observation"):
            full_lineages.append([v for v in meta["taxonomy"]])
        obs = make_otu_labels(self.otu_table_f.observation_ids, full_lineages, n_levels=3)
        exp = ["1B;1C;Bacteria (OTU1)", "2B;2C;Archaea (OTU2)", "3B;3C;Streptococcus (OTU3)"]
        self.assertEqual(obs, exp)

    def test_names_to_indices(self):
        new_order = ["Sample4", "Sample2", "Sample3", "Sample6", "Sample5", "Sample1"]
        obs = names_to_indices(self.otu_table.sample_ids, new_order)
        exp = [3, 1, 2, 5, 4, 0]
        assert_almost_equal(obs, exp)

    def test_get_log_transform(self):
        obs = get_log_transform(self.otu_table)

        data = [val for val in self.otu_table.iter_data(axis="observation")]
        xform = asarray(data, dtype=float64)

        for (i, val) in enumerate(obs.iter_data(axis="observation")):
            non_zeros = argwhere(xform[i] != 0)
            xform[i, non_zeros] = log10(xform[i, non_zeros])
            assert_almost_equal(val, xform[i])

    def test_get_clusters(self):
        data = asarray([val for val in self.otu_table.iter_data(axis="observation")])
        obs = get_clusters(data, axis="row")
        self.assertTrue([0, 1, 2] == obs or obs == [1, 2, 0])
        obs = get_clusters(data, axis="column")
        exp = [2, 3, 1, 4, 0, 5]
        self.assertEqual(obs, exp)

    def test_plot_heatmap(self):
        plot_heatmap(
            self.otu_table, self.otu_table.observation_ids, self.otu_table.sample_ids, filename=self.tmp_heatmap_fpath
        )
        self.assertEqual(exists(self.tmp_heatmap_fpath), True)
        remove_files(set([self.tmp_heatmap_fpath]))
Example #54
0
def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in the filepath `fp`

    Parameters
    ----------
    fp : file-like, list, or str
        File-like object, list of lines, or JSON string storing the BIOM
        table
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at fp

    Raises
    ------
    UnknownAxisError
        If `axis` is neither 'sample' nor 'observation'.

    Notes
    -----
    Subsetting the BIOM table is only supported along a single axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        raise UnknownAxisError(axis)

    try:
        return Table.from_hdf5(fp, ids=ids, axis=axis)
    except Exception:
        # not an HDF5 table; fall through to the JSON/TSV parsing below
        pass

    if hasattr(fp, 'read'):
        old_pos = fp.tell()
        try:
            t = Table.from_json(json.load(fp), input_is_dense=input_is_dense)
        except ValueError:
            fp.seek(old_pos)
            t = Table.from_tsv(fp, None, None, lambda x: x)
    elif isinstance(fp, list):
        try:
            t = Table.from_json(json.loads(''.join(fp)),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(fp, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(fp), input_is_dense=input_is_dense)

    if ids is not None:
        f = lambda data, id_, md: id_ in ids
        t.filter(f, axis=axis)
        axis = 'observation' if axis == 'sample' else 'sample'
        f = lambda vals, id_, md: np.any(vals)
        t.filter(f, axis=axis)

    return t
Example #55
0
class FunctionTests(TestCase):

    def setUp(self):
        self.tmp_dir = get_qiime_temp_dir()

        self.otu_table_data = np.array([[2, 1, 0],
                                        [0, 5, 0],
                                        [0, 3, 0],
                                        [1, 2, 0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{'domain': 'Archaea'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'}]

        self.otu_table = Table(self.otu_table_data,
                               self.taxon_names,
                               self.sample_names)

        self.otu_table_meta = Table(self.otu_table_data,
                                    self.taxon_names, self.sample_names,
                                    observation_metadata=self.otu_metadata)

        fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
        close(fd)
        fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                             prefix='test_rarefaction',
                                             suffix='.biom')
        close(fd)

        self.rare_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='test_rarefaction_dir', suffix='')

        write_biom_table(self.otu_table, self.otu_table_fp)
        write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

        self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]

    def tearDown(self):
        """ cleanup temporary files """
        map(remove, self._paths_to_clean_up)
        for d in self._dirs_to_clean_up:
            if os.path.exists(d):
                rmtree(d)

    def test_rarefy_to_list(self):
        """rarefy_to_list should rarefy correctly, same names

        """
        maker = RarefactionMaker(self.otu_table_fp, 0, 1, 1, 1)
        res = maker.rarefy_to_list(include_full=True)
        self.assertItemsEqual(res[-1][2].ids(), self.otu_table.ids())
        self.assertItemsEqual(
            res[-1][2].ids(axis='observation'),
            self.otu_table.ids(axis='observation'))
        self.assertEqual(res[-1][2], self.otu_table)

        sample_value_sum = []
        for val in res[1][2].iter_data(axis='sample'):
            sample_value_sum.append(val.sum())
        npt.assert_almost_equal(sample_value_sum, [1.0, 1.0])

    def test_rarefy_to_files(self):
        """rarefy_to_files should write valid files

        """
        maker = RarefactionMaker(self.otu_table_fp, 1, 2, 1, 1)
        maker.rarefy_to_files(
            self.rare_dir,
            include_full=True,
            include_lineages=False)

        fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
        otu_table = load_table(fname)

        self.assertItemsEqual(
            otu_table.ids(),
            self.otu_table.ids()[:2])
        # third sample had 0 seqs, so it's gone

    def test_rarefy_to_files2(self):
        """rarefy_to_files should write valid files with some metadata on otus

        """
        maker = RarefactionMaker(self.otu_table_meta_fp, 1, 2, 1, 1)
        maker.rarefy_to_files(
            self.rare_dir,
            include_full=True,
            include_lineages=False)

        fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
        otu_table = load_table(fname)

        self.assertItemsEqual(
            otu_table.ids(),
            self.otu_table.ids()[:2])
        # third sample had 0 seqs, so it's gone

    def test_get_empty_rare(self):
        """get_rare_data should be empty when depth > # seqs in any sample"""
        self.assertRaises(TableException, get_rare_data, self.otu_table,
                          50, include_small_samples=False)

    def test_get_overfull_rare(self):
        """get_rare_data should be identical to given in this case

        here, rare depth > any sample, and include_small... = True"""
        rare_otu_table = get_rare_data(self.otu_table,
                                       50, include_small_samples=True)
        self.assertEqual(len(rare_otu_table.ids()), 3)
        # 4 observations times 3 samples = size 12 before
        self.assertEqual(len(rare_otu_table.ids(axis='observation')), 4)
        for sam in self.otu_table.ids():
            for otu in self.otu_table.ids(axis='observation'):
                rare_val = rare_otu_table.get_value_by_ids(otu, sam)
                self.assertEqual(rare_val,
                                 self.otu_table.get_value_by_ids(otu, sam))

    def test_get_11depth_rare(self):
        """get_rare_data should get only sample X

        """
        rare_otu_table = get_rare_data(self.otu_table,
                                       11, include_small_samples=False)
        self.assertEqual(rare_otu_table.ids(), ('X',))

        # a very complicated way to test things
        rare_values = [val[0]
                       for (val, otu_id, meta) in rare_otu_table.iter(axis='observation')]
        self.assertEqual(rare_values, [1.0, 5.0, 3.0, 2.0])
Example #56
0
class TopLevelTests(TestCase):

    """Tests of top-level functions"""

    def setUp(self):
        """define some top-level data"""

        self.otu_table_values = array([[0, 0, 9, 5, 3, 1],
                                       [1, 5, 4, 0, 3, 2],
                                       [2, 3, 1, 1, 2, 5]])
        {(0, 2): 9.0, (0, 3): 5.0, (0, 4): 3.0, (0, 5): 1.0,
         (1, 0): 1.0, (1, 1): 5.0, (1, 2): 4.0, (1, 4): 3.0, (1, 5): 2.0,
         (2, 0): 2.0, (2, 1): 3.0, (2, 2): 1.0, (2, 3): 1.0, (2, 4): 2.0, (2, 5): 5.0}
        self.otu_table = Table(self.otu_table_values,
                                       ['OTU1', 'OTU2', 'OTU3'],
                                       ['Sample1', 'Sample2', 'Sample3',
                                        'Sample4', 'Sample5', 'Sample6'],
                                       [{"taxonomy": ['Bacteria']},
                                        {"taxonomy": ['Archaea']},
                                        {"taxonomy": ['Streptococcus']}],
                                        [None, None, None, None, None, None])
        self.otu_table_f = Table(self.otu_table_values,
                                         ['OTU1', 'OTU2', 'OTU3'],
                                         ['Sample1', 'Sample2', 'Sample3',
                                          'Sample4', 'Sample5', 'Sample6'],
                                         [{"taxonomy": ['1A', '1B', '1C', 'Bacteria']},
                                          {"taxonomy":
                                           ['2A', '2B', '2C', 'Archaea']},
                                          {"taxonomy": ['3A', '3B', '3C', 'Streptococcus']}],
                                          [None, None, None, None, None, None])

        self.full_lineages = [['1A', '1B', '1C', 'Bacteria'],
                              ['2A', '2B', '2C', 'Archaea'],
                              ['3A', '3B', '3C', 'Streptococcus']]
        self.metadata = [[['Sample1', 'NA', 'A'],
                          ['Sample2', 'NA', 'B'],
                          ['Sample3', 'NA', 'A'],
                          ['Sample4', 'NA', 'B'],
                          ['Sample5', 'NA', 'A'],
                          ['Sample6', 'NA', 'B']],
                         ['SampleID', 'CAT1', 'CAT2'], []]
        self.tree_text = ["('OTU3',('OTU1','OTU2'))"]
        fh, self.tmp_heatmap_fpath = mkstemp(prefix='test_heatmap_',
                                            suffix='.pdf')
        close(fh)

    def test_extract_metadata_column(self):
        """Extracts correct column from mapping file"""
        obs = extract_metadata_column(self.otu_table.ids(),
                                      self.metadata, category='CAT2')
        exp = ['A', 'B', 'A', 'B', 'A', 'B']
        self.assertEqual(obs, exp)

    def test_get_order_from_categories(self):
        """Sample indices should be clustered within each category"""
        category_labels = ['A', 'B', 'A', 'B', 'A', 'B']
        obs = get_order_from_categories(self.otu_table, category_labels)
        group_string = "".join([category_labels[i] for i in obs])
        self.assertTrue("AAABBB" == group_string or group_string == "BBBAAA")

    def test_get_order_from_tree(self):
        obs = get_order_from_tree(
            self.otu_table.ids(axis='observation'),
            self.tree_text)
        exp = [2, 0, 1]
        assert_almost_equal(obs, exp)

    def test_make_otu_labels(self):
        lineages = []
        for val, id, meta in self.otu_table.iter(axis='observation'):
            lineages.append([v for v in meta['taxonomy']])
        obs = make_otu_labels(self.otu_table.ids(axis='observation'),
                              lineages, n_levels=1)
        exp = ['Bacteria (OTU1)', 'Archaea (OTU2)', 'Streptococcus (OTU3)']
        self.assertEqual(obs, exp)

        full_lineages = []
        for val, id, meta in self.otu_table_f.iter(axis='observation'):
            full_lineages.append([v for v in meta['taxonomy']])
        obs = make_otu_labels(self.otu_table_f.ids(axis='observation'),
                              full_lineages, n_levels=3)
        exp = ['1B;1C;Bacteria (OTU1)',
               '2B;2C;Archaea (OTU2)',
               '3B;3C;Streptococcus (OTU3)']
        self.assertEqual(obs, exp)

    def test_names_to_indices(self):
        new_order = ['Sample4', 'Sample2', 'Sample3',
                     'Sample6', 'Sample5', 'Sample1']
        obs = names_to_indices(self.otu_table.ids(), new_order)
        exp = [3, 1, 2, 5, 4, 0]
        assert_almost_equal(obs, exp)

    def test_get_log_transform(self):
        obs = get_log_transform(self.otu_table)

        data = [val for val in self.otu_table.iter_data(axis='observation')]
        xform = asarray(data, dtype=float64)

        for (i, val) in enumerate(obs.iter_data(axis='observation')):
            non_zeros = argwhere(xform[i] != 0)
            xform[i, non_zeros] = log10(xform[i, non_zeros])
            assert_almost_equal(val, xform[i])

    def test_get_clusters(self):
        data = asarray([val for val in self.otu_table.iter_data(axis='observation')])
        obs = get_clusters(data, axis='row')
        self.assertTrue([0, 1, 2] == obs or obs == [1, 2, 0])
        obs = get_clusters(data, axis='column')
        exp = [2, 3, 1, 4, 0, 5]
        self.assertEqual(obs, exp)

    def test_plot_heatmap(self):
        plot_heatmap(
            self.otu_table, self.otu_table.ids(axis='observation'),
            self.otu_table.ids(), filename=self.tmp_heatmap_fpath)
        self.assertEqual(exists(self.tmp_heatmap_fpath), True)
        remove_files(set([self.tmp_heatmap_fpath]))