Example #1
0
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    """Convert a classic dense OTU table to an HDF5 BIOM file.

    Parameters
    ----------
    OTU_table_classic : str
        Path to a tab-delimited OTU table: sample IDs in the first row,
        OTU IDs in the first column, counts in the remaining cells.
    OTU_table_biom : str
        Output path for the BIOM (HDF5) file.
    dataset_ID : str
        Stored as the BIOM ``table_id``.

    Notes
    -----
    The 'biom convert' command fails to recognize some OTU tables, so the
    classic2biom method cannot always be used; this builds the Table
    directly instead.
    """
    with open(OTU_table_classic, 'r') as fidin:
        # splitlines() removes '\n' AND '\r\n' from every row, unlike the
        # previous rstrip('\n') on just the last sample label (which left a
        # stray '\r' behind for CRLF files).
        rows = fidin.read().splitlines()
    header = rows[0].split('\t')
    sample_labels = header[1:]
    OTU_labels = [row.split('\t')[0] for row in rows[1:]]
    # Dense counts in row-major order: one row per OTU, one column per sample.
    OTU_table_data = np.zeros((len(OTU_labels), len(sample_labels)))
    for r, row in enumerate(rows[1:]):
        # numpy coerces the string fields to float on assignment.
        OTU_table_data[r, :] = row.split('\t')[1:]
    # Write in BIOM format.
    t = Table(OTU_table_data,
              OTU_labels,
              sample_labels,
              observ_metadata=None,
              sample_metadata=None,
              table_id=dataset_ID)
    with biom_open(OTU_table_biom, 'w') as f:
        t.to_hdf5(f, "Generated by processing layer", compress=False)
Example #2
0
 def write_biom(self, sample_names, read_taxonomies, biom_file_io):
     '''Write the OTU info to a biom IO output stream.

     Parameters
     ----------
     sample_names: list of str
         names of each sample (sample_ids for biom)
     read_taxonomies: Array of hashes as per _iterate_otu_table_rows()
     biom_file_io: io
         open writeable stream to write biom contents to

     Returns True if successful, else False'''
     counts = []
     observ_metadata = []
     otu_ids = []
     for otu_id, tax, count in self._iterate_otu_table_rows(read_taxonomies):
         # Every OTU row must supply exactly one count per sample.
         if len(count) != len(sample_names):
             raise Exception("Programming error: mismatched sample names and counts")
         counts.append(count)
         observ_metadata.append({'taxonomy': tax})
         otu_ids.append(str(otu_id))
     if not counts:
         # Nothing was assigned taxonomy; skipping the file is not an error.
         logging.info("Not writing BIOM file since no sequences were assigned taxonomy")
         return True
     table = Table(np.array(counts),
                   otu_ids, sample_names, observ_metadata,
                   [{}] * len(sample_names), table_id='GraftM Taxonomy Count Table')
     try:
         table.to_hdf5(biom_file_io, 'GraftM graft')
         return True
     except RuntimeError as e:
         # logging.warn is deprecated; logging.warning is the supported name.
         logging.warning("Error writing BIOM output, file not written. The specific error was: %s" % e)
         return False
Example #3
0
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    """Convert a classic dense OTU table to an HDF5 BIOM file.

    Parameters
    ----------
    OTU_table_classic : str
        Path to a tab-delimited OTU table: sample IDs in the first row,
        OTU IDs in the first column, counts in the remaining cells.
    OTU_table_biom : str
        Output path for the BIOM (HDF5) file.
    dataset_ID : str
        Stored as the BIOM ``table_id``.

    Notes
    -----
    The 'biom convert' command fails to recognize some OTU tables, so the
    classic2biom method cannot always be used; this builds the Table
    directly instead.
    """
    with open(OTU_table_classic, 'r') as fidin:
        # splitlines() strips '\n' AND '\r\n' from every line; the previous
        # rstrip('\n') on only the last sample label left a '\r' behind on
        # CRLF input and left trailing newlines on every data row.
        lines = fidin.read().splitlines()
    sample_labels = lines[0].split('\t')[1:]
    OTU_labels = [line.split('\t')[0] for line in lines[1:]]
    # Load OTU table in row-major order (rows = OTUs, columns = samples).
    OTU_table_data = np.zeros((len(OTU_labels), len(sample_labels)))
    for r, line in enumerate(lines[1:]):
        # numpy coerces the string fields to float on assignment.
        OTU_table_data[r, :] = line.split('\t')[1:]
    # Write in BIOM format.
    t = Table(OTU_table_data, OTU_labels, sample_labels, observ_metadata=None, sample_metadata=None, table_id=dataset_ID)
    with biom_open(OTU_table_biom, 'w') as f:
        t.to_hdf5(f, "Generated by processing layer", compress=False)
Example #4
0
 def write_biom(self, sample_names, read_taxonomies, biom_file_io):
     '''Write the OTU info to a biom IO output stream.

     Parameters
     ----------
     sample_names: list of str
         names of each sample (sample_ids for biom)
     read_taxonomies: Array of hashes as per _iterate_otu_table_rows()
     biom_file_io: io
         open writeable stream to write biom contents to

     Returns True if successful, else False'''
     counts = []
     observ_metadata = []
     otu_ids = []
     for otu_id, tax, count in self._iterate_otu_table_rows(
             read_taxonomies):
         # Every OTU row must supply exactly one count per sample.
         if len(count) != len(sample_names):
             raise Exception(
                 "Programming error: mismatched sample names and counts")
         counts.append(count)
         observ_metadata.append({'taxonomy': tax})
         otu_ids.append(str(otu_id))
     if not counts:
         # Nothing was assigned taxonomy; skipping the file is not an error.
         logging.info(
             "Not writing BIOM file since no sequences were assigned taxonomy"
         )
         return True
     table = Table(np.array(counts),
                   otu_ids,
                   sample_names,
                   observ_metadata, [{}] * len(sample_names),
                   table_id='GraftM Taxonomy Count Table')
     try:
         table.to_hdf5(biom_file_io, 'GraftM graft')
         return True
     except RuntimeError as e:
         # logging.warn is deprecated; logging.warning is the supported name.
         logging.warning(
             "Error writing BIOM output, file not written. The specific error was: %s"
             % e)
         return False
    # NOTE(review): fragment of a larger function — `density`, `m`, `n`, `mn`,
    # `sparse_mat`, `table`, `base_density`, and `accumulate_random_choice`
    # are all defined outside this view; do not modify without the full body.
    # k = number of new nonzero entries to inject for the target density
    # (assumes `m`, `n` are the matrix dimensions — TODO confirm).
    k = int(density * m * n)
    print("Starting density {}...".format(density))
    #new_matrix = sparse.random(*dims, format='csr')
    # Draw k flat indices, then decode them as column-major (Fortran-order)
    # coordinates: ind = j*m + i, so j = floor(ind/m) and i = ind - j*m.
    ind = accumulate_random_choice(mn, k)
    j = np.floor(ind * 1. / m).astype(np.int64, copy=False)
    i = (ind - j * m).astype(np.int64, copy=False)
    vals = np.random.rand(k)
    # Build the random sparse matrix in COO form, then convert to CSR for
    # the addition below.
    new_matrix = coo_matrix((vals, (i, j)), shape=(m, n)).asformat('csr',
                                                                   copy=False)
    print("Finished.")
    print("Adding sparse tables...")
    # Densify the existing table by adding the random entries on top.
    new_matrix = new_matrix + sparse_mat
    print("Finished.")
    print("Making BIOM table...")
    # Reuse the observation/sample ids of the source BIOM table.
    new_table = Table(new_matrix,
                      table.ids('observation'),
                      table.ids('sample'),
                      table_id="Density {} table".format(density))
    #matrix_file = '/home/garmstro/faith_pd/large-data/ind_density-{}.p'.format(density)
    # Hard-coded output path (presumably a one-off analysis script).
    matrix_file = '/home/garmstro/faith_pd/large-data/large_table_density-{}.biom'.format(
        density + base_density)
    print("Finished.")
    print("Writing...")
    with biom_open(matrix_file, 'w') as fp:
        new_table.to_hdf5(fp, "density {}".format(density + base_density))
    # Free the large objects before the next iteration to limit peak memory.
    del new_table
    del new_matrix
    #with open(matrix_file, 'wb') as fp:
    #    pickle.dump(new_matrix, fp)
    print("Finished.")