def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    """Build an HDF5 BIOM-format OTU table from a classic dense OTU table.

    The classic table is tab-delimited with sample IDs in the first row and
    OTU IDs in the first column.  The 'biom convert' command fails to
    recognize some OTU tables (so the classic2biom method fails); this
    re-implements the conversion directly.

    Parameters
    ----------
    OTU_table_classic : str
        Path to the input classic-format (TSV) OTU table.
    OTU_table_biom : str
        Path to the output BIOM (HDF5) file.
    dataset_ID : str
        Identifier stored as the BIOM ``table_id``.
    """
    with open(OTU_table_classic, 'r') as fidin:
        otu_table_data = fidin.readlines()

    # Header row: first cell is a corner label; the rest are sample IDs.
    # Strip the line ending once instead of only fixing the last label.
    sample_labels = otu_table_data[0].rstrip('\n').split('\t')[1:]

    # Remaining rows: first cell of each is the OTU ID.
    OTU_labels = [line.split('\t')[0] for line in otu_table_data[1:]]
    nOTUs = len(OTU_labels)
    nSamples = len(sample_labels)

    # Load counts in row-major order.  rstrip the line ending so the last
    # field is a clean numeric string before float conversion.
    OTU_table_data = np.zeros((nOTUs, nSamples))
    for i in range(1, nOTUs + 1):
        OTU_table_data[i - 1, :] = otu_table_data[i].rstrip('\n').split('\t')[1:]

    # Write in BIOM (HDF5) format.
    t = Table(OTU_table_data, OTU_labels, sample_labels,
              observ_metadata=None, sample_metadata=None,
              table_id=dataset_ID)
    with biom_open(OTU_table_biom, 'w') as f:
        t.to_hdf5(f, "Generated by processing layer", compress=False)
def write_biom(self, sample_names, read_taxonomies, biom_file_io):
    '''Write the OTU info to a biom IO output stream.

    Parameters
    ----------
    sample_names:
        String names of each sample (sample_ids for biom)
    read_taxonomies:
        Array of hashes as per _iterate_otu_table_rows()
    biom_file_io:
        io open writeable stream to write biom contents to

    Returns
    -------
    True if successful, else False
    '''
    counts = []
    observ_metadata = []
    otu_ids = []
    for otu_id, tax, count in self._iterate_otu_table_rows(read_taxonomies):
        # Every row must provide exactly one count per sample.
        if len(count) != len(sample_names):
            raise Exception("Programming error: mismatched sample names and counts")
        counts.append(count)
        observ_metadata.append({'taxonomy': tax})
        otu_ids.append(str(otu_id))

    if len(counts) == 0:
        # Nothing classified: an empty BIOM table is not useful, and this
        # is not an error condition, so report success.
        logging.info("Not writing BIOM file since no sequences were assigned taxonomy")
        return True

    table = Table(np.array(counts),
                  otu_ids,
                  sample_names,
                  observ_metadata,
                  [{}] * len(sample_names),
                  table_id='GraftM Taxonomy Count Table')
    try:
        table.to_hdf5(biom_file_io, 'GraftM graft')
        return True
    except RuntimeError as e:
        # logging.warn() is deprecated; logging.warning() is the supported
        # spelling.  Lazy %-args avoid formatting when the level is filtered.
        logging.warning(
            "Error writing BIOM output, file not written. The specific error was: %s", e)
        return False
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    """Convert a classic dense OTU table to HDF5 BIOM format.

    The input is tab-delimited with sample IDs across the first row and
    OTU IDs down the first column.  This sidesteps 'biom convert', which
    fails to recognize some OTU tables (and so classic2biom fails).
    """
    with open(OTU_table_classic, 'r') as fidin:
        rows = fidin.readlines()

    # Sample labels come from the header row; only the final cell carries
    # the line's trailing newline, so strip it there.
    header_cells = rows[0].split('\t')
    sample_labels = header_cells[1:]
    sample_labels[-1] = sample_labels[-1].rstrip('\n')

    # OTU labels are the first cell of every non-header row.
    OTU_labels = [row.split('\t')[0] for row in rows[1:]]

    # Fill the dense count matrix one observation (row) at a time.
    counts = np.zeros((len(OTU_labels), len(sample_labels)))
    for idx, row in enumerate(rows[1:]):
        counts[idx, :] = row.split('\t')[1:]

    # Emit the table as HDF5 BIOM.
    biom_table = Table(counts, OTU_labels, sample_labels,
                       observ_metadata=None, sample_metadata=None,
                       table_id=dataset_ID)
    with biom_open(OTU_table_biom, 'w') as f:
        biom_table.to_hdf5(f, "Generated by processing layer", compress=False)
def write_biom(self, sample_names, read_taxonomies, biom_file_io):
    '''Write the OTU info to a biom IO output stream.

    Parameters
    ----------
    sample_names:
        String names of each sample (sample_ids for biom)
    read_taxonomies:
        Array of hashes as per _iterate_otu_table_rows()
    biom_file_io:
        io open writeable stream to write biom contents to

    Returns
    -------
    True if successful, else False
    '''
    counts = []
    observ_metadata = []
    otu_ids = []
    for otu_id, tax, count in self._iterate_otu_table_rows(read_taxonomies):
        # Every row must supply one count per sample or the table is corrupt.
        if len(count) != len(sample_names):
            raise Exception(
                "Programming error: mismatched sample names and counts")
        counts.append(count)
        observ_metadata.append({'taxonomy': tax})
        otu_ids.append(str(otu_id))

    if len(counts) == 0:
        # No taxonomy assignments: skip writing rather than emit an empty
        # table; treated as success.
        logging.info(
            "Not writing BIOM file since no sequences were assigned taxonomy"
        )
        return True

    table = Table(np.array(counts),
                  otu_ids,
                  sample_names,
                  observ_metadata,
                  [{}] * len(sample_names),
                  table_id='GraftM Taxonomy Count Table')
    try:
        table.to_hdf5(biom_file_io, 'GraftM graft')
        return True
    except RuntimeError as e:
        # Fixed: logging.warn() is deprecated in favour of logging.warning().
        logging.warning(
            "Error writing BIOM output, file not written. The specific error was: %s",
            e)
        return False
# Build a random sparse matrix at the requested density, add it to an
# existing sparse table, and write the result out as an HDF5 BIOM table.
# NOTE(review): density, m, n, mn, sparse_mat, table, base_density and
# accumulate_random_choice are defined outside this chunk — presumably
# density is a fill fraction and mn == m * n; verify against the full script.
k = int(density * m * n)  # number of nonzero entries to generate
print("Starting density {}...".format(density))
#new_matrix = sparse.random(*dims, format='csr')
# Draw k distinct flat indices, then decompose each into (row i, col j)
# for a column-major layout: index = j * m + i.
ind = accumulate_random_choice(mn, k)
j = np.floor(ind * 1. / m).astype(np.int64, copy=False)
i = (ind - j * m).astype(np.int64, copy=False)
vals = np.random.rand(k)  # uniform [0, 1) values for the nonzeros
# Assemble in COO (natural for triplet data), then convert to CSR for
# efficient arithmetic below.
new_matrix = coo_matrix((vals, (i, j)), shape=(m, n)).asformat('csr', copy=False)
print("Finished.")
print("Adding sparse tables...")
# Superimpose the new random entries onto the existing sparse table.
new_matrix = new_matrix + sparse_mat
print("Finished.")
print("Making BIOM table...")
# Reuse observation/sample IDs from the existing BIOM table.
new_table = Table(new_matrix, table.ids('observation'), table.ids('sample'), table_id="Density {} table".format(density))
#matrix_file = '/home/garmstro/faith_pd/large-data/ind_density-{}.p'.format(density)
# Hard-coded output path; the filename records the cumulative density.
matrix_file = '/home/garmstro/faith_pd/large-data/large_table_density-{}.biom'.format(
    density + base_density)
print("Finished.")
print("Writing...")
with biom_open(matrix_file, 'w') as fp:
    new_table.to_hdf5(fp, "density {}".format(density + base_density))
# Free the large intermediates before the next iteration's allocation.
del new_table
del new_matrix
#with open(matrix_file, 'wb') as fp:
#    pickle.dump(new_matrix, fp)
print("Finished.")