def test_write_dataset_datachunkiterator_data_and_time(self):
    a = np.arange(30).reshape(5, 2, 3)
    aiter = iter(a)
    daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
    tstamps = np.arange(5)
    tsiter = DataChunkIterator.from_iterable(tstamps)
    ts = TimeSeries('ts_name', daiter, 'A', timestamps=tsiter)
    self.nwbfile.add_acquisition(ts)
    with NWBHDF5IO(self.path, 'w') as io:
        io.write(self.nwbfile, cache_spec=False)
    with File(self.path, 'r') as f:
        dset = f['/acquisition/ts_name/data']
        self.assertListEqual(dset[:].tolist(), a.tolist())
def test_write_dataset_iterable_multidimensional_array(self):
    a = np.arange(30).reshape(5, 2, 3)
    aiter = iter(a)
    daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
    self.io.write_dataset(self.f, DatasetBuilder('test_dataset', daiter, attributes={}))
    dset = self.f['test_dataset']
    self.assertListEqual(dset[:].tolist(), a.tolist())
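# The two tests below wrap the DataChunkIterator in H5DataIO so that h5py
# dataset-creation options (gzip compression, byte shuffle, Fletcher32
# checksums) are passed through when the iterator is written out.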
def test_write_dataset_iterable_multidimensional_array_compression(self):
    a = np.arange(30).reshape(5, 2, 3)
    aiter = iter(a)
    daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
    wrapped_daiter = H5DataIO(data=daiter,
                              compression='gzip',
                              compression_opts=5,
                              shuffle=True,
                              fletcher32=True)
    self.io.write_dataset(self.f, DatasetBuilder('test_dataset', wrapped_daiter, attributes={}))
    dset = self.f['test_dataset']
    self.assertEqual(dset.shape, a.shape)
    self.assertListEqual(dset[:].tolist(), a.tolist())
    self.assertEqual(dset.compression, 'gzip')
    self.assertEqual(dset.compression_opts, 5)
    self.assertEqual(dset.shuffle, True)
    self.assertEqual(dset.fletcher32, True)
def test_write_dataset_datachunkiterator_with_compression(self):
    a = np.arange(30).reshape(5, 2, 3)
    aiter = iter(a)
    daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
    wrapped_daiter = H5DataIO(data=daiter,
                              compression='gzip',
                              compression_opts=5,
                              shuffle=True,
                              fletcher32=True)
    ts = TimeSeries(name='ts_name', data=wrapped_daiter, unit='A', timestamps=np.arange(5.))
    self.nwbfile.add_acquisition(ts)
    with NWBHDF5IO(self.path, 'w') as io:
        io.write(self.nwbfile, cache_spec=False)
    with File(self.path, 'r') as f:
        dset = f['/acquisition/ts_name/data']
        self.assertEqual(dset.shape, a.shape)
        self.assertListEqual(dset[:].tolist(), a.tolist())
        self.assertEqual(dset.compression, 'gzip')
        self.assertEqual(dset.compression_opts, 5)
        self.assertEqual(dset.shuffle, True)
        self.assertEqual(dset.fletcher32, True)
        embeddings[target_indices] = emb_file['embedding'][to_get]
    finally:
        emb_file.close()

    h5path = args.out

    print("reading %d Fasta files" % len(fapaths))
    print("Total size:", sum(os.path.getsize(f) for f in fapaths))

    if args.faa:
        seqit = AASeqIterator(fapaths, verbose=True)
    else:
        seqit = DNASeqIterator(fapaths, verbose=True)

    packed = DataChunkIterator.from_iterable(iter(seqit), maxshape=(None,),
                                             buffer_size=2**15, dtype=np.dtype('uint8'))
    seqindex = DataChunkIterator.from_iterable(seqit.index_iter, maxshape=(None,),
                                               buffer_size=2**0, dtype=np.dtype('int'))
    names = DataChunkIterator.from_iterable(seqit.names_iter, maxshape=(None,),
                                            buffer_size=2**0, dtype=np.dtype('U'))
    ids = DataChunkIterator.from_iterable(seqit.id_iter, maxshape=(None,),
                                          buffer_size=2**0, dtype=np.dtype('int'))
    taxa = DataChunkIterator.from_iterable(seqit.taxon_iter, maxshape=(None,),
                                           buffer_size=2**0, dtype=np.dtype('uint16'))
def prepare_data(argv=None):
    '''Aggregate sequence data from GTDB using a file-of-files'''
    import argparse
    import io
    import os
    import sys
    import logging
    import h5py
    import numpy as np
    import pandas as pd
    from skbio import TreeNode
    from hdmf.common import get_hdf5io
    from hdmf.data_utils import DataChunkIterator

    from ..utils import get_faa_path, get_fna_path, get_genomic_path
    from exabiome.sequence.convert import (AASeqIterator, AAVocabIterator,
                                           DNASeqIterator, DNAVocabIterator,
                                           DNAVocabGeneIterator)
    from exabiome.sequence.dna_table import (AATable, DNATable, SequenceTable,
                                             TaxaTable, DeepIndexFile,
                                             NewickString, CondensedDistanceMatrix)

    parser = argparse.ArgumentParser()
    parser.add_argument('accessions', type=str,
                        help='file of the NCBI accessions of the genomes to convert')
    parser.add_argument('fadir', type=str,
                        help='directory with NCBI sequence files')
    parser.add_argument('metadata', type=str, help='metadata file from GTDB')
    parser.add_argument('tree', type=str, help='the tree file in Newick format')
    parser.add_argument('out', type=str, help='output HDF5')
    grp = parser.add_mutually_exclusive_group()
    parser.add_argument('-e', '--emb', type=str, help='embedding file', default=None)
    grp.add_argument('-p', '--protein', action='store_true', default=False,
                     help='get paths for protein files')
    grp.add_argument('-c', '--cds', action='store_true', default=False,
                     help='get paths for CDS files')
    grp.add_argument('-g', '--genomic', action='store_true', default=False,
                     help='get paths for genomic files (default)')
    parser.add_argument('-D', '--dist_h5', type=str, help='the distances file', default=None)
    parser.add_argument('-d', '--max_deg', type=float, default=None,
                        help='max number of degenerate characters in protein sequences')
    parser.add_argument('-l', '--min_len', type=float, default=None,
                        help='min length of sequences')
    parser.add_argument('-V', '--vocab', action='store_true', default=False,
                        help='store sequences as vocabulary data')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(args=argv)

    if not any([args.protein, args.cds, args.genomic]):
        args.genomic = True

    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format='%(asctime)s - %(message)s')
    logger = logging.getLogger()

    # read accessions
    logger.info('reading accessions %s' % args.accessions)
    with open(args.accessions, 'r') as f:
        taxa_ids = [l[:-1] for l in f.readlines()]

    # get paths to Fasta files
    fa_path_func = get_genomic_path
    if args.cds:
        fa_path_func = get_fna_path
    elif args.protein:
        fa_path_func = get_faa_path
    fapaths = [fa_path_func(acc, args.fadir) for acc in taxa_ids]

    di_kwargs = dict()
    # if a distance matrix file has been given, read and select relevant distances
    if args.dist_h5:
        #############################
        # read and filter distances
        #############################
        logger.info('reading distances from %s' % args.dist_h5)
        with h5py.File(args.dist_h5, 'r') as f:
            dist = f['distances'][:]
            dist_taxa = f['leaf_names'][:].astype('U')
        logger.info('selecting distances for taxa found in %s' % args.accessions)
        dist = select_distances(taxa_ids, dist_taxa, dist)
        dist = CondensedDistanceMatrix('distances', data=dist)
        di_kwargs['distances'] = dist

    #############################
    # read and filter taxonomies
    #############################
    logger.info('reading taxonomies from %s' % args.metadata)
    taxlevels = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    def func(row):
        dat = dict(zip(taxlevels, row['gtdb_taxonomy'].split(';')))
        dat['species'] = dat['species'].split(' ')[1]
        dat['gtdb_genome_representative'] = row['gtdb_genome_representative'][3:]
        dat['accession'] = row['accession'][3:]
        return pd.Series(data=dat)

    logger.info('selecting GTDB taxonomy for taxa found in %s' % args.accessions)
    taxdf = pd.read_csv(args.metadata, header=0, sep='\t')\
        [['accession', 'gtdb_taxonomy', 'gtdb_genome_representative']]\
        .apply(func, axis=1)\
        .set_index('accession')\
        .filter(items=taxa_ids, axis=0)

    #############################
    # read and filter embeddings
    #############################
    emb = None
    if args.emb is not None:
        logger.info('reading embeddings from %s' % args.emb)
        with h5py.File(args.emb, 'r') as f:
            emb = f['embedding'][:]
            emb_taxa = f['leaf_names'][:]
        logger.info('selecting embeddings for taxa found in %s' % args.accessions)
        emb = select_embeddings(taxa_ids, emb_taxa, emb)

    #############################
    # read and trim tree
    #############################
    logger.info('reading tree from %s' % args.tree)
    root = TreeNode.read(args.tree, format='newick')

    logger.info('transforming leaf names for shearing')
    for tip in root.tips():
        tip.name = tip.name[3:].replace(' ', '_')

    logger.info('shearing taxa not found in %s' % args.accessions)
    rep_ids = taxdf['gtdb_genome_representative'].values
    root = root.shear(rep_ids)

    logger.info('converting tree to Newick string')
    bytes_io = io.BytesIO()
    root.write(bytes_io, format='newick')
    tree_str = bytes_io.getvalue()
    tree = NewickString('tree', data=tree_str)

    # if no distance matrix was given, compute tip-to-tip distances from the tree
    if di_kwargs.get('distances') is None:
        from scipy.spatial.distance import squareform
        tt_dmat = root.tip_tip_distances()
        if (rep_ids != taxa_ids).any():
            tt_dmat = get_nonrep_matrix(taxa_ids, rep_ids, tt_dmat)
        dmat = tt_dmat.data
        di_kwargs['distances'] = CondensedDistanceMatrix('distances', data=dmat)

    h5path = args.out

    logger.info("reading %d Fasta files" % len(fapaths))
    logger.info("Total size: %d", sum(os.path.getsize(f) for f in fapaths))

    if args.vocab:
        if args.protein:
            SeqTable = SequenceTable
            seqit = AAVocabIterator(fapaths, logger=logger, min_seq_len=args.min_len)
        else:
            SeqTable = DNATable
            if args.cds:
                logger.info("reading and writing CDS sequences")
                seqit = DNAVocabGeneIterator(fapaths, logger=logger, min_seq_len=args.min_len)
            else:
                seqit = DNAVocabIterator(fapaths, logger=logger, min_seq_len=args.min_len)
    else:
        if args.protein:
            logger.info("reading and writing protein sequences")
            seqit = AASeqIterator(fapaths, logger=logger, max_degenerate=args.max_deg,
                                  min_seq_len=args.min_len)
            SeqTable = AATable
        else:
            logger.info("reading and writing DNA sequences")
            seqit = DNASeqIterator(fapaths, logger=logger, min_seq_len=args.min_len)
            SeqTable = DNATable

    seqit_bsize = 2**25
    if args.protein:
        seqit_bsize = 2**15
    elif args.cds:
        seqit_bsize = 2**18

    # set up DataChunkIterators
    packed = DataChunkIterator.from_iterable(iter(seqit), maxshape=(None,),
                                             buffer_size=seqit_bsize,
                                             dtype=np.dtype('uint8'))
    seqindex = DataChunkIterator.from_iterable(seqit.index_iter, maxshape=(None,),
                                               buffer_size=2**0, dtype=np.dtype('int'))
    names = DataChunkIterator.from_iterable(seqit.names_iter, maxshape=(None,),
                                            buffer_size=2**0, dtype=np.dtype('U'))
    ids = DataChunkIterator.from_iterable(seqit.id_iter, maxshape=(None,),
                                          buffer_size=2**0, dtype=np.dtype('int'))
    taxa = DataChunkIterator.from_iterable(seqit.taxon_iter, maxshape=(None,),
                                           buffer_size=2**0, dtype=np.dtype('uint16'))
    seqlens = DataChunkIterator.from_iterable(seqit.seqlens_iter, maxshape=(None,),
                                              buffer_size=2**0, dtype=np.dtype('uint32'))

    io = get_hdf5io(h5path, 'w')

    tt_args = ['taxa_table', 'a table for storing taxa data', taxa_ids]
    tt_kwargs = dict()
    for t in taxlevels[1:]:
        tt_args.append(taxdf[t].values)
    if emb is not None:
        tt_kwargs['embedding'] = emb
    tt_kwargs['rep_taxon_id'] = rep_ids

    taxa_table = TaxaTable(*tt_args, **tt_kwargs)

    seq_table = SeqTable(
        'seq_table',
        'a table storing sequences for computing sequence embedding',
        io.set_dataio(names, compression='gzip', chunks=(2**15,)),
        io.set_dataio(packed, compression='gzip', maxshape=(None,), chunks=(2**15,)),
        io.set_dataio(seqindex, compression='gzip', maxshape=(None,), chunks=(2**15,)),
        io.set_dataio(seqlens, compression='gzip', maxshape=(None,), chunks=(2**15,)),
        io.set_dataio(taxa, compression='gzip', maxshape=(None,), chunks=(2**15,)),
        taxon_table=taxa_table,
        id=io.set_dataio(ids, compression='gzip', maxshape=(None,), chunks=(2**15,)))

    difile = DeepIndexFile(seq_table, taxa_table, tree, **di_kwargs)

    io.write(difile, exhaust_dci=False)
    io.close()

    logger.info("reading %s" % h5path)
    h5size = os.path.getsize(h5path)
    logger.info("HDF5 size: %d", h5size)
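# A minimal read-back sketch for spot-checking the file written above. It is
# not part of the original script: it assumes get_hdf5io accepts mode 'r', that
# read() rebuilds the DeepIndexFile container, and that the container exposes
# seq_table/taxa_table attributes matching the constructor arguments used above.
def check_output(h5path):
    from hdmf.common import get_hdf5io
    io = get_hdf5io(h5path, 'r')
    try:
        difile = io.read()          # rebuild the DeepIndexFile container
        print(difile.seq_table)     # sequence table written by prepare_data
        print(difile.taxa_table)    # taxa table written by prepare_data
    finally:
        io.close()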
def test_dtype(self):
    a = np.arange(30, dtype='int32').reshape(5, 2, 3)
    aiter = iter(a)
    daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
    self.assertEqual(daiter.dtype, a.dtype)
def test_maxshape(self):
    a = np.arange(30).reshape(5, 2, 3)
    aiter = iter(a)
    daiter = DataChunkIterator.from_iterable(aiter, buffer_size=2)
    self.assertEqual(daiter.maxshape, (None, 2, 3))
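# A standalone sketch of the behavior the two tests above exercise: when built
# from a plain iterator, DataChunkIterator buffers the first chunk to infer the
# dtype and per-element shape, leaving the first axis of maxshape as None
# because the total length is unknown up front.
if __name__ == '__main__':
    import numpy as np
    from hdmf.data_utils import DataChunkIterator

    a = np.arange(30, dtype='int32').reshape(5, 2, 3)
    dci = DataChunkIterator.from_iterable(iter(a), buffer_size=2)
    print(dci.dtype)     # int32, read from the first buffered chunk
    print(dci.maxshape)  # (None, 2, 3): unbounded along the iteration axis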