def split_dataset(subreadset, out_prefix):
    """
    Take an input dataset and, for each external resource (BAM file), generate a
    separate dataset file while preserving all of the filters.
    Returns the list of generated dataset filenames.

    To create an example filtered dataset for testing:
        dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
        dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    split_fns = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        split_fns.append(out_fn)

    return split_fns
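
# Illustrative usage sketch (not part of the original module): writes the list returned
# by split_dataset() to a FOFN file, one dataset path per line, so a downstream task can
# consume it. The helper name 'write_split_fofn' and its signature are assumptions for
# demonstration only.
def write_split_fofn(subreadset, out_prefix, fofn_fn):
    split_fns = split_dataset(subreadset, out_prefix)
    with open(fofn_fn, 'w') as ofs:
        for fn in split_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} per-resource datasets into "{}"'.format(len(split_fns), fofn_fn))
    return fofn_fn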
def run(subreadset, fofn):
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    #ts = 50000   # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000   # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    chunks = nrecs // ts
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info(
        'Splitting with dset.split(zmws=False, chunks={}, ignoreSubDatasets=True, maxChunks={},)'
        .format(chunks, maxChunks))
    dset_chunks = dset.split(
        zmws=False,
        chunks=chunks,
        ignoreSubDatasets=True,
        maxChunks=maxChunks,
        updateCounts=False,
        #targetSize=1, breakContigs=True
    )

    chunk_fns = []
    for i, chunk_dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i)  # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        chunk_dset.updateCounts()
        chunk_dset.write(chunk_fn, validate=False)  # , relPaths=True
        chunk_fns.append(chunk_fn)

    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
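
# Minimal command-line sketch (an assumption, not the project's actual entry point):
# splits the given SubreadSet into chunked datasets in the current directory and records
# their paths in the given FOFN. Hypothetical invocation:
#   python split_subreads.py movie.subreadset.xml chunks.fofn
if __name__ == '__main__':
    import logging
    import sys
    logging.basicConfig(level=logging.INFO)
    run(subreadset=sys.argv[1], fofn=sys.argv[2])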