def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir, force_overwrite=False):
    """For every .bax.h5/.bas.h5 file listed in fofn_filename, produce a
    matching .fasta file under fasta_out_dir (via pls2fasta) and write the
    fasta paths to out_filename, which should usually be 'input.fasta.fofn'.

    Modified: 09/14/2015, both ends of subreads in fasta files will be
    trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    mkdir(fasta_out_dir)
    fasta_fns = []
    for h5_fn in get_files_from_file_or_fofn(fofn_filename):
        logging.debug("converting h5 file: {f}.".format(f=h5_fn))
        # Only bax/bas.h5 inputs are supported.
        if not h5_fn.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")
        # e.g. m111xxxx.1.bax.h5 ==> out_file = m11xxxx.1.bax.h5.fasta
        fasta_fn = op.join(fasta_out_dir, op.basename(h5_fn) + '.fasta')
        if op.exists(fasta_fn) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(fasta_fn))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(h5_fn)) + \
                  " {out} ".format(out=real_upath(fasta_fn)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        # Skipped (pre-existing) fasta files are still listed in the fofn.
        fasta_fns.append(fasta_fn)
    write_files_to_fofn(fasta_fns, out_filename)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir, force_overwrite=False):
    """For each .bax.h5 file, create .bax.h5.fasta file (with pls2fasta) and
    save fasta paths to out_filename, which should usually be 'input.fasta.fofn'.

    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.

    Raises ValueError if the fofn lists anything other than bax/bas.h5 files.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_file_or_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")
        # e.g. m111xxxx.1.bax.h5 ==>
        # out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            # Existing output is reused unless force_overwrite is set.
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                " {out} ".format(out=real_upath(out_file)) + \
                "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        # Appended outside the else-branch: skipped files also go in the fofn.
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5, ccs.bam or ccs FOFN.

    Parameters:
      in_fa -- input FASTA file name
      ccs_fofn -- ccs.h5, ccs.bam, or a FOFN of such files, supplying QVs
      out_fq -- output FASTQ file name

    Raises IOError if the QV source format is unsupported or a read's QVs
    cannot be located; ValueError if sequence and QV lengths disagree.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
        bas_handlers = {}  # bas file name -> opened BasH5Reader
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    with ContigSetReaderWrapper(in_fa) as reader, \
            FastqWriter(out_fq) as writer:
        for r in reader:
            logging.debug("Getting QVs for {name} ...".format(name=r.name))
            # The read id is the first whitespace-delimited token of the name.
            seqid = r.name.split(' ')[0]
            parsed_read_name = _Parsed_Read_Name(seqid)
            if fmt == FILE_FORMATS.H5:
                try:
                    bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                    # Open each bas file at most once, caching the handler.
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError("Could not read {s} from {f}.".format(
                        s=seqid, f=ccs_fofn))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    parsed_read_name=parsed_read_name,
                    qv_name="QualityValue")
            elif fmt == FILE_FORMATS.BAM:
                qvs = get_qvs_from_bam(reader=qver,
                                       parsed_read_name=parsed_read_name,
                                       qv_name="QualityValue")
            else:
                assert False  # unreachable: fmt validated above
            if len(r.sequence) != len(qvs):
                raise ValueError(
                    "Sequence and QVs of {r} should be the same!".format(
                        r=r.name))
            writer.writeRecord(r.name, r.sequence[:], qvs)

    # Close any QV resources opened above.
    if fmt == FILE_FORMATS.H5:
        # BUG FIX: .iteritems() is Python-2-only; .items() iterates
        # identically on Python 2 and also works on Python 3.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
    elif fmt == FILE_FORMATS.BAM:
        qver.close()
def validate_inputs(self):
    """Validate input fofns, and root_dir, log_dir, tmp_dir,
    create quivered_dir and quivered_log_dir.

    Logs and raises IOError with the last failed check's message if any
    validation fails.
    """
    self.add_log("Validating inputs.")

    # Create directories: root_dir/quivered and root_dir/log_dir/quivered
    try:
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)
    except OSError:
        # Multiple ice_quiver_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""
    # First chain of checks; only the first failure in the chain is recorded.
    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".\
            format(l=self.log_dir)
    elif self.bas_fofn is None:
        errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
    elif not nfs_exists(self.bas_fofn):
        errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(
            f=self.bas_fofn)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."

    # Second check group: a fasta_fofn is required unless subreads are BAM.
    # NOTE(review): a failure here overwrites any errMsg set above, so only
    # the last failed check is reported.
    if self.bas_fofn is not None and \
       guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
        # No need to convert subreads.bam to fasta
        if self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exists.".\
                format(f=self.fasta_fofn)
        # NOTE(review): this runs even when fasta_fofn is None or missing,
        # so get_files_from_file_or_fofn may fail before errMsg is raised
        # — confirm intended behavior.
        fasta_files = get_files_from_file_or_fofn(self.fasta_fofn)
        for fasta_file in fasta_files:
            if not nfs_exists(fasta_file):
                errMsg = "A file {f} in fasta_fofn does not exist.".\
                    format(f=fasta_file)

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5, ccs.bam or ccs FOFN.

    Parameters:
      in_fa -- input FASTA file name
      ccs_fofn -- ccs.h5, ccs.bam, or a FOFN of such files, supplying QVs
      out_fq -- output FASTQ file name

    Raises IOError if the QV source format is unsupported or a read's QVs
    cannot be located; ValueError if sequence and QV lengths disagree.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
        bas_handlers = {}  # bas file name -> opened BasH5Reader
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    with ContigSetReaderWrapper(in_fa) as reader, \
            FastqWriter(out_fq) as writer:
        for r in reader:
            logging.debug("Getting QVs for {name} ...".format(name=r.name))
            # The read id is the first whitespace-delimited token of the name.
            seqid = r.name.split(' ')[0]
            parsed_read_name = _Parsed_Read_Name(seqid)
            if fmt == FILE_FORMATS.H5:
                try:
                    bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                    # Open each bas file at most once, caching the handler.
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError("Could not read {s} from {f}.".
                                  format(s=seqid, f=ccs_fofn))
                qvs = get_qv_from_bas_handler(bas_handler=bas_handlers[bas_file],
                                              parsed_read_name=parsed_read_name,
                                              qv_name="QualityValue")
            elif fmt == FILE_FORMATS.BAM:
                qvs = get_qvs_from_bam(reader=qver,
                                       parsed_read_name=parsed_read_name,
                                       qv_name="QualityValue")
            else:
                assert False  # unreachable: fmt validated above
            if len(r.sequence) != len(qvs):
                raise ValueError("Sequence and QVs of {r} should be the same!".
                                 format(r=r.name))
            writer.writeRecord(r.name, r.sequence[:], qvs)

    # Close any QV resources opened above.
    if fmt == FILE_FORMATS.H5:
        # BUG FIX: .iteritems() is Python-2-only; .items() iterates
        # identically on Python 2 and also works on Python 3.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
    elif fmt == FILE_FORMATS.BAM:
        qver.close()
def validate_inputs(self):
    """Validate input fofns, and root_dir, log_dir, tmp_dir,
    create quivered_dir and quivered_log_dir.

    Logs and raises IOError with the last failed check's message if any
    validation fails.
    """
    self.add_log("Validating inputs.")

    # Create directories: root_dir/quivered and root_dir/log_dir/quivered
    try:
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)
    except OSError:
        # Multiple ice_quiver_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""
    # First chain of checks; only the first failure in the chain is recorded.
    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".\
            format(l=self.log_dir)
    elif self.bas_fofn is None:
        errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
    elif not nfs_exists(self.bas_fofn):
        errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(f=self.bas_fofn)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."

    # Second check group: a fasta_fofn is required unless subreads are BAM.
    # NOTE(review): a failure here overwrites any errMsg set above, so only
    # the last failed check is reported.
    if self.bas_fofn is not None and \
       guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
        # No need to convert subreads.bam to fasta
        if self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exists.".\
                format(f=self.fasta_fofn)
        # NOTE(review): this runs even when fasta_fofn is None or missing,
        # so get_files_from_file_or_fofn may fail before errMsg is raised
        # — confirm intended behavior.
        fasta_files = get_files_from_file_or_fofn(self.fasta_fofn)
        for fasta_file in fasta_files:
            if not nfs_exists(fasta_file):
                errMsg = "A file {f} in fasta_fofn does not exist.".\
                    format(f=fasta_file)

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def index_input_subreads(self):
    """Index input subreads in self.fasta_fofn or self.bas_fofn.

    Returns a BamCollection when bas_fofn holds BAM input, otherwise a
    MetaSubreadFastaReader over the files listed in fasta_fofn.
    """
    if guess_file_format(self.bas_fofn) == FILE_FORMATS.BAM:
        # BAM subreads: index directly from bas_fofn.
        self.add_log("Indexing files in %s, please wait." % self.bas_fofn)
        indexed = BamCollection(self.bas_fofn)
    else:
        # Non-BAM subreads: index the fasta files listed in fasta_fofn.
        self.add_log("Indexing files in %s, please wait." % self.fasta_fofn)
        indexed = MetaSubreadFastaReader(
            get_files_from_file_or_fofn(self.fasta_fofn))
    self.add_log("File indexing done.")
    return indexed
def index_input_subreads(self):
    """Index input subreads in self.fasta_fofn or self.bas_fofn.

    Returns a BamCollection when bas_fofn holds BAM input, otherwise a
    MetaSubreadFastaReader over the files listed in fasta_fofn.
    """
    if guess_file_format(self.bas_fofn) == FILE_FORMATS.BAM:
        # BAM subreads: index directly from bas_fofn.
        msg = "Indexing files in %s, please wait." % self.bas_fofn
        self.add_log(msg)
        d = BamCollection(self.bas_fofn)
    else:
        # Non-BAM subreads: index the fasta files listed in fasta_fofn.
        msg = "Indexing files in %s, please wait." % self.fasta_fofn
        self.add_log(msg)
        d = MetaSubreadFastaReader(get_files_from_file_or_fofn(self.fasta_fofn))
    self.add_log("File indexing done.")
    return d
def __init__(self, *args):
    """Open a dataset over the given BAM files.

    A single argument is treated as a BAM file or a FOFN of BAM files and
    expanded; multiple arguments are passed straight to openDataFile.

    Raises ValueError if any BAM lacks a pbi index, and IOError if any
    BAM's read group is not PacBio.
    """
    if len(args) == 1:
        args = get_files_from_file_or_fofn(args[0])
    self._dataset = openDataFile(*args)
    # Implementation notes: find all the bam files, and group
    # them together by movieName
    self._header = BamHeader(ignore_pg=True)

    for bam in self._dataset.resourceReaders():
        if not isinstance(bam, IndexedBamReader):
            # BUG FIX: the message and args were passed logging-style to
            # the exception constructor and never interpolated; use %.
            raise ValueError("%s in %s must have pbi index generated" %
                             (bam.filename, str(self._dataset)))
        self._header.add(bam.peer.header)
        for rg in bam.peer.header["RG"]:  # readGroupTable
            if rg['PL'] != "PACBIO":
                # BUG FIX: same logging-style-args defect as above.
                raise IOError("Input BAM file %s for %s must be PacBio BAM." %
                              (bam.filename, self.__class__.__name__))
        for rg in bam.readGroupTable:
            assert rg.ReadType in ["CCS", "SUBREAD"]
def __init__(self, *args):
    """Open a dataset over the given BAM files.

    A single argument is treated as a BAM file or a FOFN of BAM files and
    expanded; multiple arguments are passed straight to openDataFile.

    Raises ValueError if any BAM lacks a pbi index, and IOError if any
    BAM's read group is not PacBio.
    """
    if len(args) == 1:
        args = get_files_from_file_or_fofn(args[0])
    self._dataset = openDataFile(*args)
    # Implementation notes: find all the bam files, and group
    # them together by movieName
    self._header = BamHeader(ignore_pg=True)

    for bam in self._dataset.resourceReaders():
        if not isinstance(bam, IndexedBamReader):
            # BUG FIX: the message and args were passed logging-style to
            # the exception constructor and never interpolated; use %.
            raise ValueError("%s in %s must have pbi index generated" %
                             (bam.filename, str(self._dataset)))
        self._header.add(bam.peer.header)
        for rg in bam.peer.header["RG"]:  # readGroupTable
            if rg['PL'] != "PACBIO":
                # BUG FIX: same logging-style-args defect as above.
                raise IOError("Input BAM file %s for %s must be PacBio BAM." %
                              (bam.filename, self.__class__.__name__))
        for rg in bam.readGroupTable:
            assert rg.ReadType in ["CCS", "SUBREAD"]