def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5 or ccs FOFN.

    Parameters:
      in_fa -- input FASTA file. The part of each record name before the
               first space must consist of three '/'-separated fields,
               the second of which is an integer.
      ccs_fofn -- a path ending in ".h5" (treated as a single ccs.h5
                  file), any other path (treated as a FOFN listing
                  ccs.h5 files), or a falsy value when no quality
                  values are available.
      out_fq -- output FASTQ file.

    When ccs_fofn is falsy, every base is assigned a QV of 60.

    Raises:
      ValueError -- a read name does not have the expected form, or the
                    number of QVs differs from the sequence length.
      IOError -- a read cannot be found in the registered ccs.h5 files.
    """
    qver = basQVcacher()
    if ccs_fofn:
        if ccs_fofn.endswith(".h5"):
            # Input is a ccs.h5 file not a FOFN.
            qver.add_bash5(ccs_fofn)
        else:
            # Input is a ccs FOFN containing multiple ccs.h5 files.
            for ccs_fn in get_files_from_fofn(ccs_fofn):
                qver.add_bash5(ccs_fn)

    # bas.h5 path --> open BasH5Reader, so each file is opened only once.
    bas_handlers = {}
    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError("{seqid} is not a valid CCS read".
                                     format(seqid=seqid))
                if ccs_fofn:
                    try:
                        bas_file = qver.bas_files[movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from input ccs fofn.".
                                      format(s=seqid))
                    logging.debug("Getting QVs for {name} ...".format(name=r.name))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        hn=hn, s_e=s_e, qv_name="QualityValue")
                else:
                    # No quality values provided to pbtranscript.py cluster.
                    # No information given, have strong belief in the base
                    # calls.
                    qvs = [60] * len(r.sequence)

                if len(r.sequence) != len(qvs):
                    raise ValueError("Sequence and QVs of {r} should be the same!".
                                     format(r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Close every opened handle even when a read fails validation
        # part-way through, so no bas.h5 file is left open.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5 or ccs FOFN.

    Parameters:
      in_fa -- input FASTA file. The part of each record name before the
               first space must consist of three '/'-separated fields,
               the second of which is an integer.
      ccs_fofn -- a path ending in ".h5" (treated as a single ccs.h5
                  file) or any other path (treated as a FOFN listing
                  ccs.h5 files).
      out_fq -- output FASTQ file.

    Raises:
      ValueError -- a read name does not have the expected form, or the
                    number of QVs differs from the sequence length.
      IOError -- a read cannot be found in the registered ccs.h5 files.
    """
    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):
        # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:
        # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    # bas.h5 path --> open BasH5Reader, so each file is opened only once.
    bas_handlers = {}
    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError(
                        "{seqid} is not a valid CCS read".format(seqid=seqid))
                try:
                    bas_file = qver.bas_files[movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError(
                        "Could not read {s} from input ccs fofn.".format(s=seqid))
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    hn=hn, s_e=s_e, qv_name="QualityValue")
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Close every opened handle even when a read fails validation
        # part-way through, so no bas.h5 file is left open.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    Turn every bax/bas.h5 file listed in fofn_filename into a FASTA file
    under fasta_out_dir, then write the FASTA paths to out_filename
    (usually 'input.fasta.fofn').

    An existing FASTA is reused unless force_overwrite is set.
    Raises ValueError when the fofn lists a file that is not bax/bas.h5,
    and RuntimeError when the pls2fasta command exits with non-zero code.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    h5_paths = get_files_from_fofn(fofn_filename)
    fasta_paths = []
    mkdir(fasta_out_dir)

    for h5_path in h5_paths:
        logging.debug("converting h5 file: {f}.".format(f=h5_path))
        if not h5_path.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      scratch = m11xxxx.1.bax.h5.fasta.tmp
        #      fasta   = m11xxxx.1.bax.h5.fasta
        fasta_path = op.join(fasta_out_dir, op.basename(h5_path) + '.fasta')
        scratch_path = fasta_path + '.tmp'

        if force_overwrite or not op.exists(fasta_path):
            cmd = "pls2fasta {0} {1} ".format(h5_path, scratch_path) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("CMD: {cmd}".format(cmd=cmd))
            _stdout, exit_code, err_msg = backticks(cmd)
            if exit_code != 0:
                raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) +
                                   err_msg)
            trim_subread_flanks(scratch_path, fasta_path)
        else:
            logging.debug(
                "File {0} already exists. skipping.".format(fasta_path))

        fasta_paths.append(fasta_path)
        # The scratch file is only an intermediate; drop it if present.
        if op.exists(scratch_path):
            os.remove(scratch_path)

    write_files_to_fofn(fasta_paths, out_filename)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    Create a .fasta file in fasta_out_dir for each .bax.h5/.bas.h5 file
    named in fofn_filename and save the resulting paths to out_filename,
    which should usually be 'input.fasta.fofn'.

    FASTA files that already exist are kept as they are unless
    force_overwrite is true. A non bax/bas.h5 entry raises ValueError;
    a failing pls2fasta command raises RuntimeError.
    """
    accepted_suffixes = ('.bax.h5', '.bas.h5')

    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    sources = get_files_from_fofn(fofn_filename)
    converted = []
    mkdir(fasta_out_dir)

    for source in sources:
        logging.debug("converting h5 file: {f}.".format(f=source))
        if not source.endswith(accepted_suffixes):
            complaint = "fofn file {fofn} ".format(fofn=fofn_filename) + \
                        "should only contain bax/bas.h5 files."
            raise ValueError(complaint)

        # e.g. m111xxxx.1.bax.h5 ==>
        #      partial = m11xxxx.1.bax.h5.fasta.tmp
        #      target  = m11xxxx.1.bax.h5.fasta
        stem = op.basename(source)
        partial = op.join(fasta_out_dir, stem + '.fasta.tmp')
        target = op.join(fasta_out_dir, stem + '.fasta')

        reuse_existing = op.exists(target) and not force_overwrite
        if reuse_existing:
            logging.debug("File {0} already exists. skipping.".format(target))
        else:
            cmd = "pls2fasta {0} {1} ".format(source, partial) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("CMD: {cmd}".format(cmd=cmd))
            _output, status, message = backticks(cmd)
            if status != 0:
                raise RuntimeError(
                    "CMD failed: {cmd}\n".format(cmd=cmd) + message)
            trim_subread_flanks(partial, target)

        converted.append(target)
        # Remove the intermediate pls2fasta output when one is present.
        if op.exists(partial):
            os.remove(partial)

    write_files_to_fofn(converted, out_filename)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    Conversions are put on a shared queue and handed to at most `cpus`
    processes running convert_fofn_to_fasta_worker. Each process is given
    1200 seconds to finish and is terminated afterwards. Any expected
    FASTA file still missing after that is converted in this process.

    Parameters:
      fofn_filename -- FOFN listing bax.h5/bas.h5 files.
      out_filename -- FOFN to write the FASTA paths to.
      fasta_out_dir -- directory receiving the FASTA files.
      force_overwrite -- re-create FASTA files that already exist.
      cpus -- maximum number of worker processes.

    Raises:
      ValueError -- the FOFN lists a file that is not bax.h5/bas.h5.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {}  # expected out file --> (cmd, tmp)
    out_fns = []

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file     = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')

        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            # Leftover stderr debug prints are routed through logging,
            # like every other message in this function.
            logging.debug("putting in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)
            logging.debug("put in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))

    # cap max CPU if there's fewer files to convert
    cpus = min(cpus, in_queue_count)
    pool = [Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
            for _ in range(cpus)]

    # starting & joining pool workers
    for p in pool:
        p.start()
    for p in pool:
        p.join(timeout=1200)
        if p.is_alive():
            p.terminate()

    # check that all files exists
    # if it does not, force to run locally
    # Previously every conversion was re-queued and re-run here regardless
    # of whether the workers had already produced the file, so each file
    # was converted twice.
    # NOTE(review): an existence check cannot tell a complete file from a
    # stale one (force_overwrite) or a partially written one (terminated
    # worker) -- confirm whether a stronger check is needed.
    # NOTE(review): the local call presumably consumes the shared queue;
    # verify against convert_fofn_to_fasta_worker.
    for out_file, (cmd, tmp_out_file) in outfile_track.items():
        if not op.exists(out_file):
            in_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(in_queue)
        out_fns.append(out_file)

    write_files_to_fofn(out_fns, out_filename)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    Produce a .fasta file in fasta_out_dir for every .bax.h5/.bas.h5 file
    listed in fofn_filename and write the collected paths to out_filename
    (usually 'input.fasta.fofn').

    FASTA files that already exist (and are not being overwritten) are
    recorded directly. All other conversions are queued for `cpus`
    processes running convert_fofn_to_fasta_worker, which receive the
    queue and the shared output list.
    Raises ValueError when the fofn lists a file that is not bax/bas.h5.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    h5_files = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # Shared state for the worker processes.
    manager = Manager()
    out_fns = manager.list()
    in_queue = manager.Queue(99999)
    workers = [Process(target=convert_fofn_to_fasta_worker,
                       args=(in_queue, out_fns))
               for _ in xrange(cpus)]

    for h5_file in h5_files:
        logging.debug("converting h5 file: {f}.".format(f=h5_file))
        if not h5_file.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file     = m11xxxx.1.bax.h5.fasta
        base = op.basename(h5_file)
        tmp_out_file = op.join(fasta_out_dir, base + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, base + '.fasta')

        if force_overwrite or not op.exists(out_file):
            resolved_in = real_upath(h5_file)
            resolved_tmp = real_upath(tmp_out_file)
            cmd = "pls2fasta {in_fn} ".format(in_fn=resolved_in) + \
                  " {out} ".format(out=resolved_tmp) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            job = (cmd, tmp_out_file, out_file)
            print >> sys.stderr, "DEBUG: putting in queue:", cmd, tmp_out_file, out_file
            in_queue.put(job)
            print >> sys.stderr, "DEBUG: put in queue:", cmd, tmp_out_file, out_file
        else:
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)

    # Start every worker first, then wait for all of them to finish.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    write_files_to_fofn(out_fns, out_filename)