def __check_input(opts, args, parser):
    """
    Make sure the input is in the form of either a cmp.h5 file of aligned
    reads or a FOFN of unaligned bas.h5 files. Also make sure that a
    reference fasta file is specified if the input is a cmp.h5 file.
    """
    arg = args[0]
    h5_files = []
    opts.h5_labels = {}
    if arg[-6:] == "cmp.h5":
        print "Found cmp.h5 of aligned reads:"
        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {}
        opts.cmph5_contig_lens[arg] = {}
        h5_files.append(arg)
        print " -- %s" % arg
        print "Getting contig information from %s..." % arg
        reader = CmpH5Reader(arg)
        for entry in reader.referenceInfoTable:
            name = entry[3]
            length = entry[4]
            slug_name = mbin.slugify(name)
            opts.cmph5_contig_lens[arg][slug_name] = length
        opts.h5_labels[arg] = "remove"
        reader.close()
    elif arg[-6:] == "bas.h5":
        print "Found bas.h5 of unaligned reads:"
        opts.h5_type = "bas"
        h5_files.append(arg)
        opts.h5_labels[arg] = "remove"
        print " -- %s" % arg
    elif arg[-5:] == ".fofn":
        print "Found FOFN of bas.h5 files:"
        opts.h5_type = "bas"
        # Read one bas.h5 path per line, skipping blank lines
        fns = [line.strip() for line in open(arg, "r") if line.strip()]
        h5_files = fns
        for fn in fns:
            print " -- %s" % fn
            opts.h5_labels[fn] = "remove"
    else:
        parser.error("Input file must be a cmp.h5, bas.h5, or FOFN of bas.h5 files.")
    if opts.h5_type == "bas":
        print "*************************************************************"
        print "* Motif filtering using unaligned reads is not recommended. *"
        print "* Aligned reads work much better for this!                  *"
        print "*************************************************************"
        print ""
    if opts.h5_type == "bas" and opts.cross_cov_bins is not None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!")
    return h5_files
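# A FOFN ("file of file names") input is a plain-text file listing one
# bas.h5 path per line, e.g. (hypothetical paths):
#
#   /path/to/movie1.bas.h5
#   /path/to/movie2.bas.h5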
def workerProcess(inQueue, refFile, quiverConfig):
    # Note: args, resultQueue, counter, lseq and rseq are presumably
    # module-level globals inherited by the forked worker processes
    with CmpH5Reader(os.path.join(args.jobDir, "data/aligned_reads.cmp.h5")) as d:
        referenceSeq = IndexedFastaReader(refFile)[0].sequence
        while True:
            queue_item = inQueue.get()
            if queue_item == "die":
                return
            else:
                movieID, holeNumber, rcrefstrand = queue_item
            alns = d[((d.MovieID == movieID) &
                      (d.HoleNumber == holeNumber) &
                      (d.RCRefStrand == rcrefstrand))]
            cssName = ("/".join(alns[0].readName.split("/")[:-1]) +
                       "/" + str(alns[0].RCRefStrand) + "/ssc")
            if args.ignore_barcodes:
                cssObj = ConsensusSequence(cssName, "", "", len(alns), 0, 0, "all_reads")
            else:
                cssObj = ConsensusSequence(cssName, "", "", len(alns), 0, 0, alns[0].barcodeName)
            if not cssObj.numPasses >= args.minCoverage:
                cssObj.minNumPassesFail = True
            if not checkMapping(alns):
                cssObj.mappingFail = True
            if not cssObj.minNumPassesFail and not cssObj.mappingFail:
                refId = alns[0].referenceId
                # The consensus window spans the hull of all alignment intervals
                v = hullMany([(a.tStart, a.tEnd) for a in alns])
                window = (refId, v[0], v[1])
                windowLen = v[1] - v[0]
                refSeqInWindow = referenceSeq[v[0]:v[1]]
                css = consensusForAlignments(window, refSeqInWindow, alns, quiverConfig)
                cssObj.seq = css.sequence
                cssObj.qual = css.confidence
                # Only subreads spanning >80% of the window count toward coverage
                cssObj.coverage = sum((a.referenceSpan > 0.8 * windowLen) for a in alns)
                if not cssObj.coverage >= args.minCoverage:
                    cssObj.minCoverageFail = True
                cssObj.predictedAccuracy = estimateAccuracy(css.confidence)
                if not cssObj.predictedAccuracy >= args.minAvgConfidence:
                    cssObj.minAvgConfidenceFail = True
                if args.trim:
                    cssObj = trim(cssObj, lseq, rseq)
                if args.clip:
                    cssObj = clip(cssObj, args.clip)
                try:
                    cssObj.minConfidence = 1 - unphred(np.amin(np.array(cssObj.qual, dtype=float)))
                except ValueError:
                    # when cssObj.qual is zero-length
                    cssObj.minConfidence = 0
                if not cssObj.minConfidence >= args.minConfidence:
                    cssObj.minConfidenceFail = True
            resultQueue.put(cssObj)
            counter.increment()
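# Hedged sketches: hullMany() and unphred() are called above but not defined
# in this excerpt. From their call sites, plausible minimal implementations
# would be (assumptions, not the canonical versions):

def hullMany(intervals):
    """Return the hull (min start, max end) covering all (start, end) tuples."""
    starts = [s for s, e in intervals]
    ends = [e for s, e in intervals]
    return (min(starts), max(ends))

def unphred(qv):
    """Convert a phred-scaled QV to an error probability."""
    return 10 ** (-qv / 10.0)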
def scan_WGA_h5(self):
    """
    Get some necessary information about the WGA cmp.h5 being used
    to generate the control IPD data.
    """
    self.opts.h5_labels = {}
    self.opts.cmph5_contig_lens = {}
    self.opts.h5_labels[self.control_h5] = "control"
    self.opts.cmph5_contig_lens[self.control_h5] = {}
    reader = CmpH5Reader(self.control_h5)
    for entry in reader.referenceInfoTable:
        name = entry[3]
        length = entry[4]
        slug_name = mbin.slugify(name)
        self.opts.cmph5_contig_lens[self.control_h5][slug_name] = length
    reader.close()
    return self.opts
def get_fps(self, align_fn):
    """
    For *.cmp.h5 files, the frame rate (fps) is included in each alignment.
    For *.bam files, the frame rate is encoded in the file header (FRAMERATEHZ).
    """
    if self.opts.aln_ftype == "cmp":
        # Read frame rate directly from a cmp.h5 alignment
        reader = CmpH5Reader(align_fn)
        alignment = reader[0]
        fps = alignment.movieInfo[2]
    elif self.opts.aln_ftype == "bam":
        # Isolate description (DS) from read group (RG) in the BAM header
        bam = pysam.AlignmentFile(align_fn, "rb")
        h = bam.header
        rg_ds_l = h.as_dict()["RG"][0]["DS"].split(";")
        rg_ds_d = dict([(x.split("=")[0], x.split("=")[1]) for x in rg_ds_l])
        fps = float(rg_ds_d["FRAMERATEHZ"])
    return fps
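# Example usage (hypothetical file name; assumes self.opts.aln_ftype has been
# set to match the input type):
#
#   fps = self.get_fps("aligned_reads.bam")
#   ipd_seconds = ipd_frames / fps  # convert raw frame counts to seconds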
def __check_input(opts, args, parser):
    """
    Make sure the input is in the form of either a cmp.h5 file of aligned
    reads or a FOFN of unaligned bas.h5 files. Also make sure that a
    reference fasta file is specified if the input is a cmp.h5 file.
    """
    if len(args) != 2:
        parser.error("Expecting two arguments: "
                     "(1) input hdf5 file (cmp.h5, bas.h5, or FOFN of bas.h5 files) "
                     "(2) file containing the motifs to analyze, separated by newlines, "
                     "e.g. GATC-1, CATG-1, CAACGA-2")
    seq_input = args[0]
    motifs_fn = args[1]
    h5_files = []
    opts.h5_labels = {}
    if seq_input[-6:] == "cmp.h5":
        print "Found cmp.h5 of aligned reads:"
        h5 = os.path.abspath(seq_input)
        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {}
        opts.cmph5_contig_lens[h5] = {}
        h5_files.append(h5)
        print " -- %s" % h5
        print "Getting contig information from %s..." % h5
        reader = CmpH5Reader(h5)
        for entry in reader.referenceInfoTable:
            name = entry[3]
            length = entry[4]
            slug_name = mbin.slugify(name)
            opts.cmph5_contig_lens[h5][slug_name] = length
        opts.h5_labels[h5] = "remove"
        reader.close()
    elif seq_input[-6:] == "bas.h5":
        print "Found bas.h5 of unaligned reads:"
        opts.h5_type = "bas"
        h5 = os.path.abspath(seq_input)
        h5_files.append(h5)
        opts.h5_labels[h5] = "remove"
        print " -- %s" % h5
    elif seq_input[-5:] == ".fofn":
        print "Found FOFN of bas.h5 files of unaligned reads:"
        opts.h5_type = "bas"
        fofn_content = open(seq_input, "r").read().strip()
        # Resolve each listed path before storing it, skipping blank lines
        h5_files = [os.path.abspath(fn) for fn in fofn_content.split("\n") if fn.strip()]
        for h5 in h5_files:
            print " -- %s" % h5
            opts.h5_labels[h5] = "remove"
    else:
        parser.error("Input file must be a cmp.h5, bas.h5, or FOFN of bas.h5 files.")
    if opts.h5_type == "bas" and opts.cross_cov_bins is not None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!")
    if opts.h5_type == "cmp":
        try:
            # Touch each record to make sure the fasta parses cleanly
            for entry in SeqIO.parse(opts.contigs, "fasta"):
                x = entry.seq
                y = entry.id
        except Exception:
            parser.error(
                "Please make sure the --contigs input is a valid fasta file.")
    if not os.path.exists(motifs_fn):
        parser.error(
            "Can't find file of motifs to include in methylation profile: %s" % motifs_fn)
    return h5_files, motifs_fn
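# The motifs file (second argument) lists one motif per line in the form
# <MOTIF>-<index>, where the index is interpreted here as the position of
# the methylated base within the motif, matching the usage message above:
#
#   GATC-1
#   CATG-1
#   CAACGA-2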
taskQueue = multiprocessing.Queue()
resultQueue = multiprocessing.Queue()
counter = Counter()
startTime = time()
quiverErrorModel = loadQuiverConfig("P6-C4.AllQVsMergingByChannelModel")

if args.trim:
    dupleSeq = args.trim.upper()
    if not all([x in ("A", "C", "G", "T", ",") for x in dupleSeq]) or dupleSeq.count(",") != 1:
        sys.exit("-t requires a duple of sequences separated by a comma. (eg. ACTAGGA,CTACGAG)")
    # Split the uppercased duple on the comma so both halves share the same case
    lseq, rseq = dupleSeq.split(",")

# Get the list of reads to extract, and populate the task queue
printmessage("Importing data files", ontop=False)
with CmpH5Reader(os.path.join(args.jobDir, "data/aligned_reads.cmp.h5")) as c:
    c.attach(os.path.join(args.jobDir, "input.fofn"))
    readSet = set(zip(c.MovieID, c.HoleNumber, c.RCRefStrand))
totalNumber = len(readSet)
for i in readSet:
    taskQueue.put(i)
for i in range(args.cpus):
    # Poison pill at end of queue: one per worker process
    taskQueue.put("die")

# Start the worker processes
processList = [multiprocessing.Process(target=workerProcess,
                                       args=(taskQueue, args.reference, quiverErrorModel))
               for i in range(args.cpus)]
for i in processList:
    i.start()
printmessage("%d processes started" % args.cpus, ontop=False)
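# Hedged sketch: Counter is not defined in this excerpt. Because
# workerProcess calls counter.increment() from several processes, it is
# presumably a process-safe counter backed by shared memory, along these
# lines (would need to be defined before the script body above runs):

class Counter(object):
    """Process-safe counter (assumed implementation)."""
    def __init__(self):
        self.val = multiprocessing.Value("i", 0)

    def increment(self):
        with self.val.get_lock():
            self.val.value += 1

    def value(self):
        return self.val.value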
def launch_subprocs(self, h5_file, N_reads, opts):
    """
    Partition the reads/alignments in h5_file into chunks and hand them to
    a pool of consumer processes for parallel motif processing.
    """
    logging.debug("Creating tasks...")
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    logging.debug("Done.")
    if opts.h5_type == "cmp":
        reader = CmpH5Reader(h5_file)
        to_check = reader
        entries = range(len(to_check))
    elif opts.h5_type == "bas":
        reader = BasH5Reader(h5_file)
        if opts.bas_whitelist is not None and not opts.control_run:
            logging.info("Intersecting with whitelist...")
            bas_whitelist = set(np.loadtxt(opts.bas_whitelist, dtype="str"))
            to_check = [z for z in reader if z.zmwName in bas_whitelist]
        else:
            to_check = reader
        # Filter on zmw metrics
        pre = len(to_check)
        logging.info("Starting with %s reads..." % pre)
        to_check = [z for z in to_check if z.zmwMetric("Productivity") == 1 and
                    z.zmwMetric("ReadScore") > opts.minReadScore and
                    z.zmwMetric("Pausiness") < opts.maxPausiness]
        post = len(to_check)
        logging.info("Dropped %s reads due to poor zmw metrics (%s remain)" %
                     ((pre - post), post))
        # Filter on read length
        pre = post
        to_check = [z for z in to_check
                    if np.sum([len(sub) for sub in z.subreads]) >= opts.readlength_min]
        post = len(to_check)
        logging.info("Dropped %s reads < %s (%s remain)" %
                     ((pre - post), opts.readlength_min, post))
        entries = np.array([z.holeNumber for z in to_check])
    reader.close()

    if len(entries) <= opts.procs * 5:
        procs = 1
    else:
        procs = opts.procs

    logging.debug("Starting consumers...")
    consumers = [multiproc.Consumer(tasks, results) for i in xrange(procs)]
    for w in consumers:
        w.start()
    logging.debug("Done.")

    num_jobs = procs
    N_target_reads = {}
    reads_left = N_reads
    procs_left = procs
    for job in range(num_jobs):
        N_target_reads[job] = int(math.ceil(float(reads_left) / procs_left))
        reads_left -= N_target_reads[job]
        procs_left -= 1

    logging.debug("Partitioning %s into %s chunks for analysis..." % (h5_file, num_jobs))
    # Use true division before rounding up so the remainder entries are not
    # silently truncated away by integer division
    chunksize = int(math.ceil(float(len(entries)) / procs))
    logging.info("Querying %s reads using %s chunks of size %s..." %
                 (len(to_check), procs, chunksize))
    entries_chunks = list(self.chunks(entries, chunksize))
    for chunk_id in range(procs):
        n = N_target_reads[chunk_id]
        # If --N_reads used, this ensures we touch as many contigs as possible
        idx = entries_chunks[chunk_id]
        np.random.shuffle(idx)
        if opts.h5_type == "cmp":
            tasks.put(cmph5_read.subread_motif_processor(
                h5_file, chunk_id, idx, n, opts.motifs, opts.bi_motifs, opts))
            logging.debug("...%s (%s alignments)" % (chunk_id, len(entries)))
        elif opts.h5_type == "bas":
            tasks.put(baxh5_read.subread_motif_processor(
                h5_file, chunk_id, idx, n, opts.motifs, opts.bi_motifs, opts))
            logging.debug("...%s (%s reads)" % (chunk_id, len(entries)))
    logging.debug("Done")

    # Add a poison pill for each consumer
    for i in xrange(num_jobs):
        tasks.put(None)

    # Wait for all of the tasks to finish
    tasks.join()

    # Start collecting results
    logging.info("Combining results data from all chunks...")
    parallel_results = []
    while num_jobs:
        result = results.get()
        parallel_results.append(result)
        num_jobs -= 1
        logging.info("...%s/%s" % ((procs - num_jobs), procs))
    logging.info("Done.")
    return parallel_results
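# Hedged sketch: the chunks() helper called above is not shown in this
# excerpt; the call site list(self.chunks(entries, chunksize)) suggests a
# standard slicing generator along these lines (method form assumed):

def chunks(self, l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i + n]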
def __call__(self):

    class ipd_entry:
        def __init__(self, tup):
            """
            Container for the IPD value observed at one reference position.
            """
            self.ref_base = tup[0]
            self.ipd = tup[1]
            # self.call      = tup[2]
            # self.read_base = tup[3]
            self.ref_pos = tup[2]

    class subread:
        def __init__(self, cmph5, alignment, label, opts):
            leftAnchor = 1
            rightAnchor = 1
            self.entries = {}
            self.opts = opts
            self.subname = alignment.readName
            movieID = alignment.movieInfo[0]
            alignedLength = alignment.referenceSpan
            fps = alignment.movieInfo[2]
            self.refName = alignment.referenceInfo[3]
            zmw = alignment.HoleNumber
            self.mol = alignment.MoleculeID
            if alignment.isForwardStrand:
                self.strand = 0
            else:
                self.strand = 1
            self.ref_bases = alignment.reference()
            # self.read_bases = alignment.read()
            read_calls = alignment.transcript()
            ref_pos = list(alignment.referencePositions())
            IPD = list(alignment.IPD())
            self.label = self.opts.h5_labels[cmph5]
            error_mk = []
            for read_call in read_calls:
                # Go through all entries and flag which positions are MM/indels
                if read_call != "M":
                    # Mismatch or indel at this position!
                    error_mk.append(1)
                else:
                    error_mk.append(0)
            # Get the indices of all the non-matches
            error_idx = [i for (i, val) in enumerate(error_mk) if val == 1]
            for error_id in error_idx:
                try:
                    for j in range(leftAnchor):
                        error_mk[error_id - (j + 1)] = 1
                    for j in range(rightAnchor):
                        error_mk[error_id + (j + 1)] = 1
                except IndexError:
                    pass
            error_mk = np.array(error_mk)
            # Convert IPDs from frame counts to seconds using the frame rate
            ipds = np.array(IPD) / fps
            strands = np.array([self.strand] * len(read_calls))
            self.ref_bases = np.array(list(self.ref_bases))
            # self.read_bases = np.array(list(self.read_bases))
            self.ref_pos = np.array(ref_pos)
            read_calls = np.array(list(read_calls))
            # Mark the error positions, but leave them in the sequence so
            # we can pull out intact motifs from contiguous correct bases
            self.ref_bases[error_mk == 1] = "*"
            # self.read_bases[error_mk==1] = "*"
            read_calls[error_mk == 1] = "*"
            ipds[error_mk == 1] = -9
            strands[error_mk == 1] = -9
            # Attach these IPD entries to the subread object
            # for i, tup in enumerate(zip(self.ref_bases, ipds, read_calls, self.read_bases, self.ref_pos)):
            for i, tup in enumerate(zip(self.ref_bases, ipds, self.ref_pos)):
                self.entries[self.ref_pos[i]] = ipd_entry(tup)
            # self.cap_outliers()
            self.subread_normalize()

        def cap_outliers(self, max_ipd=10):
            """
            Cap the outlier IPDs at max_ipd seconds.
            """
            for read_pos, entry in self.entries.iteritems():
                entry.ipd = min(entry.ipd, max_ipd)

        def subread_normalize(self):
            """
            Every IPD entry needs to be normalized by the mean IPD of its subread.
            """
            if len(self.entries) == 0:
                # Nothing to do here.
                return self.entries
            # First populate list of all IPDs per subread. Will use to get
            # normalization factor.
            subread_vals = []
            for entry in self.entries.values():
                # Only do if this IPD is NOT from an error position
                if entry.ipd != -9:
                    subread_vals.append(entry.ipd)
            rawIPDs = np.array(map(lambda x: math.log(x + 0.001), subread_vals))
            nfs = rawIPDs.mean()
            for pos, entry in self.entries.iteritems():
                if entry.ipd == -9:
                    newIPD = -9
                else:
                    newIPD = math.log(entry.ipd + 0.001) - nfs
                entry.ipd = newIPD

        def zip_bases_and_IPDs(self):
            """
            Reassemble the read and IPD values using the subread-normalized IPDs
            """
            od = OrderedDict(sorted(self.entries.items()))
            ref = []
            ref_pos = []
            self.ipds = []
            for read_pos, entry in od.items():
                ref.append(entry.ref_base)
                ref_pos.append(entry.ref_pos)
                self.ipds.append(entry.ipd)
            self.ref_str = "".join(ref)
            self.ref_pos = ref_pos

    reader = CmpH5Reader(self.cmph5)
    read_refs = {}
    read_SMp = {}
    read_SMp_N = {}
    read_comps = {}
    read_labs = {}
    contig_SCp = {}
    i = 0
    n_mols = 0
    cwd = os.getcwd()
    # Periodically (after <chunksize> alignments) write out data to a
    # contig-specific tmp file
    chunksize = 10
    self.chunkdir = "chunk_%s" % self.chunk_id
    if os.path.exists(os.path.join(self.opts.tmp, self.chunkdir)):
        shutil.rmtree(os.path.join(self.opts.tmp, self.chunkdir))
    os.mkdir(os.path.join(self.opts.tmp, self.chunkdir))
    to_dump = defaultdict(list)

    def dump_data_to_contig_files(refName, to_dump, read_labs):
        refName = mbin.slugify(refName)
        ref_subname_fn = "%s_readnames.tmp" % refName
        ref_label_fn = "%s_labels.tmp" % refName
        ref_length_fn = "%s_lengths.tmp" % refName
        ref_ipds_fn = "%s_ipds.tmp" % refName
        ref_ipds_N_fn = "%s_ipdsN.tmp" % refName
        ref_comp_N_fn = "%s_compN.tmp" % refName
        ref_strand_fn = "%s_strand.tmp" % refName
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_subname_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_label_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_length_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_N_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_comp_N_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_strand_fn))
        f_subnames = open(
            os.path.join(self.opts.tmp, self.chunkdir, ref_subname_fn), "a")
        f_labels = open(
            os.path.join(self.opts.tmp, self.chunkdir, ref_label_fn), "a")
        f_lengths = open(
            os.path.join(self.opts.tmp, self.chunkdir, ref_length_fn), "a")
        f_ipds = open(
            os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_fn), "a")
        f_ipds_N = open(
            os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_N_fn), "a")
        f_comp_N = open(
            os.path.join(self.opts.tmp, self.chunkdir, ref_comp_N_fn), "a")
        f_strand = open(
            os.path.join(self.opts.tmp, self.chunkdir, ref_strand_fn), "a")
        self.tmp_fs.add(f_subnames)
        self.tmp_fs.add(f_labels)
        self.tmp_fs.add(f_lengths)
        self.tmp_fs.add(f_ipds)
        self.tmp_fs.add(f_ipds_N)
        self.tmp_fs.add(f_comp_N)
        self.tmp_fs.add(f_strand)
        if self.opts.motifs_file is not None and self.opts.subtract_control:
            control_ipds_d = pickle.load(open(self.opts.control_pkl_name, "rb"))
        for i, (subread_ipds, subread_comps, readname, subread_length,
                strand) in enumerate(to_dump[refName]):
            ipd_kmers = [motif for motif in subread_ipds.iterkeys()]
            ipd_counts = [subread_ipds[motif][0] for motif in subread_ipds.iterkeys()]
            ipd_means = []
            if self.opts.motifs_file is not None and self.opts.subtract_control:
                for motif in subread_ipds.iterkeys():
                    if subread_ipds[motif][1] != 0.0:
                        w_control_sub = subread_ipds[motif][1] - control_ipds_d[motif]
                        ipd_means.append(w_control_sub)
                    else:
                        # Don't subtract control if no IPD values are
                        # available (i.e. IPD score == 0.0)
                        ipd_means.append(subread_ipds[motif][1])
            else:
                for motif in subread_ipds.iterkeys():
                    ipd_means.append(subread_ipds[motif][1])
            comp_kmers = np.array([motif for motif, ipds in subread_comps.items()])
            comp_counts = np.array([ipds for motif, ipds in subread_comps.items()])
            if i == 0 and refName not in self.refName_has_header:
                ref_ipds_kmers_fn = "%s_ipdskmers.tmp" % refName
                ref_comp_kmers_fn = "%s_compkmers.tmp" % refName
                f_ipds_kmers = open(
                    os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_kmers_fn), "a")
                f_comp_kmers = open(
                    os.path.join(self.opts.tmp, self.chunkdir, ref_comp_kmers_fn), "a")
                ipds_kmers_str = "\t".join(ipd_kmers)
                comp_kmers_str = "\t".join(comp_kmers)
                f_ipds_kmers.write("%s\n" % ipds_kmers_str)
                f_comp_kmers.write("%s\n" % comp_kmers_str)
                f_ipds_kmers.close()
                f_comp_kmers.close()
                self.refName_has_header.add(refName)
            ipds_str = "\t".join(map(lambda x: str(round(x, 3)), ipd_means))
            ipds_N_str = "\t".join(map(lambda x: str(x), ipd_counts))
            comp_counts_str = "\t".join(map(lambda x: str(x), comp_counts))
            f_subnames.write("%s\n" % readname)
            f_labels.write("%s\n" % read_labs[readname])
            f_lengths.write("%s\n" % subread_length)
            f_ipds.write("%s\n" % ipds_str)
            f_ipds_N.write("%s\n" % ipds_N_str)
            f_comp_N.write("%s\n" % comp_counts_str)
            f_strand.write("%s\n" % strand)
        for f in self.tmp_fs:
            f.close()

    self.tmp_fs = set()
    self.tmp_fns = set()
    self.refName_has_header = set()
    to_check = reader[self.idx]
    for alignment in to_check:
        ref_contig = mbin.slugify(alignment.referenceInfo[3])
        label = self.opts.h5_labels[self.cmph5]
        ref_len = self.opts.cmph5_contig_lens[self.cmph5][ref_contig]
        if (ref_len >= self.opts.minContigLength and
                alignment.referenceSpan >= self.opts.readlength_min and
                alignment.MapQV >= self.opts.minMapQV):
            to_get = min(self.N_target_reads, len(self.idx))
            # Report progress roughly every 10% of molecules (at least every 1)
            incr = max(1, to_get / 10)
            readname = "/".join(alignment.readName.split("/")[:-1])
            if len(read_labs.keys()) % incr == 0 and not read_labs.get(readname):
                logging.info("...chunk %s\t- mol %s/%s (%.1f%%)" %
                             (self.chunk_id, n_mols, to_get,
                              100.0 * n_mols / to_get))
            read_labs[readname] = label
            read_refs[readname] = ref_contig
            sub = subread(self.cmph5, alignment, label, self.opts)
            sub.zip_bases_and_IPDs()
            subread_ipds, subread_comps = read_scanner.scan_motifs("cmp",
                                                                   # sub.read_str,
                                                                   sub.ipds,
                                                                   sub.ref_str,
                                                                   sub.strand,
                                                                   self.motifs,
                                                                   self.bi_motifs,
                                                                   self.opts)
            to_dump[ref_contig].append((subread_ipds, subread_comps, readname,
                                        len(sub.ref_str), sub.strand))
            # Dump subread IPD and comp data to contig-specific file
            if (len(to_dump[ref_contig]) % chunksize == 0 and
                    len(to_dump[ref_contig]) != 0):
                dump_data_to_contig_files(ref_contig, to_dump, read_labs)
                to_dump[ref_contig] = []
            n_mols = len(read_labs.keys())
            i += 1
        if n_mols == self.N_target_reads:
            break
    for ref_contig in to_dump.keys():
        dump_data_to_contig_files(ref_contig, to_dump, read_labs)
    for f in self.tmp_fs:
        f.close()
    to_dump = defaultdict(list)
    if i == 0:
        logging.info("Chunk %s: no qualifying reads found!" % self.chunk_id)
    logging.info("Chunk %s: found %s alignments (%s molecules) > %sbp in %s" %
                 (self.chunk_id, i, len(read_labs.keys()),
                  self.opts.readlength_min, os.path.basename(self.cmph5)))
    reader.close()
    return self.tmp_fns
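# Hedged sketch: mbin.slugify() is used throughout to turn contig names into
# strings that are safe to embed in tmp file names. Its exact behavior is not
# shown in this excerpt; a typical Django-style implementation would be:

import re

def slugify(value):
    """Collapse a contig name to a filesystem-safe slug (assumed behavior)."""
    value = re.sub(r"[^\w\s-]", "", str(value)).strip()
    return re.sub(r"[-\s]+", "-", value)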