def test_split_by_contigs_presplit(self):
    # Consumes too much memory for Jenkins
    # Test to make sure the result of a split by contigs has an appropriate
    # number of records (make sure filters are appropriately aggressive)
    ds2 = DataSet(data.getXml(14))
    bams = ds2.externalResources.resourceIds
    assert len(bams) == 2
    refwindows = ds2.refWindows
    assert refwindows == [(0, 0, 224992)]
    # Strip the URI scheme prefix (e.g. "file://") to get local paths
    res1 = openIndexedAlignmentFile(bams[0][7:])
    res2 = openIndexedAlignmentFile(bams[1][7:])

    def count(iterable):
        count = 0
        for _ in iterable:
            count += 1
        return count

    assert count(res1.readsInRange(*refwindows[0])) == 1409
    assert count(res2.readsInRange(*refwindows[0])) == 1375
    assert count(ds2.readsInRange(*refwindows[0])) == 2784
    assert count(ds2.records) == 2784
    ds2.disableFilters()
    assert count(ds2.records) == 53552
    assert ds2.countRecords() == 53552
def test_split_by_contigs_presplit(self):
    # Consumes too much memory for Jenkins
    # Test to make sure the result of a split by contigs has an appropriate
    # number of records (make sure filters are appropriately aggressive)
    ds2 = DataSet(data.getXml(15))
    bams = ds2.externalResources.resourceIds
    self.assertEqual(len(bams), 2)
    refwindows = ds2.refWindows
    self.assertEqual(refwindows, [(0, 0, 224992)])
    res1 = openIndexedAlignmentFile(bams[0][7:])
    res2 = openIndexedAlignmentFile(bams[1][7:])

    def count(iterable):
        count = 0
        for _ in iterable:
            count += 1
        return count

    self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
    self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
    self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
    self.assertEqual(count(ds2.records), 2784)
    ds2.disableFilters()
    self.assertEqual(count(ds2.records), 53552)
    self.assertEqual(ds2.countRecords(), 53552)
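Both tests above lean on the same two idioms: open an indexed BAM with openIndexedAlignmentFile (after stripping the URI scheme from the resource id) and count what a reference window returns. A minimal standalone sketch of that pattern, assuming pbcore is installed and "example.bam" (a hypothetical path) has a .pbi index next to it:

# Minimal sketch of the counting pattern used in the tests above.
# "example.bam" is a hypothetical path; it needs a .pbi index alongside it.
from pbcore.io import openIndexedAlignmentFile

def count(iterable):
    """Exhaust an iterable and return how many items it yielded."""
    n = 0
    for _ in iterable:
        n += 1
    return n

reader = openIndexedAlignmentFile("example.bam")
refId, start, end = 0, 0, 224992   # a (refId, start, end) window, as in refWindows
print(count(reader.readsInRange(refId, start, end)))
reader.close()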
def track_split_molecule_alignments(self, idx_chunks, cmph5_file):
    # reader = CmpH5Reader(cmph5_file)
    reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)
    chunk_mols = {}
    for i, idx_chunk in enumerate(idx_chunks):
        chunk_mols[i] = set()
        for alignment in reader[idx_chunk]:
            if self.Config.opts.useZMW:
                mol_id = "%s_%s" % (alignment.HoleNumber, alignment.movieName)
            else:
                mol_id = alignment.MoleculeID
            chunk_mols[i].add(mol_id)
    reader.close()

    split_mols = set()
    i = 1
    for idx_chunk in idx_chunks[1:]:
        j = i - 1
        split = chunk_mols[i] & chunk_mols[j]
        split_mols = split_mols | split
        i += 1
    return split_mols
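The split detection above only compares adjacent chunks, which suffices because alignment indexes are dealt to chunks in sorted order, so one molecule can only straddle a single chunk boundary. The core of that intersection, as a standalone sketch with made-up molecule ids:

# Standalone sketch of the adjacent-chunk intersection above: a molecule
# is "split" if its id shows up in two consecutive chunks' molecule sets.
def find_split_molecules(chunk_mols):
    split = set()
    for left, right in zip(chunk_mols, chunk_mols[1:]):
        split |= left & right
    return split

print(find_split_molecules([{"m1", "m2"}, {"m2", "m3"}, {"m3"}]))  # {'m2', 'm3'}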
def split_up_control_IPDs(self, control_ipds, cmph5_file, idx_chunks):
    """
    Separate out relevant portions of the control_ipds dictionary.
    We are taking advantage of the fact that the alignment flat files
    are sorted by aligned reference position.
    """
    # reader = CmpH5Reader(cmph5_file)
    reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)
    local_control_ipds = {}
    for chunk_id, idx_chunk in enumerate(idx_chunks):
        idx_mins = [min(reader[idx].tStart, reader[idx].tEnd) for idx in idx_chunk]
        idx_maxs = [max(reader[idx].tStart, reader[idx].tEnd) for idx in idx_chunk]
        first_ref_pos = min(idx_mins)
        last_ref_pos = max(idx_maxs)
        # first_ref_pos = pull_last_ref_pos_from_alignments_file( alignments_flat_fn, "head" )
        # last_ref_pos = pull_last_ref_pos_from_alignments_file( alignments_flat_fn, "tail" )
        region_control = {0: {}, 1: {}}
        logging.debug("Split control IPD dicts -- chunk %s: %sbp - %sbp"
                      % (chunk_id, first_ref_pos, last_ref_pos + 1))
        for strand in region_control.keys():
            for pos in range(first_ref_pos, last_ref_pos + 1):
                try:
                    region_control[strand][pos] = control_ipds[strand][pos]
                except KeyError:
                    # In case we don't have WGA coverage at this position
                    pass
        local_control_ipds[chunk_id] = region_control
    return local_control_ipds
def get_reference_contigs(self, cmph5):
    """
    Pull out the list of contigs in the h5 file.
    """
    # reader = CmpH5Reader(cmph5)
    reader = openIndexedAlignmentFile(cmph5, self.Config.ref)
    contigs = set(map(lambda x: (x[3], x[2]), reader.referenceInfoTable))
    return contigs
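referenceInfoTable is a record array, so rows can be indexed positionally as above; other examples here read entry[3] for the contig name and entry[4] for its length. A hedged sketch using named attributes instead, on the assumption (consistent with the referenceInfo('E.faecalis.1').ID / .Length usage in the tests below) that Name and Length columns are exposed:

# Hedged sketch: access referenceInfoTable rows by field name rather than
# position. Name/Length are assumed column names; "aligned.bam" is a
# hypothetical path.
reader = openIndexedAlignmentFile("aligned.bam")
contigs = set((row.Name, row.Length) for row in reader.referenceInfoTable)
reader.close()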
def test_updateCounts(self):
    log.info("Testing updateCounts without filters")
    aln = AlignmentSet(data.getBam(0))
    readers = aln.resourceReaders()
    expLen = 0
    for reader in readers:
        for record in reader:
            expLen += record.readLength
            self.assertEqual(
                record.aStart, record.bam.pbi[record.rowNumber]['aStart'])
            self.assertEqual(
                record.aEnd, record.bam.pbi[record.rowNumber]['aEnd'])
    expNum = 0
    for reader in readers:
        expNum += len(reader)
    accLen = aln.metadata.totalLength
    accNum = aln.metadata.numRecords
    self.assertEqual(expLen, accLen)
    self.assertEqual(expNum, accNum)

    log.info("Testing whether filters are respected")
    aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    aln.updateCounts()
    accLen = aln.metadata.totalLength
    accNum = aln.metadata.numRecords

    def count(gen):
        count = 0
        for _ in gen:
            count += 1
        return count

    expLen = 0
    for reader in readers:
        for record in reader:
            expLen += record.readLength
    bfile = openIndexedAlignmentFile(data.getBam(0))
    rWin = (bfile.referenceInfo('E.faecalis.1').ID,
            0,
            bfile.referenceInfo('E.faecalis.1').Length)
    reads = bfile.readsInRange(*rWin)
    expNum = count(reads)
    expLen = 0
    reads = bfile.readsInRange(*rWin)
    for read in reads:
        expLen += read.readLength
    self.assertEqual(expLen, accLen)
    self.assertEqual(expNum, accNum)
def test_loading_reference(self):
    log.info('Opening Reference')
    r = ReferenceSet(data.getRef()).toExternalFiles()[0]
    log.info('Done Opening Reference')
    log.info('Opening AlignmentSet')
    d = AlignmentSet(data.getBam(), referenceFastaFname=r)
    log.info('Done Opening AlignmentSet')
    bfile = openIndexedAlignmentFile(data.getBam(), referenceFastaFname=r)
    self.assertTrue(bfile.isReferenceLoaded)
    for res in d.resourceReaders():
        self.assertTrue(res.isReferenceLoaded)
    aln = AlignmentSet(data.getBam())
    aln.addReference(r)
    for res in aln.resourceReaders():
        self.assertTrue(res.isReferenceLoaded)
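Passing referenceFastaFname at open time is what flips isReferenceLoaded to True on the reader, as the test above checks. A minimal sketch of just that step, with hypothetical paths:

# Minimal sketch, paths hypothetical: opening with a reference FASTA
# should leave the reader with isReferenceLoaded == True.
from pbcore.io import openIndexedAlignmentFile

reader = openIndexedAlignmentFile("aligned.bam", referenceFastaFname="ref.fasta")
assert reader.isReferenceLoaded
reader.close()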
def scan_WGA_aligns(self):
    """
    Get some necessary information about the WGA cmp.h5
    being used to generate the control IPD data.
    """
    self.opts.aln_fn_labels = {}
    self.opts.aln_fn_contig_lens = {}
    self.opts.aln_fn_labels[self.control_aln_fn] = "control"
    self.opts.aln_fn_contig_lens[self.control_aln_fn] = {}
    # reader = CmpH5Reader(self.control_aln_fn)
    reader = openIndexedAlignmentFile(self.control_aln_fn)
    for entry in reader.referenceInfoTable:
        name = entry[3]
        length = entry[4]
        slug_name = mbin.slugify(name)
        self.opts.aln_fn_contig_lens[self.control_aln_fn][slug_name] = length
    reader.close()
    return self.opts
def launch_parallel_molecule_loading(self, cmph5_file, prefix,
                                     movie_name_ID_map, control_ipds):
    logging.debug("Creating tasks...")
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    logging.debug("Done.")

    logging.debug("Starting consumers...")
    num_jobs = self.Config.opts.procs
    consumers = [Consumer(tasks, results, self.Config.opts.contig_id)
                 for i in xrange(num_jobs)]
    for w in consumers:
        w.start()
    logging.debug("Done.")

    def chunks(l, n):
        """
        Yield successive n-sized chunks from l.
        """
        for i in xrange(0, len(l), n):
            yield l[i:i + n]

    # Enqueue jobs
    logging.info("Partitioning %s into %s chunks for analysis..."
                 % (cmph5_file, num_jobs))
    # reader = CmpH5Reader(cmph5_file)
    reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)
    if self.Config.opts.align_ftype == "cmp":
        alnIDs = [r.AlnID for r in reader
                  if r.referenceInfo[2] == self.Config.opts.contig_id]
    elif self.Config.opts.align_ftype == "bam":
        alnIDs = [r.rowNumber for r in reader
                  if r.referenceInfo[2] == self.Config.opts.contig_id]
    if len(alnIDs) <= num_jobs:
        num_jobs = 1
    reader.close()

    # for chunk_id,alignments_flat_fn in enumerate(tmp_flat_files):
    chunksize = int(math.ceil(float(len(alnIDs)) / num_jobs))
    idx_chunks = list(chunks((np.array(alnIDs) - 1), chunksize))
    if len(idx_chunks[-1]) == 1 and len(idx_chunks) > 1:
        idx_chunks = idx_chunks[:-1]

    if prefix == "nat_":
        logging.info("%s - Separating out file-matched regions of the "
                     "control IPD values dict..." % self.Config.opts.contig_id)
        local_control_ipds = self.split_up_control_IPDs(
            control_ipds, cmph5_file, idx_chunks)
        logging.debug("%s - Done." % self.Config.opts.contig_id)

    # In splitting alignment indexes among processes, some molecules will
    # have alignments going to different processes. Track these.
    split_mols = self.track_split_molecule_alignments(idx_chunks, cmph5_file)

    for chunk_id in range(num_jobs):
        idx = idx_chunks[chunk_id]
        if prefix == "wga_":
            tasks.put(parse_mol_aligns.wga_molecules_processor(
                cmph5_file,
                chunk_id,
                prefix,
                self.Config.opts.contig_id,
                self.ref_size,
                self.sites_pos,
                self.sites_neg,
                self.Config.opts,
                idx,
                split_mols))
        else:
            logging.debug("Launching subprocess %s..." % chunk_id)
            tasks.put(parse_mol_aligns.native_molecules_processor(
                cmph5_file,
                chunk_id,
                prefix,
                self.Config.opts,
                self.Config.fastq,
                self.Config.ref,
                copy.copy(movie_name_ID_map),
                local_control_ipds[chunk_id],
                self.ref_size,
                self.sites_pos,
                self.sites_neg,
                idx,
                split_mols))
            logging.debug("Done (%s)." % chunk_id)

    # Add a 'poison pill' for each consumer
    for i in xrange(self.Config.opts.procs):
        tasks.put(None)
    tasks.join()

    # Start printing results
    parallel_results = []
    while num_jobs:
        result = results.get()
        parallel_results.append(result)
        num_jobs -= 1
    return parallel_results
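The Consumer/poison-pill machinery above is standard multiprocessing queue plumbing: each consumer pulls callables off a JoinableQueue until it sees a None, and results come back on a plain Queue. A self-contained sketch of that pattern, independent of the pbcore-specific processor objects (the squared integers are stand-ins for the task callables):

# Generic sketch of the JoinableQueue/poison-pill pattern used above.
import multiprocessing

def consumer(tasks, results):
    while True:
        item = tasks.get()
        if item is None:          # poison pill: this consumer is done
            tasks.task_done()
            break
        results.put(item * item)  # stand-in for invoking a processor object
        tasks.task_done()

if __name__ == "__main__":
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=consumer, args=(tasks, results))
               for _ in range(4)]
    for w in workers:
        w.start()
    for i in range(8):            # enqueue jobs
        tasks.put(i)
    for _ in workers:             # one pill per consumer
        tasks.put(None)
    tasks.join()                  # block until every task is marked done
    print(sorted(results.get() for _ in range(8)))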
def __call__(self):
    # reader = CmpH5Reader(self.cmph5)
    reader = openIndexedAlignmentFile(self.cmph5, self.opts.ref)
    self.chunk_output_fn = "%s_%s.tmp" % (self.out, self.chunk_id)
    self.var_chunk_fn = "vars_%s.tmp" % self.chunk_id
    if self.align:
        self.var_f = open(self.var_chunk_fn, "w")

    mol_alignments = defaultdict(list)
    for i, alignment in enumerate(reader[self.idx]):
        if self.opts.align_ftype == "cmp":
            aln_acc = alignment.accuracy
            aln_len = len(alignment.alignmentArray())
        elif self.opts.align_ftype == "bam":
            aln_features = dict(alignment.peer.tags)
            aln_acc = aln_features["rq"]
            aln_len = len(aln_features["ip"])
        if aln_acc >= self.opts.minAcc and aln_len >= self.opts.minSubreadLen:
            if self.opts.useZMW:
                mol_id = "%s_%s" % (alignment.HoleNumber, alignment.movieName)
            else:
                mol_id = alignment.MoleculeID
            mol_alignments[mol_id].append(alignment)

    self.mols = {}
    incr = max(4, len(mol_alignments) // 4)
    for i, (mol_id, alignments) in enumerate(mol_alignments.iteritems()):
        if i % incr == 0:
            logging.info("...chunk %s - processing molecules: %s/%s (%.1f%%)"
                         % (self.chunk_id, i, len(mol_alignments),
                            100.0 * i / len(mol_alignments)))
        mol = molecule(alignments, self.prefix, self.leftAnchor,
                       self.rightAnchor, self.contig_id, self.sites_pos,
                       self.sites_neg, self.cmph5, self.opts)
        if self.opts.useZMW:
            # Replace the bad moleculeID with the good ZMW ID,
            # formatted: <zmwID>_<movieID>
            mol.mol_id = mol_id
        if len(mol.entries) > 0:
            self.mols[mol_id] = mol

    # Exclude any molecules that are divided between split-up alignment files
    self.mols = remove_split_up_molecules(self.mols, self.split_mols)

    # [Optional]: align CCS reads to reference to find SNPs/errors
    if not self.align:
        # Need to empirically try to determine subread start/end positions
        # in order to designate off-limits entries.
        for mol in self.mols.values():
            mol.var_pos = []
            # self.empirical_get_start_end_pos( mol )
    elif len(self.mols.values()) > 0:
        CCS = CCS_aligner.mols_aligner(self.mols,
                                       self.fastq,
                                       self.ref,
                                       self.movie_name_ID_map,
                                       self.align,
                                       self.chunk_id)
        # Output the called CCS read-level variants/errors to a chunk file
        for mol in self.mols.values():
            vars_str = ",".join(map(lambda x: str(x), mol.var_no_sc))
            self.var_f.write("%s %s\n" % (mol.mol_id, vars_str))

    # if self.SMsn:
    #     # If the empirical start/end discovery showed a lack of positions
    #     # with sufficient coverage, remove molecule
    #     del_me = [mol.mol_id for mol in self.mols.values() if mol.to_del]
    #     logging.debug("Process %s (chunk %s): deleting %s molecules due to "
    #                   "too many positions with low coverage."
    #                   % (self.chunk_id, i, len(del_me)))
    #     for mol_id in del_me:
    #         del self.mols[mol_id]

    if len(self.mols.values()) > 0:
        # Identify and remove positions to be excluded from further analysis
        tot_entries = 0
        tot_entries_deleted = 0
        for mol in self.mols.values():
            entries_deleted, entries = self.remove_off_limits_positions(mol)
            tot_entries += entries
            tot_entries_deleted += entries_deleted
        pct_deleted = float(tot_entries_deleted) / tot_entries * 100
        logging.debug("Process %s (chunk %s): deleted %s (%.1f%%) "
                      "off-limits positions."
                      % (self.chunk_id, i, tot_entries_deleted, pct_deleted))

    # Generate the IPD arrays per genomic position/strand by aggregating
    # all IPD entries across molecules (self.ipdArrays)
    logging.debug("Process %s: generating IPD arrays..." % self.chunk_id)
    for mol in self.mols.values():
        self.create_arrays(mol)

    if self.SMp:
        for mol in self.mols.values():
            mol.ipdArrays = self.condense_native_mol_motifs_into_one_pos(mol)

    # Now run the comparison test
    logging.debug("Process %s: running comparisons..." % self.chunk_id)
    for mol in self.mols.values():
        self.get_scores(mol)

    mols_w_results = len([mol for mol in self.mols.values()
                          if len(mol.output) > 0])
    logging.debug("Process %s: %s molecules generated comparison test output"
                  % (self.chunk_id, mols_w_results))
    if mols_w_results > 0:
        self.print_output(self.mols.values())
        # self.concatenate_mol_results()

    if self.align:
        self.var_f.close()
    reader.close()
    return self.chunk_output_fn
def __call__(self):
    # reader = CmpH5Reader(self.cmph5)
    reader = openIndexedAlignmentFile(self.cmph5, self.opts.ref)
    mol_alignments = defaultdict(list)
    for alignment in reader[self.idx]:
        if self.opts.align_ftype == "cmp":
            aln_acc = alignment.accuracy
            aln_len = len(alignment.alignmentArray())
        elif self.opts.align_ftype == "bam":
            aln_features = dict(alignment.peer.tags)
            aln_acc = aln_features["rq"]
            aln_len = len(aln_features["ip"])
        if aln_acc >= self.opts.minAcc and aln_len >= self.opts.minSubreadLen:
            if self.opts.useZMW:
                mol_id = "%s_%s" % (alignment.HoleNumber, alignment.movieName)
            else:
                mol_id = alignment.MoleculeID
            mol_alignments[mol_id].append(alignment)

    self.mols = {}
    incr = max(4, len(mol_alignments) // 4)
    for i, (mol_id, alignments) in enumerate(mol_alignments.iteritems()):
        if i % incr == 0:
            logging.info("...chunk %s - %s/%s (%.1f%%) alignments processed..."
                         % (self.chunk_id, i, len(mol_alignments),
                            100 * float(i) / len(mol_alignments)))
        mol = molecule(alignments, self.prefix, self.leftAnchor,
                       self.rightAnchor, self.contig_id, self.sites_pos,
                       self.sites_neg, self.cmph5, self.opts)
        if self.opts.useZMW:
            # Replace the bad moleculeID with the good ZMW ID,
            # formatted: <zmwID>_<movieID>
            mol.mol_id = mol_id
        if len(mol.entries) > 0:
            self.mols[mol.mol_id] = mol

    if not self.opts.useZMW:
        # Generate map between ZMW and molecule IDs (self.zmw_mol_map)
        self.mols, self.zmw_mol_map = generate_molecule_ZMW_map(
            self.mols, self.chunk_id)

    # Exclude any molecules that are divided between split-up alignment files
    logging.debug("Process %s: removing %s total molecules whose alignments "
                  "are in different chunks..."
                  % (self.chunk_id, len(self.split_mols)))
    self.mols = remove_split_up_molecules(self.mols, self.split_mols)

    # Generate the IPD arrays per genomic position/strand by aggregating all
    # IPD entries across molecules (self.ipdArrays)
    self.ipdArrays = self.create_agg_IPD_arrays()
    reader.close()

    # Return the processed IPD array dictionary
    return self.ipdArrays
def __call__(self):

    class ipd_entry:
        def __init__(self, tup):
            self.ref_base = tup[0]
            self.ipd = tup[1]
            self.ref_pos = tup[2]

    class subread:
        def __init__(self, align_fn, alignment, label, opts):
            leftAnchor = 1
            rightAnchor = 1
            self.entries = {}
            self.opts = opts
            self.subname = alignment.readName
            alignedLength = alignment.referenceSpan
            self.refName = alignment.referenceInfo[3]
            zmw = alignment.HoleNumber
            ###########################################
            # self.mol = alignment.MoleculeID
            self.mol = alignment.HoleNumber
            ###########################################
            if alignment.isForwardStrand:
                self.strand = 0
            else:
                self.strand = 1
            self.ref_bases = alignment.reference()
            read_calls = alignment.transcript()
            ref_pos = list(alignment.referencePositions())
            IPD = list(alignment.IPD())
            self.label = self.opts.aln_fn_labels[align_fn]

            # Go through all entries and flag which positions are MM/indels
            error_mk = []
            for read_call in read_calls:
                if read_call != "M":
                    # Mismatch or indel at this position!
                    error_mk.append(1)
                else:
                    error_mk.append(0)
            # Get the indices of all the non-matches
            error_idx = [i for (i, val) in enumerate(error_mk) if val == 1]
            for error_id in error_idx:
                try:
                    for j in range(leftAnchor):
                        error_mk[error_id - (j + 1)] = 1
                    for j in range(rightAnchor):
                        error_mk[error_id + (j + 1)] = 1
                except IndexError:
                    pass
            error_mk = np.array(error_mk)

            ipds = np.array(IPD) / self.opts.fps
            strands = np.array([self.strand] * len(read_calls))
            self.ref_bases = np.array(list(self.ref_bases))
            self.ref_pos = np.array(ref_pos)
            read_calls = np.array(list(read_calls))

            # Mark the error positions, but leave them in the sequence so
            # we can pull out intact motifs from contiguous correct bases
            self.ref_bases[error_mk == 1] = "*"
            read_calls[error_mk == 1] = "*"
            ipds[error_mk == 1] = -9
            strands[error_mk == 1] = -9

            # Attach these IPD entries to the subread object
            for i, tup in enumerate(zip(self.ref_bases, ipds, self.ref_pos)):
                self.entries[self.ref_pos[i]] = ipd_entry(tup)

            # self.cap_outliers()
            self.subread_normalize()

        def cap_outliers(self, max_ipd=10):
            """
            Cap the outlier IPDs at max_ipd seconds.
            """
            for read_pos, entry in self.entries.iteritems():
                entry.ipd = min(entry.ipd, max_ipd)

        def subread_normalize(self):
            """
            Every IPD entry needs to be normalized by the mean IPD of its subread.
            """
            if len(self.entries) == 0:
                # Nothing to do here.
                return self.entries
            # First populate list of all IPDs per subread.
            # Will use to get normalization factor.
            subread_vals = []
            for entry in self.entries.values():
                # Only do if this IPD is NOT from an error position
                if entry.ipd != -9:
                    subread_vals.append(entry.ipd)
            rawIPDs = np.array(map(lambda x: math.log(x + 0.001), subread_vals))
            nfs = rawIPDs.mean()
            for pos, entry in self.entries.iteritems():
                if entry.ipd == -9:
                    newIPD = -9
                else:
                    newIPD = math.log(entry.ipd + 0.001) - nfs
                entry.ipd = newIPD

        def zip_bases_and_IPDs(self):
            """
            Reassemble the read and IPD values using the subread normalized IPDs
            """
            od = OrderedDict(sorted(self.entries.items()))
            ref = []
            ref_pos = []
            self.ipds = []
            for read_pos, entry in od.items():
                ref.append(entry.ref_base)
                ref_pos.append(entry.ref_pos)
                self.ipds.append(entry.ipd)
            self.ref_str = "".join(ref)
            self.ref_pos = ref_pos

    if self.opts.aln_ftype == "cmp":
        reader = CmpH5Reader(self.align_fn)
    elif self.opts.aln_ftype == "bam":
        reader = openIndexedAlignmentFile(self.align_fn, self.opts.ref)
    else:
        raise Exception("Unrecognized alignment filetype "
                        "(must be *.cmp.h5 or *.bam): %s" % self.align_fn)

    def get_fps(align_fn):
        """
        For *.cmp.h5 files, the frame rate (fps) is included in each alignment.
        For *.bam files, the frame rate is encoded in the file header
        (FRAMERATEHZ).
        """
        if self.opts.aln_ftype == "cmp":
            # Read frame rate directly from a cmp.h5 alignment
            reader = CmpH5Reader(align_fn)
            alignment = reader[0]
            fps = alignment.movieInfo[2]
        elif self.opts.aln_ftype == "bam":
            # Isolate description (DS) from read group (RG) in BAM header
            bam = pysam.AlignmentFile(align_fn, "rb")
            h = bam.header
            rg_ds_l = h.as_dict()["RG"][0]["DS"].split(";")
            rg_ds_d = dict([(x.split("=")[0], x.split("=")[1]) for x in rg_ds_l])
            fps = float(rg_ds_d["FRAMERATEHZ"])
        return fps

    # Pull the frame rate value from the alignment file
    self.opts.fps = get_fps(self.align_fn)

    read_refs = {}
    read_SMp = {}
    read_SMp_N = {}
    read_comps = {}
    read_labs = {}
    contig_SCp = {}
    i = 0
    n_mols = 0
    cwd = os.getcwd()

    # Periodically (after <chunksize> alignments) write out data to a
    # contig-specific tmp file
    chunksize = 10
    self.chunkdir = "chunk_%s" % self.chunk_id
    if os.path.exists(os.path.join(self.opts.tmp, self.chunkdir)):
        shutil.rmtree(os.path.join(self.opts.tmp, self.chunkdir))
    os.mkdir(os.path.join(self.opts.tmp, self.chunkdir))
    to_dump = defaultdict(list)

    def dump_data_to_contig_files(refName, to_dump, read_labs):
        refName = mbin.slugify(refName)
        ref_subname_fn = "%s_readnames.tmp" % refName
        ref_label_fn = "%s_labels.tmp" % refName
        ref_length_fn = "%s_lengths.tmp" % refName
        ref_ipds_fn = "%s_ipds.tmp" % refName
        ref_ipds_N_fn = "%s_ipdsN.tmp" % refName
        ref_comp_N_fn = "%s_compN.tmp" % refName
        ref_strand_fn = "%s_strand.tmp" % refName

        self.tmp_fns.add(os.path.join(self.chunkdir, ref_subname_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_label_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_length_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_N_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_comp_N_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_strand_fn))
        f_subnames = open(os.path.join(self.opts.tmp, self.chunkdir, ref_subname_fn), "a")
        f_labels = open(os.path.join(self.opts.tmp, self.chunkdir, ref_label_fn), "a")
        f_lengths = open(os.path.join(self.opts.tmp, self.chunkdir, ref_length_fn), "a")
        f_ipds = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_fn), "a")
        f_ipds_N = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_N_fn), "a")
        f_comp_N = open(os.path.join(self.opts.tmp, self.chunkdir, ref_comp_N_fn), "a")
        f_strand = open(os.path.join(self.opts.tmp, self.chunkdir, ref_strand_fn), "a")
        self.tmp_fs.add(f_subnames)
        self.tmp_fs.add(f_labels)
        self.tmp_fs.add(f_lengths)
        self.tmp_fs.add(f_ipds)
        self.tmp_fs.add(f_ipds_N)
        self.tmp_fs.add(f_comp_N)
        self.tmp_fs.add(f_strand)

        if self.opts.motifs_file is not None and self.opts.subtract_control:
            control_ipds_d = pickle.load(open(self.opts.control_pkl_name, "rb"))

        for i, (subread_ipds, subread_comps, readname, subread_length, strand) in enumerate(to_dump[refName]):
            ipd_kmers = [motif for motif in subread_ipds.iterkeys()]
            ipd_counts = [subread_ipds[motif][0] for motif in subread_ipds.iterkeys()]
            ipd_means = []
            if self.opts.motifs_file is not None and self.opts.subtract_control:
                for motif in subread_ipds.iterkeys():
                    if subread_ipds[motif][1] != 0.0:
                        w_control_sub = subread_ipds[motif][1] - control_ipds_d[motif]
                        ipd_means.append(w_control_sub)
                    else:
                        # Don't subtract control if no ipd values are
                        # available (i.e. IPD score == 0.0)
                        ipd_means.append(subread_ipds[motif][1])
            else:
                for motif in subread_ipds.iterkeys():
                    ipd_means.append(subread_ipds[motif][1])

            comp_kmers = np.array([motif for motif, ipds in subread_comps.items()])
            comp_counts = np.array([ipds for motif, ipds in subread_comps.items()])
            if i == 0 and refName not in self.refName_has_header:
                ref_ipds_kmers_fn = "%s_ipdskmers.tmp" % refName
                ref_comp_kmers_fn = "%s_compkmers.tmp" % refName
                f_ipds_kmers = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_kmers_fn), "a")
                f_comp_kmers = open(os.path.join(self.opts.tmp, self.chunkdir, ref_comp_kmers_fn), "a")
                ipds_kmers_str = "\t".join(ipd_kmers)
                comp_kmers_str = "\t".join(comp_kmers)
                f_ipds_kmers.write("%s\n" % ipds_kmers_str)
                f_comp_kmers.write("%s\n" % comp_kmers_str)
                f_ipds_kmers.close()
                f_comp_kmers.close()
                self.refName_has_header.add(refName)

            ipds_str = "\t".join(map(lambda x: str(round(x, 3)), ipd_means))
            ipds_N_str = "\t".join(map(lambda x: str(x), ipd_counts))
            comp_counts_str = "\t".join(map(lambda x: str(x), comp_counts))
            f_subnames.write("%s\n" % readname)
            f_labels.write("%s\n" % read_labs[readname])
            f_lengths.write("%s\n" % subread_length)
            f_ipds.write("%s\n" % ipds_str)
            f_ipds_N.write("%s\n" % ipds_N_str)
            f_comp_N.write("%s\n" % comp_counts_str)
            f_strand.write("%s\n" % strand)
        for f in self.tmp_fs:
            f.close()

    self.tmp_fs = set()
    self.tmp_fns = set()
    self.refName_has_header = set()
    to_check = reader[self.idx]
    for alignment in to_check:
        ref_contig = mbin.slugify(alignment.referenceInfo[3])
        label = self.opts.aln_fn_labels[self.align_fn]
        ref_len = self.opts.aln_fn_contig_lens[self.align_fn][ref_contig]
        if (ref_len >= self.opts.minContigLength and
                alignment.referenceSpan >= self.opts.readlength_min and
                alignment.MapQV >= self.opts.minMapQV):
            to_get = min(self.N_target_reads, len(self.idx))
            incr = max(to_get // 10, 1)
            readname = "/".join(alignment.readName.split("/")[:-1])
            if len(read_labs.keys()) % incr == 0 and not read_labs.get(readname):
                logging.info("...chunk %s\t- mol %s/%s (%.1f%%)"
                             % (self.chunk_id, n_mols, to_get,
                                100.0 * n_mols / to_get))
            read_labs[readname] = label
            read_refs[readname] = ref_contig

            sub = subread(self.align_fn, alignment, label, self.opts)
            sub.zip_bases_and_IPDs()
            subread_ipds, subread_comps = read_scanner.scan_motifs(
                "aligned",
                sub.ipds,
                sub.ref_str,
                sub.strand,
                self.motifs,
                self.bi_motifs,
                self.opts)
            to_dump[ref_contig].append(
                (subread_ipds, subread_comps, readname, len(sub.ref_str), sub.strand))
            # Dump subread IPD and comp data to contig-specific file
            if len(to_dump[ref_contig]) % chunksize == 0 and len(to_dump[ref_contig]) != 0:
                dump_data_to_contig_files(ref_contig, to_dump, read_labs)
                to_dump[ref_contig] = []
            n_mols = len(read_labs.keys())
            i += 1
            if n_mols == self.N_target_reads:
                break

    for ref_contig in to_dump.keys():
        dump_data_to_contig_files(ref_contig, to_dump, read_labs)
    for f in self.tmp_fs:
        f.close()
    to_dump = defaultdict(list)

    if i == 0:
        logging.info("Chunk %s: no qualifying reads found!" % self.chunk_id)
    logging.info("Chunk %s: found %s alignments (%s molecules) > %sbp in %s"
                 % (self.chunk_id, i, len(read_labs.keys()),
                    self.opts.readlength_min, os.path.basename(self.align_fn)))
    reader.close()
    return self.tmp_fns
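The subread_normalize step above amounts to centering each subread's log-IPDs on their own mean, with -9 reserved as a sentinel for masked error positions. A standalone sketch of just that arithmetic:

# Standalone sketch of the per-subread normalization above: log-transform
# each IPD (with the same +0.001 offset) and subtract the subread's mean
# log-IPD; the -9 sentinel marking masked error positions passes through.
import math
import numpy as np

def normalize_subread_ipds(ipds, sentinel=-9):
    valid = [ipd for ipd in ipds if ipd != sentinel]
    if not valid:
        return list(ipds)
    nfs = np.mean([math.log(ipd + 0.001) for ipd in valid])
    return [ipd if ipd == sentinel else math.log(ipd + 0.001) - nfs
            for ipd in ipds]

print(normalize_subread_ipds([0.5, 1.2, -9, 0.8]))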
def launch_parallel_molecule_loading(self, cmph5_file, prefix,
                                     movie_name_ID_map, control_ipds):
    logging.debug("Creating tasks...")
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    logging.debug("Done.")

    logging.debug("Starting consumers...")
    num_jobs = self.Config.opts.procs
    consumers = [Consumer(tasks, results, self.Config.opts.contig_id)
                 for i in xrange(num_jobs)]
    for w in consumers:
        w.start()
    logging.debug("Done.")

    def chunks(l, n):
        """
        Yield successive n-sized chunks from l.
        """
        for i in xrange(0, len(l), n):
            yield l[i:i + n]

    # Enqueue jobs
    logging.info("Partitioning %s into %s chunks for analysis..."
                 % (cmph5_file, num_jobs))
    # reader = CmpH5Reader(cmph5_file)
    reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)
    if self.Config.opts.align_ftype == "cmp":
        alnIDs = [r.AlnID for r in reader
                  if r.referenceInfo[2] == self.Config.opts.contig_id]
    elif self.Config.opts.align_ftype == "bam":
        alnIDs = [r.rowNumber for r in reader
                  if r.referenceInfo[2] == self.Config.opts.contig_id]
    if len(alnIDs) <= num_jobs:
        num_jobs = 1

    # for chunk_id,alignments_flat_fn in enumerate(tmp_flat_files):
    # chunksize = int(math.ceil(float(len(alnIDs)) / num_jobs))
    # idx_chunks = list(chunks((np.array(alnIDs) - 1), chunksize))
    # if len(idx_chunks[-1]) == 1 and len(idx_chunks) > 1:
    #     idx_chunks = idx_chunks[:-1]

    ############################################################################
    ## MODIFIED CHUNKING
    ## ENSURES SUBREADS FROM SAME MOLECULE ARE NOT SPLIT
    logging.info("Chunking to ensure subreads are not split between chunks...")
    grouped_mols = defaultdict(list)
    for alnID in alnIDs:
        mol_id = "%s_%s" % (reader[alnID].HoleNumber, reader[alnID].movieName)
        grouped_mols[mol_id].append(alnID)
    reader.close()

    chunksize = int(math.ceil(float(len(grouped_mols)) / num_jobs))
    logging.info("Chunking: %s / %s molecules per chunk"
                 % (chunksize, len(grouped_mols)))
    idx_chunks = []
    added = 0
    chunk = []
    for mol in grouped_mols.keys():
        if added >= chunksize:
            idx_chunks.append(chunk)
            chunk = []
            added = 0
        chunk += grouped_mols[mol]
        added += 1
    if len(chunk) > 0:
        idx_chunks.append(chunk)
    idx_chunks = np.array(idx_chunks)
    num_jobs = len(idx_chunks)

    idx_count = 0
    for idx_chunk in idx_chunks:
        for idx in idx_chunk:
            idx_count += 1
    logging.info("%s / %s subreads successfully chunked..."
                 % (idx_count, len(alnIDs)))
    #############################################################################

    if prefix == "nat_":
        logging.info("%s - Separating out file-matched regions of the "
                     "control IPD values dict..." % self.Config.opts.contig_id)
        local_control_ipds = self.split_up_control_IPDs(
            control_ipds, cmph5_file, idx_chunks)
        logging.debug("%s - Done." % self.Config.opts.contig_id)

    # In splitting alignment indexes among processes, some molecules will
    # have alignments going to different processes. Track these.
    split_mols = self.track_split_molecule_alignments(idx_chunks, cmph5_file)

    for chunk_id in range(num_jobs):
        idx = idx_chunks[chunk_id]
        if prefix == "wga_":
            tasks.put(parse_mol_aligns.wga_molecules_processor(
                cmph5_file,
                chunk_id,
                prefix,
                self.Config.opts.contig_id,
                self.ref_size,
                self.sites_pos,
                self.sites_neg,
                self.Config.opts,
                idx,
                split_mols))
        else:
            logging.debug("Launching subprocess %s..." % chunk_id)
            tasks.put(parse_mol_aligns.native_molecules_processor(
                cmph5_file,
                chunk_id,
                prefix,
                self.Config.opts,
                self.Config.fastq,
                self.Config.ref,
                copy.copy(movie_name_ID_map),
                local_control_ipds[chunk_id],
                self.ref_size,
                self.sites_pos,
                self.sites_neg,
                idx,
                split_mols))
            logging.debug("Done (%s)." % chunk_id)

    # Add a 'poison pill' for each consumer
    for i in xrange(self.Config.opts.procs):
        tasks.put(None)
    tasks.join()

    # Start printing results
    parallel_results = []
    while num_jobs:
        result = results.get()
        parallel_results.append(result)
        num_jobs -= 1
    return parallel_results
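The modified chunking above is a group-then-deal strategy: bucket alignment indexes by molecule key, then emit whole buckets into chunks so a molecule's subreads always travel together. A generic sketch of the same idea, with an illustrative key function in place of the "<zmw>_<movie>" molecule id:

# Generic sketch of the molecule-aware chunking above: items are grouped
# by key first, and whole groups are dealt into chunks, so no group is
# ever split across two chunks.
import math
from collections import defaultdict

def chunk_by_group(items, key, num_chunks):
    groups = defaultdict(list)
    for item in items:
        groups[key(item)].append(item)
    per_chunk = int(math.ceil(float(len(groups)) / num_chunks))
    chunks, current, added = [], [], 0
    for group in groups.values():
        if added >= per_chunk:
            chunks.append(current)
            current, added = [], 0
        current.extend(group)
        added += 1
    if current:
        chunks.append(current)
    return chunks

# e.g. key could map an alignment index to its molecule id
print(chunk_by_group(range(10), key=lambda n: n // 3, num_chunks=2))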