Example #1
def __check_input(opts, args, parser):
    """
	Make sure the input is in the form of either a cmp.h5 file of aligned reads
	or a FOFN of unaligned bas.h5 files. Also make sure that a reference fasta 
	file is specified if 
	"""
    arg = args[0]
    h5_files = []
    opts.h5_labels = {}

    if arg[-6:] == "cmp.h5":
        print "Found cmp.h5 of aligned reads:"

        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {}
        opts.cmph5_contig_lens[arg] = {}

        h5_files.append(arg)
        print "  -- %s" % arg
        print "Getting contig information from %s..." % arg
        reader = CmpH5Reader(arg)
        for entry in reader.referenceInfoTable:
            name = entry[3]
            length = entry[4]
            slug_name = mbin.slugify(name)
            opts.cmph5_contig_lens[arg][slug_name] = length
        opts.h5_labels[arg] = "remove"
        reader.close()

    elif arg[-6:] == "bas.h5":
        print "Found bas.h5 of unaligned reads:"
        opts.h5_type = "bas"
        h5_files.append(arg)
        opts.h5_labels[arg] = "remove"
        print "  -- %s" % arg

    elif arg[-5:] == ".fofn":
        print "Found FOFN of bas.h5 files:"
        opts.h5_type = "bas"
        # One bas.h5 path per line; skip blank lines. (np.atleast_1d over the
        # raw file contents yields a single-element array, not one per line.)
        fns = [line.strip() for line in open(arg, "r") if line.strip()]
        h5_files = fns
        for fn in fns:
            print "  -- %s" % fn
            opts.h5_labels[fn] = "remove"

    if opts.h5_type == "bas":
        print "*************************************************************"
        print "* Motif filtering using unaligned reads is not recommended. *"
        print "*         Aligned reads work much better for this!          *"
        print "*************************************************************"
        print ""

    if opts.h5_type == "bas" and opts.cross_cov_bins != None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!"
        )

    return h5_files
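
As a minimal sketch, assuming this function lives in an optparse-based script: the --cross_cov_bins option name is taken from the check above, and everything else is a hypothetical harness, not the tool's actual CLI.

# Hypothetical driver for __check_input(); assumes it is defined in the same
# module and that the script uses optparse, as parser.error() suggests.
from optparse import OptionParser

parser = OptionParser(usage="%prog [options] <cmp.h5 | bas.h5 | FOFN>")
parser.add_option("--cross_cov_bins", default=None,
                  help="only valid for cmp.h5 inputs (see check above)")
opts, args = parser.parse_args()

h5_files = __check_input(opts, args, parser)
print "Will process %s input file(s) of type %s" % (len(h5_files), opts.h5_type)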
Example #2
File: pbssc.py  Project: verheytb/pbssc
def workerProcess(inQueue, refFile, quiverConfig):
    # args, resultQueue, counter, lseq and rseq are module-level globals
    # defined in the script body (see Example #6)
    with CmpH5Reader(os.path.join(args.jobDir, "data/aligned_reads.cmp.h5")) as d:
        referenceSeq = IndexedFastaReader(refFile)[0].sequence
        while True:
            queue_item = inQueue.get()
            if queue_item == "die":
                return
            else:
                movieID, holeNumber, rcrefstrand = queue_item
                alns = d[((d.MovieID == movieID) & (d.HoleNumber == holeNumber) & (d.RCRefStrand == rcrefstrand))]
                cssName = "/".join(alns[0].readName.split("/")[:-1]) + "/" + str(alns[0].RCRefStrand) + "/ssc"
                if args.ignore_barcodes:
                    cssObj = ConsensusSequence(cssName, "", "", len(alns), 0, 0, "all_reads")
                else:
                    cssObj = ConsensusSequence(cssName, "", "", len(alns), 0, 0, alns[0].barcodeName)
                if not cssObj.numPasses >= args.minCoverage:
                    cssObj.minNumPassesFail = True
                if not checkMapping(alns):
                    cssObj.mappingFail = True
                if not cssObj.minNumPassesFail and not cssObj.mappingFail:
                    refId = alns[0].referenceId
                    v = hullMany([(a.tStart, a.tEnd) for a in alns])
                    window = (refId, v[0], v[1])
                    windowLen = v[1] - v[0]
                    refSeqInWindow = referenceSeq[v[0]:v[1]]
                    css = consensusForAlignments(window, refSeqInWindow, alns, quiverConfig)
                    cssObj.seq = css.sequence
                    cssObj.qual = css.confidence
                    cssObj.coverage = sum((a.referenceSpan > 0.8 * windowLen) for a in alns)
                    if not cssObj.coverage >= args.minCoverage:
                        cssObj.minCoverageFail = True
                    cssObj.predictedAccuracy = estimateAccuracy(css.confidence)
                    if not cssObj.predictedAccuracy >= args.minAvgConfidence:
                        cssObj.minAvgConfidenceFail = True
                    if args.trim:
                        cssObj = trim(cssObj, lseq, rseq)
                    if args.clip:
                        cssObj = clip(cssObj, args.clip)
                    try:
                        cssObj.minConfidence = 1 - unphred(np.amin(np.array(cssObj.qual, dtype=float)))
                    except ValueError:  # when cssObj.qual is zero-length
                        cssObj.minConfidence = 0
                    if not cssObj.minConfidence >= args.minConfidence:
                        cssObj.minConfidenceFail = True
                resultQueue.put(cssObj)
                counter.increment()
Example #3
	def scan_WGA_h5( self ):
		"""
		Get some necessary information about the WGA cmp.h5 
		being used to generate the control IPD data.
		"""
		self.opts.h5_labels                          = {}
		self.opts.cmph5_contig_lens                  = {}
		self.opts.h5_labels[self.control_h5]         = "control"
		self.opts.cmph5_contig_lens[self.control_h5] = {}
		
		reader = CmpH5Reader(self.control_h5)
		for entry in reader.referenceInfoTable:
			name      = entry[3]
			length    = entry[4]
			slug_name = mbin.slugify(name)
			self.opts.cmph5_contig_lens[self.control_h5][slug_name] = length
		reader.close()

		return self.opts
Example #4
		def get_fps(align_fn):
			"""
			For *.cmp.h5 files, frame rate (fps) is include in each alignment.
			For *.bam files, the frame rate is encoded in the file header (FRAMERATEHZ)
			"""
			if self.opts.aln_ftype=="cmp":
				# Read frame rate directly from a cmp.h5 alignment
				reader    = CmpH5Reader(align_fn)
				alignment = reader[0]
				fps       = alignment.movieInfo[2]
			
			elif self.opts.aln_ftype=="bam":
				# Isolate description (DS) from read group (RG) in BAM header
				bam     = pysam.AlignmentFile(align_fn, "rb")
				h       = bam.header
				rg_ds_l = h.as_dict()["RG"][0]["DS"].split(";")
				rg_ds_d = dict([ (x.split("=")[0], x.split("=")[1]) for x in rg_ds_l])
				fps     = float(rg_ds_d["FRAMERATEHZ"])

			else:
				raise ValueError("Unrecognized aln_ftype: %s" % self.opts.aln_ftype)

			return fps
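
For illustration, a self-contained sketch of the RG/DS parsing used in the bam branch above; the DS string here is a made-up example of the key=value;key=value layout, not taken from a real BAM header:

# Hypothetical DS string illustrating the parsing above
rg_ds   = "READTYPE=SUBREAD;BINDINGKIT=100-619-300;FRAMERATEHZ=80.0"
rg_ds_l = rg_ds.split(";")
rg_ds_d = dict([(x.split("=")[0], x.split("=")[1]) for x in rg_ds_l])
fps     = float(rg_ds_d["FRAMERATEHZ"])
print "frame rate: %.1f fps" % fps  # frame rate: 80.0 fps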
Example #5
def __check_input(opts, args, parser):
    """
	Make sure the input is in the form of either a cmp.h5 file of aligned reads
	or a FOFN of unaligned bas.h5 files. Also make sure that a reference fasta 
	file is specified if 
	"""
    if len(args) != 2:
        parser.error("Expecting two arguments: "
                     "(1) an input HDF5 file (cmp.h5, bas.h5, or FOFN of "
                     "bas.h5 files) and (2) a file containing the motifs to "
                     "analyze, one per line, e.g. GATC-1, CATG-1, CAACGA-2")

    seq_input = args[0]
    motifs_fn = args[1]
    h5_files = []
    opts.h5_labels = {}

    if seq_input[-6:] == "cmp.h5":
        print "Found cmp.h5 of aligned reads:"

        h5 = os.path.abspath(seq_input)
        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {}
        opts.cmph5_contig_lens[h5] = {}

        h5_files.append(h5)
        print "  -- %s" % h5
        print "Getting contig information from %s..." % h5
        reader = CmpH5Reader(h5)
        for entry in reader.referenceInfoTable:
            name = entry[3]
            length = entry[4]
            slug_name = mbin.slugify(name)
            opts.cmph5_contig_lens[h5][slug_name] = length
        opts.h5_labels[h5] = "remove"
        reader.close()

    elif seq_input[-6:] == "bas.h5":
        print "Found bas.h5 of unaligned reads:"
        opts.h5_type = "bas"
        h5 = os.path.abspath(seq_input)
        h5_files.append(h5)
        opts.h5_labels[h5] = "remove"
        print "  -- %s" % h5

    elif seq_input[-5:] == ".fofn":
        print "Found FOFN of bas.h5 files of unaligned reads:"
        opts.h5_type = "bas"
        # Resolve each path before storing it, so h5_files and h5_labels use
        # the same absolute paths (the loop variable alone would not update h5_files)
        with open(seq_input, "r") as f:
            h5_files = [os.path.abspath(line.strip()) for line in f if line.strip()]
        for h5 in h5_files:
            print "  -- %s" % h5
            opts.h5_labels[h5] = "remove"

    if opts.h5_type == "bas" and opts.cross_cov_bins != None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!"
        )

    if opts.h5_type == "cmp":
        try:
            for entry in SeqIO.parse(opts.contigs, "fasta"):
                x = entry.seq
                y = entry.id
        except:
            parser.error(
                "Please make sure the --contigs input is a valid fasta file.")

    if not os.path.exists(motifs_fn):
        parser.error(
            "Can't find file of motifs to include in methylation profile: %s" %
            motifs_fn)

    return h5_files, motifs_fn
Example #6
File: pbssc.py  Project: verheytb/pbssc
taskQueue = multiprocessing.Queue()
resultQueue = multiprocessing.Queue()
counter = Counter()
startTime = time()
quiverErrorModel = loadQuiverConfig("P6-C4.AllQVsMergingByChannelModel")

if args.trim:
    dupleSeq = args.trim.upper()
    if not all([x in ("A", "C", "G", "T", ",") for x in dupleSeq]) or dupleSeq.count(",") != 1:
        sys.exit("-t requires two sequences separated by a comma (e.g. ACTAGGA,CTACGAG)")
    # Split the validated, uppercased string (mixing dupleSeq and args.trim
    # would leave rseq in its original case)
    lseq, rseq = dupleSeq.split(",")

# get the list of reads to extract, and populate the task queue
printmessage("Importing data files", ontop=False)
with CmpH5Reader(os.path.join(args.jobDir, "data/aligned_reads.cmp.h5")) as c:
    c.attach(os.path.join(args.jobDir, "input.fofn"))
    readSet = set(zip(c.MovieID, c.HoleNumber, c.RCRefStrand))
totalNumber = len(readSet)
for i in readSet:
    taskQueue.put(i)
for i in range(args.cpus):  # poison pill at end of queue
    taskQueue.put("die")

# starts the processes
processList = [multiprocessing.Process(target=workerProcess, args=(taskQueue, args.reference, quiverErrorModel)) for i
               in range(args.cpus)]
for i in processList:
    i.start()
printmessage("%d processes started" % args.cpus, ontop=False)
Example #7
File: mbin.py  Project: TankMermaid/mbin
    def launch_subprocs(self, h5_file, N_reads, opts):
        """
        Split the reads in h5_file into chunks and queue one
        subread_motif_processor task per chunk for the worker pool.
        """
        logging.debug("Creating tasks...")
        tasks = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()
        logging.debug("Done.")

        if opts.h5_type == "cmp":
            reader = CmpH5Reader(h5_file)
            to_check = reader
            entries = range(len(to_check))
        elif opts.h5_type == "bas":
            reader = BasH5Reader(h5_file)
            if opts.bas_whitelist is not None and not opts.control_run:
                logging.info("Intersecting with whitelist...")
                bas_whitelist = set(np.loadtxt(opts.bas_whitelist,
                                               dtype="str"))
                to_check = [z for z in reader if z.zmwName in bas_whitelist]
            else:
                to_check = reader
            # Filter on zmw metrics
            pre = len(to_check)
            logging.info("Starting with %s reads..." % pre)
            to_check = [z for z in to_check if z.zmwMetric("Productivity")==1 and \
                       z.zmwMetric("ReadScore")>opts.minReadScore and \
                       z.zmwMetric("Pausiness")<opts.maxPausiness]
            post = len(to_check)
            logging.info(
                "Dropped %s reads due to poor zmw metrics (%s remain)" %
                ((pre - post), post))
            # Filter on read length
            pre = post
            to_check = [
                z for z in to_check
                if np.sum([len(sub)
                           for sub in z.subreads]) >= opts.readlength_min
            ]
            post = len(to_check)
            logging.info("Dropped %s reads < %s (%s remain)" %
                         ((pre - post), opts.readlength_min, post))
            entries = np.array([z.holeNumber for z in to_check])

        reader.close()
        if len(entries) <= opts.procs * 5:
            procs = 1
        else:
            procs = opts.procs

        logging.debug("Starting consumers...")
        consumers = [multiproc.Consumer(tasks, results) for i in xrange(procs)]
        for w in consumers:
            w.start()
        logging.debug("Done.")

        num_jobs = procs
        N_target_reads = {}
        reads_left = N_reads
        procs_left = procs

        for job in range(num_jobs):
            N_target_reads[job] = int(math.ceil(
                float(reads_left) / procs_left))
            reads_left -= N_target_reads[job]
            procs_left -= 1

        logging.debug("Partitioning %s into %s chunks for analysis..." %
                      (h5_file, num_jobs))
        # float() must wrap the numerator; otherwise integer division happens
        # first and the ceil is a no-op
        chunksize = int(math.ceil(float(len(entries)) / procs))
        logging.info("Querying %s reads using %s chunks of size %s..." %
                     (len(to_check), procs, chunksize))
        entries_chunks = list(self.chunks(entries, chunksize))
        for chunk_id in range(procs):
            n = N_target_reads[chunk_id]
            # If --N_reads used, this ensures we touch as many contigs as possible
            idx = entries_chunks[chunk_id]
            np.random.shuffle(idx)
            if opts.h5_type == "cmp":
                tasks.put(
                    cmph5_read.subread_motif_processor(h5_file, chunk_id, idx,
                                                       n, opts.motifs,
                                                       opts.bi_motifs, opts))
                logging.debug("...%s (%s alignments)" %
                              (chunk_id, len(entries)))
            elif opts.h5_type == "bas":
                tasks.put(
                    baxh5_read.subread_motif_processor(h5_file, chunk_id, idx,
                                                       n, opts.motifs,
                                                       opts.bi_motifs, opts))
                logging.debug("...%s (%s reads)" % (chunk_id, len(entries)))
        logging.debug("Done")

        # Add a poison pill for each consumer
        for i in xrange(num_jobs):
            tasks.put(None)

        # Wait for all of the tasks to finish
        tasks.join()

        # Start printing results
        logging.info("Combining results data from all chunks...")
        parallel_results = []
        while num_jobs:
            result = results.get()
            parallel_results.append(result)
            num_jobs -= 1
            logging.info("...%s/%s" % ((procs - num_jobs), procs))
        logging.info("Done.")
        return parallel_results
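
self.chunks() is referenced above but not shown in this example; a typical slicing generator, assuming it simply yields successive chunksize-sized pieces of the entry list, would be:

    def chunks(self, entries, chunksize):
        """
        Yield successive chunksize-sized slices of entries
        (assumed implementation; not shown in the original example).
        """
        for i in xrange(0, len(entries), chunksize):
            yield entries[i:i + chunksize]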
Example #8
    def __call__(self):
        class ipd_entry:
            def __init__(self, tup):
                """
				"""
                self.ref_base = tup[0]
                self.ipd = tup[1]
                # self.call      = tup[2]
                # self.read_base = tup[3]
                self.ref_pos = tup[2]

        class subread:
            def __init__(self, cmph5, alignment, label, opts):
                leftAnchor = 1
                rightAnchor = 1
                self.entries = {}
                self.opts = opts

                self.subname = alignment.readName
                movieID = alignment.movieInfo[0]
                alignedLength = alignment.referenceSpan
                fps = alignment.movieInfo[2]
                self.refName = alignment.referenceInfo[3]
                zmw = alignment.HoleNumber
                self.mol = alignment.MoleculeID
                if alignment.isForwardStrand:
                    self.strand = 0
                else:
                    self.strand = 1
                self.ref_bases = alignment.reference()
                # self.read_bases = alignment.read()

                read_calls = alignment.transcript()
                ref_pos = list(alignment.referencePositions())
                IPD = list(alignment.IPD())
                self.label = self.opts.h5_labels[cmph5]

                error_mk = []
                for read_call in read_calls:
                    # Go through all entries and flag which positions are MM/indels
                    if read_call != "M":
                        # Mismatch or indel at this position!
                        error_mk.append(1)
                    else:
                        error_mk.append(0)

                # Get the indices of all the non-matches
                error_idx = [i for (i, val) in enumerate(error_mk) if val == 1]
                for error_id in error_idx:
                    # Mask leftAnchor/rightAnchor positions flanking each error.
                    # Explicit bounds checks are needed: a negative index would
                    # silently wrap to the other end rather than raise IndexError
                    for j in range(leftAnchor):
                        if error_id - (j + 1) >= 0:
                            error_mk[error_id - (j + 1)] = 1
                    for j in range(rightAnchor):
                        if error_id + (j + 1) < len(error_mk):
                            error_mk[error_id + (j + 1)] = 1
                error_mk = np.array(error_mk)

                ipds = np.array(IPD) / fps

                strands = np.array([self.strand] * len(read_calls))

                self.ref_bases = np.array(list(self.ref_bases))
                # self.read_bases = np.array(list(self.read_bases))
                self.ref_pos = np.array(ref_pos)
                read_calls = np.array(list(read_calls))

                # Mark the error positions, but leave them in the sequence so
                # we can pull out intact motifs from contiguous correct bases
                self.ref_bases[error_mk == 1] = "*"
                # self.read_bases[error_mk==1] = "*"
                read_calls[error_mk == 1] = "*"
                ipds[error_mk == 1] = -9
                strands[error_mk == 1] = -9

                # Attach these IPD entries to the subread object
                # for i,tup in enumerate(zip(self.ref_bases, ipds, read_calls, self.read_bases, self.ref_pos)):
                for i, tup in enumerate(zip(self.ref_bases, ipds,
                                            self.ref_pos)):
                    self.entries[self.ref_pos[i]] = ipd_entry(tup)

                # self.cap_outliers()

                self.subread_normalize()

            def cap_outliers(self, max_ipd=10):
                """
				Cap the outlier IPDs at max_ipd seconds.
				"""
                for read_pos, entry in self.entries.iteritems():
                    entry.ipd = min(entry.ipd, max_ipd)

            def subread_normalize(self):
                """
				Every IPD entry needs to be normalized by the mean IPD of its subread.
				"""
                if len(self.entries) == 0:
                    # Nothing to do here.
                    return self.entries

                # First populate list of all IPDs per subread. Will use to get normalization factor.
                subread_vals = []
                for entry in self.entries.values():
                    # Only do if this IPD is NOT from an error position
                    if entry.ipd != -9:
                        subread_vals.append(entry.ipd)

                rawIPDs = np.array(
                    map(lambda x: math.log(x + 0.001), subread_vals))
                nfs = rawIPDs.mean()

                for pos, entry in self.entries.iteritems():
                    if entry.ipd == -9:
                        newIPD = -9
                    else:
                        newIPD = math.log(entry.ipd + 0.001) - nfs

                    entry.ipd = newIPD

            def zip_bases_and_IPDs(self):
                """
				Reassemble the read and IPD values using the subread normalized IPDs
				"""
                od = OrderedDict(sorted(self.entries.items()))
                ref = []
                ref_pos = []
                self.ipds = []
                for read_pos, entry in od.items():
                    ref.append(entry.ref_base)
                    ref_pos.append(entry.ref_pos)
                    self.ipds.append(entry.ipd)
                self.ref_str = "".join(ref)
                self.ref_pos = ref_pos

        reader = CmpH5Reader(self.cmph5)

        read_refs = {}
        read_SMp = {}
        read_SMp_N = {}
        read_comps = {}
        read_labs = {}
        contig_SCp = {}
        i = 0
        n_mols = 0

        cwd = os.getcwd()

        # Periodically (after <chunksize> alignments) write out data to a contig-specific tmp file
        chunksize = 10
        self.chunkdir = "chunk_%s" % self.chunk_id
        if os.path.exists(os.path.join(self.opts.tmp, self.chunkdir)):
            shutil.rmtree(os.path.join(self.opts.tmp, self.chunkdir))
        os.mkdir(os.path.join(self.opts.tmp, self.chunkdir))
        to_dump = defaultdict(list)

        def dump_data_to_contig_files(refName, to_dump, read_labs):
            refName = mbin.slugify(refName)
            ref_subname_fn = "%s_readnames.tmp" % refName
            ref_label_fn = "%s_labels.tmp" % refName
            ref_length_fn = "%s_lengths.tmp" % refName
            ref_ipds_fn = "%s_ipds.tmp" % refName
            ref_ipds_N_fn = "%s_ipdsN.tmp" % refName
            ref_comp_N_fn = "%s_compN.tmp" % refName
            ref_strand_fn = "%s_strand.tmp" % refName

            self.tmp_fns.add(os.path.join(self.chunkdir, ref_subname_fn))
            self.tmp_fns.add(os.path.join(self.chunkdir, ref_label_fn))
            self.tmp_fns.add(os.path.join(self.chunkdir, ref_length_fn))
            self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_fn))
            self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_N_fn))
            self.tmp_fns.add(os.path.join(self.chunkdir, ref_comp_N_fn))
            self.tmp_fns.add(os.path.join(self.chunkdir, ref_strand_fn))
            f_subnames = open(
                os.path.join(self.opts.tmp, self.chunkdir, ref_subname_fn),
                "a")
            f_labels = open(
                os.path.join(self.opts.tmp, self.chunkdir, ref_label_fn), "a")
            f_lengths = open(
                os.path.join(self.opts.tmp, self.chunkdir, ref_length_fn), "a")
            f_ipds = open(
                os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_fn), "a")
            f_ipds_N = open(
                os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_N_fn), "a")
            f_comp_N = open(
                os.path.join(self.opts.tmp, self.chunkdir, ref_comp_N_fn), "a")
            f_strand = open(
                os.path.join(self.opts.tmp, self.chunkdir, ref_strand_fn), "a")
            self.tmp_fs.add(f_subnames)
            self.tmp_fs.add(f_labels)
            self.tmp_fs.add(f_lengths)
            self.tmp_fs.add(f_ipds)
            self.tmp_fs.add(f_ipds_N)
            self.tmp_fs.add(f_comp_N)
            self.tmp_fs.add(f_strand)

            if self.opts.motifs_file is not None and self.opts.subtract_control:
                control_ipds_d = pickle.load(
                    open(self.opts.control_pkl_name, "rb"))

            for i, (subread_ipds, subread_comps, readname, subread_length,
                    strand) in enumerate(to_dump[refName]):
                ipd_kmers = [motif for motif in subread_ipds.iterkeys()]
                ipd_counts = [
                    subread_ipds[motif][0]
                    for motif in subread_ipds.iterkeys()
                ]

                ipd_means = []
                if self.opts.motifs_file is not None and self.opts.subtract_control:
                    for motif in subread_ipds.iterkeys():
                        if subread_ipds[motif][1] != 0.0:
                            w_control_sub = subread_ipds[motif][
                                1] - control_ipds_d[motif]
                            ipd_means.append(w_control_sub)
                        else:  # Don't subtract control if no ipd values are available (i.e. IPD score == 0.0)
                            ipd_means.append(subread_ipds[motif][1])
                else:
                    for motif in subread_ipds.iterkeys():
                        ipd_means.append(subread_ipds[motif][1])

                comp_kmers = np.array(
                    [motif for motif, ipds in subread_comps.items()])
                comp_counts = np.array(
                    [ipds for motif, ipds in subread_comps.items()])
                if i == 0 and refName not in self.refName_has_header:
                    ref_ipds_kmers_fn = "%s_ipdskmers.tmp" % refName
                    ref_comp_kmers_fn = "%s_compkmers.tmp" % refName
                    f_ipds_kmers = open(
                        os.path.join(self.opts.tmp, self.chunkdir,
                                     ref_ipds_kmers_fn), "a")
                    f_comp_kmers = open(
                        os.path.join(self.opts.tmp, self.chunkdir,
                                     ref_comp_kmers_fn), "a")
                    ipds_kmers_str = "\t".join(ipd_kmers)
                    comp_kmers_str = "\t".join(comp_kmers)
                    f_ipds_kmers.write("%s\n" % ipds_kmers_str)
                    f_comp_kmers.write("%s\n" % comp_kmers_str)
                    f_ipds_kmers.close()
                    f_comp_kmers.close()
                    self.refName_has_header.add(refName)
                ipds_str = "\t".join(map(lambda x: str(round(x, 3)),
                                         ipd_means))
                ipds_N_str = "\t".join(map(lambda x: str(x), ipd_counts))
                comp_counts_str = "\t".join(map(lambda x: str(x), comp_counts))
                f_subnames.write("%s\n" % readname)
                f_labels.write("%s\n" % read_labs[readname])
                f_lengths.write("%s\n" % subread_length)
                f_ipds.write("%s\n" % ipds_str)
                f_ipds_N.write("%s\n" % ipds_N_str)
                f_comp_N.write("%s\n" % comp_counts_str)
                f_strand.write("%s\n" % strand)

            for f in self.tmp_fs:
                f.close()

        self.tmp_fs = set()
        self.tmp_fns = set()
        self.refName_has_header = set()
        to_check = reader[self.idx]
        for alignment in to_check:
            ref_contig = mbin.slugify(alignment.referenceInfo[3])
            label = self.opts.h5_labels[self.cmph5]
            ref_len = self.opts.cmph5_contig_lens[self.cmph5][ref_contig]
            if ref_len >= self.opts.minContigLength and alignment.referenceSpan >= self.opts.readlength_min and alignment.MapQV >= self.opts.minMapQV:
                to_get = min(self.N_target_reads, len(self.idx))
                incr = max(1, to_get / 10)  # log progress roughly every 10%
                readname = "/".join(alignment.readName.split("/")[:-1])
                if len(read_labs.keys()) % incr == 0 and not read_labs.get(
                        readname):
                    logging.info(
                        "...chunk %s\t- mol %s/%s (%.1f%%)" %
                        (self.chunk_id, n_mols, to_get,
                         100.0 * n_mols / to_get))

                read_labs[readname] = label
                read_refs[readname] = ref_contig

                sub = subread(self.cmph5, alignment, label, self.opts)
                sub.zip_bases_and_IPDs()
                subread_ipds, subread_comps = read_scanner.scan_motifs(
                    "cmp",
                    # sub.read_str,
                    sub.ipds,
                    sub.ref_str,
                    sub.strand,
                    self.motifs,
                    self.bi_motifs,
                    self.opts)

                to_dump[ref_contig].append(
                    (subread_ipds, subread_comps, readname, len(sub.ref_str),
                     sub.strand))
                # Dump subread IPD and comp data to contig-specific file
                if len(to_dump[ref_contig]) % chunksize == 0 and len(
                        to_dump[ref_contig]) != 0:
                    dump_data_to_contig_files(ref_contig, to_dump, read_labs)
                    to_dump[ref_contig] = []

                n_mols = len(read_labs.keys())
                i += 1

                if n_mols == self.N_target_reads:
                    break

        for ref_contig in to_dump.keys():
            dump_data_to_contig_files(ref_contig, to_dump, read_labs)
        for f in self.tmp_fs:
            f.close()
        to_dump = defaultdict(list)

        if i == 0:
            logging.info("Chunk %s: no qualifying reads found!" %
                         self.chunk_id)

        logging.info(
            "Chunk %s: found %s alignments (%s molecules) > %sbp in %s" %
            (self.chunk_id, i, len(read_labs.keys()), self.opts.readlength_min,
             os.path.basename(self.cmph5)))
        reader.close()

        return self.tmp_fns
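
The subread_normalize() step above amounts to centering log-transformed IPDs within each subread while skipping masked error positions; a self-contained numpy sketch of that transform on made-up IPD values:

# Standalone sketch of the subread_normalize() math; the IPD values are
# made up, and -9 marks masked error positions exactly as in the class above.
import numpy as np

ipds  = np.array([0.12, 0.34, -9.0, 0.08, 1.50])
valid = ipds != -9
logs  = np.log(ipds[valid] + 0.001)
norm  = np.copy(ipds)
norm[valid] = logs - logs.mean()  # subtract the subread's mean log-IPD
print norm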