Example No. 1
    def test_split_by_contigs_presplit(self):
        # Consumes too much memory for Jenkins

        # Test to make sure the result of a split by contigs has an appropriate
        # number of records (make sure filters are appropriately aggressive)
        ds2 = DataSet(data.getXml(14))
        bams = ds2.externalResources.resourceIds
        assert len(bams) == 2
        refwindows = ds2.refWindows
        assert refwindows == [(0, 0, 224992)]
        # resourceIds are "file://..." URIs; [7:] strips the scheme prefix
        res1 = openIndexedAlignmentFile(bams[0][7:])
        res2 = openIndexedAlignmentFile(bams[1][7:])

        def count(iterable):
            count = 0
            for _ in iterable:
                count += 1
            return count

        assert count(res1.readsInRange(*refwindows[0])) == 1409
        assert count(res2.readsInRange(*refwindows[0])) == 1375
        assert count(ds2.readsInRange(*refwindows[0])) == 2784
        assert count(ds2.records) == 2784
        ds2.disableFilters()
        assert count(ds2.records) == 53552
        assert ds2.countRecords() == 53552
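
A minimal, standalone restatement of the pattern this test exercises: open an indexed BAM with openIndexedAlignmentFile and count the alignments returned by an index-backed readsInRange query. The "aligned.bam" path and the window tuple below are placeholders, not part of the test data.

from pbcore.io import openIndexedAlignmentFile

def count_reads_in_window(bam_path, ref_window):
    """Count alignments overlapping a (refId, start, end) window."""
    reader = openIndexedAlignmentFile(bam_path)
    try:
        return sum(1 for _ in reader.readsInRange(*ref_window))
    finally:
        reader.close()

# Hypothetical usage mirroring the assertions above:
# n = count_reads_in_window("aligned.bam", (0, 0, 224992))
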
Example No. 2
    def test_split_by_contigs_presplit(self):
        # Consumes too much memory for Jenkins

        # Test to make sure the result of a split by contigs has an appropriate
        # number of records (make sure filters are appropriately aggressive)
        ds2 = DataSet(data.getXml(15))
        bams = ds2.externalResources.resourceIds
        self.assertEqual(len(bams), 2)
        refwindows = ds2.refWindows
        self.assertEqual(refwindows, [(0, 0, 224992)])
        res1 = openIndexedAlignmentFile(bams[0][7:])
        res2 = openIndexedAlignmentFile(bams[1][7:])

        def count(iterable):
            count = 0
            for _ in iterable:
                count += 1
            return count

        self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
        self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
        self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
        self.assertEqual(count(ds2.records), 2784)
        ds2.disableFilters()
        self.assertEqual(count(ds2.records), 53552)
        self.assertEqual(ds2.countRecords(), 53552)
Example No. 3
    def track_split_molecule_alignments(self, idx_chunks, cmph5_file):
        # reader     = CmpH5Reader(cmph5_file)
        reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)
        chunk_mols = {}
        for i, idx_chunk in enumerate(idx_chunks):
            chunk_mols[i] = set()
            for alignment in reader[idx_chunk]:

                if self.Config.opts.useZMW:
                    mol_id = "%s_%s" % (alignment.HoleNumber,
                                        alignment.movieName)
                else:
                    mol_id = alignment.MoleculeID

                chunk_mols[i].add(mol_id)
        reader.close()

        split_mols = set()
        i = 1
        for idx_chunk in idx_chunks[1:]:
            j = i - 1
            split = chunk_mols[i] & chunk_mols[j]
            split_mols = split_mols | split
            i += 1
        return split_mols
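
Stripped of the reader bookkeeping, the split-molecule detection is just an intersection of the molecule-ID sets of adjacent chunks. A condensed sketch with plain sets:

def molecules_split_across_chunks(chunk_mols):
    """Return IDs appearing in two adjacent chunks, i.e. split molecules."""
    split = set()
    for prev, cur in zip(chunk_mols, chunk_mols[1:]):
        split |= prev & cur
    return split

# molecules_split_across_chunks([{1, 2}, {2, 3}, {4}]) -> {2}
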
Example No. 4
    def split_up_control_IPDs(self, control_ipds, cmph5_file, idx_chunks):
        """
		Separate out relevant portions of the control_ipds dictionary. We are taking
		advantage of the fact that the alignment flat files are sorted by aligned
		reference position.
		"""

        # reader                  = CmpH5Reader(cmph5_file)
        reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)

        local_control_ipds = {}
        for chunk_id, idx_chunk in enumerate(idx_chunks):
            idx_mins = [
                min(reader[idx].tStart, reader[idx].tEnd) for idx in idx_chunk
            ]
            idx_maxs = [
                max(reader[idx].tStart, reader[idx].tEnd) for idx in idx_chunk
            ]
            first_ref_pos = min(idx_mins)
            last_ref_pos = max(idx_maxs)
            # first_ref_pos  = pull_last_ref_pos_from_alignments_file( alignments_flat_fn, "head" )
            # last_ref_pos   = pull_last_ref_pos_from_alignments_file( alignments_flat_fn, "tail" )
            region_control = {0: {}, 1: {}}
            logging.debug(
                "Split control IPD dicts  --  chunk %s: %sbp - %sbp" %
                (chunk_id, first_ref_pos, last_ref_pos + 1))
            for strand in region_control.keys():
                for pos in range(first_ref_pos, last_ref_pos + 1):
                    try:
                        region_control[strand][pos] = control_ipds[strand][pos]
                    except KeyError:
                        # In case we don't have WGA coverage at this position
                        pass
            local_control_ipds[chunk_id] = region_control
        return local_control_ipds
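
The core of this method, minus the reader and logging, is copying one positional window of a {strand: {position: value}} dictionary per chunk. A standalone sketch; the names are illustrative rather than taken from the source:

def window_control_ipds(control_ipds, first_pos, last_pos):
    """Copy positions in [first_pos, last_pos] for both strands, skipping
    positions without WGA coverage."""
    region = {0: {}, 1: {}}
    for strand in region:
        strand_ipds = control_ipds.get(strand, {})
        for pos in range(first_pos, last_pos + 1):
            if pos in strand_ipds:
                region[strand][pos] = strand_ipds[pos]
    return region
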
Example No. 5
    def get_reference_contigs(self, cmph5):
        """
		Pull out the list of contigs in the h5 file.
		"""
        # reader  = CmpH5Reader(cmph5)
        reader = openIndexedAlignmentFile(cmph5, self.Config.ref)
        contigs = set(map(lambda x: (x[3], x[2]), reader.referenceInfoTable))
        reader.close()
        return contigs
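
A variant of the same lookup that avoids bare positional indexes by using the record-array field names (FullName, Name) that pbcore's referenceInfoTable exposes; the field layout can differ between reader types and pbcore versions, so treat this as a sketch to verify locally.

from pbcore.io import openIndexedAlignmentFile

def reference_contigs(aln_path, ref_fasta=None):
    """Return the set of (full name, name) pairs for all reference contigs."""
    reader = openIndexedAlignmentFile(aln_path, ref_fasta)
    try:
        return {(row.FullName, row.Name) for row in reader.referenceInfoTable}
    finally:
        reader.close()
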
Example No. 6
    def test_updateCounts(self):
        log.info("Testing updateCounts without filters")
        aln = AlignmentSet(data.getBam(0))
        readers = aln.resourceReaders()

        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength
                self.assertEqual(
                    record.aStart, record.bam.pbi[record.rowNumber]['aStart'])
                self.assertEqual(
                    record.aEnd, record.bam.pbi[record.rowNumber]['aEnd'])

        expNum = 0
        for reader in readers:
            expNum += len(reader)

        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)

        log.info("Testing whether filters are respected")
        aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
        aln.updateCounts()
        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        def count(gen):
            count = 0
            for _ in gen:
                count += 1
            return count

        bfile = openIndexedAlignmentFile(data.getBam(0))
        rWin = (bfile.referenceInfo('E.faecalis.1').ID,
                0,
                bfile.referenceInfo('E.faecalis.1').Length)
        reads = bfile.readsInRange(*rWin)
        expNum = count(reads)
        expLen = 0
        reads = bfile.readsInRange(*rWin)
        for read in reads:
            expLen += read.readLength

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)
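
The cross-check in this test condenses to: query the index for one full reference window, then compare the record count and summed read length against the dataset metadata. A minimal sketch; the BAM path and reference name are placeholders:

from pbcore.io import openIndexedAlignmentFile

def window_stats(bam_path, ref_name):
    """Return (record count, total read length) across one full reference."""
    bfile = openIndexedAlignmentFile(bam_path)
    try:
        info = bfile.referenceInfo(ref_name)
        reads = list(bfile.readsInRange(info.ID, 0, info.Length))
        return len(reads), sum(read.readLength for read in reads)
    finally:
        bfile.close()

# Hypothetical check against AlignmentSet metadata:
# num, length = window_stats("aligned.bam", "E.faecalis.1")
# assert (num, length) == (aln.metadata.numRecords, aln.metadata.totalLength)
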
Example No. 7
    def test_loading_reference(self):
        log.info('Opening Reference')
        r = ReferenceSet(data.getRef()).toExternalFiles()[0]
        log.info('Done Opening Reference')
        log.info('Opening AlignmentSet')
        d = AlignmentSet(data.getBam(), referenceFastaFname=r)
        log.info('Done Opening AlignmentSet')
        bfile = openIndexedAlignmentFile(data.getBam(), referenceFastaFname=r)
        self.assertTrue(bfile.isReferenceLoaded)
        for res in d.resourceReaders():
            self.assertTrue(res.isReferenceLoaded)

        aln = AlignmentSet(data.getBam())
        aln.addReference(r)
        for res in aln.resourceReaders():
            self.assertTrue(res.isReferenceLoaded)
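
The reference-loading handshake in isolation; both file names here are placeholders:

from pbcore.io import openIndexedAlignmentFile

# Attaching a reference FASTA at open time lets alignment records expose
# reference sequence (e.g. alignment.reference() in the examples below).
bfile = openIndexedAlignmentFile("aligned.bam", referenceFastaFname="ref.fasta")
assert bfile.isReferenceLoaded
bfile.close()
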
Example No. 8
    def scan_WGA_aligns(self):
        """
		Get some necessary information about the WGA cmp.h5 
		being used to generate the control IPD data.
		"""
        self.opts.aln_fn_labels = {}
        self.opts.aln_fn_contig_lens = {}
        self.opts.aln_fn_labels[self.control_aln_fn] = "control"
        self.opts.aln_fn_contig_lens[self.control_aln_fn] = {}

        # reader = CmpH5Reader(self.control_aln_fn)
        reader = openIndexedAlignmentFile(self.control_aln_fn)
        for entry in reader.referenceInfoTable:
            name = entry[3]
            length = entry[4]
            slug_name = mbin.slugify(name)
            self.opts.aln_fn_contig_lens[
                self.control_aln_fn][slug_name] = length
        reader.close()

        return self.opts
Example No. 9
    def launch_parallel_molecule_loading(self, cmph5_file, prefix,
                                         movie_name_ID_map, control_ipds):
        logging.debug("Creating tasks...")
        tasks = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()
        logging.debug("Done.")

        logging.debug("Starting consumers...")
        num_jobs = self.Config.opts.procs
        consumers = [
            Consumer(tasks, results, self.Config.opts.contig_id)
            for i in xrange(num_jobs)
        ]
        for w in consumers:
            w.start()
        logging.debug("Done.")

        def chunks(l, n):
            """
			Yield successive n-sized chunks from l.
			"""
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # Enqueue jobs
        logging.info("Partitioning %s into %s chunks for analysis..." %
                     (cmph5_file, num_jobs))
        # reader         = CmpH5Reader(cmph5_file)
        reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)
        if self.Config.opts.align_ftype == "cmp":
            alnIDs = [
                r.AlnID for r in reader
                if r.referenceInfo[2] == self.Config.opts.contig_id
            ]
        elif self.Config.opts.align_ftype == "bam":
            alnIDs = [
                r.rowNumber for r in reader
                if r.referenceInfo[2] == self.Config.opts.contig_id
            ]

        if len(alnIDs) <= num_jobs:
            num_jobs = 1
        reader.close()

        # for chunk_id,alignments_flat_fn in enumerate(tmp_flat_files):
        # True division before ceil(); under Python 2, float(len(alnIDs) / num_jobs)
        # would truncate in integer division first.
        chunksize = int(math.ceil(len(alnIDs) / float(num_jobs)))
        idx_chunks = list(chunks((np.array(alnIDs) - 1), chunksize))

        if len(idx_chunks[-1]) == 1 and len(idx_chunks) > 1:
            idx_chunks = idx_chunks[:-1]

        if prefix == "nat_":
            logging.info(
                "%s - Separating out file-matched regions of the control IPD values dict..."
                % self.Config.opts.contig_id)
            local_control_ipds = self.split_up_control_IPDs(
                control_ipds, cmph5_file, idx_chunks)
            logging.debug("%s - Done." % self.Config.opts.contig_id)

        # In splitting alignment indexes among processes, some molecules will
        # have alignments going to different processes. Track these.
        split_mols = self.track_split_molecule_alignments(
            idx_chunks, cmph5_file)

        for chunk_id in range(num_jobs):
            idx = idx_chunks[chunk_id]
            if prefix == "wga_":
                tasks.put(parse_mol_aligns.wga_molecules_processor( cmph5_file,                   \
                             chunk_id,                     \
                             prefix,                       \
                             self.Config.opts.contig_id,   \
                             self.ref_size,                \
                             self.sites_pos,               \
                             self.sites_neg,               \
                             self.Config.opts,             \
                             idx,                          \
                             split_mols ))
            else:
                logging.debug("Launching subprocess %s..." % chunk_id)
                tasks.put(parse_mol_aligns.native_molecules_processor( cmph5_file,                          \
                                chunk_id,                            \
                                prefix,                              \
                                self.Config.opts,                    \
                                self.Config.fastq,                   \
                                self.Config.ref,                     \
                                copy.copy(movie_name_ID_map),        \
                                local_control_ipds[chunk_id],        \
                                self.ref_size,                       \
                                self.sites_pos,                      \
                                self.sites_neg,                      \
                                idx,                                 \
                                split_mols))
                logging.debug("Done (%s)." % chunk_id)

        # Add a 'poison pill' for each consumer
        for i in xrange(self.Config.opts.procs):
            tasks.put(None)
        tasks.join()

        # Start printing results
        parallel_results = []
        while num_jobs:
            result = results.get()
            parallel_results.append(result)
            num_jobs -= 1

        return parallel_results
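
The chunk-size arithmetic is the fragile part of this method under Python 2, where / on two ints truncates before float() is applied. A sketch of the intended ceiling-division chunking:

import math

def chunk_evenly(ids, num_jobs):
    """Split ids into chunks of ceil(len(ids) / num_jobs) items each."""
    size = int(math.ceil(len(ids) / float(num_jobs)))
    return [ids[i:i + size] for i in range(0, len(ids), size)]

# chunk_evenly(list(range(10)), 3) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
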
Example No. 10
    def __call__(self):
        # reader = CmpH5Reader(self.cmph5)
        reader = openIndexedAlignmentFile(self.cmph5, self.opts.ref)

        self.chunk_output_fn = "%s_%s.tmp" % (self.out, self.chunk_id)
        self.var_chunk_fn = "vars_%s.tmp" % self.chunk_id
        if self.align:
            self.var_f = open(self.var_chunk_fn, "w")

        mol_alignments = defaultdict(list)
        for i, alignment in enumerate(reader[self.idx]):

            if self.opts.align_ftype == "cmp":
                aln_acc = alignment.accuracy
                aln_len = len(alignment.alignmentArray())
            elif self.opts.align_ftype == "bam":
                aln_features = dict(alignment.peer.tags)
                aln_acc = aln_features["rq"]
                aln_len = len(aln_features["ip"])

            if aln_acc >= self.opts.minAcc and aln_len >= self.opts.minSubreadLen:
                if self.opts.useZMW:
                    mol_id = "%s_%s" % (alignment.HoleNumber,
                                        alignment.movieName)
                else:
                    mol_id = alignment.MoleculeID
                mol_alignments[mol_id].append(alignment)

        self.mols = {}
        incr = int(max(4, math.floor(float(len(mol_alignments.keys()))) / 4))
        for i, (mol_id, alignments) in enumerate(mol_alignments.iteritems()):
            if i % incr == 0:
                logging.info(
                    "...chunk %s - processing molecules: %s/%s (%.1f%%)" %
                    (self.chunk_id, i, len(mol_alignments.keys()),
                     100.0 * i / len(mol_alignments.keys())))

            mol = molecule(alignments, self.prefix, self.leftAnchor,
                           self.rightAnchor, self.contig_id, self.sites_pos,
                           self.sites_neg, self.cmph5, self.opts)
            if self.opts.useZMW:
                # Replace the bad moleculeID with the good ZMW ID, formatted: <zmwID>_<movieID>
                mol.mol_id = mol_id

            if len(mol.entries) > 0:
                self.mols[mol_id] = mol

        # Exclude any molecules that are divided between split-up alignment files
        self.mols = remove_split_up_molecules(self.mols, self.split_mols)

        # [Optional]: align CCS reads to reference to find SNPs/errors
        if not self.align:
            # Need to empirically try to determine subread start/end positions in order to designate off-limits entries.
            for mol in self.mols.values():
                mol.var_pos = []
                # self.empirical_get_start_end_pos( mol )
        elif len(self.mols.values()) > 0:
            CCS = CCS_aligner.mols_aligner( self.mols,                \
                    self.fastq,               \
                    self.ref,                 \
                    self.movie_name_ID_map , \
                    self.align,               \
                    self.chunk_id)
            # Output the called CCS read-level variants/errors to a chunk file
            for mol in self.mols.values():
                vars_str = ",".join(map(lambda x: str(x), mol.var_no_sc))
                self.var_f.write("%s %s\n" % (mol.mol_id, vars_str))

        # if self.SMsn:
        # 	# If the empirical start/end discovery showed a lack of positions with sufficient coverage, remove molecule
        # 	del_me = [mol.mol_id for mol in self.mols.values() if mol.to_del]
        # 	logging.debug("Process %s (chunk %s): deleting %s molecules due to too many positions with low coverage." % (self.chunk_id, \
        # i, \
        # len(del_me)))
        # for mol_id in del_me:
        # 	del self.mols[mol_id]

        if len(self.mols.values()) > 0:
            # Identify and remove positions to be excluded from further analysis
            tot_entries = 0
            tot_entries_deleted = 0
            for mol in self.mols.values():
                entries_deleted, entries = self.remove_off_limits_positions(
                    mol)
                tot_entries += entries
                tot_entries_deleted += entries_deleted
            pct_deleted = float(tot_entries_deleted) / tot_entries * 100
            logging.debug("Process %s (chunk %s): deleted %s (%.1f%%) off-limits positions." % (self.chunk_id, \
                                  i, \
                                  tot_entries_deleted, \
                                  pct_deleted))

        # Generate the IPD arrays per genomic position/strand by aggregating all IPD entries across molecules (self.ipdArrays)
        logging.debug("Process %s: generating IPD arrays..." % self.chunk_id)
        for mol in self.mols.values():
            self.create_arrays(mol)

        if self.SMp:
            for mol in self.mols.values():
                mol.ipdArrays = self.condense_native_mol_motifs_into_one_pos(
                    mol)

        # Now run the comparison test
        logging.debug("Process %s: running comparisons..." % self.chunk_id)
        for mol in self.mols.values():
            self.get_scores(mol)

        mols_w_results = len(
            [mol for mol in self.mols.values() if len(mol.output) > 0])
        logging.debug(
            "Process %s: %s molecules generated comparison test output" %
            (self.chunk_id, mols_w_results))

        if mols_w_results > 0:
            self.print_output(self.mols.values())
            # self.concatenate_mol_results()

        if self.align:
            self.var_f.close()
        reader.close()
        return self.chunk_output_fn
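
The accuracy/length gate near the top of __call__ is worth isolating: for cmp.h5 input the values come from the alignment object itself, while for BAM input they come from per-record tags ("rq" for read quality, "ip" for the inter-pulse duration frames). A hedged restatement of that predicate:

def alignment_passes(alignment, ftype, min_acc, min_len):
    """Mirror the cmp.h5/BAM accuracy and subread-length filter above."""
    if ftype == "cmp":
        return (alignment.accuracy >= min_acc
                and len(alignment.alignmentArray()) >= min_len)
    tags = dict(alignment.peer.tags)  # BAM: tags on the underlying pysam record
    return tags["rq"] >= min_acc and len(tags["ip"]) >= min_len
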
Example No. 11
    def __call__(self):
        # reader = CmpH5Reader(self.cmph5)
        reader = openIndexedAlignmentFile(self.cmph5, self.opts.ref)

        mol_alignments = defaultdict(list)
        for alignment in reader[self.idx]:

            if self.opts.align_ftype == "cmp":
                aln_acc = alignment.accuracy
                aln_len = len(alignment.alignmentArray())
            elif self.opts.align_ftype == "bam":
                aln_features = dict(alignment.peer.tags)
                aln_acc = aln_features["rq"]
                aln_len = len(aln_features["ip"])

            if aln_acc >= self.opts.minAcc and aln_len >= self.opts.minSubreadLen:
                if self.opts.useZMW:
                    mol_id = "%s_%s" % (alignment.HoleNumber,
                                        alignment.movieName)
                else:
                    mol_id = alignment.MoleculeID
                mol_alignments[mol_id].append(alignment)

        self.mols = {}
        i = 0
        incr = int(max(4, math.floor(float(len(mol_alignments.keys()))) / 4))
        for i, (mol_id, alignments) in enumerate(mol_alignments.iteritems()):
            if i % incr == 0:
                logging.info(
                    "...chunk %s - %s/%s (%.1f%%) alignments processed..." %
                    (self.chunk_id, i, len(mol_alignments.keys()),
                     100 * float(i) / len(mol_alignments.keys())))

            mol = molecule(alignments, self.prefix, self.leftAnchor,
                           self.rightAnchor, self.contig_id, self.sites_pos,
                           self.sites_neg, self.cmph5, self.opts)
            if self.opts.useZMW:
                # Replace the bad moleculeID with the good ZMW ID, formatted: <zmwID>_<movieID>
                mol.mol_id = mol_id

            if len(mol.entries) > 0:
                self.mols[mol.mol_id] = mol

        if self.opts.useZMW:
            pass
        else:
            # Generate map between ZMW and molecule IDs (self.zmw_mol_map)
            self.mols, self.zmw_mol_map = generate_molecule_ZMW_map(
                self.mols, self.chunk_id)

        # Exclude any molecules that are divided between split-up alignment files
        logging.debug(
            "Process %s: removing %s molecules whose alignments span different chunks..."
            % (self.chunk_id, len(self.split_mols)))
        self.mols = remove_split_up_molecules(self.mols, self.split_mols)

        # Generate the IPD arrays per genomic position/strand by aggregating all
        # IPD entries across molecules (self.ipdArrays)
        self.ipdArrays = self.create_agg_IPD_arrays()

        reader.close()

        # Return the processed IPD array dictionary
        return self.ipdArrays
Example No. 12
	def __call__( self ):
		
		class ipd_entry:
			def __init__( self, tup ):
				"""
				"""
				self.ref_base  = tup[0]
				self.ipd       = tup[1]
				self.ref_pos   = tup[2]

		class subread:
			def __init__( self, align_fn, alignment, label, opts ):
				leftAnchor   = 1
				rightAnchor  = 1
				self.entries = {}
				self.opts    = opts

				self.subname    = alignment.readName
				alignedLength   = alignment.referenceSpan
				self.refName    = alignment.referenceInfo[3]
				zmw             = alignment.HoleNumber


				###########################################
				# self.mol        = alignment.MoleculeID
				self.mol        = alignment.HoleNumber
				###########################################
				

				if alignment.isForwardStrand:
					self.strand = 0
				else:
					self.strand = 1

				self.ref_bases  = alignment.reference()

				read_calls      = alignment.transcript()
				ref_pos         = list(alignment.referencePositions())
				IPD             = list(alignment.IPD())
				self.label      = self.opts.aln_fn_labels[align_fn]

				error_mk = []
				for read_call in read_calls:
					# Go through all entries and flag which positions are MM/indels
					if read_call != "M":
						# Mismatch or indel at this position!
						error_mk.append(1)
					else:
						error_mk.append(0)
				
				# Get the indices of all the non-matches
				error_idx = [i for (i,val) in enumerate(error_mk) if val == 1]
				for error_id in error_idx:
					try:
						for j in range(leftAnchor):
							error_mk[error_id - (j+1)] = 1
						for j in range(rightAnchor):
							error_mk[error_id + (j+1)] = 1
					except IndexError:
						pass
				error_mk = np.array(error_mk)

				ipds     = np.array(IPD) / self.opts.fps
				
				strands  = np.array([self.strand] * len(read_calls))

				self.ref_bases  = np.array(list(self.ref_bases))
				self.ref_pos    = np.array(ref_pos)
				read_calls      = np.array(list(read_calls))

				# Mark the error positions, but leave them in the sequence so
				# we can pull out intact motifs from contiguous correct bases
				self.ref_bases[error_mk==1]  = "*"
				read_calls[error_mk==1]      = "*"
				ipds[error_mk==1]            = -9
				strands[error_mk==1]         = -9

				# Attach these IPD entries to the subread object
				for i,tup in enumerate(zip(self.ref_bases, ipds, self.ref_pos)):
					self.entries[ self.ref_pos[i] ] = ipd_entry(tup)

				# self.cap_outliers()

				self.subread_normalize()

			def cap_outliers( self, max_ipd=10 ):
				"""
				Cap the outlier IPDs at max_ipd seconds.
				"""
				for read_pos,entry in self.entries.iteritems():
					entry.ipd = min(entry.ipd, max_ipd)

			def subread_normalize( self ):
				"""
				Every IPD entry needs to be normalized by the mean IPD of its subread.
				"""
				if len(self.entries) == 0:
					# Nothing to do here.
					return self.entries

				# First populate list of all IPDs per subread. Will use to get normalization factor.
				subread_vals = []
				for entry in self.entries.values():
					# Only do if this IPD is NOT from an error position
					if entry.ipd != -9:
						subread_vals.append(entry.ipd)

				rawIPDs = np.array(map(lambda x: math.log(x + 0.001), subread_vals))
				nfs     = rawIPDs.mean()

				for pos, entry in self.entries.iteritems():
					if entry.ipd == -9:
						newIPD = -9
					else:
						newIPD = math.log(entry.ipd + 0.001) - nfs
					
					entry.ipd = newIPD

			def zip_bases_and_IPDs( self ):
				"""
				Reassemble the read and IPD values using the subread normalized IPDs
				"""
				od        = OrderedDict(sorted(self.entries.items()))
				ref       = []
				ref_pos   = []
				self.ipds = []
				for read_pos, entry in od.items():
					ref.append(entry.ref_base)
					ref_pos.append(entry.ref_pos)
					self.ipds.append(entry.ipd)
				self.ref_str  = "".join(ref)
				self.ref_pos  = ref_pos

		if self.opts.aln_ftype=="cmp":
			reader = CmpH5Reader(self.align_fn)
		elif self.opts.aln_ftype=="bam":
			reader = openIndexedAlignmentFile(self.align_fn, self.opts.ref)
		else:
			raise Exception("Unrecognized alignment filetype (must be *.cmp.h5 or *.bam): %s" % self.align_fn)

		def get_fps(align_fn):
			"""
			For *.cmp.h5 files, the frame rate (fps) is included in each alignment.
			For *.bam files, the frame rate is encoded in the file header (FRAMERATEHZ).
			"""
			if self.opts.aln_ftype=="cmp":
				# Read frame rate directly from a cmp.h5 alignment
				reader    = CmpH5Reader(align_fn)
				alignment = reader[0]
				fps       = alignment.movieInfo[2]
			
			elif self.opts.aln_ftype=="bam":
				# Isolate description (DS) from read group (RG) in BAM header
				bam     = pysam.AlignmentFile(align_fn, "rb")
				h       = bam.header
				rg_ds_l = h.as_dict()["RG"][0]["DS"].split(";")
				rg_ds_d = dict([ (x.split("=")[0], x.split("=")[1]) for x in rg_ds_l])
				fps     = float(rg_ds_d["FRAMERATEHZ"])

			return fps

		# Pull the frame rate value from the alignment file
		self.opts.fps = get_fps(self.align_fn)

		read_refs     = {}
		read_SMp      = {}
		read_SMp_N    = {}
		read_comps    = {}
		read_labs     = {}
		contig_SCp    = {}
		i             = 0
		n_mols        = 0

		cwd = os.getcwd()

		# Periodically (after <chunksize> alignments) write out data to a contig-specific tmp file
		chunksize     = 10
		self.chunkdir = "chunk_%s" % self.chunk_id
		if os.path.exists(os.path.join(self.opts.tmp, self.chunkdir)):
			shutil.rmtree(os.path.join(self.opts.tmp, self.chunkdir))
		os.mkdir(os.path.join(self.opts.tmp, self.chunkdir))
		to_dump = defaultdict(list)

		def dump_data_to_contig_files( refName, to_dump, read_labs ):
			refName           = mbin.slugify(refName)
			ref_subname_fn    = "%s_readnames.tmp"    % refName
			ref_label_fn      = "%s_labels.tmp"       % refName
			ref_length_fn     = "%s_lengths.tmp"      % refName
			ref_ipds_fn       = "%s_ipds.tmp"         % refName
			ref_ipds_N_fn     = "%s_ipdsN.tmp"        % refName
			ref_comp_N_fn     = "%s_compN.tmp"        % refName
			ref_strand_fn     = "%s_strand.tmp"       % refName

			self.tmp_fns.add( os.path.join(self.chunkdir, ref_subname_fn) )
			self.tmp_fns.add( os.path.join(self.chunkdir, ref_label_fn) )
			self.tmp_fns.add( os.path.join(self.chunkdir, ref_length_fn) )
			self.tmp_fns.add( os.path.join(self.chunkdir, ref_ipds_fn) )
			self.tmp_fns.add( os.path.join(self.chunkdir, ref_ipds_N_fn) )
			self.tmp_fns.add( os.path.join(self.chunkdir, ref_comp_N_fn) )
			self.tmp_fns.add( os.path.join(self.chunkdir, ref_strand_fn) )
			f_subnames   = open(os.path.join(self.opts.tmp, self.chunkdir, ref_subname_fn),  "a")
			f_labels     = open(os.path.join(self.opts.tmp, self.chunkdir, ref_label_fn),    "a")
			f_lengths    = open(os.path.join(self.opts.tmp, self.chunkdir, ref_length_fn),   "a")
			f_ipds       = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_fn),     "a")
			f_ipds_N     = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_N_fn),   "a")
			f_comp_N     = open(os.path.join(self.opts.tmp, self.chunkdir, ref_comp_N_fn),   "a")
			f_strand     = open(os.path.join(self.opts.tmp, self.chunkdir, ref_strand_fn),   "a")
			self.tmp_fs.add(f_subnames)
			self.tmp_fs.add(f_labels)
			self.tmp_fs.add(f_ipds)
			self.tmp_fs.add(f_ipds_N)
			self.tmp_fs.add(f_comp_N)
			self.tmp_fs.add(f_strand)
			
			if self.opts.motifs_file!=None and self.opts.subtract_control:
				control_ipds_d = pickle.load( open(self.opts.control_pkl_name,"rb" ) )

			for i,(subread_ipds,subread_comps,readname,subread_length,strand) in enumerate(to_dump[refName]):
				ipd_kmers   = [motif                  for motif in subread_ipds.iterkeys()]
				ipd_counts  = [subread_ipds[motif][0] for motif in subread_ipds.iterkeys()]

				ipd_means = []
				if self.opts.motifs_file!=None and self.opts.subtract_control:
					for motif in subread_ipds.iterkeys():
						if subread_ipds[motif][1] != 0.0:
							w_control_sub = subread_ipds[motif][1] - control_ipds_d[motif]
							ipd_means.append(w_control_sub)
						else: # Don't subtract control if no ipd values are available (i.e. IPD score == 0.0)
							ipd_means.append(subread_ipds[motif][1])
				else:
					for motif in subread_ipds.iterkeys():
						ipd_means.append(subread_ipds[motif][1])

				comp_kmers  = np.array( [motif   for motif,ipds in subread_comps.items()] )
				comp_counts = np.array( [ipds    for motif,ipds in subread_comps.items()] )
				if i==0 and refName not in self.refName_has_header:
					ref_ipds_kmers_fn = "%s_ipdskmers.tmp" % refName
					ref_comp_kmers_fn = "%s_compkmers.tmp" % refName
					f_ipds_kmers      = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_kmers_fn), "a")
					f_comp_kmers      = open(os.path.join(self.opts.tmp, self.chunkdir, ref_comp_kmers_fn), "a")
					ipds_kmers_str    = "\t".join(ipd_kmers)
					comp_kmers_str    = "\t".join(comp_kmers)
					f_ipds_kmers.write("%s\n" % ipds_kmers_str)
					f_comp_kmers.write("%s\n" % comp_kmers_str)
					f_ipds_kmers.close()
					f_comp_kmers.close()
					self.refName_has_header.add(refName)
				ipds_str        = "\t".join(map(lambda x: str(round(x,3)), ipd_means))
				ipds_N_str      = "\t".join(map(lambda x: str(x),          ipd_counts))
				comp_counts_str = "\t".join(map(lambda x: str(x),          comp_counts))
				f_subnames.write(  "%s\n" % readname)
				f_labels.write(    "%s\n" % read_labs[readname])
				f_lengths.write(   "%s\n" % subread_length)
				f_ipds.write(      "%s\n" % ipds_str)
				f_ipds_N.write(    "%s\n" % ipds_N_str)
				f_comp_N.write(    "%s\n" % comp_counts_str)
				f_strand.write(    "%s\n" % strand)

			for f in self.tmp_fs:
				f.close()
		self.tmp_fs             = set()
		self.tmp_fns            = set()
		self.refName_has_header = set()
		to_check = reader[self.idx]
		for alignment in to_check:
			ref_contig = mbin.slugify(alignment.referenceInfo[3])
			label      = self.opts.aln_fn_labels[self.align_fn]
			ref_len    = self.opts.aln_fn_contig_lens[self.align_fn][ref_contig]
			if ref_len >= self.opts.minContigLength and alignment.referenceSpan >= self.opts.readlength_min and alignment.MapQV >= self.opts.minMapQV:
				to_get    = min(self.N_target_reads, len(self.idx))
				incr      = max(1, to_get/10)  # avoid a zero increment (and ZeroDivisionError) when to_get < 10
				readname  = "/".join(alignment.readName.split("/")[:-1])
				if len(read_labs.keys())%incr==0 and not read_labs.get(readname):
					logging.info("...chunk %s\t- mol %s/%s (%.1f%%)" % (self.chunk_id, n_mols, to_get, 100.0*n_mols/to_get))

				read_labs[readname] = label
				read_refs[readname] = ref_contig

				sub = subread( self.align_fn, alignment, label, self.opts )
				sub.zip_bases_and_IPDs()
				subread_ipds,subread_comps = read_scanner.scan_motifs( "aligned",      \
																	   sub.ipds,       \
																	   sub.ref_str,    \
																	   sub.strand,     \
																	   self.motifs,    \
																	   self.bi_motifs, \
																	   self.opts )
				
				to_dump[ref_contig].append( (subread_ipds,subread_comps,readname,len(sub.ref_str),sub.strand) )
				# Dump subread IPD and comp data to contig-specific file
				if len(to_dump[ref_contig])%chunksize==0 and len(to_dump[ref_contig])!=0:
					dump_data_to_contig_files( ref_contig, to_dump, read_labs )
					to_dump[ref_contig] = []

				n_mols = len(read_labs.keys())
				i += 1
					
				if n_mols==self.N_target_reads:
					break

		for ref_contig in to_dump.keys():
			dump_data_to_contig_files( ref_contig, to_dump, read_labs )
		for f in self.tmp_fs:
			f.close()
		to_dump = defaultdict(list)

		if i==0:
			logging.info("Chunk %s: no qualifying reads found!" % self.chunk_id)
		
		logging.info("Chunk %s: found %s alignments (%s molecules) > %sbp in %s" % (self.chunk_id, i, len(read_labs.keys()), self.opts.readlength_min, os.path.basename(self.align_fn)))
		reader.close()

		return self.tmp_fns
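
subread_normalize is the statistical core of the subread class: IPDs are log-transformed with a small offset and centered on the subread's own mean, while the -9 sentinel marking error-adjacent positions passes through untouched. A compact restatement:

import math

SENTINEL = -9  # marks positions at or near mismatches/indels

def normalize_subread_ipds(ipds):
    """Log-transform IPDs and subtract the subread mean (sentinels excluded)."""
    logs = [math.log(x + 0.001) for x in ipds if x != SENTINEL]
    mean = sum(logs) / len(logs) if logs else 0.0
    return [x if x == SENTINEL else math.log(x + 0.001) - mean for x in ipds]
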
Example No. 13
    def launch_parallel_molecule_loading(self, cmph5_file, prefix,
                                         movie_name_ID_map, control_ipds):
        logging.debug("Creating tasks...")
        tasks = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()
        logging.debug("Done.")

        logging.debug("Starting consumers...")
        num_jobs = self.Config.opts.procs
        consumers = [
            Consumer(tasks, results, self.Config.opts.contig_id)
            for i in xrange(num_jobs)
        ]
        for w in consumers:
            w.start()
        logging.debug("Done.")

        def chunks(l, n):
            """
			Yield successive n-sized chunks from l.
			"""
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # Enqueue jobs
        logging.info("Partitioning %s into %s chunks for analysis..." %
                     (cmph5_file, num_jobs))
        # reader         = CmpH5Reader(cmph5_file)
        reader = openIndexedAlignmentFile(cmph5_file, self.Config.ref)
        if self.Config.opts.align_ftype == "cmp":
            alnIDs = [
                r.AlnID for r in reader
                if r.referenceInfo[2] == self.Config.opts.contig_id
            ]
        elif self.Config.opts.align_ftype == "bam":
            alnIDs = [
                r.rowNumber for r in reader
                if r.referenceInfo[2] == self.Config.opts.contig_id
            ]

        if len(alnIDs) <= num_jobs:
            num_jobs = 1
        reader.close()

        # for chunk_id,alignments_flat_fn in enumerate(tmp_flat_files):
        # chunksize    = int(math.ceil(float( len(alnIDs)/num_jobs )))
        # idx_chunks   = list(chunks( (np.array(alnIDs)-1), chunksize ))

        # if len(idx_chunks[-1])==1 and len(idx_chunks)>1:
        # 	idx_chunks = idx_chunks[:-1]

        ############################################################################
        ## MODIFIED CHUNKING ## ENSURES SUBREADS FROM SAME MOLECULE ARE NOT SPLIT ##
        logging.info(
            "Chunking to ensure subreads are not split between chunks...")
        grouped_mols = defaultdict(list)
        for alnID in alnIDs:
            mol_id = "%s_%s" % (reader[alnID].HoleNumber,
                                reader[alnID].movieName)
            grouped_mols[mol_id].append(alnID)
        reader.close()

        # True division before ceil(); Python 2 integer division would truncate.
        chunksize = int(math.ceil(len(grouped_mols) / float(num_jobs)))
        logging.info("Chunking: %s / %s molecules per chunk" %
                     (chunksize, len(grouped_mols.keys())))
        idx_chunks = []
        added = 0
        chunk = []
        for mol in grouped_mols.keys():
            if added >= chunksize:
                idx_chunks.append(chunk)
                chunk = []
                added = 0
            chunk += grouped_mols[mol]
            added += 1
        if len(chunk) > 0:
            idx_chunks.append(chunk)

        idx_chunks = np.array(idx_chunks)

        num_jobs = len(idx_chunks)

        idx_count = 0
        for idx_chunk in idx_chunks:
            for idx in idx_chunk:
                idx_count += 1

        logging.info("%s / %s subreads successfully chunked..." %
                     (idx_count, len(alnIDs)))

        #############################################################################

        if prefix == "nat_":
            logging.info(
                "%s - Separating out file-matched regions of the control IPD values dict..."
                % self.Config.opts.contig_id)
            local_control_ipds = self.split_up_control_IPDs(
                control_ipds, cmph5_file, idx_chunks)
            logging.debug("%s - Done." % self.Config.opts.contig_id)

        # In splitting alignment indexes among processes, some molecules will
        # have alignments going to different processes. Track these.
        split_mols = self.track_split_molecule_alignments(
            idx_chunks, cmph5_file)

        for chunk_id in range(num_jobs):
            idx = idx_chunks[chunk_id]
            if prefix == "wga_":
                tasks.put(parse_mol_aligns.wga_molecules_processor( cmph5_file,                   \
                             chunk_id,                     \
                             prefix,                       \
                             self.Config.opts.contig_id,   \
                             self.ref_size,                \
                             self.sites_pos,               \
                             self.sites_neg,               \
                             self.Config.opts,             \
                             idx,                          \
                             split_mols ))
            else:
                logging.debug("Launching subprocess %s..." % chunk_id)
                tasks.put(parse_mol_aligns.native_molecules_processor( cmph5_file,                          \
                                chunk_id,                            \
                                prefix,                              \
                                self.Config.opts,                    \
                                self.Config.fastq,                   \
                                self.Config.ref,                     \
                                copy.copy(movie_name_ID_map),        \
                                local_control_ipds[chunk_id],        \
                                self.ref_size,                       \
                                self.sites_pos,                      \
                                self.sites_neg,                      \
                                idx,                                 \
                                split_mols))
                logging.debug("Done (%s)." % chunk_id)

        # Add a 'poison pill' for each consumer
        for i in xrange(self.Config.opts.procs):
            tasks.put(None)
        tasks.join()

        # Start printing results
        parallel_results = []
        while num_jobs:
            result = results.get()
            parallel_results.append(result)
            num_jobs -= 1

        return parallel_results
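
The "modified chunking" block is the substantive difference from the earlier version of this method: alignment row numbers are grouped by molecule ID first, then whole molecules are packed into chunks so no molecule's subreads are split between workers. A standalone sketch; mol_id_of is a hypothetical caller-supplied function mapping an alignment ID to its molecule ID (e.g. built from HoleNumber and movieName as above):

import math
from collections import defaultdict

def molecule_safe_chunks(aln_ids, mol_id_of, num_jobs):
    """Pack whole molecules into roughly num_jobs chunks of alignment IDs."""
    grouped = defaultdict(list)
    for aln_id in aln_ids:
        grouped[mol_id_of(aln_id)].append(aln_id)  # group by molecule first
    if not grouped:
        return []
    per_chunk = int(math.ceil(len(grouped) / float(num_jobs)))
    chunks, chunk = [], []
    for i, mol in enumerate(grouped):
        if i and i % per_chunk == 0:
            chunks.append(chunk)
            chunk = []
        chunk += grouped[mol]  # a molecule's alignments stay together
    if chunk:
        chunks.append(chunk)
    return chunks
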