Example #1
import os.path as op

from pbcore.io import SubreadSet


def get_sequencing_chemistry(entry_points, include_system_type=True):
    """
    Given a list of entry points (eid, path), extract the sequencing chemistry
    (and optionally system name) as a human-readable string.
    """
    chemistries = set()
    is_sequel = is_rsii = False
    for eid, path in entry_points:
        if eid == "eid_subread" and op.isfile(path):
            ds = SubreadSet(path)
            for bam in ds.resourceReaders():
                for rg in bam.readGroupTable:
                    chemistries.add(rg.SequencingChemistry)
                    if rg.SequencingChemistry.startswith("S"):
                        is_sequel = True
                    else:
                        is_rsii = True
    if len(chemistries) == 0:
        return "NA"
    chemistry_str = "; ".join(sorted(list(chemistries)))
    if include_system_type:
        fmt = "{s} ({c})"
        if is_sequel and is_rsii:
            return fmt.format(s="Mixed", c=chemistry_str)
        elif is_sequel:
            return fmt.format(s="Sequel", c=chemistry_str)
        elif is_rsii:
            return fmt.format(s="RSII", c=chemistry_str)
        else:
            raise ValueError("Can't determine system type for {c}".format(
                             c=chemistry_str))
    return chemistry_str
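
# --- Usage sketch (not part of the original example) ---
# Shows how the function above might be called. The entry-point id
# "eid_subread" matches the check inside the function, but the dataset path is
# an illustrative assumption.
example_entry_points = [("eid_subread", "/path/to/movie.subreadset.xml")]
print(get_sequencing_chemistry(example_entry_points))
# prints a string such as "Sequel (<chemistry>)", or "NA" if the file is missing
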
Example #2
import os.path as op
from collections import defaultdict

from pbcore.io import SubreadSet


def get_data_stats(entry_points):
    """
    Get basic metrics for input dataset (assumed to be a SubreadSet).
    """
    for eid, path in entry_points:
        if eid == "eid_subread" and op.isfile(path):
            ds = SubreadSet(path)
            n_zmws = 0
            for bam in ds.resourceReaders():
                n_zmws += len(set(bam.pbi.holeNumber))
            return data_stats(n_zmws, ds.numRecords, ds.totalLength)
    return data_stats("NA", "NA", "NA")
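
# --- Usage sketch (not part of the original example) ---
# 'data_stats' is defined elsewhere in the original module; a namedtuple with
# three fields is an assumption consistent with how it is called above.
from collections import namedtuple

data_stats = namedtuple("data_stats", ["n_zmws", "n_subreads", "total_length"])

stats = get_data_stats([("eid_subread", "/path/to/movie.subreadset.xml")])
print("%s ZMWs, %s subreads, %s bases" %
      (stats.n_zmws, stats.n_subreads, stats.total_length))
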
def get_subread_ZMW_stats(subread_xml, report):
    """
    Fills a dict with:
    'numZMW' --- number of sequencing ZMWs
    'numSubread' -- number of subreads
    'avgZMWlen' -- approximated average ZMW length
    'avgSubreadlen' --- average subread length
    """
    subread_lens = []
    zmw_lens = defaultdict(lambda: 0)

    ds = SubreadSet(subread_xml)
    for rr in ds.resourceReaders():
        for zmw, qStart, qEnd in zip(rr.holeNumber, rr.qStart, rr.qEnd):
            subread_lens.append(qEnd-qStart)
            zmw_lens[zmw] = max(zmw_lens[zmw], qEnd)

    report['numZMW'] = len(zmw_lens)
    report['numSubread'] = len(subread_lens)
    report['avgZMWlen'] = int(sum(zmw_lens.itervalues())*1./len(zmw_lens))
    report['avgSubreadlen'] = int(sum(subread_lens)*1./len(subread_lens))
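
# --- Usage sketch (not part of the original example) ---
# The report dict is filled in place; the subreadset path is an illustrative
# assumption.
report = {}
get_subread_ZMW_stats("/path/to/movie.subreadset.xml", report)
print("%d ZMWs, %d subreads" % (report["numZMW"], report["numSubread"]))
print("avg ZMW len %d, avg subread len %d" %
      (report["avgZMWlen"], report["avgSubreadlen"]))
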
Example #5
import logging
import operator
import random

import numpy as np

from pbcore.io import IndexedBamReader, SubreadSet

# NOTE: 'fm' is an internal pulse/frame helper module (providing pls2base and
# s2npl) that is not included in this example.

log = logging.getLogger(__name__)


class PpaBurstMetrics:
    """
    Class for retrieving burst metrics. Two flavors:
        Alignment based
        HMM Classifier based

    If the required information is not available, return None.
    """
    def __init__(self, subread_set_path, zmws=None, subsampleto=None):

        self.subread_set_path = subread_set_path
        self.subread_set = SubreadSet(subread_set_path)
        first_bam = self.subread_set.resourceReaders()[0]
        self.framerate = first_bam.readGroupTable.FrameRate[0]
        self.subsampleto = subsampleto

        dsets = [(self.subread_set, 'subreads')]
        # open the scraps BAM if the dataset lists one
        if self.subread_set.externalResources[0].scraps:
            self.scraps = IndexedBamReader(
                self.subread_set.externalResources[0].scraps)
            dsets.append((self.scraps, 'scraps'))

        # column info of the burst and reads tables
        self.ppa_burst_dtypes = self._set_ppa_burst_dtypes()
        self.reads_dtypes = self._set_reads_dtypes()

        if self._hasPpaBurstInfo(self.subread_set):
            if zmws is None:
                self.zmws = self._subsample_zmws()
            else:
                self.zmws = zmws
                log.info('Number of ZMWs ' + str(len(zmws)))

            results = []
            # if scraps info was present, scrape that for burst info, too
            for dset in reversed(dsets):
                ppa_bursts, reads = self.retrieve_classifier_bursts(
                    dset[0], dset[1])
                results.append((ppa_bursts, reads))
            if len(results) == 1:
                self.ppa_bursts = results[0][0]
                self.reads = results[0][1]
            elif len(results) == 2:
                subread_ppa_bursts = results[0][0]
                subread_reads = results[0][1]
                scraps_ppa_bursts = results[1][0]
                scraps_reads = results[1][1]
                self.ppa_bursts = np.hstack(
                    (subread_ppa_bursts, scraps_ppa_bursts))
                self.reads = np.hstack((subread_reads, scraps_reads))

    def _hasPpaBurstInfo(self, dset):
        """
        Check dataset for presence of 'pe' tag
        """
        if (len(dset) > 0 and 'pe' in [tag[0] for tag in dset[0].peer.tags]):
            return True
        else:
            log.info('The pe tag is not present, burst info was not annotated')
            return False

    def _set_ppa_burst_dtypes(self):
        """
        Return columns of the PPA bursts table
        """
        return [
            ('zmw', int),
            ('qStart', int),
            ('qEnd', int),
            ('seqType', 'S1'),  # seqType -> {H, L, A}
            ('burstStart', int),
            ('burstLength', int),
            ('numShorties', int),
            ('burstStartTime', int),
            ('burstEndTime', int),
            ('previousBasecall', 'S1'),
            ('previousBaseIndex', int),
            ('fractionC', float),
            ('fractionA', float),
            ('fractionT', float),
            ('fractionG', float)
        ]

    def _set_reads_dtypes(self):
        """
        Return columns of the Reads table
        """
        return [('zmw', int), ('seqType', 'S1'), ('qStart', int),
                ('qEnd', int), ('startTime', int), ('endTime', int)]

    def _resize_array(self, arr, index, increase_by):
        """
        Resize NumPy array if necessary
        """
        if index >= len(arr):  # extend array if needed
            new_size = tuple(map(operator.add, arr.shape, (increase_by, )))
            arr = np.resize(arr, new_size)
        return arr

    def _subsample_zmws(self):
        """
        Subsample Zmws for measurement
        """
        if hasattr(self, 'scraps'):
            # scraps index bug should be fixed
            zmws = np.union1d(self.subread_set.index.holeNumber,
                              self.scraps.index.holeNumber)
        else:
            zmws = np.unique(self.subread_set.index.holeNumber)

        if self.subsampleto is not None:
            if len(zmws) > self.subsampleto:
                zmws = np.unique(random.sample(zmws, self.subsampleto))
        return zmws

    def retrieve_classifier_bursts(self, dset, dset_type):
        """
        Retrieve information about the bursts detected by the classifier.
        Returns a tuple (bursts, reads) of structured arrays, or None if no
        burst classification is present. The bursts array has the columns:
            zmw
            qStart
            qEnd
            seqType
            burstStart
            burstLength
            numShorties
            burstStartTime
            burstEndTime
            previousBasecall
            previousBaseIndex
            fractionC
            fractionA
            fractionT
            fractionG
        """
        # check for burst classification
        if 'pe' not in [t[0] for t in dset[0].peer.tags]:
            return None

        # These if/else statements work around a bug in scraps: accessing
        # dset.index['holeNumber'] returns a list of tuples (all identical
        # values) instead of the expected array.
        if dset_type == 'subreads':
            holeNumbers = dset.index['holeNumber']
        elif dset_type == 'scraps':
            holeNumbers = dset.holeNumber
        read_indices = np.flatnonzero(np.in1d(holeNumbers, self.zmws))
        bursts = np.zeros((len(self.zmws), ), dtype=self.ppa_burst_dtypes)
        burst_count = 0

        reads = np.zeros((len(read_indices), ), dtype=self.reads_dtypes)
        read_count = 0

        bases = ['a', 'c', 'g', 't']

        cnt = 0
        for index in read_indices:
            if cnt % 10000 == 0:
                log.info(str(float(cnt) / len(read_indices)))
            cnt += 1

            read = dset[index]

            # Store information about the read being considered
            # Keep info even if read doesn't contain a burst
            reads['zmw'][read_count] = read.holeNumber
            reads['qStart'][read_count] = read.qStart
            reads['qEnd'][read_count] = read.qEnd
            # keep only positions where pls2base gives a non-negative index
            p2b = np.flatnonzero(
                fm.pls2base(fm.s2npl(read.peer.get_tag('pc'))) >= 0)
            start_frames = read.peer.get_tag('sf')
            reads['startTime'][read_count] = start_frames[p2b[0]]
            reads['endTime'][read_count] = start_frames[p2b[-1]]
            if dset_type == 'subreads':
                reads['seqType'][read_count] = 'H'
            elif dset_type == 'scraps':
                reads['seqType'][read_count] = read.scrapType
            read_count += 1

            # Consider read for bursts and record burst
            # information if they exist
            pe_reason = np.array(read.peer.get_tag('pe'))
            """convert short-frame exclusions that happen
            during bursts into burst exclusions"""
            shorties = np.zeros((len(pe_reason), ), dtype=int)
            for j in np.arange(1, len(pe_reason)):
                if pe_reason[j] == 1 and pe_reason[j - 1] == 2:
                    pe_reason[j] = 2
                    shorties[j] = 1

            bursty_indices = np.flatnonzero(pe_reason == 2)
            bursty_gaps = np.diff(bursty_indices)
            bursty_breaks = np.flatnonzero(bursty_gaps > 1)

            if bursty_indices.any():
                if len(bursts) <= burst_count + len(bursty_breaks) + 1:
                    # resize the bursts table
                    bursts = self._resize_array(
                        bursts, burst_count + len(bursty_breaks) + 1,
                        len(self.zmws) * 10)

                bursts['zmw'][burst_count] = read.holeNumber
                if dset_type == 'subreads':
                    bursts['seqType'][burst_count] = 'H'
                elif dset_type == 'scraps':
                    bursts['seqType'][burst_count] = read.scrapType
                else:
                    raise IOError(
                        'dset type must be either subreads or scraps')
                bursts['qStart'][burst_count] = read.qStart
                bursts['qEnd'][burst_count] = read.qEnd
                start_frames = read.peer.get_tag('sf')
                p2b = fm.pls2base(fm.s2npl(read.peer.get_tag('pc')))
                bursts['burstStart'][burst_count] = bursty_indices[0]
                j = bursty_indices[0] - 1
                previous_base_index = p2b[j]
                while (previous_base_index < 0) and (j >= 0):
                    j -= 1
                    previous_base_index = p2b[j]
                try:
                    bursts['previousBaseIndex'][
                        burst_count] = previous_base_index
                    bursts['previousBasecall'][burst_count] = read.read(
                        aligned=False)[previous_base_index]
                except IndexError:  # catch reads where there are no previous basecalls
                    bursts['previousBaseIndex'][burst_count] = -1
                    bursts['previousBasecall'][burst_count] = 'Z'
                if bursty_breaks.any():
                    # This uses sandwich logic. Store the start info for the
                    # first burst. If there are additional bursts, scan through
                    # and store all the info for those.
                    # Finally, store the burst end info of the last burst
                    # If there was a single burst, the for loop would be skipped
                    # altogether.
                    for bursty_break in bursty_breaks:
                        j = bursty_indices[bursty_break]
                        bursts['burstLength'][burst_count] = (
                            j - bursts['burstStart'][burst_count] + 1)
                        bs = bursts['burstStart'][burst_count]
                        be = bs + bursts['burstLength'][burst_count]
                        bursts['burstStartTime'][burst_count] = start_frames[bs]
                        bursts['burstEndTime'][burst_count] = start_frames[be]
                        bursts['numShorties'][burst_count] = np.sum(
                            shorties[bs:be])
                        burstcalls = list(read.peer.get_tag('pc')[bs:be])
                        for base in bases:
                            # include rejected bases (pulses)
                            f1 = np.divide(
                                len(np.flatnonzero(
                                    np.array(burstcalls, 'S') == base)),
                                len(burstcalls), dtype=float)
                            # include basecalls
                            f2 = np.divide(
                                len(np.flatnonzero(
                                    np.array(burstcalls, 'S') == base.upper())),
                                len(burstcalls), dtype=float)
                            bursts['fraction' +
                                   base.upper()][burst_count] = f1 + f2
                        burst_count += 1
                        bursts['zmw'][burst_count] = read.holeNumber
                        bursts['qStart'][burst_count] = read.qStart
                        bursts['qEnd'][burst_count] = read.qEnd
                        next_index = bursty_indices[bursty_break + 1]
                        bursts['burstStart'][burst_count] = next_index
                        j = next_index - 1
                        previous_base_index = p2b[j]
                        while (previous_base_index < 0) and (j >= 0):
                            j -= 1
                            previous_base_index = p2b[j]
                        bursts['previousBaseIndex'][
                            burst_count] = previous_base_index
                        bursts['previousBasecall'][burst_count] = read.read(
                            aligned=False)[previous_base_index]
                bursts['burstLength'][burst_count] = (
                    bursty_indices[-1] - bursts['burstStart'][burst_count])
                bs = bursts['burstStart'][burst_count]
                be = bs + bursts['burstLength'][burst_count]
                bursts['burstStartTime'][burst_count] = start_frames[bs]
                bursts['burstEndTime'][burst_count] = start_frames[be]
                bursts['numShorties'][burst_count] = np.sum(shorties[bs:be])
                burstcalls = list(read.peer.get_tag('pc')[bs:be])
                for base in bases:
                    # include rejected bases (pulses)
                    f1 = np.divide(
                        len(np.flatnonzero(np.array(burstcalls, 'S') == base)),
                        len(burstcalls), dtype=float)
                    # include basecalls
                    f2 = np.divide(
                        len(np.flatnonzero(
                            np.array(burstcalls, 'S') == base.upper())),
                        len(burstcalls), dtype=float)
                    bursts['fraction' + base.upper()][burst_count] = f1 + f2
                burst_count += 1

        # remove the empty rows
        bursts = bursts[bursts['zmw'] != 0]
        bursts['burstStartTime'] = np.divide(bursts['burstStartTime'],
                                             self.framerate * 60,
                                             dtype=float)
        bursts['burstEndTime'] = np.divide(bursts['burstEndTime'],
                                           self.framerate * 60,
                                           dtype=float)
        reads = reads[reads['zmw'] != 0]
        reads['startTime'] = np.divide(reads['startTime'],
                                       self.framerate * 60,
                                       dtype=float)
        reads['endTime'] = np.divide(reads['endTime'],
                                     self.framerate * 60,
                                     dtype=float)
        return bursts, reads
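

# --- Usage sketch (not part of the original example) ---
# Instantiating the class runs burst extraction in __init__; the subreadset
# path and the subsampling count are illustrative assumptions. Burst start/end
# times in the returned tables are converted to minutes (start frames divided
# by framerate * 60).
metrics = PpaBurstMetrics("/path/to/movie.subreadset.xml", subsampleto=10000)
if hasattr(metrics, "ppa_bursts"):
    print("%d bursts found in %d reads" %
          (len(metrics.ppa_bursts), len(metrics.reads)))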