def get_sequencing_chemistry(entry_points, include_system_type=True):
    """
    Given a list of entry points (eid, path), extract the sequencing
    chemistry (and optionally the system name) as a human-readable string.

    :param entry_points: iterable of (eid, path) tuples; only entries with
        eid == "eid_subread" whose path is an existing file are inspected
    :param include_system_type: if True, prefix the chemistry list with the
        instrument type ("Sequel", "RSII", or "Mixed")
    :return: "NA" when no chemistry information is found, otherwise a
        "; "-joined, sorted list of unique chemistry names
    :raises ValueError: if include_system_type is True but the system type
        cannot be determined from the chemistry names
    """
    chemistries = set()
    is_sequel = is_rsii = False
    for eid, path in entry_points:
        if eid == "eid_subread" and op.isfile(path):
            ds = SubreadSet(path)
            for bam in ds.resourceReaders():
                for rg in bam.readGroupTable:
                    chemistries.add(rg.SequencingChemistry)
                    # Sequel chemistry names start with "S"; anything else
                    # is treated as RS II
                    if rg.SequencingChemistry.startswith("S"):
                        is_sequel = True
                    else:
                        is_rsii = True
    if not chemistries:
        return "NA"
    # sorted() accepts a set directly; the intermediate list() was redundant
    chemistry_str = "; ".join(sorted(chemistries))
    if include_system_type:
        fmt = "{s} ({c})"
        if is_sequel and is_rsii:
            return fmt.format(s="Mixed", c=chemistry_str)
        elif is_sequel:
            return fmt.format(s="Sequel", c=chemistry_str)
        elif is_rsii:
            return fmt.format(s="RSII", c=chemistry_str)
        else:
            raise ValueError("Can't determine system type for {c}".format(
                c=chemistry_str))
    return chemistry_str
def get_data_stats(entry_points):
    """
    Get basic metrics for the input dataset (assumed to be a SubreadSet).

    Returns a data_stats tuple of (number of unique ZMWs, number of
    records, total length) for the first usable subread entry point, or
    a data_stats of "NA" placeholders when none is found.
    """
    for entry_id, entry_path in entry_points:
        # skip anything that is not an existing subread dataset
        if entry_id != "eid_subread" or not op.isfile(entry_path):
            continue
        dataset = SubreadSet(entry_path)
        # count distinct hole numbers per BAM resource and total them
        zmw_total = sum(
            len(set(reader.pbi.holeNumber))
            for reader in dataset.resourceReaders())
        return data_stats(zmw_total, dataset.numRecords, dataset.totalLength)
    return data_stats("NA", "NA", "NA")
def get_subread_ZMW_stats(subread_xml, report):
    """
    Fill ``report`` (a dict, updated in place) with:

    'numZMW'        -- number of sequencing ZMWs
    'numSubread'    -- number of subreads
    'avgZMWlen'     -- approximated average ZMW length (max qEnd per ZMW)
    'avgSubreadlen' -- average subread length

    :param subread_xml: path to a SubreadSet XML
    :param report: dict to receive the metrics
    """
    subread_lens = []
    # ZMW hole number -> largest qEnd seen, approximating the ZMW length
    zmw_lens = defaultdict(int)
    ds = SubreadSet(subread_xml)
    for rr in ds.resourceReaders():
        for zmw, qStart, qEnd in zip(rr.holeNumber, rr.qStart, rr.qEnd):
            subread_lens.append(qEnd - qStart)
            zmw_lens[zmw] = max(zmw_lens[zmw], qEnd)
    report['numZMW'] = len(zmw_lens)
    report['numSubread'] = len(subread_lens)
    # BUGFIX: dict.itervalues() is Python-2-only and raises AttributeError
    # on Python 3; .values() works on both.
    report['avgZMWlen'] = int(sum(zmw_lens.values()) * 1. / len(zmw_lens))
    report['avgSubreadlen'] = int(sum(subread_lens) * 1. / len(subread_lens))
def get_subread_ZMW_stats(subread_xml, report):
    """
    Fill ``report`` (a dict, updated in place) with:

    'numZMW'        -- number of sequencing ZMWs
    'numSubread'    -- number of subreads
    'avgZMWlen'     -- approximated average ZMW length (max qEnd per ZMW)
    'avgSubreadlen' -- average subread length

    NOTE(review): this is a verbatim duplicate of an earlier definition of
    the same name in this file; the later one shadows the earlier one.
    Consider deleting one of the two.

    :param subread_xml: path to a SubreadSet XML
    :param report: dict to receive the metrics
    """
    subread_lens = []
    # ZMW hole number -> largest qEnd seen, approximating the ZMW length
    zmw_lens = defaultdict(int)
    ds = SubreadSet(subread_xml)
    for rr in ds.resourceReaders():
        for zmw, qStart, qEnd in zip(rr.holeNumber, rr.qStart, rr.qEnd):
            subread_lens.append(qEnd - qStart)
            zmw_lens[zmw] = max(zmw_lens[zmw], qEnd)
    report['numZMW'] = len(zmw_lens)
    report['numSubread'] = len(subread_lens)
    # BUGFIX: dict.itervalues() is Python-2-only and raises AttributeError
    # on Python 3; .values() works on both.
    report['avgZMWlen'] = int(sum(zmw_lens.values()) * 1. / len(zmw_lens))
    report['avgSubreadlen'] = int(sum(subread_lens) * 1. / len(subread_lens))
class PpaBurstMetrics:
    """
    Class for retrieving burst metrics.  Two flavors exist upstream:

      * alignment based
      * HMM-classifier based (the 'pe' BAM tag)

    Only the classifier flavor is handled here.  If the required
    information is not available, return None.
    """

    def __init__(self, subread_set_path, zmws=None, subsampleto=None):
        """
        :param subread_set_path: path to a SubreadSet XML
        :param zmws: optional explicit collection of ZMW hole numbers to
            measure; when None, ZMWs are taken from the dataset itself
            (and possibly subsampled)
        :param subsampleto: optional cap on the number of ZMWs kept when
            subsampling (only consulted when zmws is None)
        """
        self.subread_set_path = subread_set_path
        self.subread_set = SubreadSet(subread_set_path)
        # frame rate of the first read group; used later to convert
        # start-frame counts into minutes (framerate * 60)
        self.framerate = self.subread_set.resourceReaders(
        )[0].readGroupTable.FrameRate[0]
        self.subsampleto = subsampleto
        dsets = [(self.subread_set, 'subreads')]
        # grab path to scraps if available
        if self.subread_set.externalResources[0].scraps:
            self.scraps = IndexedBamReader(
                self.subread_set.externalResources[0].scraps)
            dsets.append((self.scraps, 'scraps'))
        self.ppa_burst_dtypes = self._set_ppa_burst_dtypes(
        )  # column info of burst table
        self.reads_dtypes = self._set_reads_dtypes(
        )  # column info of reads table
        if self._hasPpaBurstInfo(self.subread_set):
            if zmws is None:
                self.zmws = self._subsample_zmws()
            else:
                self.zmws = zmws
                log.info('Number of ZMWs ' + str(len(zmws)))
            results = []
            # if scraps info was present, scrape that for burst info, too
            # NOTE(review): dsets is iterated in reverse, so when scraps
            # exist results[0] comes from scraps, yet the variables below
            # are named subread_* for results[0] — verify whether the
            # ordering or the naming is wrong.
            for dset in reversed(dsets):
                ppa_bursts, reads = self.retrieve_classifier_bursts(
                    dset[0], dset[1])
                results.append((ppa_bursts, reads))
            if len(results) == 1:
                self.ppa_bursts = results[0][0]
                self.reads = results[0][1]
            elif len(results) == 2:
                subread_ppa_bursts = results[0][0]
                subread_reads = results[0][1]
                scraps_ppa_bursts = results[1][0]
                scraps_reads = results[1][1]
                # concatenate the two burst/read tables into single arrays
                self.ppa_bursts = np.hstack(
                    (subread_ppa_bursts, scraps_ppa_bursts))
                self.reads = np.hstack((subread_reads, scraps_reads))

    def _hasPpaBurstInfo(self, dset):
        """
        Check the dataset for the presence of the 'pe' tag on the first
        record; that tag marks classifier burst annotation.
        """
        if (len(dset) > 0 and 'pe' in [tag[0] for tag in dset[0].peer.tags]):
            return True
        else:
            log.info('The pe tag is not present, burst info was not annotated')
            return False

    def _set_ppa_burst_dtypes(self):
        """ Return columns (NumPy structured dtype) of the PPA bursts table """
        return [
            ('zmw', int),
            ('qStart', int),
            ('qEnd', int),
            ('seqType', 'S1'),  # seqType -> {H, L, A}
            ('burstStart', int),
            ('burstLength', int),
            ('numShorties', int),
            ('burstStartTime', int),
            ('burstEndTime', int),
            ('previousBasecall', 'S1'),
            ('previousBaseIndex', int),
            ('fractionC', float),
            ('fractionA', float),
            ('fractionT', float),
            ('fractionG', float)
        ]

    def _set_reads_dtypes(self):
        """ Return columns (NumPy structured dtype) of the Reads table """
        return [('zmw', int), ('seqType', 'S1'), ('qStart', int),
                ('qEnd', int), ('startTime', int), ('endTime', int)]

    def _resize_array(self, arr, index, increase_by):
        """
        Resize NumPy array if necessary: if ``index`` would fall past the
        end of ``arr``, grow the first dimension by ``increase_by``.
        Returns the (possibly new) array.
        """
        if index >= len(arr):  # extend array if needed
            new_size = tuple(map(operator.add, arr.shape, (increase_by, )))
            arr = np.resize(arr, new_size)
        return arr

    def _subsample_zmws(self):
        """
        Collect the set of ZMW hole numbers to measure, optionally
        subsampled down to ``self.subsampleto`` entries.
        """
        if hasattr(self, 'scraps'):
            zmws = np.union1d(self.subread_set.index.holeNumber,
                              self.scraps.index.holeNumber
                              )  # scraps index bug should be fixed
        else:
            zmws = np.unique(self.subread_set.index.holeNumber)
        if self.subsampleto is not None:
            if len(zmws) > self.subsampleto:
                zmws = np.unique(random.sample(zmws, self.subsampleto))
        return zmws

    def retrieve_classifier_bursts(self, dset, dset_type):
        """
        Retrieve information about the bursts detected by the classifier
        (the 'pe' BAM tag; value 2 marks a burst exclusion).

        Returns a pair (bursts, reads) of structured arrays.  The bursts
        table has the following columns: zmw, queryStart, queryEnd,
        burstStart, burstLength, burstStartTime, burstEndTime,
        previousBasecall, previousBaseIndex, fractionC, fractionA,
        fractionT, fractionG.  Returns None if the 'pe' tag is absent.

        :param dset: a SubreadSet or an IndexedBamReader (scraps)
        :param dset_type: 'subreads' or 'scraps'
        """
        if 'pe' not in [t[0] for t in dset[0].peer.tags
                        ]:  # check for burst classification
            return None
        # these if/else statements are here because
        # of a bug in scraps.  The index array returns
        # a list of tuple (all identical values) when
        # trying to access dset.index['holeNumber']
        if dset_type == 'subreads':
            holeNumbers = dset.index['holeNumber']
        elif dset_type == 'scraps':
            holeNumbers = dset.holeNumber
        # indices of reads whose ZMW is in the selected set
        read_indices = np.flatnonzero(np.in1d(holeNumbers, self.zmws))
        bursts = np.zeros((len(self.zmws), ), dtype=self.ppa_burst_dtypes)
        burst_count = 0
        reads = np.zeros((len(read_indices), ), dtype=self.reads_dtypes)
        read_count = 0
        bases = ['a', 'c', 'g', 't']
        cnt = 0
        for index in read_indices:
            # progress logging (fraction of reads processed) every 10k reads
            if cnt % 10000 == 0:
                log.info(str(float(cnt) / len(read_indices)))
            cnt += 1
            read = dset[index]
            # Store information about the read being considered
            # Keep info even if read doesn't contain a burst
            reads['zmw'][read_count] = read.holeNumber
            reads['qStart'][read_count] = read.qStart
            reads['qEnd'][read_count] = read.qEnd
            # NOTE(review): this first p2b assignment is dead — it is
            # immediately overwritten by the flatnonzero form below.
            p2b = fm.pls2base(fm.s2npl(read.peer.get_tag('pc')))
            # pulse-to-base map: indices of pulses that became basecalls
            p2b = np.flatnonzero(
                fm.pls2base(fm.s2npl(read.peer.get_tag('pc'))) >= 0)
            start_frames = read.peer.get_tag('sf')
            reads['startTime'][read_count] = start_frames[p2b[0]]
            reads['endTime'][read_count] = start_frames[p2b[-1]]
            # NOTE(review): these assignments set the ENTIRE seqType
            # column, not just row read_count — for scraps every row ends
            # up with the last read's scrapType.  Probably meant
            # reads['seqType'][read_count]; confirm before relying on it.
            if dset_type == 'subreads':
                reads['seqType'] = 'H'
            elif dset_type == 'scraps':
                reads['seqType'] = read.scrapType
            read_count += 1
            # Consider read for bursts and record burst
            # information if they exist
            pe_reason = np.array(read.peer.get_tag('pe'))
            # convert short-frame exclusions that happen during bursts
            # into burst exclusions
            shorties = np.zeros((len(pe_reason), ), dtype=int)
            for j in np.arange(1, len(pe_reason)):
                if pe_reason[j] == 1 and pe_reason[j - 1] == 2:
                    pe_reason[j] = 2
                    shorties[j] = 1
            bursty_indices = np.flatnonzero(pe_reason == 2)
            # a gap > 1 between consecutive bursty pulse indices separates
            # two distinct bursts
            bursty_gaps = np.diff(bursty_indices)
            bursty_breaks = np.flatnonzero(bursty_gaps > 1)
            if bursty_indices.any():
                if len(bursts) <= burst_count + len(bursty_breaks) + 1:
                    # resize the bursts table
                    bursts = self._resize_array(
                        bursts, burst_count + len(bursty_breaks) + 1,
                        len(self.zmws) * 10)
                bursts['zmw'][burst_count] = read.holeNumber
                # NOTE(review): same whole-column assignment issue as the
                # reads table above (bursts['seqType'] vs
                # bursts['seqType'][burst_count]).
                if dset_type == 'subreads':
                    bursts['seqType'] = 'H'
                elif dset_type == 'scraps':
                    bursts['seqType'] = read.scrapType
                else:
                    raise IOError(
                        'dset type must be either subreads or scraps')
                bursts['qStart'][burst_count] = read.qStart
                bursts['qEnd'][burst_count] = read.qEnd
                start_frames = read.peer.get_tag('sf')
                p2b = fm.pls2base(fm.s2npl(read.peer.get_tag('pc')))
                bursts['burstStart'][burst_count] = bursty_indices[0]
                # walk backwards to find the most recent pulse that was
                # actually called as a base before the burst started
                j = bursty_indices[0] - 1
                previous_base_index = p2b[j]
                while (previous_base_index < 0) and (j >= 0):
                    j -= 1
                    previous_base_index = p2b[j]
                try:
                    bursts['previousBaseIndex'][
                        burst_count] = previous_base_index
                    bursts['previousBasecall'][burst_count] = read.read(
                        aligned=False)[previous_base_index]
                except IndexError:
                    # catch reads where there are no previous basecalls
                    bursts['previousBaseIndex'][burst_count] = -1
                    bursts['previousBasecall'][burst_count] = 'Z'
                if bursty_breaks.any():
                    # This uses sandwich logic. Store the start info for the
                    # first burst. If there are additional bursts, scan through
                    # and store all the info for those.
                    # Finally, store the burst end info of the last burst
                    # If there was a single burst, the for loop would be skipped
                    # altogether.
                    for bursty_break in bursty_breaks:
                        j = bursty_indices[bursty_break]
                        bursts['burstLength'][burst_count] = j - bursts[
                            'burstStart'][burst_count] + 1
                        bursts['burstStartTime'][burst_count] = start_frames[
                            bursts['burstStart'][burst_count]]
                        bursts['burstEndTime'][burst_count] = start_frames[(
                            bursts['burstStart'][burst_count] +
                            bursts['burstLength'][burst_count])]
                        bs = bursts['burstStart'][burst_count]
                        be = (bursts['burstStart'][burst_count] +
                              bursts['burstLength'][burst_count])
                        bursts['numShorties'][burst_count] = np.sum(
                            shorties[bs:be])
                        burstcalls = list(read.peer.get_tag('pc')[bs:be])
                        for base in bases:
                            # NOTE: string.upper() is Python-2-only; lower
                            # case = rejected pulses, upper case = basecalls
                            f1 = np.divide(
                                len(
                                    np.flatnonzero(
                                        np.array(burstcalls, 'S') == base)),
                                len(burstcalls),
                                dtype=float)  # include rejected bases (pulses)
                            f2 = np.divide(len(
                                np.flatnonzero(
                                    np.array(burstcalls, 'S') == string.upper(
                                        base))),
                                           len(burstcalls),
                                           dtype=float)  # include basecalls
                            bursts['fraction' +
                                   string.upper(base)][burst_count] = f1 + f2
                        burst_count += 1
                        # open the next burst row (start of the sandwich)
                        bursts['zmw'][burst_count] = read.holeNumber
                        bursts['qStart'][burst_count] = read.qStart
                        bursts['qEnd'][burst_count] = read.qEnd
                        next_index = bursty_indices[bursty_break + 1]
                        bursts['burstStart'][burst_count] = next_index
                        j = next_index - 1
                        previous_base_index = p2b[j]
                        while (previous_base_index < 0) and (j >= 0):
                            j -= 1
                            previous_base_index = p2b[j]
                        bursts['previousBaseIndex'][
                            burst_count] = previous_base_index
                        bursts['previousBasecall'][burst_count] = read.read(
                            aligned=False)[previous_base_index]
                # close out the last (or only) burst
                # NOTE(review): unlike the loop above this length has no
                # "+ 1" — possible off-by-one; confirm intended semantics.
                bursts['burstLength'][burst_count] = (
                    bursty_indices[-1] - bursts['burstStart'][burst_count])
                bursts['burstStartTime'][burst_count] = start_frames[
                    bursts['burstStart'][burst_count]]
                bursts['burstEndTime'][burst_count] = start_frames[(
                    bursts['burstStart'][burst_count] +
                    bursts['burstLength'][burst_count])]
                bs = bursts['burstStart'][burst_count]
                be = bursts['burstStart'][burst_count] + bursts['burstLength'][
                    burst_count]
                bursts['numShorties'][burst_count] = np.sum(shorties[bs:be])
                burstcalls = list(read.peer.get_tag('pc')[bs:be])
                for base in bases:
                    f1 = np.divide(
                        len(np.flatnonzero(np.array(burstcalls, 'S') == base)),
                        len(burstcalls),
                        dtype=float)  # include rejected bases (pulses)
                    f2 = np.divide(len(
                        np.flatnonzero(
                            np.array(burstcalls, 'S') == string.upper(base))),
                                   len(burstcalls),
                                   dtype=float)  # include basecalls
                    bursts['fraction' +
                           string.upper(base)][burst_count] = f1 + f2
                burst_count += 1
        # remove the empty rows
        # NOTE(review): this also drops any rows for a genuine hole
        # number 0 — verify that ZMW 0 cannot occur here.
        bursts = bursts[bursts['zmw'] != 0]
        # convert frame counts to minutes; NOTE(review): the destination
        # columns are int dtype, so the float result is truncated on store.
        bursts['burstStartTime'] = np.divide(bursts['burstStartTime'],
                                             self.framerate * 60,
                                             dtype=float)
        bursts['burstEndTime'] = np.divide(bursts['burstEndTime'],
                                           self.framerate * 60,
                                           dtype=float)
        reads = reads[reads['zmw'] != 0]
        reads['startTime'] = np.divide(reads['startTime'],
                                       self.framerate * 60,
                                       dtype=float)
        reads['endTime'] = np.divide(reads['endTime'],
                                     self.framerate * 60,
                                     dtype=float)
        return bursts, reads