class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        """
        Build the reference info table and the reference lookup dict
        from the "SQ" records of the BAM header.

        The lookup dict is keyed both by numeric reference ID and by
        reference name.  When the header lists no references, both the
        table and the dict are set to None.
        """
        refRecords = self.peer.header["SQ"]
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        # Materialize as a list: the IDs are consumed several times below.
        refIds = [self.peer.get_tid(name) for name in refNames]
        nRefs = len(refRecords)

        if nRefs > 0:
            # ID/RefInfoID and Name/FullName are deliberately duplicated;
            # StartRow/EndRow are filled with zeros.
            self._referenceInfoTable = np.rec.fromrecords(
                list(zip(refIds,
                         refIds,
                         refNames,
                         refNames,
                         refLengths,
                         refMD5s,
                         np.zeros(nRefs, dtype=np.uint32),
                         np.zeros(nRefs, dtype=np.uint32))),
                dtype=[('ID', '<i8'),
                       ('RefInfoID', '<i8'),
                       ('Name', 'O'),
                       ('FullName', 'O'),
                       ('Length', '<i8'),
                       ('MD5', 'O'),
                       ('StartRow', '<u4'),
                       ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        """
        Build the read group table, the read group lookup dict and the
        per-read-group feature name mappings from the "RG" header
        records.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        # RGID -> ("abstract feature name" -> actual feature name)
        self._featureNameMappings = {}

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # The DS tag is a ";"-separated list of KEY=VALUE pairs.  Split
            # on the first "=" only so a value containing "=" stays intact.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            rgFrameRate = ds["FRAMERATEHZ"]
            readGroupTable_.append(
                (rgID, rgName, rgReadType, rgChem, rgFrameRate))

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            featureNameMapping = {key.split(":")[0]: key
                                  for key in ds
                                  if key in PULSE_FEATURE_TAGS}
            self._featureNameMappings[rgID] = featureNameMapping

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID", np.int32),
                   ("MovieName", "O"),
                   ("ReadType", "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate", float)])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

        # The pulse features "available" to clients of this file are the intersection
        # of pulse features available from each read group.
        self._pulseFeaturesAvailable = set.intersection(
            *[set(mapping.keys())
              for mapping in self._featureNameMappings.values()])

    def _loadProgramInfo(self):
        """
        Build the program table from the "PG" header records, or set it
        to None when the header has no such records.
        """
        pgRecords = [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
                     for pg in self.peer.header.get("PG", [])]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(
                pgRecords,
                dtype=[("ID", "O"),
                       ("Version", "O"),
                       ("CommandLine", "O")])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        """
        Open the reference FASTA and attach it as `referenceFasta`.

        Raises ReferenceMismatch if any (name, length) pair in the BAM
        reference table is missing from the FASTA.
        """
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens = set((c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch(
                "FASTA file must contain superset of reference contigs in BAM")
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        """
        Verify that this is a "pacbio" BAM file of version at least
        3.0.1.

        Raises IncompatibleFile when the version tag is missing, is a
        beta version (contains "b"), is not of the form
        MAJOR.MINOR.PATCH with integer fields, or is older than 3.0.1.
        """
        try:
            checkedVersion = self.version
            if "b" in checkedVersion:
                raise ValueError("beta file versions are not supported")
            # Compare numerically: the raw fields are strings, and comparing
            # strings against ints never performs a real version check.
            major, minor, patch = [int(field)
                                   for field in checkedVersion.split('.')]
            compatible = (major, minor, patch) >= (3, 0, 1)
        except (KeyError, TypeError, ValueError, AttributeError):
            # Missing or malformed version information.
            compatible = False

        if not compatible:
            raise IncompatibleFile(
                "This BAM file is incompatible with this API " +
                "(only PacBio BAM files version >= 3.0.1 are supported)")

    def __init__(self, fname, referenceFastaFname=None):
        """
        Open the BAM file `fname` and load its header tables.

        If `referenceFastaFname` is given, the reference FASTA is loaded
        and checked against the BAM reference table; supplying it for an
        unmapped BAM raises ValueError.
        """
        self.filename = fname = abspath(expanduser(fname))
        # check_sq=False: do not have pysam insist on SQ header records.
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError(
                    "Unmapped BAM file--reference FASTA should not be given as argument to BamReader")
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        # NOTE(review): `self.index` is not assigned anywhere in this
        # class; presumably subclasses provide it -- confirm.
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not self.isMapped

    @property
    def isMapped(self):
        # A file counts as mapped when its header lists any reference.
        return len(self.peer.header["SQ"]) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        """
        Return the read group table row for the given integer read
        group ID.
        """
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        """The value of the "pb" field of the "HD" header record."""
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return self.peer.header["HD"]["SO"] == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        """
        Return the reference table row for `key`, which may be either a
        reference ID or a reference name.
        """
        return self._referenceDict[key]

    def atOffset(self, offset):
        """
        Seek the underlying file to `offset` and return the next record
        wrapped as a BamAlignment.
        """
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        """
        Release the file handles held by this reader.
        """
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None
        # The BAM itself is held in `peer`, which __init__ opens; close it
        # too so the underlying handle is not leaked.
        peer = getattr(self, "peer", None)
        if peer is not None:
            peer.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        """
        Build the reference info table and the reference lookup dict
        from the "SQ" records of the BAM header.

        The lookup dict is keyed both by numeric reference ID and by
        reference name.  When the header lists no references, both the
        table and the dict are set to None.
        """
        refRecords = self.peer.header["SQ"]
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        # Materialize as a list: the IDs are consumed several times below.
        refIds = [self.peer.get_tid(name) for name in refNames]
        nRefs = len(refRecords)

        if nRefs > 0:
            # ID/RefInfoID and Name/FullName are deliberately duplicated;
            # StartRow/EndRow are filled with zeros.
            self._referenceInfoTable = np.rec.fromrecords(
                list(zip(refIds,
                         refIds,
                         refNames,
                         refNames,
                         refLengths,
                         refMD5s,
                         np.zeros(nRefs, dtype=np.uint32),
                         np.zeros(nRefs, dtype=np.uint32))),
                dtype=[('ID', '<i8'),
                       ('RefInfoID', '<i8'),
                       ('Name', 'O'),
                       ('FullName', 'O'),
                       ('Length', '<i8'),
                       ('MD5', 'O'),
                       ('StartRow', '<u4'),
                       ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        """
        Build the read group table, the read group lookup dict and the
        per-read-group base/pulse feature name mappings from the "RG"
        header records.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []

        # RGID -> ("abstract feature name" -> actual feature name)
        self._baseFeatureNameMappings = {}
        self._pulseFeatureNameMappings = {}

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # The DS tag is a ";"-separated list of KEY=VALUE pairs.  Split
            # on the first "=" only so a value containing "=" stays intact.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(
                ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            rgFrameRate = ds["FRAMERATEHZ"]

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            baseFeatureNameMapping = {key.split(":")[0]: key
                                      for key in ds
                                      if key in BASE_FEATURE_TAGS}
            pulseFeatureNameMapping = {key.split(":")[0]: key
                                       for key in ds
                                       if key in PULSE_FEATURE_TAGS}
            self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping
            self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping

            # Iterating the dict yields its keys (no Python 2-only iterkeys).
            readGroupTable_.append(
                (rgID, rgName, rgReadType, rgChem, rgFrameRate,
                 frozenset(baseFeatureNameMapping)))

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID", np.int32),
                   ("MovieName", "O"),
                   ("ReadType", "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate", float),
                   ("BaseFeatures", "O")])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

        # The base/pulse features "available" to clients of this file are the intersection
        # of features available from each read group.
        self._baseFeaturesAvailable = set.intersection(
            *[set(mapping.keys())
              for mapping in self._baseFeatureNameMappings.values()])
        self._pulseFeaturesAvailable = set.intersection(
            *[set(mapping.keys())
              for mapping in self._pulseFeatureNameMappings.values()])

    def _loadProgramInfo(self):
        """
        Build the program table from the "PG" header records, or set it
        to None when the header has no such records.
        """
        pgRecords = [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
                     for pg in self.peer.header.get("PG", [])]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(
                pgRecords,
                dtype=[("ID", "O"),
                       ("Version", "O"),
                       ("CommandLine", "O")])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        """
        Open the reference FASTA and attach it as `referenceFasta`.

        Raises ReferenceMismatch if any (name, length) pair in the BAM
        reference table is missing from the FASTA.
        """
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens = set(
            (c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch(
                "FASTA file must contain superset of reference contigs in BAM")
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        """
        Verify that this is a "pacbio" BAM file of version at least
        3.0.1.

        Raises IncompatibleFile when the version is a beta version
        (contains "b"), is not of the form MAJOR.MINOR.PATCH with
        integer fields, or is older than 3.0.1.  A header lacking the
        version field still surfaces as the lookup error raised by
        `version`.
        """
        badVersionException = IncompatibleFile(
            "This BAM file is incompatible with this API " +
            "(only PacBio BAM files version >= 3.0.1 are supported)")

        checkedVersion = self.version
        if "b" in checkedVersion:
            raise badVersionException

        try:
            # Compare numerically: a tuple of strings compared against a
            # tuple of ints never performs a real version check.
            major, minor, patch = [int(field)
                                   for field in checkedVersion.split('.')]
        except ValueError:
            # Wrong number of fields, or a non-integer field.
            raise badVersionException

        if (major, minor, patch) < (3, 0, 1):
            raise badVersionException

    def __init__(self, fname, referenceFastaFname=None):
        """
        Open the BAM file `fname` and load its header tables.

        If `referenceFastaFname` is given, the reference FASTA is loaded
        and checked against the BAM reference table; supplying it for an
        unmapped BAM raises ValueError.
        """
        self.filename = fname = abspath(expanduser(fname))
        # check_sq=False: do not have pysam insist on SQ header records.
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError(
                    "Unmapped BAM file--reference FASTA should not be given as argument to BamReader")
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        # NOTE(review): `self.index` is not assigned anywhere in this
        # class; presumably subclasses provide it -- confirm.
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not self.isMapped

    @property
    def isMapped(self):
        # A file counts as mapped when its header lists any reference.
        return len(self.peer.header["SQ"]) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        """
        Return the read group table row for the given integer read
        group ID.
        """
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        """The value of the "pb" field of the "HD" header record."""
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return self.peer.header["HD"]["SO"] == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        """
        Return the reference table row for `key`, which may be either a
        reference ID or a reference name.
        """
        return self._referenceDict[key]

    def atOffset(self, offset):
        """
        Seek the underlying file to `offset` and return the next record
        wrapped as a BamAlignment.
        """
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasBaseFeature(self, featureName):
        return featureName in self._baseFeaturesAvailable

    def baseFeaturesAvailable(self):
        return self._baseFeaturesAvailable

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    def hasPulseFeatures(self):
        """
        Is this BAM file a product of running analysis with the
        PacBio-internal analysis mode enabled?
        """
        return self.hasPulseFeature("PulseCall")

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        """
        Release the file handles held by this reader.
        """
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None
        # The BAM itself is held in `peer`, which __init__ opens; close it
        # too so the underlying handle is not leaked.
        peer = getattr(self, "peer", None)
        if peer is not None:
            peer.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()