class _BamReaderBase(object):
    """
    The BamReader class provides a high-level interface to PacBio BAM files.
    If a PacBio BAM index (bam.pbi file) is present and the user instantiates
    the BamReader using the reference FASTA as the second argument, the
    BamReader will provide an interface compatible with CmpH5Reader.
    """

    # Class-level default so that `moviesAttached` can be queried before
    # `attach` has been called (it used to raise AttributeError).
    basH5Collection = None

    def _loadReferenceInfo(self):
        """
        Build `_referenceInfoTable` (one row per @SQ header record) and
        `_referenceDict`, which maps both the numeric reference id and the
        reference name to the corresponding table row.
        """
        refRecords = self.peer.header["SQ"]
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        # A real list, not `map(...)`: the ids are consumed several times
        # below, which would silently yield nothing for a one-shot iterator.
        refIds = [self.peer.gettid(name) for name in refNames]
        nRefs = len(refRecords)
        rows = list(zip(refIds, refIds, refNames, refNames,
                        refLengths, refMD5s,
                        np.zeros(nRefs, dtype=np.uint32),
                        np.zeros(nRefs, dtype=np.uint32)))
        self._referenceInfoTable = np.rec.fromrecords(
            rows,
            dtype=[('ID', '<i8'),
                   ('RefInfoID', '<i8'),
                   ('Name', 'O'),
                   ('FullName', 'O'),
                   ('Length', '<i8'),
                   ('MD5', 'O'),
                   ('StartRow', '<u4'),
                   ('EndRow', '<u4')])
        self._referenceDict = {}
        self._referenceDict.update(zip(refIds, self._referenceInfoTable))
        self._referenceDict.update(zip(refNames, self._referenceInfoTable))

    def _loadReadGroupInfo(self):
        """
        Build `_readGroupTable` / `_readGroupDict` from the @RG header
        records, and record in `_pulseFeaturesAvailable` the pulse feature
        tags that are present in the DS field of *every* read group.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            # Regarding RG ID: BLASR currently outputs a hex digest of
            # 10 nibbles, instead of the 8 which would fit into a
            # 32-bit word.  So we truncate here for the purposes of
            # cross-referencing within this API and the PacBioBamIndex
            # API.  We do check for a collision below.
            rgID = int(rg["ID"][:8], 16)
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value containing "=" is kept intact.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            triple = (ds["BINDINGKIT"], ds["SEQUENCINGKIT"],
                      ds["SOFTWAREVERSION"])
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem))
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys())

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID", np.uint32),
                   ("MovieName", "O"),
                   ("ReadType", "O"),
                   ("SequencingChemistry", "O")])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}
        self._pulseFeaturesAvailable = pulseFeaturesInAll_

    def _loadProgramInfo(self):
        """
        Build `_programTable` (ID, Version, CommandLine) from the @PG header
        records; a missing VN or CL tag is stored as None.
        """
        # TODO: guarantee that these fields are nonoptional in our bams
        #       --- check with Marcus
        # TODO: are we interested in the PP info?
        self._programTable = np.rec.fromrecords(
            [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
             for pg in self.peer.header["PG"]],
            dtype=[("ID", "O"),
                   ("Version", "O"),
                   ("CommandLine", "O")])

    def _loadReferenceFasta(self, referenceFastaFname):
        """
        Open the reference FASTA and store it as `referenceFasta`.

        Raises ReferenceMismatch if some (name, length) pair from the BAM
        reference table does not occur in the FASTA.
        """
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, c.length) for c in ft)
        bamIdsAndLens = set((c.Name, c.Length)
                            for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch(
                "FASTA file must contain superset of reference contigs in BAM")
        self.referenceFasta = ft

    def __init__(self, fname, referenceFastaFname=None):
        """
        Open the BAM file `fname` and load its header tables.

        If `referenceFastaFname` is given, the FASTA is loaded and checked
        against the BAM reference table.  Raises ValueError when the BAM
        file has no BAM index.
        """
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb")
        # Check for sortedness, index.
        # There doesn't seem to be a "public" way to do this right
        # now, but that's fine because we're going to have to rewrite
        # it all anyway once the pysam rewrite lands.
        if not self.peer._hasIndex:
            raise ValueError(
                "Specified bam file lacks a bam index---required for this API")

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        # NOTE(review): `index` is not defined in this class; presumably a
        # subclass supplies it --- confirm.
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    def attach(self, fofnFilename):
        """Attach the movie files listed in the FOFN `fofnFilename`."""
        self.basH5Collection = BasH5Collection(fofnFilename)

    @property
    def moviesAttached(self):
        return (self.basH5Collection is not None)

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    # TODO: change concept to readGroupTable in cmp.h5
    @property
    def movieInfoTable(self):
        raise Unimplemented()

    # TODO: change to read group accessor, this is semantically wrong now
    def movieInfo(self, movieId):
        raise Unimplemented()

    @property
    def movieNames(self):
        return set(mi.MovieName for mi in self.readGroupTable)

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroup(self, readGroupId):
        """Return the read group table row for the integer `readGroupId`."""
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    # TODO: elide "Info" in names?
    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    # TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    # TODO: Marcus needs to put something in the spec for this
    @property
    def version(self):
        raise Unimplemented()

    # TODO: Marcus needs to put something in the spec for this
    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return True

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    # TODO: make this private in cmp.h5 reader
    def alignmentGroup(self, alnGroupId):
        raise UnavailableFeature("BAM has no HDF5 groups")

    def referenceInfo(self, key):
        """Return the reference table row for a reference id or name."""
        return self._referenceDict[key]

    def atOffset(self, offset):
        """Seek the underlying file to `offset` and return the record there."""
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    def __repr__(self):
        return "<%s for %s>" % (type(self).__name__, self.filename)

    def __len__(self):
        return self.peer.mapped

    def close(self):
        """
        Release the underlying file handle(s).  Safe to call repeatedly.
        """
        # The handle opened in __init__ lives in `self.peer`; it was
        # previously never closed because only `self.file` was checked.
        peer = getattr(self, "peer", None)
        if peer is not None:
            peer.close()
            self.peer = None
        # Keep honouring a `file` attribute in case a subclass sets one.
        if getattr(self, "file", None) is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM files.
    If a PacBio BAM index (bam.pbi file) is present and the user instantiates
    the BamReader using the reference FASTA as the second argument, the
    BamReader will provide an interface compatible with CmpH5Reader.
    """

    def _loadReferenceInfo(self):
        """
        Build `_referenceInfoTable` (one row per @SQ header record) and
        `_referenceDict`, which maps both the numeric reference id and the
        reference name to the corresponding table row.  Both are set to
        None when the header has no reference records.
        """
        # The file is opened with check_sq=False, so tolerate a header
        # with no @SQ section at all, as is done for "PG" elsewhere.
        refRecords = self.peer.header.get("SQ", [])
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        # A real list, not `map(...)`: the ids are consumed several times
        # below, which would silently yield nothing for a one-shot iterator.
        refIds = [self.peer.gettid(name) for name in refNames]
        nRefs = len(refRecords)

        if nRefs > 0:
            rows = list(zip(refIds, refIds, refNames, refNames,
                            refLengths, refMD5s,
                            np.zeros(nRefs, dtype=np.uint32),
                            np.zeros(nRefs, dtype=np.uint32)))
            self._referenceInfoTable = np.rec.fromrecords(
                rows,
                dtype=[('ID', '<i8'),
                       ('RefInfoID', '<i8'),
                       ('Name', 'O'),
                       ('FullName', 'O'),
                       ('Length', '<i8'),
                       ('MD5', 'O'),
                       ('StartRow', '<u4'),
                       ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        """
        Build `_readGroupTable` / `_readGroupDict` from the @RG header
        records, and record in `_pulseFeaturesAvailable` the pulse feature
        tags that are present in the DS field of *every* read group.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value containing "=" is kept intact.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            # spec: we only consider first two components of basecaller
            # version in "chem" lookup
            basecallerVersion = ".".join(
                ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            # TODO(dalexander): need FRAMERATEHZ in RG::DS!
            # Until then every read group gets this hard-coded frame rate.
            rgFrameRate = 75.0
            readGroupTable_.append(
                (rgID, rgName, rgReadType, rgChem, rgFrameRate))
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys())

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID", np.int32),
                   ("MovieName", "O"),
                   ("ReadType", "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate", float)])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}
        self._pulseFeaturesAvailable = pulseFeaturesInAll_

    def _loadProgramInfo(self):
        """
        Build `_programTable` (ID, Version, CommandLine) from the @PG header
        records, or set it to None when there are none.  A missing VN or CL
        tag is stored as None.
        """
        pgRecords = [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
                     for pg in self.peer.header.get("PG", [])]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(
                pgRecords,
                dtype=[("ID", "O"),
                       ("Version", "O"),
                       ("CommandLine", "O")])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        """
        Open the reference FASTA and store it as `referenceFasta`.

        Raises ReferenceMismatch if some (name, length) pair from the BAM
        reference table does not occur in the FASTA.
        """
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens = set((c.Name, c.Length)
                            for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch(
                "FASTA file must contain superset of reference contigs in BAM")
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        """
        Raise IncompatibleFile unless the header carries a PacBio version.

        Only the *presence* of the version is checked here (by reading the
        `version` property); no version comparison is performed.
        """
        # Verify that this is a "pacbio" BAM file of version at least
        # 3.0b3
        try:
            _ = self.version
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are no longer converted to IncompatibleFile.
            raise IncompatibleFile(
                "This BAM file is incompatible with this API " +
                "(only PacBio BAM files version >= 3.0b3 are supported)")

    def __init__(self, fname, referenceFastaFname=None):
        """
        Open the BAM file `fname`, check compatibility and load its header
        tables.

        If `referenceFastaFname` is given, the FASTA is loaded and checked
        against the BAM reference table; passing it for an unmapped BAM
        raises ValueError.
        """
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError(
                    "Unmapped BAM file--reference FASTA should not be given as argument to BamReader")
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        # NOTE(review): `index` is not defined in this class; presumably a
        # subclass or ReaderBase supplies it --- confirm.
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not (self.isMapped)

    @property
    def isMapped(self):
        # True when the header lists at least one reference (@SQ) record.
        return len(self.peer.header.get("SQ", [])) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set(mi.MovieName for mi in self.readGroupTable)

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        """Return the read group table row for the integer `readGroupId`."""
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    # TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        """The value of the `pb` tag of the @HD header record."""
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        # The SO tag is optional in @HD; a header without it is reported
        # as not coordinate-sorted instead of raising KeyError.
        return self.peer.header["HD"].get("SO") == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        """Return the reference table row for a reference id or name."""
        return self._referenceDict[key]

    def atOffset(self, offset):
        """Seek the underlying file to `offset` and return the record there."""
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        """
        Release the underlying file handle(s).  Safe to call repeatedly.
        """
        # The handle opened in __init__ lives in `self.peer`; it was
        # previously never closed because only `self.file` was checked.
        peer = getattr(self, "peer", None)
        if peer is not None:
            peer.close()
            self.peer = None
        # Keep honouring a `file` attribute in case a subclass sets one.
        if getattr(self, "file", None) is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM files.
    If a PacBio BAM index (bam.pbi file) is present and the user instantiates
    the BamReader using the reference FASTA as the second argument, the
    BamReader will provide an interface compatible with CmpH5Reader.
    """

    def _loadReferenceInfo(self):
        """
        Build `_referenceInfoTable` (one row per @SQ header record) and
        `_referenceDict`, which maps both the numeric reference id and the
        reference name to the corresponding table row.  Both are set to
        None when the header has no reference records.
        """
        # The file is opened with check_sq=False, so tolerate a header
        # with no @SQ section at all, as is done for "PG" elsewhere.
        refRecords = self.peer.header.get("SQ", [])
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        # A real list, not `map(...)`: the ids are consumed several times
        # below, which would silently yield nothing for a one-shot iterator.
        refIds = [self.peer.gettid(name) for name in refNames]
        nRefs = len(refRecords)

        if nRefs > 0:
            rows = list(zip(refIds, refIds, refNames, refNames,
                            refLengths, refMD5s,
                            np.zeros(nRefs, dtype=np.uint32),
                            np.zeros(nRefs, dtype=np.uint32)))
            self._referenceInfoTable = np.rec.fromrecords(
                rows,
                dtype=[('ID', '<i8'),
                       ('RefInfoID', '<i8'),
                       ('Name', 'O'),
                       ('FullName', 'O'),
                       ('Length', '<i8'),
                       ('MD5', 'O'),
                       ('StartRow', '<u4'),
                       ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        """
        Build `_readGroupTable` / `_readGroupDict` from the @RG header
        records, and record in `_pulseFeaturesAvailable` the pulse feature
        tags that are present in the DS field of *every* read group.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value containing "=" is kept intact.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            # spec: we only consider first two components of basecaller
            # version in "chem" lookup
            basecallerVersion = ".".join(
                ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            # TODO(dalexander): need FRAMERATEHZ in RG::DS!
            # Until then every read group gets this hard-coded frame rate.
            rgFrameRate = 75.0
            readGroupTable_.append(
                (rgID, rgName, rgReadType, rgChem, rgFrameRate))
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys())

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID", np.int32),
                   ("MovieName", "O"),
                   ("ReadType", "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate", float)])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}
        self._pulseFeaturesAvailable = pulseFeaturesInAll_

    def _loadProgramInfo(self):
        """
        Build `_programTable` (ID, Version, CommandLine) from the @PG header
        records, or set it to None when there are none.  A missing VN or CL
        tag is stored as None.
        """
        pgRecords = [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
                     for pg in self.peer.header.get("PG", [])]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(
                pgRecords,
                dtype=[("ID", "O"),
                       ("Version", "O"),
                       ("CommandLine", "O")])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        """
        Open the reference FASTA and store it as `referenceFasta`.

        Raises ReferenceMismatch if some (name, length) pair from the BAM
        reference table does not occur in the FASTA.
        """
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens = set((c.Name, c.Length)
                            for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch(
                "FASTA file must contain superset of reference contigs in BAM")
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        """
        Raise IncompatibleFile unless the header carries a PacBio version.

        Only the *presence* of the version is checked here (by reading the
        `version` property); no version comparison is performed.
        """
        # Verify that this is a "pacbio" BAM file of version at least
        # 3.0b3
        try:
            _ = self.version
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are no longer converted to IncompatibleFile.
            raise IncompatibleFile(
                "This BAM file is incompatible with this API " +
                "(only PacBio BAM files version >= 3.0b3 are supported)")

    def __init__(self, fname, referenceFastaFname=None):
        """
        Open the BAM file `fname`, check compatibility and load its header
        tables.

        If `referenceFastaFname` is given, the FASTA is loaded and checked
        against the BAM reference table; passing it for an unmapped BAM
        raises ValueError.
        """
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError(
                    "Unmapped BAM file--reference FASTA should not be given as argument to BamReader")
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        # NOTE(review): `index` is not defined in this class; presumably a
        # subclass or ReaderBase supplies it --- confirm.
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not (self.isMapped)

    @property
    def isMapped(self):
        # True when the header lists at least one reference (@SQ) record.
        return len(self.peer.header.get("SQ", [])) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set(mi.MovieName for mi in self.readGroupTable)

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        """Return the read group table row for the integer `readGroupId`."""
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    # TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        """The value of the `pb` tag of the @HD header record."""
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        # The SO tag is optional in @HD; a header without it is reported
        # as not coordinate-sorted instead of raising KeyError.
        return self.peer.header["HD"].get("SO") == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        """Return the reference table row for a reference id or name."""
        return self._referenceDict[key]

    def atOffset(self, offset):
        """Seek the underlying file to `offset` and return the record there."""
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        """
        Release the underlying file handle(s).  Safe to call repeatedly.
        """
        # The handle opened in __init__ lives in `self.peer`; it was
        # previously never closed because only `self.file` was checked.
        peer = getattr(self, "peer", None)
        if peer is not None:
            peer.close()
            self.peer = None
        # Keep honouring a `file` attribute in case a subclass sets one.
        if getattr(self, "file", None) is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
class _BamReaderBase(object):
    """
    The BamReader class provides a high-level interface to PacBio BAM files.
    If a PacBio BAM index (bam.pbi file) is present and the user instantiates
    the BamReader using the reference FASTA as the second argument, the
    BamReader will provide an interface compatible with CmpH5Reader.
    """

    # Class-level default so that `moviesAttached` can be queried before
    # `attach` has been called (it used to raise AttributeError).
    basH5Collection = None

    def _loadReferenceInfo(self):
        """
        Build `_referenceInfoTable` (one row per @SQ header record) and
        `_referenceDict`, which maps both the numeric reference id and the
        reference name to the corresponding table row.
        """
        refRecords = self.peer.header["SQ"]
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        # A real list, not `map(...)`: the ids are consumed several times
        # below, which would silently yield nothing for a one-shot iterator.
        refIds = [self.peer.gettid(name) for name in refNames]
        nRefs = len(refRecords)
        rows = list(zip(refIds, refIds, refNames, refNames,
                        refLengths, refMD5s,
                        np.zeros(nRefs, dtype=np.uint32),
                        np.zeros(nRefs, dtype=np.uint32)))
        self._referenceInfoTable = np.rec.fromrecords(
            rows,
            dtype=[('ID', '<i8'),
                   ('RefInfoID', '<i8'),
                   ('Name', 'O'),
                   ('FullName', 'O'),
                   ('Length', '<i8'),
                   ('MD5', 'O'),
                   ('StartRow', '<u4'),
                   ('EndRow', '<u4')])
        self._referenceDict = {}
        self._referenceDict.update(zip(refIds, self._referenceInfoTable))
        self._referenceDict.update(zip(refNames, self._referenceInfoTable))

    def _loadReadGroupInfo(self):
        """
        Build `_readGroupTable` / `_readGroupDict` from the @RG header
        records, and record in `_pulseFeaturesAvailable` the pulse feature
        tags that are present in the DS field of *every* read group.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            # Regarding RG ID: BLASR currently outputs a hex digest of
            # 10 nibbles, instead of the 8 which would fit into a
            # 32-bit word.  So we truncate here for the purposes of
            # cross-referencing within this API and the PacBioBamIndex
            # API.  We do check for a collision below.
            rgID = int(rg["ID"][:8], 16)
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value containing "=" is kept intact.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            triple = (ds["BINDINGKIT"], ds["SEQUENCINGKIT"],
                      ds["SOFTWAREVERSION"])
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem))
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys())

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID", np.uint32),
                   ("MovieName", "O"),
                   ("ReadType", "O"),
                   ("SequencingChemistry", "O")])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}
        self._pulseFeaturesAvailable = pulseFeaturesInAll_

    def _loadProgramInfo(self):
        """
        Build `_programTable` (ID, Version, CommandLine) from the @PG header
        records; a missing VN or CL tag is stored as None.
        """
        # TODO: guarantee that these fields are nonoptional in our bams
        #       --- check with Marcus
        # TODO: are we interested in the PP info?
        self._programTable = np.rec.fromrecords(
            [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
             for pg in self.peer.header["PG"]],
            dtype=[("ID", "O"),
                   ("Version", "O"),
                   ("CommandLine", "O")])

    def _loadReferenceFasta(self, referenceFastaFname):
        """
        Open the reference FASTA and store it as `referenceFasta`.

        Raises ReferenceMismatch if some (name, length) pair from the BAM
        reference table does not occur in the FASTA.
        """
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, c.length) for c in ft)
        bamIdsAndLens = set((c.Name, c.Length)
                            for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch(
                "FASTA file must contain superset of reference contigs in BAM")
        self.referenceFasta = ft

    def __init__(self, fname, referenceFastaFname=None):
        """
        Open the BAM file `fname` and load its header tables.

        If `referenceFastaFname` is given, the FASTA is loaded and checked
        against the BAM reference table.  Raises ValueError when the BAM
        file has no BAM index.
        """
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb")
        # Check for sortedness, index.
        # There doesn't seem to be a "public" way to do this right
        # now, but that's fine because we're going to have to rewrite
        # it all anyway once the pysam rewrite lands.
        if not self.peer._hasIndex:
            raise ValueError(
                "Specified bam file lacks a bam index---required for this API")

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        # NOTE(review): `index` is not defined in this class; presumably a
        # subclass supplies it --- confirm.
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    def attach(self, fofnFilename):
        """Attach the movie files listed in the FOFN `fofnFilename`."""
        self.basH5Collection = BasH5Collection(fofnFilename)

    @property
    def moviesAttached(self):
        return (self.basH5Collection is not None)

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    # TODO: change concept to readGroupTable in cmp.h5
    @property
    def movieInfoTable(self):
        raise Unimplemented()

    # TODO: change to read group accessor, this is semantically wrong now
    def movieInfo(self, movieId):
        raise Unimplemented()

    @property
    def movieNames(self):
        return set(mi.MovieName for mi in self.readGroupTable)

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroup(self, readGroupId):
        """Return the read group table row for the integer `readGroupId`."""
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    # TODO: elide "Info" in names?
    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    # TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    # TODO: Marcus needs to put something in the spec for this
    @property
    def version(self):
        raise Unimplemented()

    # TODO: Marcus needs to put something in the spec for this
    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return True

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    # TODO: make this private in cmp.h5 reader
    def alignmentGroup(self, alnGroupId):
        raise UnavailableFeature("BAM has no HDF5 groups")

    def referenceInfo(self, key):
        """Return the reference table row for a reference id or name."""
        return self._referenceDict[key]

    def atOffset(self, offset):
        """Seek the underlying file to `offset` and return the record there."""
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    def __repr__(self):
        return "<%s for %s>" % (type(self).__name__, self.filename)

    def __len__(self):
        return self.peer.mapped

    def close(self):
        """
        Release the underlying file handle(s).  Safe to call repeatedly.
        """
        # The handle opened in __init__ lives in `self.peer`; it was
        # previously never closed because only `self.file` was checked.
        peer = getattr(self, "peer", None)
        if peer is not None:
            peer.close()
            self.peer = None
        # Keep honouring a `file` attribute in case a subclass sets one.
        if getattr(self, "file", None) is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()