示例#1
0
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        refRecords = self.peer.header["SQ"]
        refNames   = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s    = [r["M5"] for r in refRecords]
        refIds = map(self.peer.get_tid, refNames)
        nRefs = len(refRecords)

        if nRefs > 0:
            self._referenceInfoTable = np.rec.fromrecords(zip(
                refIds,
                refIds,
                refNames,
                refNames,
                refLengths,
                refMD5s,
                np.zeros(nRefs, dtype=np.uint32),
                np.zeros(nRefs, dtype=np.uint32)),
                dtype=[('ID', '<i8'), ('RefInfoID', '<i8'),
                       ('Name', 'O'), ('FullName', 'O'),
                       ('Length', '<i8'), ('MD5', 'O'),
                       ('StartRow', '<u4'), ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        self._featureNameMappings = {}  # RGID -> ("abstract feature name" -> actual feature name)

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""])
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            rgFrameRate = ds["FRAMERATEHZ"]
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            featureNameMapping = { key.split(":")[0] : key
                                   for key in ds.keys()
                                   if key in PULSE_FEATURE_TAGS }
            self._featureNameMappings[rgID] = featureNameMapping

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.int32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate",           float)])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        # The pulse features "available" to clients of this file are the intersection
        # of pulse features available from each read group.
        self._pulseFeaturesAvailable = set.intersection(
            *[set(mapping.keys()) for mapping in self._featureNameMappings.values()])

    def _loadProgramInfo(self):
        pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None))
                      for pg in self.peer.header.get("PG", []) ]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(
                pgRecords,
                dtype=[("ID"     ,     "O"),
                       ("Version",     "O"),
                       ("CommandLine", "O")])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens   = set((c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM"
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        # Verify that this is a "pacbio" BAM file of version at least
        # 3.0.1
        try:
            checkedVersion = self.version
            if "b" in checkedVersion:
                raise Exception()
            else:
                major, minor, patch = checkedVersion.split('.')
                assert major >= 3
                assert minor >= 0
                assert patch >= 1
        except:
            raise IncompatibleFile(
                "This BAM file is incompatible with this API " +
                "(only PacBio BAM files version >= 3.0.1 are supported)")

    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader"
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not(self.isMapped)

    @property
    def isMapped(self):
        return len(self.peer.header["SQ"]) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return self.peer.header["HD"]["SO"] == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        return self._referenceDict[key]

    def atOffset(self, offset):
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
示例#2
0
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        refRecords = self.peer.header["SQ"]
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        refIds = map(self.peer.get_tid, refNames)
        nRefs = len(refRecords)

        if nRefs > 0:
            self._referenceInfoTable = np.rec.fromrecords(
                zip(refIds, refIds, refNames, refNames, refLengths, refMD5s,
                    np.zeros(nRefs, dtype=np.uint32),
                    np.zeros(nRefs, dtype=np.uint32)),
                dtype=[('ID', '<i8'), ('RefInfoID', '<i8'), ('Name', 'O'),
                       ('FullName', 'O'), ('Length', '<i8'), ('MD5', 'O'),
                       ('StartRow', '<u4'), ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        rgs = self.peer.header["RG"]
        readGroupTable_ = []

        # RGID -> ("abstract feature name" -> actual feature name)
        self._baseFeatureNameMappings = {}
        self._pulseFeatureNameMappings = {}

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            ds = dict([
                pair.split("=") for pair in rg["DS"].split(";") if pair != ""
            ])
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(
                ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            rgFrameRate = ds["FRAMERATEHZ"]

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            baseFeatureNameMapping = {
                key.split(":")[0]: key
                for key in ds.keys() if key in BASE_FEATURE_TAGS
            }
            pulseFeatureNameMapping = {
                key.split(":")[0]: key
                for key in ds.keys() if key in PULSE_FEATURE_TAGS
            }
            self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping
            self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping

            readGroupTable_.append(
                (rgID, rgName, rgReadType, rgChem, rgFrameRate,
                 frozenset(baseFeatureNameMapping.iterkeys())))

        self._readGroupTable = np.rec.fromrecords(readGroupTable_,
                                                  dtype=[
                                                      ("ID", np.int32),
                                                      ("MovieName", "O"),
                                                      ("ReadType", "O"),
                                                      ("SequencingChemistry",
                                                       "O"),
                                                      ("FrameRate", float),
                                                      ("BaseFeatures", "O")
                                                  ])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

        # The base/pulse features "available" to clients of this file are the intersection
        # of features available from each read group.
        self._baseFeaturesAvailable = set.intersection(*[
            set(mapping.keys())
            for mapping in self._baseFeatureNameMappings.values()
        ])
        self._pulseFeaturesAvailable = set.intersection(*[
            set(mapping.keys())
            for mapping in self._pulseFeatureNameMappings.values()
        ])

    def _loadProgramInfo(self):
        pgRecords = [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
                     for pg in self.peer.header.get("PG", [])]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(pgRecords,
                                                    dtype=[("ID", "O"),
                                                           ("Version", "O"),
                                                           ("CommandLine", "O")
                                                           ])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens = set(
            (c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM"
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        # Verify that this is a "pacbio" BAM file of version at least
        # 3.0.1
        badVersionException = IncompatibleFile(
            "This BAM file is incompatible with this API " +
            "(only PacBio BAM files version >= 3.0.1 are supported)")
        checkedVersion = self.version
        if "b" in checkedVersion:
            raise badVersionException
        else:
            major, minor, patch = checkedVersion.split('.')
            if not (major, minor, patch) >= (3, 0, 1):
                raise badVersionException

    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader"
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not (self.isMapped)

    @property
    def isMapped(self):
        return len(self.peer.header["SQ"]) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return self.peer.header["HD"]["SO"] == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        return self._referenceDict[key]

    def atOffset(self, offset):
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasBaseFeature(self, featureName):
        return featureName in self._baseFeaturesAvailable

    def baseFeaturesAvailable(self):
        return self._baseFeaturesAvailable

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    def hasPulseFeatures(self):
        """
        Is this BAM file a product of running analysis with the
        PacBio-internal analysis mode enabled?
        """
        return self.hasPulseFeature("PulseCall")

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()