Пример #1
0
    def _loadReferenceInfo(self):
        """
        Build the reference lookup tables from the open cmp.h5 file.

        Reads the /RefGroup and /RefInfo datasets from ``self.file`` and
        joins them on "RefInfoID" into ``self._referenceInfoTable``.  When
        ``self.isSorted`` is true, /RefGroup/OffsetTable is additionally
        joined in on "ID".

        Afterwards:

        - ``self._referenceDict`` maps each reference's short name (first
          element returned by ``splitFastaHeader(FullName)``), ID, Name,
          FullName and MD5 to its table record;
        - ``self._readLocatorByKey`` maps ID and short name to the object
          returned by ``makeReadLocator`` (populated only when
          ``self.isSorted`` is true).

        Records whose ID is -1 are skipped.

        Raises ValueError if any key of a record is already present in
        ``self._referenceDict``.
        """
        # np.rec.fromrecords needs a real sequence: on Python 3 ``zip``
        # returns an iterator, so materialize it.  ``[:]`` reads each
        # dataset in one go instead of element by element.
        _referenceGroupTbl = np.rec.fromrecords(
            list(zip(self.file["/RefGroup/ID"][:],
                     self.file["/RefGroup/RefInfoID"][:],
                     # Drop the first character of each group path
                     # (presumably the leading "/" -- confirm against writer).
                     [path[1:] for path in self.file["/RefGroup/Path"]])),
            dtype=[("ID"       , int),
                   ("RefInfoID", int),
                   ("Name"     , object)])

        _referenceInfoTbl = np.rec.fromrecords(
            list(zip(self.file["/RefInfo/ID"][:],
                     self.file["/RefInfo/FullName"][:],
                     self.file["/RefInfo/Length"][:],
                     self.file["/RefInfo/MD5"][:])),
            dtype=[("RefInfoID", int),
                   ("FullName" , object),
                   ("Length"   , int),
                   ("MD5"      , object)])

        self._referenceInfoTable = \
            rec_join("RefInfoID", _referenceGroupTbl, _referenceInfoTbl, jointype="inner")

        if self.isSorted:
            # ``dataset[()]`` reads the whole dataset; it replaces the
            # deprecated ``Dataset.value`` attribute.
            _offsetTable = (self.file["/RefGroup/OffsetTable"][()]
                            .view(dtype=OFFSET_TABLE_DTYPE)
                            .view(np.recarray)
                            .flatten())
            self._referenceInfoTable = rec_join("ID",
                                                self._referenceInfoTable,
                                                _offsetTable,
                                                jointype="inner")
        self._referenceDict = {}
        self._readLocatorByKey = {}
        for record in self._referenceInfoTable:
            if record.ID == -1:
                continue

            assert record.ID != record.Name
            shortName = splitFastaHeader(record.FullName)[0]
            keys = (shortName,
                    record.ID,
                    record.Name,
                    record.FullName,
                    record.MD5)

            # All keys are checked before any is inserted, so keys that
            # coincide within one record (e.g. shortName == Name) are fine.
            if any(key in self._referenceDict for key in keys):
                raise ValueError(
                    "Duplicate reference contig sequence or identifier")
            for key in keys:
                self._referenceDict[key] = record

            if self.isSorted:
                readLocator = makeReadLocator(self, record.ID)
                self._readLocatorByKey[record.ID] = readLocator
                self._readLocatorByKey[shortName] = readLocator
Пример #2
0
    def _loadReferenceInfo(self):
        """
        Build the reference lookup tables from the open cmp.h5 file.

        Reads the /RefGroup and /RefInfo datasets from ``self.file`` and
        joins them on "RefInfoID" into ``self._referenceInfoTable``.  When
        ``self.isSorted`` is true, /RefGroup/OffsetTable is additionally
        joined in on "ID".

        Afterwards:

        - ``self._referenceDict`` maps each reference's short name (first
          element returned by ``splitFastaHeader(FullName)``), ID, Name,
          FullName and MD5 to its table record;
        - ``self._readLocatorByKey`` maps ID and short name to the object
          returned by ``makeReadLocator`` (populated only when
          ``self.isSorted`` is true).

        Records whose ID is -1 are skipped.

        Raises ValueError if any key of a record is already present in
        ``self._referenceDict``.
        """
        # np.rec.fromrecords needs a real sequence: on Python 3 ``zip``
        # returns an iterator, so materialize it.
        _referenceGroupTbl = np.rec.fromrecords(
            list(zip(self.file["/RefGroup/ID"][:],
                     self.file["/RefGroup/RefInfoID"][:],
                     # Drop the first character of each group path
                     # (presumably the leading "/" -- confirm against writer).
                     [path[1:] for path in self.file["/RefGroup/Path"]])),
            dtype=[("ID"       , int),
                   ("RefInfoID", int),
                   ("Name"     , object)])

        _referenceInfoTbl = np.rec.fromrecords(
            list(zip(self.file["/RefInfo/ID"][:],
                     self.file["/RefInfo/FullName"][:],
                     self.file["/RefInfo/Length"][:],
                     self.file["/RefInfo/MD5"][:])),
            dtype=[("RefInfoID", int),
                   ("FullName" , object),
                   ("Length"   , int),
                   ("MD5"      , object)])

        self._referenceInfoTable = \
            rec_join("RefInfoID", _referenceGroupTbl, _referenceInfoTbl, jointype="inner")

        if self.isSorted:
            # ``dataset[()]`` reads the whole dataset; it replaces the
            # deprecated ``Dataset.value`` attribute.
            _offsetTable = (self.file["/RefGroup/OffsetTable"][()]
                            .view(dtype=OFFSET_TABLE_DTYPE)
                            .view(np.recarray)
                            .flatten())
            self._referenceInfoTable = rec_join("ID",
                                                self._referenceInfoTable,
                                                _offsetTable,
                                                jointype="inner")
        self._referenceDict = {}
        self._readLocatorByKey = {}

        # For cmp.h5 files with large numbers of references, accessing
        # the recarray fields in the inner loop was terribly slow.
        # This makes things faster, though the code is less
        # straightforward.  (One of the tradeoffs we have to make
        # without a compiler to help us...)
        recordID       = self._referenceInfoTable.ID
        recordName     = self._referenceInfoTable.Name
        recordFullName = self._referenceInfoTable.FullName
        recordMD5      = self._referenceInfoTable.MD5

        for i, record in enumerate(self._referenceInfoTable):
            refID = recordID[i]
            if refID == -1:
                continue

            # Read every field through the hoisted columns; per-record
            # attribute access is the slow path the comment above avoids.
            refName     = recordName[i]
            refFullName = recordFullName[i]
            assert refID != refName
            shortName = splitFastaHeader(refFullName)[0]
            keys = (shortName, refID, refName, refFullName, recordMD5[i])

            # All keys are checked before any is inserted, so keys that
            # coincide within one record (e.g. shortName == Name) are fine.
            if any(key in self._referenceDict for key in keys):
                raise ValueError(
                    "Duplicate reference contig sequence or identifier")
            for key in keys:
                self._referenceDict[key] = record

            if self.isSorted:
                readLocator = makeReadLocator(self, refID)
                self._readLocatorByKey[refID] = readLocator
                self._readLocatorByKey[shortName] = readLocator
Пример #3
0
    def __init__(self, filenameOrH5File):
        """
        Open a cmp.h5 file and load its index tables into memory.

        `filenameOrH5File` is either a path to a cmp.h5 file (``~`` is
        expanded and the path is made absolute) or an already-open
        ``h5py.File``, which must have been opened with mode "r".

        Attributes set here include ``filename``, ``file``,
        ``_alignmentIndex``, ``_alignmentGroupById``, ``_movieInfoTable``,
        ``_movieDict``, ``_referenceInfoTable``, ``_referenceDict`` and
        ``_readLocatorById``.  ``numPasses``, ``zScore`` and the barcode
        tables are set only when the corresponding datasets exist under
        /AlnInfo.

        Raises ValueError if a supplied ``h5py.File`` is not read-only, if
        /AlnInfo/AlnIndex is empty, or if two references share an
        identifier.  Raises IOError if opening the path raises IOError.
        """
        if isinstance(filenameOrH5File, h5py.File):
            if filenameOrH5File.mode != "r":
                raise ValueError(
                    "HDF5 files used by CmpH5Reader must be opened read-only!")
            self.filename = filenameOrH5File.filename
            self.file = filenameOrH5File
        else:
            try:
                self.filename = abspath(expanduser(filenameOrH5File))
                self.file = h5py.File(self.filename, "r")
            except IOError:
                raise IOError("Invalid or nonexistent cmp.h5 file %s" %
                              filenameOrH5File)

        if len(self.file["/AlnInfo/AlnIndex"]) == 0:
            raise ValueError("Empty cmp.h5 file, cannot be read by CmpH5Reader")
        # ``dataset[()]`` reads the whole dataset; it replaces the
        # deprecated ``Dataset.value`` attribute.
        rawAlignmentIndex = self.file["/AlnInfo/AlnIndex"][()]
        self._alignmentIndex = (rawAlignmentIndex
                                .view(dtype=ALIGNMENT_INDEX_DTYPE)
                                .view(np.recarray)
                                .flatten())

        # This is the only sneaky part of this whole class.  We do not
        # store the raw h5py group object; rather we cache a dict of {
        # dataset_name -> dataset }.  This way we avoid B-tree
        # scanning in basic data access.
        self._alignmentGroupById = {}
        for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"],
                                              self.file["/AlnGroup/Path"]):
            alnGroup = self.file[alnGroupPath]
            self._alignmentGroupById[alnGroupId] = dict(alnGroup.items())

        numMovies = len(self.file["/MovieInfo/ID"])

        if "FrameRate" in self.file["/MovieInfo"]:
            frameRate = self.file["/MovieInfo/FrameRate"][()]
            timeScale = 1.0 / frameRate
        else:
            # No frame rate recorded: NaN frame rate, unit time scale.
            frameRate = [np.nan] * numMovies
            timeScale = [1.0] * numMovies

        # np.rec.fromrecords needs a real sequence: on Python 3 ``zip``
        # returns an iterator, so materialize it.
        self._movieInfoTable = np.rec.fromrecords(
            list(zip(self.file["/MovieInfo/ID"],
                     self.file["/MovieInfo/Name"],
                     frameRate,
                     timeScale)),
            dtype=[("ID", int),
                   ("Name", object),
                   ("FrameRate", float),
                   ("TimeScale", float)])

        self._movieDict = {}
        for record in self._movieInfoTable:
            assert record.ID not in self._movieDict
            self._movieDict[record.ID] = record

        _referenceGroupTbl = np.rec.fromrecords(
            list(zip(self.file["/RefGroup/ID"],
                     self.file["/RefGroup/RefInfoID"],
                     # Drop the first character of each group path
                     # (presumably the leading "/" -- confirm against writer).
                     [path[1:] for path in self.file["/RefGroup/Path"]])),
            dtype=[("ID", int),
                   ("RefInfoID", int),
                   ("Name", object)])

        _referenceInfoTbl = np.rec.fromrecords(
            list(zip(self.file["/RefInfo/ID"],
                     self.file["/RefInfo/FullName"],
                     self.file["/RefInfo/Length"],
                     self.file["/RefInfo/MD5"])),
            dtype=[("RefInfoID", int),
                   ("FullName", object),
                   ("Length", int),
                   ("MD5", object)])

        self._referenceInfoTable = \
            rec_join("RefInfoID", _referenceGroupTbl, _referenceInfoTbl, jointype="inner")

        if self.isSorted:
            _offsetTable = (self.file["/RefGroup/OffsetTable"][()]
                            .view(dtype=OFFSET_TABLE_DTYPE)
                            .view(np.recarray)
                            .flatten())
            self._referenceInfoTable = rec_join("ID",
                                                self._referenceInfoTable,
                                                _offsetTable,
                                                jointype="inner")

        # Index every reference record under its ID, Name, FullName and MD5.
        self._referenceDict = {}
        for record in self._referenceInfoTable:
            if record.ID == -1:
                continue
            assert record.ID != record.Name
            keys = (record.ID, record.Name, record.FullName, record.MD5)
            # All keys are checked before any is inserted.
            if any(key in self._referenceDict for key in keys):
                raise ValueError(
                    "Duplicate reference contig sequence or identifier")
            for key in keys:
                self._referenceDict[key] = record

        self._readLocatorById = {}
        if self.isSorted:
            for refId in self.file["/RefGroup/ID"]:
                self._readLocatorById[refId] = makeReadLocator(self, refId)

        if "NumPasses" in self.file["/AlnInfo"]:
            self.numPasses = self.file["/AlnInfo/NumPasses"][()]

        if "Barcode" in self.file["/AlnInfo"]:
            # Build forward and backwards id<->label lookup tables
            self._barcodeName = OrderedDict(
                zip(self.file["/BarcodeInfo/ID"],
                    self.file["/BarcodeInfo/Name"]))
            self._barcode = OrderedDict(
                zip(self.file["/BarcodeInfo/Name"],
                    self.file["/BarcodeInfo/ID"]))
            # Barcode ID per row
            self._barcodes = self.file["/AlnInfo/Barcode"][()][:, 1]

        if "ZScore" in self.file["/AlnInfo"]:
            self.zScore = self.file["/AlnInfo/ZScore"][()]

        self.basH5Collection = None
        self._sequencingChemistry = None
Пример #4
0
    def _loadReferenceInfo(self):
        """
        Build the reference lookup tables from the open cmp.h5 file.

        Reads the /RefGroup and /RefInfo datasets from ``self.file`` and
        joins them on "RefInfoID" into ``self._referenceInfoTable``.  When
        ``self.isSorted`` is true, /RefGroup/OffsetTable is additionally
        joined in on "ID".

        Afterwards:

        - ``self._referenceDict`` maps each reference's short name (first
          element returned by ``splitFastaHeader(FullName)``), ID, Name,
          FullName and MD5 to its table record;
        - ``self._readLocatorByKey`` maps ID and short name to the object
          returned by ``makeReadLocator`` (populated only when
          ``self.isSorted`` is true).

        Records whose ID is -1 are skipped.

        Raises ValueError if any key of a record is already present in
        ``self._referenceDict``.
        """
        # np.rec.fromrecords needs a real sequence: on Python 3 ``zip``
        # returns an iterator, so materialize it.
        _referenceGroupTbl = np.rec.fromrecords(
            list(zip(self.file["/RefGroup/ID"][:],
                     self.file["/RefGroup/RefInfoID"][:],
                     # Drop the first character of each group path
                     # (presumably the leading "/" -- confirm against writer).
                     [path[1:] for path in self.file["/RefGroup/Path"]])),
            dtype=[("ID", int),
                   ("RefInfoID", int),
                   ("Name", object)])

        _referenceInfoTbl = np.rec.fromrecords(
            list(zip(self.file["/RefInfo/ID"][:],
                     self.file["/RefInfo/FullName"][:],
                     self.file["/RefInfo/Length"][:],
                     self.file["/RefInfo/MD5"][:])),
            dtype=[("RefInfoID", int),
                   ("FullName", object),
                   ("Length", int),
                   ("MD5", object)])

        self._referenceInfoTable = \
            rec_join("RefInfoID", _referenceGroupTbl, _referenceInfoTbl, jointype="inner")

        if self.isSorted:
            # ``dataset[()]`` reads the whole dataset; it replaces the
            # deprecated ``Dataset.value`` attribute.
            _offsetTable = (self.file["/RefGroup/OffsetTable"][()]
                            .view(dtype=OFFSET_TABLE_DTYPE)
                            .view(np.recarray)
                            .flatten())
            self._referenceInfoTable = rec_join("ID",
                                                self._referenceInfoTable,
                                                _offsetTable,
                                                jointype="inner")
        self._referenceDict = {}
        self._readLocatorByKey = {}

        # For cmp.h5 files with large numbers of references, accessing
        # the recarray fields in the inner loop was terribly slow.
        # This makes things faster, though the code is less
        # straightforward.  (One of the tradeoffs we have to make
        # without a compiler to help us...)
        recordID = self._referenceInfoTable.ID
        recordName = self._referenceInfoTable.Name
        recordFullName = self._referenceInfoTable.FullName
        recordMD5 = self._referenceInfoTable.MD5

        for i, record in enumerate(self._referenceInfoTable):
            refID = recordID[i]
            if refID == -1:
                continue

            # Read every field through the hoisted columns; per-record
            # attribute access is the slow path the comment above avoids.
            refName = recordName[i]
            refFullName = recordFullName[i]
            assert refID != refName
            shortName = splitFastaHeader(refFullName)[0]
            keys = (shortName, refID, refName, refFullName, recordMD5[i])

            # All keys are checked before any is inserted, so keys that
            # coincide within one record (e.g. shortName == Name) are fine.
            if any(key in self._referenceDict for key in keys):
                raise ValueError(
                    "Duplicate reference contig sequence or identifier")
            for key in keys:
                self._referenceDict[key] = record

            if self.isSorted:
                readLocator = makeReadLocator(self, refID)
                self._readLocatorByKey[refID] = readLocator
                self._readLocatorByKey[shortName] = readLocator
Пример #5
0
    def __init__(self, filenameOrH5File):
        """
        Open a cmp.h5 file and load its index tables into memory.

        `filenameOrH5File` is either a path to a cmp.h5 file (``~`` is
        expanded and the path is made absolute) or an already-open
        ``h5py.File``, which must have been opened with mode "r".

        Attributes set here include ``filename``, ``file``,
        ``_alignmentIndex``, ``_alignmentGroupById``, ``_movieInfoTable``,
        ``_movieDict``, ``_referenceInfoTable``, ``_referenceDict`` and
        ``_readLocatorById``.  ``numPasses``, ``zScore`` and the barcode
        tables are set only when the corresponding datasets exist under
        /AlnInfo.

        Raises ValueError if a supplied ``h5py.File`` is not read-only or
        if two references share an identifier.
        """
        if isinstance(filenameOrH5File, h5py.File):
            if filenameOrH5File.mode != "r":
                raise ValueError("HDF5 files used by CmpH5Reader must be opened read-only!")
            self.filename = filenameOrH5File.filename
            self.file = filenameOrH5File
        else:
            self.filename = abspath(expanduser(filenameOrH5File))
            self.file = h5py.File(self.filename, "r")
        # ``dataset[()]`` reads the whole dataset; it replaces the
        # deprecated ``Dataset.value`` attribute.
        rawAlignmentIndex = self.file["/AlnInfo/AlnIndex"][()]
        self._alignmentIndex = (rawAlignmentIndex
                                .view(dtype=ALIGNMENT_INDEX_DTYPE)
                                .view(np.recarray)
                                .flatten())

        # This is the only sneaky part of this whole class.  We do not
        # store the raw h5py group object; rather we cache a dict of {
        # dataset_name -> dataset }.  This way we avoid B-tree
        # scanning in basic data access.
        self._alignmentGroupById = {}
        for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"],
                                              self.file["/AlnGroup/Path"]):
            alnGroup = self.file[alnGroupPath]
            self._alignmentGroupById[alnGroupId] = dict(alnGroup.items())

        numMovies = len(self.file["/MovieInfo/ID"])

        if "FrameRate" in self.file["/MovieInfo"]:
            frameRate = self.file["/MovieInfo/FrameRate"][()]
            timeScale = 1.0/frameRate
        else:
            # No frame rate recorded: NaN frame rate, unit time scale.
            frameRate = [np.nan] * numMovies
            timeScale = [1.0] * numMovies

        if "SequencingChemistry" in self.file["/MovieInfo"]:
            sequencingChemistry = self.file["/MovieInfo/SequencingChemistry"][()]
        else:
            sequencingChemistry = ["unknown"] * numMovies

        # np.rec.fromrecords needs a real sequence: on Python 3 ``zip``
        # returns an iterator, so materialize it.
        self._movieInfoTable = np.rec.fromrecords(
            list(zip(self.file["/MovieInfo/ID"],
                     self.file["/MovieInfo/Name"],
                     frameRate,
                     timeScale,
                     sequencingChemistry)),
            dtype=[("ID"                  , int),
                   ("Name"                , object),
                   ("FrameRate"           , float),
                   ("TimeScale"           , float),
                   ("SequencingChemistry" , object)])

        self._movieDict = {}
        for record in self._movieInfoTable:
            assert record.ID not in self._movieDict
            self._movieDict[record.ID] = record

        _referenceGroupTbl = np.rec.fromrecords(
            list(zip(self.file["/RefGroup/ID"],
                     self.file["/RefGroup/RefInfoID"],
                     # Drop the first character of each group path
                     # (presumably the leading "/" -- confirm against writer).
                     [path[1:] for path in self.file["/RefGroup/Path"]])),
            dtype=[("ID"       , int),
                   ("RefInfoID", int),
                   ("Name"     , object)])

        _referenceInfoTbl = np.rec.fromrecords(
            list(zip(self.file["/RefInfo/ID"],
                     self.file["/RefInfo/FullName"],
                     self.file["/RefInfo/Length"],
                     self.file["/RefInfo/MD5"])),
            dtype=[("RefInfoID", int),
                   ("FullName" , object),
                   ("Length"   , int),
                   ("MD5"      , object)])

        self._referenceInfoTable = \
            rec_join("RefInfoID", _referenceGroupTbl, _referenceInfoTbl, jointype="inner")

        if self.isSorted:
            _offsetTable = (self.file["/RefGroup/OffsetTable"][()]
                            .view(dtype=OFFSET_TABLE_DTYPE)
                            .view(np.recarray)
                            .flatten())
            self._referenceInfoTable = rec_join("ID",
                                                self._referenceInfoTable,
                                                _offsetTable,
                                                jointype="inner")

        # Index every reference record under its ID, Name, FullName and MD5.
        self._referenceDict = {}
        for record in self._referenceInfoTable:
            if record.ID == -1:
                continue
            assert record.ID != record.Name
            keys = (record.ID, record.Name, record.FullName, record.MD5)
            # All keys are checked before any is inserted.
            if any(key in self._referenceDict for key in keys):
                raise ValueError(
                    "Duplicate reference contig sequence or identifier")
            for key in keys:
                self._referenceDict[key] = record

        self._readLocatorById = {}
        if self.isSorted:
            for refId in self.file["/RefGroup/ID"]:
                self._readLocatorById[refId] = makeReadLocator(self, refId)

        if "NumPasses" in self.file["/AlnInfo"]:
            self.numPasses = self.file["/AlnInfo/NumPasses"][()]

        if "Barcode" in self.file["/AlnInfo"]:
            # Build forward and backwards id<->label lookup tables
            self._barcodeName = OrderedDict(zip(self.file["/BarcodeInfo/ID"],
                                                self.file["/BarcodeInfo/Name"]))
            self._barcode     = OrderedDict(zip(self.file["/BarcodeInfo/Name"],
                                                self.file["/BarcodeInfo/ID"]))
            # Barcode ID per row
            self._barcodes = self.file["/AlnInfo/Barcode"][()][:,1]

        if "ZScore" in self.file["/AlnInfo"]:
            self.zScore = self.file["/AlnInfo/ZScore"][()]

        self.basH5Collection = None