def _loadReferenceInfo(self):
    """
    Build the reference lookup structures from /RefGroup and /RefInfo.

    Sets three attributes:

    - ``self._referenceInfoTable``: inner join of the /RefGroup columns
      (ID, RefInfoID, Path) with the /RefInfo columns (ID, FullName,
      Length, MD5) on ``RefInfoID``; when ``self.isSorted`` is true it is
      additionally joined with /RefGroup/OffsetTable on ``ID``.
    - ``self._referenceDict``: maps each of a record's short name, ID,
      Name, FullName and MD5 to that record.
    - ``self._readLocatorByKey``: when ``self.isSorted`` is true, maps
      each record's ID and short name to the object returned by
      ``makeReadLocator``.

    Records whose ID is -1 are skipped.

    Raises:
        ValueError: if any key of a record is already registered in
            ``self._referenceDict``.
    """
    # Pull each numeric/string column into memory with one read ([:])
    # instead of letting zip() iterate the HDF5 datasets row by row.
    # list(...) keeps fromrecords working where zip() is lazy (Python 3).
    _referenceGroupTbl = np.rec.fromrecords(
        list(zip(self.file["/RefGroup/ID"][:],
                 self.file["/RefGroup/RefInfoID"][:],
                 # Drop the first character of each path
                 # (presumably the leading "/" -- confirm).
                 [path[1:] for path in self.file["/RefGroup/Path"]])),
        dtype=[("ID", int),
               ("RefInfoID", int),
               ("Name", object)])

    _referenceInfoTbl = np.rec.fromrecords(
        list(zip(self.file["/RefInfo/ID"][:],
                 self.file["/RefInfo/FullName"][:],
                 self.file["/RefInfo/Length"][:],
                 self.file["/RefInfo/MD5"][:])),
        dtype=[("RefInfoID", int),
               ("FullName", object),
               ("Length", int),
               ("MD5", object)])

    self._referenceInfoTable = rec_join("RefInfoID",
                                        _referenceGroupTbl,
                                        _referenceInfoTbl,
                                        jointype="inner")

    if self.isSorted:
        # dataset[()] reads the whole dataset; it replaces the
        # deprecated Dataset.value accessor.
        _offsetTable = self.file["/RefGroup/OffsetTable"][()] \
            .view(dtype=OFFSET_TABLE_DTYPE) \
            .view(np.recarray) \
            .flatten()
        self._referenceInfoTable = rec_join("ID",
                                            self._referenceInfoTable,
                                            _offsetTable,
                                            jointype="inner")

    self._referenceDict = {}
    self._readLocatorByKey = {}

    for record in self._referenceInfoTable:
        refId = record.ID
        if refId == -1:
            continue

        assert refId != record.Name
        shortName = splitFastaHeader(record.FullName)[0]

        # Every one of these keys must resolve to exactly one record.
        keys = (shortName, refId, record.Name, record.FullName, record.MD5)
        if any(key in self._referenceDict for key in keys):
            raise ValueError(
                "Duplicate reference contig sequence or identifier")
        for key in keys:
            self._referenceDict[key] = record

        if self.isSorted:
            readLocator = makeReadLocator(self, refId)
            self._readLocatorByKey[refId] = readLocator
            self._readLocatorByKey[shortName] = readLocator
def _loadReferenceInfo(self):
    """
    Build the reference lookup structures from /RefGroup and /RefInfo.

    Sets three attributes:

    - ``self._referenceInfoTable``: inner join of the /RefGroup columns
      (ID, RefInfoID, Path) with the /RefInfo columns (ID, FullName,
      Length, MD5) on ``RefInfoID``; when ``self.isSorted`` is true it is
      additionally joined with /RefGroup/OffsetTable on ``ID``.
    - ``self._referenceDict``: maps each of a record's short name, ID,
      Name, FullName and MD5 to that record.
    - ``self._readLocatorByKey``: when ``self.isSorted`` is true, maps
      each record's ID and short name to the object returned by
      ``makeReadLocator``.

    Records whose ID is -1 are skipped.

    Raises:
        ValueError: if any key of a record is already registered in
            ``self._referenceDict``.
    """
    # list(...) keeps fromrecords working where zip() is lazy (Python 3).
    _referenceGroupTbl = np.rec.fromrecords(
        list(zip(self.file["/RefGroup/ID"][:],
                 self.file["/RefGroup/RefInfoID"][:],
                 # Drop the first character of each path
                 # (presumably the leading "/" -- confirm).
                 [path[1:] for path in self.file["/RefGroup/Path"]])),
        dtype=[("ID", int),
               ("RefInfoID", int),
               ("Name", object)])

    _referenceInfoTbl = np.rec.fromrecords(
        list(zip(self.file["/RefInfo/ID"][:],
                 self.file["/RefInfo/FullName"][:],
                 self.file["/RefInfo/Length"][:],
                 self.file["/RefInfo/MD5"][:])),
        dtype=[("RefInfoID", int),
               ("FullName", object),
               ("Length", int),
               ("MD5", object)])

    self._referenceInfoTable = rec_join("RefInfoID",
                                        _referenceGroupTbl,
                                        _referenceInfoTbl,
                                        jointype="inner")

    if self.isSorted:
        # dataset[()] reads the whole dataset; it replaces the
        # deprecated Dataset.value accessor.
        _offsetTable = self.file["/RefGroup/OffsetTable"][()] \
            .view(dtype=OFFSET_TABLE_DTYPE) \
            .view(np.recarray) \
            .flatten()
        self._referenceInfoTable = rec_join("ID",
                                            self._referenceInfoTable,
                                            _offsetTable,
                                            jointype="inner")

    self._referenceDict = {}
    self._readLocatorByKey = {}

    # For cmp.h5 files with large numbers of references, accessing
    # the recarray fields in the inner loop was terribly slow.
    # Pulling the columns out once makes things faster, though the
    # code is less straightforward.
    recordID = self._referenceInfoTable.ID
    recordName = self._referenceInfoTable.Name
    recordFullName = self._referenceInfoTable.FullName
    recordMD5 = self._referenceInfoTable.MD5

    for i, record in enumerate(self._referenceInfoTable):
        refId = recordID[i]
        if refId == -1:
            continue

        # Read everything from the hoisted columns; no per-record
        # field access is left in the loop.
        name = recordName[i]
        fullName = recordFullName[i]
        assert refId != name
        shortName = splitFastaHeader(fullName)[0]

        # Every one of these keys must resolve to exactly one record.
        keys = (shortName, refId, name, fullName, recordMD5[i])
        if any(key in self._referenceDict for key in keys):
            raise ValueError(
                "Duplicate reference contig sequence or identifier")
        for key in keys:
            self._referenceDict[key] = record

        if self.isSorted:
            readLocator = makeReadLocator(self, refId)
            self._readLocatorByKey[refId] = readLocator
            self._readLocatorByKey[shortName] = readLocator
def __init__(self, filenameOrH5File):
    """
    Open a cmp.h5 file and build the in-memory lookup tables.

    Args:
        filenameOrH5File: either an ``h5py.File`` already opened in
            mode "r", or a path; a path is expanded with ``expanduser``
            and ``abspath`` and opened read-only.

    Raises:
        ValueError: if an ``h5py.File`` is passed whose mode is not "r",
            if /AlnInfo/AlnIndex has length 0, or if two reference
            records share an ID, Name, FullName or MD5.
        IOError: if opening the path raises ``IOError``.
    """
    if isinstance(filenameOrH5File, h5py.File):
        if filenameOrH5File.mode != "r":
            raise ValueError(
                "HDF5 files used by CmpH5Reader must be opened read-only!")
        self.filename = filenameOrH5File.filename
        self.file = filenameOrH5File
    else:
        self.filename = abspath(expanduser(filenameOrH5File))
        try:
            self.file = h5py.File(self.filename, "r")
        except IOError:
            raise IOError("Invalid or nonexistent cmp.h5 file %s"
                          % filenameOrH5File)

    if len(self.file["/AlnInfo/AlnIndex"]) == 0:
        raise ValueError("Empty cmp.h5 file, cannot be read by CmpH5Reader")

    # dataset[()] reads the whole dataset; it replaces the deprecated
    # Dataset.value accessor used previously.
    rawAlignmentIndex = self.file["/AlnInfo/AlnIndex"][()]
    self._alignmentIndex = rawAlignmentIndex \
        .view(dtype=ALIGNMENT_INDEX_DTYPE) \
        .view(np.recarray) \
        .flatten()

    # This is the only sneaky part of this whole class.  We do not
    # store the raw h5py group object; rather we cache a dict of
    # { dataset_name -> dataset }.  This way we avoid B-tree
    # scanning in basic data access.
    self._alignmentGroupById = {}
    for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"],
                                          self.file["/AlnGroup/Path"]):
        alnGroup = self.file[alnGroupPath]
        self._alignmentGroupById[alnGroupId] = dict(alnGroup.items())

    # Movie table; FrameRate is optional in the file.
    numMovies = len(self.file["/MovieInfo/ID"])
    if "FrameRate" in self.file["/MovieInfo"]:
        frameRate = self.file["/MovieInfo/FrameRate"][()]
        timeScale = 1.0 / frameRate
    else:
        frameRate = [np.nan] * numMovies
        timeScale = [1.0] * numMovies

    # list(...) keeps fromrecords working where zip() is lazy (Python 3).
    self._movieInfoTable = np.rec.fromrecords(
        list(zip(self.file["/MovieInfo/ID"],
                 self.file["/MovieInfo/Name"],
                 frameRate,
                 timeScale)),
        dtype=[("ID", int),
               ("Name", object),
               ("FrameRate", float),
               ("TimeScale", float)])

    self._movieDict = {}
    for record in self._movieInfoTable:
        assert record.ID not in self._movieDict
        self._movieDict[record.ID] = record

    # Reference tables: read each column in one go ([:]) rather than
    # iterating the HDF5 datasets element by element.
    _referenceGroupTbl = np.rec.fromrecords(
        list(zip(self.file["/RefGroup/ID"][:],
                 self.file["/RefGroup/RefInfoID"][:],
                 # Drop the first character of each path
                 # (presumably the leading "/" -- confirm).
                 [path[1:] for path in self.file["/RefGroup/Path"]])),
        dtype=[("ID", int),
               ("RefInfoID", int),
               ("Name", object)])

    _referenceInfoTbl = np.rec.fromrecords(
        list(zip(self.file["/RefInfo/ID"][:],
                 self.file["/RefInfo/FullName"][:],
                 self.file["/RefInfo/Length"][:],
                 self.file["/RefInfo/MD5"][:])),
        dtype=[("RefInfoID", int),
               ("FullName", object),
               ("Length", int),
               ("MD5", object)])

    self._referenceInfoTable = rec_join("RefInfoID",
                                        _referenceGroupTbl,
                                        _referenceInfoTbl,
                                        jointype="inner")

    if self.isSorted:
        _offsetTable = self.file["/RefGroup/OffsetTable"][()] \
            .view(dtype=OFFSET_TABLE_DTYPE) \
            .view(np.recarray) \
            .flatten()
        self._referenceInfoTable = rec_join("ID",
                                            self._referenceInfoTable,
                                            _offsetTable,
                                            jointype="inner")

    self._referenceDict = {}
    for record in self._referenceInfoTable:
        if record.ID == -1:
            continue
        assert record.ID != record.Name
        # Every one of these keys must resolve to exactly one record.
        keys = (record.ID, record.Name, record.FullName, record.MD5)
        if any(key in self._referenceDict for key in keys):
            raise ValueError(
                "Duplicate reference contig sequence or identifier")
        for key in keys:
            self._referenceDict[key] = record

    self._readLocatorById = {}
    if self.isSorted:
        for refId in self.file["/RefGroup/ID"]:
            self._readLocatorById[refId] = makeReadLocator(self, refId)

    # Optional per-alignment datasets; the attributes are only set
    # when the corresponding dataset exists in /AlnInfo.
    if "NumPasses" in self.file["/AlnInfo"]:
        self.numPasses = self.file["/AlnInfo/NumPasses"][()]

    if "Barcode" in self.file["/AlnInfo"]:
        # Build forward and backwards id<->label lookup tables
        self._barcodeName = OrderedDict(
            zip(self.file["/BarcodeInfo/ID"],
                self.file["/BarcodeInfo/Name"]))
        self._barcode = OrderedDict(
            zip(self.file["/BarcodeInfo/Name"],
                self.file["/BarcodeInfo/ID"]))
        # Barcode ID per row
        self._barcodes = self.file["/AlnInfo/Barcode"][()][:, 1]

    if "ZScore" in self.file["/AlnInfo"]:
        self.zScore = self.file["/AlnInfo/ZScore"][()]

    self.basH5Collection = None
    self._sequencingChemistry = None
def _loadReferenceInfo(self):
    """
    Build the reference lookup structures from /RefGroup and /RefInfo.

    Sets three attributes:

    - ``self._referenceInfoTable``: inner join of the /RefGroup columns
      (ID, RefInfoID, Path) with the /RefInfo columns (ID, FullName,
      Length, MD5) on ``RefInfoID``; when ``self.isSorted`` is true it is
      additionally joined with /RefGroup/OffsetTable on ``ID``.
    - ``self._referenceDict``: maps each of a record's short name, ID,
      Name, FullName and MD5 to that record.
    - ``self._readLocatorByKey``: when ``self.isSorted`` is true, maps
      each record's ID and short name to the object returned by
      ``makeReadLocator``.

    Records whose ID is -1 are skipped.

    Raises:
        ValueError: if any key of a record is already registered in
            ``self._referenceDict``.
    """
    # list(...) keeps fromrecords working where zip() is lazy (Python 3).
    _referenceGroupTbl = np.rec.fromrecords(
        list(zip(self.file["/RefGroup/ID"][:],
                 self.file["/RefGroup/RefInfoID"][:],
                 # Drop the first character of each path
                 # (presumably the leading "/" -- confirm).
                 [path[1:] for path in self.file["/RefGroup/Path"]])),
        dtype=[("ID", int),
               ("RefInfoID", int),
               ("Name", object)])

    _referenceInfoTbl = np.rec.fromrecords(
        list(zip(self.file["/RefInfo/ID"][:],
                 self.file["/RefInfo/FullName"][:],
                 self.file["/RefInfo/Length"][:],
                 self.file["/RefInfo/MD5"][:])),
        dtype=[("RefInfoID", int),
               ("FullName", object),
               ("Length", int),
               ("MD5", object)])

    self._referenceInfoTable = rec_join("RefInfoID",
                                        _referenceGroupTbl,
                                        _referenceInfoTbl,
                                        jointype="inner")

    if self.isSorted:
        # dataset[()] reads the whole dataset; it replaces the
        # deprecated Dataset.value accessor.
        _offsetTable = self.file["/RefGroup/OffsetTable"][()] \
            .view(dtype=OFFSET_TABLE_DTYPE) \
            .view(np.recarray) \
            .flatten()
        self._referenceInfoTable = rec_join("ID",
                                            self._referenceInfoTable,
                                            _offsetTable,
                                            jointype="inner")

    self._referenceDict = {}
    self._readLocatorByKey = {}

    # For cmp.h5 files with large numbers of references, accessing
    # the recarray fields in the inner loop was terribly slow.
    # Pulling the columns out once makes things faster, though the
    # code is less straightforward.
    recordID = self._referenceInfoTable.ID
    recordName = self._referenceInfoTable.Name
    recordFullName = self._referenceInfoTable.FullName
    recordMD5 = self._referenceInfoTable.MD5

    for i, record in enumerate(self._referenceInfoTable):
        refId = recordID[i]
        if refId == -1:
            continue

        # Read everything from the hoisted columns; no per-record
        # field access is left in the loop.
        name = recordName[i]
        fullName = recordFullName[i]
        assert refId != name
        shortName = splitFastaHeader(fullName)[0]

        # Every one of these keys must resolve to exactly one record.
        keys = (shortName, refId, name, fullName, recordMD5[i])
        if any(key in self._referenceDict for key in keys):
            raise ValueError(
                "Duplicate reference contig sequence or identifier")
        for key in keys:
            self._referenceDict[key] = record

        if self.isSorted:
            readLocator = makeReadLocator(self, refId)
            self._readLocatorByKey[refId] = readLocator
            self._readLocatorByKey[shortName] = readLocator
def __init__(self, filenameOrH5File):
    """
    Open a cmp.h5 file and build the in-memory lookup tables.

    Args:
        filenameOrH5File: either an ``h5py.File`` already opened in
            mode "r", or a path; a path is expanded with ``expanduser``
            and ``abspath`` and opened read-only.

    Raises:
        ValueError: if an ``h5py.File`` is passed whose mode is not "r",
            or if two reference records share an ID, Name, FullName or
            MD5.
    """
    if isinstance(filenameOrH5File, h5py.File):
        if filenameOrH5File.mode != "r":
            raise ValueError(
                "HDF5 files used by CmpH5Reader must be opened read-only!")
        self.filename = filenameOrH5File.filename
        self.file = filenameOrH5File
    else:
        self.filename = abspath(expanduser(filenameOrH5File))
        self.file = h5py.File(self.filename, "r")

    # dataset[()] reads the whole dataset; it replaces the deprecated
    # Dataset.value accessor used previously.
    rawAlignmentIndex = self.file["/AlnInfo/AlnIndex"][()]
    self._alignmentIndex = rawAlignmentIndex \
        .view(dtype=ALIGNMENT_INDEX_DTYPE) \
        .view(np.recarray) \
        .flatten()

    # This is the only sneaky part of this whole class.  We do not
    # store the raw h5py group object; rather we cache a dict of
    # { dataset_name -> dataset }.  This way we avoid B-tree
    # scanning in basic data access.
    self._alignmentGroupById = {}
    for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"],
                                          self.file["/AlnGroup/Path"]):
        alnGroup = self.file[alnGroupPath]
        self._alignmentGroupById[alnGroupId] = dict(alnGroup.items())

    # Movie table; FrameRate and SequencingChemistry are optional.
    numMovies = len(self.file["/MovieInfo/ID"])

    if "FrameRate" in self.file["/MovieInfo"]:
        frameRate = self.file["/MovieInfo/FrameRate"][()]
        timeScale = 1.0 / frameRate
    else:
        frameRate = [np.nan] * numMovies
        timeScale = [1.0] * numMovies

    if "SequencingChemistry" in self.file["/MovieInfo"]:
        sequencingChemistry = self.file["/MovieInfo/SequencingChemistry"][()]
    else:
        sequencingChemistry = ["unknown"] * numMovies

    # list(...) keeps fromrecords working where zip() is lazy (Python 3).
    self._movieInfoTable = np.rec.fromrecords(
        list(zip(self.file["/MovieInfo/ID"],
                 self.file["/MovieInfo/Name"],
                 frameRate,
                 timeScale,
                 sequencingChemistry)),
        dtype=[("ID", int),
               ("Name", object),
               ("FrameRate", float),
               ("TimeScale", float),
               ("SequencingChemistry", object)])

    self._movieDict = {}
    for record in self._movieInfoTable:
        assert record.ID not in self._movieDict
        self._movieDict[record.ID] = record

    # Reference tables: read each column in one go ([:]) rather than
    # iterating the HDF5 datasets element by element.
    _referenceGroupTbl = np.rec.fromrecords(
        list(zip(self.file["/RefGroup/ID"][:],
                 self.file["/RefGroup/RefInfoID"][:],
                 # Drop the first character of each path
                 # (presumably the leading "/" -- confirm).
                 [path[1:] for path in self.file["/RefGroup/Path"]])),
        dtype=[("ID", int),
               ("RefInfoID", int),
               ("Name", object)])

    _referenceInfoTbl = np.rec.fromrecords(
        list(zip(self.file["/RefInfo/ID"][:],
                 self.file["/RefInfo/FullName"][:],
                 self.file["/RefInfo/Length"][:],
                 self.file["/RefInfo/MD5"][:])),
        dtype=[("RefInfoID", int),
               ("FullName", object),
               ("Length", int),
               ("MD5", object)])

    self._referenceInfoTable = rec_join("RefInfoID",
                                        _referenceGroupTbl,
                                        _referenceInfoTbl,
                                        jointype="inner")

    if self.isSorted:
        _offsetTable = self.file["/RefGroup/OffsetTable"][()] \
            .view(dtype=OFFSET_TABLE_DTYPE) \
            .view(np.recarray) \
            .flatten()
        self._referenceInfoTable = rec_join("ID",
                                            self._referenceInfoTable,
                                            _offsetTable,
                                            jointype="inner")

    self._referenceDict = {}
    for record in self._referenceInfoTable:
        if record.ID == -1:
            continue
        assert record.ID != record.Name
        # Every one of these keys must resolve to exactly one record.
        keys = (record.ID, record.Name, record.FullName, record.MD5)
        if any(key in self._referenceDict for key in keys):
            raise ValueError(
                "Duplicate reference contig sequence or identifier")
        for key in keys:
            self._referenceDict[key] = record

    self._readLocatorById = {}
    if self.isSorted:
        for refId in self.file["/RefGroup/ID"]:
            self._readLocatorById[refId] = makeReadLocator(self, refId)

    # Optional per-alignment datasets; the attributes are only set
    # when the corresponding dataset exists in /AlnInfo.
    if "NumPasses" in self.file["/AlnInfo"]:
        self.numPasses = self.file["/AlnInfo/NumPasses"][()]

    if "Barcode" in self.file["/AlnInfo"]:
        # Build forward and backwards id<->label lookup tables
        self._barcodeName = OrderedDict(
            zip(self.file["/BarcodeInfo/ID"],
                self.file["/BarcodeInfo/Name"]))
        self._barcode = OrderedDict(
            zip(self.file["/BarcodeInfo/Name"],
                self.file["/BarcodeInfo/ID"]))
        # Barcode ID per row
        self._barcodes = self.file["/AlnInfo/Barcode"][()][:, 1]

    if "ZScore" in self.file["/AlnInfo"]:
        self.zScore = self.file["/AlnInfo/ZScore"][()]

    self.basH5Collection = None