def _get_post_mapping_from_movies(allMovies, cmp_h5):
    """
    Go through all movies post alignment.

    :param allMovies: mapping of movie name to a stats object; its ``expt``,
        ``chip``, ``movie``, ``inst``, ``movieType``, ``setId``, ``partId``,
        ``cellId`` and ``date`` attributes seed the new MovieStats entry.
    :param cmp_h5: cmp.h5 input handed to CmpH5Reader.
    :returns: dict of {movie: MovieStats}; only movies that own at least one
        alignment in the cmp.h5 appear in it.
    """
    postMappingMovies = {}

    # The context manager closes the reader even if a lookup in allMovies
    # or MovieStats.add() fails part-way through the file.
    with CmpH5Reader(cmp_h5) as reader:
        for alignment in reader:
            # movieInfo is a tuple such as
            # (2, 'm101210_151341_Jan_p1_b15', 100.0, 0.009999999776482582);
            # position 1 is the movie name.
            movie = alignment.movieInfo[1]
            if movie not in postMappingMovies:
                stats = allMovies[movie]
                postMappingMovies[movie] = MovieStats(stats.expt,
                                                      stats.chip,
                                                      stats.movie,
                                                      stats.inst,
                                                      movieType=stats.movieType,
                                                      setId=stats.setId,
                                                      partId=stats.partId,
                                                      cellId=stats.cellId,
                                                      date=stats.date)
            postMappingMovies[movie].add(alignment)

    return postMappingMovies
def cmpH5Summarize(inCmp, movieSummary=True, refSummary=True):
    """Summarize a cmp.h5 file.

    :param inCmp: cmp.h5 input handed to CmpH5Reader.
    :param movieSummary: include the per-movie table when true.
    :param refSummary: include the per-reference table when true.
    :returns: a single string: the global summary followed by the
        "Movie Summary" and "Reference Summary" sections (a section whose
        flag is false contains just a newline).
    """
    tstr = ("filename: %s\nversion: %s\nn reads: %d\nn refs: "
            "%d\nn movies: %d\nn bases: %d\navg rl: %d\navg acc: %g")
    eTbl = Tbl(nBases=Sum(ReadLength),
               avgReadLength=Mean(ReadLength),
               avgAccuracy=Mean(Accuracy))

    # Keep the reader open only while it is being queried, and make sure it
    # is closed on every path (the original never closed it).
    with CmpH5Reader(inCmp) as reader:
        rl, acc, mov = zip(*[(r.readLength, r.accuracy, r.movieInfo[0])
                             for r in reader])

        summaryStr = tstr % (os.path.basename(reader.file.filename),
                             reader.version,
                             len(reader),
                             len(reader.referenceInfoTable),
                             len(set(mov)),
                             NP.sum(rl),
                             NP.round(NP.mean(rl)),
                             NP.round(NP.mean(acc), 4))

        # Only run the grouped queries that will actually be printed.
        if movieSummary:
            movieSummaryTxt = rec2txt(
                toRecArray(query(reader, what=eTbl, groupBy=Movie)),
                padding=5, precision=1)
        else:
            movieSummaryTxt = "\n"

        if refSummary:
            refSummaryTxt = rec2txt(
                toRecArray(query(reader, what=eTbl, groupBy=Reference)),
                padding=5, precision=1)
        else:
            refSummaryTxt = "\n"

    return (summaryStr +
            ("\n\n\t Movie Summary:\n" + movieSummaryTxt) +
            ("\n\n\t Reference Summary:\n" + refSummaryTxt))
def _mainLoop(self):
    """
    Main loop

    First launch the worker and writer processes.
    Then loop over the reference entries in ``self.refInfo``; each entry is
    handed to ``_queueChunksForReference``, which (per the original notes)
    is expected to:
      1. Load the sequence into the main memory of the parent process
      2. Chunk up the contig and submit the chunk descriptions to the work queue
    Finally, send one ``None`` sentinel per worker, join the workers, and
    wait for the monitoring thread, the results queue and the result
    collector process to finish.

    Returns 0.
    """

    # This looks scary but it's not.  Python uses reference
    # counting and has a secondary, optional garbage collector for
    # collecting garbage cycles.  Unfortunately when a cyclic GC
    # happens when a thread is calling cPickle.dumps, the
    # interpreter crashes sometimes.  See Bug 19704.  Since we
    # don't leak garbage cycles, disabling the cyclic GC is
    # essentially harmless.
    gc.disable()

    # Load reference and IpdModel
    self.loadReferenceAndModel(self.args.reference, self.args.infile)

    # Spawn workers
    self._launchSlaveProcesses()

    # WARNING -- cmp.h5 file must be opened AFTER worker processes have been spawned
    # cmp.h5 we're using -- use this to orchestrate the work
    self.cmph5 = CmpH5Reader(self.args.infile)
    logging.info('Generating kinetics summary for [%s]' % self.args.infile)

    #self.referenceMap = self.cmph5['/RefGroup'].asDict('RefInfoID', 'ID')
    #self.alnInfo = self.cmph5['/AlnInfo'].asRecArray()

    # Counter is reset here before any chunk is queued.
    # NOTE(review): presumably incremented by _queueChunksForReference -- confirm.
    self.workChunkCounter = 0

    # Iterate over references
    for ref in self.refInfo:
        logging.info('Processing reference entry: [%s]' % ref.ID)
        self._queueChunksForReference(ref)

    # Shutdown worker threads with None sentinels
    # (one sentinel per worker, so each worker sees exactly one).
    for i in xrange(self.args.numWorkers):
        self._workQueue.put(None)

    for w in self._workers:
        w.join()

    # Join on the result queue and the resultsCollector process.
    # This ensures all the results are written before shutdown.
    self.monitoringThread.join()
    self._resultsQueue.join()
    self._resultCollectorProcess.join()

    logging.info("ipdSummary.py finished. Exiting.")

    # Drop the reader attribute now that all work has been written.
    del self.cmph5

    return 0
def loadCmpH5Chemistry(cmpH5File):
    """Return the most frequent sequencing chemistry in a cmp.h5 file.

    :param cmpH5File: cmp.h5 input handed to CmpH5Reader.
    :returns: the element of ``sequencingChemistry`` that occurs most often.
    :raises ValueError: if ``sequencingChemistry`` is empty (from ``max``).
    """
    # Function-scope import so the block brings its own dependency.
    from collections import Counter

    with CmpH5Reader(cmpH5File) as f:
        chems = f.sequencingChemistry
        # Counter tallies every occurrence.  itertools.groupby only groups
        # *consecutive* equal items, so on unsorted input a chemistry seen in
        # several separate runs kept just the length of its last run and the
        # wrong "majority" could be returned.
        chemCounts = Counter(chems)

    majorityChem = max(chemCounts, key=chemCounts.get)
    return majorityChem
def cmpH5Validate(inCmp):
    """Validate a cmp.h5 file.

    The file counts as valid when a CmpH5Reader can be constructed on it.

    :param inCmp: cmp.h5 input handed to CmpH5Reader.
    :returns: True if the reader could be opened, False otherwise.
    """
    try:
        reader = CmpH5Reader(inCmp)
    except Exception:
        # Any failure to open means "not valid".  Unlike a bare ``except``,
        # KeyboardInterrupt and SystemExit are allowed to propagate.
        return False
    # Release the file handle; the original leaked the reader.
    reader.close()
    return True
def cmpH5Select(inCmpFile, outCmp, idxs=None, groupByStr=None,
                groupByCsv=None, whereStr=None, outDir="."):
    """Take a vector of indices or a where expression and select a set
    of alignments. If a groupBy is specified, then produce a cmp.h5
    file for each distinct member of the grouping.

    :param inCmpFile: input cmp.h5.
    :param outCmp: output file used when a single selection is produced.
    :param idxs: explicit alignment indices; when truthy they are used
        directly and the where/groupBy arguments are ignored.
    :param groupByStr: expression string evaluated to a groupBy, or None
        for DefaultGroupBy.
    :param groupByCsv: forwarded to ``query``.
    :param whereStr: expression string evaluated to a where clause, or None
        for DefaultWhere.
    :param outDir: directory for per-group outputs ("<group>.cmp.h5").
    """
    if idxs:
        doSelect(inCmpFile, outCmp, idxs)
        return

    # SECURITY NOTE(review): whereStr / groupByStr are passed to eval().
    # That is only safe for trusted (e.g. operator-typed command line)
    # input; never feed untrusted strings into this function.
    where = DefaultWhere if whereStr is None else eval(whereStr)
    groupBy = DefaultGroupBy if groupByStr is None else eval(groupByStr)

    # Close the query reader before doSelect is handed the same file name
    # (the original never closed it).
    with CmpH5Reader(inCmpFile) as reader:
        idxVecs = query(reader,
                        what=AlignmentIdx,
                        where=where,
                        groupBy=groupBy,
                        groupByCsv=groupByCsv)

    # list() keeps keys[0] working when keys() returns a view (Python 3).
    keys = list(idxVecs.keys())

    ## XXX: Should the resultant files be sorted?
    if len(keys) == 1:
        doSelect(inCmpFile, outCmp, idxVecs[keys[0]])
    else:
        for k in keys:
            # For groupByCsv, skip group of indexes not identified in csv
            if k == NOTINCSV_LABEL:
                continue
            logging.debug("Processing output for %s" % str(k))
            doSelect(inCmpFile,
                     "/".join([outDir, "%s.cmp.h5" % str(k)]),
                     idxVecs[k])
def cmpH5Equal(inCmp1, inCmp2):
    """Compare two cmp.h5 files for equality. Here equality means the
    alignments are the same and they are in the same order.

    NOTE(review): the original docstring also promised that the reference
    information must match, but nothing here compares it -- confirm whether
    alignment equality is meant to cover that.

    :returns: ``(True, )`` when equal, otherwise ``(False, message)``.
    """
    # Both readers are closed on every return path.
    with CmpH5Reader(inCmp1) as cmp1, CmpH5Reader(inCmp2) as cmp2:
        if not len(cmp1) == len(cmp2):
            return (False, "cmp.h5 files differ in length (%d, %d)" %
                    (len(cmp1), len(cmp2)))

        aeq = [a1 == a2 for a1, a2 in zip(cmp1, cmp2)]
        if not all(aeq):
            return (False, "%d alignments differ" % (len(aeq) - sum(aeq)))

        return (True, )
def loadCmpH5Tables(cmpH5File):
    """Load the cmp.h5, get the ReferenceInfo table, in order to correctly
    number the contigs, then close the cmp.h5.

    :param cmpH5File: cmp.h5 input handed to CmpH5Reader.
    :returns: tuple ``(referenceInfoTable, movieInfoTable)``.
    """
    # ``with`` closes the file even if reading one of the tables raises.
    with CmpH5Reader(cmpH5File) as cmph5:
        refInfoTable = cmph5.referenceInfoTable
        movieInfoTable = cmph5.movieInfoTable
    return (refInfoTable, movieInfoTable)
def labelAlignments():
    """Label every alignment of ``runner.args.cmpH5`` with barcode calls.

    Reads barcode calls through ``BarcodeH5Fofn(runner.args.inputFofn)``,
    builds an (nAlignments x 5) int32 table, and writes it together with
    the barcode id/name datasets into the cmp.h5 file, replacing any
    existing datasets of the same names.
    """
    logging.info("Labeling alignments using: %s" % runner.args.inputFofn)
    bcFofn = BarcodeH5Fofn(runner.args.inputFofn)

    with CmpH5Reader(runner.args.cmpH5) as cmpH5:
        # One row per alignment; columns match 'ColumnNames' written below.
        bcDS = n.zeros((len(cmpH5), 5), dtype="int32")

        for (i, aln) in enumerate(cmpH5):
            bcReader = bcFofn.readerForMovie(aln.movieInfo.Name)
            try:
                lZmw = bcReader.labeledZmwFromHoleNumber(aln.HoleNumber)
                # Discard calls that fail any of the three thresholds.
                if lZmw.nScored < runner.args.minNumBarcodes or \
                   lZmw.averageScore < runner.args.minAvgBarcodeScore or \
                   lZmw.scoreRatio < runner.args.minScoreRatio:
                    lZmw = None
            except KeyError:
                # No barcode call for this hole number.
                lZmw = None

            if lZmw:
                bcDS[i, :] = n.array([
                    lZmw.nScored, lZmw.bestIdx, lZmw.bestScore,
                    lZmw.secondBestIdx, lZmw.secondBestScore
                ])
            else:
                # either no barcode was found for this guy or they got
                # filtered, hence the NULL_BARCODE
                # (index == number of labels, i.e. the extra entry appended
                # to bcLabels below).
                bcDS[i, :] = n.array([
                    0, len(bcReader.barcodeLabels), 0,
                    len(bcReader.barcodeLabels), 0
                ])

    # write to the cmp.h5 file.
    # NOTE(review): assumed to run after the reader above has been closed,
    # since the same file is reopened here in 'r+' mode -- confirm nesting.
    H5 = h5.File(runner.args.cmpH5, 'r+')
    if BC_INFO_ID in H5:
        del H5[BC_INFO_ID]
    if BC_INFO_NAME in H5:
        del H5[BC_INFO_NAME]

    # we use the first one to get the labels, if somehow they
    # don't have all of the same stuff that will be an issue.
    bcLabels = n.concatenate(
        (bcFofn.barcodeLabels, n.array([BARCODE_DELIMITER])))
    H5.create_dataset(BC_INFO_ID,
                      data=n.array(range(0, len(bcLabels))),
                      dtype='int32')
    H5.create_dataset(BC_INFO_NAME, data=bcLabels, dtype=h5.new_vlen(str))
    if BC_ALN_INFO_DS in H5:
        del H5[BC_ALN_INFO_DS]
    # From here on bcDS names the HDF5 dataset, not the in-memory array.
    bcDS = H5.create_dataset(BC_ALN_INFO_DS, data=bcDS, dtype='int32')
    bcDS.attrs['ColumnNames'] = n.array(
        ['count', 'index1', 'score1', 'index2', 'score2'])
    #force BarcodeMode to have numpy dtype for CmpH5Sort 'extra datasets' routine
    bcDS.attrs['BarcodeMode'] = n.array(bcFofn.scoreMode)
    H5.close()
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), not requiring index capability.

    (A `sharedIndex` can still be passed for opening a cmp.h5, for which
    the index is compulsory.)

    :param fname: alignment file name; must end in "cmp.h5" or "bam".
    :param referenceFastaFname: reference FASTA, forwarded to BamReader.
    :param sharedIndex: forwarded to CmpH5Reader.
    :raises ValueError: if the suffix is neither "cmp.h5" nor "bam".
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
    else:
        # Fail loudly instead of silently returning None, which surfaced
        # later as an unrelated AttributeError at the first use.
        raise ValueError("Invalid alignment file suffix")
def _run(self):
    """Worker body: open the inputs, then serve chunks until told to stop.

    Chunks are taken from ``self._workQueue`` as ``(chunkId, datum)``
    pairs, processed by ``onChunk`` and pushed to ``self._resultsQueue``
    as ``(chunkId, result)``.  A ``None`` chunk is the end-of-input
    sentinel: it is forwarded to the results queue and the loop ends.
    """
    logging.info("Worker %s (PID=%d) started running" % (self.name, self.pid))

    opts = self.options
    self.caseCmpH5 = CmpH5Reader(opts.infile)
    # The control cmp.h5 is optional; without one the attribute stays None.
    self.controlCmpH5 = (None if opts.control is None
                         else CmpH5Reader(opts.control))

    self.onStart()

    while not self.isTerminated():
        chunkDesc = self._workQueue.get()

        if chunkDesc is None:
            # Sentinel indicating end of input: pass it on to the results
            # queue, acknowledge it, and leave the loop.
            self._resultsQueue.put(None)
            self._workQueue.task_done()
            break

        chunkId, datum = chunkDesc
        logging.info("Got chunk: (%s, %s) -- Process: %s" %
                     (chunkId, str(datum), current_process()))
        result = self.onChunk(datum)

        logging.debug("Process %s: putting result." % current_process())
        self._resultsQueue.put((chunkId, result))
        self._workQueue.task_done()

    self.onFinish()

    logging.info("Process %s (PID=%d) done; exiting." % (self.name, self.pid))
def loadCmpH5(filename, disableChunkCache=False):
    """
    Build a CmpH5Reader on `filename`.

    The path is user-expanded and made absolute first.  With
    `disableChunkCache` set, the file is opened read-only through the
    low-level h5py API using a file-access property list whose cache
    settings are all zero.
    """
    path = os.path.abspath(os.path.expanduser(filename))

    if disableChunkCache:
        accessProps = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
        accessProps.set_cache(0, 0, 0, 0)
        fileId = h5py.h5f.open(path,
                               flags=h5py.h5f.ACC_RDONLY,
                               fapl=accessProps)
        h5Handle = h5py.File(fileId)
    else:
        h5Handle = h5py.File(path, "r")

    return CmpH5Reader(h5Handle)
def _readCmpH5Input(self):
    """
    Open the input named by ``options.inputFilename`` and keep the reader
    in ``self._inCmpH5``: a BamReader when ``options.usingBam`` is set,
    otherwise a CmpH5Reader.
    """
    inputPath = options.inputFilename

    if options.usingBam:
        self._inCmpH5 = BamReader(inputPath)
        return

    # Record the open-object count before the cmp.h5 is opened.
    logging.debug("Before open on main process, # hdf5 objects open: %d" %
                  h5py.h5f.get_obj_count())
    self._inCmpH5 = CmpH5Reader(inputPath)
def testLazyChemistryResolution(self):
    """
    Files lacking chemistry information must still open with
    CmpH5Reader; the ChemistryLookupError is raised only when the
    chemistry is actually requested, on the reader or on an alignment.
    This lazy behavior is kept for compatibility.
    """
    legacyPath = data.getCmpH5()

    # Opening must succeed without raising.
    reader = CmpH5Reader(legacyPath)

    with assert_raises(ChemistryLookupError):
        reader.sequencingChemistry

    with assert_raises(ChemistryLookupError):
        reader[0].sequencingChemistry
def _get_control_reads(control_cmph5):
    """
    Return a tuple of len == 2:
    Position 0: (string) control name, taken from the ``FullName`` of
        reference 'ref000001'.
    Position 1: (dict) dict of string to tuple (float, int). The key is the
        control readId ("<movie name>/<hole number>"), position 0 of the
        tuple is accuracy, position 1 is read length.

    If a readId occurs more than once a warning is logged and the later
    alignment overwrites the earlier entry.

    :param control_cmph5: (str) path to control_reads.cmp.h5
    """
    control_reads = {}

    # ``with`` closes the reader on every path; the original never closed it.
    with CmpH5Reader(control_cmph5) as c:
        for ca in c:
            read_id = '%s/%d' % (ca.movieInfo.Name, ca.HoleNumber)
            if read_id in control_reads:
                log.warn(
                    'read {i} is control read and has subreads?'.format(
                        i=read_id))
            control_reads[read_id] = (ca.accuracy, ca.readLength)

        name = c.referenceInfo('ref000001').FullName

    return name, control_reads
def openIndexedAlignmentFile(fname, referenceFastaFname=None,
                             sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), requiring index capability (built-in for cmp.h5;
    requires bam.pbi index for BAM).

    The reference FASTA, if provided, must have a FASTA index (fasta.fai).

    :param fname: alignment file name; must end in "cmp.h5" or "bam".
    :param referenceFastaFname: forwarded to IndexedBamReader.
    :param sharedIndex: forwarded to whichever reader is created.
    :raises ValueError: if the suffix is neither "cmp.h5" nor "bam".
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return IndexedBamReader(fname,
                                referenceFastaFname=referenceFastaFname,
                                sharedIndex=sharedIndex)
    else:
        # Call form of raise: valid on Python 2 and Python 3 alike.
        raise ValueError("Invalid alignment file suffix")
def _extractAlignedReads(self):
    """Grab a mapping of all movie names of aligned reads to hole numbers,
    and return { Movie: set([HoleNumbers ...]) }.

    Every movie listed in the movie info table gets an entry, even when it
    has no alignments.  If the cmp.h5 is empty (IndexError or
    EmptyCmpH5Error) a warning is written to stderr and the log, and
    whatever was collected so far is returned.
    """
    alignedReads = {}
    try:
        # ``with`` closes the reader even when one of the handled
        # exceptions fires mid-way; the original skipped close() then.
        with CmpH5Reader(self.inCmpFile) as reader:
            for movie in reader.movieInfoTable.Name:
                alignedReads.setdefault(movie, set())
            for i in reader:
                alignedReads[i.movieInfo.Name].add(i.HoleNumber)
    except (IndexError, EmptyCmpH5Error):
        msg = "No aligned reads found in {x}".format(x=self.inCmpFile)
        sys.stderr.write(msg + "\n")
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(msg)
    return alignedReads
def setUp(self):
    """Build a KineticWorker wired to the sample lambda data set."""
    # Sample data lives in the 'data' directory next to this file.
    testDir = os.path.dirname(os.path.abspath(__file__))
    dataDir = os.path.join(testDir, 'data')
    lambdaFasta = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
    cmpPath = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")

    self.contigs = ReferenceUtils.loadReferenceContigs(lambdaFasta, cmpPath)
    self.ipdModel = IpdModel(self.contigs)

    # A functional KineticWorker object that can be poked at manually.
    worker = KineticWorker(self.ipdModel)
    self.kw = worker
    self.cmpH5 = CmpH5Reader(cmpPath)

    # The cmp.h5 handles are normally supplied by the Worker superclass.
    worker.caseCmpH5 = self.cmpH5
    worker.controlCmpH5 = None

    worker.options = self.getOpts()
def _mainLoop(self):
    """
    Main loop: load the reference and model, spawn the workers, open the
    cmp.h5, queue the work, then shut the workers down with ``None``
    sentinels and wait for the monitoring thread, the results queue and
    the result collector process to finish.

    Returns 0.
    """

    # See comments in ipdSummary.py
    gc.disable()

    # Load reference and IpdModel
    # self.loadReference()

    # Load reference and IpdModel
    self.loadReferenceAndModel(self.args.reference, self.args.infile)

    # Spawn workers
    self._launchSlaveProcesses()

    # cmp.h5 we're using -- use this to orchestrate the work
    # (opened only after the workers have been launched)
    self.cmph5 = CmpH5Reader(self.args.infile)
    logging.info('Generating kinetics summary for [%s]' % self.args.infile)

    # Counter is reset before any chunk is queued.
    self.workChunkCounter = 0
    self._queueChunksForReference()

    # Shutdown worker threads with None sentinels
    # (one sentinel per worker).
    for i in xrange(self.args.numWorkers):
        self._workQueue.put(None)

    for w in self._workers:
        w.join()

    # Join on the result queue and the resultsCollector process.
    # This ensures all the results are written before shutdown.
    self.monitoringThread.join()
    self._resultsQueue.join()
    self._resultCollectorProcess.join()

    logging.info("reprocessMotifSites.py finished. Exiting.")

    # Drop the reader attribute now that all results are written.
    del self.cmph5

    return 0
def __init__(self):
    """Open the paired BAM / cmp.h5 sample files and cache the first
    two alignments of each, plus clipped versions of them."""
    bamPath, cmpPath = D.getBamAndCmpH5()
    fastaPath = D.getLambdaFasta()

    self.b = PacBioBamReader(bamPath, fastaPath)
    self.c = CmpH5Reader(cmpPath)
    self.bBasic = BamReader(bamPath)

    # Note that sorting orders are not generally the same... BAM
    # sorts + alns before - alns, when there is a tie on tStart;
    # we don't do this in cmp.h5 (we next sort on tEnd).  However
    # in this file there are no ties on tStart.
    bamAlns = list(self.b)
    self.bAlns = bamAlns
    self.bFwd = bamAlns[0]
    self.bRev = bamAlns[1]

    cmpAlns = list(self.c)
    self.cAlns = cmpAlns
    self.cFwd = cmpAlns[0]
    self.cRev = cmpAlns[1]

    # Same clipping windows for both file formats.
    fwdStart, fwdEnd = 10, 60
    revStart, revEnd = 310, 360
    self.cFwdClipped = self.cFwd.clippedTo(fwdStart, fwdEnd)
    self.bFwdClipped = self.bFwd.clippedTo(fwdStart, fwdEnd)
    self.cRevClipped = self.cRev.clippedTo(revStart, revEnd)
    self.bRevClipped = self.bRev.clippedTo(revStart, revEnd)
def __init__(self):
    """Remember the sample cmp.h5 file name and open a reader on it."""
    samplePath = data.getCmpH5()
    self.h5FileName = samplePath
    self.cmpH5 = CmpH5Reader(samplePath)
help='index of reference contig (1 if single contig)') parser.add_argument( 'refPos', type=int, help='position of modified cognate, 4th column in motifs.gff') parser.add_argument('-f', dest='fwdStrand', action='store_true', help='us -f flag if + strand in motifs.gff') parser.add_argument('-r', dest='fwdStrand', action='store_false', help='us -r flag if - strand in motifs.gff') parser.add_argument( '-k', type=int, default=1, help= 'min number of bases on each side of modified base which must align in read' ) parser.add_argument('-q,--minMapQV', dest='minMapQv', type=int, default=10, help='minimum mapping QV of read') args = parser.parse_args() print( getIPD(CmpH5Reader(args.cmpH5), args.refIdx, args.refPos, args.fwdStrand, args.k, args.minMapQv))
def writeLinesFromCmph5 (cmph5, leftAnchor, rightAnchor, offsetDict):
    """Print normalized IPD values for the cleanly aligned positions of
    every alignment in a cmp.h5 file.

    For each alignment, every column whose transcript call is not "M"
    (mismatch or indel) is masked together with `leftAnchor` columns before
    it and `rightAnchor` columns after it.  The IPDs of the remaining
    columns are divided by the frame rate (``movieInfo[2]``), normalized by
    their median, and printed one per line as
    "<reference position + offset> <normalized ipd> <strand>", where strand
    is "0" for forward and "1" for reverse.

    :param cmph5: cmp.h5 input handed to CmpH5Reader.
    :param leftAnchor: number of columns to mask before each error.
    :param rightAnchor: number of columns to mask after each error.
    :param offsetDict: mapping from ``str(alignment.referenceInfo[3])`` to
        the offset added to every reference position.
    """
    # Negative anchors behave like 0, as range() did in the original loops.
    left = max(leftAnchor, 0)
    right = max(rightAnchor, 0)

    # ``with`` closes the reader; alignments are streamed instead of being
    # materialized into a list first.
    with CmpH5Reader(cmph5) as reader:
        for alignment in reader:
            fps = alignment.movieInfo[2]
            refName = str(alignment.referenceInfo[3])
            strand = str(0) if alignment.isForwardStrand else str(1)

            read_calls = alignment.transcript()
            ref_pos = np.array(list(alignment.referencePositions()))
            ipds = np.array(list(alignment.IPD())) / fps

            # Mask each error column plus its anchors.  The window is
            # clamped to the alignment: the original indexed with
            # error_id - (j + 1), which for errors near the start went
            # negative and wrongly masked columns at the END of the
            # alignment (or raised IndexError and skipped the right anchor).
            n_cols = len(read_calls)
            error_mk = np.zeros(n_cols, dtype=bool)
            for error_id, read_call in enumerate(read_calls):
                if read_call != "M":
                    lo = max(0, error_id - left)
                    hi = min(n_cols, error_id + right + 1)
                    error_mk[lo:hi] = True

            keep = ~error_mk
            ref_pos = ref_pos[keep]
            ipds = ipds[keep]

            # Nothing survived the mask: skip rather than take the median
            # of an empty array.
            if ipds.size == 0:
                continue

            ipds = ipds / np.median(ipds)
            offset = offsetDict[refName]
            for pos, ipd in zip(ref_pos, ipds):
                # %-formatting prints the same text on Python 2 and 3.
                print("%s %s %s" % (pos + offset, ipd, strand))
import numpy as np
from pbcore.io import CmpH5Reader
from GenomicConsensus import reference
from projutils import getReads
from bqcy.bqcy import run_bqcy

# Ad-hoc experiment: fetch the reads covering one reference window, then
# run run_bqcy against an artificial all-"A" template.
# NOTE(review): input paths are hard-coded to one developer's workspace.
cmpH5 = CmpH5Reader(
    '/home/nick/workspace/btry6790_project/PXO99A_ref_wo_one_copy_212kb_repeat.cmp.h5'
)
reference.loadFromFile(
    "/home/nick/workspace/btry6790_project/ref_PXO99A_genome_reference_wo_one_copy_212k_repeat/sequence/ref_PXO99A_genome_reference_wo_one_copy_212k_repeat.fasta",
    cmpH5)

tmplSeq, realTmplLen, readSeqs, qvInfo = getReads(cmpH5, reference,
                                                  (146000, 146050), 64, 100)

print("POA Consensus: " + ''.join(map(chr, tmplSeq.tolist())))

# Replace the POA template with a fake one: 50 "A" bases, zero-padded to 64.
tmplSeq = np.zeros((64), dtype=np.uint8)
# A real list is required here: len() fails on the lazy map object that
# Python 3's map() returns.
tmplOrds = [ord(base) for base in "A" * 50]
tmplSeq[:len(tmplOrds)] = tmplOrds

# Output buffer filled by run_bqcy (8 float64 slots per template position).
results = np.zeros(8 * tmplSeq.shape[0], dtype=np.float64)
origTmplScore, bestMutantScore, bestMutatedSeq = run_bqcy(
    tmplSeq, readSeqs, qvInfo, results)

print("Polished: " + ''.join(map(chr, np.asarray(bestMutatedSeq).tolist())))
print("Fake Template: " + ''.join(map(chr, np.asarray(tmplSeq).tolist())))
print(results)
def test_openFromH5File(self):
    """A CmpH5Reader built from an already-open h5py.File reports the
    sample file's version."""
    h5Handle = h5py.File(data.getCmpH5(), "r")
    reader = CmpH5Reader(h5Handle)
    EQ("1.2.0.SF", reader.version)
def __init__(self):
    """Open the sample cmp.h5 and cache its first two alignments."""
    reader = CmpH5Reader(data.getCmpH5())
    self._inCmpH5 = reader
    self.hit0 = reader[0]
    self.hit1 = reader[1]
def _openAlignments():
    """Open ``in_fn``: with CmpH5Reader when the name ends in ".cmp.h5",
    otherwise through openDataFile."""
    opener = CmpH5Reader if in_fn.endswith(".cmp.h5") else openDataFile
    return opener(in_fn)
def main(self):
    """Top-level driver.

    Parses options, peeks at the input file to configure the run, launches
    the slave processes, opens the real input, and runs ``_mainLoop``
    (optionally under cProfile or a debugger) while a monitoring thread
    watches the slaves.

    Returns -1 if the run was aborted and 0 otherwise; in debugging mode
    the result of ``pdb.runeval`` is returned directly.
    """

    # This looks scary but it's not.  Python uses reference
    # counting and has a secondary, optional garbage collector for
    # collecting garbage cycles.  Unfortunately when a cyclic GC
    # happens when a thread is calling cPickle.dumps, the
    # interpreter crashes sometimes.  See Bug 19704.  Since we
    # don't leak garbage cycles, disabling the cyclic GC is
    # essentially harmless.
    gc.disable()

    parseOptions()
    self._algorithm = self._algorithmByName(options.algorithm)
    self._setupLogging()
    # Fixed seed for the random module.
    random.seed(42)

    logging.info("h5py version: %s" % h5py.version.version)
    logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
    logging.info("ConsensusCore version: %s" %
                 (consensusCoreVersion() or "ConsensusCore unavailable"))
    logging.info("Starting.")

    atexit.register(self._cleanup)
    if options.doProfiling:
        self._makeTemporaryDirectory()

    if options.usingBam:
        logging.warn(
            "'fancyChunking' not yet available for BAM, disabling")
        options.fancyChunking = False

        # Peek at the bam file to build tables
        with BamReader(options.inputFilename) as peekCmpH5:
            logging.info("Peeking at BAM file %s" % options.inputFilename)
            logging.info("Input BAM data: numAlnHits=%d" % len(peekCmpH5))
            resolveOptions(peekCmpH5)
            self._loadReference(peekCmpH5)
            self._checkFileCompatibility(peekCmpH5)
            self._configureAlgorithm(options, peekCmpH5)
    else:
        # We need to peek at the cmp.h5 file to build the The
        # refGroupId<->refGroupFullName mapping, and to determine
        # whether the selected algorithm parameters (Quiver) are
        # compatible with the data.  But we then have to close the
        # file, and let the "real" open happen after the fork.
        with CmpH5Reader(options.inputFilename) as peekCmpH5:
            logging.info("Peeking at CmpH5 file %s" % options.inputFilename)
            logging.info("Input CmpH5 data: numAlnHits=%d" % len(peekCmpH5))
            resolveOptions(peekCmpH5)
            self._loadReference(peekCmpH5)
            self._checkFileCompatibility(peekCmpH5)
            self._configureAlgorithm(options, peekCmpH5)
            options.disableHdf5ChunkCache = self._shouldDisableChunkCache(
                peekCmpH5)
            if options.disableHdf5ChunkCache:
                logging.info(
                    "Will disable HDF5 chunk cache (large number of datasets)"
                )
        logging.debug("After peek, # hdf5 objects open: %d" %
                      h5py.h5f.get_obj_count())

    if options.dumpEvidence:
        self._setupEvidenceDumpDirectory(options.evidenceDirectory)

    # Slaves are launched before the real input is opened (see the note on
    # the peek above).
    self._launchSlaves()
    self._readCmpH5Input()

    monitoringThread = threading.Thread(target=monitorSlaves,
                                        args=(self, ))
    monitoringThread.start()

    try:
        if options.doProfiling:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(
                                options.temporaryDirectory,
                                "profile-main.out"))

        elif options.doDebugging:
            logging.info("PID: %d", os.getpid())
            # Prefer ipdb when it is installed, else the stdlib debugger.
            try:
                import ipdb as pdb
            except:
                import pdb
            return pdb.runeval("self._mainLoop()", globals(), locals())

        else:
            self._mainLoop()
    except:
        # Any failure in the main loop is turned into an abort request
        # carrying the formatted traceback.
        why = traceback.format_exc()
        self.abortWork(why)

    monitoringThread.join()

    if self._aborting:
        logging.error("Aborting")
        return -1
    else:
        logging.info("Finished.")

    if options.doProfiling:
        self._printProfiles()

    # close h5 file.
    self._inCmpH5.close()
    return 0
return (h[1][0:-1], h[0]) dh = StatsHist(dStats, dataset="rs", which="m", minValue=0.25) fh = StatsHist(fStats, dataset="rs", which="m", minValue=0.25) duh = StatsHist(dStats, dataset="rs", which="um", minValue=0.25) fuh = StatsHist(fStats, dataset="rs", which="um", minValue=0.25) ax = plt.axes plt.scatter(dh[0], dh[1], axes=ax) plt.scatter(fh[0], fh[1], axes=ax, color="red") plt.scatter(duh[0], duh[1], axes=ax, color="LightBlue") plt.scatter(fuh[0], fuh[1], axes=ax, color="pink") plt.show() dCmpR = CmpH5Reader(dcmp) mgc = numpy.array([GetSubreadGC(sr) for sr in gStats.npdata["s"]]) umgc = numpy.array([GetSubreadGC(sr) for sr in gStats.npdata["us"]]) dmgc = numpy.array([GetSubreadGC(sr) for sr in dStats.npdata["s"]]) dumgc = numpy.array([GetSubreadGC(sr) for sr in dStats.npdata["us"]]) hmgc = numpy.array([GetSubreadGC(sr) for sr in hStats.npdata["s"]]) humgc = numpy.array([GetSubreadGC(sr) for sr in hStats.npdata["us"]]) def GetLengths(subreads): return numpy.array([len(sr.basecalls()) for sr in subreads])
def _openCmpH5(self, aset_path):
    """Open `aset_path` with CmpH5Reader.

    The path is echoed to stdout before opening.

    :returns: tuple ``(reader, True)``.
    """
    # Call form: the Python 2 print statement is a syntax error on
    # Python 3, while print(x) with one argument prints the same text on
    # both.
    print(aset_path)
    return CmpH5Reader(aset_path), True