def callConsensus():
    def makeReadAndReads(zmwsForBC):
        ccsData = filter(lambda x: x,
                         [zmw.ccsRead for _, _, zmw in zmwsForBC if zmw])
        srData = reduce(lambda x, y: x + y,
                        [zmw.subreads for zmw, _, _ in zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None, None)

        def getSeedRead(reads, lq=80, uq=90,
                        sLambda=lambda x: -x.zmw.readScore):
            # keep reads whose lengths fall inside the percentile window,
            # then take the best-scoring one.
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq),
                              n.percentile(lens, uq))
            pfReads = [read for read, l in zip(reads, lens)
                       if l >= candidateRange[0] and l <= candidateRange[1]]
            pfReads.sort(key=sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones
            seedRead = getSeedRead(ccsData, lq=30, uq=70,
                                   sLambda=lambda x: -x.zmw.numPasses)
            if not seedRead:
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read.")
            seedRead = getSeedRead(srData)

        return (seedRead, srData)

    # check to make sure that you have the necessary dependencies,
    # i.e., hgap script, blasr, etc.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError(
            "Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()

    # subsample
    zmwsForBCs = {k: subsampleReads(v) for k, v in zmwsForBCs.items()}
    logging.info("unfiltered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)
    logging.info("filtered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: this part depends on the filenames of the ccs and input
        # fofns; it is essentially a workaround to the fact that this
        # isn't part of the API
        ccsReaders = {movieNameFromFile(l): BasH5Reader(l)
                      for l in open(runner.args.ccsFofn).read().splitlines()}
        # fill in the CCS spot.
        for k, v in zmwsForBCs.items():
            l = []
            for zmw, lZmw in v:
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw, lZmw, r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add None in the CCS spot.
        zmwsForBCs = {k: [(zmw, lZmw, None) for zmw, lZmw in v]
                      for k, v in zmwsForBCs.iteritems()}

    readAndReads = {k: makeReadAndReads(v) for k, v in zmwsForBCs.items()}

    # remove barcodes that don't have both a seed read and a set of
    # usable reads.
    readAndReads = {k: v for k, v in readAndReads.items() if v[0] and v[1]}

    # generate FASTA files
    outDir = runner.args.outDir
    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seed read to its own file
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName,
                                      reads[0].basecalls()))

        subreads = reads[1]

        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        for inFof, in zipFofns(runner.args.inputFofn):
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            inMovie = filter(lambda z: z.baxH5.movieName == bh5.movieName,
                             subreads)
            holes = n.in1d(reg[:, 0],
                           n.array([a.holeNumber for a in inMovie]))
            if any(holes):
                nreg = reg[holes, :]
            else:
                nreg = n.empty(shape=(0, reg.shape[1]), dtype='int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions', data=nreg,
                                         maxshape=(None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)

        ofile = open('%s/region.fofn' % bcdir, 'w')
        ofile.writelines("\n".join(nfofn))
        ofile.close()

    ## call gcon
    outDirs = [(outDir, k) for k in readAndReads.keys()]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z: z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
            bcdir = '/'.join((outDir, barcode))
            shutil.rmtree(bcdir)
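# ----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): the seed-read selection
# above keeps reads whose lengths fall between the lq-th and uq-th length
# percentiles, then returns the best-scoring survivor. A self-contained
# version follows; the (basecalls, score) read representation and
# `scoreFn` are hypothetical stand-ins for zmw.readScore / zmw.numPasses.
import numpy

def pickSeed(reads, lq=80, uq=90, scoreFn=None):
    """Pick a seed from `reads`, a list of (basecalls, score) pairs."""
    scoreFn = scoreFn or (lambda r: -r[1])
    lens = [len(r[0]) for r in reads]
    lo, hi = numpy.percentile(lens, lq), numpy.percentile(lens, uq)
    # keep only reads inside the percentile length window ...
    candidates = [r for r, l in zip(reads, lens) if lo <= l <= hi]
    # ... and take the best-scoring one (ascending sort on negated score).
    candidates.sort(key=scoreFn)
    return candidates[0] if candidates else None

# usage: seed = pickSeed(myReads)  # None when the length window is empty
# ----------------------------------------------------------------------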
def cmpH5Merge(inFiles, outFile, referencesFile=None):
    # expand any fofns in inFiles
    expandedInFiles = []
    for fileName in inFiles:
        if fileName.endswith(".fofn"):
            expandedInFiles.extend([k.strip() for k in
                                    open(fileName).readlines()])
        else:
            expandedInFiles.append(fileName)

    # input validation. This is kinda clunky
    inps = {_fileExists(f) for f in expandedInFiles}

    # resolve the output path so it can be compared against the inputs.
    # Not sure if this is the expected behavior
    if os.path.isabs(outFile):
        outp = outFile
    else:
        outp = os.path.join(os.getcwd(), outFile)

    if outp in inps:
        raise ValueError(
            "Outfile {f} was provided as an input file.".format(f=outp))

    if referencesFile is not None:
        selectedReferences = [k.strip() for k in
                              open(referencesFile).readlines()]
    else:
        selectedReferences = None

    # start the analysis
    try:
        logging.debug("Processing:\n\t" + "\n\t".join(expandedInFiles))
        logging.debug("Writing to: " + str(outFile))

        inCmps = [H5.File(z, 'r') for z in expandedInFiles]
        outCmp = H5.File(outFile, 'w')
        logging.debug("Loaded input and output h5 files.")

        if not allEqual([CmpH5Format(z).VERSION for z in inCmps]):
            raise PBH5ToolsException("merge", "Different cmp.h5 versions.")

        fmt = CmpH5Format(inCmps[0])

        if not allEqual([z[fmt.REF_INFO]['MD5'].value for z in inCmps]):
            raise PBH5ToolsException("merge",
                                     "Different reference sequences.")

        # remove cmp.h5 files which have no alignments
        inNonEmptyCmps = []
        for f in inCmps:
            alnNum = 0
            try:
                alnNum = f['/AlnInfo/AlnIndex'].shape[0]
                if alnNum > 0:
                    inNonEmptyCmps.append(f)
                else:
                    logging.warn("Skipping empty file: %s" % f.filename)
            except Exception:
                logging.warn("Skipping empty file: %s" % f.filename)
        inCmps = inNonEmptyCmps

        if not len(inCmps):
            raise PBH5ToolsException("merge", "No non-empty files to merge.")

        # check for consistency of things like barcode and edna/z score
        # datasets.
        hasBarcode = all([fmt.BARCODE_INFO in z for z in inCmps])
        extraDatasets = [set(filter(lambda x: not x == fmt.ALN_INDEX_NAME,
                                    z[fmt.ALN_INFO].keys()))
                         for z in inCmps]
        extraDatasets = reduce(set.intersection, extraDatasets)

        def filterPrint(x):
            if empty(x):
                logging.warn("Skipping empty file: %s" % x.filename)
                return False
            else:
                return True

        # second pass over the inputs to drop any remaining empty files.
        inCmps = filter(filterPrint, inCmps)

        if not len(inCmps):
            raise PBH5ToolsException("merge", "No non-empty files to merge.")

        # copy REF_INFO, FILE_LOG, and BARCODE_INFO if it's there.
        outCmp.copy(inCmps[0][fmt.REF_INFO], fmt.REF_INFO)
        outCmp.copy(inCmps[0][fmt.FILE_LOG], fmt.FILE_LOG)
        if hasBarcode:
            outCmp.copy(inCmps[0][fmt.BARCODE_INFO], fmt.BARCODE_INFO)

        # top-level attributes.
        copyAttributes(inCmps[0], outCmp)
        deleteAttrIfExists(outCmp, fmt.INDEX_ATTR)

        # go through by REF_INFO_ID and select the relevant bits from
        # each file.
        refInfoIDs = outCmp[fmt.REF_INFO]['ID'].value

        # process the movies upfront.
        umovies = processMovies(outCmp, inCmps, fmt)

        # an increment for new ALN_GROUP/ID values
        alnIDBegin = 1

        # either way you structure the loops annoyances arise.
        for cmpH5 in inCmps:
            logging.debug("Processing: %s" % cmpH5.filename)

            # map this file's ref IDs into the globally unique refInfoIDs.
            refIDMap = dict(zip(cmpH5[fmt.REF_GROUP_ID].value,
                                cmpH5[fmt.REF_GROUP_INFO_ID].value))
            refPathMap = dict(zip(cmpH5[fmt.REF_GROUP_INFO_ID].value,
                                  [os.path.basename(k) for k in
                                   cmpH5[fmt.REF_GROUP_PATH]]))

            # make a map from this cmpH5's movies to the new movie ID.
            movieMap = {}
            for oid, nm in zip(cmpH5[fmt.MOVIE_INFO_ID],
                               cmpH5[fmt.MOVIE_INFO_NAME]):
                newID = [z[0] for z in umovies if z[1] == nm]
                if len(newID) == 1:
                    movieMap[oid] = newID[0]
                else:
                    raise PBH5ToolsException("merge",
                                             "Error processing movies.")

            for rID in refInfoIDs:
                if rID not in refIDMap.values():
                    logging.info("Skipping reference with no reads.")
                    continue
                if selectedReferences is not None:
                    if refPathMap[rID] not in selectedReferences:
                        continue

                # compute new reference ID.
                aIdx = cmpH5[fmt.ALN_INDEX].value
                refID = {x: y for y, x in refIDMap.iteritems()}[rID]
                refName = makeRefName(rID)

                # which reads go to this reference.
                whichReads = aIdx[:, fmt.REF_ID] == refID
                if not any(whichReads):
                    # this should be covered by the test at the top,
                    # but it is not really perfectly defined by the
                    # spec as to whether something in the ref group
                    # *has* to have alignments.
                    continue
                aIdx = aIdx[whichReads, ]
                aIdx[:, fmt.REF_ID] = rID

                # make a map between old and new IDs
                uAlnIDs = NP.unique(aIdx[:, fmt.ALN_ID])
                alnIDMap = dict(zip(uAlnIDs,
                                    NP.array(range(0, len(uAlnIDs))) +
                                    alnIDBegin))
                alnGroup = {k: v for k, v in
                            zip(cmpH5[fmt.ALN_GROUP_ID].value,
                                cmpH5[fmt.ALN_GROUP_PATH].value)
                            if k in uAlnIDs}
                newAlnGroup = [(alnIDMap[k],
                                "/%s/%s-%d" % (refName,
                                               os.path.basename(alnGroup[k]),
                                               alnIDMap[k]),
                                alnGroup[k])
                               for k in alnGroup.keys()]

                # set the new ALN_ID vals in the ALN_INDEX.
                aIdx[:, fmt.ALN_ID] = NP.array(
                    [alnIDMap[aIdx[i, fmt.ALN_ID]]
                     for i in range(0, aIdx.shape[0])])

                # set the new MOVIE_ID vals.
                aIdx[:, fmt.MOVIE_ID] = NP.array(
                    [movieMap[aIdx[i, fmt.MOVIE_ID]]
                     for i in range(0, aIdx.shape[0])])

                # copy the array data.
                for (nid, newGroup, oldGroup) in newAlnGroup:
                    logging.debug("Copying: \nfrom: %s \ninto: %s" %
                                  (oldGroup, newGroup))
                    if not os.path.dirname(newGroup) in outCmp:
                        outCmp.create_group(refName)
                    outCmp.copy(cmpH5[oldGroup], outCmp[refName],
                                name=os.path.basename(newGroup))

                # increment the ALN_GROUP id offset.
                alnIDBegin = alnIDBegin + len(uAlnIDs)

                # write the adjusted alignment information.
                makeOrAppend(outCmp, fmt.ALN_INDEX, aIdx)

                # write extra datasets in the ALN_INFO group
                for extra in extraDatasets:
                    pth = '/'.join([fmt.ALN_INFO, extra])
                    logging.info("Processing extra dataset: %s" % pth)
                    makeOrAppend(outCmp, pth,
                                 cmpH5[pth].value[whichReads, ])

                # write the ALN_GROUP.
                makeOrAppend(outCmp, fmt.ALN_GROUP_ID,
                             NP.array([nid for nid, a, b in newAlnGroup],
                                      dtype=cmpH5[fmt.ALN_GROUP_ID].dtype))
                makeOrAppend(outCmp, fmt.ALN_GROUP_PATH,
                             NP.array([npth for a, npth, b in newAlnGroup],
                                      dtype=cmpH5[fmt.ALN_GROUP_PATH].dtype))

        # now, depending on which references had alignments, make the
        # new REF_GROUP.
        uRefsWithAlignments = NP.unique(outCmp[fmt.ALN_INDEX][:, fmt.REF_ID])
        outCmp.create_dataset(fmt.REF_GROUP_ID,
                              data=uRefsWithAlignments,
                              dtype=inCmps[0][fmt.REF_GROUP_ID].dtype)
        outCmp.create_dataset(fmt.REF_GROUP_PATH,
                              data=NP.array([('/' + makeRefName(z))
                                             for z in uRefsWithAlignments]),
                              dtype=inCmps[0][fmt.REF_GROUP_PATH].dtype)
        outCmp.create_dataset(fmt.REF_GROUP_INFO_ID,
                              data=uRefsWithAlignments,
                              dtype=inCmps[0][fmt.REF_GROUP_INFO_ID].dtype)

        # reset the IDs
        outCmp[fmt.ALN_INDEX][:, fmt.ID] = range(
            1, outCmp[fmt.ALN_INDEX].shape[0] + 1)

        # reset the molecule IDs; offsetting by movie guarantees the new
        # IDs are unique across the merged movies.
        outCmp[fmt.ALN_INDEX][:, fmt.MOLECULE_ID] = \
            ((NP.max(outCmp[fmt.ALN_INDEX][:, fmt.MOLECULE_ID]) *
              (outCmp[fmt.ALN_INDEX][:, fmt.MOVIE_ID] - 1)) +
             outCmp[fmt.ALN_INDEX][:, fmt.HOLE_NUMBER] + 1)

        # close the sucker.
        outCmp.close()
    except Exception:
        try:
            # remove the output file, as it won't be correct
            if os.path.exists(outFile):
                os.remove(outFile)
        except:
            pass
        raise
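# ----------------------------------------------------------------------
# Illustrative usage (a sketch, not from the source): cmpH5Merge is
# normally driven from the pbh5tools command line; a direct call might
# look like this. The cmp.h5 file names below are hypothetical.
def _exampleMerge():
    logging.basicConfig(level=logging.DEBUG)
    # inputs may be .cmp.h5 paths or .fofn files, which are expanded
    # first; referencesFile=None merges alignments for all references.
    cmpH5Merge(["run1.cmp.h5", "run2.cmp.h5"], "merged.cmp.h5",
               referencesFile=None)
# ----------------------------------------------------------------------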