Example #1
def callConsensus():
    def makeReadAndReads(zmwsForBC):
        ccsData = filter(lambda x:x, [zmw.ccsRead for _,_,zmw in zmwsForBC if zmw])
        srData  = reduce(lambda x,y : x+y, [zmw.subreads for zmw,_,_ in
                                            zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None,None)

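        # pick a seed read: keep only reads whose length falls between the
        # lq-th and uq-th length percentiles, then return the best of those
        # according to the sort key (by default, the highest ZMW read score).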
        def getSeedRead(reads, lq = 80, uq = 90, 
                        sLambda = lambda x : -x.zmw.readScore):
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq), 
                              n.percentile(lens, uq))
            pfReads = [read for read,l in zip(reads, lens) if 
                       l >= candidateRange[0] and l <= candidateRange[1]]
            pfReads.sort(key = sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones
            seedRead = getSeedRead(ccsData, lq = 30, uq = 70,
                                   sLambda = lambda x: -x.zmw.numPasses)
            if not seedRead:
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read")
            seedRead = getSeedRead(srData)
        
        return (seedRead, srData)
    
    # check to make sure that you have the necessary dependencies,
    # e.g., the hgap script, blasr, etc.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError("Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()
    
    # subsample
    zmwsForBCs = {k:subsampleReads(v) for k,v in zmwsForBCs.items()}

    logging.info("unfiltered average zmws per barcode: %g" % 
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)
    
    logging.info("filtered average zmws per barcode: %g" % 
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: this part depends on the filenames of the CCS and input
        # fofns; it is essentially a workaround for the fact that this
        # part isn't part of the API
        ccsReaders = {movieNameFromFile(l):BasH5Reader(l) for l in 
                      open(runner.args.ccsFofn).read().splitlines()}
        
        # fill in the CCS spot.
        for k,v in zmwsForBCs.items():
            l = []
            for zmw,lZmw in v:
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw,lZmw,r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add None to the CCS spot.
        zmwsForBCs = {k:[(zmw,lZmw,None) for zmw,lZmw in v] 
                      for k,v in zmwsForBCs.iteritems()}

    readAndReads = { k:makeReadAndReads(v) for k,v in zmwsForBCs.items() }

    # remove barcodes that don't have a seed read and a set of usable reads.
    readAndReads = { k:v for k,v in readAndReads.items() if v[0] and v[1] }
   
    # generate FASTA files
    outDir = runner.args.outDir

    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seed reads to separate files
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))

        subreads = reads[1]
        
        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        for inFof, in zipFofns(runner.args.inputFofn):
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            inMovie = filter(lambda z : z.baxH5.movieName == bh5.movieName, 
                             subreads)
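            # keep only the region rows whose hole number (column 0 of the
            # Regions table) belongs to a subread from this movie.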
            holes = n.in1d(reg[:,0], n.array([a.holeNumber for a in inMovie]))
            if any(holes): 
                nreg = reg[holes,:]
            else:
                nreg = n.empty(shape = (0, reg.shape[1]), dtype = 'int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions', data = nreg, 
                                         maxshape = (None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)
        
        ofile = open('%s/region.fofn' % bcdir, 'w')
        ofile.writelines("\n".join(nfofn))
        ofile.close()
    
    ## call gcon
    outDirs  = [ (outDir, k) for k in readAndReads.keys() ]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z : z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
             bcdir = '/'.join((outDir, barcode))
             shutil.rmtree(bcdir)
Example #2
def cmpH5Merge(inFiles, outFile, referencesFile=None):

    # expand any fofns in inFiles
    expandedInFiles = []
    for fileName in inFiles:
        if fileName.endswith(".fofn"):
            expandedInFiles.extend(
                [k.strip() for k in open(fileName).readlines()])
        else:
            expandedInFiles.append(fileName)

    # input validation. This is somewhat clunky.
    inps = {_fileExists(f) for f in expandedInFiles}

    # Not sure if this is the expected behavior
    if os.path.isabs(outFile):
        outp = outFile
    else:
        outp = os.path.join(os.getcwd(), outFile)

    if outp in inps:
        raise ValueError(
            "Outfile {f} was provided as an input file.".format(f=outp))

    if referencesFile is not None:
        selectedReferences = [
            k.strip() for k in open(referencesFile).readlines()
        ]
    else:
        selectedReferences = None

    # start the analysis
    try:
        logging.debug("Processing:\n\t" + "\t\n".join(expandedInFiles))
        logging.debug("Writing to:" + str(outFile))

        inCmps = [H5.File(z, 'r') for z in expandedInFiles]
        outCmp = H5.File(outFile, 'w')

        logging.debug("Loaded input and output h5 files.")

        if not allEqual([CmpH5Format(z).VERSION for z in inCmps]):
            raise PBH5ToolsException("merge", "Different cmp.h5 versions.")

        fmt = CmpH5Format(inCmps[0])

        if not allEqual([z[fmt.REF_INFO]['MD5'].value for z in inCmps]):
            raise PBH5ToolsException("merge", "Different reference sequences.")

        # Remove cmp.h5 files which have no alignment
        inNonEmptyCmps = []
        for f in inCmps:
            alnNum = 0
            try:
                alnNum = f['/AlnInfo/AlnIndex'].shape[0]
                if alnNum > 0:
                    inNonEmptyCmps.append(f)
                else:
                    logging.warn("Skipping emtpy file: %s" % f.filename)
            except Exception:
                logging.warn("Skipping emtpy file: %s" % f.filename)

        inCmps = inNonEmptyCmps

        if not len(inCmps):
            raise PBH5ToolsException("merge", "No non-empty files to merge.")

        # check for consistency of things like barcode and edna/z score
        # datasets.
        hasBarcode = all([fmt.BARCODE_INFO in z for z in inCmps])
        extraDatasets = [
            set(
                filter(lambda x: not x == fmt.ALN_INDEX_NAME,
                       z[fmt.ALN_INFO].keys())) for z in inCmps
        ]
        extraDatasets = reduce(set.intersection, extraDatasets)

        def filterPrint(x):
            if empty(x):
                logging.warn("Skipping emtpy file: %s" % x.filename)
                return False
            else:
                return True

        inCmps = filter(filterPrint, inCmps)

        if not len(inCmps):
            raise PBH5ToolsException("merge", "No non-empty files to merge.")

        # copy REF_INFO, FILE_LOG, and BARCODE_INFO if it's there.
        outCmp.copy(inCmps[0][fmt.REF_INFO], fmt.REF_INFO)
        outCmp.copy(inCmps[0][fmt.FILE_LOG], fmt.FILE_LOG)

        if hasBarcode:
            outCmp.copy(inCmps[0][fmt.BARCODE_INFO], fmt.BARCODE_INFO)

        # top-level attributes.
        copyAttributes(inCmps[0], outCmp)
        deleteAttrIfExists(outCmp, fmt.INDEX_ATTR)

        # go through by REF_INFO_ID and select the relevant bits from each file.
        refInfoIDs = outCmp[fmt.REF_INFO]['ID'].value

        # process the movies upfront.
        umovies = processMovies(outCmp, inCmps, fmt)

        # an increment for new ALN_GROUP/ID values
        alnIDBegin = 1

        # either way you structure the loops annoyances arise.
        for cmpH5 in inCmps:
            logging.debug("Processing: %s" % cmpH5.filename)

            # we are going to map the ref ids into the globally unique
            # refInfoIDs.
            refIDMap = dict(
                zip(cmpH5[fmt.REF_GROUP_ID].value,
                    cmpH5[fmt.REF_GROUP_INFO_ID].value))

            refPathMap = dict(
                zip(cmpH5[fmt.REF_GROUP_INFO_ID].value,
                    [os.path.basename(k) for k in cmpH5[fmt.REF_GROUP_PATH]]))

            # make a map from this cmpH5's movies to the new movie ID.
            movieMap = {}
            for oid, nm in zip(cmpH5[fmt.MOVIE_INFO_ID],
                               cmpH5[fmt.MOVIE_INFO_NAME]):
                newID = [z[0] for z in umovies if z[1] == nm]
                if len(newID) == 1:
                    movieMap[oid] = newID[0]
                else:
                    raise PBH5ToolsException("merge",
                                             "Error processing movies.")

            for rID in refInfoIDs:
                if rID not in refIDMap.values():
                    logging.info("Skipping reference with no reads.")
                    continue
                if selectedReferences is not None:
                    if refPathMap[rID] not in selectedReferences:
                        continue

                # compute new reference ID.
                aIdx = cmpH5[fmt.ALN_INDEX].value
                refID = {x: y for y, x in refIDMap.iteritems()}[rID]
                refName = makeRefName(rID)

                # which reads go to this reference.
                whichReads = aIdx[:, fmt.REF_ID] == refID
                if not any(whichReads):
                    # this should be covered by the test at the top,
                    # but it is not really perfectly defined by the
                    # spec as to whether something in the ref group
                    # *has* to have alignments.
                    continue
                aIdx = aIdx[whichReads, ]
                aIdx[:, fmt.REF_ID] = rID

                # make a map between old and new IDs
                uAlnIDs = NP.unique(aIdx[:, fmt.ALN_ID])
                alnIDMap = dict(
                    zip(uAlnIDs,
                        NP.array(range(0, len(uAlnIDs))) + alnIDBegin))
                alnGroup = {k:v for k,v in zip(cmpH5[fmt.ALN_GROUP_ID].value,
                                               cmpH5[fmt.ALN_GROUP_PATH].value) if \
                                k in uAlnIDs}
                newAlnGroup = [
                    (alnIDMap[k], "/%s/%s-%d" %
                     (refName, os.path.basename(alnGroup[k]), alnIDMap[k]),
                     alnGroup[k]) for k in alnGroup.keys()
                ]

                # Set the new ALN_ID vals in the ALN_INDEX.
                aIdx[:, fmt.ALN_ID] = NP.array([
                    alnIDMap[aIdx[i, fmt.ALN_ID]]
                    for i in range(0, aIdx.shape[0])
                ])
                # Set the new MOVIE_ID vals.
                aIdx[:, fmt.MOVIE_ID] = NP.array([
                    movieMap[aIdx[i, fmt.MOVIE_ID]]
                    for i in range(0, aIdx.shape[0])
                ])

                # copy the array data.
                for (nid, newGroup, oldGroup) in newAlnGroup:
                    logging.debug("Copying: \nfrom: %s \ninto: %s" % \
                                      (oldGroup, newGroup))
                    if not os.path.dirname(newGroup) in outCmp:
                        outCmp.create_group(refName)
                    outCmp.copy(cmpH5[oldGroup],
                                outCmp[refName],
                                name=os.path.basename(newGroup))

                # increment the ALN_GROUP id offset.
                alnIDBegin = alnIDBegin + len(uAlnIDs)

                # write the adjusted alignment information.
                makeOrAppend(outCmp, fmt.ALN_INDEX, aIdx)

                # write extra datasets in the ALN_INFO group
                for extra in extraDatasets:
                    pth = '/'.join([fmt.ALN_INFO, extra])
                    logging.info("Processing extra dataset: %s" % pth)
                    makeOrAppend(outCmp, pth, cmpH5[pth].value[whichReads, ])

                # write the ALN_GROUP.
                makeOrAppend(
                    outCmp, fmt.ALN_GROUP_ID,
                    NP.array([nid for nid, a, b in newAlnGroup],
                             dtype=cmpH5[fmt.ALN_GROUP_ID].dtype))
                makeOrAppend(
                    outCmp, fmt.ALN_GROUP_PATH,
                    NP.array([npth for a, npth, b in newAlnGroup],
                             dtype=cmpH5[fmt.ALN_GROUP_PATH].dtype))

        # now depending on what references had alignments we'll make the
        # new REF_GROUP.
        uRefsWithAlignments = NP.unique(outCmp[fmt.ALN_INDEX][:, fmt.REF_ID])
        outCmp.create_dataset(fmt.REF_GROUP_ID,
                              data=uRefsWithAlignments,
                              dtype=inCmps[0][fmt.REF_GROUP_ID].dtype)
        outCmp.create_dataset(fmt.REF_GROUP_PATH,
                              data=NP.array([('/' + makeRefName(z))
                                             for z in uRefsWithAlignments]),
                              dtype=inCmps[0][fmt.REF_GROUP_PATH].dtype)
        outCmp.create_dataset(fmt.REF_GROUP_INFO_ID,
                              data=uRefsWithAlignments,
                              dtype=inCmps[0][fmt.REF_GROUP_INFO_ID].dtype)

        # reset the IDs
        outCmp[fmt.ALN_INDEX][:, fmt.ID] = range(
            1, outCmp[fmt.ALN_INDEX].shape[0] + 1)
        # reset the molecule IDs
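        # derive each new ID from the movie ID and hole number so that
        # molecule IDs from different movies do not collide after the merge.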
        outCmp[fmt.ALN_INDEX][:,fmt.MOLECULE_ID] = \
            ((NP.max(outCmp[fmt.ALN_INDEX][:,fmt.MOLECULE_ID]) *
              (outCmp[fmt.ALN_INDEX][:,fmt.MOVIE_ID] - 1)) +
             outCmp[fmt.ALN_INDEX][:,fmt.HOLE_NUMBER] + 1)

        # close the output file.
        outCmp.close()

    except Exception:
        try:
            # remove the file as it won't be correct
            if os.path.exists(outFile):
                os.remove(outFile)
        except:
            pass
        raise
Example #3
def cmpH5Merge(inFiles, outFile, referencesFile=None):
    
    # expand any fofns in inFiles
    expandedInFiles = []
    for fileName in inFiles:
        if fileName.endswith(".fofn"):
            expandedInFiles.extend([k.strip() for k in open(fileName).readlines()])
        else:
            expandedInFiles.append(fileName)

    # input validation. This is somewhat clunky.
    inps = {_fileExists(f) for f in expandedInFiles}

    # Not sure if this is the expected behavior
    if os.path.isabs(outFile):
        outp = outFile
    else:
        outp = os.path.join(os.getcwd(), outFile)

    if outp in inps:
        raise ValueError("Outfile {f} was provided as an input file.".format(f=outp))
    
    if referencesFile is not None:
        selectedReferences = [k.strip() for k in open(referencesFile).readlines()]
    else:
        selectedReferences = None

    # start the analysis
    try:
        logging.debug("Processing:\n\t" + "\t\n".join(expandedInFiles))
        logging.debug("Writing to:" + str(outFile))

        inCmps = [H5.File(z, 'r') for z in expandedInFiles]
        outCmp = H5.File(outFile, 'w')

        logging.debug("Loaded input and output h5 files.")

        if not allEqual([CmpH5Format(z).VERSION for z in inCmps]):
            raise PBH5ToolsException("merge", "Different cmp.h5 versions.")

        fmt = CmpH5Format(inCmps[0])

        if not allEqual([z[fmt.REF_INFO]['MD5'].value for z in inCmps]):
            raise PBH5ToolsException("merge", "Different reference sequences.")

        # Remove cmp.h5 files which have no alignment
        inNonEmptyCmps = []
        for f in inCmps:
            alnNum = 0
            try:
                alnNum = f['/AlnInfo/AlnIndex'].shape[0]
                if alnNum > 0:
                    inNonEmptyCmps.append(f)
                else:
                    logging.warn("Skipping emtpy file: %s" % f.filename)
            except Exception:
                logging.warn("Skipping emtpy file: %s" % f.filename)

        inCmps = inNonEmptyCmps

        if not len(inCmps):
            raise PBH5ToolsException("merge", "No non-empty files to merge.")

        # check for consistency of things like barcode and edna/z score
        # datasets.
        hasBarcode = all([ fmt.BARCODE_INFO in z for z in inCmps ])
        extraDatasets = [set(filter(lambda x : not x == fmt.ALN_INDEX_NAME,
                                    z[fmt.ALN_INFO].keys())) for z in inCmps ]
        extraDatasets = reduce(set.intersection, extraDatasets)

        def filterPrint(x):
            if empty(x):
                logging.warn("Skipping emtpy file: %s" % x.filename)
                return False
            else:
                return True
        inCmps = filter(filterPrint, inCmps)

        if not len(inCmps):
            raise PBH5ToolsException("merge", "No non-empty files to merge.")

        # copy REF_INFO, FILE_LOG, and BARCODE_INFO if it's there.
        outCmp.copy(inCmps[0][fmt.REF_INFO], fmt.REF_INFO)
        outCmp.copy(inCmps[0][fmt.FILE_LOG], fmt.FILE_LOG)

        if hasBarcode:
            outCmp.copy(inCmps[0][fmt.BARCODE_INFO], fmt.BARCODE_INFO)

        # top-level attributes.
        copyAttributes(inCmps[0], outCmp)
        deleteAttrIfExists(outCmp, fmt.INDEX_ATTR)

        # go through by REF_INFO_ID and select the relevant bits from each file.
        refInfoIDs = outCmp[fmt.REF_INFO]['ID'].value

        # process the movies upfront.
        umovies = processMovies(outCmp, inCmps, fmt)

        # an increment for new ALN_GROUP/ID values
        alnIDBegin = 1

        # either way you structure the loops annoyances arise.
        for cmpH5 in inCmps:
            logging.debug("Processing: %s" % cmpH5.filename)

            # we are going to map the ref ids into the globally unique
            # refInfoIDs.
            refIDMap = dict(zip(cmpH5[fmt.REF_GROUP_ID].value,
                                cmpH5[fmt.REF_GROUP_INFO_ID].value))
            
            refPathMap = dict(zip(cmpH5[fmt.REF_GROUP_INFO_ID].value,
                                  [os.path.basename(k) for k in cmpH5[fmt.REF_GROUP_PATH]]))

            # make a map from this cmpH5's movies to the new movie ID.
            movieMap = {}
            for oid,nm in zip(cmpH5[fmt.MOVIE_INFO_ID], cmpH5[fmt.MOVIE_INFO_NAME]):
                newID = [z[0] for z in umovies if z[1] == nm]
                if len(newID) == 1:
                    movieMap[oid] = newID[0]
                else:
                    raise PBH5ToolsException("merge", "Error processing movies.")

            for rID in refInfoIDs:
                if rID not in refIDMap.values():
                    logging.info("Skipping reference with no reads.")
                    continue
                if selectedReferences is not None:
                    if refPathMap[rID] not in selectedReferences:
                        continue


                # compute new reference ID.
                aIdx    = cmpH5[fmt.ALN_INDEX].value
                refID   = {x:y for y,x in refIDMap.iteritems()}[rID]
                refName = makeRefName(rID)

                # which reads go to this reference.
                whichReads = aIdx[:,fmt.REF_ID] == refID
                if not any(whichReads):
                    # this should be covered by the test at the top,
                    # but it is not really perfectly defined by the
                    # spec as to whether something in the ref group
                    # *has* to have alignments.
                    continue
                aIdx = aIdx[whichReads, ]
                aIdx[:,fmt.REF_ID] = rID

                # make a map between old and new IDs
                uAlnIDs  = NP.unique(aIdx[:,fmt.ALN_ID])
                alnIDMap = dict(zip(uAlnIDs, NP.array(range(0, len(uAlnIDs))) +
                                    alnIDBegin))
                alnGroup = {k:v for k,v in zip(cmpH5[fmt.ALN_GROUP_ID].value,
                                               cmpH5[fmt.ALN_GROUP_PATH].value) if \
                                k in uAlnIDs}
                newAlnGroup = [(alnIDMap[k],
                                "/%s/%s-%d" % (refName, os.path.basename(alnGroup[k]),
                                               alnIDMap[k]),
                                alnGroup[k]) for k in alnGroup.keys()]

                # Set the new ALN_ID vals in the ALN_INDEX.
                aIdx[:,fmt.ALN_ID] = NP.array([alnIDMap[aIdx[i,fmt.ALN_ID]] for i in
                                               range(0, aIdx.shape[0])])
                # Set the new MOVIE_ID vals.
                aIdx[:,fmt.MOVIE_ID] = NP.array([movieMap[aIdx[i,fmt.MOVIE_ID]] for i in
                                                 range(0, aIdx.shape[0])])

                # copy the array data.
                for (nid,newGroup,oldGroup) in newAlnGroup:
                    logging.debug("Copying: \nfrom: %s \ninto: %s" % \
                                      (oldGroup, newGroup))
                    if not os.path.dirname(newGroup) in outCmp:
                        outCmp.create_group(refName)
                    outCmp.copy(cmpH5[oldGroup], outCmp[refName],
                                name = os.path.basename(newGroup))

                # increment the ALN_GROUP id offset.
                alnIDBegin = alnIDBegin + len(uAlnIDs)

                # write the adjusted alignment information.
                makeOrAppend(outCmp, fmt.ALN_INDEX, aIdx)

                # write extra datasets in the ALN_INFO group
                for extra in extraDatasets:
                    pth = '/'.join([fmt.ALN_INFO, extra])
                    logging.info("Processing extra dataset: %s" % pth)
                    makeOrAppend(outCmp, pth, cmpH5[pth].value[whichReads,])

                # write the ALN_GROUP.
                makeOrAppend(outCmp, fmt.ALN_GROUP_ID,
                             NP.array([nid for nid,a,b in newAlnGroup],
                                      dtype = cmpH5[fmt.ALN_GROUP_ID].dtype))
                makeOrAppend(outCmp, fmt.ALN_GROUP_PATH,
                             NP.array([npth for a,npth,b in newAlnGroup],
                                      dtype = cmpH5[fmt.ALN_GROUP_PATH].dtype))

        # now depending on what references had alignments we'll make the
        # new REF_GROUP.
        uRefsWithAlignments = NP.unique(outCmp[fmt.ALN_INDEX][:,fmt.REF_ID])
        outCmp.create_dataset(fmt.REF_GROUP_ID, data = uRefsWithAlignments,
                              dtype = inCmps[0][fmt.REF_GROUP_ID].dtype)
        outCmp.create_dataset(fmt.REF_GROUP_PATH,
                              data = NP.array([('/' + makeRefName(z)) for z in
                                               uRefsWithAlignments]),
                              dtype = inCmps[0][fmt.REF_GROUP_PATH].dtype)
        outCmp.create_dataset(fmt.REF_GROUP_INFO_ID, data = uRefsWithAlignments,
                              dtype = inCmps[0][fmt.REF_GROUP_INFO_ID].dtype)

        # reset the IDs
        outCmp[fmt.ALN_INDEX][:,fmt.ID] = range(1, outCmp[fmt.ALN_INDEX].shape[0] + 1)
        # reset the molecule IDs
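        # derive each new ID from the movie ID and hole number so that
        # molecule IDs from different movies do not collide after the merge.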
        outCmp[fmt.ALN_INDEX][:,fmt.MOLECULE_ID] = \
            ((NP.max(outCmp[fmt.ALN_INDEX][:,fmt.MOLECULE_ID]) *
              (outCmp[fmt.ALN_INDEX][:,fmt.MOVIE_ID] - 1)) +
             outCmp[fmt.ALN_INDEX][:,fmt.HOLE_NUMBER] + 1)

        # close the output file.
        outCmp.close()

    except Exception:
        try:
            # remove the file as it won't be correct
            if os.path.exists(outFile):
                os.remove(outFile)
        except:
            pass
        raise
Example #4
def callConsensus():
    def makeReadAndReads(zmwsForBC):
        ccsData = filter(lambda x: x,
                         [zmw.ccsRead for _, _, zmw in zmwsForBC if zmw])
        srData = reduce(lambda x, y: x + y,
                        [zmw.subreads for zmw, _, _ in zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None, None)

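        # pick a seed read: keep only reads whose length falls between the
        # lq-th and uq-th length percentiles, then return the best of those
        # according to the sort key (by default, the highest ZMW read score).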
        def getSeedRead(reads,
                        lq=80,
                        uq=90,
                        sLambda=lambda x: -x.zmw.readScore):
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq), n.percentile(lens, uq))
            pfReads = [
                read for read, l in zip(reads, lens)
                if l >= candidateRange[0] and l <= candidateRange[1]
            ]
            pfReads.sort(key=sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones
            seedRead = getSeedRead(ccsData,
                                   lq=30,
                                   uq=70,
                                   sLambda=lambda x: -x.zmw.numPasses)
            if not seedRead:
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read")
            seedRead = getSeedRead(srData)

        return (seedRead, srData)

    # check to make sure that you have the necessary dependencies,
    # e.g., the hgap script, blasr, etc.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError(
            "Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()

    # subsample
    zmwsForBCs = {k: subsampleReads(v) for k, v in zmwsForBCs.items()}

    logging.info("unfiltered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)

    logging.info("filtered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: this part depends on the filenames of the CCS and input
        # fofns; it is essentially a workaround for the fact that this
        # part isn't part of the API
        ccsReaders = {
            movieNameFromFile(l): BasH5Reader(l)
            for l in open(runner.args.ccsFofn).read().splitlines()
        }

        # fill in the CCS spot.
        for k, v in zmwsForBCs.items():
            l = []
            for zmw, lZmw in v:
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw, lZmw, r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add None to the CCS spot.
        zmwsForBCs = {
            k: [(zmw, lZmw, None) for zmw, lZmw in v]
            for k, v in zmwsForBCs.iteritems()
        }

    readAndReads = {k: makeReadAndReads(v) for k, v in zmwsForBCs.items()}

    # remove barcodes that don't have a seed read and a set of usable reads.
    readAndReads = {k: v for k, v in readAndReads.items() if v[0] and v[1]}

    # generate FASTA files
    outDir = runner.args.outDir

    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seed reads to separate files
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))

        subreads = reads[1]

        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        for inFof, in zipFofns(runner.args.inputFofn):
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            inMovie = filter(lambda z: z.baxH5.movieName == bh5.movieName,
                             subreads)
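            # keep only the region rows whose hole number (column 0 of the
            # Regions table) belongs to a subread from this movie.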
            holes = n.in1d(reg[:, 0], n.array([a.holeNumber for a in inMovie]))
            if any(holes):
                nreg = reg[holes, :]
            else:
                nreg = n.empty(shape=(0, reg.shape[1]), dtype='int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions',
                                         data=nreg,
                                         maxshape=(None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)

        ofile = open('%s/region.fofn' % bcdir, 'w')
        ofile.writelines("\n".join(nfofn))
        ofile.close()

    ## call gcon
    outDirs = [(outDir, k) for k in readAndReads.keys()]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z: z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
            bcdir = '/'.join((outDir, barcode))
            shutil.rmtree(bcdir)