예제 #1
0
 def get_hn_range(self):
     """Record the hole-number span of every input file.

     Each path in ``self.files`` is opened with ``BasH5Reader``; a tuple made
     of the first and last entries of its ``sequencingZmws`` is appended to
     ``self.hn_range``.  One progress line per file is written to stderr.
     """
     for path in self.files:
         # Same text the former ``print >> sys.stderr`` statement produced,
         # spelled so that it runs on both Python 2 and Python 3.
         sys.stderr.write("getting holeNumber range for %s\n" % (path,))
         bas = BasH5Reader(path)
         try:
             zmws = bas.sequencingZmws
             self.hn_range.append((zmws[0], zmws[-1]))
         finally:
             # Release the file handle even if the indexing above raises
             # (e.g. a file without sequencing ZMWs).
             bas.close()
예제 #2
0
def emitNoBCFastqs(inputFofn_filename, barcodeFofn_filename, outDir, outFile):
    """Write the subreads of ZMWs without a barcode call to one FASTQ file.

    The two FOFNs are read line by line and paired positionally (the i-th
    bas.h5 path with the i-th barcode.h5 path).  For each pair, every
    sequencing ZMW whose hole number does not occur in column 0 of the
    barcode file's ``bestDS`` table contributes all of its subreads.  The
    collected records are written to ``<outDir>/<outFile>.fastq`` after all
    pairs have been processed.
    """
    # Context managers so the FOFN handles are closed deterministically.
    with open(inputFofn_filename) as handle:
        inputFofn = handle.read().splitlines()
    with open(barcodeFofn_filename) as handle:
        barcodeFofn = handle.read().splitlines()
    outFastq = []

    for basFile, barcodeFile in zip(inputFofn, barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)

        # Invert the membership mask with ``~``; unary minus on a boolean
        # array is rejected by current NumPy releases.
        msk = ~np.in1d(
            basH5.sequencingZmws, bcH5.bestDS[:, 0], assume_unique=True)

        for hn in basH5.sequencingZmws[msk]:
            zmw = basH5[hn]
            if not zmw:
                continue
            reads = zmw.subreads()
            if any(reads):
                for read in reads:
                    outFastq.append(
                        FastqRecord(read.readName, read.basecalls(),
                                    read.QualityValue()))

    with FastqWriter("%s/%s.fastq" % (outDir, outFile)) as w:
        for e in outFastq:
            w.writeRecord(e)
예제 #3
0
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5, ccs.bam or ccs FOFN.

    Every record of ``in_fa`` is written to ``out_fq`` under its full name,
    together with the "QualityValue" QVs looked up for the first
    space-delimited token of that name.

    Raises:
        IOError: the files named by ``ccs_fofn`` are neither H5 nor BAM, or
            (H5 input) a read cannot be found in the cached bas files.
        ValueError: a sequence and its QVs differ in length.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    # BasH5Reader objects opened on demand, keyed by bas file; stays empty
    # for BAM input.
    bas_handlers = {}
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[
                            parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".format(
                            s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # Only BAM remains: any other format was rejected above.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        # Runs on success and on error, so an exception raised mid-loop no
        # longer leaks the opened readers.
        if fmt == FILE_FORMATS.H5:
            for bas_file, bas_handler in bas_handlers.items():
                logging.debug(
                    "Closing {bas_file} ...".format(bas_file=bas_file))
                bas_handler.close()
        elif fmt == FILE_FORMATS.BAM:
            qver.close()
예제 #4
0
def getUnlabeledZmws():
    """Return FASTQ records for ZMWs which do not have a barcode label.

    The bas.h5 / barcode.h5 pairs come from ``zipFofns`` applied to
    ``runner.args.inputFofn`` and ``runner.args.barcodeFofn``.  A ZMW counts
    as unlabeled when its hole number is in the bas file's
    ``sequencingZmws`` but is not a key of the barcode file's
    ``labeledZmws``.

    Returns:
        list: the records produced by ``getFastqRecords`` for every
        unlabeled ZMW, concatenated in discovery order; an empty list when
        there are none (the earlier ``reduce`` without an initial value
        raised ``TypeError`` in that case).
    """
    unlabeledZmws = []

    for basFile, barcodeFile in zipFofns(runner.args.inputFofn,
                                         runner.args.barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)
        # Materialise the keys so NumPy always receives a real sequence.
        labeled = list(bcH5.labeledZmws.keys())
        sdiff = basH5.sequencingZmws[
            ~n.in1d(basH5.sequencingZmws, labeled)]
        for hn in sdiff:
            unlabeledZmws.append(basH5[hn])

    # Linear-time concatenation.
    # NOTE(review): assumes getFastqRecords returns a list (or other
    # iterable) of records per ZMW -- confirm against its definition.
    records = []
    for unlabeledZmw in unlabeledZmws:
        records.extend(getFastqRecords(unlabeledZmw))
    return records
예제 #5
0
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5 or ccs FOFN.

    ``ccs_fofn`` is treated as a single ccs.h5 file when its name ends in
    ".h5", otherwise as a FOFN listing ccs.h5 files.  Each FASTA record is
    written to ``out_fq`` with the "QualityValue" QVs fetched for the first
    space-delimited token of its name, which must have the form
    ``movie/holeNumber/start_end``.

    Raises:
        ValueError: a read name does not split into three '/'-separated
            fields with an integer hole number, or a sequence and its QVs
            differ in length.
        IOError: a read cannot be found in the cached bas files.
    """
    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):  # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:  # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    # BasH5Reader objects opened on demand, keyed by bas file.
    bas_handlers = {}

    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError(
                        "{seqid} is not a valid CCS read".format(seqid=seqid))
                try:
                    bas_file = qver.bas_files[movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError(
                        "Could not read {s} from input ccs fofn.".format(
                            s=seqid))
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    hn=hn,
                    s_e=s_e,
                    qv_name="QualityValue")
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Close every reader that was opened, including when the loop above
        # aborts with an exception.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
예제 #6
0
def StoreMapped(fileNames, alnMap, stats):
    """Append per-subread statistics from each bas.h5 file to ``stats.data``.

    For every subread of every sequencing ZMW the ZMW read score ("rs") and
    the subread length ("rl") are recorded.  Subreads whose ``readName`` is
    in ``alnMap`` additionally get their position in "rs" stored under "m",
    the alignment length and identity under "ml" / "mi", and the subread
    itself under "s".  All other subreads get their position stored under
    "um", zeros under "ml" / "mi", and the subread under "us".
    """
    data = stats.data
    for fileName in fileNames:
        reader = BasH5Reader(fileName)
        for holeNumber in reader.sequencingZmws:
            for subread in reader[holeNumber].subreads:
                data["rs"].append(reader[holeNumber].readScore)
                data["rl"].append(subread.readEnd - subread.readStart)
                # Position of the entry that was just appended to "rs".
                position = len(data["rs"]) - 1
                name = subread.readName
                if name not in alnMap:
                    data["um"].append(position)
                    data["ml"].append(0)
                    data["mi"].append(0)
                    data["us"].append(subread)
                    continue
                alignment = alnMap[name]
                data["m"].append(position)
                data["ml"].append(alignment.length)
                data["mi"].append(alignment.identity)
                data["s"].append(subread)
예제 #7
0
    def run(self):
        """Emit the reads of ``self.args.inFile`` that pass the filters.

        The input is opened with ``BasH5Reader``.  Reads are written to
        ``<prefix>.<outType>``, where the prefix is ``args.outFilePrefix``
        or, when that is empty, the movie name.  If ``args.readType`` is the
        empty string the read type is chosen from the file: 'subreads' when
        raw basecalls are present, otherwise 'ccs'.

        A read is emitted when its length is at least ``args.minLength``
        and, for CCS reads, the ZMW has at least ``args.minPasses`` passes;
        for every other read type the ZMW read score must be at least
        ``args.minReadScore``.

        Exits the process with status -1 when the file lacks the requested
        kind of data or when ``args.outType`` is neither "fasta" nor
        "fastq".
        """
        args = self.args
        inBasH5 = BasH5Reader(args.inFile)

        if not inBasH5.hasConsensusBasecalls and args.readType == "ccs":
            print("Input file %s contains no CCS reads." % args.inFile)
            sys.exit(-1)

        if not inBasH5.hasRawBasecalls and args.readType in [
                "unrolled", "subreads"
        ]:
            print("Input file %s contains no %s reads" % (args.inFile,
                                                          args.readType))
            sys.exit(-1)

        movieName = inBasH5.movieName
        outFilePrefix = args.outFilePrefix or movieName
        outFilename = "%s.%s" % (outFilePrefix, args.outType)

        if args.outType == "fasta":
            sink = FastaEmitter(outFilename)
        elif args.outType == "fastq":
            sink = FastqEmitter(outFilename)
        else:
            # Previously ``sink`` stayed unbound here and the first emit
            # failed with a NameError; fail early with a clear message.
            print("Unsupported output type %s" % args.outType)
            sys.exit(-1)

        if args.readType == '':
            # choose based on file.
            if inBasH5.hasRawBasecalls:
                readType = 'subreads'
            elif inBasH5.hasConsensusBasecalls:
                readType = 'ccs'
            else:
                print("Input bas.h5 file has neither CCS nor subread data")
                sys.exit(-1)
        else:
            readType = args.readType

        for zmwRead in self.zmwReads(inBasH5, readType):
            zmw = zmwRead.zmw
            # Only the criterion that applies to this read type is
            # evaluated, as in the original short-circuit expression.
            if readType == "ccs":
                qualityOk = zmw.numPasses >= args.minPasses
            else:
                qualityOk = zmw.readScore >= args.minReadScore

            if qualityOk and len(zmwRead) >= args.minLength:
                sink.emit(zmwRead)
예제 #8
0
def main(parser):
    """Write CCS reads of poorly barcoded ZMWs to a FASTA or FASTQ file.

    Arguments are taken from ``parser.parse_args()``.  A ZMW is selected
    when, for its row in a barcode file's ``bestDS`` table, column 3 divided
    by column 1 is below ``args.minAvgBarcodeScore``.  CCS reads of selected
    ZMWs that have at least ``args.minMaxInsertLength`` bases are written,
    named after the ZMW, to ``args.outFile`` (default 'nobarcode.fasta' or
    'nobarcode.fastq' depending on ``args.fasta``).
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (line.strip('\n') for line in args.barcode_fofn)
    ccsFofn = (line.strip('\n') for line in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): if bestDS holds integers this division truncates
            # on Python 2 -- confirm that is the intended average.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    writerClass = FastaWriter if args.fasta else FastqWriter

    # The context manager closes the output even if a read fails to load.
    with writerClass(outfile) as outh:
        for ccsFile in ccsFofn:
            ccsH5 = BasH5Reader(ccsFile)
            for ccsRead in ccsH5.ccsReads():
                zmwName = ccsRead.zmw.zmwName
                if zmwName not in no_barcode:
                    continue
                # Fetch the bases once and reuse them for the length check
                # and for the record.
                basecalls = ccsRead.basecalls()
                if len(basecalls) < args.minMaxInsertLength:
                    continue
                if args.fasta:
                    outh.writeRecord(FastaRecord(zmwName, basecalls))
                else:
                    outh.writeRecord(
                        FastqRecord(zmwName, basecalls,
                                    ccsRead.QualityValue()))
예제 #9
0
def getZmwsForBarcodes(labels=None):
    """Group (Zmw, LabeledZmw) pairs by barcode label.

    Walks the bas.h5 / barcode.h5 pairs yielded by ``zipFofns`` for
    ``runner.args.inputFofn`` and ``runner.args.barcodeFofn``.  When
    ``labels`` is truthy only barcode labels contained in it are processed.

    Returns:
        dict: barcode label -> list of ``(zmw, labeledZmw)`` tuples, where
        ``zmw`` is looked up in the bas file by the labeled ZMW's hole
        number.
    """
    grouped = {}
    fofnPairs = zipFofns(runner.args.inputFofn, runner.args.barcodeFofn)
    for basFile, barcodeFile in fofnPairs:
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)

        wanted = bcH5.barcodeLabels
        if labels:
            wanted = [lab for lab in wanted if lab in labels]
            logging.info("Processing only: %s" % ",".join(wanted))

        for label in wanted:
            for lZmw in bcH5.labeledZmwsFromBarcodeLabel(label):
                pair = (basH5[lZmw.holeNumber], lZmw)
                grouped.setdefault(label, []).append(pair)

    return grouped
예제 #10
0
def main(parser):
  """Write subreads of poorly barcoded ZMWs to a FASTA or FASTQ file.

  Arguments are taken from ``parser.parse_args()``.  A hole number is
  selected for its movie when, for its row in a barcode file's ``bestDS``
  table, column 3 divided by column 1 is below ``args.minAvgBarcodeScore``.
  For each selected ZMW whose longest subread has at least
  ``args.minMaxInsertLength`` bases, every subread with at least
  ``args.minSubreadLength`` bases is written to ``args.outFile`` (default
  'nobarcode.fasta' or 'nobarcode.fastq' depending on ``args.fasta``).
  """
  args = parser.parse_args()

  # Get outfile name
  if args.outFile is None:
    outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
  else:
    outfile = args.outFile

  # Input files
  barcodeFofn = (line.strip('\n') for line in args.barcode_fofn)
  baxFofn = (line.strip('\n') for line in args.bax_fofn)

  # Get the read names that are not barcoded
  no_barcode = defaultdict(set)
  for barcodeFile in barcodeFofn:
    bcH5 = BarcodeH5Reader(barcodeFile)
    for row in bcH5.bestDS:
      # NOTE(review): if bestDS holds integers this division truncates on
      # Python 2 -- confirm that is the intended average.
      if row[3] / row[1] < args.minAvgBarcodeScore:
        no_barcode[bcH5.movieName].add(row[0])

  writerClass = FastaWriter if args.fasta else FastqWriter

  # The context manager closes the output even if a read fails to load.
  with writerClass(outfile) as outh:
    for baxFile in baxFofn:
      baxH5 = BasH5Reader(baxFile)
      selected = no_barcode[baxH5.movieName]
      for holeNum in baxH5.sequencingZmws:
        if holeNum not in selected:
          continue
        # Fetch each subread's bases once; they are needed for the
        # longest-subread test, the per-subread test and the record.
        called = [(sr, sr.basecalls()) for sr in baxH5[holeNum].subreads]
        if not called:
          continue
        if max(len(bases) for _, bases in called) < args.minMaxInsertLength:
          continue
        for subread, bases in called:
          if len(bases) < args.minSubreadLength:
            continue
          if args.fasta:
            outh.writeRecord(FastaRecord(subread.readName, bases))
          else:
            outh.writeRecord(
                FastqRecord(subread.readName, bases, subread.QualityValue()))
예제 #11
0
 def __init__(self):
     """Open both readers and cache the entry each one holds at index 1650.

     ``V`` is a ``ZmwReadStitcher`` over the path from ``getUnalignedBam()``
     and ``B`` a ``BasH5Reader`` over the path from ``getBaxForBam()``;
     ``VZ`` / ``BZ`` are the items obtained by indexing each with 1650.
     """
     hole_number = 1650
     self.V = ZmwReadStitcher(getUnalignedBam())
     self.B = BasH5Reader(getBaxForBam())
     self.VZ = self.V[hole_number]
     self.BZ = self.B[hole_number]
0
            maxLen = l
            maxS = s
    return (float(read.subreads[maxS].basecalls().count('G') +
                  read.subreads[maxS].basecalls().count("C"))) / len(
                      read.subreads[maxS].basecalls())


#dh5 = "/net/eichler/vol20/projects/pacbio/backups/incoming/130625_MYD_eee_20kb_368/D01_1/Analysis_Results/m130626_034031_42134_c100534392550000001823079711101324_s1_p0.bas.h5"
#dsam = "/net/eichler/vol20/projects/pacbio/nobackups/results/130625_MYD_eee_20kb_368/D01_1/D.sam"

# Hard-coded inputs for this analysis: a bas.h5 movie file and a SAM file,
# both located under a "D01_1" directory.
# NOTE(review): dsam presumably holds alignments of the reads in dh5 --
# confirm against where it is consumed.
dh5 = "/mnt/pacbio/D01_1/Analysis_Results/m130626_034031_42134_c100534392550000001823079711101324_s1_p0.bas.h5"
dsam = "/mnt/pacbio_analysis/D01_1/D.sam"

from pbcore.io import BasH5Reader

# Module-level reader over dh5; the file is opened as soon as this module
# is executed.
dReader = BasH5Reader(dh5)

#
# key:
#   rs  read score
#   rl  read length
#   mi  mapped identity
#   ml  mapped length
#   m  indices of mapped reads
#   um  indices of unmapped reads
#   s  mapped subreads
#   us  unmapped subreads


class Count:
    def __init__(self):
예제 #13
0
def open_base_file(basefile_path):
    """Return a ``BasH5Reader`` opened on ``basefile_path``.

    Meant for the legacy cmp.h5 format, when a base file path was provided.
    """
    return BasH5Reader(basefile_path)
예제 #14
0
from operator import itemgetter,attrgetter
from itertools import imap, starmap, repeat,izip,ifilter
from pbcore.io import BasH5Reader
from collections import Counter

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


# Exactly one command-line argument is required: the input bas.h5 path.
if len(sys.argv) != 2:
    sys.exit("zmwProductivityHeatmap.py input.bas.h5\n")

infile = sys.argv[1]
cell = BasH5Reader(infile)


def get_prod(o):
    """Return the "Productivity" ZMW metric of ``o``."""
    return o.zmwMetric("Productivity")


# One getter per hole number; applying a getter to the reader yields the
# ZMW object stored under that hole number.
zmwgetters = [itemgetter(_hole) for _hole in cell.allSequencingZmws]
all_seq_zmws = [_getter(cell) for _getter in zmwgetters]
zmw_prods = [get_prod(_zmw) for _zmw in all_seq_zmws]

# (productivity, read length) for every sequencing ZMW.
prod_lens = [(_prod, len(_zmw.read()))
             for _prod, _zmw in zip(zmw_prods, all_seq_zmws)]

# Read lengths split by productivity class.
prod1_lens = [_length for _prod, _length in prod_lens if _prod == 1]
prod2_lens = [_length for _prod, _length in prod_lens if _prod == 2]

# holeXY of each ZMW, as stored and as plain lists.
xy = [_zmw.holeXY for _zmw in all_seq_zmws]
xyl = [list(_coords) for _coords in xy]
예제 #15
0
def mpWrapper(f):
    """Open ``f`` with ``BasH5Reader`` and pass the reader to
    ``makeBarcodeH5FromBasH5``, returning its result.
    """
    reader = BasH5Reader(f)
    return makeBarcodeH5FromBasH5(reader)
예제 #16
0
#!/usr/bin/env python

# makes fastq files from bas.h5 files

import sys
import os
from pbcore.io import BasH5Reader, FastqWriter

# Exactly two arguments are required: the bas.h5 input and an output prefix.
if len(sys.argv) != 3:
    print("Usage: {:s} bas.h5_file output_prefix".format(sys.argv[0]))
    sys.exit(1)

input_filename = sys.argv[1]
output_prefix = sys.argv[2]

bas = BasH5Reader(input_filename)

# One FASTQ output per read type, all sharing the output prefix.
filenames = {}
writers = {}
filenames['raw'] = output_prefix + ".fastq"
filenames['subread'] = output_prefix + ".subreads.fastq"
filenames['ccs'] = output_prefix + ".ccs.fastq"

# Refuse to overwrite existing output.  Every target is checked before any
# writer is created, so a collision found late cannot leave writers (and
# whatever files they create) behind for the earlier types.
for filetype in filenames:
    if os.path.isfile(filenames[filetype]):
        sys.exit("Error: file {:s} exists!".format(filenames[filetype]))

for filetype in filenames:
    writers[filetype] = FastqWriter(filenames[filetype])

for zmw in bas:
    if len(zmw.read()) > 0:
        writers['raw'].writeRecord(zmw.read().readName,
예제 #17
0
def callConsensus():
    """Build a per-barcode consensus and write it to ``consensus.fa``.

    Steps, all driven by ``runner.args``:

    1. Verify that ``pbtools.pbdagcon`` can be imported.
    2. Collect ZMWs per barcode with ``getZmwsForBarcodes`` (restricted to
       ``runner.args.barcode`` when that is set), then apply
       ``subsampleReads`` and ``filterZmws``.
    3. Attach to each (zmw, labeledZmw) pair the matching ZMW from the CCS
       readers listed in ``runner.args.ccsFofn``, or ``None`` when no CCS
       FOFN is given.
    4. Choose a seed read and the subread set per barcode; barcodes lacking
       either are dropped.
    5. For each remaining barcode write ``seed_read.fasta``,
       ``subreads.fasta``, one ``<movie>.rgn.h5`` per input file and a
       ``region.fofn`` listing them, under ``<outDir>/<barcode>``.
    6. Run ``gconFunc`` on every ``(outDir, barcode)`` tuple, serially or
       through a ``Pool`` of ``runner.args.nProcs`` workers, and write the
       non-empty results to ``<outDir>/consensus.fa``.
    7. Delete the per-barcode directories unless ``runner.args.keepTmpDir``
       is set.

    Raises:
        ImportError: ``pbtools.pbdagcon`` is not installed.
    """
    def makeReadAndReads(zmwsForBC):
        """Return ``(seedRead, subreads)`` for one barcode.

        ``zmwsForBC`` holds 3-tuples whose first element is the raw ZMW and
        whose last element is the CCS ZMW (or ``None``).  Returns
        ``(None, None)`` when there are neither subreads nor CCS reads.
        """
        # Truthy CCS reads taken from the third (CCS) slot of each tuple.
        ccsData = filter(lambda x: x,
                         [zmw.ccsRead for _, _, zmw in zmwsForBC if zmw])
        # Subreads of every truthy raw ZMW, concatenated into one sequence.
        srData = reduce(lambda x, y: x + y,
                        [zmw.subreads for zmw, _, _ in zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None, None)

        def getSeedRead(reads,
                        lq=80,
                        uq=90,
                        sLambda=lambda x: -x.zmw.readScore):
            """Pick the best read whose length lies between two percentiles.

            Reads with a length inside the [lq, uq] percentile range of all
            read lengths are sorted ascending by ``sLambda``; the first one
            is returned, or ``None`` when no read qualifies.  With the
            default key this is the read with the highest ZMW read score.
            """
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq), n.percentile(lens, uq))
            pfReads = [
                read for read, l in zip(reads, lens)
                if l >= candidateRange[0] and l <= candidateRange[1]
            ]
            pfReads.sort(key=sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones
            seedRead = getSeedRead(ccsData,
                                   lq=30,
                                   uq=70,
                                   sLambda=lambda x: -x.zmw.numPasses)
            if not seedRead:
                # No CCS read qualified: fall back to the subreads.
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read")
            seedRead = getSeedRead(srData)

        return (seedRead, srData)

    # check to make sure that you have the necessary dependencies,
    # i.e., hgap script, blasr, etc.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError(
            "Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()

    # subsample
    zmwsForBCs = {k: subsampleReads(v) for k, v in zmwsForBCs.items()}

    logging.info("unfiltered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)

    logging.info("filtered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: This part depends on the filenames of the ccs and input
        # fofns, this is essentially a workaround to the fact that the
        # part isn't part of the API
        # One CCS reader per line of the CCS FOFN, keyed by movie name.
        ccsReaders = {
            movieNameFromFile(l): BasH5Reader(l)
            for l in open(runner.args.ccsFofn).read().splitlines()
        }

        # fill in the CCS spot.
        for k, v in zmwsForBCs.items():
            l = []
            for zmw, lZmw in v:
                # CCS reader for the movie this raw ZMW came from.
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw, lZmw, r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add none to the CCS spot.
        zmwsForBCs = {
            k: [(zmw, lZmw, None) for zmw, lZmw in v]
            for k, v in zmwsForBCs.iteritems()
        }

    # barcode -> (seedRead, subreads)
    readAndReads = {k: makeReadAndReads(v) for k, v in zmwsForBCs.items()}

    # remove barcodes that don't have a seed read and a set of useable reads.
    readAndReads = {k: v for k, v in readAndReads.items() if v[0] and v[1]}

    # generate FASTA files
    outDir = runner.args.outDir

    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seeds to separate files
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))

        subreads = reads[1]

        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        for inFof, in zipFofns(runner.args.inputFofn):
            # NOTE(review): bh5 is never closed in this loop, which runs
            # once per barcode and input file -- confirm whether the handle
            # should be released.
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            # Subreads of this barcode that belong to the current movie.
            inMovie = filter(lambda z: z.baxH5.movieName == bh5.movieName,
                             subreads)
            # Region rows whose first column matches one of their holes.
            holes = n.in1d(reg[:, 0], n.array([a.holeNumber for a in inMovie]))
            if any(holes):
                nreg = reg[holes, :]
            else:
                # No matching rows: keep an empty table of the same width.
                nreg = n.empty(shape=(0, reg.shape[1]), dtype='int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions',
                                         data=nreg,
                                         maxshape=(None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)

        ofile = open('%s/region.fofn' % bcdir, 'w')
        # NOTE(review): writelines() receives one joined string here, so it
        # iterates it character by character; the output matches write()
        # and has no trailing newline.
        ofile.writelines("\n".join(nfofn))
        ofile.close()

    ## call gcon
    outDirs = [(outDir, k) for k in readAndReads.keys()]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        # NOTE(review): the pool is not closed or joined afterwards.
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z: z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
            bcdir = '/'.join((outDir, barcode))
            shutil.rmtree(bcdir)