Python BasH5Reader.BasH5Reader примеры, pbcore.io.BasH5Reader.BasH5Reader Python примеры использования

Пример #1

0

Показать файл

Файл: basQV_parallel.py Проект: yimsea/cDNA_primer

 def get_hn_range(self):
     """Record the first and last sequencing hole number of every input file.

     For each path in ``self.files`` a ``BasH5Reader`` is opened and the
     pair ``(sequencingZmws[0], sequencingZmws[-1])`` is appended to
     ``self.hn_range``, in file order.  One progress line per file is
     written to stderr.

     The reader is closed as soon as the two hole numbers have been read,
     so handles do not accumulate across files.
     """
     for path in self.files:
         sys.stderr.write("getting holeNumber range for %s\n" % (path,))
         bas = BasH5Reader(path)
         try:
             zmws = bas.sequencingZmws
             self.hn_range.append((zmws[0], zmws[-1]))
         finally:
             # Only the two hole numbers are kept, so the file handle can
             # be released immediately.
             bas.close()

Пример #2

0

Показать файл

def emitNoBCFastqs(inputFofn_filename, barcodeFofn_filename, outDir, outFile):
    """Write a FASTQ of subreads from ZMWs that have no barcode call.

    The two FOFNs are read line by line and paired positionally: the i-th
    bas.h5 file is matched with the i-th barcode.h5 file.  For every
    sequencing ZMW whose hole number does not occur in the first column of
    the barcode file's ``bestDS`` table, all subreads are emitted.

    Args:
        inputFofn_filename: path of a text file listing bas.h5 files.
        barcodeFofn_filename: path of a text file listing barcode.h5 files.
        outDir: directory that receives the output file.
        outFile: output base name; ``.fastq`` is appended.
    """
    with open(inputFofn_filename) as handle:
        inputFofn = handle.read().splitlines()
    with open(barcodeFofn_filename) as handle:
        barcodeFofn = handle.read().splitlines()
    outFastq = []

    for basFile, barcodeFile in zip(inputFofn, barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)

        # Select hole numbers that are NOT in the barcode table.  `~` is
        # the boolean negation operator; unary minus on a boolean array is
        # rejected by current NumPy releases.
        msk = ~np.in1d(
            basH5.sequencingZmws, bcH5.bestDS[:, 0], assume_unique=True)

        for hn in basH5.sequencingZmws[msk]:
            zmw = basH5[hn]
            if not zmw:
                continue
            reads = zmw.subreads()
            if any(reads):
                for read in reads:
                    outFastq.append(
                        FastqRecord(read.readName, read.basecalls(),
                                    read.QualityValue()))

    with FastqWriter("%s/%s.fastq" % (outDir, outFile)) as w:
        for e in outFastq:
            w.writeRecord(e)

Пример #3

0

Показать файл

Файл: IceUtils.py Проект: palfalvi/pbtranscript

def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5, ccs.bam or ccs FOFN.

    Args:
        in_fa: FASTA (or contig set) whose records are converted.
        ccs_fofn: a ccs.h5 / ccs.bam file, or a FOFN listing such files.
        out_fq: path of the FASTQ file to write.

    Raises:
        IOError: if the input format is neither H5 nor BAM, or if a read
            cannot be located in the indexed bas.h5 files.
        ValueError: if a record's sequence and QV lengths differ.

    Every reader opened here is closed on exit, including when the
    conversion fails part-way through.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    # BasH5Reader objects keyed by bas file, opened on first use.  Stays
    # empty for BAM input.
    bas_handlers = {}
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug(
                    "Getting QVs for {name} ...".format(name=r.name))
                # The read id is the first whitespace-delimited token.
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[
                            parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".format(
                            s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # Only BAM can reach this branch: every other format
                    # was rejected before the loop started.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        # Release the input handles whether or not the conversion succeeded.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
        if fmt == FILE_FORMATS.BAM:
            qver.close()

Пример #4

0

Показать файл

def getUnlabeledZmws():
    """Return FASTQ records for ZMWs which do not have a barcode label.

    Walks the paired bas.h5 / barcode.h5 files named by
    ``runner.args.inputFofn`` and ``runner.args.barcodeFofn``, collects
    every sequencing ZMW whose hole number is not a key of the barcode
    file's ``labeledZmws``, and concatenates the per-ZMW results of
    ``getFastqRecords``.

    Returns:
        The concatenated records; an empty list when no ZMW is unlabeled.
        Assumes ``getFastqRecords`` returns lists -- TODO confirm.
    """
    unlabeledZmws = []

    for basFile, barcodeFile in zipFofns(runner.args.inputFofn,
                                         runner.args.barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)
        sequencing = basH5.sequencingZmws
        # Materialise the keys so in1d always receives a real sequence.
        labeled = list(bcH5.labeledZmws.keys())
        sdiff = sequencing[~n.in1d(sequencing, labeled)]
        for hn in sdiff:
            unlabeledZmws.append(basH5[hn])

    # The [] initializer lets reduce handle the "nothing unlabeled" case
    # instead of raising TypeError on an empty sequence.
    return reduce(
        lambda x, y: x + y,
        [getFastqRecords(unlabeledZmw) for unlabeledZmw in unlabeledZmws],
        [])

Пример #5

0

Показать файл

Файл: IceUtils.py Проект: yimsea/cDNA_primer

def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5 or ccs FOFN.

    Args:
        in_fa: FASTA file whose records are named ``movie/holeNumber/s_e``
            (anything after the first space in the name is ignored).
        ccs_fofn: a single ``.h5`` file, or a FOFN listing ccs.h5 files.
        out_fq: path of the FASTQ file to write.

    Raises:
        ValueError: if a read name is not of the form
            ``movie/holeNumber/s_e`` with an integer hole number, or if a
            record's sequence and QV lengths differ.
        IOError: if a read cannot be located in the indexed bas.h5 files.

    Every BasH5Reader opened here is closed on exit, including when the
    conversion fails part-way through.
    """
    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):  # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:  # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    # BasH5Reader objects keyed by bas file, opened on first use.
    bas_handlers = {}

    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                # The read id is the first whitespace-delimited token.
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError(
                        "{seqid} is not a valid CCS read".format(seqid=seqid))
                try:
                    bas_file = qver.bas_files[movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError(
                        "Could not read {s} from input ccs fofn.".format(
                            s=seqid))
                logging.debug(
                    "Getting QVs for {name} ...".format(name=r.name))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    hn=hn,
                    s_e=s_e,
                    qv_name="QualityValue")
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Release the bas.h5 handles whether or not the conversion succeeded.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()

Пример #6

0

Показать файл

def StoreMapped(fileNames, alnMap, stats):
    """Append per-subread statistics for every bas.h5 file to ``stats.data``.

    For each subread of each sequencing ZMW the following lists in
    ``stats.data`` are extended:

    * ``"rs"`` -- read score of the subread's ZMW
    * ``"rl"`` -- subread length (``readEnd - readStart``)
    * ``"ml"`` / ``"mi"`` -- the alignment's length and identity when the
      subread name is a key of ``alnMap``, otherwise ``0``
    * ``"m"`` / ``"um"`` -- index into ``"rs"`` of mapped / unmapped subreads
    * ``"s"`` / ``"us"`` -- the mapped / unmapped subread objects

    Args:
        fileNames: iterable of bas.h5 paths.
        alnMap: mapping from subread name to an object with ``length`` and
            ``identity`` attributes.
        stats: object whose ``data`` attribute is a dict of lists with the
            keys listed above.
    """
    for fileName in fileNames:
        reader = BasH5Reader(fileName)
        data = stats.data
        for holeNumber in reader.sequencingZmws:
            # Index the reader once per ZMW rather than once per subread.
            zmw = reader[holeNumber]
            for s in zmw.subreads:
                data["rs"].append(zmw.readScore)
                data["rl"].append(s.readEnd - s.readStart)
                # Position of the entry just appended to "rs".
                index = len(data["rs"]) - 1
                name = s.readName
                if name in alnMap:
                    aln = alnMap[name]
                    data["m"].append(index)
                    data["ml"].append(aln.length)
                    data["mi"].append(aln.identity)
                    data["s"].append(s)
                else:
                    data["um"].append(index)
                    data["ml"].append(0)
                    data["mi"].append(0)
                    data["us"].append(s)

Пример #7

0

Показать файл

    def run(self):
        inBasH5 = BasH5Reader(self.args.inFile)

        if not inBasH5.hasConsensusBasecalls and self.args.readType == "ccs":
            print "Input file %s contains no CCS reads." % self.args.inFile
            sys.exit(-1)

        if not inBasH5.hasRawBasecalls and self.args.readType in [
                "unrolled", "subreads"
        ]:
            print "Input file %s contains no %s reads" % (self.args.inFile,
                                                          self.args.readType)
            sys.exit(-1)

        movieName = inBasH5.movieName
        outFilePrefix = self.args.outFilePrefix or movieName
        outFilename = "%s.%s" % (outFilePrefix, self.args.outType)

        if self.args.outType == "fasta":
            sink = FastaEmitter(outFilename)
        elif self.args.outType == "fastq":
            sink = FastqEmitter(outFilename)

        if self.args.readType == '':
            # choose based on file.
            if inBasH5.hasRawBasecalls:
                readType = 'subreads'
            elif inBasH5.hasConsensusBasecalls:
                readType = 'ccs'
            else:
                print "Input bas.h5 file has neither CCS nor subread data"
                sys.exit(-1)
        else:
            readType = self.args.readType

        for zmwRead in self.zmwReads(inBasH5, readType):
            zmw = zmwRead.zmw
            #
            # Emit read if filters pass
            #
            if ((readType != "ccs" or zmw.numPasses >= self.args.minPasses) and
                (readType == "ccs" or zmw.readScore >= self.args.minReadScore)
                    and (len(zmwRead) >= self.args.minLength)):

                sink.emit(zmwRead)

Пример #8

0

Показать файл

Файл: nobarcodeCCS.py Проект: raj347/JGI-MiSeq

def main(parser):
    """Write the CCS reads of ZMWs whose barcode score is too low.

    A ZMW counts as "not barcoded" when, for its row in a barcode file's
    ``bestDS`` table, ``row[3] / row[1]`` is below
    ``args.minAvgBarcodeScore``.  CCS reads of those ZMWs that are at least
    ``args.minMaxInsertLength`` bases long are written as FASTA
    (``args.fasta``) or FASTQ.

    Args:
        parser: argument parser; ``parse_args()`` must provide ``outFile``,
            ``fasta``, ``barcode_fofn``, ``ccs_fofn``,
            ``minAvgBarcodeScore`` and ``minMaxInsertLength``.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files: one path per line of the two FOFN iterables.
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names ("movie/holeNumber") that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        movieName = bcH5.movieName
        for row in bcH5.bestDS:
            # NOTE(review): if both columns are integers this is floor
            # division under Python 2 -- confirm the intended average.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (movieName, row[0]))

    outh = FastaWriter(outfile) if args.fasta else FastqWriter(outfile)
    try:
        for ccsFile in ccsFofn:
            ccsH5 = BasH5Reader(ccsFile)
            for ccsRead in ccsH5.ccsReads():
                zmwName = ccsRead.zmw.zmwName
                if zmwName not in no_barcode:
                    continue
                # Fetch the bases once; they are used for both the length
                # filter and the output record.
                basecalls = ccsRead.basecalls()
                if len(basecalls) < args.minMaxInsertLength:
                    continue
                if args.fasta:
                    outh.writeRecord(FastaRecord(zmwName, basecalls))
                else:
                    outh.writeRecord(
                        FastqRecord(zmwName, basecalls,
                                    ccsRead.QualityValue()))
    finally:
        # Close the output even if reading an input file fails.
        outh.close()

Пример #9

0

Показать файл

def getZmwsForBarcodes(labels=None):
    """dictionary of pbcore.io.Zmw and LabeledZmw indexed by barcode
    label

    Walks the paired bas.h5 / barcode.h5 files named by
    ``runner.args.inputFofn`` and ``runner.args.barcodeFofn``.

    Args:
        labels: optional collection of barcode labels; when given (and
            non-empty) only those labels are processed.

    Returns:
        dict mapping each barcode label that has at least one labeled ZMW
        to a list of ``(zmw, labeledZmw)`` tuples.
    """
    zmwsForBCs = {}
    for basFile, barcodeFile in zipFofns(runner.args.inputFofn,
                                         runner.args.barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)
        allLabs = bcH5.barcodeLabels
        if labels:
            allLabs = [x for x in allLabs if x in labels]
            logging.info("Processing only: %s" % ",".join(allLabs))
        for label in allLabs:
            for lZmw in bcH5.labeledZmwsFromBarcodeLabel(label):
                zmw = basH5[lZmw.holeNumber]
                # setdefault creates the list on first use with a single
                # hash lookup; a label with no ZMWs never gets a key.
                zmwsForBCs.setdefault(label, []).append((zmw, lZmw))

    return zmwsForBCs

Пример #10

0

Показать файл

Файл: nobarcodeSubreads.py Проект: raj347/JGI-MiSeq

def main(parser):
    """Write the subreads of ZMWs whose barcode score is too low.

    A ZMW counts as "not barcoded" when, for its row in a barcode file's
    ``bestDS`` table, ``row[3] / row[1]`` is below
    ``args.minAvgBarcodeScore``.  For such a ZMW, if its longest subread
    has at least ``args.minMaxInsertLength`` bases, every subread of at
    least ``args.minSubreadLength`` bases is written as FASTA
    (``args.fasta``) or FASTQ.

    Args:
        parser: argument parser; ``parse_args()`` must provide ``outFile``,
            ``fasta``, ``barcode_fofn``, ``bax_fofn``,
            ``minAvgBarcodeScore``, ``minMaxInsertLength`` and
            ``minSubreadLength``.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files: one path per line of the two FOFN iterables.
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    baxFofn = (l.strip('\n') for l in args.bax_fofn)

    # Hole numbers that are not barcoded, grouped by movie name
    no_barcode = defaultdict(set)
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): if both columns are integers this is floor
            # division under Python 2 -- confirm the intended average.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode[bcH5.movieName].add(row[0])

    outh = FastaWriter(outfile) if args.fasta else FastqWriter(outfile)
    try:
        for baxFile in baxFofn:
            baxH5 = BasH5Reader(baxFile)
            unbarcoded = no_barcode[baxH5.movieName]
            for holeNum in baxH5.sequencingZmws:
                if holeNum not in unbarcoded:
                    continue
                subreads = baxH5[holeNum].subreads
                if not len(subreads):
                    continue
                # Fetch each subread's bases once; they feed both length
                # filters and the output record.
                called = [(sr, sr.basecalls()) for sr in subreads]
                longest = max(len(bases) for _, bases in called)
                if longest < args.minMaxInsertLength:
                    continue
                for subread, bases in called:
                    if len(bases) < args.minSubreadLength:
                        continue
                    if args.fasta:
                        outh.writeRecord(
                            FastaRecord(subread.readName, bases))
                    else:
                        outh.writeRecord(
                            FastqRecord(subread.readName, bases,
                                        subread.QualityValue()))
    finally:
        # Close the output even if reading an input file fails.
        outh.close()

Пример #11

0

Показать файл

 def __init__(self):
     """Open the BAM-based and bax.h5-based readers and fetch one ZMW from each.

     ``V``/``VZ`` hold the ZmwReadStitcher and its entry for hole 1650;
     ``B``/``BZ`` hold the BasH5Reader and its entry for the same hole.
     """
     hole_number = 1650
     stitcher = ZmwReadStitcher(getUnalignedBam())
     bax_reader = BasH5Reader(getBaxForBam())
     self.V = stitcher
     self.B = bax_reader
     self.VZ = stitcher[hole_number]
     self.BZ = bax_reader[hole_number]

Пример #12

0

Показать файл

            maxLen = l
            maxS = s
    return (float(read.subreads[maxS].basecalls().count('G') +
                  read.subreads[maxS].basecalls().count("C"))) / len(
                      read.subreads[maxS].basecalls())


#dh5 = "/net/eichler/vol20/projects/pacbio/backups/incoming/130625_MYD_eee_20kb_368/D01_1/Analysis_Results/m130626_034031_42134_c100534392550000001823079711101324_s1_p0.bas.h5"
#dsam = "/net/eichler/vol20/projects/pacbio/nobackups/results/130625_MYD_eee_20kb_368/D01_1/D.sam"

# Hard-coded input locations: the bas.h5 file opened below and a SAM file
# path (dsam is only defined here; its use is outside this excerpt).
dh5 = "/mnt/pacbio/D01_1/Analysis_Results/m130626_034031_42134_c100534392550000001823079711101324_s1_p0.bas.h5"
dsam = "/mnt/pacbio_analysis/D01_1/D.sam"

from pbcore.io import BasH5Reader

# Module-level reader over the bas.h5 file; opened as an import-time
# side effect.
dReader = BasH5Reader(dh5)

#
# key:
#   rs  read score
#   rl  read length
#   mi  mapped identity
#   ml  mapped length
#   m  indices of mapped reads
#   um  indices of unmapped reads
#   s  mapped subreads
#   us  unmapped subreads


class Count:
    def __init__(self):

Пример #13

0

Показать файл

def open_base_file(basefile_path):
    """Return a BasH5Reader opened on ``basefile_path``.

    Meant for the legacy cmp.h5 format when a basefile path was provided;
    the function itself opens the file unconditionally.
    """
    return BasH5Reader(basefile_path)

Пример #14

0

Показать файл

from operator import itemgetter,attrgetter
from itertools import imap, starmap, repeat,izip,ifilter
from pbcore.io import BasH5Reader
from collections import Counter

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


# Exactly one argument is required: the input bas.h5 file.  sys.exit with a
# string prints it to stderr and exits with status 1.
if not len(sys.argv) == 2:
    sys.exit("zmwProductivityHeatmap.py input.bas.h5\n")

infile = sys.argv[1]
cell = BasH5Reader(infile)

# Fetch the "Productivity" metric of a ZMW object.
get_prod = lambda o : getattr(o, "zmwMetric")("Productivity")

# Build one itemgetter per hole number and apply each to `cell`, i.e.
# all_seq_zmws == [cell[hn] for hn in cell.allSequencingZmws].
# `apply` is a Python 2 builtin: apply(f, [cell]) calls f(cell).
zmwgetters = map(itemgetter, cell.allSequencingZmws)
all_seq_zmws = list(starmap(apply,zip(zmwgetters, repeat([cell]))))
zmw_prods = map(get_prod, all_seq_zmws)


# (productivity, read length) pairs, one per sequencing ZMW.  Under
# Python 2 zip returns a list, which is why it can be filtered twice below.
prod_lens = zip(zmw_prods, imap(lambda z: len(z.read()), all_seq_zmws))

# Read lengths of the ZMWs with productivity 1 and 2 respectively.  The
# tuple-unpacking lambda parameters are Python 2-only syntax.
prod1_lens = map(itemgetter(1), ifilter(lambda (p,l): p==1, prod_lens))
prod2_lens = map(itemgetter(1), ifilter(lambda (p,l): p==2, prod_lens))

# holeXY of every ZMW, then each one converted to a plain list.
xy = map(attrgetter("holeXY"), all_seq_zmws)
xyl = map(list,xy)

Пример #15

0

Показать файл

def mpWrapper(f):
    """Open the bas.h5 file ``f`` and return makeBarcodeH5FromBasH5's result for it."""
    reader = BasH5Reader(f)
    return makeBarcodeH5FromBasH5(reader)

Пример #16

0

Показать файл

#!/usr/bin/env python

# makes fastq files from bas.h5 files

import sys
import os
from pbcore.io import BasH5Reader, FastqWriter

if len(sys.argv) != 3:
    print "Usage: {:s} bas.h5_file output_prefix".format(sys.argv[0])
    exit(1)

input_filename = sys.argv[1]
output_prefix = sys.argv[2]

bas = BasH5Reader(input_filename)

filenames = {}
writers = {}
filenames['raw'] = output_prefix + ".fastq"
filenames['subread'] = output_prefix + ".subreads.fastq"
filenames['ccs'] = output_prefix + ".ccs.fastq"
for filetype in filenames:
    if os.path.isfile(filenames[filetype]):
        exit("Error: file {:s} exists!".format(filenames[filetype]))
    else:
        writers[filetype] = FastqWriter(filenames[filetype])

for zmw in bas:
    if len(zmw.read()) > 0:
        writers['raw'].writeRecord(zmw.read().readName,

Пример #17

0

Показать файл

def callConsensus():
    """Build a consensus sequence per barcode and write them to one FASTA.

    Steps, all driven by ``runner.args``:

    1. Collect ZMWs per barcode (optionally only ``runner.args.barcode``),
       then subsample and filter them.
    2. Attach the matching CCS ZMW to every entry when a CCS FOFN is
       given, otherwise ``None``.
    3. Choose a seed read and the subread set for each barcode; barcodes
       lacking either are dropped.
    4. For each barcode write ``seed_read.fasta``, ``subreads.fasta``, one
       ``*.rgn.h5`` region file per input movie and ``region.fofn`` into
       ``<outDir>/<barcode>``.
    5. Run ``gconFunc`` on every barcode directory (in a process pool when
       ``nProcs`` is not 1) and write the non-empty results to
       ``<outDir>/consensus.fa``.
    6. Remove the per-barcode directories unless ``keepTmpDir`` is set.

    Raises:
        ImportError: if ``pbtools.pbdagcon`` cannot be imported.
    """
    def makeReadAndReads(zmwsForBC):
        """Return ``(seedRead, subreads)`` for one barcode.

        ``zmwsForBC`` is a list of ``(zmw, labeledZmw, ccsZmw)`` tuples;
        ``ccsZmw`` may be ``None``.  Returns ``(None, None)`` when there
        are neither subreads nor CCS reads.
        """
        # CCS reads come from the third tuple element, subreads from the
        # first; falsy entries are skipped in both cases.
        ccsData = filter(lambda x: x,
                         [zmw.ccsRead for _, _, zmw in zmwsForBC if zmw])
        srData = reduce(lambda x, y: x + y,
                        [zmw.subreads for zmw, _, _ in zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None, None)

        def getSeedRead(reads,
                        lq=80,
                        uq=90,
                        sLambda=lambda x: -x.zmw.readScore):
            """Pick the best read whose length lies between two percentiles.

            Reads with length in ``[percentile(lq), percentile(uq)]`` are
            sorted by ``sLambda`` (ascending; the default key puts the
            highest read score first) and the first one is returned, or
            ``None`` when no read falls in the range.
            """
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq), n.percentile(lens, uq))
            pfReads = [
                read for read, l in zip(reads, lens)
                if l >= candidateRange[0] and l <= candidateRange[1]
            ]
            pfReads.sort(key=sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones
            seedRead = getSeedRead(ccsData,
                                   lq=30,
                                   uq=70,
                                   sLambda=lambda x: -x.zmw.numPasses)
            if not seedRead:
                # No CCS read in the middle length range: fall back to
                # the subreads.
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read")
            seedRead = getSeedRead(srData)

        return (seedRead, srData)

    # check to make sure that you have the necessary dependencies,
    # i.e., hgap script, blasr, etc.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError(
            "Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()

    # subsample
    zmwsForBCs = {k: subsampleReads(v) for k, v in zmwsForBCs.items()}

    logging.info("unfiltered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)

    logging.info("filtered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: This part depends on the filenames of the ccs and input
        # fofns, this is essentially a workaround to the fact the the
        # part isn't part of the API
        # One CCS reader per movie, keyed by the movie name derived from
        # each file name listed in the CCS FOFN.
        ccsReaders = {
            movieNameFromFile(l): BasH5Reader(l)
            for l in open(runner.args.ccsFofn).read().splitlines()
        }

        # fill in the CCS spot.
        for k, v in zmwsForBCs.items():
            l = []
            for zmw, lZmw in v:
                # Look up the CCS reader for the movie this ZMW came
                # from, then the same hole number in it.
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw, lZmw, r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add none to the CCS spot.
        zmwsForBCs = {
            k: [(zmw, lZmw, None) for zmw, lZmw in v]
            for k, v in zmwsForBCs.iteritems()
        }

    readAndReads = {k: makeReadAndReads(v) for k, v in zmwsForBCs.items()}

    # remove barcodes that don't have a seed read and a set of useable reads.
    readAndReads = {k: v for k, v in readAndReads.items() if v[0] and v[1]}

    # generate FASTA files
    outDir = runner.args.outDir

    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seeds to separate files
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))

        subreads = reads[1]

        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        # zipFofns is called with a single FOFN here, so each item is
        # unpacked as a 1-tuple.
        for inFof, in zipFofns(runner.args.inputFofn):
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            # Subreads of this barcode that belong to the current movie.
            inMovie = filter(lambda z: z.baxH5.movieName == bh5.movieName,
                             subreads)
            # Rows of the region table whose first column is one of those
            # subreads' hole numbers.
            holes = n.in1d(reg[:, 0], n.array([a.holeNumber for a in inMovie]))
            if any(holes):
                nreg = reg[holes, :]
            else:
                # No rows selected: write an empty table with the same
                # number of columns.
                nreg = n.empty(shape=(0, reg.shape[1]), dtype='int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions',
                                         data=nreg,
                                         maxshape=(None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)

        # NOTE(review): writelines receives one joined string, so the file
        # gets the names separated by newlines with no trailing newline.
        ofile = open('%s/region.fofn' % bcdir, 'w')
        ofile.writelines("\n".join(nfofn))
        ofile.close()

    ## call gcon
    outDirs = [(outDir, k) for k in readAndReads.keys()]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        # NOTE(review): the pool is not closed or joined within this
        # function.
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z: z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
            bcdir = '/'.join((outDir, barcode))
            shutil.rmtree(bcdir)

Python BasH5Reader.BasH5Reader примеры использования