Example No. 1
from pbcore.io import FastqWriter


class FastqEmitter(object):
    """Emits ZMW reads as FASTQ records via a FastqWriter."""

    def __init__(self, filename):
        self.writer = FastqWriter(filename)

    def emit(self, zmwRead):
        self.writer.writeRecord(zmwRead.readName, zmwRead.basecalls(),
                                zmwRead.QualityValue())
Example No. 2
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) files and save to combined_dir.
    Dumping hq|lq prefix dictionary to pickle.
    Return an instance of CombinedFiles.
    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:], read.quality)
    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()
    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
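Loading the pickle back yields the two prefix-to-directory maps under the 'HQ' and 'LQ' keys; a minimal sketch, with a hypothetical pickle path:

import cPickle

with open("hq_lq_prefix_dict.pickle", 'rb') as reader:  # hypothetical path
    prefix_dict = cPickle.load(reader)

# map each HQ prefix back to the cluster-bin directory it came from
for prefix, bin_dir in prefix_dict['HQ'].iteritems():
    print prefix, '->', bin_dir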
Example No. 3
def emitNoBCFastqs(inputFofn_filename, barcodeFofn_filename, outDir, outFile):
    # step through the bas.h5 and barcode.h5 files and emit
    # reads for each of these.
    inputFofn = open(inputFofn_filename).read().splitlines()
    barcodeFofn = open(barcodeFofn_filename).read().splitlines()
    outFastq = []

    for basFile, barcodeFile in zip(inputFofn, barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)

        # select ZMWs with no barcode call (~ inverts the boolean mask)
        msk = ~np.in1d(
            basH5.sequencingZmws, bcH5.bestDS[:, 0], assume_unique=True)

        for hn in basH5.sequencingZmws[msk]:
            zmw = basH5[hn]
            if zmw:
                reads = zmw.subreads()
                if any(reads):
                    for read in reads:
                        outFastq.append(
                            FastqRecord(read.readName, read.basecalls(),
                                        read.QualityValue()))

    with FastqWriter("%s/%s.fastq" % (outDir, outFile)) as w:
        for e in outFastq:
            w.writeRecord(e)
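The key step above is the `~np.in1d(...)` mask, which keeps only the hole numbers of ZMWs that have no entry in the barcode dataset. A toy sketch of the same masking, with made-up hole numbers:

import numpy as np

sequencing_zmws = np.array([1, 2, 3, 4, 5])
barcoded_zmws = np.array([2, 4])
msk = ~np.in1d(sequencing_zmws, barcoded_zmws, assume_unique=True)
print sequencing_zmws[msk]  # -> [1 3 5]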
Example No. 4
def split_laa_fastq(input_file_name,
                    output_file_base,
                    subreads_file_name,
                    bio_samples_by_bc=None):
    """
    Split an LAA FASTQ file into one file per barcode.
    """
    if op.getsize(input_file_name) == 0:
        return []
    records = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for rec in fastq_in:
            bc_id = re.sub("^Barcode", "", rec.id.split("_")[0])
            records[bc_id].append(rec)
    if bio_samples_by_bc is None:
        bio_samples_by_bc = {}
        with SubreadSet(subreads_file_name, strict=True) as ds:
            if ds.isBarcoded:  # pylint: disable=no-member
                bio_samples_by_bc = get_barcode_sample_mappings(ds)
    outputs = []
    for bc_id in sorted(records.keys()):
        bio_sample = bio_samples_by_bc.get(bc_id, "unknown")
        ofn = "{b}.{s}.{i}.fastq".format(b=output_file_base,
                                         s=bio_sample,
                                         i=bc_id)
        with FastqWriter(ofn) as fastq_out:
            for rec in records[bc_id]:
                fastq_out.writeRecord(rec)
        outputs.append(ofn)
    return outputs
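The barcode ID is recovered by taking the first `_`-separated field of the record ID and stripping the leading `Barcode` label; a sketch with a hypothetical LAA-style record ID:

import re

rec_id = "Barcode0--0_Cluster0_Phase0_NumReads91"  # hypothetical LAA record ID
print re.sub("^Barcode", "", rec_id.split("_")[0])  # -> 0--0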
Example No. 5
    def _write_fastx_file(cls, header, seq):
        fn = tempfile.NamedTemporaryFile(suffix=".fastq").name
        suffix = "|arrow"
        with FastqWriter(fn) as f:
            f.writeRecord("{h}{s}".format(h=header, s=suffix), seq,
                          [35] * len(seq))
        return fn
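`[35] * len(seq)` assigns a uniform QV of 35 to every base; under the standard Phred+33 FASTQ encoding, that quality line renders as a run of 'D' characters:

print chr(35 + 33)  # -> D, the Phred+33 character for QV 35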
Example No. 6
    def onStart(self):
        self.referenceBasesProcessedById = OrderedDict()
        for refId in reference.byName:
            self.referenceBasesProcessedById[refId] = 0
        self.variantsByRefId             = defaultdict(list)
        self.consensusChunksByRefId      = defaultdict(list)

        # open file writers
        self.fastaWriter = self.fastqWriter = self.gffWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())
Example No. 7
def make_fastq_inputs(records=None, ofn=None):
    if records is None:
        records = _get_fastq_records()
    if ofn is None:
        ofn = tempfile.NamedTemporaryFile(suffix=".fastq").name
    with FastqWriter(ofn) as fastq_out:
        for rec in records:
            fastq_out.writeRecord(rec)
    return ofn
Example No. 8
def combine_amplicon_analysis_files(directory):
    output_file = os.path.join(directory, 'amplicon_analysis.all.fastq')
    with FastqWriter(output_file) as handle:
        for input_file in [
                'amplicon_analysis.fastq',
                'amplicon_analysis_chimeras_noise.fastq'
        ]:
            input_path = os.path.join(directory, input_file)
            for record in FastqReader(input_path):
                handle.writeRecord(record)
    return output_file
Example No. 9
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5, ccs.bam or ccs FOFN.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
        bas_handlers = {}
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    with ContigSetReaderWrapper(in_fa) as reader, \
            FastqWriter(out_fq) as writer:
        for r in reader:
            logging.debug("Getting QVs for {name} ...".format(name=r.name))
            seqid = r.name.split(' ')[0]
            parsed_read_name = _Parsed_Read_Name(seqid)
            if fmt == FILE_FORMATS.H5:
                try:
                    bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError("Could not read {s} from {f}.".format(
                        s=seqid, f=ccs_fofn))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    parsed_read_name=parsed_read_name,
                    qv_name="QualityValue")
            elif fmt == FILE_FORMATS.BAM:
                qvs = get_qvs_from_bam(reader=qver,
                                       parsed_read_name=parsed_read_name,
                                       qv_name="QualityValue")
            else:
                assert False

            if len(r.sequence) != len(qvs):
                raise ValueError(
                    "Sequence and QVs of {r} should be the same!".format(
                        r=r.name))
            writer.writeRecord(r.name, r.sequence[:], qvs)

    if fmt == FILE_FORMATS.H5:
        for bas_file, bas_handler in bas_handlers.iteritems():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
    elif fmt == FILE_FORMATS.BAM:
        qver.close()
Example No. 10
def write_fastq(fasta_records, output_file):
    """
    Write a FastqRecord, or a list of FastqRecords, out to file
    """
    with FastqWriter(output_file) as handle:
        if isinstance(fasta_records, FastqRecord):
            handle.writeRecord(fasta_records)
        elif isinstance(fasta_records, list):
            for record in fasta_records:
                handle.writeRecord(record)
        else:
            msg = "Input Record(s) type not recognized"
            log.error(msg)
            raise TypeError(msg)
    check_output_file(output_file)
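A minimal usage sketch of the function above (record contents and the output path are illustrative):

from pbcore.io import FastqRecord

records = [FastqRecord("seq1", "ACGT", [30] * 4),
           FastqRecord("seq2", "GGCC", [25] * 4)]
write_fastq(records, "out.fastq")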
Example No. 11
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5 or ccs FOFN.
    """

    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):  # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:  # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    bas_handlers = {}

    with FastaReader(in_fa) as reader, \
            FastqWriter(out_fq) as writer:
        for r in reader:
            seqid = r.name.split(' ')[0]
            movie, hn, s_e = "", "", ""
            try:
                movie, hn, s_e = seqid.split('/')
                hn = int(hn)
            except ValueError:
                raise ValueError(
                    "{seqid} is not a valid CCS read".format(seqid=seqid))
            try:
                bas_file = qver.bas_files[movie][seqid]
                if bas_file not in bas_handlers:
                    bas_handlers[bas_file] = BasH5Reader(bas_file)
            except KeyError:
                raise IOError(
                    "Could not read {s} from input ccs fofn.".format(s=seqid))
            logging.debug("Getting QVs for {name} ...".format(name=r.name))
            qvs = get_qv_from_bas_handler(bas_handler=bas_handlers[bas_file],
                                          hn=hn,
                                          s_e=s_e,
                                          qv_name="QualityValue")
            if len(r.sequence) != len(qvs):
                raise ValueError(
                    "Sequence and QVs of {r} should be the same!".format(
                        r=r.name))
            writer.writeRecord(r.name, r.sequence, qvs)

    for bas_file, bas_handler in bas_handlers.iteritems():
        logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
        bas_handler.close()
Example No. 12
def writeSequenceRecords(filename, records, filetype=None):
    """
    Write the records out to file
    """
    fileType = filetype or getFileType(filename)
    if fileType == 'fasta':
        with FastaWriter(filename) as writer:
            for record in records:
                writer.writeRecord(record)
    elif fileType == 'fastq':
        with FastqWriter(filename) as writer:
            for record in records:
                writer.writeRecord(record)
    else:
        msg = 'Output filetype must be either FASTA or FASTQ'
        log.error(msg)
        raise TypeError(msg)
    return filename
Example No. 13
def split_laa_fastq(input_file_name, output_file_base):
    """
    Split an LAA FASTQ file into one file per barcode.
    """
    if op.getsize(input_file_name) == 0:
        return []
    records = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for rec in fastq_in:
            bc_id = rec.id.split("_")[0]
            records[bc_id].append(rec)
    outputs = []
    for bc_id in sorted(records.keys()):
        ofn = "{b}.{i}.fastq".format(b=output_file_base, i=bc_id)
        with FastqWriter(ofn) as fastq_out:
            for rec in records[bc_id]:
                fastq_out.writeRecord(rec)
        outputs.append(ofn)
    return outputs
Example No. 14
def main(parser):
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    for ccsFile in ccsFofn:
        ccsH5 = BasH5Reader(ccsFile)
        for ccsRead in ccsH5.ccsReads():
            if ccsRead.zmw.zmwName in no_barcode:
                basecalls = ccsRead.basecalls()
                if len(basecalls) >= args.minMaxInsertLength:
                    if args.fasta:
                        outh.writeRecord(
                            FastaRecord(ccsRead.zmw.zmwName,
                                        ccsRead.basecalls()))
                    else:
                        outh.writeRecord(
                            FastqRecord(ccsRead.zmw.zmwName,
                                        ccsRead.basecalls(),
                                        ccsRead.QualityValue()))
    outh.close()
Example No. 15
    def split(self, first_split=None):
        """Split `input_fastq` into smaller files each containing
        `reads_per_split` reads. Return splitted fastq."""
        split_index = 0
        self.out_fns = []
        writer = FastqWriter(self._out_fn(split_index))
        self.out_fns.append(self._out_fn(split_index))

        if first_split is None:
            first_split = self.reads_per_split
        with FastqReader(self.input_fastq) as reader:
            for ridx, r in enumerate(reader):
                if ((split_index == 0 and ridx == first_split) or (split_index > 0 and ridx % self.reads_per_split == 0)) \
                    and ridx != 0:
                    split_index += 1
                    writer.close()
                    writer = FastqWriter(self._out_fn(split_index))
                    self.out_fns.append(self._out_fn(split_index))
                writer.writeRecord(r.name, r.sequence, r.quality)

        writer.close()
        return list(self.out_fns)
Example No. 16
def main(parser):
  args = parser.parse_args()

  # Get outfile name
  if args.outFile is None:
    outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
  else:
    outfile = args.outFile
  
  # Input files
  barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
  baxFofn = (l.strip('\n') for l in args.bax_fofn)
  
  # Get the read names that are not barcoded
  no_barcode = defaultdict(set)
  for barcodeFile in barcodeFofn:
    bcH5 = BarcodeH5Reader(barcodeFile)
    for row in bcH5.bestDS:
      if row[3] / row[1] < args.minAvgBarcodeScore:
        no_barcode[bcH5.movieName].add(row[0])

  if args.fasta:
    outh = FastaWriter(outfile)
  else:
    outh = FastqWriter(outfile)

  for baxFile in baxFofn:
    baxH5 = BasH5Reader(baxFile)
    for holeNum in baxH5.sequencingZmws:
      if holeNum in no_barcode[baxH5.movieName]:
        zmw = baxH5[holeNum]
        if len(zmw.subreads) and max(len(sr.basecalls()) for sr in zmw.subreads) >= args.minMaxInsertLength:
          for subread in zmw.subreads:
            if len(subread.basecalls()) >= args.minSubreadLength:
              if args.fasta:
                outh.writeRecord(FastaRecord(subread.readName,
                                             subread.basecalls()))
              else:
                outh.writeRecord(FastqRecord(subread.readName,
                                             subread.basecalls(),
                                             subread.QualityValue()))

  outh.close()
Example No. 17
    def consolidate(self, out_prefix):
        """Consolidate ContigSet to FASTA/FASTQ file, return path to output file."""
        try:
            r0 = self.next()
        except StopIteration:
            raise ValueError("No records to consolidate")
        if isinstance(r0, (FastaRecord, IndexedFastaRecord)):
            out_fn = out_prefix + ".fasta"
            with FastaWriter(out_fn) as writer:
                writer.writeRecord(r0.name, r0.sequence[:])
                while True:
                    try:
                        r = self.next()
                    except StopIteration:
                        break
                    if not isinstance(r, (FastaRecord, IndexedFastaRecord)):
                        raise ValueError(
                            "Not able to consolidate records of mixed types.")
                    writer.writeRecord(r.name, r.sequence)
            return out_fn
        elif isinstance(r0, FastqRecord):
            out_fn = out_prefix + ".fastq"
            with FastqWriter(out_fn) as writer:
                writer.writeRecord(r0)
                while True:
                    try:
                        r = self.next()
                    except StopIteration:
                        break
                    if not isinstance(r, FastqRecord):
                        raise ValueError(
                            "Not able to consolidate records of mixed types.")
                    writer.writeRecord(r)
            return out_fn
        else:
            raise ValueError("Files must only contain FASTA/FASTQ records.")
Example No. 18
def main(parser):

    args = parser.parse_args()

    def makeFqName(bcPair):
        return '{}/{}--{}.fastq'.format(args.outDir,
                                        *[bcNames[i] for i in bcPair])

    bcNames = {
        i: rec.name
        for i, rec in enumerate(FastaReader(args.barcodeFasta))
    }
    bcNames[-1] = 'NoBC'
    bam = IndexedBamReader(args.ccsBAM)

    for bcPair in set(zip(bam.bcForward, bam.bcReverse)):
        with FastqWriter(makeFqName(bcPair)) as writer:
            for rec in bam[(bam.bcForward == bcPair[0])
                           & (bam.bcReverse == bcPair[1])]:
                header = rec.readName
                if not args.noBcQual:
                    header += ' bq=%i' % rec.bcQual
                writer.writeRecord(header, rec.read(aligned=False),
                                   rec.peer.query_qualities)
Example No. 19
def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write good collapsed isoforms."""
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input %s and output %s must be either FASTA or FASTQ." %
            (in_rep_filename, out_rep_filename))

    # then read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    rep_reader = FastaReader(in_rep_filename) if in_suffix == "fasta" \
                 else FastqReader(in_rep_filename)
    rep_writer = FastaWriter(out_rep_filename) if in_suffix == "fasta" \
                 else FastqWriter(out_rep_filename)
    for r in rep_reader:
        # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
        if r.name.split('|')[0] in good:
            rep_writer.writeRecord(r)
    rep_reader.close()
    rep_writer.close()  # close explicitly to flush buffered records

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
        AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
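Membership in `good` is tested on the first `|`-separated field of the record name, i.e. the collapsed-isoform ID:

name = "PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465"
print name.split('|')[0]  # -> PB.1.1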
Example No. 20
class ResultCollector(object):
    """
    Gathers results and writes to a file.
    """
    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        self.onStart()

        sentinelsReceived = 0
        while sentinelsReceived < options.numWorkers:
            result = self._resultsQueue.get()
            if result is None:
                sentinelsReceived += 1
            else:
                self.onResult(result)

        self.onFinish()

    def run(self):
        if options.doProfiling:
            cProfile.runctx("self._run()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(options.temporaryDirectory,
                                                  "profile-%s.out" % (self.name)))
        else:
            self._run()


    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        self.referenceBasesProcessedById = OrderedDict()
        for refId in reference.byName:
            self.referenceBasesProcessedById[refId] = 0
        self.variantsByRefId             = defaultdict(list)
        self.consensusChunksByRefId      = defaultdict(list)

        # open file writers
        self.fastaWriter = self.fastqWriter = self.gffWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())

    def onResult(self, result):
        window, cssAndVariants = result
        css, variants = cssAndVariants
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        logging.info("Analysis completed.")
        if self.fastaWriter: self.fastaWriter.close()
        if self.fastqWriter: self.fastqWriter.close()
        if self.gffWriter:   self.gffWriter.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        self.referenceBasesProcessedById[refId] += (refEnd - refStart)

    def _flushContigIfCompleted(self, window):
        refId, _, _ = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId, options.referenceWindows)
        if basesProcessed == requiredBases:
            # This contig is done, so we can dump to file and delete
            # the data structures.
            if self.gffWriter:
                self.gffWriter.writeVariants(sorted(self.variantsByRefId[refId]))
            del self.variantsByRefId[refId]

            #
            # If the user asked to analyze a window or a set of
            # windows, we output a FAST[AQ] contig per analyzed
            # window.  Otherwise we output a fasta contig per
            # reference contig.
            #
            # We try to be intelligent about naming the output
            # contigs, to include window information where applicable.
            #
            for span in reference.enumerateSpans(refId, options.referenceWindows):
                _, s, e = span
                if (s == 0) and (e == refEntry.length):
                    spanName = refName
                else:
                    spanName = refName + "_%d_%d" % (s, e)
                cssName = consensus.consensusContigName(spanName, self._algorithmName)
                # Gather just the chunks pertaining to this span
                chunksThisSpan = [ chunk for chunk in self.consensusChunksByRefId[refId]
                                   if windows.windowsIntersect(chunk.refWindow, span) ]
                css = consensus.join(chunksThisSpan)

                if self.fastaWriter:
                    self.fastaWriter.writeRecord(cssName,
                                                 css.sequence)
                if self.fastqWriter:
                    self.fastqWriter.writeRecord(cssName,
                                                 css.sequence,
                                                 css.confidence)

            del self.consensusChunksByRefId[refId]
Example No. 21
def pick_rep(isoform_filename,
             gff_filename,
             group_filename,
             output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq")
                               or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError(
                    "%s must contain either indexed FASTA files or " %
                    isoform_filename + "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                    r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # expected number of base errors: a Phred QV of i implies an
                # error probability of 10^(-i/10)
                err = sum(10**-(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
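The least-error selection rests on the Phred convention: a QV of Q corresponds to an error probability of 10^(-Q/10), so summing those probabilities over a read gives its expected number of base errors. A standalone restatement:

def expected_errors(quals):
    """Expected number of base errors for a list of Phred QVs."""
    return sum(10 ** -(q / 10.) for q in quals)

# ten bases at QV 20 (error probability 0.01 each) -> 0.1 expected errors
print expected_errors([20] * 10)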
Example No. 22
import os
import sys

from pbcore.io import BasH5Reader, FastqWriter

input_filename = sys.argv[1]
output_prefix = sys.argv[2]

bas = BasH5Reader(input_filename)

filenames = {}
writers = {}
filenames['raw'] = output_prefix + ".fastq"
filenames['subread'] = output_prefix + ".subreads.fastq"
filenames['ccs'] = output_prefix + ".ccs.fastq"
for filetype in filenames:
    if os.path.isfile(filenames[filetype]):
        exit("Error: file {:s} exists!".format(filenames[filetype]))
    else:
        writers[filetype] = FastqWriter(filenames[filetype])

for zmw in bas:
    if len(zmw.read()) > 0:
        writers['raw'].writeRecord(zmw.read().readName,
                                   zmw.read().basecalls(),
                                   zmw.read().QualityValue())

    for subread in zmw.subreads:
        if len(subread) > 0:
            writers['subread'].writeRecord(subread.readName,
                                           subread.basecalls(),
                                           subread.QualityValue())

    if zmw.ccsRead is not None:
        writers['ccs'].writeRecord(zmw.ccsRead.readName,
                                   zmw.ccsRead.basecalls(),
                                   zmw.ccsRead.QualityValue())

# flush and close all output files
for writer in writers.values():
    writer.close()
Example No. 23
    def pickup_best_clusters(self):
        """Pick up hiqh QV clusters."""
        self.add_log(
            "Picking up the best clusters according to QVs from {fs}.".format(
                fs=", ".join(self.fq_filenames)))
        a = load(open(self.final_pickle_fn))
        uc = a['uc']
        # check if the uc cids are integers
        uc_keys_are_int = type(uc.keys()[0]) is int

        polished = {}  # cid --> FastqRecord

        for fq in self.fq_filenames:
            self.add_log("Looking at arrowed fq {f}".format(f=fq))
            for r in FastqReader(fq):
                # possible ID #1: c0|arrow (a single Ice2 directory)
                # possible ID #2: b112_c0|arrow (after collecting several Ice2 directories)
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                i = cid.find('/')
                if i > 0:
                    cid = cid[:i]
                if uc_keys_are_int:
                    # uc keys are plain ints (e.g. 10 rather than 'c10'), so
                    # strip the leading 'c' and convert; IDs of form #2
                    # (b112_c0) leave the keys as strings, so no conversion
                    cid = int(cid[1:])
                polished[cid] = r

        expected_acc_dict = {}  # cid --> expected accuracy (ex: 0.99)
        good = []  # contains all the cids that are HQ

        # calculate expected accuracy given 5'/3' trimming;
        # for sequences that are shorter than the trimming, use the full length
        for cid, r in polished.iteritems():
            qv_len = len(r.quality) - self.qv_trim_5 - self.qv_trim_3
            if qv_len <= 0:
                qv_len = len(r.quality)
            q = [phred_to_qv(x) for x in r.quality]
            # slice with len(q) - qv_trim_3 so that a zero 3' trim keeps the tail
            err_sum = sum(q[self.qv_trim_5:len(q) - self.qv_trim_3])
            expected_acc_dict[cid] = 1.0 - (err_sum / float(qv_len))
            if expected_acc_dict[cid] >= self.hq_arrow_min_accuracy and \
                len(uc[cid]) >= self.hq_min_full_length_reads :
                good.append(cid)

        partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc']
        partial_uc2 = defaultdict(lambda: [])
        partial_uc2.update(partial_uc)

        if self.report_fn is not None:
            self.write_report(report_fn=self.report_fn,
                              uc=uc,
                              partial_uc=partial_uc2)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
            f=self.arrowed_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".format(
            f=self.arrowed_bad_fa))
        with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \
                FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \
                FastqWriter(self.arrowed_good_fq) as good_fq_writer, \
                FastqWriter(self.arrowed_bad_fq) as bad_fq_writer:
            for cid in polished:
                r = polished[cid]
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                    format(cid=cid,
                           flnc_num=len(uc[cid]),
                           nfl_num=len(partial_uc2[cid]),
                           read_len=len(r.sequence))
                newname = cid_with_annotation2(
                    newname, expected_acc=expected_acc_dict[cid])

                if cid in good:
                    self.add_log(
                        "processing arrowed cluster {c} --> good.".format(
                            c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence[:])
                    good_fq_writer.writeRecord(newname, r.sequence[:],
                                               r.quality)
                else:
                    self.add_log(
                        "processing arrowed cluster {c} --> bad.".format(
                            c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence[:])
                    bad_fq_writer.writeRecord(newname, r.sequence[:],
                                              r.quality)

        self.add_log("-" * 60, level=logging.INFO)
        self.add_log(
            "High-quality Arrowed consensus written " +
            "to:\n{0}\n{1}".format(self.arrowed_good_fa, self.arrowed_good_fq),
            level=logging.INFO)
        self.add_log(
            "Low-quality Arrowed consensus written " +
            "to:\n{0}\n{1}".format(self.arrowed_bad_fa, self.arrowed_bad_fq),
            level=logging.INFO)
        self.add_log("-" * 60, level=logging.INFO)
Example No. 24
    def pickup_best_clusters(self, fq_filenames):
        """Pick up hiqh QV clusters."""
        self.add_log(
            "Picking up the best clusters according to QVs from {fs}.".format(
                fs=", ".join(fq_filenames)))
        a = load(open(self.final_pickle_fn))
        uc = a['uc']
        quivered = {}

        for fq in fq_filenames:
            self.add_log("Looking at quivered fq {f}".format(f=fq))
            for r in FastqReader(fq):
                # possible ID: c0/0_1611|quiver
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                i = cid.find('/')
                if i > 0:
                    cid = cid[:i]
                cid = int(cid[1:])
                quivered[cid] = r

        good = []

        for cid, r in quivered.iteritems():
            qv_len = max(0, len(r.quality) - self.qv_trim_5 - self.qv_trim_3)
            if qv_len != 0:
                q = [phred_to_qv(x) for x in r.quality]
                # slice with len(q) - qv_trim_3 so a zero 3' trim keeps the tail
                err_sum = sum(q[self.qv_trim_5:len(q) - self.qv_trim_3])
                if 1.0 - (err_sum /
                          float(qv_len)) >= self.hq_quiver_min_accuracy:
                    good.append(cid)

        partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc']
        partial_uc2 = defaultdict(lambda: [])
        partial_uc2.update(partial_uc)

        if self.report_fn is not None:
            self.write_report(report_fn=self.report_fn,
                              uc=uc,
                              partial_uc=partial_uc2)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
            f=self.quivered_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".format(
            f=self.quivered_bad_fa))
        with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
                FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
                FastqWriter(self.quivered_good_fq) as good_fq_writer, \
                FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
            for cid in quivered:
                r = quivered[cid]
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                    format(cid=cid,
                           flnc_num=len(uc[cid]),
                           nfl_num=len(partial_uc2[cid]),
                           read_len=len(r.sequence))
                newname = cid_with_annotation(newname)

                if cid in good:
                    self.add_log(
                        "processing quivered cluster {c} --> good.".format(
                            c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence)
                    good_fq_writer.writeRecord(newname, r.sequence, r.quality)
                else:
                    self.add_log(
                        "processing quivered cluster {c} --> bad.".format(
                            c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence)
                    bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

        self.add_log("-" * 60, level=logging.INFO)
        self.add_log("High-quality Quivered consensus written " +
                     "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                            self.quivered_good_fq))
        self.add_log(
            "Low-qulality Quivered consensus written " +
            "to:\n{0}\n{1}".format(self.quivered_bad_fa, self.quivered_bad_fq))
        self.add_log("-" * 60, level=logging.INFO)
Exemplo n.º 27
0
 def __init__(self, filename):
     self.writer = FastqWriter(filename)
Example No. 25
class ResultCollector(object):
    """
    Gathers results and writes to a file.
    """
    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        self.onStart()

        sentinelsReceived = 0
        while sentinelsReceived < options.numWorkers:
            result = self._resultsQueue.get()
            if result is None:
                sentinelsReceived += 1
            else:
                self.onResult(result)

        self.onFinish()

    def run(self):
        if options.doProfiling:
            cProfile.runctx("self._run()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(
                                options.temporaryDirectory,
                                "profile-%s.out" % (self.name)))
        else:
            self._run()

    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        self.referenceBasesProcessedById = OrderedDict()
        for refId in reference.byName:
            self.referenceBasesProcessedById[refId] = 0
        self.variantsByRefId = defaultdict(list)
        self.consensusChunksByRefId = defaultdict(list)

        # open file writers
        self.fastaWriter = None
        self.fastqWriter = None
        self.gffWriter = None
        self.vcfWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())
        if options.vcfOutputFilename:
            self.vcfWriter = VariantsVcfWriter(options.vcfOutputFilename,
                                               vars(options),
                                               reference.byName.values())

    def onResult(self, result):
        window, cssAndVariants = result
        css, variants = cssAndVariants
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        logging.info("Analysis completed.")
        if self.fastaWriter: self.fastaWriter.close()
        if self.fastqWriter: self.fastqWriter.close()
        if self.gffWriter: self.gffWriter.close()
        if self.vcfWriter: self.vcfWriter.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        self.referenceBasesProcessedById[refId] += (refEnd - refStart)

    def _flushContigIfCompleted(self, window):
        refId, _, _ = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId,
                                                    options.referenceWindows)
        if basesProcessed == requiredBases:
            # This contig is done, so we can dump to file and delete
            # the data structures.
            if self.gffWriter or self.vcfWriter:
                variants = sorted(self.variantsByRefId[refId])
                if self.gffWriter:
                    self.gffWriter.writeVariants(variants)
                if self.vcfWriter:
                    self.vcfWriter.writeVariants(variants)
            del self.variantsByRefId[refId]

            #
            # If the user asked to analyze a window or a set of
            # windows, we output a FAST[AQ] contig per analyzed
            # window.  Otherwise we output a fasta contig per
            # reference contig.
            #
            # We try to be intelligent about naming the output
            # contigs, to include window information where applicable.
            #
            for span in reference.enumerateSpans(refId,
                                                 options.referenceWindows):
                _, s, e = span
                if (s == 0) and (e == refEntry.length):
                    spanName = refName
                else:
                    spanName = refName + "_%d_%d" % (s, e)
                cssName = consensus.consensusContigName(
                    spanName, self._algorithmName)
                # Gather just the chunks pertaining to this span
                chunksThisSpan = [
                    chunk for chunk in self.consensusChunksByRefId[refId]
                    if windows.windowsIntersect(chunk.refWindow, span)
                ]
                css = consensus.join(chunksThisSpan)

                if self.fastaWriter:
                    self.fastaWriter.writeRecord(cssName, css.sequence)
                if self.fastqWriter:
                    self.fastqWriter.writeRecord(cssName, css.sequence,
                                                 css.confidence)

            del self.consensusChunksByRefId[refId]
Example No. 26
def pick_rep(isoform_filename, gff_filename,
             group_filename, output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq")
                               or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError("%s must contain either indexed FASTA files or " % isoform_filename +
                              "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." % isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml": # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # expected errors: a Phred QV of i implies error probability 10^(-i/10)
                err = sum(10**-(i/10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
Exemplo n.º 30
0
 def __init__(self, filename):
     self.writer = FastqWriter(filename)
Example No. 27
    def pickup_best_clusters(self, fq_filenames):
        """Pick up hiqh QV clusters."""
        self.add_log(
            "Picking up the best clusters according to QVs from {fs}.".format(
                fs=", ".join(fq_filenames)))
        a = load(open(self.final_pickle_fn))
        uc = a['uc']
        quivered = {}

        for fq in fq_filenames:
            self.add_log("Looking at quivered fq {f}".format(f=fq))
            for r in FastqReader(fq):
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                cid = int(cid[1:])
                quivered[cid] = r

        good = []

        for cid, r in quivered.iteritems():
            q = [phred_to_qv(x) for x in r.quality]
            # slice with len(q) - qv_trim_3 so a zero 3' trim keeps the tail
            if sum(q[self.qv_trim_5:len(q) - self.qv_trim_3]) <= self.qv_max_err:
                good.append(cid)

        partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc']
        partial_uc2 = defaultdict(lambda: [])
        partial_uc2.update(partial_uc)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
            f=self.quivered_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".format(
            f=self.quivered_bad_fa))
        with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
             FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
             FastqWriter(self.quivered_good_fq) as good_fq_writer, \
             FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
            for cid in quivered:
                r = quivered[cid]
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                        format(cid=cid,
                               flnc_num=len(uc[cid]),
                               nfl_num=len(partial_uc2[cid]),
                               read_len=len(r.sequence))

                if cid in good:
                    self.add_log(
                        "processing quivered cluster {c} --> good.".format(
                            c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence)
                    good_fq_writer.writeRecord(newname, r.sequence, r.quality)
                else:
                    self.add_log(
                        "processing quivered cluster {c} --> bad.".format(
                            c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence)
                    bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

        self.add_log("-" * 60, level=logging.INFO)
        self.add_log("High-quality Quivered consensus written " +
                     "to:\n{0}\n{1}\n".format(self.quivered_good_fa,
                                              self.quivered_good_fq))
        self.add_log(
            "Low-qulality Quivered consensus written " +
            "to:\n{0}\n{1}".format(self.quivered_bad_fa, self.quivered_bad_fq))
        self.add_log("-" * 60, level=logging.INFO)