예제 #1
0
    def run(self):
        """Subset reads based on read annotation and subset rules."""
        infoMsg = "Extracting reads from {f} based on ".format(f=self.inFN)
        infoMsg += "rules(FullLength={fl}, nonChimeric={nc}).".format(
                   fl="true" if self.rules.FL != 0 else "false",
                   nc="true" if self.rules.nonChimeric != 0 else "false")
        logging.info(infoMsg)

        if not self.printReadLengthOnly:
            with FastaReader(self.inFN) as reader, \
                    FastaWriter(self.outFN) as writer:
                for r in reader:
                    #print >> sys.stderr, r.name, self.ignore_polyA
                    annotation = ReadAnnotation.fromString(r.name,
                                                           self.ignore_polyA)
                    if self.satisfy(annotation, self.rules):
                        writer.writeRecord(r.name, r.sequence)
        else:  # print read length only, dont print read names and sequences
            with FastaReader(self.inFN) as reader, \
                    open(self.outFN, 'w') as writer:
                for r in reader:
                    annotation = ReadAnnotation.fromString(r.name,
                                                           self.ignore_polyA)
                    if self.satisfy(annotation, self.rules):
                        writer.write("{rl}\n".format(rl=len(r.sequence)))
예제 #2
0
    def run(self):
        """Subset reads based on read annotation and subset rules."""
        infoMsg = "Extracting reads from {f} based on ".format(f=self.inFN)
        infoMsg += "rules(FullLength={fl}, nonChimeric={nc}).".format(
            fl="true" if self.rules.FL != 0 else "false",
            nc="true" if self.rules.nonChimeric != 0 else "false")
        logging.info(infoMsg)

        if not self.printReadLengthOnly:
            with FastaReader(self.inFN) as reader, \
                    FastaWriter(self.outFN) as writer:
                for r in reader:
                    #print >> sys.stderr, r.name, self.ignore_polyA
                    annotation = ReadAnnotation.fromString(
                        r.name, self.ignore_polyA)
                    if self.satisfy(annotation, self.rules):
                        writer.writeRecord(r.name, r.sequence)
        else:  # print read length only, dont print read names and sequences
            with FastaReader(self.inFN) as reader, \
                    open(self.outFN, 'w') as writer:
                for r in reader:
                    annotation = ReadAnnotation.fromString(
                        r.name, self.ignore_polyA)
                    if self.satisfy(annotation, self.rules):
                        writer.write("{rl}\n".format(rl=len(r.sequence)))
예제 #3
0
    def _updateChimeraInfo(self,
                           suspicous_hits,
                           in_read_fn,
                           out_nc_fn,
                           out_c_fn,
                           primer_report_fn,
                           write_report_header=True):
        """
        in_read_fn --- a fasta of full-length reads or a fasta of
                       non-full-length reads.
        For each full-length read in in_read_fn FASTA file, detect whether
        it is chimeric or not, and write its annotation to
        primer_report_fn.
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        logging.debug(
            "Update chimera info for reads in {f} ".format(f=in_read_fn))
        logging.debug(
            "Write primer report to {rpt}".format(rpt=primer_report_fn))

        num_nc, num_c, num_nc_bases, num_c_bases = 0, 0, 0, 0
        with FastaReader(in_read_fn) as reader, \
                FastaWriter(out_nc_fn) as writer, \
                FastaWriter(out_c_fn) as writer_chimera, \
                open(primer_report_fn, 'w') as reporter:
            if write_report_header:
                reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    r.name, ignore_polyA=self.ignore_polyA)
                if readid not in suspicous_hits:  # Non-chimeric reads
                    # Primer of a primer-trimmed read can not be None.
                    # assert(annotation.primer is not None)
                    annotation.chimera = 0
                    num_nc += 1
                    num_nc_bases += len(r.sequence)
                    writer.writeRecord(annotation.toAnnotation(), r.sequence)
                else:  # chimeric reads
                    annotation.chimera = 1
                    num_c += 1
                    num_c_bases += len(r.sequence)
                    writer_chimera.writeRecord(annotation.toAnnotation(),
                                               r.sequence)

                reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
            return (num_nc, num_c, num_nc_bases, num_c_bases)
예제 #4
0
    def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_nc_fn,
                           out_c_fn, primer_report_fn,
                           write_report_header=True):
        """
        in_read_fn --- a fasta of full-length reads or a fasta of
                       non-full-length reads.
        For each full-length read in in_read_fn FASTA file, detect whether
        it is chimeric or not, and write its annotation to
        primer_report_fn.
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        logging.debug("Update chimera info for reads in {f} ".
                      format(f=in_read_fn))
        logging.debug("Write primer report to {rpt}".
                      format(rpt=primer_report_fn))

        num_nc, num_c, num_nc_bases, num_c_bases = 0, 0, 0, 0
        with FastaReader(in_read_fn) as reader, \
                FastaWriter(out_nc_fn) as writer, \
                FastaWriter(out_c_fn) as writer_chimera, \
                open(primer_report_fn, 'w') as reporter:
            if write_report_header:
                reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(r.name,
                                                       ignore_polyA=self.ignore_polyA)
                if readid not in suspicous_hits:  # Non-chimeric reads
                    # Primer of a primer-trimmed read can not be None.
                    # assert(annotation.primer is not None)
                    annotation.chimera = 0
                    num_nc += 1
                    num_nc_bases += len(r.sequence)
                    writer.writeRecord(annotation.toAnnotation(),
                                       r.sequence)
                else:  # chimeric reads
                    annotation.chimera = 1
                    num_c += 1
                    num_c_bases += len(r.sequence)
                    writer_chimera.writeRecord(annotation.toAnnotation(),
                                               r.sequence)

                reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
            return (num_nc, num_c, num_nc_bases, num_c_bases)
예제 #5
0
    def test_satisfy(self):
        """Test function satisfy()."""
        inFN = op.join(self.testDir, "data/test_subset.fa")
        reads = []
        with FastaReader(inFN) as reader:
            reads = [x for x in reader]

        rules = SubsetRules(1, 1) # Full-length, non-chimeric
        obj = ReadsSubsetExtractor("in", "out", rules, True)

        ans = [ReadAnnotation.fromString(r.name) for r in reads]
        res = [obj.satisfy(an, rules) for an in ans]
        expected = [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1]
        self.assertTrue(res == expected)
예제 #6
0
    def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn,
                           out_flc_fn, primer_report_fl_fn):
        """
        in_read_fn --- a fasta of full-length reads
        For each full-length read in in_read_fn FASTA file, detect whether
        it is chimeric or not, and write its annotation to
        primer_report_fl_fn.
        """
        logging.info("Update chimera info to reads annotations " +
                     "in the output FASTA file and the primer report.")

        with FastaReader(in_read_fn) as reader, \
             FastaWriter(out_flnc_fn) as writer, \
             FastaWriter(out_flc_fn) as writer_chimera, \
             open(primer_report_fl_fn, 'w') as reporter:
            reporter.write("\t".join(ReadAnnotation.fieldsNames()) + "\n")
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(r.name, ignore_polyA=self.ignore_polyA)
                if readid not in suspicous_hits:  # Non-chimeric reads
                    # Primer of a primer-trimmed read can not be None.
                    # assert(annotation.primer is not None)
                    annotation.chimera = 0
                    assert(annotation.isFullLength)
                    self.summary.num_flnc += 1
                    self.summary.num_flnc_bases += len(r.sequence)
                    writer.writeRecord(annotation.toAnnotation(),
                                       r.sequence)
                else:  # chimeric reads
                    annotation.chimera = 1
                    self.summary.num_flc += 1
                    writer_chimera.writeRecord(annotation.toAnnotation(),
                                               r.sequence)

                reporter.write(annotation.toReportRecord() + "\n")
예제 #7
0
    def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn,
                           out_flc_fn, primer_report_fl_fn):
        """
        in_read_fn --- a fasta of full-length reads
        For each full-length read in in_read_fn FASTA file, detect whether
        it is chimeric or not, and write its annotation to
        primer_report_fl_fn.
        """
        logging.info("Update chimera info to reads annotations " +
                     "in the output FASTA file and the primer report.")

        with FastaReader(in_read_fn) as reader, \
             FastaWriter(out_flnc_fn) as writer, \
             FastaWriter(out_flc_fn) as writer_chimera, \
             open(primer_report_fl_fn, 'w') as reporter:
            reporter.write("\t".join(ReadAnnotation.fieldsNames()) + "\n")
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    r.name, ignore_polyA=self.ignore_polyA)
                if readid not in suspicous_hits:  # Non-chimeric reads
                    # Primer of a primer-trimmed read can not be None.
                    # assert(annotation.primer is not None)
                    annotation.chimera = 0
                    assert (annotation.isFullLength)
                    self.summary.num_flnc += 1
                    self.summary.num_flnc_bases += len(r.sequence)
                    writer.writeRecord(annotation.toAnnotation(), r.sequence)
                else:  # chimeric reads
                    annotation.chimera = 1
                    self.summary.num_flc += 1
                    writer_chimera.writeRecord(annotation.toAnnotation(),
                                               r.sequence)

                reporter.write(annotation.toReportRecord() + "\n")
예제 #8
0
    def _trimBarCode(self, reads_fn, out_fl_reads_fn, out_nfl_reads_fn,
                     primer_report_nfl_fn,
                     best_of_front, best_of_back, primer_indices,
                     min_seq_len, min_score, change_read_id,
                     ignore_polyA):
        """Trim bar code from reads in 'reads_fn', annotate each read,
        indicating:
            whether its 5' primer, 3' primer and polyA tail are seen,
            start positions of its 5' primer, 3' primer and polyA tail,
            and primer info.
        , save non-full-length reads to 'out_nfl_reads_fn',
        , save full-length reads to 'out_fl_reads_fn', which can later be
        used in chimera detection
        , write primer info of nfl reads to _primer_report_nfl_fn.

        Note that chimera detection is not necessary for nfl reads, but
        is required for fl reads. So we only write primer info for nfl here
        and will write primer info for fl reads when chimera detection
        is done.

        best_of_front/Back: {read_id: {primer_name:DOMRecord}}
        min_seq_len: minimum length to output a read.
        min_score: minimum score to output a read.
        change_read_id: if True, change read ids to 'movie/zmw/start_end'.
        """
        logging.info("Trim bar code away from reads.")
        logging.debug("Writing full-length trimmed reads to {f}".
                      format(f=out_fl_reads_fn))
        logging.debug("Writing non-full-length trimmed reads to {f}".
                      format(f=out_nfl_reads_fn))
        logging.debug("Writing primer reports before chimera detection to {f}".
                      format(f=primer_report_nfl_fn))

        with FastaReader(reads_fn) as fareader, \
                FastaWriter(out_nfl_reads_fn) as nfl_fawriter, \
                FastaWriter(out_fl_reads_fn) as fl_fawriter, \
                open(primer_report_nfl_fn, 'w') as reporter:
            for read in fareader:
                self.summary.num_reads += 1  # number of ROI reads
                pbread = PBRead(read)
                logging.debug("Pick up best primer combo for {r}".
                              format(r=read.name))
                primerIndex, strand, fw, rc = self._pickBestPrimerCombo(
                    best_of_front[read.name], best_of_back[read.name],
                    primer_indices, min_score)
                logging.debug("read={0}\n".format(read.name) +
                              "primer={0} strand={1} fw={2} rc={3}".
                              format(primerIndex, strand, fw, rc))

                if fw is None and rc is None:
                    # No primer seen in this sequence, classified
                    # as non-full-length
                    newName = pbread.name
                    if change_read_id:
                        newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                                  m=pbread.movie, z=pbread.zmw,
                                  s1=pbread.start, e1=pbread.end,
                                  isccs=("_CCS" if pbread.isCCS else ""))
                    annotation = ReadAnnotation(ID=newName)
                    # Write reports of nfl reads
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
                    if len(read.sequence) >= min_seq_len:
                        # output non-full-length reads to nfl.trimmed.fasta
                        nfl_fawriter.writeRecord(annotation.toAnnotation(),
                                                 read.sequence)
                        self.summary.num_nfl += 1
                    else:
                        self.summary.num_filtered_short_reads += 1
                    continue
                seq = read.sequence if strand == "+" else revcmp(read.sequence)
                five_end, three_start = None, None
                if fw is not None:
                    five_end = fw.sEnd
                    self.summary.num_5_seen += 1
                if rc is not None:
                    three_start = len(seq) - rc.sEnd
                    self.summary.num_3_seen += 1

                s, e = pbread.start, pbread.end
                # Try to find polyA tail in read
                polyAPos = self._findPolyA(seq, three_start=three_start)
                if polyAPos >= 0:  # polyA found
                    seq = seq[:polyAPos]
                    e1 = s + polyAPos if strand == "+" else e - polyAPos
                    self.summary.num_polyA_seen += 1
                elif three_start is not None:  # polyA not found
                    seq = seq[:three_start]
                    e1 = s + three_start if strand == "+" else e - three_start
                else:
                    e1 = e if strand == "+" else s

                if five_end is not None:
                    seq = seq[five_end:]
                    s1 = s + five_end if strand == "+" else e - five_end
                else:
                    s1 = s if strand == "+" else e

                newName = pbread.name
                if change_read_id:
                    newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                        m=pbread.movie, z=pbread.zmw, s1=s1, e1=e1,
                        isccs=("_CCS" if pbread.isCCS else ""))
                # Create an annotation
                annotation = ReadAnnotation(ID=newName, strand=strand,
                                            fiveend=five_end, polyAend=polyAPos,
                                            threeend=three_start, primer=primerIndex,
                                            ignore_polyA=ignore_polyA)

                # Write reports for nfl reads
                if annotation.isFullLength is not True:
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")

                if len(seq) >= min_seq_len:
                    if annotation.isFullLength is True:
                        # Write long full-length reads
                        fl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_fl += 1
                    else:
                        # Write long non-full-length reads.
                        nfl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_nfl += 1
                else:
                    self.summary.num_filtered_short_reads += 1
예제 #9
0
    def _trimBarCode(self, reads_fn, out_fl_reads_fn, out_nfl_reads_fn,
                     primer_report_nfl_fn,
                     best_of_front, best_of_back, primer_names,
                     min_seq_len, min_score, change_read_id,
                     ignore_polyA, keep_primer):
        """Trim bar code from reads in 'reads_fn', annotate each read,
        indicating:
            whether its 5' primer, 3' primer and polyA tail are seen,
            start positions of its 5' primer, 3' primer and polyA tail,
            and primer info.
        , save non-full-length reads to 'out_nfl_reads_fn',
        , save full-length reads to 'out_fl_reads_fn', which can later be
        used in chimera detection
        , write primer info of nfl reads to _primer_report_nfl_fn.

        Note that chimera detection is not necessary for nfl reads, but
        is required for fl reads. So we only write primer info for nfl here
        and will write primer info for fl reads when chimera detection
        is done.

        best_of_front/Back: {read_id: {primer_name:DOMRecord}}
        min_seq_len: minimum length to output a read.
        min_score: minimum score to output a read.
        change_read_id: if True, change read ids to 'movie/zmw/start_end'.
        """
        logging.info("Trim bar code away from reads.")
        logging.debug("Writing full-length trimmed reads to {f}".
                      format(f=out_fl_reads_fn))
        logging.debug("Writing non-full-length trimmed reads to {f}".
                      format(f=out_nfl_reads_fn))
        logging.debug("Writing primer reports before chimera detection to {f}".
                      format(f=primer_report_nfl_fn))

        with FastaReader(reads_fn) as fareader, \
                FastaWriter(out_nfl_reads_fn) as nfl_fawriter, \
                FastaWriter(out_fl_reads_fn) as fl_fawriter, \
                open(primer_report_nfl_fn, 'w') as reporter:
            for read in fareader:
                self.summary.num_reads += 1  # number of ROI reads
                pbread = PBRead(read)
                logging.debug("Pick up best primer combo for {r}".
                              format(r=read.name))
                primerName, strand, fw, rc = self._pickBestPrimerCombo(
                    best_of_front[read.name], best_of_back[read.name],
                    primer_names, min_score)
                logging.debug("read={0}\n".format(read.name) +
                              "strand={0} fw={1} rc={2}".
                              format(strand, fw, rc))

                if (strand == '?') or (fw is None and rc is None):
                    # No primer seen in this sequence, classified
                    # as non-full-length
                    newName = pbread.name
                    if change_read_id:
                        newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                                  m=pbread.movie, z=pbread.zmw,
                                  s1=pbread.start, e1=pbread.end,
                                  isccs=("_CCS" if pbread.isCCS else ""))
                    annotation = ReadAnnotation(ID=newName, primer=primerName)
                    # Write reports of nfl reads
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
                    if len(read.sequence) >= min_seq_len:
                        # output non-full-length reads to nfl.trimmed.fasta
                        nfl_fawriter.writeRecord(annotation.toAnnotation(),
                                                 read.sequence)
                        self.summary.num_nfl += 1
                    else:
                        self.summary.num_filtered_short_reads += 1
                    continue
                seq = read.sequence if strand == "+" else revcmp(read.sequence)
                five_end, three_start = None, None
                if fw is not None:
                    five_end = fw.sEnd
                    self.summary.num_5_seen += 1
                if rc is not None:
                    three_start = len(seq) - rc.sEnd
                    self.summary.num_3_seen += 1

                s, e = pbread.start, pbread.end
                # Try to find polyA tail in read
                polyAPos = self._findPolyA(seq, three_start=three_start)
                if polyAPos >= 0 and not ignore_polyA:  # polyA found and not to ignore it
                    if not keep_primer:
                        seq = seq[:polyAPos]
                        e1 = s + polyAPos if strand == "+" else e - polyAPos
                    else:
                        e1 = e if strand == '+' else s
                    self.summary.num_polyA_seen += 1
                elif three_start is not None:  # polyA not found but 3' found
                    if not keep_primer:
                        seq = seq[:three_start]
                        e1 = s + three_start if strand == "+" else e - three_start
                    else:
                        e1 = e if strand == '+' else s
                else: # polyA not found and 3' not found
                    e1 =  e if strand == "+" else s

                if five_end is not None:
                    if not keep_primer:
                        seq = seq[five_end:]
                        s1 = s + five_end if strand == "+" else e - five_end
                    else:
                        s1 = s if strand == '+' else e
                else:
                    s1 = s if strand == "+" else e

                newName = pbread.name
                if change_read_id:
                    newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                        m=pbread.movie, z=pbread.zmw, s1=s1, e1=e1,
                        isccs=("_CCS" if pbread.isCCS else ""))
                # Create an annotation
                annotation = ReadAnnotation(ID=newName, strand=strand,
                                            fiveend=five_end, polyAend=polyAPos,
                                            threeend=three_start, primer=primerName,
                                            ignore_polyA=ignore_polyA)

                # Write reports for nfl reads
                if annotation.isFullLength is not True:
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")

                if len(seq) >= min_seq_len:
                    if annotation.isFullLength is True:
                        # Write long full-length reads
                        fl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_fl += 1
                    else:
                        # Write long non-full-length reads.
                        nfl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_nfl += 1
                else:
                    self.summary.num_filtered_short_reads += 1