Пример #1
0
    def pickup_best_clusters(self):
        """Split arrow-polished clusters into high- and low-quality outputs.

        Reads every FASTQ file listed in ``self.fq_filenames`` and keeps one
        record per cluster id (a later record with the same id replaces an
        earlier one).  For each cluster an expected accuracy is computed as
        ``1 - mean(phred_to_qv(x))`` over the quality values that remain after
        skipping ``self.qv_trim_5`` values at the start and
        ``self.qv_trim_3`` values at the end.  When a read is too short for
        any value to survive the trimming, the whole read is used instead; a
        read with no quality values at all gets an expected accuracy of 0.0.

        A cluster is high-quality ("good") when its expected accuracy is at
        least ``self.hq_arrow_min_accuracy`` and ``uc`` lists at least
        ``self.hq_min_full_length_reads`` entries for it.

        Side effects:
            * writes a report through ``self.write_report`` when
              ``self.report_fn`` is not None;
            * writes good clusters to ``self.arrowed_good_fa`` /
              ``self.arrowed_good_fq`` and all others to
              ``self.arrowed_bad_fa`` / ``self.arrowed_bad_fq``;
            * logs progress through ``self.add_log``.

        Raises:
            KeyError: if a polished cluster id is not a key of ``uc``.
            ValueError: if ``uc`` has integer keys and a FASTQ id does not
                reduce to ``<one char><integer>``.
        """
        self.add_log(
            "Picking up the best clusters according to QVs from {fs}.".format(
                fs=", ".join(self.fq_filenames)))

        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): `load` is presumably pickle's loader; if so, the file
        # would need mode 'rb' on Python 3 -- confirm before porting.
        with open(self.final_pickle_fn) as handle:
            uc = load(handle)['uc']

        # uc may be keyed by int (10) or by str; FASTQ ids are converted only
        # in the int case.  An empty uc is treated as "not int" rather than
        # raising IndexError.
        uc_keys_are_int = isinstance(next(iter(uc), None), int)

        polished = {}  # cid --> FastqRecord

        for fq in self.fq_filenames:
            self.add_log("Looking at arrowed fq {f}".format(f=fq))
            for r in FastqReader(fq):
                # possible ID #1: c0|arrow (a single Ice2 directory)
                # possible ID #2: b112_c0|arrow (after collecting several
                #                 Ice2 directories)
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                slash = cid.find('/')
                if slash > 0:
                    cid = cid[:slash]
                if uc_keys_are_int:
                    # Drop the one-character prefix so that e.g. "c10"
                    # becomes 10 and matches the integer keys of uc.  IDs of
                    # form #2 are left as strings (uc keys are not ints then).
                    cid = int(cid[1:])
                polished[cid] = r

        expected_acc_dict = {}  # cid --> expected accuracy (ex: 0.99)
        good = set()  # cids that are HQ; a set keeps `cid in good` O(1)

        trim_5 = self.qv_trim_5
        trim_3 = self.qv_trim_3

        # Calculate the expected accuracy given 5'/3' trimming.  For
        # sequences that are shorter than the trimming, use the whole read.
        for cid, r in polished.items():
            q = [phred_to_qv(x) for x in r.quality]
            # Use an explicit end index: q[a:-trim_3] is empty when trim_3 is
            # 0, which silently reported every cluster as 100% accurate.
            window = q[trim_5:max(len(q) - trim_3, 0)]
            if not window:
                window = q
            if window:
                # Divide by the number of values actually summed; dividing
                # the trimmed sum by the untrimmed length inflates accuracy.
                expected_acc = 1.0 - (sum(window) / float(len(window)))
            else:
                # No quality values at all: nothing supports calling it HQ.
                expected_acc = 0.0
            expected_acc_dict[cid] = expected_acc
            if expected_acc >= self.hq_arrow_min_accuracy and \
                    len(uc[cid]) >= self.hq_min_full_length_reads:
                good.add(cid)

        with open(self.nfl_all_pickle_fn) as handle:
            partial_uc = load(handle)['partial_uc']
        # Clusters without an entry in partial_uc yield an empty list.
        partial_uc2 = defaultdict(list)
        partial_uc2.update(partial_uc)

        if self.report_fn is not None:
            self.write_report(report_fn=self.report_fn,
                              uc=uc,
                              partial_uc=partial_uc2)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
            f=self.arrowed_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".format(
            f=self.arrowed_bad_fa))
        with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \
                FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \
                FastqWriter(self.arrowed_good_fq) as good_fq_writer, \
                FastqWriter(self.arrowed_bad_fq) as bad_fq_writer:
            for cid, r in polished.items():
                # Output name: c<cid>/f<#uc entries>p<#partial_uc entries>/<len>
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".format(
                    cid=cid,
                    flnc_num=len(uc[cid]),
                    nfl_num=len(partial_uc2[cid]),
                    read_len=len(r.sequence))
                newname = cid_with_annotation2(
                    newname, expected_acc=expected_acc_dict[cid])

                if cid in good:
                    self.add_log(
                        "processing arrowed cluster {c} --> good.".format(
                            c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence[:])
                    good_fq_writer.writeRecord(newname, r.sequence[:],
                                               r.quality)
                else:
                    self.add_log(
                        "processing arrowed cluster {c} --> bad.".format(
                            c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence[:])
                    bad_fq_writer.writeRecord(newname, r.sequence[:],
                                              r.quality)

        self.add_log("-" * 60, level=logging.INFO)
        self.add_log(
            "High-quality Arrowed consensus written " +
            "to:\n{0}\n{1}".format(self.arrowed_good_fa, self.arrowed_good_fq),
            level=logging.INFO)
        self.add_log(
            "Low-quality Arrowed consensus written " +
            "to:\n{0}\n{1}".format(self.arrowed_bad_fa, self.arrowed_bad_fq),
            level=logging.INFO)
        self.add_log("-" * 60, level=logging.INFO)
Пример #2
0
    def pickup_best_clusters(self, fq_filenames):
        """Split quiver-polished clusters into high- and low-quality outputs.

        Reads every FASTQ file in ``fq_filenames`` and keeps one record per
        integer cluster id (a later record with the same id replaces an
        earlier one).  For each cluster whose read is longer than the
        combined 5'/3' trimming, the expected accuracy is
        ``1 - mean(phred_to_qv(x))`` over the quality values left after
        skipping ``self.qv_trim_5`` values at the start and
        ``self.qv_trim_3`` values at the end.  A cluster is high-quality
        ("good") when that accuracy is at least
        ``self.hq_quiver_min_accuracy`` and ``uc`` lists at least
        ``self.hq_min_full_length_reads`` entries for it.  Reads with nothing
        left after trimming are never good.

        Args:
            fq_filenames: iterable of FASTQ file paths to read.

        Side effects:
            * writes a report through ``self.write_report`` when
              ``self.report_fn`` is not None;
            * writes good clusters to ``self.quivered_good_fa`` /
              ``self.quivered_good_fq`` and all others to
              ``self.quivered_bad_fa`` / ``self.quivered_bad_fq``;
            * logs progress through ``self.add_log``.

        Raises:
            KeyError: if a cluster id read from FASTQ is not a key of ``uc``.
            ValueError: if a FASTQ id does not reduce to
                ``<one char><integer>``.
        """
        self.add_log("Picking up the best clusters according to QVs from {fs}.".
                     format(fs=", ".join(fq_filenames)))

        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): `load` is presumably pickle's loader; if so, the file
        # would need mode 'rb' on Python 3 -- confirm before porting.
        with open(self.final_pickle_fn) as handle:
            uc = load(handle)['uc']

        quivered = {}  # cid (int) --> FastqRecord

        for fq in fq_filenames:
            self.add_log("Looking at quivered fq {f}".format(f=fq))
            for r in FastqReader(fq):
                # possible ID: c0/0_1611|quiver
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                slash = cid.find('/')
                if slash > 0:
                    cid = cid[:slash]
                # Drop the one-character prefix: "c0" -> 0.
                cid = int(cid[1:])
                quivered[cid] = r

        good = set()  # HQ cids; a set keeps `cid in good` O(1)

        trim_5 = self.qv_trim_5
        trim_3 = self.qv_trim_3

        for cid, r in quivered.items():
            qv_len = max(0, len(r.quality) - trim_5 - trim_3)
            if qv_len == 0:
                # Nothing left after trimming: cannot qualify as HQ.
                continue
            q = [phred_to_qv(x) for x in r.quality]
            # Use an explicit end index: q[a:-trim_3] is empty when trim_3 is
            # 0, which made every cluster look 100% accurate.
            err_sum = sum(q[trim_5:len(q) - trim_3])
            if 1.0 - (err_sum / float(qv_len)) >= self.hq_quiver_min_accuracy and \
                    len(uc[cid]) >= self.hq_min_full_length_reads:
                good.add(cid)

        with open(self.nfl_all_pickle_fn) as handle:
            partial_uc = load(handle)['partial_uc']
        # Clusters without an entry in partial_uc yield an empty list.
        partial_uc2 = defaultdict(list)
        partial_uc2.update(partial_uc)

        if self.report_fn is not None:
            self.write_report(report_fn=self.report_fn,
                              uc=uc, partial_uc=partial_uc2)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".
                     format(f=self.quivered_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".
                     format(f=self.quivered_bad_fa))
        with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
                FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
                FastqWriter(self.quivered_good_fq) as good_fq_writer, \
                FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
            for cid, r in quivered.items():
                # Output name: c<cid>/f<#uc entries>p<#partial_uc entries>/<len>
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                    format(cid=cid,
                           flnc_num=len(uc[cid]),
                           nfl_num=len(partial_uc2[cid]),
                           read_len=len(r.sequence))
                newname = cid_with_annotation(newname)

                if cid in good:
                    self.add_log("processing quivered cluster {c} --> good.".
                                 format(c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence[:])
                    good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
                else:
                    self.add_log("processing quivered cluster {c} --> bad.".
                                 format(c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence[:])
                    bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

        self.add_log("-" * 60, level=logging.INFO)
        self.add_log("High-quality Quivered consensus written " +
                     "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                            self.quivered_good_fq),
                     level=logging.INFO)
        self.add_log("Low-qulality Quivered consensus written " +
                     "to:\n{0}\n{1}".format(self.quivered_bad_fa,
                                            self.quivered_bad_fq),
                     level=logging.INFO)
        self.add_log("-" * 60, level=logging.INFO)
    def pickup_best_clusters(self, fq_filenames):
        """Split quiver-polished clusters into high- and low-quality outputs.

        Reads every FASTQ file in ``fq_filenames`` and keeps one record per
        integer cluster id (a later record with the same id replaces an
        earlier one).  For each cluster whose read is longer than the
        combined 5'/3' trimming, the expected accuracy is
        ``1 - mean(phred_to_qv(x))`` over the quality values left after
        skipping ``self.qv_trim_5`` values at the start and
        ``self.qv_trim_3`` values at the end.  A cluster is high-quality
        ("good") when that accuracy is at least
        ``self.hq_quiver_min_accuracy`` and ``uc`` lists at least
        ``self.hq_min_full_length_reads`` entries for it.  Reads with nothing
        left after trimming are never good.

        Args:
            fq_filenames: iterable of FASTQ file paths to read.

        Side effects:
            * writes a report through ``self.write_report`` when
              ``self.report_fn`` is not None;
            * writes good clusters to ``self.quivered_good_fa`` /
              ``self.quivered_good_fq`` and all others to
              ``self.quivered_bad_fa`` / ``self.quivered_bad_fq``;
            * logs progress through ``self.add_log``.

        Raises:
            KeyError: if a cluster id read from FASTQ is not a key of ``uc``.
            ValueError: if a FASTQ id does not reduce to
                ``<one char><integer>``.
        """
        self.add_log("Picking up the best clusters according to QVs from {fs}.".
                     format(fs=", ".join(fq_filenames)))

        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): `load` is presumably pickle's loader; if so, the file
        # would need mode 'rb' on Python 3 -- confirm before porting.
        with open(self.final_pickle_fn) as handle:
            uc = load(handle)['uc']

        quivered = {}  # cid (int) --> FastqRecord

        for fq in fq_filenames:
            self.add_log("Looking at quivered fq {f}".format(f=fq))
            for r in FastqReader(fq):
                # possible ID: c0/0_1611|quiver
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                slash = cid.find('/')
                if slash > 0:
                    cid = cid[:slash]
                # Drop the one-character prefix: "c0" -> 0.
                cid = int(cid[1:])
                quivered[cid] = r

        good = set()  # HQ cids; a set keeps `cid in good` O(1)

        trim_5 = self.qv_trim_5
        trim_3 = self.qv_trim_3

        for cid, r in quivered.items():
            qv_len = max(0, len(r.quality) - trim_5 - trim_3)
            if qv_len == 0:
                # Nothing left after trimming: cannot qualify as HQ.
                continue
            q = [phred_to_qv(x) for x in r.quality]
            # Use an explicit end index: q[a:-trim_3] is empty when trim_3 is
            # 0, which made every cluster look 100% accurate.
            err_sum = sum(q[trim_5:len(q) - trim_3])
            if 1.0 - (err_sum / float(qv_len)) >= self.hq_quiver_min_accuracy and \
                    len(uc[cid]) >= self.hq_min_full_length_reads:
                good.add(cid)

        with open(self.nfl_all_pickle_fn) as handle:
            partial_uc = load(handle)['partial_uc']
        # Clusters without an entry in partial_uc yield an empty list.
        partial_uc2 = defaultdict(list)
        partial_uc2.update(partial_uc)

        if self.report_fn is not None:
            self.write_report(report_fn=self.report_fn,
                              uc=uc, partial_uc=partial_uc2)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".
                     format(f=self.quivered_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".
                     format(f=self.quivered_bad_fa))
        with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
                FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
                FastqWriter(self.quivered_good_fq) as good_fq_writer, \
                FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
            for cid, r in quivered.items():
                # Output name: c<cid>/f<#uc entries>p<#partial_uc entries>/<len>
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                    format(cid=cid,
                           flnc_num=len(uc[cid]),
                           nfl_num=len(partial_uc2[cid]),
                           read_len=len(r.sequence))
                newname = cid_with_annotation(newname)

                if cid in good:
                    self.add_log("processing quivered cluster {c} --> good.".
                                 format(c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence[:])
                    good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
                else:
                    self.add_log("processing quivered cluster {c} --> bad.".
                                 format(c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence[:])
                    bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

        # The summary is logged at the same explicit level as its separator
        # lines so the separators never appear without the text between them.
        self.add_log("-" * 60, level=logging.INFO)
        self.add_log("High-quality Quivered consensus written " +
                     "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                            self.quivered_good_fq),
                     level=logging.INFO)
        self.add_log("Low-qulality Quivered consensus written " +
                     "to:\n{0}\n{1}".format(self.quivered_bad_fa,
                                            self.quivered_bad_fq),
                     level=logging.INFO)
        self.add_log("-" * 60, level=logging.INFO)
Пример #4
0
    def pickup_best_clusters(self):
        """Split arrow-polished clusters into high- and low-quality outputs.

        Reads every FASTQ file listed in ``self.fq_filenames`` and keeps one
        record per cluster id (a later record with the same id replaces an
        earlier one).  For each cluster an expected accuracy is computed as
        ``1 - mean(phred_to_qv(x))`` over the quality values that remain after
        skipping ``self.qv_trim_5`` values at the start and
        ``self.qv_trim_3`` values at the end.  When a read is too short for
        any value to survive the trimming, the whole read is used instead; a
        read with no quality values at all gets an expected accuracy of 0.0.

        A cluster is high-quality ("good") when its expected accuracy is at
        least ``self.hq_arrow_min_accuracy`` and ``uc`` lists at least
        ``self.hq_min_full_length_reads`` entries for it.

        Side effects:
            * writes a report through ``self.write_report`` when
              ``self.report_fn`` is not None;
            * writes good clusters to ``self.arrowed_good_fa`` /
              ``self.arrowed_good_fq`` and all others to
              ``self.arrowed_bad_fa`` / ``self.arrowed_bad_fq``;
            * logs progress through ``self.add_log``.

        Raises:
            KeyError: if a polished cluster id is not a key of ``uc``.
            ValueError: if ``uc`` has integer keys and a FASTQ id does not
                reduce to ``<one char><integer>``.
        """
        self.add_log("Picking up the best clusters according to QVs from {fs}.".
                     format(fs=", ".join(self.fq_filenames)))

        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): `load` is presumably pickle's loader; if so, the file
        # would need mode 'rb' on Python 3 -- confirm before porting.
        with open(self.final_pickle_fn) as handle:
            uc = load(handle)['uc']

        # uc may be keyed by int (10) or by str; FASTQ ids are converted only
        # in the int case.  An empty uc is treated as "not int" rather than
        # raising IndexError.
        uc_keys_are_int = isinstance(next(iter(uc), None), int)

        polished = {}  # cid --> FastqRecord

        for fq in self.fq_filenames:
            self.add_log("Looking at arrowed fq {f}".format(f=fq))
            for r in FastqReader(fq):
                # possible ID #1: c0|arrow (a single Ice2 directory)
                # possible ID #2: b112_c0|arrow (after collecting several
                #                 Ice2 directories)
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                slash = cid.find('/')
                if slash > 0:
                    cid = cid[:slash]
                if uc_keys_are_int:
                    # Drop the one-character prefix so that e.g. "c10"
                    # becomes 10 and matches the integer keys of uc.  IDs of
                    # form #2 are left as strings (uc keys are not ints then).
                    cid = int(cid[1:])
                polished[cid] = r

        expected_acc_dict = {}  # cid --> expected accuracy (ex: 0.99)
        good = set()  # cids that are HQ; a set keeps `cid in good` O(1)

        trim_5 = self.qv_trim_5
        trim_3 = self.qv_trim_3

        # Calculate the expected accuracy given 5'/3' trimming.  For
        # sequences that are shorter than the trimming, use the whole read.
        for cid, r in polished.items():
            q = [phred_to_qv(x) for x in r.quality]
            # Use an explicit end index: q[a:-trim_3] is empty when trim_3 is
            # 0, which silently reported every cluster as 100% accurate.
            window = q[trim_5:max(len(q) - trim_3, 0)]
            if not window:
                window = q
            if window:
                # Divide by the number of values actually summed; dividing
                # the trimmed sum by the untrimmed length inflates accuracy.
                expected_acc = 1.0 - (sum(window) / float(len(window)))
            else:
                # No quality values at all: nothing supports calling it HQ.
                expected_acc = 0.0
            expected_acc_dict[cid] = expected_acc
            if expected_acc >= self.hq_arrow_min_accuracy and \
                    len(uc[cid]) >= self.hq_min_full_length_reads:
                good.add(cid)

        with open(self.nfl_all_pickle_fn) as handle:
            partial_uc = load(handle)['partial_uc']
        # Clusters without an entry in partial_uc yield an empty list.
        partial_uc2 = defaultdict(list)
        partial_uc2.update(partial_uc)

        if self.report_fn is not None:
            self.write_report(report_fn=self.report_fn,
                              uc=uc, partial_uc=partial_uc2)

        self.add_log("Writing hiqh-quality isoforms to {f}|fq".
                     format(f=self.arrowed_good_fa))
        self.add_log("Writing low-quality isoforms to {f}|fq".
                     format(f=self.arrowed_bad_fa))
        with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \
                FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \
                FastqWriter(self.arrowed_good_fq) as good_fq_writer, \
                FastqWriter(self.arrowed_bad_fq) as bad_fq_writer:
            for cid, r in polished.items():
                # Output name: c<cid>/f<#uc entries>p<#partial_uc entries>/<len>
                newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                    format(cid=cid,
                           flnc_num=len(uc[cid]),
                           nfl_num=len(partial_uc2[cid]),
                           read_len=len(r.sequence))
                newname = cid_with_annotation2(
                    newname, expected_acc=expected_acc_dict[cid])

                if cid in good:
                    self.add_log("processing arrowed cluster {c} --> good.".
                                 format(c=cid))
                    good_fa_writer.writeRecord(newname, r.sequence[:])
                    good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
                else:
                    self.add_log("processing arrowed cluster {c} --> bad.".
                                 format(c=cid))
                    bad_fa_writer.writeRecord(newname, r.sequence[:])
                    bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

        self.add_log("-" * 60, level=logging.INFO)
        self.add_log("High-quality Arrowed consensus written " +
                     "to:\n{0}\n{1}".format(self.arrowed_good_fa,
                                            self.arrowed_good_fq),
                     level=logging.INFO)
        self.add_log("Low-quality Arrowed consensus written " +
                     "to:\n{0}\n{1}".format(self.arrowed_bad_fa,
                                            self.arrowed_bad_fq),
                     level=logging.INFO)
        self.add_log("-" * 60, level=logging.INFO)