Exemplo n.º 1
0
    def predict_orf(self, method='5prime-hsp', use_pfam=True, min_expect=DEFAULT_MIN_EXPECT):
        """
        Predict an ORF for this contig, anchored on its 5'-most HSP.

        The ORF is chosen by one of two methods:

        1. '5prime-most': the 5'-most beginning ORF that overlaps the
        5'-most HSP. This procedure errs on the side of too much
        protein sequence.

        2. '5prime-hsp' (default): the ORF starting at the start codon
        5' of the 5'-most HSP, i.e. the overlapping candidate with the
        latest start that is still at or before the anchor start. If
        no overlapping candidate starts at or before the anchor, the
        5'-most overlapping candidate is used instead.

        These are the core two methods for choosing an ORF in the case
        when we:

        - don't suspect missing 5'-end
        - don't suspect a frameshift

        Arguments:

        - method: '5prime-most' or '5prime-hsp' (see above).
        - use_pfam: if True, the anchor is replaced by the result of
          self.more_5prime_pfam_domain() whenever that is not None.
        - min_expect: HSPs whose 'expect' is above this value are
          ignored.

        Returns the chosen ORF, or None when no ORF could be
        predicted. In the None case self.orf_type records the reason
        ("no_relative", "inconsistent_strand",
        "none_passed_expect_thresh", "no_orf_candidates" or
        "no_overlap").

        Side effects: sets self.orf_type, self.orf_candidates and
        self.orf, and adds entries to self.annotation.

        Raises ValueError if method is not one of the two known
        values; note this is only checked once at least one ORF
        candidate overlaps the anchor. Internal consistency checks
        raise AssertionError.

        TODO: this is a huge method; in the future this should be
        refactored and maybe put in a new module.
        """
        if not self.has_relative:
            self.orf_type = ORFTypes(None, "no_relative")
            return None
        if self.inconsistent_strand(min_expect):
            self.orf_type = ORFTypes(None, "inconsistent_strand")
            return None

        # even though every function does this, we do it here to
        # return None if none pass thresholds. A real list (rather
        # than filter()) is needed so that the emptiness check also
        # works where filter() returns a lazy iterator (Python 3).
        filtered_hsps = [h for h in self.hsps if h['expect'] <= min_expect]
        if not filtered_hsps:
            self.orf_type = ORFTypes(None, "none_passed_expect_thresh")
            return None
        self.annotation["num_relatives"] = len(set(h['relative'] for h in filtered_hsps))

        ## 0. Get strand and anchor HSPs.
        strand = self.get_strand(min_expect)
        most_5prime_relative, most_5prime, most_3prime = self.get_anchor_HSPs(min_expect)
        self.annotation['most_5prime_relative'] = most_5prime_relative

        ## 1. Try to infer frame: with a majority frameshift our frame
        ## is that of the 5'-most HSP, otherwise the majority frame.
        has_majority_frameshift = self.majority_frameshift(min_expect)
        self.annotation["majority_frameshift"] = has_majority_frameshift
        if has_majority_frameshift:
            frame = most_5prime['frame']
        else:
            frame = self.majority_frame(min_expect)

        # assert our strand according to strand & frame are consistent
        numeric_strand = {"+": 1, "-": -1}[strand]
        assert(int(numeric_strand) == int(frame/abs(frame)))

        ## If the frame is negative, we must do a coordinate transform
        ## of the anchor HSPs SeqRange objects so that they are on the
        ## forward orientation (as ORF candidates would be)
        if frame < 0:
            most_5prime, most_3prime = (most_5prime.forward_coordinate_transform(),
                                        most_3prime.forward_coordinate_transform())

        # keep a reference to the HSP itself for annotation, since a
        # PFAM extension below may replace the anchor
        most_5prime_hsp = most_5prime

        ## 2. Check for PFAM frames, if necessary
        if use_pfam:
            more_5prime_pfam = self.more_5prime_pfam_domain(most_5prime, frame)
            if more_5prime_pfam is not None:
                most_5prime = more_5prime_pfam
            self.annotation["pfam_extended_5prime"] = more_5prime_pfam is not None

        ## 3. Get all ORFs
        orf_candidates = get_all_orfs(self.record, frame)
        self.orf_candidates = orf_candidates
        self.annotation["num_orf_candidates"] = len(orf_candidates)
        if len(orf_candidates) == 0:
            # why would we have no ORF candidate at all? usually there
            # should be the open-ended case. However, if a sequence's
            # first codon is a stop codon and no start codons are
            # found, there can be no ORF.
            self.orf_type = ORFTypes(None, "no_orf_candidates")
            return None

        ## 4. ORF Prediction: subset ORFs by those that overlap the
        ## 5'-most anchor
        overlapping_candidates = orf_candidates.subsetByOverlaps(most_5prime)
        if len(overlapping_candidates) == 0:
            # no candidates overlap the most 5prime HSP
            self.orf_type = ORFTypes(None, "no_overlap")
            return None

        candidate_indices = range(len(overlapping_candidates))

        def start_of(i):
            return overlapping_candidates[i].start

        ## 4.a Method-dependent ORF selection. min()/max() return the
        ## first index on ties, the same element a stable sort would
        ## put first.
        if method == '5prime-most':
            # 5'-most start; if there is no start codon this is the
            # open-ended case.
            orf_range_i = min(candidate_indices, key=start_of)
        elif method == '5prime-hsp':
            # which of the overlapping candidates have a start
            # position 5' of (or at) the anchor start?
            five_prime_of_hsp_i = [i for i in candidate_indices
                                   if start_of(i) <= most_5prime.start]
            if five_prime_of_hsp_i:
                # the latest such start is chosen
                orf_range_i = max(five_prime_of_hsp_i, key=start_of)
            else:
                # if no overlapping candidate has a start position 5'
                # of the anchor, we take the 5'-most overlapping
                # candidate and assert that its start position is 3'
                # of the anchor start.
                orf_range_i = min(candidate_indices, key=start_of)
                assert(start_of(orf_range_i) > most_5prime.start)
        else:
            raise ValueError("method must be either '5prime-most' or '5prime-hsp'")

        orf = overlapping_candidates[orf_range_i]
        self.orf = orf
        self.orf["frame"] = frame
        self.orf["most_5prime_hsp"] = most_5prime_hsp

        # check for ORF type, and annotate
        self.orf_type = ORFTypes(self.orf)

        ## 5. Internal stop codon check
        self.annotation["internal_stop"] = self.majority_internal_stop()

        ## 6. Annotate other 5' start sites.
        self.annotation["num_5prime_ATG"] = count_5prime_ATG(self.seq, frame, orf.start)

        ## 7. Annotate the distance between the ORF start and a
        ## candidate start 5' of it (ignoring open ended cases); 0 if
        ## there is no such candidate.
        ## NOTE(review): max(starts) is the *nearest* upstream start,
        ## while the annotation key suggests the 5'-most (which would
        ## be min(starts)). Behaviour kept as-is; confirm the intent.
        starts = [x.start for x in orf_candidates
                  if not x["no_start"] and orf.start > x.start]
        if not starts:
            self.annotation["diff_5prime_most_start_and_orf"] = 0
        else:
            tmp = orf.start - max(starts)
            assert(tmp > 0)
            self.annotation["diff_5prime_most_start_and_orf"] = tmp

        ## Annotate the data used in the 5'-most HSP, specifically
        ## subject and query start
        self.annotation["most_5prime_query_start"] = most_5prime_hsp.start
        self.annotation["most_5prime_sbjct_start"] = most_5prime_hsp['sbjct_start']
        return orf
Exemplo n.º 2
0
    def predict_orf_inconsistent_strand(self, method="5prime-hsp", min_expect=DEFAULT_MIN_EXPECT):
        """
        Predict both ORFs for a contig with HSPs on different
        strands. This works in cases in which there is a single frame
        on each of the two strands. A strand whose HSPs disagree on
        frame is not resolved: it gets no ORF and is tagged
        "frameshift_and_inconsistent_strands", as these are likely
        degenerate cases.

        A lot of this code is duplicated from predict_orf(). TODO:
        refactor this redundant code out and make common functionality
        generic methods.

        This function is not interfaced to the command line
        program. It's used to interrogate these strange cases. As such,
        it does not have side effects on self.orf_type or annotation.

        No PFAM support.

        Arguments:

        - method: '5prime-most' or '5prime-hsp', with the same meaning
          as in predict_orf().
        - min_expect: HSPs whose 'expect' is above this value are
          ignored.

        Returns a tuple (orfs, orf_types, annotations) of three
        two-element lists, one slot per strand. A slot in orfs is None
        when no ORF was predicted for that strand; the matching
        orf_types slot then carries the reason. The slot order follows
        iteration over a set of strands, so use
        annotations[i]["strand"] to tell which strand a slot belongs
        to.

        Raises ValueError if method is not one of the two known values
        (only checked once a strand has an overlapping ORF candidate),
        and AssertionError if the contig does not have HSPs on exactly
        two strands passing min_expect.
        """

        assert(self.inconsistent_strand(min_expect))

        # A real list (rather than filter()) is required: it is
        # measured with len() and traversed once more per strand
        # below, which a lazy iterator (Python 3 filter()) would not
        # survive.
        filtered_hsps = [h for h in self.hsps if h['expect'] <= min_expect]
        assert(len(filtered_hsps) >= 1)
        strands = set(h.strand for h in filtered_hsps)
        assert(len(strands) == 2)

        # set up data structs for ORFs on both strand; no side-effects
        # in object.
        orfs = [None, None]
        orf_types = [None, None]
        annotations = [dict(), dict()]
        for which_strand, strand in enumerate(strands):
            # collect this strand's HSPs into a SeqRanges container
            strand_hsps = SeqRanges()
            for hsp in filtered_hsps:
                if hsp.strand == strand:
                    strand_hsps.append(hsp)

            # i: HSP with the largest end, j: HSP with the smallest
            # start (first one wins on ties, as with a stable sort)
            hsp_indices = range(len(strand_hsps))
            i = max(hsp_indices, key=lambda k: strand_hsps.end[k])
            j = min(hsp_indices, key=lambda k: strand_hsps.start[k])
            annotations[which_strand]["strand"] = strand

            if strand == "-":
                # negative strand; 5'-most HSP is that with the largest
                # query end
                most_5prime_relative, most_5prime, most_3prime = AnchorHSPs(strand_hsps[i]['relative'], strand_hsps[i], strand_hsps[j])
            else:
                # positive strand; 5-most HSP is that with the smallest
                # query start
                most_5prime_relative, most_5prime, most_3prime = AnchorHSPs(strand_hsps[j]['relative'], strand_hsps[j], strand_hsps[i])

            annotations[which_strand]['most_5prime_relative'] = most_5prime_relative
            if len(set(strand_hsps.getdata("frame"))) > 1:
                # this contig has inconsistent strands and differing HSP frames *per* strand.
                orf_types[which_strand] = ORFTypes(None, "frameshift_and_inconsistent_strands")
                orfs[which_strand] = None
                continue

            frame = most_5prime['frame']

            # coordinate transform (see note at predict_orf)
            if frame < 0:
                most_5prime, most_3prime = (most_5prime.forward_coordinate_transform(),
                                            most_3prime.forward_coordinate_transform())

            orf_candidates = get_all_orfs(self.record, frame)
            annotations[which_strand]["num_orf_candidates"] = len(orf_candidates)
            if len(orf_candidates) == 0:
                orf_types[which_strand] = ORFTypes(None, "no_orf_candidates")
                orfs[which_strand] = None
                continue

            overlapping_candidates = orf_candidates.subsetByOverlaps(most_5prime)
            if len(overlapping_candidates) == 0:
                # no candidates overlap the most 5prime HSP
                orf_types[which_strand] = ORFTypes(None, "no_overlap")
                orfs[which_strand] = None
                continue

            candidate_indices = range(len(overlapping_candidates))
            # bind the container as a default so the helper does not
            # depend on the loop variable being rebound later
            def start_of(idx, candidates=overlapping_candidates):
                return candidates[idx].start

            if method == '5prime-most':
                # 5'-most start among the overlapping candidates
                orf_range_i = min(candidate_indices, key=start_of)
            elif method == '5prime-hsp':
                # candidates starting at or 5' of the anchor HSP start;
                # the latest such start is chosen
                five_prime_of_hsp_i = [idx for idx in candidate_indices
                                       if start_of(idx) <= most_5prime.start]
                if five_prime_of_hsp_i:
                    orf_range_i = max(five_prime_of_hsp_i, key=start_of)
                else:
                    # none start 5' of the anchor: fall back to the
                    # 5'-most overlapping candidate, which must then
                    # start 3' of the anchor start
                    orf_range_i = min(candidate_indices, key=start_of)
                    assert(start_of(orf_range_i) > most_5prime.start)
            else:
                raise ValueError("method must be either '5prime-most' or '5prime-hsp'")

            orf = overlapping_candidates[orf_range_i]
            orfs[which_strand] = orf
            if orf is None:
                orf_types[which_strand] = ORFTypes(None, "no_overlap")
            else:
                # check for ORF type, and annotate
                orf_types[which_strand] = ORFTypes(orf)

                annotations[which_strand]["frame"] = frame
                annotations[which_strand]["most_5prime_hsp"] = most_5prime
            assert(orf_types[which_strand] is not None)

        assert(None not in orf_types)
        return orfs, orf_types, annotations