def predict_orf(self, method='5prime-hsp', use_pfam=True, min_expect=DEFAULT_MIN_EXPECT):
    """
    Predict the ORF of this contig, anchored on its 5'-most HSP.

    One of two selection methods is used:

    1. '5prime-most': the 5'-most beginning ORF that overlaps the
       5'-most HSP. This procedure errs on the side of too much
       protein sequence.

    2. '5prime-hsp': the ORF starting at the start codon 5' of the
       5'-most HSP, i.e. the overlapping candidate with the largest
       start that is still <= the anchor's start. If no overlapping
       candidate starts at or 5' of the anchor, the 5'-most
       overlapping candidate is used instead.

    These are the core two methods for choosing an ORF in the case
    when we:
    - don't suspect missing 5'-end
    - don't suspect a frameshift

    Parameters:
    - method: '5prime-most' or '5prime-hsp'.
    - use_pfam: if true, `self.more_5prime_pfam_domain()` is consulted
      and, when it returns something, that result replaces the 5'-most
      HSP as the anchor used for overlap/selection.
    - min_expect: HSPs with 'expect' above this value are ignored.

    Returns the chosen ORF candidate, or None when no prediction is
    made. When None is returned, `self.orf_type` is set to
    `ORFTypes(None, reason)` with reason one of "no_relative",
    "inconsistent_strand", "none_passed_expect_thresh",
    "no_orf_candidates" or "no_overlap".

    Side effects: fills in `self.annotation`, and sets
    `self.orf_candidates`, `self.orf` and `self.orf_type`.

    Raises ValueError if `method` is not recognized (only checked once
    overlapping candidates have been found). Internal consistency
    checks are `assert`s and raise AssertionError.

    TODO: this is a huge method; in the future this should be
    refactored and maybe put in a new module.
    """
    if not self.has_relative:
        self.orf_type = ORFTypes(None, "no_relative")
        return None
    if self.inconsistent_strand(min_expect):
        self.orf_type = ORFTypes(None, "inconsistent_strand")
        return None

    # even though every function does this, we do it here to
    # return None if none pass thresholds. A list (not a bare
    # filter() object) is needed because we take its length.
    filtered_hsps = [h for h in self.hsps if h['expect'] <= min_expect]
    if not filtered_hsps:
        self.orf_type = ORFTypes(None, "none_passed_expect_thresh")
        return None

    self.annotation["num_relatives"] = len(set(s['relative'] for s in filtered_hsps))

    ## 0. Get strand and anchor HSPs.
    strand = self.get_strand(min_expect)
    most_5prime_relative, most_5prime, most_3prime = self.get_anchor_HSPs(min_expect)
    self.annotation['most_5prime_relative'] = most_5prime_relative

    ## 1. Try to infer frame
    ## 1.a Look for frameshift
    has_majority_frameshift = self.majority_frameshift(min_expect)
    self.annotation["majority_frameshift"] = has_majority_frameshift
    if has_majority_frameshift:
        # Our frame is that of the 5'-most HSP
        frame = most_5prime['frame']
    else:
        ## 1.d Finally, infer frame in the vanilla case
        frame = self.majority_frame(min_expect)

    # assert our strand according to strand & frame are consistent
    numeric_strand = {"+": 1, "-": -1}[strand]
    assert int(numeric_strand) == int(frame / abs(frame))

    ## If the frame is negative, we must do a coordinate transform
    ## of the anchor HSPs SeqRange objects so that they are on the
    ## forward orientation (as ORF candidates would be)
    if frame < 0:
        most_5prime, most_3prime = (most_5prime.forward_coordinate_transform(),
                                    most_3prime.forward_coordinate_transform())

    # reference for annotation, in case of PFAM extension
    most_5prime_hsp = most_5prime

    ## Check for PFAM frames, if necessary
    if use_pfam:
        more_5prime_pfam = self.more_5prime_pfam_domain(most_5prime, frame)
        if more_5prime_pfam is not None:
            most_5prime = more_5prime_pfam
        self.annotation["pfam_extended_5prime"] = more_5prime_pfam is not None

    ## 4. Get all ORFs
    orf_candidates = get_all_orfs(self.record, frame)
    self.orf_candidates = orf_candidates
    self.annotation["num_orf_candidates"] = len(orf_candidates)
    if len(orf_candidates) == 0:
        # why would we have no ORF candidate at all? usually there
        # should be the open-ended case. However, if a sequence's
        # first codon is a stop codon and no start codons are
        # found, there can be no ORF.
        self.orf_type = ORFTypes(None, "no_orf_candidates")
        return None

    ## 6. ORF Prediction: subset ORFs by those that overlap the
    ## 5'-most HSP
    overlapping_candidates = orf_candidates.subsetByOverlaps(most_5prime)
    num_overlapping = len(overlapping_candidates)
    if not num_overlapping:
        # no candidates overlap the most 5prime HSP
        self.orf_type = ORFTypes(None, "no_overlap")
        return None

    def candidate_start(i):
        return overlapping_candidates[i].start

    ## 6.a Method-dependent ORF selection. min()/max() return the
    ## first extreme element, which matches taking element 0 of a
    ## (stable) sort on the same key.
    if method == '5prime-most':
        # Method (a): 5'-most start codon. If there is none, we
        # take the open-ended case.
        orf_range_i = min(range(num_overlapping), key=candidate_start)
    elif method == '5prime-hsp':
        # which of the overlapping candidates have a start
        # position 5' of the most 5' HSP?
        five_prime_of_hsp_i = [i for i in range(num_overlapping)
                               if candidate_start(i) <= most_5prime.start]
        if five_prime_of_hsp_i:
            # the latest-starting of these candidates is chosen
            orf_range_i = max(five_prime_of_hsp_i, key=candidate_start)
        else:
            # if no ORF candidates that overlap a 5' HSP have a
            # start position 5' of the anchor HSP, we take the
            # 5'-most overlapping ORF candidate and assert that
            # its start position is 3' of the 5' HSP start.
            orf_range_i = min(range(num_overlapping), key=candidate_start)
            assert candidate_start(orf_range_i) > most_5prime.start
    else:
        raise ValueError("method must be either '5prime-most' or '5prime-hsp'")

    orf = overlapping_candidates[orf_range_i]
    self.orf = orf
    self.orf["frame"] = frame
    self.orf["most_5prime_hsp"] = most_5prime_hsp

    # check for ORF type, and annotate
    self.orf_type = ORFTypes(self.orf)

    ## 6. Internal stop codon check
    self.annotation["internal_stop"] = self.majority_internal_stop()

    ## 7. Annotate other 5' start sites.
    self.annotation["num_5prime_ATG"] = count_5prime_ATG(self.seq, frame, orf.start)

    ## 8. Annotate the distance between the chosen ORF's start and
    ## an upstream ORF candidate start (ignoring open ended cases).
    ## NOTE(review): the annotation key speaks of the 5'-most start,
    ## but max() selects the upstream start *closest* to the ORF;
    ## confirm which is intended before changing.
    starts = [x.start for x in orf_candidates
              if not x["no_start"] and orf.start > x.start]
    if not starts:
        self.annotation["diff_5prime_most_start_and_orf"] = 0
    else:
        start_diff = orf.start - max(starts)
        assert start_diff > 0
        self.annotation["diff_5prime_most_start_and_orf"] = start_diff

    ## Annotate the data used in the 5'-most HSP, specifically
    ## subject and query start
    self.annotation["most_5prime_query_start"] = most_5prime_hsp.start
    self.annotation["most_5prime_sbjct_start"] = most_5prime_hsp['sbjct_start']
    return orf
def predict_orf_inconsistent_strand(self, method="5prime-hsp", min_expect=DEFAULT_MIN_EXPECT):
    """
    Predict both ORFs for a contig with HSPs on different strands.

    The HSPs passing `min_expect` are split by strand and an ORF is
    predicted independently for each strand, anchored on that
    strand's 5'-most HSP. A strand whose HSPs have more than one
    distinct frame is not handled and is reported as
    "frameshift_and_inconsistent_strands".

    A lot of this code is duplicated from predict_orf(). TODO:
    refactor this redundant code out and make common functionality
    generic methods.

    This function is not interfaced to the command line program. It's
    used to interrogate these strange cases. As such, it does not
    assign to self.orf_type or self.annotation. No PFAM support.

    Parameters:
    - method: '5prime-most' or '5prime-hsp' (same meaning as in
      predict_orf()).
    - min_expect: HSPs with 'expect' above this value are ignored.

    Returns a tuple `(orfs, orf_types, annotations)` of three
    two-element lists, one entry per strand. Strands are processed in
    sorted order, and each annotations dict records its strand under
    the "strand" key. An `orfs` entry is None when no ORF was chosen
    for that strand; the matching `orf_types` entry then carries the
    reason ("frameshift_and_inconsistent_strands",
    "no_orf_candidates" or "no_overlap").

    Raises ValueError if `method` is not recognized (only checked once
    overlapping candidates have been found), and AssertionError if the
    contig does not have filtered HSPs on exactly two strands.
    """
    assert self.inconsistent_strand(min_expect)
    # A list (not a bare filter() object) is required: we take its
    # length and iterate it once per strand.
    filtered_hsps = [h for h in self.hsps if h['expect'] <= min_expect]
    assert len(filtered_hsps) >= 1
    strands = set(h.strand for h in filtered_hsps)
    assert len(strands) == 2

    # set up data structs for ORFs on both strand; no side-effects
    # in object.
    orfs = [None, None]
    orf_types = [None, None]
    annotations = [dict(), dict()]

    # sorted() makes the index -> strand mapping of the returned
    # lists independent of set iteration order.
    for which_strand, strand in enumerate(sorted(strands)):
        strand_hsps = SeqRanges()
        for hsp in filtered_hsps:
            if hsp.strand == strand:
                strand_hsps.append(hsp)

        # i: HSP with the largest query end; j: HSP with the
        # smallest query start (first one wins on ties).
        hsp_indices = range(len(strand_hsps))
        i = max(hsp_indices, key=lambda k: strand_hsps.end[k])
        j = min(hsp_indices, key=lambda k: strand_hsps.start[k])

        annotations[which_strand]["strand"] = strand
        if strand == "-":
            # negative strand; 5'-most HSP is that with the largest
            # query end
            most_5prime_relative, most_5prime, most_3prime = AnchorHSPs(
                strand_hsps[i]['relative'], strand_hsps[i], strand_hsps[j])
        else:
            # positive strand; 5'-most HSP is that with the smallest
            # query start
            most_5prime_relative, most_5prime, most_3prime = AnchorHSPs(
                strand_hsps[j]['relative'], strand_hsps[j], strand_hsps[i])
        annotations[which_strand]['most_5prime_relative'] = most_5prime_relative

        if len(set(strand_hsps.getdata("frame"))) > 1:
            # this contig has inconsistent strands and differing HSP
            # frames *per* strand.
            orf_types[which_strand] = ORFTypes(None, "frameshift_and_inconsistent_strands")
            orfs[which_strand] = None
            continue

        frame = most_5prime['frame']

        # coordinate transform (see note at predict_orf)
        if frame < 0:
            most_5prime, most_3prime = (most_5prime.forward_coordinate_transform(),
                                        most_3prime.forward_coordinate_transform())

        # reference kept for the annotation
        most_5prime_hsp = most_5prime

        orf_candidates = get_all_orfs(self.record, frame)
        annotations[which_strand]["num_orf_candidates"] = len(orf_candidates)
        if len(orf_candidates) == 0:
            orf_types[which_strand] = ORFTypes(None, "no_orf_candidates")
            orfs[which_strand] = None
            continue

        overlapping_candidates = orf_candidates.subsetByOverlaps(most_5prime)
        num_overlapping = len(overlapping_candidates)
        if not num_overlapping:
            # no candidates overlap the most 5prime HSP
            orf_types[which_strand] = ORFTypes(None, "no_overlap")
            orfs[which_strand] = None
            continue

        # Called only within this iteration, so reading the loop's
        # current overlapping_candidates is safe.
        def candidate_start(k):
            return overlapping_candidates[k].start

        # min()/max() return the first extreme element, which matches
        # taking element 0 of a (stable) sort on the same key.
        if method == '5prime-most':
            orf_range_i = min(range(num_overlapping), key=candidate_start)
        elif method == '5prime-hsp':
            # candidates starting at or 5' of the anchor HSP
            five_prime_of_hsp_i = [k for k in range(num_overlapping)
                                   if candidate_start(k) <= most_5prime.start]
            if five_prime_of_hsp_i:
                # the latest-starting of these candidates is chosen
                orf_range_i = max(five_prime_of_hsp_i, key=candidate_start)
            else:
                # fall back to the 5'-most overlapping candidate,
                # which must then start 3' of the anchor HSP start
                orf_range_i = min(range(num_overlapping), key=candidate_start)
                assert candidate_start(orf_range_i) > most_5prime.start
        else:
            raise ValueError("method must be either '5prime-most' or '5prime-hsp'")

        orf = overlapping_candidates[orf_range_i]
        orfs[which_strand] = orf
        if orf is None:
            orf_types[which_strand] = ORFTypes(None, "no_overlap")
        else:
            # check for ORF type, and annotate
            orf_types[which_strand] = ORFTypes(orf)
            annotations[which_strand]["frame"] = frame
            annotations[which_strand]["most_5prime_hsp"] = most_5prime_hsp
        assert orf_types[which_strand] is not None

    assert None not in orf_types
    return orfs, orf_types, annotations