def localize_feature(self, coord, txt):
    """Name the transcript feature that a single coordinate falls in.

    Introns are tested first, then exons; a coordinate matching neither is
    reported as UTR.  Features are numbered in exon-list order when
    ``txt.strand`` is '+', and in reverse list order otherwise.  A coordinate
    in the first or last two bases of an intron is additionally tagged with
    the splice site, e.g. ``intron2(splice-donor)``.

    Args:
        coord: Position to classify; must be accepted by ``int()`` and by
            the ``subsume`` helper.
        txt: Transcript object providing ``exons`` (sequence of
            ``[start, end]`` pairs), ``strand``, ``cdsStart`` and ``cdsEnd``.

    Returns:
        One of 'intronN', 'intronN(splice-donor)', 'intronN(splice-acceptor)',
        'exonN', '5utr' or '3utr'.
    """
    exons = txt.exons
    total = len(exons)

    def holds(span):
        # True when `span` contains the one-base interval at `coord`.
        return subsume([coord, coord], span)

    # Introns take precedence over exons.
    for idx in range(total - 1):
        first = int(exons[idx][1]) + 1
        last = int(exons[idx + 1][0]) - 1
        if not holds([first, last]):
            continue

        forward = txt.strand == '+'
        number = idx + 1 if forward else total - idx - 1
        label = 'intron' + str(number)

        # The two bases at either end of the intron are the splice sites;
        # which end is donor and which is acceptor flips with the strand.
        site = None
        if holds([first, first + 1]):
            site = 'splice-donor' if forward else 'splice-acceptor'
        elif holds([last - 1, last]):
            site = 'splice-acceptor' if forward else 'splice-donor'
        if site:
            label += "(%s)" % (site)
        return label

    # Then exons.
    for idx, exon in enumerate(exons):
        if holds(exon):
            number = idx + 1 if txt.strand == '+' else total - idx
            return 'exon' + str(number)

    # Neither intron nor exon: classify as UTR relative to the CDS bounds.
    position = int(coord)
    before_cds_on_plus = position < int(txt.cdsStart) and txt.strand == '+'
    if before_cds_on_plus or (position > int(txt.cdsEnd) and txt.strand == '-'):
        return '5utr'
    return '3utr'
def construct_cdna(self, coord, txt, refseq, variant=None, change=None, exons=None):
    """Build a transcript's coding sequence, optionally altered by a splicing event.

    Walks the exons that overlap the CDS, fetches the coding part of each from
    ``refseq`` and concatenates them.  When ``change`` is given, the exon
    whose 1-based number equals ``exons[0]`` (or, for 'skipped_exon', every
    exon listed in ``exons``) is modified first.

    Args:
        coord: Indexable event location.  ``coord[0]`` is passed to
            ``refseq.GetSequence`` as its first argument (presumably the
            chromosome -- confirm against callers); ``coord[1]`` and
            ``coord[2]`` are used as integer positions.
        txt: Transcript providing ``exons``, ``cdsStart``, ``cdsEnd`` and
            ``coding_type()``.
        refseq: Object providing ``GetSequence(target, start, end)``.
        variant: Sequence appended for 'novel_exon' or substituted for
            'novel_utr'.
        change: Event type, case-insensitive: 'retained_intron', 'novel_exon',
            'skipped_exon', 'novel_intron', 'as5', 'as3', 'as53' or
            'novel_utr'.  Other values leave the sequence unmodified.
        exons: 1-based exon numbers the event applies to; required whenever
            ``change`` is one of the recognised types.

    Returns:
        The concatenated sequence, or "" when the transcript has no exons or
        ``txt.coding_type()`` is not 'CODING'.
    """
    cdna = ""
    num_exons = len(txt.exons)
    # coding_type() does not depend on the exon, so decide once up front
    # instead of once per exon.
    if not num_exons or txt.coding_type() != 'CODING':
        return cdna

    kind = change.lower() if change else None

    for i in range(num_exons):
        exon_span = txt.exons[i]
        if not overlap(exon_span, [txt.cdsStart, txt.cdsEnd]):
            continue

        # Clip the exon to the CDS.  cdsStart is advanced by one while cdsEnd
        # is used as-is (assumes a 0-based start / 1-based end -- TODO confirm).
        if subsume([txt.cdsStart, txt.cdsStart], exon_span):
            start = int(txt.cdsStart) + 1
        else:
            start = int(exon_span[0])
        if subsume([txt.cdsEnd, txt.cdsEnd], exon_span):
            end = int(txt.cdsEnd)
        else:
            end = int(exon_span[1])
        exon = refseq.GetSequence(coord[0], start, end)

        number = i + 1
        if kind == 'retained_intron' and number == exons[0]:
            # Append the intron that follows this exon.  The next exon start
            # is coerced with int() like every other coordinate here.
            next_start = int(txt.exons[i + 1][0])
            exon += refseq.GetSequence(coord[0], end + 1, next_start - 1)
        elif kind == 'novel_exon' and number == exons[0]:
            exon += variant
        elif kind == 'skipped_exon' and number in exons:
            exon = ''
        elif kind == 'novel_intron' and number == exons[0]:
            # Cut coord[1]..coord[2] (inclusive) out of the exon.
            first = int(coord[1]) - start
            last = int(coord[2]) - start
            exon = exon[:first] + exon[last + 1:]
        elif kind in ('as5', 'as3', 'as53') and number == exons[0]:
            # Alternative splice site: the event coordinates replace the exon.
            exon = refseq.GetSequence(coord[0], int(coord[1]), int(coord[2]))
        elif kind == 'novel_utr' and number == exons[0]:
            exon = variant

        cdna += exon
    return cdna
def find_overlaps(test, repeat_overlaps):
    """Report which repeat annotations a region hits, grouped by repeat track.

    For 'simple_repeats' and 'segdup' tracks a repeat counts only when
    ``subsume`` accepts the region/repeat pair; for 'rmsk' an ``overlap`` is
    enough.  Any other track is still queried but always yields an empty
    entry.

    Args:
        test: Mapping with 'chrom', 'start' and 'end' keys describing the
            region to test.
        repeat_overlaps: Mapping of track name to an object whose
            ``overlap(chrom, start, end, parse_line=...)`` returns an iterable
            of mappings with 'start', 'end' and 'type' keys.

    Returns:
        ``{track name: {repeat type: True}}`` with one entry per track in
        ``repeat_overlaps``.
    """
    overlaps = {}
    # .items() rather than the Python-2-only .iteritems(), so this runs on
    # both Python 2 and Python 3.
    for repeat_type, repeat_overlap in repeat_overlaps.items():
        hits = overlaps[repeat_type] = {}
        repeats = repeat_overlap.overlap(test['chrom'], test['start'], test['end'], parse_line=parse_line)

        # Choose the acceptance test for this track.
        if repeat_type in ('simple_repeats', 'segdup'):
            matches = subsume
        elif repeat_type == 'rmsk':
            matches = overlap
        else:
            continue

        for repeat in repeats:
            if matches([test['start'], test['end']], [repeat['start'], repeat['end']]):
                hits[repeat['type']] = True
    return overlaps
def construct_cdna(self, coord, txt, refseq, variant=None, change=None):
    """Build a transcript's coding sequence with a sequence variant applied.

    Walks the exons that overlap the CDS, fetches the coding part of each from
    ``refseq``, applies the variant to the affected exon(s) and concatenates
    the results.

    Args:
        coord: Indexable variant location.  ``coord[0]`` is passed to
            ``refseq.GetSequence`` as its first argument (presumably the
            chromosome -- confirm against callers); ``coord[1]`` and
            ``coord[2]`` are the first and last affected positions.
        txt: Transcript providing ``exons``, ``cdsStart`` and ``cdsEnd``.
        refseq: Object providing ``GetSequence(target, start, end)``.
        variant: Replacement sequence for 'snv', inserted sequence for
            'ins'/'dup'/'ITD'/'PTD'; unused for 'del'.
        change: Variant type, case-insensitive: 'snv', 'ins', 'dup', 'ITD',
            'PTD' or 'del'.  Other values leave the sequence unmodified.

    Returns:
        The concatenated (and possibly modified) coding sequence.
    """
    # Lower-case once; every comparison below uses lower-case literals.
    # 'ITD'/'PTD' used to be compared in upper case against the lower-cased
    # value and therefore never matched.
    kind = change.lower() if change else None

    cdna = ""
    # True while walking exons that lie inside a deletion whose start has
    # been seen but whose end has not.
    remove = False

    for i in range(len(txt.exons)):
        if not overlap(txt.exons[i], [txt.cdsStart, txt.cdsEnd]):
            continue

        # Reference sequence of the coding part of this exon.  cdsStart is
        # advanced by one while cdsEnd is used as-is (assumes a 0-based
        # start / 1-based end -- TODO confirm).
        if subsume([txt.cdsStart, txt.cdsStart], txt.exons[i]):
            start = int(txt.cdsStart) + 1
        else:
            start = int(txt.exons[i][0])
        if subsume([txt.cdsEnd, txt.cdsEnd], txt.exons[i]):
            end = int(txt.cdsEnd)
        else:
            end = int(txt.exons[i][1])
        exon = refseq.GetSequence(coord[0], start, end)
        coding = [start, end]

        if kind == 'snv' and subsume(coord[1:], coding):
            first = int(coord[1]) - start
            last = int(coord[2]) - start
            exon = exon[:first] + variant + exon[last + 1:]

        elif kind in ('ins', 'dup', 'itd', 'ptd') and subsume(coord[1:], coding):
            # Insert immediately after the base at coord[1].
            insert_at = int(coord[1]) - start + 1
            exon = exon[:insert_at] + variant + exon[insert_at:]

        elif kind == 'del':
            preceding_intron = None
            if i > 0:
                # Coerce before subtracting; the original subtracted inside
                # int(), which fails for string coordinates.
                preceding_intron = [int(txt.exons[i - 1][1]) + 1, int(txt.exons[i][0]) - 1]

            if subsume(coord[1:], coding):
                # Deletion lies wholly inside this exon.
                first = int(coord[1]) - start
                last = int(coord[2]) - start
                exon = exon[:first] + exon[last + 1:]
            elif subsume([coord[1], coord[1]], coding):
                # Deletion starts here and runs past the exon end.
                exon = exon[:int(coord[1]) - start]
                remove = True
            elif subsume([coord[2], coord[2]], coding):
                # Deletion ends here, having started upstream.
                exon = exon[int(coord[2]) - start + 1:]
                remove = False
            elif preceding_intron and (
                    subsume([coord[1], coord[1]], preceding_intron)
                    or subsume([coord[2], coord[2]], preceding_intron)):
                # A deletion end point lies in the intron just before this
                # exon: toggle removal, dropping this exon when it turns on.
                # NOTE(review): if subsume(a, b) means "a within b", a
                # deletion lying entirely inside one intron turns removal on
                # and nothing turns it off, so all later exons are dropped.
                # Confirm that this is intended.
                if remove:
                    remove = False
                else:
                    remove = True
                    exon = ''
            elif remove:
                # Exon lies entirely inside the deleted region.
                exon = ''

        cdna += exon
    return cdna
def is_read_through(self, txts, mm):
    """Re-label this event as a read-through if it joins two adjacent genes.

    Each transcript in ``txts`` is considered as a second partner.  A partner
    qualifies when all of the following hold:

    * it has the same strand and model as ``self.txt`` but a different name
      and alias;
    * the aligned span overlaps it, while ``self.txt`` and the last matched
      block do not;
    * ``mm.match_exons`` matches every alignment block, with exon boundaries
      flush (a single boundary is enough for the last block);
    * no other transcript lies in the gap between the two transcripts;
    * both aliases are non-empty ``str`` objects that
      ``Transcript.same_family`` reports as unrelated.

    For every qualifying partner ``self.event_type`` is set to 'read-through'
    and ``self.txt2`` to that partner; the scan does not stop early, so a
    later qualifying partner overrides an earlier one.

    Args:
        txts: Candidate transcripts.
        mm: Matcher providing ``match_exons(...)``.

    Returns:
        None; the result is recorded on ``self``.
    """
    last_block, _last_exon = self.last_matched()

    for txt2 in txts:
        # The partner must be a different gene on the same strand and model.
        if txt2.strand != self.txt.strand or txt2.model != self.txt.model:
            continue
        if txt2.name == self.txt.name or txt2.alias == self.txt.alias:
            continue

        # The alignment must reach into the partner, while the original
        # transcript and the last matched block must not.
        aligned_span = [self.align_coords[0][0], self.align_coords[-1][1]]
        if not overlap(aligned_span, [txt2.txStart, txt2.txEnd]) or \
                overlap([self.txt.txStart, self.txt.txEnd], [txt2.txStart, txt2.txEnd]):
            continue
        if overlap(last_block, [txt2.txStart, txt2.txEnd]):
            continue

        result = mm.match_exons(self.contig, txt2.full_name(), self.align_coords,
                                txt2.exons, txt2.chrom, strand=txt2.strand)
        if not result or len(result.matched_blocks) != len(self.align_blocks):
            continue

        partner_downstream = self.txt.txStart < txt2.txStart

        # Check exon boundaries for every matched block (no early exit).
        final_pos = len(self.align_blocks) - 1
        bounds_ok = True
        for pos in range(len(result.matched_blocks)):
            block = self.align_coords[result.matched_blocks[pos] - 1]
            exon = txt2.exons[result.matched_exons[pos] - 1]
            if pos == final_pos:
                # Last block: only the edge facing the original transcript
                # has to be flush.
                edge = 0 if partner_downstream else 1
                if block[edge] != exon[edge]:
                    bounds_ok = False
            elif not (block[0] == exon[0] and block[1] == exon[1]):
                # Any other block: both edges have to be flush.
                bounds_ok = False
        if not bounds_ok:
            continue

        # Genomic gap between the two transcripts.
        if partner_downstream:
            gap = [int(self.txt.txEnd) + 1, int(txt2.txStart) - 1]
        else:
            gap = [int(txt2.txEnd) + 1, int(self.txt.txStart) - 1]

        # No third transcript may sit inside that gap.
        blocked = any(
            subsume([t.txStart, t.txEnd], gap)
            for t in txts
            if t.name != self.txt.name and t.name != txt2.name
        )
        if blocked:
            continue

        if not (self.txt.alias and txt2.alias
                and type(self.txt.alias) is str and type(txt2.alias) is str):
            continue
        if not Transcript.same_family(self.txt.alias, txt2.alias):
            self.event_type = 'read-through'
            self.txt2 = txt2
def set_novelty(self, txts, matches=None):
    """Decide whether this event is novel relative to the given transcripts.

    Dispatches on ``self.event_type`` and records the verdict in
    ``self.novel``:

    * 'novel_exon', 'AS53', 'novel_utr': every alignment block that
      ``subsume`` reports against an annotated exon (for 'novel_utr', also a
      block sharing either edge with an exon) is deleted from
      ``self.align_blocks`` and ``self.align_coords``; the event is not novel
      once no block is left.
    * 'read-through' with a single aligned coordinate pair: not novel when
      some transcript has two consecutive exons whose junction equals the one
      read from ``self.align.blocks``.
    * 'retained_intron': for every intron between consecutive
      ``self.exon_coords``, annotated exons touching the intron are
      collected and subtracted; each remaining piece whose flanks coincide
      with adjacent annotated exons yields a new event dict.
    * 'skipped_exon': not novel when an annotated intron subsumes the span
      of ``self.exon_coords``.
    * 'novel_intron': not novel when an annotated intron has identical
      bounds.
    * event types containing 'AS' with ``self.edge`` 'left' / 'right': not
      novel when an annotated exon starts / ends at the aligned coordinate.

    NOTE: ``dict.has_key``, indexing ``dict.values()`` and ``list.sort`` with
    a comparison function, all used in the 'retained_intron' branch, exist
    only in Python 2.

    Args:
        txts: Annotated transcripts to compare against; each provides
            ``exons`` and ``full_name()``.
        matches: Not used by this method.

    Returns:
        List of new event dicts; only the 'retained_intron' branch adds any.
    """
    novel_events = []
    if self.event_type == "novel_exon" or self.event_type == "AS53" or self.event_type == "novel_utr":
        for txt in txts:
            # indices of alignment blocks explained by this transcript
            blocks_to_delete = []
            for b in range(len(self.align_blocks)):
                for e in range(len(txt.exons)):
                    novel = True
                    if subsume(self.align_coords[b], txt.exons[e]):
                        novel = False
                    # novel utr - requires just one edge to align
                    if novel and self.event_type == 'novel_utr':
                        if int(self.align_coords[b][0]) == int(txt.exons[e][0]) or int(self.align_coords[b][1]) == int(txt.exons[e][1]):
                            novel = False
                    if not novel:
                        blocks_to_delete.append(b)
                        break
            if blocks_to_delete:
                # delete from the highest index down so the remaining indices stay valid
                for b in sorted(blocks_to_delete, reverse=True):
                    del self.align_blocks[b]
                    del self.align_coords[b]
        # every block was explained by annotation, so nothing novel is left
        if not self.align_blocks:
            self.novel = False
    elif self.event_type == 'read-through':
        if len(self.align_coords) == 1:
            # junction = end of the block before the aligned block, start of the aligned block
            # (assumes self.align_blocks holds 1-based block numbers -- TODO confirm)
            start, end = self.align.blocks[self.align_blocks[0] - 2][1], self.align.blocks[self.align_blocks[0] - 1][0]
            # see if any single transcript contains the exon junction
            for txt in txts:
                # found_start/found_end are assigned but never read afterwards
                found_start, found_end = None, None
                for i in range(len(txt.exons) - 1):
                    if int(txt.exons[i][1]) == start and int(txt.exons[i + 1][0]) == end:
                        found_start, found_end = i, i + 1
                        self.novel = False
                        break
                if not self.novel:
                    break
    elif self.event_type == "retained_intron":
        # "multi": first and last exon numbers are more than one apart,
        # i.e. more than one intron is retained
        multi = False
        if int(self.exons[-1]) - int(self.exons[0]) > 1:
            multi = True
        self.novel = False
        for i in range(len(self.exon_coords)-1):
            retained_intron = [int(self.exon_coords[i][1])+1, int(self.exon_coords[i+1][0])-1]
            # transcript -> annotated exons of that transcript touching this intron
            middle_exons = {}
            for txt in txts:
                exons_txt = []
                for j in range(len(txt.exons)):
                    exon = txt.exons[j]
                    # terminal exon, require subsume
                    if j == 0 or j == len(txt.exons)-1:
                        if subsume(retained_intron, exon):
                            exons_txt.append(exon)
                    # middle exons, require just overlap
                    elif overlap(exon, (retained_intron[0], retained_intron[1])):
                        exons_txt.append(exon)
                if exons_txt:
                    middle_exons[txt] = exons_txt
            # only time when original event is novel WITHOUT testing is when
            # it's a single ri and it is clear of overlapping exons
            if len(middle_exons.keys()) == 0 and not multi:
                self.novel = True
            else:
                self.novel = False
            # subtract overlapping exons
            # NOTE: indexing values() only works on Python 2, where it returns a list
            if middle_exons.values() and middle_exons.values()[0]:
                true_retained_intron = subtract(retained_intron, middle_exons.values())
            else:
                true_retained_intron = [retained_intron]
            # if there is still some intron left after subtraction
            if true_retained_intron:
                # create new events
                for ri in true_retained_intron:
                    event = {'contig': self.contig, 'chrom':self.chrom, 'align_blocks':self.align_blocks, 'align_coords':self.align_coords, 'type':self.event_type, 'novel':True }
                    # determine flanking exons, and transcript by frequency of flanking coordinates
                    flanks = {}
                    for txt in txts:
                        # NOTE: this reuses the name `i` of the enclosing intron loop;
                        # the outer loop rebinds it on its next iteration
                        for i in range(len(txt.exons)-1):
                            left = txt.exons[i]
                            right = txt.exons[i+1]
                            # adjacent exons that flank the remaining intron exactly
                            if left[1]+1 == ri[0] and right[0]-1 == ri[1]:
                                coord = ",".join((str(left[0]), str(left[1]), str(right[0]), str(right[1])))
                                if not flanks.has_key(coord):
                                    flanks[coord] = [[txt,i,i+1]]
                                else:
                                    flanks[coord].append([txt,i,i+1])
                    if len(flanks.keys()) > 0:
                        # use the most frequent flanking exons (sorted by descending count)
                        flanks_sorted = flanks.keys()
                        flanks_sorted.sort(lambda x,y: len(flanks[y])-len(flanks[x]))
                        # use the originally assigned transcript if possible
                        same_txt = False
                        exon_coords = flanks_sorted[0].split(',')
                        # `exons` is assigned but never read afterwards
                        exons = []
                        for txt,e1,e2 in flanks[flanks_sorted[0]]:
                            if txt.full_name() == self.transcript:
                                same_txt = True
                                event['transcript'] = txt.full_name()
                                event['exons'] = [e1+1,e2+1]
                                event['exon_coords'] = [exon_coords[:2], exon_coords[2:]]
                                event['txt'] = txt
                                break
                        # otherwise fall back to the first transcript with these flanks
                        if not same_txt:
                            txt,e1,e2 = flanks[flanks_sorted[0]][0]
                            event['transcript'] = txt.full_name()
                            event['exons'] = [e1+1,e2+1]
                            event['exon_coords'] = [exon_coords[:2], exon_coords[2:]]
                            event['txt'] = txt
                        novel_events.append(event)
    elif self.event_type == "skipped_exon":
        for txt in txts:
            for e in range(len(txt.exons)-1):
                intron_span = [int(txt.exons[e][1])+1, int(txt.exons[e+1][0])-1]
                if self.align_coords and self.exon_coords:
                    # the skipped exon span sits inside an annotated intron
                    if subsume([int(self.exon_coords[0][0]), int(self.exon_coords[-1][1])], intron_span):
                        self.novel = False
                        break
            if not self.novel:
                break
    elif self.event_type == "novel_intron":
        # the gap between the first two aligned blocks is the candidate intron
        novel_intron_span = [int(self.align_coords[0][1])+1, int(self.align_coords[1][0])-1]
        # novel_intron_size is computed but not used below
        novel_intron_size = int(self.align_coords[1][0]) - 1 - int(self.align_coords[0][1])
        for txt in txts:
            for e in range(len(txt.exons)-1):
                intron_span = [int(txt.exons[e][1])+1, int(txt.exons[e+1][0])-1]
                if novel_intron_span[0] == intron_span[0] and novel_intron_span[1] == intron_span[1]:
                    self.novel = False
                    break
            if not self.novel:
                break
    elif 'AS' in self.event_type and self.edge == 'left':
        # not novel if an annotated exon starts at the aligned left coordinate
        for txt in txts:
            for exon in txt.exons:
                if int(self.align_coords[0]) == int(exon[0]):
                    self.novel = False
                    break
            if not self.novel:
                break
    elif 'AS' in self.event_type and self.edge == 'right':
        # not novel if an annotated exon ends at the aligned right coordinate
        for txt in txts:
            for exon in txt.exons:
                if int(self.align_coords[1]) == int(exon[1]):
                    self.novel = False
                    break
            if not self.novel:
                break
    return novel_events