import collections
import logging
import operator

# Interval/cluster trees come from bx-python (the project may instead vendor
# these modules; adjust the import paths if so).
from bx.intervals.cluster import ClusterTree
from bx.intervals.intersection import Interval, IntervalTree

# Project-local helpers (GTFFeature, BEDFeature, Gene, parse_gtf, GTFAttr,
# strand constants, graph/trim utilities, Category, Match, MatchStats, etc.)
# are assumed to be imported from elsewhere in this package.


def read_reference_gtf(ref_gtf_file):
    gene_map = {}
    for f in GTFFeature.parse(open(ref_gtf_file)):
        # get gene by id
        gene_id = f.attrs["gene_id"]
        if gene_id not in gene_map:
            g = Gene()
            g.gene_id = gene_id
            g.chrom = f.seqid
            g.strand = f.strand
            g.gene_start = f.start
            g.gene_end = f.end
            gene_map[gene_id] = g
        else:
            g = gene_map[gene_id]
        # update gene bounds
        g.gene_start = min(g.gene_start, f.start)
        g.gene_end = max(g.gene_end, f.end)
        if f.feature_type == "exon":
            g.exons.add((f.start, f.end))
        elif f.feature_type == "CDS":
            g.is_coding = True
        if "gene_name" in f.attrs:
            g.gene_names.add(f.attrs["gene_name"])
        g.annotation_sources.add(f.source)
    logging.info("Sorting genes")
    genes = sorted(gene_map.values(),
                   key=operator.attrgetter('chrom', 'gene_start'))
    del gene_map
    # cluster genes into loci
    logging.debug("Building interval index")
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for i, g in enumerate(genes):
        locus_cluster_trees[g.chrom].insert(g.gene_start, g.gene_end, i)
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # cluster gene exons and add to interval tree
            exon_tree = IntervalTree()
            for i in indexes:
                g = genes[i]
                # use a separately named ClusterTree so the locus-level
                # 'cluster_tree' loop variable is not clobbered
                exon_cluster_tree = ClusterTree(0, 1)
                for start, end in g.exons:
                    exon_cluster_tree.insert(start, end, 1)
                # replace the exon set with merged exon clusters
                exon_clusters = []
                for start, end, _ in exon_cluster_tree.getregions():
                    exon_clusters.append((start, end))
                g.exons = exon_clusters
                del exon_cluster_tree
                for start, end in g.exons:
                    exon_tree.insert_interval(Interval(start, end, value=g))
            # add to locus interval tree
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=exon_tree))
    logging.debug("Done indexing reference GTF file")
    return locus_trees
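
# Usage sketch (not part of the original module): query the nested index
# returned by read_reference_gtf(). The outer IntervalTree stores loci; each
# hit's .value is an exon-level IntervalTree whose hits carry Gene objects.
# The file name and coordinates below are hypothetical.
def _example_reference_lookup():
    locus_trees = read_reference_gtf("ref.gtf")
    for locus_hit in locus_trees["chr1"].find(1000000, 1001000):
        exon_tree = locus_hit.value
        for exon_hit in exon_tree.find(1000000, 1001000):
            g = exon_hit.value
            print("%s %s:%d-%d" % (g.gene_id, g.chrom,
                                   g.gene_start, g.gene_end))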
def build_interval_tree_from_bed(bed_file):
    trees = collections.defaultdict(lambda: IntervalTree())
    for f in BEDFeature.parse(open(bed_file)):
        tree = trees[f.chrom]
        for start, end in f.exons:
            tree.insert_interval(Interval(start, end, strand=f.strand,
                                          value=f.name))
    return trees
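
# Usage sketch (assumed inputs): look up named BED features overlapping a
# region. Each Interval's .value holds the feature name and .strand its
# strand. "features.bed" and the query coordinates are hypothetical.
def _example_bed_lookup():
    trees = build_interval_tree_from_bed("features.bed")
    for hit in trees["chr1"].find(5000, 6000):
        print("%d-%d(%s) %s" % (hit.start, hit.end, hit.strand, hit.value))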
def build_locus_trees(gtf_file):
    transcripts = []
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            is_ref = bool(int(t.attrs[GTFAttr.REF]))
            if not is_ref:
                continue
            i = len(transcripts)
            transcripts.append(t)
            locus_cluster_trees[t.chrom].insert(t.start, t.end, i)
    # build interval trees of loci
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # gather the locus transcripts and insert the locus interval
            # once (not once per transcript)
            locus_transcripts = [transcripts[i] for i in indexes]
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=locus_transcripts))
    return locus_trees
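
# Usage sketch: find reference loci overlapping a query interval. Each locus
# hit's .value is the list of reference transcripts clustered into that
# locus. The chromosome and coordinates are hypothetical.
def _example_locus_lookup(gtf_file):
    locus_trees = build_locus_trees(gtf_file)
    for hit in locus_trees["chr1"].find(200000, 210000):
        for ref in hit.value:
            print(ref.attrs[GTFAttr.TRANSCRIPT_ID])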
def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    # get 'chains' of contiguous non-intron nodes with in/out degree
    # of one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # setup dictionaries of predecessors and successors
    successor_dict = {}
    for n, nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    G.reverse(copy=False)
    for n, nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # setup intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes
            if left.end == right.start:
                continue
            # calculate score of the chains
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if strand == NEG_STRAND:
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1) and
            (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic node
            # chains are trimmed more strictly due to intronic pre-mrna
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(max_pred_score,
                                                              max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(G, nodes,
                                                        cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes),
                       len(G)))
    return all_trim_nodes
def annotate_locus(transcripts, gtf_sample_attr):
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: ([], []))
    node_score_dict = collections.defaultdict(lambda: [0.0, 0.0])
    all_introns = set()
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # add transcript to intron and graph data structures
    inp_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n][t.strand].append(t)
            # add to introns
            for start, end in t.iterintrons():
                ref_intron_dict[(t.strand, start, end)].append(t)
                all_introns.add((t.strand, start, end))
        else:
            if t.strand != NO_STRAND:
                score = float(t.attrs[GTFAttr.SCORE])
                for n in split_exons(t, boundaries):
                    node_score_dict[n][t.strand] += score
            inp_transcripts.append(t)
            # add to introns
            for start, end in t.iterintrons():
                all_introns.add((t.strand, start, end))
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for strand, start, end in all_introns:
        intron_tree.insert_interval(Interval(start, end, strand=strand))
    del all_introns
    # categorize transcripts
    strand_transcript_lists = [[], [], []]
    for t in inp_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = set(t.iterintrons())
        # try to resolve strand
        strand = t.strand
        if strand == NO_STRAND:
            strand = resolve_strand(nodes, node_score_dict, ref_node_dict)
        # define opposite strand
        if strand == NO_STRAND:
            opp_strand = NO_STRAND
        else:
            opp_strand = (strand + 1) % 2
        # get all reference transcripts that share introns
        intron_ref_dict = {}
        for start, end in introns:
            if (strand, start, end) in ref_intron_dict:
                refs = ref_intron_dict[(strand, start, end)]
                intron_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref) for ref in refs)
        intron_refs = []
        for ref in intron_ref_dict.itervalues():
            intron_refs.append((ref, list(split_exons(ref, boundaries))))
        # get all reference transcripts that share coverage
        same_strand_ref_dict = {}
        opp_strand_ref_dict = {}
        for n in nodes:
            if n in ref_node_dict:
                strand_refs = ref_node_dict[n]
                same_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[strand])
                opp_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[opp_strand])
        same_strand_refs = []
        for ref in same_strand_ref_dict.itervalues():
            same_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        opp_strand_refs = []
        for ref in opp_strand_ref_dict.itervalues():
            opp_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        # categorize
        cinf = categorize_transcript(t, nodes, introns, intron_refs,
                                     same_strand_refs, opp_strand_refs,
                                     intron_tree, ignore_test=False)
        if cinf.is_test:
            # recategorize test transcripts
            cinf2 = categorize_transcript(t, nodes, introns, intron_refs,
                                          same_strand_refs, opp_strand_refs,
                                          intron_tree, ignore_test=True)
            cinf = cinf._replace(category=cinf2.category)
        # add annotation attributes
        best_ref_id = (cinf.ref.attrs[GTFAttr.TRANSCRIPT_ID]
                       if cinf.ref is not None else 'na')
        t.attrs[GTFAttr.CATEGORY] = cinf.category
        t.attrs[GTFAttr.TEST] = '1' if cinf.is_test else '0'
        t.attrs[GTFAttr.ANN_REF_ID] = best_ref_id
        t.attrs[GTFAttr.ANN_COV_RATIO] = cinf.ann_cov_ratio
        t.attrs[GTFAttr.ANN_INTRON_RATIO] = cinf.ann_intron_ratio
        # group transcripts by strand
        strand_transcript_lists[strand].append(t)
    # explicitly delete large data structures
    del ref_intron_dict
    del ref_node_dict
    del node_score_dict
    del intron_tree
    del inp_transcripts
    # annotate score and recurrence for transcripts
    for strand_transcripts in strand_transcript_lists:
        # find the intron domains of the transcripts
        boundaries = find_exon_boundaries(strand_transcripts)
        # gather node score/recurrence data
        new_data_func = lambda: {'ids': set(), 'score': 0.0, 'pct': 0.0}
        node_data = collections.defaultdict(new_data_func)
        for t in strand_transcripts:
            sample_id = t.attrs[gtf_sample_attr]
            score = float(t.attrs[GTFAttr.SCORE])
            pctrank = float(t.attrs[GTFAttr.PCTRANK])
            # split exons that cross boundaries to get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                nd = node_data[n]
                nd['ids'].add(sample_id)
                nd['score'] += score
                nd['pct'] += pctrank
        # calculate recurrence and score statistics
        for t in strand_transcripts:
            nodes = list(split_exons(t, boundaries))
            mean_score, mean_pctrank, mean_recur = \
                compute_recurrence_and_score(nodes, node_data)
            t.attrs[GTFAttr.MEAN_SCORE] = mean_score
            t.attrs[GTFAttr.MEAN_PCTRANK] = mean_pctrank
            t.attrs[GTFAttr.MEAN_RECURRENCE] = mean_recur
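
# Usage sketch: annotate_locus() mutates transcript attributes in place, so
# a driver loop only needs to feed it one locus at a time. The sample-id
# attribute name ("sample_id") is an assumption for illustration.
def _example_annotate(gtf_file):
    for locus_transcripts in parse_gtf(open(gtf_file)):
        annotate_locus(locus_transcripts, gtf_sample_attr="sample_id")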
def compare_locus(transcripts):
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: [])
    ref_splicing_patterns = collections.defaultdict(lambda: [])
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = []
        for start, end in t.iterintrons():
            introns.append((t.strand, start, end))
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(lambda: Match())
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches
        if len(splicing_pattern) > 0:
            if splicing_pattern in ref_splicing_patterns:
                for ref in ref_splicing_patterns[splicing_pattern]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage
            same_strand_bp = sum((n[1] - n[0])
                                 for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum((n[1] - n[0])
                                for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif opp_strand_bp > 0:
                c = Category.OPP_STRAND
            else:
                # count nodes of different types
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if num_intronic == len(nodes):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving: some nodes intronic, others intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
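
# Usage sketch: compare_locus() is a generator yielding one
# (test_transcript, [MatchStats, ...]) pair per non-reference transcript.
# Only fields set above (category, shared_introns) are printed here.
def _example_compare(gtf_file):
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t, match_stats in compare_locus(locus_transcripts):
            for ms in match_stats:
                print("%s %s shared_introns=%d" %
                      (t.attrs[GTFAttr.TRANSCRIPT_ID], ms.category,
                       ms.shared_introns))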