def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    """Trim low-scoring nodes from the chains of a transcript graph.

    Finds 'chains' of contiguous non-intron nodes, scores the introns
    separating neighboring chains, and trims chain ends (UTR-like tips,
    bidirectional ends) and putative intron-retention chains whose node
    scores fall below fraction-based cutoffs.

    Parameters:
        G -- directed graph (networkx-style: supports adjacency_iter(),
             reverse(copy=False), and per-node NODE_SCORE attributes);
             nodes look like intervals with .start/.end attributes
        strand -- strand code; compared against NEG_STRAND to decide
             orientation
        min_trim_length -- minimum length passed through to the trim_*
             helpers (semantics defined by those helpers)
        trim_utr_fraction -- score fraction cutoff for UTR trimming
        trim_intron_fraction -- score fraction cutoff for intronic trimming

    Returns:
        set of graph nodes selected for removal (G itself is not modified
        here beyond the temporary in-place reversal, which is undone).
    """
    # get 'chains' of contiguous non-intron nodes with edge degree of
    # one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # setup dictionaries of predecessors and successors
    successor_dict = {}
    for n, nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    # reverse the graph IN PLACE to enumerate predecessors via the same
    # adjacency iterator, then restore the original orientation
    G.reverse(copy=False)
    for n, nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # setup intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            # orient the edge so 'left' genomically precedes 'right'
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes (no gap means no intron between them)
            if left.end == right.start:
                continue
            # calculate score of the chains: best node score in the chain
            # containing each endpoint
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures, keyed by the gap
            # coordinates (left.end, right.start)
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if strand == NEG_STRAND:
            # NOTE: in-place reversal so 'nodes[0]' is always the 5' end
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1) and
            (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic node
            # chains are trimmed more strictly due to intronic pre-mrna
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                # isolated chain: trim both directions
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(
                        max_pred_score, max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                # chain starts a path: its 5' end behaves like a UTR
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(
                        G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                # chain ends a path: its 3' end behaves like a UTR
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes),
                       len(G)))
    return all_trim_nodes
def compare_locus(transcripts):
    """Compare non-reference transcripts in a locus to the reference set.

    Builds node/intron/splicing-pattern indexes from the reference
    transcripts, then classifies each non-reference ("test") transcript
    against every reference it overlaps.

    Parameters:
        transcripts -- iterable of transcript objects; each must provide
            .attrs (keyed by GTFAttr.REF / GTFAttr.TRANSCRIPT_ID),
            .strand, and .iterintrons()

    Yields:
        (t, match_stats) for each test transcript, where match_stats is a
        list of MatchStats objects, one per matching reference transcript.
    """
    # store reference introns
    # (strand,start,end) -> ids (set)
    # NOTE: defaultdict(list) replaces the equivalent but wasteful
    # defaultdict(lambda: []) idiom
    ref_intron_dict = collections.defaultdict(list)
    ref_node_dict = collections.defaultdict(list)
    ref_splicing_patterns = collections.defaultdict(list)
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns (only spliced transcripts)
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = [(t.strand, start, end) for start, end in t.iterintrons()]
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(Match)
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches (empty patterns are never keys)
        if splicing_pattern and splicing_pattern in ref_splicing_patterns:
            for ref in ref_splicing_patterns[splicing_pattern]:
                ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                m = matches[ref_id]
                m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage (bp shared with the reference)
            same_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif (opp_strand_bp > 0):
                c = Category.OPP_STRAND
            else:
                # count nodes of different types; invariants below hold
                # because the exonic-overlap branches were not taken
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if (num_intronic == len(nodes)):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving means some nodes intronic and other
                    # intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    """Select low-scoring graph nodes to trim from chain ends and introns.

    Chains of contiguous non-intron nodes are examined one at a time:
    chains that exactly span an intron are candidates for intron-retention
    trimming, while chains at path boundaries are trimmed like UTRs.
    Returns the set of nodes chosen for removal.
    """
    # chains of contiguous, non-intron nodes with internal degree <= 1
    node_chain_map, chains = get_chains(G, introns=False)
    # snapshot successors, then reverse the graph in place to snapshot
    # predecessors, and restore the original edge orientation
    successor_dict = dict((n, nbrdict.keys())
                          for n, nbrdict in G.adjacency_iter())
    G.reverse(copy=False)
    predecessor_dict = dict((n, nbrdict.keys())
                            for n, nbrdict in G.adjacency_iter())
    G.reverse(copy=False)
    # collect every intron (gap between non-contiguous neighbors) along
    # with the best chain score on each side of the gap
    introns = {}
    intron_tree = IntervalTree()
    is_neg = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            # orient so 'left' genomically precedes 'right'
            left, right = (v, u) if is_neg else (u, v)
            if left.end == right.start:
                # nodes abut: no intron here
                continue
            u_score = max(G.node[n][NODE_SCORE]
                          for n in chains[node_chain_map[u]])
            v_score = max(G.node[n][NODE_SCORE]
                          for n in chains[node_chain_map[v]])
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # examine each chain and accumulate nodes to trim
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if is_neg:
            # in-place reversal so nodes[0] is always the 5' end
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        key = (parent.start, parent.end)
        if in_degree == 1 and out_degree == 1 and key in introns:
            # chain exactly matches an intron: possible intron retention,
            # so the whole chain may be removable
            pred_score, succ_score = introns[key]
            cutoff = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff))
        else:
            # look for the highest-scoring intron overlapping (but not
            # contained in) this chain; intronic chains are trimmed more
            # strictly because of intronic pre-mRNA
            found_intron = False
            max_pred = 0.0
            max_succ = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                if hit.start > parent.start and hit.end < parent.end:
                    continue  # intron fully inside the chain: ignore
                found_intron = True
                pred_score, succ_score = hit.value
                max_pred = max(max_pred, pred_score)
                max_succ = max(max_succ, succ_score)
            if in_degree == 0 and out_degree == 0:
                # isolated chain: trim from both ends
                if found_intron:
                    cutoff = trim_intron_fraction * max(max_pred, max_succ)
                    trim_nodes.update(trim_intron(G, nodes, cutoff))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                # chain begins a path: treat the 5' end as a UTR
                if found_intron:
                    cutoff = trim_intron_fraction * max_succ
                    trim_nodes.update(trim_intronic_utr(G, nodes, cutoff))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                # chain ends a path: treat the 3' end as a UTR
                if found_intron:
                    cutoff = trim_intron_fraction * max_pred
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes),
                       len(G)))
    return all_trim_nodes
def compare_locus(transcripts):
    """Classify each non-reference transcript against the references.

    For every non-reference ("test") transcript in the locus, find all
    reference transcripts it overlaps via shared exon nodes, overlapping
    introns, shared introns, or an identical splicing pattern, and yield
    (test_transcript, list_of_MatchStats).
    """
    # --- reference indexes ---------------------------------------------
    ref_introns = collections.defaultdict(list)   # (strand,start,end) -> refs
    ref_nodes = collections.defaultdict(list)     # exon node -> refs
    ref_patterns = collections.defaultdict(list)  # intron tuple -> refs
    refs_by_id = {}
    # exon boundaries across the whole locus define the node coordinates
    boundaries = find_exon_boundaries(transcripts)
    tests = []
    for tx in transcripts:
        if not bool(int(tx.attrs[GTFAttr.REF])):
            tests.append(tx)
            continue
        rid = tx.attrs[GTFAttr.TRANSCRIPT_ID]
        refs_by_id[rid] = tx
        # register every node (boundary-split exon piece) of this ref
        for node in split_exons(tx, boundaries):
            ref_nodes[node].append(tx)
        # register every intron and the full splicing pattern
        pattern = []
        for start, end in tx.iterintrons():
            ikey = (tx.strand, start, end)
            ref_introns[ikey].append(tx)
            pattern.append(ikey)
        if len(pattern) > 0:
            ref_patterns[tuple(pattern)].append(tx)
    # interval tree over reference introns for fast overlap queries
    itree = IntervalTree()
    for ikey, refs in ref_introns.iteritems():
        istrand, istart, iend = ikey
        itree.insert_interval(
            Interval(istart, iend, strand=istrand, value=refs))
    # --- categorize each test transcript -------------------------------
    for tx in tests:
        tx_nodes = list(split_exons(tx, boundaries))
        tx_introns = [(tx.strand, start, end)
                      for start, end in tx.iterintrons()]
        pattern = tuple(tx_introns)
        # reference transcript id -> Match accumulator
        matches = collections.defaultdict(Match)
        for node in tx_nodes:
            if node in ref_nodes:
                # references sharing this exact node
                for ref_tx in ref_nodes[node]:
                    cat = (Category.SAME_STRAND
                           if cmp_strand(tx.strand, ref_tx.strand)
                           else Category.OPP_STRAND)
                    rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                    matches[rid].nodes[cat].append(node)
            # references whose introns overlap this node
            for iv in itree.find(*node):
                cat = (Category.INTRONIC_SAME_STRAND
                       if cmp_strand(tx.strand, iv.strand)
                       else Category.INTRONIC_OPP_STRAND)
                for ref_tx in iv.value:
                    rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                    matches[rid].nodes[cat].append(node)
        # introns shared exactly with a reference
        for ikey in tx_introns:
            if ikey in ref_introns:
                for ref_tx in ref_introns[ikey]:
                    rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                    matches[rid].introns.append(ikey)
        # identical full splicing pattern
        if len(pattern) > 0 and pattern in ref_patterns:
            for ref_tx in ref_patterns[pattern]:
                rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                matches[rid].splicing = True
        # decide a category for every (test, ref) pair
        match_stats = []
        for rid, match in matches.iteritems():
            ref_tx = refs_by_id[rid]
            # base pairs shared on each strand
            same_bp = sum(node[1] - node[0]
                          for node in match.nodes[Category.SAME_STRAND])
            opp_bp = sum(node[1] - node[0]
                         for node in match.nodes[Category.OPP_STRAND])
            n_shared_introns = len(match.introns)
            if match.splicing or n_shared_introns > 0 or same_bp > 0:
                cat = Category.SAME_STRAND
            elif opp_bp > 0:
                cat = Category.OPP_STRAND
            else:
                # no exonic overlap: only intronic overlap can remain
                n_int_same = len(match.nodes[Category.INTRONIC_SAME_STRAND])
                n_int_opp = len(match.nodes[Category.INTRONIC_OPP_STRAND])
                assert len(match.nodes[Category.SAME_STRAND]) == 0
                assert len(match.nodes[Category.OPP_STRAND]) == 0
                n_intronic = n_int_same + n_int_opp
                assert n_intronic > 0
                if n_intronic == len(tx_nodes):
                    # every node of the test transcript lies inside introns
                    cat = (Category.INTRONIC_SAME_STRAND if n_int_same > 0
                           else Category.INTRONIC_OPP_STRAND)
                else:
                    # interleaving: some nodes intronic, others intergenic
                    cat = (Category.INTERLEAVING_SAME_STRAND
                           if n_int_same > 0
                           else Category.INTERLEAVING_OPP_STRAND)
            stats = MatchStats.from_transcript(tx, ref_tx)
            stats.shared_same_strand_bp = same_bp
            stats.shared_opp_strand_bp = opp_bp
            stats.shared_introns = n_shared_introns
            stats.shared_splicing = match.splicing
            stats.category = Category.to_str(cat)
            stats.distance = 0
            match_stats.append(stats)
        yield (tx, match_stats)