def build_gene_model(align_db, clusters, big_cluster,
                     find_max, min_transcript_len=0,
                     max_isoforms=1e6,
                     ):
    '''Build and print out gene models.

    Intron clusters that are reachable from one another in big_cluster
    are merged into one exon graph. Every path through that graph from a
    source exon to a sink exon is a candidate isoform, and each isoform
    that passes check_criteria() is written out with print_bed().

    Arguments:
    align_db -- alignment database; exon_db[exon].get_size() and
                intron_db[intron].edges() are read from it.
    clusters -- mapping from a cluster id to a graph whose nodes are
                introns.
    big_cluster -- graph whose nodes are cluster ids (keys of clusters).
    find_max -- if true, always report every maximal isoform.
    min_transcript_len -- isoforms whose summed exon size is not greater
                          than this value are excluded.
    max_isoforms -- only used when find_max is false: a gene with more
                    maximal isoforms than this is reported through
                    get_min_isoforms.get_min_paths() instead.

    Returns a tuple (number of genes, number of isoforms printed,
    number of isoforms excluded).

    '''
    visited_clusters = set()
    two_exon_trns = set()
    transcripts_num = 0
    gene_id = 0
    excluded = 0

    def check_criteria(transcript, two_exon_trns):
        '''Return True if a transcript passes the criteria, else False.

        A transcript fails when its total exon size does not exceed
        min_transcript_len, or when it has exactly two exons and the
        same exon pair has been accepted before (the pair is remembered
        in two_exon_trns the first time it is accepted).

        '''
        transcript_length = sum(align_db.exon_db[e].get_size()
                                for e in transcript)
        if transcript_length <= min_transcript_len:
            return False
        if len(transcript) != 2:
            return True

        trns = ','.join(transcript)
        if trns in two_exon_trns:
            return False
        two_exon_trns.add(trns)
        return True

    for cl in big_cluster.nodes():
        if cl not in visited_clusters:
            # Merge the introns of this cluster and of every cluster
            # reachable from it into a single exon graph.
            g = nx.DiGraph()
            for intron in clusters[cl].nodes():
                g.add_edges_from(align_db.intron_db[intron].edges())
            visited_clusters.add(cl)

            for neighbor in nx.dfs_tree(big_cluster, cl):
                for intron in clusters[neighbor].nodes():
                    g.add_edges_from(align_db.intron_db[intron].edges())
                visited_clusters.add(neighbor)

            if g.nodes():
                trans_id = 0
                gene_id += 1
                collapse_exon(g, align_db)

                # Iterate over a snapshot because edges are added inside
                # the loop, and test degrees instead of the truthiness of
                # predecessors()/successors(): those return always-true
                # iterators in networkx >= 2.0.
                for node in list(g.nodes()):
                    if g.in_degree(node) == 0:
                        g.add_edge('Start', node)
                    if g.out_degree(node) == 0:
                        g.add_edge(node, 'End')

                max_paths = list(nx.all_simple_paths(g, 'Start', 'End'))

                if not find_max and len(max_paths) > max_isoforms:
                    # Too many maximal isoforms: report the minimal set.
                    # These paths are used as returned, without removing
                    # any leading/trailing node.
                    candidates = get_min_isoforms.get_min_paths(g, False)
                else:
                    # Report all maximal isoforms, without the artificial
                    # 'Start' and 'End' nodes.
                    candidates = [path[1:-1] for path in max_paths]

                for transcript in candidates:
                    if check_criteria(transcript, two_exon_trns):
                        transcripts_num += 1
                        trans_id += 1
                        print_bed(align_db, transcript, gene_id, trans_id)
                    else:
                        excluded += 1

                # Do not count a gene for which nothing was printed.
                if trans_id == 0:
                    gene_id -= 1

        # Progress line: '\r' plus the trailing comma (no newline) makes
        # each report overwrite the previous one on a terminal.
        # NOTE(review): the original indentation was lost; confirm this
        # statement belongs at loop level.
        print >> stderr, '\r |--Multi-exon\t\t%d genes, %d isoforms ' % \
            (gene_id, transcripts_num),

    return gene_id, transcripts_num, excluded
def build_gene_model(genome, align_db, clusters, big_cluster,
                     find_max, min_transcript_len=0,
                     max_isoforms=1e6,
                     ):
    '''Build and print out gene models.

    Intron clusters that are reachable from one another in big_cluster
    are merged into one exon graph, which split_strand.split() then
    divides into sub-graphs. Each non-empty sub-graph is one gene: every
    path from a source exon to a sink exon is a candidate isoform, and
    each isoform that passes check_criteria() is written out with
    print_bed() together with the sub-graph's strand.

    Arguments:
    genome -- passed through unchanged to split_strand.split().
    align_db -- alignment database; exon_db[exon].get_size() and
                intron_db[intron].edges() are read from it.
    clusters -- mapping from a cluster id to a graph whose nodes are
                introns.
    big_cluster -- graph whose nodes are cluster ids (keys of clusters).
    find_max -- if true, always report every maximal isoform.
    min_transcript_len -- isoforms whose summed exon size is not greater
                          than this value are excluded.
    max_isoforms -- only used when find_max is false: a gene with more
                    maximal isoforms than this is reported through
                    get_min_isoforms.get_min_paths() instead.

    Returns a tuple (number of genes, number of isoforms printed,
    number of isoforms excluded).

    '''
    visited_clusters = set()
    two_exon_trns = set()
    transcripts_num = 0
    gene_id = 0
    excluded = 0

    def check_criteria(transcript, two_exon_trns):
        '''Return True if a transcript passes the criteria, else False.

        A transcript fails when its total exon size does not exceed
        min_transcript_len, or when it has exactly two exons and the
        same exon pair has been accepted before (the pair is remembered
        in two_exon_trns the first time it is accepted).

        '''
        transcript_length = sum(align_db.exon_db[e].get_size()
                                for e in transcript)
        if transcript_length <= min_transcript_len:
            return False
        if len(transcript) != 2:
            return True

        trns = ','.join(transcript)
        if trns in two_exon_trns:
            return False
        two_exon_trns.add(trns)
        return True

    def exon_to_exonobj(exon):
        '''Return an ExonObj built from a "chrom:start-end" string.'''
        # Split on the last colon only, so that a chromosome name which
        # itself contains ':' still parses.
        chrom, coord = exon.rsplit(':', 1)
        start, end = coord.split('-')
        return ExonObj(chrom, int(start), int(end))

    for cl in big_cluster.nodes():
        if cl not in visited_clusters:
            # Merge the introns of this cluster and of every cluster
            # reachable from it into a single exon graph.
            merged = nx.DiGraph()
            for intron in clusters[cl].nodes():
                merged.add_edges_from(align_db.intron_db[intron].edges())
            visited_clusters.add(cl)

            for neighbor in nx.dfs_tree(big_cluster, cl):
                for intron in clusters[neighbor].nodes():
                    merged.add_edges_from(
                        align_db.intron_db[intron].edges())
                visited_clusters.add(neighbor)

            collapse_exon(merged, align_db)

            for g in split_strand.split(merged, genome):
                if g.nodes():
                    # Rebuild an alignment database restricted to this
                    # sub-graph and collapse its exons against it.
                    subalign_db = AlignmentDB()
                    for edge in g.edges():
                        exon1 = exon_to_exonobj(edge[0])
                        exon2 = exon_to_exonobj(edge[1])
                        add_exon(subalign_db, [exon1, exon2])
                    collapse_exon(g, subalign_db)

                    trans_id = 0
                    gene_id += 1
                    strand = g.graph['strand']

                    # Iterate over a snapshot because edges are added
                    # inside the loop, and test degrees instead of the
                    # truthiness of predecessors()/successors(): those
                    # return always-true iterators in networkx >= 2.0.
                    for node in list(g.nodes()):
                        if g.in_degree(node) == 0:
                            g.add_edge('Start', node)
                        if g.out_degree(node) == 0:
                            g.add_edge(node, 'End')

                    max_paths = list(
                        nx.all_simple_paths(g, 'Start', 'End'))

                    if not find_max and len(max_paths) > max_isoforms:
                        # Too many maximal isoforms: report the minimal
                        # set. These paths are used as returned, without
                        # removing any leading/trailing node.
                        candidates = get_min_isoforms.get_min_paths(
                            g, False)
                    else:
                        # Report all maximal isoforms, without the
                        # artificial 'Start' and 'End' nodes.
                        candidates = [path[1:-1] for path in max_paths]

                    # NOTE(review): sizes and BED output are taken from
                    # align_db, not subalign_db -- confirm intended.
                    # NOTE(review): gene_id is kept even when no isoform
                    # of this gene passes the criteria -- confirm.
                    for transcript in candidates:
                        if check_criteria(transcript, two_exon_trns):
                            transcripts_num += 1
                            trans_id += 1
                            print_bed(align_db, transcript, strand,
                                      gene_id, trans_id)
                        else:
                            excluded += 1

        # Progress line: '\r' plus the trailing comma (no newline) makes
        # each report overwrite the previous one on a terminal.
        # NOTE(review): the original indentation was lost; confirm this
        # statement belongs at loop level.
        print >> stderr, '\r |--Multi-exon\t\t%d genes, %d isoforms ' % \
            (gene_id, transcripts_num),

    return gene_id, transcripts_num, excluded