예제 #1
0
파일: gimme.py 프로젝트: dib-lab/gimme
def build_gene_model(align_db,
                     clusters,
                     big_cluster,
                     find_max,
                     min_transcript_len=0,
                     max_isoforms=1e6,
                     ):
    '''Build and print out gene models.

    Every not-yet-visited node of big_cluster seeds one gene: the exon
    graphs of all introns in that cluster and in every cluster reachable
    from it (via nx.dfs_tree) are merged into one directed graph, the
    graph is passed to collapse_exon(), and every path from a synthetic
    'Start' node to a synthetic 'End' node is treated as a candidate
    transcript.  Accepted transcripts are emitted with print_bed().

    Parameters:
        align_db -- object providing exon_db (indexable by exon, values
            have get_size()) and intron_db (indexable by intron, values
            have edges()).
        clusters -- indexable by the nodes of big_cluster; each value has
            nodes() yielding keys of align_db.intron_db.
        big_cluster -- graph whose nodes are keys of clusters.
        find_max -- if true, always report every Start-to-End path.
        min_transcript_len -- transcripts whose summed exon size is less
            than or equal to this value are rejected.
        max_isoforms -- when find_max is false and a gene has more than
            this many Start-to-End paths, the paths returned by
            get_min_isoforms.get_min_paths() are reported instead.

    Returns a tuple (number of genes, number of printed transcripts,
    number of rejected transcripts).

    '''

    visited_clusters = set()
    transcripts_num = 0
    gene_id = 0
    excluded = 0
    two_exon_trns = set()  # two-exon transcripts already reported

    def check_criteria(transcript, two_exon_trns):
        '''Return True if a transcript passes the criteria, else False.

        A transcript fails when its total exon size does not exceed
        min_transcript_len, or when it has exactly two exons and the
        same exon pair has been accepted before.  Accepted two-exon
        transcripts are recorded in two_exon_trns as a side effect.

        '''
        transcript_length = sum(align_db.exon_db[e].get_size()
                                for e in transcript)

        if transcript_length <= min_transcript_len:
            return False  # fail
        if len(transcript) != 2:
            return True  # pass

        trns = ','.join(transcript)
        if trns in two_exon_trns:
            return False  # fail
        two_exon_trns.add(trns)
        return True  # pass

    for cl in big_cluster.nodes():
        if cl not in visited_clusters:
            g = nx.DiGraph()
            for intron in clusters[cl].nodes():
                g.add_edges_from(align_db.intron_db[intron].edges())

            visited_clusters.add(cl)

            for neighbor in nx.dfs_tree(big_cluster, cl):
                for intron in clusters[neighbor].nodes():
                    g.add_edges_from(align_db.intron_db[intron].edges())

                visited_clusters.add(neighbor)

            if g.number_of_nodes():
                trans_id = 0
                gene_id += 1
                collapse_exon(g, align_db)

                # Iterate over a snapshot because edges (and the
                # 'Start'/'End' nodes) are added inside the loop.  Degree
                # checks are used instead of the truthiness of
                # predecessors()/successors(), which return always-true
                # iterators on networkx >= 2.0.
                for node in list(g.nodes()):
                    if g.in_degree(node) == 0:
                        g.add_edge('Start', node)
                    if g.out_degree(node) == 0:
                        g.add_edge(node, 'End')

                max_paths = list(nx.all_simple_paths(g, 'Start', 'End'))

                if not find_max and len(max_paths) > max_isoforms:
                    # Too many isoforms: report minimal isoforms.  These
                    # paths are used as returned (no Start/End trimming).
                    transcripts = get_min_isoforms.get_min_paths(g, False)
                else:
                    # Report all maximum isoforms, dropping the synthetic
                    # 'Start' and 'End' nodes from each path.
                    transcripts = (path[1:-1] for path in max_paths)

                for transcript in transcripts:
                    if check_criteria(transcript, two_exon_trns):
                        transcripts_num += 1
                        trans_id += 1
                        print_bed(align_db, transcript, gene_id, trans_id)
                    else:
                        excluded += 1

                # A gene without any accepted transcript does not count.
                # This check must stay inside the non-empty-graph branch:
                # otherwise trans_id is unbound or left over from the
                # previous gene.
                if trans_id == 0:
                    gene_id -= 1

        # Progress line, rewritten in place on stderr for every cluster.
        print >> stderr, '\r  |--Multi-exon\t\t%d genes, %d isoforms ' % \
                                            (gene_id, transcripts_num),

    return gene_id, transcripts_num, excluded
예제 #2
0
파일: gimme.py 프로젝트: likit/gimme
def build_gene_model(genome,
                     align_db,
                     clusters,
                     big_cluster,
                     find_max,
                     min_transcript_len=0,
                     max_isoforms=1e6,
                     ):
    '''Build and print out gene models.

    Every not-yet-visited node of big_cluster seeds one locus: the exon
    graphs of all introns in that cluster and in every cluster reachable
    from it (via nx.dfs_tree) are merged into one directed graph, which
    is passed to collapse_exon() and then to split_strand.split().  Each
    non-empty sub-graph yielded by the split becomes one gene; every
    path from a synthetic 'Start' node to a synthetic 'End' node is a
    candidate transcript.  Accepted transcripts are emitted with
    print_bed(), together with the sub-graph's 'strand' attribute.

    Parameters:
        genome -- passed through unchanged to split_strand.split().
        align_db -- object providing exon_db (indexable by exon, values
            have get_size()) and intron_db (indexable by intron, values
            have edges()).
        clusters -- indexable by the nodes of big_cluster; each value has
            nodes() yielding keys of align_db.intron_db.
        big_cluster -- graph whose nodes are keys of clusters.
        find_max -- if true, always report every Start-to-End path.
        min_transcript_len -- transcripts whose summed exon size is less
            than or equal to this value are rejected.
        max_isoforms -- when find_max is false and a gene has more than
            this many Start-to-End paths, the paths returned by
            get_min_isoforms.get_min_paths() are reported instead.

    Returns a tuple (number of genes, number of printed transcripts,
    number of rejected transcripts).

    '''

    visited_clusters = set()
    transcripts_num = 0
    gene_id = 0
    excluded = 0
    two_exon_trns = set()  # two-exon transcripts already reported

    def check_criteria(transcript, two_exon_trns):
        '''Return True if a transcript passes the criteria, else False.

        A transcript fails when its total exon size does not exceed
        min_transcript_len, or when it has exactly two exons and the
        same exon pair has been accepted before.  Accepted two-exon
        transcripts are recorded in two_exon_trns as a side effect.

        '''
        transcript_length = sum(align_db.exon_db[e].get_size()
                                for e in transcript)

        if transcript_length <= min_transcript_len:
            return False  # fail
        if len(transcript) != 2:
            return True  # pass

        trns = ','.join(transcript)
        if trns in two_exon_trns:
            return False  # fail
        two_exon_trns.add(trns)
        return True  # pass

    def exon_to_exonobj(exon):
        '''Return an ExonObj built from a "chrom:start-end" string.'''
        chrom, coord = exon.split(':')
        start, end = coord.split('-')
        return ExonObj(chrom, int(start), int(end))

    for cl in big_cluster.nodes():
        if cl not in visited_clusters:
            g = nx.DiGraph()
            for intron in clusters[cl].nodes():
                g.add_edges_from(align_db.intron_db[intron].edges())

            visited_clusters.add(cl)

            for neighbor in nx.dfs_tree(big_cluster, cl):
                for intron in clusters[neighbor].nodes():
                    g.add_edges_from(align_db.intron_db[intron].edges())

                visited_clusters.add(neighbor)

            collapse_exon(g, align_db)

            # A distinct loop name keeps the merged graph `g` from being
            # shadowed by the sub-graphs it is split into.
            for subgraph in split_strand.split(g, genome):
                if not subgraph.number_of_nodes():
                    continue

                # Rebuild an alignment database restricted to the exons
                # of this sub-graph and collapse the sub-graph against it.
                # NOTE(review): check_criteria() and print_bed() below
                # still read from align_db, not subalign_db -- confirm
                # that exons produced by this second collapse are always
                # present in align_db.
                subalign_db = AlignmentDB()
                for edge in subgraph.edges():
                    exon1 = exon_to_exonobj(edge[0])
                    exon2 = exon_to_exonobj(edge[1])
                    add_exon(subalign_db, [exon1, exon2])
                collapse_exon(subgraph, subalign_db)

                trans_id = 0
                # NOTE(review): gene_id is not rolled back when every
                # transcript of this sub-graph is rejected, so the
                # returned gene count includes such genes -- confirm
                # this is intended.
                gene_id += 1
                strand = subgraph.graph['strand']

                # Iterate over a snapshot because edges (and the
                # 'Start'/'End' nodes) are added inside the loop.  Degree
                # checks are used instead of the truthiness of
                # predecessors()/successors(), which return always-true
                # iterators on networkx >= 2.0.
                for node in list(subgraph.nodes()):
                    if subgraph.in_degree(node) == 0:
                        subgraph.add_edge('Start', node)
                    if subgraph.out_degree(node) == 0:
                        subgraph.add_edge(node, 'End')

                max_paths = list(
                    nx.all_simple_paths(subgraph, 'Start', 'End'))

                if not find_max and len(max_paths) > max_isoforms:
                    # Too many isoforms: report minimal isoforms.  These
                    # paths are used as returned (no Start/End trimming).
                    transcripts = get_min_isoforms.get_min_paths(subgraph,
                                                                 False)
                else:
                    # Report all maximum isoforms, dropping the synthetic
                    # 'Start' and 'End' nodes from each path.
                    transcripts = (path[1:-1] for path in max_paths)

                for transcript in transcripts:
                    if check_criteria(transcript, two_exon_trns):
                        transcripts_num += 1
                        trans_id += 1
                        print_bed(align_db,
                                  transcript,
                                  strand,
                                  gene_id,
                                  trans_id)
                    else:
                        excluded += 1

        # Progress line, rewritten in place on stderr for every cluster.
        print >> stderr, '\r  |--Multi-exon\t\t%d genes, %d isoforms ' % \
                                            (gene_id, transcripts_num),

    return gene_id, transcripts_num, excluded