def prune_transcript_graph(G, strand, transcript_map,
                           min_trim_length=0,
                           trim_utr_fraction=0.0,
                           trim_intron_fraction=0.0):
    '''
    Trim and collapse a strand-specific graph, then yield one
    (subgraph, strand, path_scores) tuple per weakly connected component.

    G: graph to prune; the nodes returned by trim_graph() are removed
    from it in place

    strand: strand of the graph; paths are reversed when it equals
    NEG_STRAND

    transcript_map: mapping from transcript id to an object exposing a
    'score' attribute

    min_trim_length: forwarded unchanged to trim_graph()

    trim_utr_fraction: float specifying the fraction of the average UTR
    coverage below which the ends of the UTR will be trimmed

    trim_intron_fraction: float specifying the fraction of the average
    intron coverage below which intronic nodes will be removed

    path_scores is the items() of a dict keyed by node tuple whose value
    is the summed score of every transcript sharing that exact path.
    '''
    # drop the UTR / retained-intron nodes selected by trim_graph
    G.remove_nodes_from(trim_graph(G, strand, min_trim_length,
                                   trim_utr_fraction,
                                   trim_intron_fraction))
    # merge consecutive nodes into single collapsed nodes
    collapsed = collapse_strand_specific_graph(G, transcript_map,
                                               introns=True)
    # each weakly connected component is treated as an independent gene
    for component in nx.weakly_connected_component_subgraphs(collapsed):
        # accumulate transcript scores per distinct node path
        path_scores = collections.defaultdict(int)
        node_paths = get_transcript_node_map(component)
        for transcript_id, path in node_paths.iteritems():
            # negative strand transcripts are reported in reverse order
            if strand == NEG_STRAND:
                path.reverse()
            score = transcript_map[transcript_id].score
            path_scores[tuple(path)] += score
        yield component, strand, path_scores.items()
# Example no. 2
0
def prune_transcript_graph(G,
                           strand,
                           transcript_map,
                           min_trim_length=0,
                           trim_utr_fraction=0.0,
                           trim_intron_fraction=0.0):
    '''
    Prune a strand-specific graph and yield, for every weakly connected
    component of the collapsed result, a tuple of
    (component subgraph, strand, path/score pairs).

    G: graph to prune; trimmed nodes are removed from it in place

    strand: strand of the graph; node paths are reversed when it equals
    NEG_STRAND

    transcript_map: mapping from transcript id to an object with a
    'score' attribute

    min_trim_length: passed through to trim_graph() as-is

    trim_utr_fraction: float specifying the fraction of the average UTR
    coverage below which the ends of the UTR will be trimmed

    trim_intron_fraction: float specifying the fraction of the average
    intron coverage below which intronic nodes will be removed

    The path/score pairs are the items() of a dict mapping each node
    tuple to the sum of the scores of the transcripts that follow it.
    '''
    # trim utrs and intron retentions (modifies G)
    nodes_to_drop = trim_graph(G, strand, min_trim_length,
                               trim_utr_fraction, trim_intron_fraction)
    G.remove_nodes_from(nodes_to_drop)
    # collapse runs of consecutive nodes
    collapsed_graph = collapse_strand_specific_graph(G, transcript_map,
                                                     introns=True)
    # unconnected components are considered different genes
    components = nx.weakly_connected_component_subgraphs(collapsed_graph)
    for gene_graph in components:
        score_by_path = collections.defaultdict(int)
        paths_by_transcript = get_transcript_node_map(gene_graph)
        for t_id, node_path in paths_by_transcript.iteritems():
            if strand == NEG_STRAND:
                # report negative strand paths in reverse node order
                node_path.reverse()
            transcript = transcript_map[t_id]
            score_by_path[tuple(node_path)] += transcript.score
        yield gene_graph, strand, score_by_path.items()
# Example no. 3
0
def create_transcript_graphs(chrom, transcripts,
                             min_trim_length=0,
                             trim_utr_fraction=0.0,
                             trim_intron_fraction=0.0,
                             create_bedgraph=False,
                             bedgraph_filehs=None):
    '''
    Build trimmed, collapsed transcript graphs for every strand and
    return them as a list of TranscriptGraph objects, one per weakly
    connected component.

    chrom: chromosome name; written as the first bedgraph column and
    passed to each TranscriptGraph

    transcripts: transcripts to partition by strand via
    partition_transcripts_by_strand()

    min_trim_length, trim_utr_fraction, trim_intron_fraction: forwarded
    unchanged to trim_graph()

    create_bedgraph: when true, one tab-delimited line
    (chrom, start, end, node score) is written per graph node with a
    non-negative start, before any trimming takes place

    bedgraph_filehs: writable file handles indexed by strand; only
    accessed when create_bedgraph is true

    Each returned TranscriptGraph has its 'partial_paths' attribute set
    to a list of (node tuple, summed transcript score) pairs.
    '''
    def get_bedgraph_lines(chrom, G):
        for n in sorted(G.nodes()):
            # nodes with a negative start are not written out
            if n.start < 0:
                continue
            yield (chrom, n.start, n.end, G.node[n][NODE_SCORE])

    # partition transcripts by strand and resolve unstranded transcripts
    logging.debug("\tResolving unstranded transcripts")
    # the second return value is not needed here
    strand_transcript_lists, _ = \
        partition_transcripts_by_strand(transcripts)
    # create strand-specific graphs using redistributed score
    logging.debug("\tCreating transcript graphs")
    transcript_graphs = []
    for strand, transcript_list in enumerate(strand_transcript_lists):
        # create strand specific transcript graph
        G = create_directed_graph(strand, transcript_list)
        # output bedgraph
        if create_bedgraph:
            for fields in get_bedgraph_lines(chrom, G):
                # write() emits the same bytes as the Python 2 only
                # 'print >>fileh' form and also works on Python 3
                bedgraph_filehs[strand].write(
                    '\t'.join(map(str, fields)) + '\n')
        # trim utrs and intron retentions; keep the trimmed nodes in a
        # set because membership is tested once per exon per transcript
        trim_nodes = set(trim_graph(G, strand,
                                    min_trim_length,
                                    trim_utr_fraction,
                                    trim_intron_fraction))
        G.remove_nodes_from(trim_nodes)
        # collapse consecutive nodes in graph
        H, node_chain_map = collapse_strand_specific_graph(G, introns=True)
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        Gsubs = nx.weakly_connected_component_subgraphs(H)
        # add components as separate transcript graphs
        strand_graphs = []
        node_subgraph_map = {}
        for i, Gsub in enumerate(Gsubs):
            for n in Gsub:
                node_subgraph_map[n] = i
            tg = TranscriptGraph(chrom, strand, Gsub)
            tg.partial_paths = collections.defaultdict(float)
            strand_graphs.append(tg)
        # populate transcript graphs with partial paths
        for t in transcript_list:
            # get original transcript nodes and subtract trimmed nodes
            # convert to collapsed nodes and bin according to subgraph
            # TODO: intronic transcripts may be split into multiple pieces,
            # should we allow this?
            subgraph_node_map = collections.defaultdict(set)
            for n in split_exons(t, G.graph['boundaries']):
                n = Exon(*n)
                if n in trim_nodes:
                    continue
                cn = node_chain_map[n]
                subgraph_node_map[node_subgraph_map[cn]].add(cn)
            # add transcript node/score pairs to subgraphs
            for subgraph_id, subgraph_nodes in subgraph_node_map.items():
                # order by start; descending for the negative strand
                ordered_nodes = sorted(subgraph_nodes,
                                       key=operator.attrgetter('start'),
                                       reverse=(strand == NEG_STRAND))
                tg = strand_graphs[subgraph_id]
                tg.partial_paths[tuple(ordered_nodes)] += t.score
        transcript_graphs.extend(strand_graphs)
    # convert the score accumulators into plain lists of (path, score)
    for tg in transcript_graphs:
        tg.partial_paths = list(tg.partial_paths.items())
    return transcript_graphs
# Example no. 4
0
def create_transcript_graphs(chrom,
                             transcripts,
                             min_trim_length=0,
                             trim_utr_fraction=0.0,
                             trim_intron_fraction=0.0,
                             create_bedgraph=False,
                             bedgraph_filehs=None):
    '''
    Build trimmed, collapsed transcript graphs for every strand and
    return them as a list of TranscriptGraph objects, one per weakly
    connected component.

    chrom: chromosome name; written as the first bedgraph column and
    passed to each TranscriptGraph

    transcripts: transcripts to partition by strand via
    partition_transcripts_by_strand()

    min_trim_length, trim_utr_fraction, trim_intron_fraction: forwarded
    unchanged to trim_graph()

    create_bedgraph: when true, one tab-delimited line
    (chrom, start, end, node score) is written per graph node with a
    non-negative start, before any trimming takes place

    bedgraph_filehs: writable file handles indexed by strand; only
    accessed when create_bedgraph is true

    Each returned TranscriptGraph has its 'partial_paths' attribute set
    to a list of (node tuple, summed transcript score) pairs.
    '''
    def get_bedgraph_lines(chrom, G):
        for n in sorted(G.nodes()):
            # nodes with a negative start are not written out
            if n.start < 0:
                continue
            yield (chrom, n.start, n.end, G.node[n][NODE_SCORE])

    # partition transcripts by strand and resolve unstranded transcripts
    logging.debug("\tResolving unstranded transcripts")
    # the second return value is not needed here
    strand_transcript_lists, _ = \
        partition_transcripts_by_strand(transcripts)
    # create strand-specific graphs using redistributed score
    logging.debug("\tCreating transcript graphs")
    transcript_graphs = []
    for strand, transcript_list in enumerate(strand_transcript_lists):
        # create strand specific transcript graph
        G = create_directed_graph(strand, transcript_list)
        # output bedgraph
        if create_bedgraph:
            for fields in get_bedgraph_lines(chrom, G):
                # write() emits the same bytes as the Python 2 only
                # 'print >> fileh' form and also works on Python 3
                bedgraph_filehs[strand].write(
                    '\t'.join(map(str, fields)) + '\n')
        # trim utrs and intron retentions; keep the trimmed nodes in a
        # set because membership is tested once per exon per transcript
        trim_nodes = set(trim_graph(G, strand, min_trim_length,
                                    trim_utr_fraction,
                                    trim_intron_fraction))
        G.remove_nodes_from(trim_nodes)
        # collapse consecutive nodes in graph
        H, node_chain_map = collapse_strand_specific_graph(G, introns=True)
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        Gsubs = nx.weakly_connected_component_subgraphs(H)
        # add components as separate transcript graphs
        strand_graphs = []
        node_subgraph_map = {}
        for i, Gsub in enumerate(Gsubs):
            for n in Gsub:
                node_subgraph_map[n] = i
            tg = TranscriptGraph(chrom, strand, Gsub)
            tg.partial_paths = collections.defaultdict(float)
            strand_graphs.append(tg)
        # populate transcript graphs with partial paths
        for t in transcript_list:
            # get original transcript nodes and subtract trimmed nodes
            # convert to collapsed nodes and bin according to subgraph
            # TODO: intronic transcripts may be split into multiple pieces,
            # should we allow this?
            subgraph_node_map = collections.defaultdict(set)
            for n in split_exons(t, G.graph['boundaries']):
                n = Exon(*n)
                if n in trim_nodes:
                    continue
                cn = node_chain_map[n]
                subgraph_node_map[node_subgraph_map[cn]].add(cn)
            # add transcript node/score pairs to subgraphs
            for subgraph_id, subgraph_nodes in subgraph_node_map.items():
                # order by start; descending for the negative strand
                ordered_nodes = sorted(subgraph_nodes,
                                       key=operator.attrgetter('start'),
                                       reverse=(strand == NEG_STRAND))
                tg = strand_graphs[subgraph_id]
                tg.partial_paths[tuple(ordered_nodes)] += t.score
        transcript_graphs.extend(strand_graphs)
    # convert the score accumulators into plain lists of (path, score)
    for tg in transcript_graphs:
        tg.partial_paths = list(tg.partial_paths.items())
    return transcript_graphs