Пример #1
0
 def test_trim_intron_retention(self):
     # Calls trim_graph on the positive-strand graph of
     # "trim_intron_retention1.gtf" with several values of
     # trim_intron_fraction (UTR trimming and the minimum trim length are
     # both set to zero) and checks the set of nodes it returns.
     transcripts = read_first_locus("trim_intron_retention1.gtf",
                                    score_attr="FPKM")
     GG = get_transcript_graphs(transcripts)
     G, _tmap = GG[POS_STRAND]
     # (trim_intron_fraction, nodes expected back from trim_graph)
     cases = [
         (0.01, set()),
         (0.11, set([Exon(500, 1500)])),
         (0.21, set([Exon(500, 1500), Exon(2000, 9000)])),
         (1.0, set([Exon(500, 1500), Exon(2000, 9000)])),
     ]
     for fraction, correct in cases:
         trim_nodes = trim_graph(G,
                                 POS_STRAND,
                                 min_trim_length=0,
                                 trim_utr_fraction=0.0,
                                 trim_intron_fraction=fraction)
         # assertEqual (instead of assertTrue(a == b)) reports which
         # elements differ when the comparison fails
         self.assertEqual(trim_nodes, correct)
def genome_interval_to_exons(start, end, t):
    '''
    Return the list of exons of transcript 't' clipped to the genomic
    interval start..end.

    The first returned exon begins at 'start' and the last one finishes
    at 'end'; exons of 't' lying strictly between those two are returned
    unchanged. When both coordinates resolve to the same exon a single
    Exon(start, end) is returned.

    The interval must be non-empty and lie inside t.start..t.end
    (enforced with assertions).
    '''
    assert start >= t.start
    assert end <= t.end
    assert start < end
    exons = t.exons
    # index of the first exon whose end is not before 'start'
    first = 0
    while exons[first].end < start:
        first += 1
    # continuing from there, index of the first exon whose end is not
    # before 'end'
    last = first
    while exons[last].end < end:
        last += 1
    if first == last:
        # both coordinates resolve to the same exon
        return [Exon(start, end)]
    # clipped first exon, untouched middle exons, clipped last exon
    clipped = [Exon(start, exons[first].end)]
    clipped.extend(exons[first + 1:last])
    clipped.append(Exon(exons[last].start, end))
    return clipped
 def from_table(line):
     # Build an ORFInfo from one tab-delimited line of text.
     #
     # Columns read: 0 transcript id, 1 gene id, 2 ORF id, 3 frame (int),
     # 5 chromosome, 6 start (int), 7 end (int), 8 strand,
     # 9 comma-separated exon starts, 10 comma-separated exon ends, and
     # column ORFInfo.SEQ_COL_NUM for the sequence. Column 4 is not read.
     # NOTE(review): takes no self/cls argument, so this is presumably
     # declared as a @staticmethod; the decorator is not visible here.
     cols = line.strip().split('\t')
     orf = ORFInfo()
     orf.transcript_id = cols[0]
     orf.gene_id = cols[1]
     orf.orf_id = cols[2]
     orf.frame = int(cols[3])
     orf.chrom = cols[5]
     orf.start = int(cols[6])
     orf.end = int(cols[7])
     orf.strand = cols[8]
     # exon coordinates are stored as two parallel comma-separated lists
     starts = [int(value) for value in cols[9].split(',')]
     ends = [int(value) for value in cols[10].split(',')]
     orf.exons = [Exon(lo, hi) for lo, hi in zip(starts, ends)]
     orf.seq = cols[ORFInfo.SEQ_COL_NUM]
     return orf
Пример #4
0
def create_undirected_transcript_graph(transcripts, add_node_func, **kwargs):
    '''
    Merge every transcript into one undirected graph (nx.Graph) and
    return it.

    Each transcript is cut at the exon boundaries found across all
    transcripts; every resulting piece becomes an Exon node, and
    consecutive pieces of a transcript are joined by an edge.

    add_node_func is called as add_node_func(G, node, transcript, **kwargs)
    once for every node of every transcript.
    '''
    # boundaries shared by all transcripts
    boundaries = find_exon_boundaries(transcripts)
    # the graph starts out undirected
    G = nx.Graph()
    for t in transcripts:
        # pieces of this transcript after splitting at the boundaries,
        # in the order split_exons yields them
        path = [Exon(start, end) for start, end in split_exons(t, boundaries)]
        # register the first node, then walk consecutive pairs
        add_node_func(G, path[0], t, **kwargs)
        for src, dst in zip(path, path[1:]):
            add_node_func(G, dst, t, **kwargs)
            G.add_edge(src, dst)
    return G
Пример #5
0
def create_directed_graph(strand, transcripts):
    '''
    Build a directed graph (nx.DiGraph) from the given transcripts.

    Each transcript is cut at the exon boundaries found across all
    transcripts and every piece becomes an Exon node carrying NODE_LENGTH
    (end - start) and NODE_SCORE (sum of the scores of the transcripts
    using the node). Edges join consecutive pieces of a transcript; for
    NEG_STRAND the piece order is reversed before edges are added.

    The boundaries are stored in G.graph['boundaries'].
    '''
    def add_node_directed(G, n, score):
        """accumulate 'score' on node n, creating the node on first use"""
        if n not in G:
            initial_attrs = {
                NODE_LENGTH: (n.end - n.start),
                NODE_SCORE: 0.0
            }
            G.add_node(n, attr_dict=initial_attrs)
        G.node[n][NODE_SCORE] += score

    # start from an empty graph
    G = nx.DiGraph()
    # boundaries shared by all transcripts
    boundaries = find_exon_boundaries(transcripts)
    for t in transcripts:
        # pieces of this transcript after splitting at the boundaries
        # (the original code carried an unfinished note here:
        # "TODO: can generate")
        path = [Exon(start, end) for start, end in split_exons(t, boundaries)]
        if strand == NEG_STRAND:
            path.reverse()
        # register the first node, then walk consecutive pairs
        add_node_directed(G, path[0], t.score)
        for src, dst in zip(path, path[1:]):
            add_node_directed(G, dst, t.score)
            G.add_edge(src, dst)
    # keep the boundaries with the graph
    G.graph['boundaries'] = boundaries
    return G
Пример #6
0
def create_directed_graph(strand, transcripts):
    '''
    Build a directed graph (nx.DiGraph) from the given transcripts.

    Each transcript is cut at the exon boundaries found across all
    transcripts and every piece becomes an Exon node carrying
    TRANSCRIPT_IDS (set of ids of the transcripts using the node),
    NODE_LENGTH (end - start) and NODE_SCORE (sum of the scores of those
    transcripts). Edges join consecutive pieces of a transcript; for
    NEG_STRAND the piece order is reversed before edges are added.
    '''
    def add_node_directed(G, n, t_id, score):
        """record transcript id and score on node n, creating it if new"""
        if n not in G:
            initial_attrs = {
                TRANSCRIPT_IDS: set(),
                NODE_LENGTH: (n.end - n.start),
                NODE_SCORE: 0.0
            }
            G.add_node(n, attr_dict=initial_attrs)
        attrs = G.node[n]
        attrs[TRANSCRIPT_IDS].add(t_id)
        attrs[NODE_SCORE] += score

    # boundaries shared by all transcripts
    boundaries = find_exon_boundaries(transcripts)
    # start from an empty graph
    G = nx.DiGraph()
    for t in transcripts:
        t_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
        # pieces of this transcript after splitting at the boundaries
        path = [Exon(start, end) for start, end in split_exons(t, boundaries)]
        if strand == NEG_STRAND:
            path.reverse()
        # chain the pieces together, remembering the previous node
        prev = path[0]
        add_node_directed(G, prev, t_id, t.score)
        for cur in path[1:]:
            add_node_directed(G, cur, t_id, t.score)
            G.add_edge(prev, cur)
            prev = cur
    return G
Пример #7
0
 def test_trim_intron_bidir(self):
     # Calls trim_graph on the graph of "trim_intron_bidir1.gtf" with
     # several values of trim_intron_fraction (UTR trimming and the
     # minimum trim length are both set to zero) and checks the set of
     # nodes it returns. The same expectations are checked first with the
     # transcripts as read (positive-strand graph) and then again after
     # every transcript has been switched to the negative strand.
     transcripts = read_first_locus("trim_intron_bidir1.gtf",
                                    score_attr="FPKM")
     # (trim_intron_fraction, nodes expected back from trim_graph)
     cases = [
         (0.001, set()),
         (0.025, set([Exon(1900, 2000), Exon(1000, 1100)])),
         (0.2, set([
             Exon(1900, 2000),
             Exon(1100, 1200),
             Exon(1800, 1900),
             Exon(1000, 1100)
         ])),
         (0.25, set([
             Exon(1900, 2000),
             Exon(1100, 1200),
             Exon(1200, 1300),
             Exon(1700, 1800),
             Exon(1800, 1900),
             Exon(1000, 1100)
         ])),
     ]

     def check_strand(strand):
         # rebuild the graphs from the current transcripts and trim the
         # graph of the requested strand at every threshold
         GG = get_transcript_graphs(transcripts)
         G, _tmap = GG[strand]
         for fraction, correct in cases:
             trim_nodes = trim_graph(G,
                                     strand,
                                     min_trim_length=0,
                                     trim_utr_fraction=0.0,
                                     trim_intron_fraction=fraction)
             # assertEqual (instead of assertTrue(a == b)) reports which
             # elements differ when the comparison fails
             self.assertEqual(trim_nodes, correct)

     check_strand(POS_STRAND)
     # flip sign of transcripts and try again
     for t in transcripts:
         t.strand = NEG_STRAND
     check_strand(NEG_STRAND)
Пример #8
0
 def test_assembler1(self):
     # Assembles the positive-strand graph of "assemble1.gtf" twice, with
     # different transcript scores and kmax values, and checks the
     # returned paths and scores in order.

     # expected exon paths
     PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                   Exon(600, 700), Exon(800, 900))
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     GG = get_transcript_graphs(transcripts)
     G, tmap = GG[POS_STRAND]

     def assemble(kmax):
         # prune with all trimming disabled, then assemble the first
         # subgraph produced by the pruning step
         GS = list(
             prune_transcript_graph(G,
                                    POS_STRAND,
                                    tmap,
                                    min_trim_length=0,
                                    trim_utr_fraction=0,
                                    trim_intron_fraction=0))
         Gsub, strand, partial_paths = GS[0]
         return list(
             assemble_transcript_graph(Gsub,
                                       strand,
                                       partial_paths,
                                       user_kmax=kmax,
                                       ksensitivity=0,
                                       fraction_major_path=0,
                                       max_paths=1000))

     def check(results, expected):
         # 'expected' lists (path, score) pairs in the required order.
         # assertEqual replaces the deprecated alias assertEquals.
         self.assertEqual(len(results), len(expected))
         for result, (path, score) in zip(results, expected):
             self.assertEqual(tuple(result.path), path)
             self.assertAlmostEqual(result.score, score, places=3)

     # first scenario: kmax = 2
     tmap["ABCDE"].score = 2.0
     tmap["ACE"].score = 1.0
     tmap["ABCE"].score = 1.0
     tmap["ACDE"].score = 1.0
     check(assemble(2), [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])
     # second scenario: kmax = 3
     tmap["ABCDE"].score = 4.0
     tmap["ACE"].score = 3.0
     tmap["ABCE"].score = 2.0
     tmap["ACDE"].score = 1.0
     check(assemble(3), [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                         (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])
Пример #9
0
def create_transcript_graphs(chrom,
                             transcripts,
                             min_trim_length=0,
                             trim_utr_fraction=0.0,
                             trim_intron_fraction=0.0,
                             create_bedgraph=False,
                             bedgraph_filehs=None):
    '''
    Build the transcript graphs for a set of transcripts and return them
    as a list of TranscriptGraph objects.

    For each strand produced by partition_transcripts_by_strand: a
    directed graph is created, the nodes selected by trim_graph are
    removed, the graph is collapsed, and one TranscriptGraph is created
    per weakly connected component. Each TranscriptGraph gets a
    'partial_paths' attribute holding (node tuple, summed score) items
    for the transcripts passing through that component.

    When create_bedgraph is true, one tab-separated
    (chrom, start, end, score) line per graph node is printed to
    bedgraph_filehs[strand] before trimming.
    '''
    def bedgraph_rows(G):
        # one row per node in sorted order; nodes with a negative start
        # are left out
        for node in sorted(G.nodes()):
            if node.start < 0:
                continue
            yield (chrom, node.start, node.end, G.node[node][NODE_SCORE])

    # partition transcripts by strand and resolve unstranded transcripts
    logging.debug("\tResolving unstranded transcripts")
    # the second element of the returned pair is not used here
    strand_transcript_lists, _unused_ref_transcripts = \
        partition_transcripts_by_strand(transcripts)
    # create strand-specific graphs using redistributed score
    logging.debug("\tCreating transcript graphs")
    all_graphs = []
    for strand, strand_transcripts in enumerate(strand_transcript_lists):
        # strand-specific graph
        G = create_directed_graph(strand, strand_transcripts)
        # optional bedgraph output
        if create_bedgraph:
            for row in bedgraph_rows(G):
                print >> bedgraph_filehs[strand], '\t'.join(str(x) for x in row)
        # trim utrs and intron retentions
        trimmed = trim_graph(G, strand, min_trim_length, trim_utr_fraction,
                             trim_intron_fraction)
        G.remove_nodes_from(trimmed)
        # collapse consecutive nodes in graph
        H, chain_of = collapse_strand_specific_graph(G, introns=True)
        # each weakly connected component is treated as an independent gene
        components = nx.weakly_connected_component_subgraphs(H)
        per_component = []
        component_of = {}
        for idx, Gsub in enumerate(components):
            # remember which component every collapsed node belongs to
            component_of.update((node, idx) for node in Gsub)
            tg = TranscriptGraph(chrom, strand, Gsub)
            tg.partial_paths = collections.defaultdict(float)
            per_component.append(tg)
        # negative-strand paths are ordered by decreasing start
        descending = (strand == NEG_STRAND)
        # populate transcript graphs with partial paths
        for t in strand_transcripts:
            # take the transcript's original nodes, drop the trimmed ones,
            # map the rest to collapsed nodes and bin them by component
            # TODO: intronic transcripts may be split into multiple pieces,
            # should we allow this?
            nodes_by_component = collections.defaultdict(set)
            for interval in split_exons(t, G.graph['boundaries']):
                node = Exon(*interval)
                if node in trimmed:
                    continue
                collapsed = chain_of[node]
                nodes_by_component[component_of[collapsed]].add(collapsed)
            # credit the transcript score to its path in each component
            for idx, members in nodes_by_component.iteritems():
                path = tuple(sorted(members,
                                    key=operator.attrgetter('start'),
                                    reverse=descending))
                per_component[idx].partial_paths[path] += t.score
        all_graphs.extend(per_component)
    # replace each score accumulator with its (path, score) items
    for tg in all_graphs:
        tg.partial_paths = tg.partial_paths.items()
    return all_graphs
Пример #10
0
 def test_assembler1(self):
     # Builds transcript graphs from "assemble1.gtf" twice, with
     # different transcript scores, assembles the first graph with
     # kmax 2 and then kmax 3, and checks the returned paths and scores
     # in order.

     # expected exon paths
     PATH_ABCDE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                   Exon(600, 700), Exon(800, 900))
     PATH_ACE = (Exon(0, 100), Exon(400, 500), Exon(800, 900))
     PATH_ABCE = (Exon(0, 100), Exon(200, 300), Exon(400, 500),
                  Exon(800, 900))
     PATH_ACDE = (Exon(0, 100), Exon(400, 500), Exon(600, 700),
                  Exon(800, 900))
     # read transcripts and index them by transcript id
     transcripts = read_first_locus("assemble1.gtf", score_attr="score")
     tdict = dict((t.attrs['transcript_id'], t) for t in transcripts)

     def assemble(kmax):
         # rebuild the graphs from the current transcript scores with
         # all trimming disabled, then assemble the first graph
         GS = create_transcript_graphs('chr1',
                                       transcripts,
                                       create_bedgraph=False,
                                       bedgraph_filehs=None,
                                       min_trim_length=0,
                                       trim_utr_fraction=0.0,
                                       trim_intron_fraction=0.0)
         first = GS[0]
         return list(
             assemble_transcript_graph(first.Gsub,
                                       first.strand,
                                       first.partial_paths,
                                       user_kmax=kmax,
                                       ksensitivity=0,
                                       fraction_major_path=0,
                                       max_paths=1000))

     def check(results, expected):
         # 'expected' lists (path, score) pairs in the required order.
         # assertEqual replaces the deprecated alias assertEquals.
         self.assertEqual(len(results), len(expected))
         for result, (path, score) in zip(results, expected):
             self.assertEqual(tuple(result.path), path)
             self.assertAlmostEqual(result.score, score, places=3)

     # first scenario: assemble with kmax=2
     tdict["ABCDE"].score = 2.0
     tdict["ACE"].score = 1.0
     tdict["ABCE"].score = 1.0
     tdict["ACDE"].score = 1.0
     check(assemble(2), [(PATH_ABCDE, 3.0), (PATH_ACE, 2.0)])
     # second scenario: change transcript scores, assemble with kmax=3
     tdict["ABCDE"].score = 4.0
     tdict["ACE"].score = 3.0
     tdict["ABCE"].score = 2.0
     tdict["ACDE"].score = 1.0
     check(assemble(3), [(PATH_ABCDE, 4.0), (PATH_ACE, 3.0),
                         (PATH_ABCE, 2.0), (PATH_ACDE, 1.0)])