forked from ctokheim/PrimerSeq
/
algorithms.py
executable file
·391 lines (345 loc) · 17.4 KB
/
algorithms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/env python
# Copyright (C) 2012-2013 Collin Tokheim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import networkx as nx
import numpy as np
import sys
import logging
import utils
import multinomial_em as mem

# Promote all numpy floating-point warnings (divide, overflow, invalid, underflow)
# to raised exceptions so numerical problems in the EM computations fail loudly
# instead of silently propagating NaN/inf.  The previous error settings are kept
# so a caller could restore them via np.seterr(**oldsettings).
oldsettings = np.seterr(all='raise')
def get_biconnected(G):
    """
    Wrapper around the networkx ``biconnected_components`` function.

    Biconnected components with more than two exons (nodes) delimit
    alternative-splicing modules; trivial two-node ("dyad") components are
    dropped.  To find out why the biconnected components algorithm is useful
    for finding constitutive exons check the information section or wikipedia.

    :param G: directed splice graph (networkx DiGraph)
    :return: list of non-trivial components, each a list of exon nodes
    """
    # biconnected_components is only defined for undirected graphs
    G_undirected = G.to_undirected()
    # List comprehension instead of filter()/map(): returns a real list on
    # both Python 2 and Python 3 (filter() is a lazy iterator on Python 3).
    components = [list(comp)
                  for comp in nx.biconnected_components(G_undirected)
                  if len(comp) > 2]  # drop trivial dyad components
    return components
def bellman_ford_longest_path(G, num_nodes, visited, weight='weight'):
    """
    Computes the longest path (most total weight) by only considering
    unexplained edges.  That is, the weight of any edge already in an isoform
    is treated as zero.  This function tries to minimize the number of
    isoforms that could possibly be generated based on novel edges.

    Assumes the node ordering produced by sorted() is a topological order
    with the source node first (exon tuples sort by genomic position).
    The topological-sort version runs in O(n+m) instead of O(nm).

    :param G: directed acyclic splice graph (networkx DiGraph)
    :param num_nodes: NOTE(review): unused parameter, kept for interface
        compatibility with existing callers
    :param visited: nested mapping visited[tail][head] -> 0/1 flag marking
        edges already explained by an isoform (inferred from the ``== 0``
        test below -- TODO confirm flag encoding)
    :param weight: edge-attribute key holding the edge weight
    :return: (longest_path, no_newly_visited) where longest_path is the list
        of nodes from source to sink and no_newly_visited is truthy when
        every edge on that path was already explained
    """
    # initialize distance/path tables; source gets distance 0, all other
    # nodes start at -inf so any real path beats them
    sorted_nodes = sorted(G.nodes())
    d = {nde: float('-inf') for nde in sorted_nodes}
    d[sorted_nodes[0]] = 0  # initialize source to have 0 distance
    p = {nde: [] for nde in sorted_nodes}
    p[sorted_nodes[0]] = [sorted_nodes[0]]  # initialize source path to be itself
    # "edge relax" in (assumed) topological order
    for tail_node in sorted_nodes:
        for head_node in G.successors(tail_node):
            # want longest path of unexplained edges, so all explained edges have zero weight
            edge_weight = G[tail_node][head_node][
                weight] if visited[tail_node][head_node] == 0 else 0
            # larger total weight case
            if d[head_node] < d[tail_node] + edge_weight:
                d[head_node] = d[tail_node] + edge_weight
                p[head_node] = p[tail_node] + [head_node]
            # same total weight case: tie-break by choosing the incoming edge
            # with greater raw weight (p[head_node][-2] is the current
            # predecessor of head_node on its best path)
            elif d[head_node] == (d[tail_node] + edge_weight) and G[tail_node][head_node][weight] > G[p[head_node][-2]][head_node][weight]:
                d[head_node] = d[tail_node] + edge_weight
                p[head_node] = p[tail_node] + [head_node]
    # the sink is the last node in sorted order; check whether every edge on
    # the winning path was already explained
    longest_path = p[sorted_nodes[-1]]
    no_newly_visited = True
    for i in range(len(longest_path)-1):
        # NOTE(review): &= with the 0/1 flag yields an int (0 or 1), not a
        # bool; callers appear to use it only for truthiness
        no_newly_visited &= (visited[longest_path[i]][longest_path[i+1]])
    return p[sorted_nodes[-1]], no_newly_visited
class AllPaths(object):
    '''
    Handle all possible transcript paths in a biconnected component.

    An instance keeps the splice (sub)graph for one alternative-splicing
    module, the annotated plus novel transcript paths through it, and
    provides methods to trim those paths around primers / flanking exons and
    to estimate per-isoform read counts with an EM algorithm.
    '''

    def __init__(self, sg, component, target, chr=None, strand=None):
        """
        :param sg: splice-graph object exposing ``get_graph()`` and ``.annotation``
        :param component: sorted list of exon tuples forming the biconnected component
        :param target: target exon tuple of interest
        :param chr: chromosome name (optional)
        :param strand: '+' or '-' (optional)
        """
        self.set_splice_graph(sg, component, target)
        # save the ASM component; self.component may get trimmed later if
        # primers aren't placed on the first and last exon
        self.asm_component = self.component
        self.chr = chr
        self.strand = strand

    def set_chr(self, chr):
        '''Chromosome setter'''
        self.chr = chr

    def set_strand(self, strand):
        '''Strand setter (only "+" or "-" are accepted)'''
        if strand == '+' or strand == '-':
            self.strand = strand
        else:
            raise ValueError('Strand should either be + or -')

    def set_splice_graph_old(self, sg, component, target):
        """
        Deprecated setter; superseded by :meth:`set_splice_graph` /
        :meth:`all_paths_with_novel_junctions`.  Kept for reference.
        """
        self.graph = sg.get_graph()
        self.tx_paths = sg.annotation
        self.original_tx_paths = sg.annotation  # tx paths always without trimming
        # NOTE(review): tx_paths and original_tx_paths alias sg.annotation,
        # so the append below mutates the shared annotation list -- confirm
        # this side effect is intended.
        known_edges = set([(tx[i], tx[i + 1])
                           for tx in self.tx_paths
                           for i in range(len(tx) - 1)])
        self.component = component
        self.target = target
        self.sub_graph = nx.subgraph(self.graph, self.component)
        # add any possible tx that uses novel edges to list of known txs
        for tx in nx.all_simple_paths(self.sub_graph,
                                      source=self.component[0],
                                      target=self.component[-1]):
            novel = False
            for i in range(len(tx) - 1):
                if (tx[i], tx[i + 1]) not in known_edges:
                    novel = True
            if novel:
                self.tx_paths.append(tx)
        self.inc_lengths, self.skip_lengths = [], []  # call set_all_path_lengths method
        self.all_path_coordinates = []  # call set_all_path_coordinates method

    def set_splice_graph(self, sg, component, target):
        """Setter for the splice graph which consists of graph/transcript attributes"""
        self.graph = sg.get_graph()
        self.tx_paths = sg.annotation
        self.original_tx_paths = sg.annotation  # tx paths always without trimming
        self.component = component
        self.target = target
        self.sub_graph = nx.subgraph(self.graph, self.component)
        # add novel txs
        # NOTE(review): += extends the list in place, so sg.annotation and
        # original_tx_paths also gain the novel paths (same list object);
        # downstream set_all_path_lengths relies on original_tx_paths, so the
        # aliasing is preserved here -- confirm intended.
        novel_txs = self.all_paths_with_novel_junctions()
        self.tx_paths += novel_txs

    def all_paths_with_novel_junctions(self):
        """
        Create novel isoforms by finding all possible paths that include
        at least one novel junction.

        :return: list of novel transcript paths (dummy source/sink stripped)
        :raises utils.PrimerSeqError: if the number of enumerated paths
            exceeds the iteration limit
        """
        iter_limit = 10000  # explicit error for upper limit of all paths
        tx_list = []  # store novel isoforms
        known_edges = set([(tx[i], tx[i + 1])
                           for tx in self.tx_paths
                           for i in range(len(tx) - 1)])
        # dummy source/sink let all_simple_paths enumerate every start/end
        tmp_sub_graph = self.add_dummy_nodes_to_graph(self.sub_graph)
        tmp_src, tmp_sink = (float('-inf'), float('-inf')), (float('inf'), float('inf'))
        # only add paths that include a novel jct
        for l, tx in enumerate(nx.all_simple_paths(tmp_sub_graph,
                                                   source=tmp_src,
                                                   target=tmp_sink)):
            # set the maximum number of iterations
            if l >= iter_limit:
                raise utils.PrimerSeqError('Iteration limit reached in all paths algorithm.')
            # find all paths with novel edge; range skips the artificial
            # edges touching the dummy source and sink nodes
            novel = False
            for i in range(1, len(tx) - 2):
                if (tx[i], tx[i + 1]) not in known_edges:
                    novel = True
            if novel:
                tx_list.append(tx[1:-1])  # strip dummy source/sink
        return tx_list

    def add_dummy_nodes_to_graph(self, my_graph):
        """Return a copy of *my_graph* with a dummy source and sink node added."""
        # setup variables
        tmp_graph = my_graph.copy()  # make sure not editing the original graph
        # NOTE(review): truthiness of predecessors()/successors() assumes
        # networkx 1.x list return values -- confirm if upgrading networkx.
        src_connected_nodes = [node for node in tmp_graph.nodes() if not tmp_graph.predecessors(node)]
        sink_connected_nodes = [node for node in tmp_graph.nodes() if not tmp_graph.successors(node)]
        # add edges to source and sink node; (-inf,-inf)/(inf,inf) sort
        # before/after every real exon tuple
        src, sink = (float('-inf'), float('-inf')), (float('inf'), float('inf'))
        for node in src_connected_nodes:
            tmp_graph.add_edge(src, node)
        for node in sink_connected_nodes:
            tmp_graph.add_edge(node, sink)
        return tmp_graph

    def trim_tx_paths_old(self):
        '''
        Remove all exons outside the biconnected component.
        Deprecated; superseded by :meth:`trim_tx_paths`.
        '''
        self.component = sorted(self.component, key=lambda x: (x[0], x[1]))  # make sure it is sorted
        # trim tx_paths to only contain paths within component_subgraph
        tmp = set()
        for p in self.tx_paths:
            # make sure this tx path has the biconnected component
            if self.component[0] in p and self.component[-1] in p:
                tmp.add(tuple(
                    p[p.index(self.component[0]):p.index(self.component[-1]) + 1]))  # make sure there are no redundant paths
        self.tx_paths = sorted(list(tmp), key=lambda x: (x[0], x[1]))

    def trim_tx_paths(self):
        '''
        Remove all exons outside the biconnected component.
        '''
        self.component = sorted(self.component, key=lambda x: (x[0], x[1]))  # make sure it is sorted
        # trim tx_paths to only contain paths within component_subgraph;
        # the set removes redundant trimmed paths
        tmp = set()
        for p in self.tx_paths:
            # make sure this tx path has the biconnected component
            tmp_path = self._get_sub_tx(p)
            if len(tmp_path) > 1:
                tmp.add(tuple(
                    tmp_path))
        self.tx_paths = sorted(list(tmp), key=lambda x: (x[0], x[1]))

    def _get_sub_tx(self, path):
        """
        Return the contiguous sub-path of *path* that lies inside
        self.component, or [] if the in-component exons are not connected
        in the sub graph.
        """
        sub_path = []
        for p in path:
            if p in self.component:
                # reject paths whose in-component exons aren't adjacent
                if sub_path and not self.sub_graph.has_edge(sub_path[-1], p):
                    return []
                sub_path.append(p)
            elif len(sub_path) > 1:
                return sub_path
        return sub_path

    def trim_tx_paths_using_primers(self, first_primer, second_primer, first_exon, second_exon):
        """
        Get rid of all transcripts which do not contain both the first
        and second primer.  This method may keep transcripts that do not
        contain the exact user defined flanking exons.
        """
        self.component = sorted(self.component, key=lambda x: (x[0], x[1]))  # make sure it is sorted
        # trim tx_paths to only contain paths within component_subgraph
        tmp = set()
        for p in self.tx_paths:
            first_ex = utils.find_first_exon(first_primer, p)
            last_ex = utils.find_last_exon(second_primer, p)
            if first_ex is not None and last_ex is not None:
                # NOTE(review): item assignment below requires p to be a
                # list (slices of tuples are immutable) -- confirm callers
                # run this before any trim that converts paths to tuples
                tmp_path = p[first_ex:last_ex+1]
                if tmp_path[0][0] < first_exon[0]:
                    tmp_path[0] = (first_exon[0], tmp_path[0][1])  # don't start before user-defined exon
                if tmp_path[-1][1] > second_exon[1]:
                    tmp_path[-1] = (tmp_path[-1][0], second_exon[1])  # don't end after user-defined exon
                tmp.add(tuple(tmp_path))  # make sure no redundancies
        self.tx_paths = list(tmp)

    def trim_tx_paths_using_flanking_exons_and_target(self, strand,
                                                      target_exon, up_exon, down_exon):
        """
        Keep the portion of each transcript between the flanking exons; if a
        transcript lacks both flanking exons but contains the target exon,
        keep the exons falling within the flanking-exon coordinate span.
        """
        tmp = set()
        for p in self.tx_paths:
            # make sure this tx path has the biconnected component
            flank_exon_flag = (up_exon in p and down_exon in p)
            target_exon_flag = target_exon in p
            if flank_exon_flag:
                # upstream/downstream swap depending on strand orientation
                if strand == '+':
                    first_index, second_index = p.index(up_exon), p.index(down_exon)
                elif strand == '-':
                    first_index, second_index = p.index(down_exon), p.index(up_exon)
                tmp.add(tuple(
                    sorted(p[first_index:second_index + 1], key=lambda x: (x[0], x[1]))))  # make sure there are no redundant paths
            elif target_exon_flag:
                # keep exons overlapping the coordinate span of the flanks
                tmp_p = []
                for ex in p:
                    if strand == "+":
                        if (up_exon[0] <= ex[0] <= down_exon[1]) or (up_exon[0] <= ex[1] <= down_exon[1]):
                            tmp_p.append(ex)
                    elif strand == "-":
                        if (down_exon[0] <= ex[0] <= up_exon[1]) or (down_exon[0] <= ex[1] <= up_exon[1]):
                            tmp_p.append(ex)
                tmp.add(tuple(sorted(tmp_p, key=lambda x: (x[0], x[1]))))
        self.tx_paths = list(tmp)

    def trim_tx_paths_using_flanking_exons(self, strand, up_exon, down_exon):
        """Keep only the span between up_exon and down_exon for each transcript."""
        tmp = set()
        for p in self.tx_paths:
            # make sure this tx path has the biconnected component
            if up_exon in p and down_exon in p:
                # upstream/downstream swap depending on strand orientation
                if strand == '+':
                    first_index, second_index = p.index(up_exon), p.index(down_exon)
                elif strand == '-':
                    first_index, second_index = p.index(down_exon), p.index(up_exon)
                tmp.add(tuple(
                    sorted(p[first_index:second_index + 1], key=lambda x: (x[0], x[1]))))  # make sure there are no redundant paths
        self.tx_paths = list(tmp)

    def trim_tx_paths_using_flanking_exons2(self, strand, up_exon, down_exon):
        """Keep TXs with appropriate flanking exons (matched by start/end coordinate)"""
        tmp = set()
        for p in self.tx_paths:
            tmp_starts, tmp_ends = zip(*p)  # get starts and ends
            # flank exons only need to share a boundary coordinate, not be
            # byte-identical exon tuples
            if strand == "+":
                my_flag = up_exon[1] in tmp_ends and down_exon[0] in tmp_starts
            elif strand == '-':
                my_flag = up_exon[0] in tmp_starts and down_exon[1] in tmp_ends
            if my_flag:
                if strand == '+':
                    first_index = tmp_ends.index(up_exon[1])
                    second_index = tmp_starts.index(down_exon[0])
                elif strand == '-':
                    first_index = tmp_ends.index(down_exon[1])
                    second_index = tmp_starts.index(up_exon[0])
                tmp.add(tuple(
                    p[first_index:second_index + 1]))  # make sure there are no redundant paths
            my_flag = False
        self.tx_paths = sorted(list(tmp), key=lambda x: (x[0], x[1]))

    def keep_weakly_connected(self):
        '''This method filters out exons (nodes) not involved in AS events'''
        # find weakly connected subgraphs
        weakly_connected_list = nx.weakly_connected_component_subgraphs(self.sub_graph)
        # iterate to find which subgraph has the target exon
        for subgraph in weakly_connected_list:
            if self.target in subgraph.nodes():
                self.sub_graph = subgraph  # assign subgraph that actually connects to target exon

    def estimate_counts(self):
        '''
        Estimates read counts by using :func:`~algorithms.read_count_em`
        and then returns the transcript paths and read counts for those
        paths.

        :return: (list of tx paths, count info from the EM algorithm)
        :raises utils.PrimerSeqError: if the event is not an internal AS
            event (multiple first or last exons, i.e. AFE/ALE)
        '''
        # make sure graph (self.sub_graph) is weakly connected; fall back to
        # the untouched graph if filtering left fewer than two nodes
        tmp_sub_graph = self.sub_graph.copy()
        self.keep_weakly_connected()
        if not len(self.sub_graph.nodes()) > 1:
            self.sub_graph = tmp_sub_graph
        # AFE/ALE testing
        # BUG FIX: the exceptions below were instantiated but never raised,
        # making these validation checks no-ops.
        num_first_exons = len(filter(lambda x: len(self.sub_graph.predecessors(x)) == 0, self.sub_graph.nodes()))
        if num_first_exons > 1:
            raise utils.PrimerSeqError('Error: not internal AS event')
        num_last_exons = len(filter(lambda x: len(self.sub_graph.successors(x)) == 0, self.sub_graph.nodes()))
        if num_last_exons > 1:
            raise utils.PrimerSeqError('Error: not internal AS event')
        # run EM algorithm
        logging.debug('Start read count EM algorithm . . . ')
        self.count_info = mem.multinomial_em(self.tx_paths, self.sub_graph)
        logging.debug('Finished calculating counts.')
        return map(list, self.tx_paths), self.count_info

    def set_all_path_coordinates(self):
        '''
        Computes the (strand, chr, start, end) coordinates for each tx.
        '''
        tmp = []
        for p in self.tx_paths:
            tmp.append(map(lambda x: (self.strand, self.chr, x[0], x[1]), p))
        # BUG FIX: result was previously stored into self.all_path_lengths,
        # clobbering the wrong attribute; this method computes coordinates.
        self.all_path_coordinates = tmp

    def set_all_path_lengths(self, primer_coords):
        '''
        Computes the PCR product length for each isoform, split into
        inclusion (contains target exon) and skipping lengths.
        '''
        # get possible lengths
        inc_length, skip_length = [], []
        for path in self.original_tx_paths:
            if self.target in path:
                inc_length.append(utils.calc_product_length(path, primer_coords))  # product length for an inclusion isoform
            else:
                skip_length.append(utils.calc_product_length(path, primer_coords))  # product length for a skipping isoform
        self.inc_lengths, self.skip_lengths = list(set(inc_length)), list(set(skip_length))

    def get_shortest_path(self):
        """
        Returns the shortest isoform; this could be a skipping isoform or an
        inclusion isoform.
        """
        min_len = float('inf')
        shortest_tx = []
        for tx in self.tx_paths:
            # exclude the flanking first/last exons from the length
            tx_len = sum([end - start for start, end in tx[1:-1]])  # this line won't work for retained introns
            if tx_len < min_len:
                shortest_tx = tx
                min_len = tx_len
        return shortest_tx