/
markov_toolz.py
101 lines (73 loc) · 2.9 KB
/
markov_toolz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
import gzip
import itertools as it
from functools import partial
from glob import glob

import more_itertools as mit
import numpy as np
import toolz as tz
from toolz import curried as cur

# NOTE(review): plot_model also uses `plt` (presumably matplotlib.pyplot),
# which this file never imports -- confirm and add the import.
def is_sequence(line):
    """Return True unless `line` begins with '>'."""
    is_header = line.startswith('>')
    return not is_header
def is_nucleotide(letter):
    """Return True if `letter` is a key of the module-level ``LDICT`` table.

    Any other character is rejected; the pipelines in this file rely on
    this to drop newlines and 'N'.
    """
    # LDICT is the letter -> index table defined at module level.
    return letter in LDICT
def get_sequence(path_to_files):
    """Stream nucleotide letters from every entry of `path_to_files`.

    Each entry is passed to ``fasta_reader``; the results are chained
    together, items starting with '>' are dropped, the remaining items are
    flattened into single characters, and only characters accepted by
    ``is_nucleotide`` are kept.

    NOTE(review): ``fasta_reader`` is not defined or imported in this file;
    presumably it takes a filename and yields its lines -- confirm.

    Returns a lazy iterator; nothing is read until it is consumed.
    """
    per_file_lines = map(fasta_reader, path_to_files)
    all_lines = tz.concat(per_file_lines)
    # keep sequence lines only (drop '>' headers)
    sequence_lines = filter(is_sequence, all_lines)
    # flatten lines into individual characters
    characters = tz.concat(sequence_lines)
    # discard newlines and 'N'
    return filter(is_nucleotide, characters)
def genome_gz(file_pattern):
    """Stream a genome, letter by letter, from files matching a glob pattern.

    `file_pattern` is expanded with ``glob`` and the matches are processed
    in sorted order. Each file is opened with ``gzopen(mode='rt')`` and read
    line by line; lines starting with '>' are dropped, the remaining lines
    are flattened into characters, and only characters accepted by
    ``is_nucleotide`` are kept.

    Returns a lazy iterator. The opened files are never closed explicitly
    by this function.
    """
    # Bind the text-mode flag now; the filename is supplied per file below.
    open_text = gzopen(mode='rt')
    filenames = sorted(glob(file_pattern))
    per_file_lines = map(open_text, filenames)
    # concatenate lines from all files
    all_lines = tz.concat(per_file_lines)
    # drop the header line(s) of each sequence
    sequence_lines = filter(is_sequence, all_lines)
    # concatenate characters from all lines
    letters = tz.concat(sequence_lines)
    # discard newlines and 'N'
    return filter(is_nucleotide, letters)
# Nucleotide letter -> index (A=0, C=1, G=2, T=3).
LDICT = {letter: position for position, letter in enumerate('ACGT')}

# Pair of letters -> pair of indices, e.g. ('A', 'T') -> (0, 3).
PDICT = {
    (first, second): (LDICT[first], LDICT[second])
    for first in LDICT
    for second in LDICT
}
@tz.curry
def increment_model(model, index):
    """Add one to the entry of `model` at position `index`, in place.

    The function is curried: ``increment_model(model)`` returns a
    one-argument callable that takes `index`, so it can be mapped over a
    stream of positions. Nothing is returned; `model` itself is updated.
    """
    model[index] = model[index] + 1
def markov(seq):
    """Get a 1st-order Markov model from a sequence of nucleotides.

    Parameters
    ----------
    seq : iterable of str
        Nucleotide letters. Every pair of consecutive letters must be a key
        of ``PDICT``.

    Returns
    -------
    model : numpy.ndarray of shape (4, 4)
        ``model[i, j]`` is the fraction of the transitions leaving letter
        ``i`` that arrive at letter ``j`` (indices as given by ``PDICT``).
        Each row with at least one observed transition sums to 1; a row
        with none is left as all zeros.

    Raises
    ------
    KeyError
        If a pair of consecutive letters is not in ``PDICT``.
    """
    model = np.zeros((4, 4))
    # Count every consecutive pair. A plain loop makes the side effect
    # explicit and simply does nothing when `seq` has fewer than two letters.
    for pair in tz.sliding_window(2, seq):
        increment_model(model, PDICT[pair])
    # Convert counts to a transition probability matrix, row by row.
    row_totals = model.sum(axis=1, keepdims=True)
    # Skip rows whose total is zero so they stay 0 instead of becoming NaN.
    np.divide(model, row_totals, out=model, where=row_totals > 0)
    return model
def plot_model(model, labels, figure=None):
    """Show `model` as a heat map with a colorbar and labelled axes.

    Parameters
    ----------
    model : array-like
        Image data passed to ``imshow``. Ticks are fixed at positions 0-3,
        so a 4x4 matrix is presumably expected.
    labels : sequence
        Tick labels applied to both the x and the y axis.
    figure : optional
        Figure to draw into; when missing (or falsy) a new one is created
        with ``plt.figure()``.

    Returns
    -------
    The axes holding the heat map.
    """
    if figure:
        fig = figure
    else:
        fig = plt.figure()
    heatmap_axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    image = heatmap_axes.imshow(model, cmap='magma')
    # Narrow strip to the right of the heat map for the colorbar.
    colorbar_axes = fig.add_axes([0.91, 0.1, 0.02, 0.8])
    plt.colorbar(image, cax=colorbar_axes)
    tick_positions = range(4)
    for axis in (heatmap_axes.xaxis, heatmap_axes.yaxis):
        axis.set_ticks(tick_positions)
        # keep the labels but hide the tick marks themselves
        axis.set_ticks_position('none')
        axis.set_ticklabels(labels)
    return heatmap_axes
# Curried gzip.open, so genome_gz can bind mode='rt' before it has a filename.
gzopen = tz.curry(gzip.open)

# Build the transition model for the E. coli genome file and plot it.
model = markov(
    genome_gz('Data/Escherichia_coli_k_12.GCA_000800765.1.29.dna.genome.fa.gz')
)
plot_model(model, labels='ACGT')