-
Notifications
You must be signed in to change notification settings - Fork 3
/
dynamic_mode_decomposition.py
248 lines (219 loc) · 10.4 KB
/
dynamic_mode_decomposition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# Dynamic Mode Decomposition based on http://arxiv.org/pdf/1312.0041v1.pdf
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from bacteriopop_utils import prepare_DMD_matrices
def find_fixed_adjacency_matrix(min_abundance=0.0, phylo_column='order',
                                full_svd=True):
    """
    Find the adjacency matrix among clusters of bacteria over the 11
    weeks of sampling, assuming the interaction between clusters is
    fixed in time.

    prepare_DMD_matrices() supplies a dictionary of descriptive tuples
    like ("High", 2) for high-oxygen replicate 2, mapped to dataframes
    with weeks as columns and taxa ("bacteria") as rows.  Unlike
    find_temporal_adjacency_matrix(), a single predictive matrix is fit
    to all 10 transitions between sampling points of each replicate.
    Since the dictionary has 8 tuple keys for High/Low oxygen and 4
    replicates per condition, 8 interaction ("A") matrices are created,
    accessed in linear_mappings by the same tuples.  The names of each
    node can be accessed via nodes_list, the other output.

    :param min_abundance: minimum abundance to look for in original data
    :param phylo_column: most detailed phylogenetic column to consider
    :param full_svd: if True, runs the full svd algorithm. If False,
        runs a faster, reduced version.
    :return: (linear_mappings, nodes_list) dicts keyed by the
        descriptive tuples.
    """
    # Default values (callers may pass None explicitly).
    if min_abundance is None:
        min_abundance = 0
    if phylo_column is None:
        phylo_column = 'order'
    if full_svd is None:
        full_svd = False
    # snapshots of samples over 11 weeks
    snapshots = prepare_DMD_matrices(min_abundance, phylo_column,
                                     oxygen='all', debug=False)
    linear_mappings = {}
    nodes_list = {}
    for descriptive_tuple in snapshots.keys():
        df = snapshots[descriptive_tuple]
        data = df.values
        X = data[:, 0:10]
        Y = data[:, 1:11]
        # Preprocess the abundance data: normalize each week (column).
        X = normalize(X, axis=0)
        Y = normalize(Y, axis=0)
        # DMD: A = Y * pinv(X), with pinv(X) rebuilt from the SVD
        # X = U S V.  numpy returns V already transposed, so
        # pinv(X) = V.T * pinv(S) * U.T.  The previous inv()-based
        # formula required square factors (it broke when the taxa count
        # differed from 10) and raised on zero singular values;
        # transposes plus pinv(S) are valid for both SVD flavors and
        # any rank.
        U, s, V = np.linalg.svd(X, full_matrices=full_svd)
        # S sized to match the returned factors: (n, 10) for the full
        # SVD, (k, k) with k = min(n, 10) for the reduced one.
        S = np.zeros((U.shape[1], V.shape[0]), dtype=float)
        S[:len(s), :len(s)] = np.diag(s)
        pseu_inv_x = np.dot(V.T, np.dot(np.linalg.pinv(S), U.T))
        # Adjacency matrix between clusters
        A = np.dot(Y, pseu_inv_x)
        linear_mappings[descriptive_tuple] = A
        nodes_list[descriptive_tuple] = list(df.index)
    return linear_mappings, nodes_list
def adjacency_matrix_into_pandas(mappings_array, row_and_colnames):
    """
    Wrap one adjacency matrix in a labelled Pandas DataFrame.

    The matrix maps the node set onto itself, so both the index (rows)
    and the columns of the result carry the same labels,
    row_and_colnames.

    :param mappings_array: numpy matrix of node-to-node weights
    :param row_and_colnames: sequence of node names, one per row/column
    :return: one Pandas DataFrame with row and column names.
    """
    # NOTE(review): the labels are long comma-joined taxonomy strings
    # such as "Bacteria,Proteobacteria,Gammaproteobacteria,..." (or
    # "unassigned,,," when the taxonomy was not fully specified);
    # choosing shorter labels remains an open TODO.
    labels = row_and_colnames
    return pd.DataFrame(data=mappings_array, index=labels, columns=labels)
def DMD_results_dict_from_numpy_to_pandas(adj_dict, node_name_dict):
    """
    Convert a dict of {descriptive tuple: numpy adjacency matrix} pairs
    into a dict of {descriptive tuple: labelled pandas DataFrame} pairs,
    using node_name_dict for the row/column labels of each matrix.
    """
    # Both inputs must describe exactly the same set of samples.
    assert set(adj_dict.keys()) == set(node_name_dict.keys())
    return {key: adjacency_matrix_into_pandas(matrix, node_name_dict[key])
            for key, matrix in adj_dict.items()}
def find_temporal_adjacency_matrix(min_abundance, phylo_column, full_svd):
    """
    Find the adjacency matrix among clusters of bacteria from week to
    week, assuming the interaction between clusters is changing.

    One "A" matrix is fit per week-to-week transition, so each
    replicate contributes 10 entries, keyed by
    descriptive_tuple + ('Week <t>',).

    :param min_abundance: ignore the bacteria if their abundance is
        always below the min_abundance
    :param phylo_column: the data is clustered based on the phylo_column
    :param full_svd: the method of singular value decomposition. full
        SVD is more accurate and slower than the reduced SVD
    :return: (linear_mappings, nodes_list) dicts keyed by descriptive
        tuple + week label.
    """
    # Default values (callers may pass None explicitly).
    if min_abundance is None:
        min_abundance = 0
    if phylo_column is None:
        phylo_column = 'family'
    if full_svd is None:
        full_svd = False
    # snapshots of samples over 11 weeks
    snapshots = prepare_DMD_matrices(min_abundance, phylo_column,
                                     oxygen='all', debug=False)
    linear_mappings = {}
    nodes_list = {}
    for descriptive_tuple in snapshots.keys():
        df = snapshots[descriptive_tuple]
        data = df.values
        for time in range(10):
            # Single-column snapshots for two consecutive weeks.
            X = data[:, time:time + 1]
            Y = data[:, time + 1:time + 2]
            # Preprocess the abundance data
            X = normalize(X, axis=0)
            Y = normalize(Y, axis=0)
            # DMD: A = Y * pinv(X), with pinv(X) rebuilt from the SVD
            # X = U S V.  numpy returns V already transposed, so
            # pinv(X) = V.T * pinv(S) * U.T.  The previous inv()-based
            # formula required square factors and raised on zero
            # singular values; transposes plus pinv(S) are valid for
            # both SVD flavors and any rank.  The SVD of real data is
            # real, so S is float (the old complex dtype was
            # unnecessary and inconsistent with
            # find_fixed_adjacency_matrix()).
            U, s, V = np.linalg.svd(X, full_matrices=full_svd)
            S = np.zeros((U.shape[1], V.shape[0]), dtype=float)
            S[:len(s), :len(s)] = np.diag(s)
            pseu_inv_x = np.dot(V.T, np.dot(np.linalg.pinv(S), U.T))
            # Adjacency matrix between clusters for this transition
            A = np.dot(Y, pseu_inv_x)
            key = descriptive_tuple + ('Week ' + str(time + 1),)
            linear_mappings[key] = A
            nodes_list[key] = list(df.index)
    return linear_mappings, nodes_list
def aggregate_adjacency_matrix_over_replicates(mappings):
    """
    Aggregate the adjacency matrices over the replicates of each oxygen
    condition.

    :param mappings: a python dictionary of pandas data frames that
        contains the adjacency matrices for all 8 replicates, 4 high O2
        and 4 low O2.  A key whose first element is "High" is a
        high-oxygen replicate; anything else counts as low oxygen.
        NOTE: the input frames are padded and sorted in place.
    :return:
        std_mappings: a dictionary of pandas data frames for low and
            high replicates standard deviation
        avg_mappings: a dictionary of pandas data frames for low and
            high replicates mean
        snr_mappings: a dictionary of pandas data frames for low and
            high replicates signal to noise ratio
    """
    # Union of all node labels observed across the replicates of each
    # condition.
    current_nodes = {'High': set(), 'Low': set()}
    for key in mappings:
        condition = 'High' if key[0] == 'High' else 'Low'
        current_nodes[condition] |= set(mappings[key].index)
    # Pad each replicate with zero rows/columns for the labels it is
    # missing, then sort rows and columns so every replicate of a
    # condition shares one identical ordering.
    replicate_values = {'High': [], 'Low': []}
    for key in mappings:
        condition = 'High' if key[0] == 'High' else 'Low'
        frame = mappings[key]
        for label in current_nodes[condition]:
            if label not in frame.index:
                # add one column of zeros for the missing label ...
                frame[label] = [0.0] * len(frame.index)
                # ... and one row of zeros (width includes new column)
                frame.loc[label] = [0.0] * len(frame.columns)
        frame = frame.sort_index(axis=1).sort_index()
        mappings[key] = frame
        replicate_values[condition].append(frame.values)
    # Sorted label lists that match the sorted dataframe ordering above.
    # (The previous code passed the raw sets as labels; set iteration
    # order need not match the sorted matrices, and newer pandas rejects
    # sets as an index altogether.)
    node_names = {condition: sorted(labels)
                  for condition, labels in current_nodes.items()}
    # Element-by-element average of the adjacency matrices over the
    # replicates of high/low O2, then label as dataframes.
    avg_mappings = {'High': np.mean(replicate_values['High'], axis=0),
                    'Low': np.mean(replicate_values['Low'], axis=0)}
    avg_mappings = DMD_results_dict_from_numpy_to_pandas(avg_mappings,
                                                         node_names)
    # Element-by-element sample standard deviation (ddof=1) over the
    # replicates of high/low O2.
    std_mappings = {'High': np.std(replicate_values['High'], axis=0, ddof=1),
                    'Low': np.std(replicate_values['Low'], axis=0, ddof=1)}
    std_mappings = DMD_results_dict_from_numpy_to_pandas(std_mappings,
                                                         node_names)
    # Element-by-element signal-to-noise ratio (mean / std) over the
    # replicates of high/low O2.
    snr_mappings = {'High': avg_mappings['High'] / std_mappings['High'],
                    'Low': avg_mappings['Low'] / std_mappings['Low']}
    snr_mappings = DMD_results_dict_from_numpy_to_pandas(snr_mappings,
                                                         node_names)
    return std_mappings, avg_mappings, snr_mappings