"""Methods for loading and preprocessing LFP data from ns5 files"""
from ns5_process import ns5
from ns5_process.myutils import printnow
import numpy as np
import scipy.signal


def load_as_array(datafile, channel_groups, starts=None,
        downsampled_chunksize=128, oversize=1.2, downsample_ratio=64,
        verbose=False, n_chunks=None):
    """Loads LFP from an ns5 file, downsamples, and returns as an array.

    datafile : ns5 file
    channel_groups : list of lists, containing channel ids to grab.
        Example 1: [[1, 2, 3], [5, 6, 7, 8]]. Will return two groups: the
        average of chs 1-3, and the average of chs 5-8.
        Example 2: [[1], [2]]. Will return two groups, one for ch1 and one
        for ch2.
    starts : integer locations in samples where chunks should begin.
        If None, non-overlapping chunks spanning the file are returned.
    downsampled_chunksize : size of the returned chunks. For the purposes of
        this method there is no benefit to choosing a power of 2, because
        the chunk that is actually processed is padded up to a power of 2
        anyway (see `oversize`).
    oversize : how much larger than the returned chunk to actually grab, to
        avoid edge effects. The size of this larger chunk is always rounded
        up to a power of 2 for efficiency.
    downsample_ratio : amount to downsample the data before returning
    verbose : display progress messages
    n_chunks : truncate `starts` to this length, mostly useful for
        benchmarking

    Returns: values, t
        values : array of shape (n_chunks, n_groups, downsampled_chunksize)
        t : array of the same shape, containing the time corresponding to
            each point. dim2 is redundant since all groups are timelocked.
    """
    # Load header
    l = ns5.Loader()
    l.load_file(datafile)
    n_samples = l.header.n_samples
    fs = l.header.f_samp

    # Set up chunk parameters
    # We will actually do the operations on a slightly larger, potentially
    # overlapping, chunk, then slice out just the target area.
    downsample_ratio = int(downsample_ratio)
    oversize = float(oversize)
    downsampled_chunksize = int(downsampled_chunksize)
    chunksize = downsample_ratio * downsampled_chunksize
    bigchunksize = 2 ** int(np.ceil(np.log2(oversize * chunksize)))

    # Define how much to grab on either side of the desired chunk:
    # bigchunk = data[start - prechunk:start - prechunk + bigchunksize]
    # Integer division so this can be used as a sample count / index.
    prechunk = (bigchunksize - chunksize) // 2

    # Define where to get the chunks from
    if starts is None:
        starts = np.arange(prechunk, n_samples + prechunk - bigchunksize + 1,
            chunksize, dtype=int)
    if n_chunks is not None:
        starts = starts[:n_chunks]

    # Calculate the size of everything post downsampling
    post_bigchunksize = bigchunksize // downsample_ratio
    post_chunksize = chunksize // downsample_ratio
    post_prechunk = prechunk // downsample_ratio
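    # Worked example (not from the original code) with the default arguments:
    # downsampled_chunksize=128 and downsample_ratio=64 give chunksize=8192;
    # oversize=1.2 rounds 1.2 * 8192 = 9830.4 up to bigchunksize=2**14=16384,
    # so prechunk=4096. After downsampling: post_bigchunksize=256,
    # post_chunksize=128, post_prechunk=64. With starts=None the chunks
    # begin at samples 4096, 12288, 20480, ... (spaced by chunksize).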
    # Call the underlying implementation. All of the implementations below
    # return the same values but do the operations in different orders.
    #
    # Method 4 was the best of the first attempts. It avoids the worst step,
    # the creation of intermediate arrays, by using get_chunk instead of
    # get_chunk_by_channel. It also downsamples before meaning, to decrease
    # the number of data points.
    # Methods 1-3 use get_chunk_by_channel; method 3 is the best of them
    # because it downsamples before meaning and processes groups separately.
    # There appear to be minimal performance gains from vectorizing the
    # resampling operation, at least not if we include the cost of creating
    # the array.
    # Method 6 adds one final improvement: it does the resampling all at
    # once on the original chunk, to benefit from any possible vectorization
    # without creating a new array, and it slices out the chunk before
    # meaning, to avoid unnecessary meaning.
    #
    # Note that for smaller downsample_ratios, or especially non-integer
    # ratios, it would probably be beneficial to mean before downsampling.
    # In that case method 5 is probably the best choice, though it could
    # still benefit from some of the improvements in method 6.
    res, rest = _load_as_array6(l, channel_groups, starts, bigchunksize,
        prechunk, fs, post_prechunk, post_chunksize, post_bigchunksize,
        verbose)
    return res, rest


# Different implementations of the inner loop
def _load_as_array1(l, channel_groups, starts, bigchunksize, prechunk, fs,
        post_prechunk, post_chunksize, post_bigchunksize, verbose):
    """Process groups separately, mean first, then downsample"""
    # Set up return values
    res_l, rest_l = [], []
    for start in starts:
        # Grab current chunk
        if verbose:
            printnow("loading chunk starting at %d" % start)
        raw = l.get_chunk_by_channel(start=start - prechunk,
            n_samples=bigchunksize)
        t = np.arange(start - prechunk,
            start - prechunk + bigchunksize) / float(fs)

        # Now process one group at a time
        # It might be faster to mean and downsample all groups at once?
        downsampled_l, t_l = [], []
        for group in channel_groups:
            # Mean and downsample
            meaned = np.mean([raw[ch] for ch in group], axis=0)
            downsampled, new_t = scipy.signal.resample(meaned,
                post_bigchunksize, t=t)

            # Slice out just the desired chunk
            # If you grab one more sample here on the ends, you can check
            # how well the overlap is working between chunks
            downsampled = downsampled[post_prechunk:post_prechunk + post_chunksize]
            new_t = new_t[post_prechunk:post_prechunk + post_chunksize]

            # Append to result
            downsampled_l.append(downsampled)
            t_l.append(new_t)
        res_l.append(downsampled_l)
        rest_l.append(t_l)

    res = np.array(res_l)
    rest = np.array(rest_l)
    return res, rest


def _load_as_array2(l, channel_groups, starts, bigchunksize, prechunk, fs,
        post_prechunk, post_chunksize, post_bigchunksize, verbose):
    """Process groups together, mean first, then downsample"""
    # Set up return values
    res_l, rest_l = [], []
    for start in starts:
        # Grab current chunk
        if verbose:
            printnow("loading chunk starting at %d" % start)
        raw = l.get_chunk_by_channel(start=start - prechunk,
            n_samples=bigchunksize)
        t = np.arange(start - prechunk,
            start - prechunk + bigchunksize) / float(fs)

        # Now process all groups at once
        meaned = np.array([np.mean([raw[ch] for ch in group], axis=0)
            for group in channel_groups])
        dsall, new_t = scipy.signal.resample(meaned, post_bigchunksize,
            t=t, axis=1)
        new_t = np.tile(new_t, (len(channel_groups), 1))

        # Slice out just the desired chunk
        # If you grab one more sample here on the ends, you can check
        # how well the overlap is working between chunks
        dsall = dsall[:, post_prechunk:post_prechunk + post_chunksize]
        new_t = new_t[:, post_prechunk:post_prechunk + post_chunksize]
        res_l.append(dsall)
        rest_l.append(new_t)

    res = np.array(res_l)
    rest = np.array(rest_l)
    return res, rest


def _load_as_array3(l, channel_groups, starts, bigchunksize, prechunk, fs,
        post_prechunk, post_chunksize, post_bigchunksize, verbose):
    """Process groups separately, downsample first, then mean"""
    # Set up return values
    res_l, rest_l = [], []
    for start in starts:
        # Grab current chunk
        if verbose:
            printnow("loading chunk starting at %d" % start)
        raw = l.get_chunk_by_channel(start=start - prechunk,
            n_samples=bigchunksize)
        t = np.arange(start - prechunk,
            start - prechunk + bigchunksize) / float(fs)

        # Now process one group at a time
        downsampled_l, t_l = [], []
        for group in channel_groups:
            # Downsample and mean
            rawgroup = np.asarray([raw[ch] for ch in group])
            downsampled, new_t = scipy.signal.resample(rawgroup,
                post_bigchunksize, t=t, axis=1)
            downsampled = np.mean(downsampled, axis=0)

            # Slice out just the desired chunk
            # If you grab one more sample here on the ends, you can check
            # how well the overlap is working between chunks
            downsampled = downsampled[post_prechunk:post_prechunk + post_chunksize]
            new_t = new_t[post_prechunk:post_prechunk + post_chunksize]

            # Append to result
            downsampled_l.append(downsampled)
            t_l.append(new_t)
        res_l.append(downsampled_l)
        rest_l.append(t_l)

    res = np.array(res_l)
    rest = np.array(rest_l)
    return res, rest


def _load_as_array4(l, channel_groups, starts, bigchunksize, prechunk, fs,
        post_prechunk, post_chunksize, post_bigchunksize, verbose):
    """Avoid using intermediate dict object, downsample first"""
    # Set up return values
    res_l, rest_l = [], []
    for start in starts:
        # Grab current chunk
        if verbose:
            printnow("loading chunk starting at %d" % start)
        raw = l.get_chunk(start=start - prechunk, n_samples=bigchunksize)
        t = np.arange(start - prechunk,
            start - prechunk + bigchunksize) / float(fs)

        # Now process one group at a time
        downsampled_l, t_l = [], []
        for group in channel_groups:
            # Column indexes into raw
            igroup = [l.header.Channel_ID.index(ch) for ch in group]

            # Downsample and mean
            downsampled, new_t = scipy.signal.resample(raw[:, igroup],
                post_bigchunksize, t=t, axis=0)
            downsampled = np.mean(downsampled, axis=1)

            # Slice out just the desired chunk
            # If you grab one more sample here on the ends, you can check
            # how well the overlap is working between chunks
            downsampled = downsampled[post_prechunk:post_prechunk + post_chunksize]
            new_t = new_t[post_prechunk:post_prechunk + post_chunksize]

            # Append to result
            downsampled_l.append(downsampled)
            t_l.append(new_t)
        res_l.append(downsampled_l)
        rest_l.append(t_l)

    res = np.array(res_l)
    rest = np.array(rest_l)
    return res, rest


def _load_as_array5(l, channel_groups, starts, bigchunksize, prechunk, fs,
        post_prechunk, post_chunksize, post_bigchunksize, verbose):
    """Avoid using intermediate dict object, mean first"""
    # Set up return values
    res_l, rest_l = [], []
    for start in starts:
        # Grab current chunk
        if verbose:
            printnow("loading chunk starting at %d" % start)
        raw = l.get_chunk(start=start - prechunk, n_samples=bigchunksize)
        t = np.arange(start - prechunk,
            start - prechunk + bigchunksize) / float(fs)

        # Now process one group at a time
        downsampled_l, t_l = [], []
        for group in channel_groups:
            # Column indexes into raw
            igroup = [l.header.Channel_ID.index(ch) for ch in group]

            # Mean and downsample
            meaned = np.mean(raw[:, igroup], axis=1)
            downsampled, new_t = scipy.signal.resample(meaned,
                post_bigchunksize, t=t, axis=0)

            # Slice out just the desired chunk
            # If you grab one more sample here on the ends, you can check
            # how well the overlap is working between chunks
            downsampled = downsampled[post_prechunk:post_prechunk + post_chunksize]
            new_t = new_t[post_prechunk:post_prechunk + post_chunksize]

            # Append to result
            downsampled_l.append(downsampled)
            t_l.append(new_t)
        res_l.append(downsampled_l)
        rest_l.append(t_l)

    res = np.array(res_l)
    rest = np.array(rest_l)
    return res, rest


def _load_as_array6(l, channel_groups, starts, bigchunksize, prechunk, fs,
        post_prechunk, post_chunksize, post_bigchunksize, verbose):
    """Avoid the intermediate dict, resample the whole chunk, then mean"""
    # Indexes into chunk columns
    ichannel_groups = [
        [l.header.Channel_ID.index(ch) for ch in group]
        for group in channel_groups]
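    # For illustration (hypothetical values): if l.header.Channel_ID were
    # [1, 2, 3, 5, 6, 7, 8] and channel_groups were [[1, 2, 3], [5, 6, 7, 8]],
    # ichannel_groups would come out as [[0, 1, 2], [3, 4, 5, 6]].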
    # Set up return values
    res_l, rest_l = [], []
    for start in starts:
        # Grab current chunk
        if verbose:
            printnow("loading chunk starting at %d" % start)
        raw = l.get_chunk(start=start - prechunk, n_samples=bigchunksize)
        t = np.arange(start - prechunk,
            start - prechunk + bigchunksize) / float(fs)

        # Resample and slice out the extra, all at once
        dsraw, new_t = scipy.signal.resample(raw, post_bigchunksize,
            t=t, axis=0)
        dsraw = dsraw[post_prechunk:post_prechunk + post_chunksize]
        new_t = new_t[post_prechunk:post_prechunk + post_chunksize]

        # Now mean each group
        downsampled_l = [np.mean(dsraw[:, igroup], axis=1)
            for igroup in ichannel_groups]
        t_l = [new_t] * len(ichannel_groups)
        res_l.append(downsampled_l)
        rest_l.append(t_l)

    res = np.array(res_l)
    rest = np.array(rest_l)
    return res, rest
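

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the filename and
    # channel ids below are placeholders and should be replaced with a real
    # ns5 file and its channel numbering.
    values, t = load_as_array(
        "example_session.ns5",
        channel_groups=[[1, 2, 3], [5, 6, 7, 8]],
        n_chunks=10, verbose=True)

    # With the default chunk parameters this yields shape (10, 2, 128):
    # 10 chunks, 2 channel groups, 128 downsampled samples per chunk.
    print(values.shape)
    print(t[0, 0, :5])  # times (in seconds) of the first few samples

    # Rough benchmarking sketch, using n_chunks to limit the work as
    # suggested in the docstring.
    import timeit
    dt = timeit.timeit(
        lambda: load_as_array("example_session.ns5",
            channel_groups=[[1, 2, 3]], n_chunks=10),
        number=3)
    print("3 runs of 10 chunks took %0.3f s" % dt)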