/
convolutions.py
613 lines (523 loc) · 19.8 KB
/
convolutions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
# coding=utf-8
"""
A collection of convolution algorithms accelerated with
`NumPy <https://numpy.org/>`_, `CuPy <https://cupy.dev/>`_ and
`Numba <https://numba.pydata.org/>`_. Written for instructive purposes and to
complement the somewhat lacking documentation on how to write `CUDA kernels
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels>`_
in CuPy and Numba, and it does so with a motivating/fun application -- and no,
it's not Convolutional NNs 😒
|
Convolution is used here to evolve a Game of Life (the prototypical Cellular
Automaton model). CAs, which compute the viability of a cell (ie a binary value
in a 2D grid) based on the values (0 = dead, 1 = alive) of the 8 neighbouring
cells, naturally benefit from the optimizations used in NumPy's various
convolution algorithms.
|
There are multiple convolution methods in NumPy/SciPy and each of them can be
further accelerated with CuPy or Numba. So how are we to choose one? This
module aims to provide insights on this question in an easy and (hopefully)
fun way. Apart from sheer speed, relevant issues that will emerge through
experimentation are:
* Compatibility/universality: not all methods work on all systems. For
starters, to use the GPU-accelerated versions, you need a working CUDA
Developer's Toolkit installation. Furthermore, this code is untested on
Linux-based systems (although it is expected to work).
* (Not so) subtle issues with array sizes/alignment -- for example there
seems to be an issue with CuPy's ``RawKernel`` method for arrays whose
dimensions are not powers of 2. I'm currently investigating this and
will have something to say about it (a fix and/or explanation) in the
update.
* Choice of certain parameters, such as the "threads per block" in CUDA
kernels.
|
Notes
=====
* If you like CAs (who doesn't), check out `my other project
<https://github.com/GregSotiropoulos/cellular_automata>`_,
which has a full-fledged GUI that allows you to run, visualize and save
CAs, among other things.
* To use CUDA functionality (CuPy, Numba's ``cuda.jit``), you need to
install the appropriate version of the CUDA Toolkit and CuPy. Head
over to `CuPy's installation guide
<https://docs.cupy.dev/en/stable/install.html>`_ for details. The
existing functions have been tested on CUDA 11.0 and 11.1.
* Since there is no ``requirements.txt`` or ``setup.py``, users should be
aware that on Windows x64 system, NumPy has to have a version other
than 1.19.4 -- for the (gory) details of why this is, see `here
<https://github.com/numpy/numpy/wiki/FMod-Bug-on-Windows>`_.
"""
# NumPy
import numpy as np
from scipy.ndimage import convolve as conv_np_ndi
from scipy.signal import convolve2d as conv_np_sig_2d
from scipy.ndimage import uniform_filter as uf_np
# CuPy
import cupy as cp
from cupyx.scipy.ndimage import uniform_filter as uf_cp, convolve as conv_cp_ndi
from cupyx.scipy.signal import convolve2d as conv_cp_sig_2d
# Numba
import numba as nb
from numba import cuda, stencil
# stdlib
import sys
from time import perf_counter as t
from functools import partial
from itertools import chain
from math import ceil
from operator import attrgetter, sub, rshift
import re
from types import SimpleNamespace as Sns
from collections.abc import Mapping
def default_options():
"""
Get default configuration options (see ``options()``). The *_grades options
refer to the grid dimensions (shape_grades) and the number of generations
that the CA (cellular automaton) grid is to evolve for. 'Fast', 'slow' and
'mixed' refer to the parameter values used when the functions in the set to
be compared (determined by the wildcard/regex pattern of function names,
as shown in ``main()``) are all GPU-accelerated (so CuPy or Numba with
CUDA support),
:return: A dictionary of default options.
"""
return dict(
# CA grid shape
shape_grades=dict(
fast=(1024*4,)*2,
mixed=(1024*2,)*2,
slow=(1024,)*2
),
# number of generations to compute
gens_grades=dict(
fast=50,
mixed=100,
slow=100
),
int_types=('u1', 'i1'),
threads_per_block=(8, ) * 2, # CUDA's blockDim
random_seed=0,
run_func_prefix='run_ca_'
)
# this is the global (module-level) variable that holds the options
# (configuration) -- see options() function below
conv_opts = Sns()
def all_func_speeds(fnames=(), **opts):
"""
Determine whether the functions (passed as function names in ``fnames``)
to be tested are all fast, all slow or a mix of fast and slow ones.
:param fnames: A tuple of function names.
:param opts: Additional options.
:return: A dictionary of options. If `opts` was non-empty, the dictionary
includes those options, except for the 'fnames' and 'grade' items.
"""
opts['fnames'] = fns = *(
fn.strip().replace('run_ca_', '')
for fn in chain(
fnames,
opts.get('fnames', ()),
map(attrgetter('__name__'), opts.get('funcs', ()))
)
),
fast = *map(partial(re.search, r'(?:^\s*(cp_\w*|\w*cuda))(.*)\s*$'), fns),
if fast and all(fast):
opts['grade'] = 'fast'
return opts
slow = map(partial(re.search, r'^\s*(np_manual|\w*sten)\s*$'), fns)
if any(slow):
opts['grade'] = 'slow'
else:
opts['grade'] = 'mixed'
return opts
def options(*args, **kw):
"""
Get various options (configuration) as a nested namespace, so that
something like ``opts.a.b.c`` could be used. Note that the `data` top-level
attribute contains the initial inputs (starting grids) of the
:param args: Mappings (dictionaries) that may contain options. Non-mapping
arguments are silently discarded.
:param kw: Additional keywords. They override any items of the same name
present in any of the *args dictionaries.
:return: A nested namespace of options.
"""
# Attribute names for sub-namespaces.
np_cp_s = 'np', 'cp' # NumPy, CuPy
tps_s = 'u', 's' # unsigned, signed
o = Sns(**default_options()) # initialize namespace
od = vars(o)
np_cp = dict(np=np, cp=cp)
for a in (*args, kw):
if isinstance(a, Mapping):
od.update(a)
od.update(all_func_speeds(**od))
for sg in ('shape', 'gens'):
if sg not in od:
od[sg] = od[f'{sg}_grades'][o.grade]
print('setting', sg, 'to', getattr(o, sg))
o.h, o.w = o.shape
o.data = Sns(np=Sns(), cp=Sns())
odd = vars(o.data)
o.blocks_per_grid = *(
ceil(tpb / hw) for (tpb, hw) in zip(o.shape, o.threads_per_block)
),
npr = np.random
npr.seed(o.random_seed)
init_np = npr.randint(0, 2, o.shape, dtype=o.int_types[0])
for xp_s in np_cp_s: # for each ('np', 'cp')
xp = np_cp[xp_s] # actual NumPy or CuPy module
odxp = vars(odd[xp_s])
for i, tp_s in enumerate(tps_s): # for each 'u' (unsigned) or 's'
odxp[tp_s] = data = Sns()
data.dtype = dtp = xp.dtype(o.int_types[i])
data.init = xp.array(init_np).astype(dtp)
data.rng = xp.arange(5, 8).astype(dtp)
data.isin = xp.isin
data.ker = ker = xp.full((3, 3), 2-i, dtype=dtp)
ker[1, 1] = 1
data.ker_flat = ker.flatten()
data.grid = xp.mgrid[:o.h, :o.w]
#
data.ker_idxs = map(
sub, xp.indices(ker.shape), map(rshift, ker.shape, (1, 1))
)
data.idxs = *(
(
idx +
xp.broadcast_to(kidx.flatten(), (o.h, o.w, ker.size)).T
) % d
for (idx, kidx, d) in zip(data.grid, data.ker_idxs, o.shape)
),
global conv_opts
if 'conv_opts' not in globals() or not isinstance(conv_opts, Sns):
conv_opts = o
vars(conv_opts).update(od)
return conv_opts
def run_ca_np_manual():
o = conv_opts.data.np.s
ker, out, idxs = o.ker_flat, o.init.copy(), o.idxs
f = lambda: (out[idxs].T @ ker).T
for i in range(conv_opts.gens):
out[:, :] = (f() - 3) >> out == 0
return out
def run_ca_np_uf():
out = conv_opts.data.np.s.init.copy()
f = lambda: uf_np(out*9, mode='wrap')
for i in range(conv_opts.gens):
out[:, :] = (f() - 3) >> out == 0
return out
def run_ca_np_ndi_1():
od = conv_opts.data.np.u
isin, rng, ker, out = od.isin, od.rng, od.ker, od.init.copy()
f = lambda: conv_np_ndi(out, ker, mode='wrap')
for i in range(conv_opts.gens):
out[:, :] = isin(f(), rng)
return out
def run_ca_np_ndi_2():
od = conv_opts.data.np.s
ker, out = od.ker, od.init.copy()
f = lambda: conv_np_ndi(out, ker, mode='wrap')
for i in range(conv_opts.gens):
out[:, :] = (f() - 3) >> out == 0
return out.astype(conv_opts.data.np.u.dtype)
def run_ca_np_sig_2d_1():
od = conv_opts.data.np.u
isin, rng, ker, out = od.isin, od.rng, od.ker, od.init.copy()
f = lambda: conv_np_sig_2d(out, ker, boundary='wrap', mode='same')
for i in range(conv_opts.gens):
out[:, :] = isin(f(), rng)
return out
def run_ca_np_sig_2d_2():
od = conv_opts.data.np.s
ker, out = od.ker, od.init.copy()
f = lambda: conv_np_sig_2d(out, ker, boundary='wrap', mode='same')
for i in range(conv_opts.gens):
out[:, :] = (f() - 3) >> out == 0
return out.astype(conv_opts.data.np.u.dtype)
def run_ca_cp_sig_2d_1():
od = conv_opts.data.cp.u
isin, rng, ker, out = od.isin, od.rng, od.ker, od.init.copy()
f = lambda: conv_cp_sig_2d(out, ker, boundary='wrap', mode='same')
for i in range(conv_opts.gens):
out[:, :] = isin(f(), rng)
return cp.asnumpy(out)
def run_ca_cp_sig_2d_2():
od = conv_opts.data.cp.s
ker, out = od.ker, od.init.copy()
f = lambda: conv_cp_sig_2d(out, ker, boundary='wrap', mode='same')
for i in range(conv_opts.gens):
out[:, :] = (f() - 3) >> out == 0
return cp.asnumpy(out)
def run_ca_cp_ndi_1():
od = conv_opts.data.cp.u
isin, rng, ker, out = od.isin, od.rng, od.ker, od.init.copy()
f = lambda: conv_cp_ndi(out, ker, mode='wrap')
for i in range(conv_opts.gens):
out[:, :] = isin(f(), rng)
return cp.asnumpy(out)
def run_ca_cp_ndi_2():
od = conv_opts.data.cp.s
ker, out = od.ker, od.init.copy()
f = lambda: conv_cp_ndi(out, ker, mode='wrap')
for i in range(conv_opts.gens):
out[:, :] = (f() - 3) >> out == 0
return cp.asnumpy(out)
def run_ca_cp_uf():
out = conv_opts.data.cp.u.init.copy()
f = lambda: uf_cp(out*9, mode='wrap')
for i in range(conv_opts.gens):
out[:, :] = (f() - 3) >> out == 0
return cp.asnumpy(out)
def run_ca_cp_raw_ker():
o = conv_opts
od = o.data.cp.u
both = od.init.copy(), od.init.copy()
h, w = o.shape
cp_ker = cp.RawKernel(
# kernel header
fr'''
#define uchar unsigned char
#define N size_t({h})
#define M size_t({w})
extern "C" __global__ void cp_ker_ca(uchar in[N][M], uchar out[N][M])
'''
# kernel body
r'''
{
//int h = blockDim.x * gridDim.x;
//int w = blockDim.y * gridDim.y;
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < M)
{
int a, b;
char v = -3;
for (a = i-1; a < i+2; ++a)
for (b = j-1; b < j+2; ++b)
v += in[a % N][b % M];
out[i][j] = v >> in[i][j] == 0;
}
}
''',
'cp_ker_ca'
)
j = 1
args = o.blocks_per_grid, o.threads_per_block
for _ in range(o.gens):
cp_ker(*args, both[::j])
j = -j
return cp.asnumpy(both[::j][0])
# @cp.fuse(kernel_name='cp_ker')
# def cp_ker(ii, jj):
# pass
def run_ca_nb_cuda_jit_1():
"""
:return:
"""
o = conv_opts
od = o.data.cp.u
both = od.init.copy()
both = both, both.copy()
h, w = od.init.shape
@cuda.jit
def conv_numba_cuda_1(in_, out_):
"""
Of all the different kernel variants that I tried (and I tried a LOT)
this one, with this particular indexing, seems to be the fastest:
ranging from ii (or jj) to ii+3 is measurably faster than the range
from ii-1 to to ii+2, even though the former requires the setting of
the ij variables (in the latter we can just use ii, jj for the final
indexing where the bitshift is.
Note that the situation is reversed in
the case of the raw kernel -- there the ii-1-based indexing is
slightly faster!
As a general rule, writing to memory (even setting simple scalar
variables) is noticeable faster than skipping precomputing and instead
redoing modulo etc operations (as is evident in the code below). As
long as it's not some ridiculously complex operation, repeating them
is usually faster than storing intermediate results. This seems to be
especially the case in C/C++ code; Python, on the othe hand, often
benefits from storing intermediate variables (presumably because it
reduces stack usage?)
:param in_:
:param out_:
"""
ii, jj = ij = cuda.grid(2)
if ii < h and jj < w:
v = -3
for i in range(ii-1, ii+2):
for j in range(jj-1, jj+2):
v += in_[i % h, j % w]
out_[ij] = v >> in_[ij] == 0
f = conv_numba_cuda_1[o.blocks_per_grid, o.threads_per_block]
j = 1
for _ in range(o.gens):
f(*both[::j])
j = -j
return cp.asnumpy(both[::j][0])
def run_ca_nb_cuda_jit_2():
"""
:return: NumPy (not CuPy) array with the last grid of the CA evolution
"""
o = conv_opts
od = o.data.cp.u
both = od.init.copy()
both = both, both.copy()
h, w = od.init.shape
#idx = *(np.arange(-1, x+1).astype(cp.int32) % x for x in (h, w)),
idxs = cp.arange(-1, h+1) % h, cp.arange(-1, w+1) % w
# cool way to construct an mgrid-like thing with a single complex 2darray
# ca = np.mgrid[-1:2, -1:2].T @ [1, 1.j]
@cuda.jit
def conv_numba_cuda_2(in_, out_, idx_i, idx_j):
"""
Of all the different kernel variants that I tried (and I tried a LOT)
this one, with this particular indexing, seems to be the fastest:
ranging from ii (or jj) to ii+3 is measurably faster than the range
from ii-1 to to ii+2, even though the former requires the setting of
the ij variables (in the latter we can just use ii, jj for the final
indexing where the bitshift is.
Note that the situation is reversed in
the case of the raw kernel -- there the ii-1-based indexing is
slightly faster!
As a general rule, writing to memory (even setting simple scalar
variables) is noticeable faster than skipping precomputing and instead
redoing modulo etc operations (as is evident in the code below). As
long as it's not some ridiculously complex operation, repeating them
is usually faster than storing intermediate results. This seems to be
especially the case in C/C++ code; Python, on the othe hand, often
benefits from storing intermediate variables (presumably because it
reduces stack usage?)
:param in_:
:param out_:
:param idx_i:
:param idx_j:
"""
ii, jj = ij = cuda.grid(2)
if ii < h and jj < w:
v = -3
for i in range(ii, ii+3):
for j in range(jj, jj+3):
v += in_[idx_i[i], idx_j[j]]
out_[ij] = v >> in_[ij] == 0
j = 1
f = lambda: conv_numba_cuda_2[o.blocks_per_grid, o.threads_per_block](
*both[::j], *idxs
)
for _ in range(o.gens):
f()
j = -j
return cp.asnumpy(both[::j][0])
def run_ca_nb_njit_sten():
"""
TODO: document this HARD -- very counterintuitive!
:return: NumPy (not CuPy) array with the last grid of the CA evolution
"""
o = conv_opts
data = o.data.np.u
out1 = data.init.copy()
@stencil(neighborhood=((0, 2), ) * 2)
def conv_numba_sten(in1):
"""
:param in1:
:return:
"""
v, r = -3, range(3)
for i in r:
for j in r:
v += in1[i, j]
return v >> in1[1, 1] == 0
for _ in range(o.gens):
# np.pad(out1, 1, mode='wrap') is equivalent to:
# out2 = np.pad(out1, 1) # pad regularly (with zeros)
# out2[1:-1, 1:-1] = out1
# out2[1:-1, 0] = out1[:, -1]
# out2[1:-1, -1] = out1[:, 0]
# out2[0, 1:-1] = out1[-1, :]
# out2[-1, 1:-1] = out1[0, :]
# out2[0, 0] = out1[-1, -1]
# out2[-1, -1] = out1[0, 0]
# out2[0, -1] = out1[-1, 0]
# out2[-1, 0] = out1[0, -1]
out2 = np.pad(out1, 1, mode='wrap')
conv_numba_sten(out2, out=out1)
return out1
# print results
def main(pat='*', print_arrays=False, **opts):
"""
Called when module is run as a script.
:param pat: Wildcard or regular expression pattern. Only functions whose
names (stripped of the prefix 'run_ca_') match the pattern are included
in the benchmark. If a pattern contains only alphanumeric/underscore
and wildcard ('*' and '?') characters only, it is considered a wildcard
pattern (which is automatically converted to a regular expression).
:param print_arrays:
:param opts:
:return:
"""
# if the pattern consists only of alphanumeric characters and/or
# underscores and/or '*' (wildcard matching zero or more characters)
# and/or '?' (wildcard matching exactly one character), it is considered
# a wildcard pattern (which is then converted to a regex pattern) otherwise
# a regex.
if re.match(r'^[\w*?]*$', pat):
if not pat.startswith('*'):
pat = '^' + pat
if not pat.endswith('*'):
pat += '$'
pat = pat.replace('*', '.*').replace('?', '.')
print(
'Function name regex',
'-------------------',
pat,
end='\n\n', sep='\n'
)
o = options(opts)
fn_prefix = o.run_func_prefix
o.fnames = fnames = *filter(
partial(re.search, pat),
(
fn[len(fn_prefix):]
for fn in vars(sys.modules[__name__]) if fn.startswith(fn_prefix)
)
),
print(*fnames, '\nGrade: ' + o.grade, sep='\n', end='\n\n')
gens = o.gens
n_fn = len(fnames)
arg = f'({gens})'
space = len(max(fnames, key=len)) + len(arg)
# grids, call strings (for display purposes), execution times
arrays, call_str, dts = [], [], []
for fn in fnames:
f = eval(fn_prefix + fn)
t0, arr = t(), f()
dt = t() - t0
dts.append(dt)
arrays.append(cp.asnumpy(arr))
call_str.append(f'{fn}{arg}'.center(space))
print(f'{f.__name__}:'.ljust(space+11), f'{dt:.3f} sec')
print()
for i in range(n_fn):
j = (i+1) % n_fn
# t0 = t()
print(
call_str[i],
f'{"!="[int(np.array_equal(arrays[i], arrays[j]))]}=',
call_str[j]
)
print()
# print(f'Compare-arrays dt: {t()-t0:.3f}')
if print_arrays:
print(conv_opts.data.np.u.init, '\n=================')
for a, c in zip(arrays, call_str):
print(f'{c}\n{a}\n=================')
return arrays, call_str
if __name__ == '__main__':
arrays, call_str = main(
r'cuda|^cp_',
#r'manual|np_ndi',
#r'nb*',
#r'cp.*(ndi|raw)|np_ndi|np_uf',
#r'np_ndi*',
print_arrays=False,
gens=1,
shape=(1024*2,)*2,
)