-
Notifications
You must be signed in to change notification settings - Fork 0
/
hw2.py
326 lines (273 loc) · 8.44 KB
/
hw2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import numpy as np
from numba import cuda
import math
import matplotlib.pyplot as plt
from map_parallel import sArray
import time
def gpu_total_memory():
    '''
    Return the total memory of the current GPU as reported by Numba.

    The value is element 1 of the memory info returned by the current
    CUDA context.
    '''
    return cuda.current_context().get_memory_info()[1]
def gpu_compute_capability():
    '''
    Return the compute capability of the current GPU as reported by Numba.
    '''
    return cuda.get_current_device().compute_capability
def gpu_name():
    '''
    Return the name of the current GPU as reported by Numba.
    '''
    return cuda.get_current_device().name
def max_float64s():
    '''
    Return how many 64-bit floats fit in the GPU's total memory.

    The total memory (element 1 of the context's memory info) is scaled
    by 8 and divided by the 64 bits each float occupies; the quotient is
    rounded down.
    '''
    bits_per_float = 64
    total = cuda.current_context().get_memory_info()[1]
    return math.floor(total * 8 / bits_per_float)
def map_64():
    '''
    Run the map app on a float64 grid over [0, 1] and plot the result.

    Builds 110501888 evenly spaced float64 samples, passes them through
    sArray, then plots the output against the samples.
    '''
    sample_count = 110501888
    samples = np.linspace(0, 1, sample_count, dtype=np.float64)
    mapped = sArray(samples)
    plt.plot(samples, mapped)
    plt.show()
@cuda.jit(device = True)
def f(x, r):
    '''
    Device function: advance the logistic map by one step.

    Returns r*x*(1 - x).
    '''
    scaled = r*x
    return scaled*(1 - x)
def f_CPU(x, r):
    '''
    Advance the logistic map by one step on the CPU.

    Returns r*x*(1 - x).
    '''
    scaled = r*x
    return scaled*(1 - x)
@cuda.jit()
def logistic_map_kernel(ss, r, x, transient, steady):
    '''
    Kernel iterating the logistic map, one thread per parameter value.

    Arguments:
        ss: 2D device array receiving the stored iterates; written as
            ss[step, i] for step in range(steady) and i indexing r
        r: 1D device array of parameter values
        x: float initial value
        transient: int number of iterations discarded before storing
        steady: int number of iterations to store
    '''
    idx = cuda.grid(1)
    # Threads past the end of r have nothing to compute.
    if idx >= r.size:
        return
    rate = r[idx]
    state = x
    # Burn-in: advance the map without recording anything.
    for _ in range(transient):
        state = f(state, rate)
    # Record each subsequent iterate in this thread's column.
    for step in range(steady):
        state = f(state, rate)
        ss[step, idx] = state
def parallel_logistic_map(r, x, transient, steady):
    '''
    Iterate the logistic map on the GPU for every parameter value in r.

    Arguments:
        r: 1D numpy array of float64 parameter values
        x: float initial value
        transient: int number of iterations before storing results
        steady: int number of iterations to store
    Return:
        2D numpy array (steady rows, r.size columns) of stored iterates
    '''
    threads_per_block = 32
    count = r.size
    # Ceiling division so every entry of r is covered by a thread.
    blocks = (count + threads_per_block - 1) // threads_per_block
    r_dev = cuda.to_device(r)
    ss_dev = cuda.device_array([steady, count], dtype=r.dtype)
    logistic_map_kernel[blocks, threads_per_block](ss_dev, r_dev, x, transient, steady)
    return ss_dev.copy_to_host()
def serial_logistic_map(r,x,transient,steady):
    '''
    Iterate the logistic map on the CPU for every parameter value in r.

    Arguments:
        r: 1D numpy array of parameter values
        x: float initial value
        transient: int number of iterations before storing results
        steady: int number of iterations to store
    Return:
        2D numpy array (steady rows, r.size columns) of stored iterates
    '''
    count = r.size
    ss = np.zeros((steady, count), dtype=r.dtype)
    # Outer loop: one independent simulation per parameter value.
    for col in range(count):
        rate = r[col]
        state = x
        # Burn-in steps: each depends on the previous, nothing is stored.
        for _ in range(transient):
            state = f_CPU(state, rate)
        # Stored steps: each depends on the previous and is recorded.
        for row in range(steady):
            state = f_CPU(state, rate)
            ss[row, col] = state
    return ss
@cuda.jit(device = True)
def iteration_count(cx, cy, dist, itrs):
    '''
    Device function: count Mandelbrot iterations until escape.

    Starting from z = 0, repeatedly applies z -> z**2 + (cx + j*cy) while
    |z| < dist.

    Arguments:
        cx, cy: float64 parameter values
        dist: float64 escape threshold
        itrs: int iteration count limit
    Return:
        The index of the iteration at which |z| was first found not to be
        less than dist, or itrs - 1 if that never happened within itrs
        iterations. Returns 0 when itrs <= 0.
    '''
    # Float literals keep the state variables a single floating type.
    x = 0.0
    y = 0.0
    # Pre-set so the result is defined even if the loop body never runs.
    count = 0
    for step in range(itrs):
        count = step
        radius = (x**2 + y**2)**0.5
        if radius < dist:
            x_next = x**2 - y**2 + cx
            y_next = 2*x*y + cy
            x = x_next
            y = y_next
        else:
            break
    return count
@cuda.jit
def mandelbrot_kernel(out, cx, cy, dist, itrs):
    '''
    Kernel computing Mandelbrot iteration counts, one thread per (cx, cy) pair.

    Arguments:
        out: 2D device array receiving the counts; written as out[j, i]
             where i indexes cx and j indexes cy
        cx, cy: 1D device arrays of parameter values
        dist: float64 escape threshold
        itrs: int iteration count limit
    '''
    col, row = cuda.grid(2)
    # Threads outside the parameter grid have nothing to compute.
    if col >= cx.size or row >= cy.size:
        return
    out[row, col] = iteration_count(cx[col], cy[row], dist, itrs)
def parallel_mandelbrot(cx, cy, dist, itrs):
    '''
    Parallel computation of Mandelbrot iteration counts.

    Arguments:
        cx, cy: 1D numpy arrays of parameter values
        dist: float64 escape threshold
        itrs: int iteration count limit
    Return:
        2D numpy array of iteration counts with shape (cy.size, cx.size);
        entry [j, i] corresponds to the pair (cx[i], cy[j]). The array
        uses cx.dtype.
    '''
    n_x = cx.size
    n_y = cy.size
    d_cx = cuda.to_device(cx)
    d_cy = cuda.to_device(cy)
    # The kernel writes out[j][i] with j < cy.size and i < cx.size, so the
    # first axis must have length cy.size. Allocating (cx.size, cy.size)
    # only worked when both sizes were equal.
    d_out = cuda.device_array([n_y, n_x], dtype = cx.dtype)
    TPB_x = 32
    TPB_y = 32
    # Ceiling division along each axis so the whole grid is covered.
    gridDim = ((n_x+TPB_x-1) // TPB_x, (n_y+TPB_y-1) // TPB_y)
    blockDim = (TPB_x, TPB_y)
    mandelbrot_kernel[gridDim, blockDim](d_out, d_cx, d_cy, dist, itrs)
    return d_out.copy_to_host()
def escape(cx, cy, dist,itrs, x0=0, y0=0):
    '''
    Count iterations of the quadratic (Mandelbrot) map
    f(x+j*y) = (x+j*y)**2 + cx + j*cy, starting from x0 + j*y0, until the
    iterate leaves a circle of radius dist centered at the origin.

    inputs:
        cx - float: the real component of the parameter value
        cy - float: the imaginary component of the parameter value
        dist: radius of the circle
        itrs: int max number of iterations to compute
        x0: initial value of x; default value 0
        y0: initial value of y; default value 0
    returns:
        an int iteration count: the index of the iteration at which the
        iterate was first found outside the circle, or itrs - 1 if it
        stayed inside for all itrs iterations. Returns 0 when itrs <= 0.
    '''
    x, y = x0, y0
    # Pre-set so the result is defined even if the loop body never runs
    # (previously itrs <= 0 raised UnboundLocalError).
    count = 0
    for count in range(itrs):
        radius = (x**2 + y**2)**0.5
        if not radius < dist:
            break
        x, y = x**2 - y**2 + cx, 2*x*y + cy
    return count
def serial_mandelbrot(cx,cy,dist,itrs):
    """
    Compute escape iteration counts for a grid of parameter values.

    input:
        cx - array: 1d array of real part of parameter
        cy - array: 1d array of imaginary part of parameter
        dist - float: radius of circle for escape
        itrs - int: max number of iterations per parameter value
    output:
        a 2d array of shape (len(cy), len(cx)); entry [j, i] is the
        iteration count for the pair (cx[i], cy[j])
    """
    # Rows are indexed by cy and columns by cx, so the first axis must
    # have length len(cy). Allocating (len(cx), len(cy)) only worked when
    # both lengths were equal.
    counts = np.zeros((len(cy), len(cx)))
    for i, real in enumerate(cx):
        for j, imag in enumerate(cy):
            counts[j, i] = escape(real, imag, dist, itrs)
    return counts
if __name__ == "__main__":
    # ---- Problem 1: report GPU properties ----
    total_gb = gpu_total_memory()/1024**3
    print("GPU memory in GB: ", total_gb)
    capability = gpu_compute_capability()
    print("Compute capability (Major, Minor): ", capability)
    model = gpu_name()
    print("GPU Model Name: ", model)
    float_capacity = max_float64s()
    print("Max float64 count: ", float_capacity)

    # Recorded output:
    #   GPU memory in GB: 2.0
    #   Compute capability (Major, Minor): (3, 0)
    #   GPU Model Name: b'GeForce GTX 760M'
    #   Max float64 count: 268435456

    # ---- Problem 2: float64 map app ----
    # By hand calculation the largest array allowed is 134217728 entries;
    # in practice the largest size that ran was 110501888.
    map_64()

    # Recorded error message when exceeding that size:
    #   numba.cuda.cudadrv.driver.CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY

    # ---- Problem 3: logistic map, parallel vs. serial ----
    print("3a) The for loops and their explaination are listed in the function")
    print("3b) The first loop that changes r[i] assign a new parameter r in each simulation. Each iteration are independent of one another in this for loop.The second and third for loop moves the simulation a step foward. Each iteration will affect the next iteration.")

    r = np.linspace(0, 4, num=1000)
    x0 = 0.5
    trans = 992
    steady = 8

    # NOTE(review): this is the first launch of the kernel, so the measured
    # time presumably includes Numba's JIT compilation -- confirm before
    # reading too much into the acceleration factor.
    tic = time.time()
    ss = parallel_logistic_map(r, x0, trans, steady)
    ss_time = time.time() - tic
    print("The time of parallel computation is " + str(ss_time) +"sec")

    tic = time.time()
    ss2 = serial_logistic_map(r, x0, trans, steady)
    ss2_time = time.time() - tic
    print("The time of serial computation is " + str(ss2_time) +"sec")

    speedup = ss2_time / ss_time
    print("The acceleration factor is " + str(speedup))

    ### plots for testing
    #plt.figure()
    #for i in range(steady):
    #    plt.scatter(r,ss[i,:],s = 0.1)
    #plt.show()
    #plt.figure()
    #for i in range(steady):
    #    plt.scatter(r,ss2[i,:],s = 0.1)
    #plt.show()
    ###

    # ---- Problem 4: Mandelbrot, parallel vs. serial ----
    print("4a)In serial code, for loops can bbe used to assign differenc cx or cy value, or to move the simulation. For loops that are used to assign different cx and cy are independent to one another.")
    print("4b)I verified the result by comparing two plots below")
    print("4c)My finest resolution of the 2d grid of cx and cy is 186*186, anything bigger than this will result an error: Call to cuMemcpyDtoH results in CUDA_ERROR_LAUNCH_FAILED")
    print("4c)My largest square block of TPBs that can run on my gpu is 32*32, anything bigger than this will result an error: Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE")

    cx = np.linspace(-2.5, 2.5, 100)
    cy = np.linspace(-2.5, 2.5, 100)
    dist = 2.5
    itrs = 256
    y_ = parallel_mandelbrot(cx, cy, dist, itrs)
    y_2 = serial_mandelbrot(cx, cy, dist, itrs)

    # Show the GPU result first, then the CPU result, for visual comparison.
    for counts in (y_, y_2):
        plt.pcolormesh(counts)
        plt.show()