def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    """Accumulate the blocked product of g_a and g_b into g_c.

    Tasks are claimed dynamically: each ga.read_inc on g_counter yields the
    index of the next task this process should handle. While one task's
    blocks are being multiplied, the nbget requests for the following task
    have already been issued.

    Args:
        g_a, g_b: global array handles the input blocks are fetched from.
        g_c: global array handle the block products are accumulated into.
        chunk_size, multiplier: forwarded to get_task_list; multiplier**3 is
            used as the total number of tasks.
        g_counter: global array handle whose element 0 is the shared task
            counter (presumably zeroed by the caller -- confirm).

    The function ends with ga.sync() on every path.
    """
    task_list = get_task_list(chunk_size, multiplier)
    total_tasks = multiplier**3

    # claim the first task for this process
    task_id = ga.read_inc(g_counter, 0)
    if task_id >= total_tasks:
        # Every task was already claimed by other processes. Indexing
        # task_list here would be out of range, and skipping the sync would
        # leave the remaining processes waiting on this one.
        ga.sync()
        return

    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    task_id = ga.read_inc(g_counter, 0)
    while task_id < total_tasks:
        # request the next task's blocks before working on the current one
        task_next = task_list[task_id]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        # the current blocks must have arrived before they are multiplied
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        # the prefetched task becomes the current one
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next
        task_id = ga.read_inc(g_counter, 0)
    # drain the last task that was fetched but not yet multiplied
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def time_acc(g_a, lo, hi, buf, chunk, jump, local):
    """Return the mean wall-clock seconds per ga.acc call on 2-D patches.

    Walks the region [lo, hi) in both dimensions with chunk x chunk patches
    whose origins are (chunk + jump) apart, and accumulates the matching
    slice of buf into g_a with a scale factor of 1.

    When local is true the destination indices equal the source indices.
    Otherwise the destination is displaced by the row and/or column extent of
    the region, cycling through three displacements (presumably so the target
    patch belongs to another process -- confirm against the caller).

    Raises ZeroDivisionError when no patch fits in the region.
    """
    nrows = hi[0] - lo[0]
    ncols = hi[1] - lo[1]
    # displacement tables, indexed by (call number % 3)
    row_shift = [nrows, 0, nrows]
    col_shift = [0, ncols, ncols]
    ncalls = 0
    started = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    stride = chunk + jump
    for r0 in range(lo[0], hi[0] - stride + 1, stride):
        r1 = r0 + chunk
        for c0 in range(lo[1], hi[1] - stride + 1, stride):
            c1 = c0 + chunk
            ncalls += 1
            if local:
                dst_lo = [r0, c0]
                dst_hi = [r1, c1]
                patch = buf[ga.zip(dst_lo, dst_hi)]
            else:
                k = ncalls % 3
                dst_lo = [r0 + row_shift[k], c0 + col_shift[k]]
                dst_hi = [r1 + row_shift[k], c1 + col_shift[k]]
                patch = buf[r0:r1, c0:c1]
            ga.acc(g_a, patch, dst_lo, dst_hi, 1)
    elapsed = time.time() - started
    return elapsed / ncalls
def time_acc(g_a, lo, hi, buf, chunk, jump, local):
    """Time ga.acc over a strided grid of square patches.

    Args:
        g_a: global array handle that receives the accumulates.
        lo, hi: two-element bounds of the region to sweep.
        buf: 2-D buffer supplying each patch's data.
        chunk: edge length of every patch.
        jump: extra gap added between consecutive patch origins.
        local: if true, accumulate at the same indices the patch was read
            from; if false, accumulate at indices displaced by the region's
            row/column extent.

    Returns:
        Elapsed seconds divided by the number of ga.acc calls issued. The
        division fails with ZeroDivisionError if the sweep issued no calls.
    """
    span_i = hi[0] - lo[0]
    span_j = hi[1] - lo[1]
    shifti = [span_i, 0, span_i]
    shiftj = [0, span_j, span_j]
    n_acc = 0
    tic = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    pitch = chunk + jump
    for ilo in range(lo[0], hi[0] - pitch + 1, pitch):
        ihi = ilo + chunk
        for jlo in range(lo[1], hi[1] - pitch + 1, pitch):
            jhi = jlo + chunk
            n_acc += 1
            if not local:
                # the running call count selects one of three displacements
                which = n_acc % 3
                di, dj = shifti[which], shiftj[which]
                ga.acc(g_a, buf[ilo:ihi, jlo:jhi],
                       [ilo + di, jlo + dj], [ihi + di, jhi + dj], 1)
            else:
                corner_lo, corner_hi = [ilo, jlo], [ihi, jhi]
                ga.acc(g_a, buf[ga.zip(corner_lo, corner_hi)],
                       corner_lo, corner_hi, 1)
    toc = time.time()
    return (toc - tic) / n_acc
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    """Blocked matrix multiply into g_c with a shared task counter.

    Each process repeatedly claims a task index from element 0 of g_counter
    via ga.read_inc, fetches that task's blocks of g_a and g_b with nbget,
    multiplies them with np.dot and accumulates the product into g_c. The
    fetch for the following task is issued before the current task's blocks
    are waited on and multiplied.

    Args:
        g_a, g_b: global array handles of the two operands.
        g_c: global array handle receiving the accumulated products.
        chunk_size, multiplier: passed to get_task_list; indices below
            multiplier**3 are treated as valid tasks.
        g_counter: global array handle holding the shared task counter
            (assumed to start at 0 -- TODO confirm with the caller).

    Always finishes with ga.sync(), including when this process finds no
    task left to claim.
    """
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3

    current_id = ga.read_inc(g_counter, 0)
    if current_id >= ntasks:
        # No unclaimed task remained for this process. Without this guard
        # task_list would be indexed out of range and this process would
        # never reach the sync the other processes wait in.
        ga.sync()
        return

    # the srumma algorithm, more or less
    current = task_list[current_id]
    a_cur, a_handle = ga.nbget(g_a, current.alo, current.ahi)
    b_cur, b_handle = ga.nbget(g_b, current.blo, current.bhi)
    while True:
        upcoming_id = ga.read_inc(g_counter, 0)
        more_work = upcoming_id < ntasks
        if more_work:
            # issue the next fetch before blocking on the current one
            upcoming = task_list[upcoming_id]
            a_new, a_new_handle = ga.nbget(g_a, upcoming.alo, upcoming.ahi)
            b_new, b_new_handle = ga.nbget(g_b, upcoming.blo, upcoming.bhi)
        ga.nbwait(a_handle)
        ga.nbwait(b_handle)
        ga.acc(g_c, np.dot(a_cur, b_cur), current.clo, current.chi)
        if not more_work:
            break
        current = upcoming
        a_cur, a_handle = a_new, a_new_handle
        b_cur, b_handle = b_new, b_new_handle
    ga.sync()
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    """Accumulate the blocked product of g_a and g_b into g_c.

    The multiplier**3 tasks are split statically: rank `me` handles the
    contiguous index range [me*ntasks, (me+1)*ntasks), and the last rank
    additionally takes the multiplier**3 % nproc leftover tasks. While one
    task's blocks are being multiplied, the nbget requests for the following
    task have already been issued.

    Args:
        g_a, g_b: global array handles the input blocks are fetched from.
        g_c: global array handle the block products are accumulated into.
        chunk_size, multiplier: forwarded to get_task_list.

    Reads the module-level names `me` and `nproc`. Ends with ga.sync() on
    every path.
    """
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    total_tasks = multiplier**3
    ntasks = total_tasks // nproc
    start = me * ntasks
    stop = (me + 1) * ntasks
    if me + 1 == nproc:
        # the last rank also takes the remainder
        stop += total_tasks % nproc

    if start >= stop:
        # More ranks than tasks: this rank's share is empty. Processing
        # task_list[start] anyway would add a block into g_c that the last
        # rank also adds, so only join the closing sync.
        ga.sync()
        return

    # the srumma algorithm, more or less
    task_prev = task_list[start]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    for i in range(start + 1, stop):
        # request the next task's blocks before working on the current one
        task_next = task_list[i]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next
    # drain the last task that was fetched but not yet multiplied
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter): task_list = get_task_list(chunk_size, multiplier) ### get first integer from g_counter and assign to 'task_id' # the srumma algorithm, more or less task_prev = task_list[task_id] a_prev,a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi) b_prev,b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi) ### get next integer from g_counter and assign to 'task_id' while task_id < multiplier**3: task_next = task_list[task_id] a_next,a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi) b_next,b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi) ga.nbwait(a_nb_prev) ga.nbwait(b_nb_prev) result = np.dot(a_prev,b_prev) ga.acc(g_c, result, task_prev.clo, task_prev.chi) task_prev = task_next a_prev,a_nb_prev = a_next,a_nb_next b_prev,b_nb_prev = b_next,b_nb_next ### get next integer from g_counter and assign to 'task_id' ga.nbwait(a_nb_prev) ga.nbwait(b_nb_prev) result = np.dot(a_prev,b_prev) ga.acc(g_c, result, task_prev.clo, task_prev.chi) ga.sync()
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    """Blocked matrix multiply into g_c with a static split of the tasks.

    Rank `me` owns a contiguous run of multiplier**3 // nproc task indices;
    the highest rank also owns whatever is left over after the even split.
    For each owned task the blocks of g_a and g_b are fetched with nbget,
    multiplied with np.dot and accumulated into g_c. The fetch for the
    following task is issued before the current blocks are waited on.

    Args:
        g_a, g_b: global array handles of the two operands.
        g_c: global array handle receiving the accumulated products.
        chunk_size, multiplier: passed to get_task_list.

    Uses the module-level `me` and `nproc`. Every rank finishes with
    ga.sync(), including ranks that own no task.
    """
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    total = multiplier**3
    share = total // nproc
    first = me * share
    last = first + share
    if me == nproc - 1:
        last += total % nproc

    if first >= last:
        # With more ranks than tasks the even share is zero. Such a rank
        # used to redo task_list[first] and add its product into g_c a
        # second time; it must contribute nothing and only join the sync.
        ga.sync()
        return

    # the srumma algorithm, more or less
    current = task_list[first]
    a_cur, a_handle = ga.nbget(g_a, current.alo, current.ahi)
    b_cur, b_handle = ga.nbget(g_b, current.blo, current.bhi)
    for following in range(first + 1, last + 1):
        more_work = following < last
        if more_work:
            # issue the next fetch before blocking on the current one
            upcoming = task_list[following]
            a_new, a_new_handle = ga.nbget(g_a, upcoming.alo, upcoming.ahi)
            b_new, b_new_handle = ga.nbget(g_b, upcoming.blo, upcoming.bhi)
        ga.nbwait(a_handle)
        ga.nbwait(b_handle)
        ga.acc(g_c, np.dot(a_cur, b_cur), current.clo, current.chi)
        if more_work:
            current = upcoming
            a_cur, a_handle = a_new, a_new_handle
            b_cur, b_handle = b_new, b_new_handle
    ga.sync()
def time_acc1(g_a, lo, hi, buf, chunk, jump, local):
    """Return the mean wall-clock seconds per ga.acc call on 1-D patches.

    Note: differs from test.F because the passed buffer must be the same
    size/shape as the patch. The slicing should be fast as the buffer is 1D
    and contiguous (and so is the slice).

    Patches of length chunk start every (chunk + jump) elements within
    [lo[0], hi[0]). With local true each slice of buf is accumulated at its
    own indices; otherwise the destination is displaced by one, two or three
    times the region length, chosen by the running call count.

    Raises ZeroDivisionError when no patch fits in the region.
    """
    extent = hi[0] - lo[0]
    offsets = [extent, 2 * extent, 3 * extent]
    calls = 0
    t0 = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    step = chunk + jump
    for first in range(lo[0], hi[0] - step + 1, step):
        last = first + chunk
        calls += 1
        if local:
            ga.acc(g_a, buf[first:last], [first], [last], 1.0)
            continue
        off = offsets[calls % 3]
        ga.acc(g_a, buf[first:last], first + off, last + off, 1.0)
    return (time.time() - t0) / calls
def time_acc1(g_a, lo, hi, buf, chunk, jump, local):
    """Time ga.acc over strided 1-D patches of a global array.

    Note: differs from test.F because the passed buffer must be the same
    size/shape as the patch. The slicing should be fast as the buffer is 1D
    and contiguous (and so is the slice).

    Args:
        g_a: global array handle that receives the accumulates.
        lo, hi: bounds of the region; only element 0 of each is used.
        buf: 1-D buffer supplying each patch's data.
        chunk: length of every patch.
        jump: extra gap added between consecutive patch starts.
        local: if true, accumulate at the indices the slice was read from;
            if false, at indices displaced by a multiple of the region length.

    Returns:
        Elapsed seconds divided by the number of ga.acc calls issued. The
        division fails with ZeroDivisionError if the sweep issued no calls.
    """
    length = hi[0] - lo[0]
    shift = [length, 2 * length, 3 * length]
    pitch = chunk + jump
    n_acc = 0
    tic = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    starts = range(lo[0], hi[0] - pitch + 1, pitch)
    for n_acc, ilo in enumerate(starts, 1):
        ihi = ilo + chunk
        if not local:
            delta = shift[n_acc % 3]
            ga.acc(g_a, buf[ilo:ihi], ilo + delta, ihi + delta, 1.0)
        else:
            ga.acc(g_a, buf[ilo:ihi], [ilo], [ihi], 1.0)
    toc = time.time()
    return (toc - tic) / n_acc
def check_accumulate_overlap(gatype):
    """Check that accumulates from every process into one element add up.

    A global array of the given type is created and zeroed, then every
    process accumulates the value 1 into the single element at (n//2, n//2).
    After a sync, one process reads the element back and compares it with the
    expected count: lprocs when MIRROR is set (checked by iproc 0), otherwise
    nproc (checked by rank 0). A mismatch is reported through ga.error with
    the expected and the observed value.

    Reads the module-level names me, n, MIRROR, iproc, lprocs and nproc.
    """
    if 0 == me:
        # Python 2 print statement: the trailing comma suppresses the
        # newline so the final 'OK' lands on the same line.
        print('> Checking overlapping accumulate ...'),
    g_a = create_global_array(gatype)
    ga.zero(g_a)
    # floor division keeps the indices integral even under true division
    mid = n // 2
    lo = (mid, mid)
    hi = (mid + 1, mid + 1)
    ga.acc(g_a, [1], lo, hi, 1)
    ga.sync()
    if MIRROR:
        is_checker, expected = (0 == iproc), lprocs
    else:
        is_checker, expected = (0 == me), nproc
    if is_checker:
        got = ga.get(g_a, lo, hi)[0, 0]
        if got != expected:
            # report the real expected/observed pair; the old message printed
            # the difference as "expected" and the expected count as "got"
            ga.error('overlapping accumulate failed -- expected %s got %s' % (
                expected, got))
    if 0 == me:
        print('OK')
    ga.destroy(g_a)
def check_accumulate_disjoint(gatype):
    """Each node accumulates into disjoint sections of the array.

    Rank 0 puts a reference array into a fresh global array. The n x n index
    space is then tiled with patches of edge `inc`, and patch number `ij` is
    accumulated (scaled by 10.0) by exactly one process: the one whose id
    equals ij modulo the process count (iproc/lprocs when MIRROR is set,
    me/nproc otherwise). Every process applies every update to its own local
    copy, and at the end compares the whole global array against that copy,
    calling ga.error('acc failed') on any difference.

    Reads the module-level names me, n, MIRROR, iproc, lprocs and nproc.
    """
    if 0 == me:
        # Python 2 print statement: the trailing comma suppresses the
        # newline so the final 'OK' lands on the same line.
        print('> Checking disjoint accumulate ...'),
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    b = np.fromfunction(lambda i, j: i + j + 2, (n, n),
                        dtype=ga.dtype(gatype))
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    # floor division: the value is a range() step and must stay an integer
    # even where '/' means true division
    inc = (n - 1) // 20 + 1
    x = 10.0
    ij = 0
    for i in range(0, n, inc):
        for j in range(0, n, inc):
            lo = [i, j]
            # clip the last patch in each dimension to the array bounds
            hi = [min(i + inc, n), min(j + inc, n)]
            piece = b[ga.zip(lo, hi)]
            if MIRROR:
                check = ij % lprocs == iproc
            else:
                check = ij % nproc == me
            if check:
                ga.acc(g_a, piece, lo, hi, x)
            # called by every process on every patch, not only the owner
            ga.sync()
            ij += 1
            # each process applies all updates to its local copy
            a[ga.zip(lo, hi)] += x * piece
    ga.sync()
    # all nodes check all of a
    if not np.all(ga.get(g_a) == a):
        ga.error('acc failed')
    if 0 == me:
        print('OK')
    ga.destroy(g_a)
# Driver: every process adds its partial sum from comp_pi into the
# one-element global array g_pi; rank 0 reads the total and reports it.
nprocs = ga.nnodes()
myrank = ga.nodeid()
g_pi = ga.create(ga.C_DBL, [1])

# A single command-line argument fixes n and runs exactly one round;
# otherwise n is requested again on every round.
one_time = len(sys.argv) == 2
if one_time:
    n = int(sys.argv[1])

while True:
    if not one_time:
        # only rank 0 asks for n; the other ranks receive it via the broadcast
        n = ga.brdcst(get_n() if myrank == 0 else 0)
        if n == 0:
            # n == 0 ends the interactive loop
            break
    ga.zero(g_pi)
    mypi = comp_pi(n, myrank, nprocs)
    ga.acc(g_pi, mypi)
    # all contributions must be in before rank 0 reads the total
    ga.sync()
    if myrank == 0:
        pi = ga.get(g_pi)[0]
        prn_pi(pi, PI)
    if one_time:
        break

ga.destroy(g_pi)