def parallel_task():
    """Create and randomize a 3x4x5 double-precision global array.

    The process whose group rank is 0 first prints the handle returned by
    ga.pgroup_get_default() and, after the array has been randomized,
    prints the sum of the data it gets back from ga.access().
    """
    rank = ga.pgroup_nodeid()
    group_size = ga.pgroup_nnodes()
    is_master = (rank == 0)
    if is_master:
        print("This is process 0 on group %s" % ga.pgroup_get_default())
    g_a = ga.create(ga.C_DBL, (3, 4, 5))
    ga.randomize(g_a)
    if is_master:
        local_block = ga.access(g_a)
        print(np.sum(local_block))
def matrix_multiply():
    """Multiply two random global matrices block by block and verify the result.

    Creates global arrays A, B and C with a deliberately uneven chunking,
    fills A and B from process 0, lets every process compute the block of C
    that ga.distribution() reports for it with a local np.dot, then calls
    verify() and destroys the arrays.

    Relies on the module-level names TOTALELEMS, NDIM, nprocs, me and verify.
    """
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS] * NDIM
    # Floor division keeps the chunk size an integer regardless of the
    # interpreter's "/" semantics.
    chunk = [TOTALELEMS // nprocs - 1] * NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c:
        # Was spelled ga.eror, which would raise AttributeError instead of
        # reporting the failure.
        ga.error("duplicate failed")
    if not me:
        print("Created Arrays B and C")

    # Initialize data in matrices a and b.
    if not me:
        print("Initializing matrix A and B")
    a = np.random.rand(*dims) * 29
    b = np.random.rand(*dims) * 37

    # Copy data to global arrays g_a and g_b (process 0 only).
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo, hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0], 0), (hi[0], dims[0]))
    b = ga.get(g_b, (0, lo[1]), (dims[1], hi[1]))

    # Multiply a and b to get the local block of c. No explicit transpose
    # of b is needed: np.dot consumes b as fetched.
    c = np.dot(a, b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
def matrix_multiply():
    """Multiply two random global matrices block by block and verify the result.

    Creates global arrays A, B and C with a deliberately uneven chunking,
    fills A and B from process 0, lets every process compute the block of C
    that ga.distribution() reports for it with a local np.dot, then calls
    verify() and destroys the arrays.

    Relies on the module-level names TOTALELEMS, NDIM, nprocs, me and verify.
    """
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS] * NDIM
    # Floor division keeps the chunk size an integer regardless of the
    # interpreter's "/" semantics.
    chunk = [TOTALELEMS // nprocs - 1] * NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c:
        # Was spelled ga.eror, which would raise AttributeError instead of
        # reporting the failure.
        ga.error("duplicate failed")
    if not me:
        print("Created Arrays B and C")

    # Initialize data in matrices a and b.
    if not me:
        print("Initializing matrix A and B")
    a = np.random.rand(*dims) * 29
    b = np.random.rand(*dims) * 37

    # Copy data to global arrays g_a and g_b (process 0 only).
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo, hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0], 0), (hi[0], dims[0]))
    b = ga.get(g_b, (0, lo[1]), (dims[1], hi[1]))

    # Multiply a and b to get the local block of c. No explicit transpose
    # of b is needed: np.dot consumes b as fetched.
    c = np.dot(a, b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
def TRANSPOSE1D():
    """Reverse a distributed 1-D integer array and verify the reversal.

    Process 0 fills global array A with 0..len-1. Every process then fetches
    the range ga.distribution() reports for it, reverses that range locally
    and writes it to the mirrored position in global array B. Process 0
    finally calls verify() before both arrays are destroyed.

    Relies on the module-level names nprocs, TOTALELEMS, me and verify.
    """
    # Configure array dimensions. Force an unequal data distribution.
    total = nprocs * TOTALELEMS + nprocs / 2
    dims = [total]
    chunk = [TOTALELEMS]  # minimum data on each process

    # Create the source array, then clone it for the destination.
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    if not g_b:
        ga.error("duplicate failed")
    if not me:
        print("Created Array B")

    # Only process 0 writes the initial contents of A.
    if not me:
        print("Initializing matrix A")
        initial_values = np.arange(total, dtype=np.int32)
        ga.put(g_a, initial_values)

    # Everyone must see the initial data before reading it back.
    ga.sync()

    # Find the range this process is responsible for and fetch it.
    lo, hi = ga.distribution(g_a)
    local_block = ga.get(g_a, lo, hi)

    # Reverse the local data ...
    reversed_block = local_block[::-1]

    # ... and store it at the mirrored position in the destination array.
    mirrored_lo = total - hi[0]
    mirrored_hi = total - lo[0]
    ga.put(g_b, reversed_block, mirrored_lo, mirrored_hi)

    # Wait until every process has written its piece.
    ga.sync()

    # Check the result on process 0 only.
    if not me:
        verify(g_a, g_b)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
def TRANSPOSE1D():
    """Reverse a distributed 1-D integer array and verify the reversal.

    Process 0 fills global array A with 0..len-1. Every process then fetches
    the range ga.distribution() reports for it, reverses that range locally
    and writes it to the mirrored position in global array B. Process 0
    finally calls verify() before both arrays are destroyed.

    Relies on the module-level names nprocs, TOTALELEMS, me and verify.
    """
    # Configure array dimensions. Force an unequal data distribution.
    total = nprocs * TOTALELEMS + nprocs / 2
    dims = [total]
    chunk = [TOTALELEMS]  # minimum data on each process

    # Create the source array, then clone it for the destination.
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    if not g_b:
        ga.error("duplicate failed")
    if not me:
        print("Created Array B")

    # Only process 0 writes the initial contents of A.
    if not me:
        print("Initializing matrix A")
        initial_values = np.arange(total, dtype=np.int32)
        ga.put(g_a, initial_values)

    # Everyone must see the initial data before reading it back.
    ga.sync()

    # Find the range this process is responsible for and fetch it.
    lo, hi = ga.distribution(g_a)
    local_block = ga.get(g_a, lo, hi)

    # Reverse the local data ...
    reversed_block = local_block[::-1]

    # ... and store it at the mirrored position in the destination array.
    mirrored_lo = total - hi[0]
    mirrored_hi = total - lo[0]
    ga.put(g_b, reversed_block, mirrored_lo, mirrored_hi)

    # Wait until every process has written its piece.
    ga.sync()

    # Check the result on process 0 only.
    if not me:
        verify(g_a, g_b)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
"""Use ga.access() to sum locally per SMP node."""
import mpi4py.MPI
import ga
import numpy as np

world_id = ga.nodeid()
world_nproc = ga.nnodes()
node_id = ga.cluster_nodeid()
node_nproc = ga.cluster_nprocs(node_id)
# NOTE(review): the second argument here is a world rank, whereas the loop
# below passes an index in range(node_nproc) -- confirm this is what is
# intended when running on more than one node.
node_me = ga.cluster_procid(node_id, ga.nodeid())

g_a = ga.create(ga.C_DBL, (3, 4, 5, 6))
if world_id == 0:
    # Process 0 alone writes the initial contents.
    ga.put(g_a, np.arange(3 * 4 * 5 * 6))
ga.sync()

if node_me == 0:
    # Named node_total / neighbor_block (not sum / buffer) so the builtins
    # of the same name stay usable.
    node_total = 0
    for i in range(node_nproc):
        smp_neighbor_world_id = ga.cluster_procid(node_id, i)
        neighbor_block = ga.access(g_a, proc=smp_neighbor_world_id)
        node_total += np.sum(neighbor_block)
    print(node_total)
import mpi4py.MPI  # initialize Message Passing Interface
import ga  # initialize Global Arrays

me = ga.nodeid()


def print_distribution(g_a):
    """Print the lo/hi bounds ga.distribution() reports for every process.

    g_a is the handle of the global array to inspect; one line is printed
    per process index in range(ga.nnodes()).
    """
    for i in range(ga.nnodes()):
        lo, hi = ga.distribution(g_a, i)
        print("%s lo=%s hi=%s" % (i, lo, hi))


# create some arrays
g_a = ga.create(ga.C_DBL, (10, 20, 30), chunk=(-1, 20, -1))
g_b = ga.create(ga.C_DBL, (10, 20, 30), chunk=(10, -1, -1))

if not me:
    print_distribution(g_a)
    print_distribution(g_b)

ga.fill(g_a, 6)
ga.copy(g_a, g_b)

if not me:
    # Named local_block rather than buffer so the Python 2 builtin
    # buffer() is not shadowed.
    local_block = ga.access(g_b)
    print(local_block.shape)
    print(local_block)
    # Pair the read-only access with a release, as the Global Arrays
    # access protocol asks for.
    ga.release(g_b)
def verify_using_np(g_a, g_b, g_c): a = ga.get(g_a) b = ga.get(g_b) c = ga.get(g_c) v = np.dot(a,b) val = int(np.abs(np.sum(c-v))>0.0001) val = ga.gop_add(val) return val == 0 if __name__ == '__main__': if nproc > MULTIPLIER**3: if 0 == me: print "You must use less than %s processors" % (MULTIPLIER**3+1) else: g_a = ga.create(ga.C_DBL, [N,N]) g_b = ga.create(ga.C_DBL, [N,N]) g_c = ga.create(ga.C_DBL, [N,N]) g_counter = ga.create(ga.C_INT, [1]) ga.zero(g_counter) # put some fake data into input arrays A and B if me == 0: ga.put(g_a, np.random.random(N*N)) ga.put(g_b, np.random.random(N*N)) ga.sync() if me == 0: print "srumma...", srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER, g_counter) if me == 0: print "done" if me == 0:
import mpi4py.MPI  # initialize Message Passing Interface
import ga  # initialize Global Arrays
import numpy as np

me = ga.nodeid()
nproc = ga.nnodes()


def print_distribution(g_a):
    """Print the lo/hi bounds ga.distribution() reports for every process.

    g_a is the handle of the global array to inspect; one line is printed
    per process index in range(ga.nnodes()).
    """
    for i in range(ga.nnodes()):
        lo, hi = ga.distribution(g_a, i)
        print("P=%s lo=%s hi=%s" % (i, lo, hi))


# create some irregular arrays
block = [3, 2]
# Called block_map rather than map so the builtin map() is not shadowed.
block_map = [0, 2, 6, 0, 5]
if nproc < np.prod(block):
    # Call-style raise instead of the legacy "raise X, msg" form.
    raise ValueError("ERROR: fewer procs than requested blocks")
g_a = ga.create_irreg(ga.C_DBL, [8, 10], block, block_map, "Array A")
if not g_a:
    ga.error("Could not create global array A", g_a)
g_b = ga.create(ga.C_INT, (2, 3, 4, 5, 6))

if not me:
    print_distribution(g_a)
    print_distribution(g_b)
def verify_using_np(g_a, g_b, g_c): a = ga.get(g_a) b = ga.get(g_b) c = ga.get(g_c) v = np.dot(a, b) val = int(np.abs(np.sum(c - v)) > 0.0001) val = ga.gop_add(val) return val == 0 if __name__ == '__main__': if nproc > MULTIPLIER**3: if 0 == me: print "You must use less than %s processors" % (MULTIPLIER**3 + 1) else: g_a = ga.create(ga.C_DBL, [N, N]) g_b = ga.create(ga.C_DBL, [N, N]) g_c = ga.create(ga.C_DBL, [N, N]) # put some fake data into input arrays A and B if me == 0: ga.put(g_a, np.random.random(N * N)) ga.put(g_b, np.random.random(N * N)) ga.sync() if me == 0: print "srumma...", srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER) if me == 0: print "done" if me == 0: print "verifying using ga.gemm...", ok = verify_using_ga(g_a, g_b, g_c)
"""Use ga.access() to sum locally per SMP node."""
import mpi4py.MPI
import ga
import numpy as np

# world_id was used below without ever being assigned, which raised
# NameError before the array could be filled.
world_id = ga.nodeid()

# Okay, we create the global array
g_a = ga.create(ga.C_DBL, (3, 4, 5, 6))
if world_id == 0:
    # Process 0 alone writes the initial contents.
    ga.put(g_a, np.arange(3 * 4 * 5 * 6))
ga.sync()

# You're on your own!
import mpi4py.MPI  # initialize Message Passing Interface
import ga  # initialize Global Arrays

me = ga.nodeid()


def print_distribution(g_a):
    """Print the lo/hi bounds ga.distribution() reports for every process.

    g_a is the handle of the global array to inspect; one line is printed
    per process index in range(ga.nnodes()).
    """
    for i in range(ga.nnodes()):
        lo, hi = ga.distribution(g_a, i)
        print("%s lo=%s hi=%s" % (i, lo, hi))


# create some arrays
g_a = ga.create(ga.C_DBL, (10, 20, 30), chunk=(-1, 20, -1))
g_b = ga.create(ga.C_DBL, (10, 20, 30), chunk=(10, -1, -1))

if not me:
    print_distribution(g_a)
    print_distribution(g_b)

ga.fill(g_a, 6)
ga.copy(g_a, g_b)

if not me:
    # Named local_block rather than buffer so the Python 2 builtin
    # buffer() is not shadowed.
    local_block = ga.access(g_b)
    print(local_block.shape)
    print(local_block)
    # Pair the read-only access with a release, as the Global Arrays
    # access protocol asks for.
    ga.release(g_b)
def parallel_task():
    """Exercise skeleton: create and randomize a 3x4x5 double global array.

    The group rank and group size are looked up first; the message that the
    group's master should print is left for the reader to add.
    """
    me = ga.pgroup_nodeid()
    nproc = ga.pgroup_nnodes()

    ### print a message from the master of the group

    shape = (3, 4, 5)
    g_rand = ga.create(ga.C_DBL, shape)
    ga.randomize(g_rand)
def comp_pi(n, myrank=0, nprocs=1):
    """Return one rank's share of a midpoint-rule estimate of pi.

    The interval [0, 1] is cut into n strips; this call sums 4 / (1 + x**2)
    at the midpoints of strips myrank+1, myrank+1+nprocs, ... up to n and
    scales the sum by the strip width.
    """
    width = 1.0 / n
    partial = 0.0
    for strip in xrange(myrank + 1, n + 1, nprocs):
        midpoint = width * (strip - 0.5)
        partial += 4.0 / (1.0 + midpoint**2)
    return partial * width


def prn_pi(pi, PI):
    """Print the estimate pi and its absolute difference from the reference PI."""
    error = abs(pi - PI)
    print("pi is approximately %.16f, error is %.16f" % (pi, error))


nprocs = ga.nnodes()
myrank = ga.nodeid()

g_pi = ga.create(ga.C_DBL, [1])

# A single command-line argument fixes n and switches to one-shot mode.
one_time = False
if len(sys.argv) == 2:
    n = int(sys.argv[1])
    one_time = True

while True:
    if not one_time:
        # Rank 0 obtains n and broadcasts it; the other ranks receive it.
        if myrank == 0:
            n = get_n()
            n = ga.brdcst(n)
        else:
            n = ga.brdcst(0)
        if n == 0:
            break
    # NOTE(review): the visible source ends here; nothing in this excerpt
    # leaves the loop when one_time is true.
def parallel_task():
    """Exercise skeleton: create and randomize a 3x4x5 double global array.

    The group rank and group size are looked up first; the message that the
    group's master should print is left for the reader to add.
    """
    me = ga.pgroup_nodeid()
    nproc = ga.pgroup_nnodes()

    ### print a message from the master of the group

    shape = (3, 4, 5)
    g_rand = ga.create(ga.C_DBL, shape)
    ga.randomize(g_rand)
def verify_using_np(g_a, g_b, g_c): a = ga.get(g_a) b = ga.get(g_b) c = ga.get(g_c) v = np.dot(a,b) val = int(np.abs(np.sum(c-v))>0.0001) val = ga.gop_add(val) return val == 0 if __name__ == '__main__': if nproc > MULTIPLIER**3: if 0 == me: print "You must use less than %s processors" % (MULTIPLIER**3+1) else: g_a = ga.create(ga.C_DBL, [N,N]) g_b = ga.create(ga.C_DBL, [N,N]) g_c = ga.create(ga.C_DBL, [N,N]) # put some fake data into input arrays A and B if me == 0: ga.put(g_a, np.random.random(N*N)) ga.put(g_b, np.random.random(N*N)) ga.sync() if me == 0: print "srumma...", srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER) if me == 0: print "done" if me == 0: print "verifying using ga.gemm...", ok = verify_using_ga(g_a, g_b, g_c)
import mpi4py.MPI  # initialize Message Passing Interface
import ga  # initialize Global Arrays

me = ga.nodeid()


def print_distribution(g_a):
    """Print, for each process index, the bounds ga.distribution() reports.

    g_a is the handle of the global array to inspect.
    """
    proc_count = ga.nnodes()
    for proc in range(proc_count):
        lower, upper = ga.distribution(g_a, proc)
        print("%s lo=%s hi=%s" % (proc, lower, upper))


# create some arrays
g_a = ga.create(ga.C_DBL, (10, 20, 30))
g_b = ga.create(ga.C_INT, (2, 3, 4, 5, 6))

# Only the process with id 0 reports the distributions.
if not me:
    for handle in (g_a, g_b):
        print_distribution(handle)