def TRANSPOSE1D(): # Configure array dimensions. Force an unequal data distribution. dims = [nprocs * TOTALELEMS + nprocs / 2] chunk = [TOTALELEMS] # minimum data on each process # create a global array g_a and duplicate it to get g_b g_a = ga.create(ga.C_INT, dims, "array A", chunk) if not g_a: ga.error("create failed: A") if not me: print "Created Array A" g_b = ga.duplicate(g_a, "array B") if not g_b: ga.error("duplicate failed") if not me: print "Created Array B" # initialize data in g_a if not me: print "Initializing matrix A" ga.put(g_a, np.arange(dims[0], dtype=np.int32)) # Synchronize all processors to guarantee that everyone has data # before proceeding to the next step. ga.sync() # Start initial phase of inversion by inverting the data held locally on # each processor. Start by finding out which data each processor owns. lo, hi = ga.distribution(g_a) # Get locally held data and copy it into local buffer a a = ga.get(g_a, lo, hi) # Invert data locally b = a[::-1] # Invert data globally by copying locally inverted blocks into # their inverted positions in the GA ga.put(g_b, b, dims[0] - hi[0], dims[0] - lo[0]) # Synchronize all processors to make sure inversion is complete ga.sync() # Check to see if inversion is correct if not me: verify(g_a, g_b) # Deallocate arrays ga.destroy(g_a) ga.destroy(g_b)
def TRANSPOSE1D(): # Configure array dimensions. Force an unequal data distribution. dims = [nprocs*TOTALELEMS + nprocs/2] chunk = [TOTALELEMS] # minimum data on each process # create a global array g_a and duplicate it to get g_b g_a = ga.create(ga.C_INT, dims, "array A", chunk) if not g_a: ga.error("create failed: A") if not me: print "Created Array A" g_b = ga.duplicate(g_a, "array B") if not g_b: ga.error("duplicate failed") if not me: print "Created Array B" # initialize data in g_a if not me: print "Initializing matrix A" ga.put(g_a, np.arange(dims[0], dtype=np.int32)) # Synchronize all processors to guarantee that everyone has data # before proceeding to the next step. ga.sync() # Start initial phase of inversion by inverting the data held locally on # each processor. Start by finding out which data each processor owns. lo,hi = ga.distribution(g_a) # Get locally held data and copy it into local buffer a a = ga.get(g_a, lo, hi) # Invert data locally b = a[::-1] # Invert data globally by copying locally inverted blocks into # their inverted positions in the GA ga.put(g_b, b, dims[0]-hi[0], dims[0]-lo[0]) # Synchronize all processors to make sure inversion is complete ga.sync() # Check to see if inversion is correct if not me: verify(g_a, g_b) # Deallocate arrays ga.destroy(g_a) ga.destroy(g_b)
def matrix_multiply(): # Configure array dimensions. Force an unequal data distribution. dims = [TOTALELEMS] * NDIM chunk = [TOTALELEMS / nprocs - 1] * NDIM # Create a global array g_a and duplicate it to get g_b and g_c. g_a = ga.create(ga.C_DBL, dims, "array A", chunk) if not g_a: ga.error("create failed: A") if not me: print "Created Array A" g_b = ga.duplicate(g_a, "array B") g_c = ga.duplicate(g_a, "array C") if not g_b or not g_c: ga.eror("duplicate failed") if not me: print "Created Arrays B and C" # Initialize data in matrices a and b. if not me: print "Initializing matrix A and B" a = np.random.rand(*dims) * 29 b = np.random.rand(*dims) * 37 # Copy data to global arrays g_a and g_b. if not me: ga.put(g_a, a) ga.put(g_b, b) # Synchronize all processors to make sure everyone has data. ga.sync() # Determine which block of data is locally owned. Note that # the same block is locally owned for all GAs. lo, hi = ga.distribution(g_c) # Get the blocks from g_a and g_b needed to compute this block in # g_c and copy them into the local buffers a and b. a = ga.get(g_a, (lo[0], 0), (hi[0], dims[0])) b = ga.get(g_b, (0, lo[1]), (dims[1], hi[1])) # Do local matrix multiplication and store the result in local # buffer c. Start by evaluating the transpose of b. btrns = b.transpose() # Multiply a and b to get c. c = np.dot(a, b) # Copy c back to g_c. ga.put(g_c, c, lo, hi) verify(g_a, g_b, g_c) # Deallocate arrays. ga.destroy(g_a) ga.destroy(g_b) ga.destroy(g_c)
def matrix_multiply(): # Configure array dimensions. Force an unequal data distribution. dims = [TOTALELEMS]*NDIM chunk = [TOTALELEMS/nprocs-1]*NDIM # Create a global array g_a and duplicate it to get g_b and g_c. g_a = ga.create(ga.C_DBL, dims, "array A", chunk) if not g_a: ga.error("create failed: A") if not me: print "Created Array A" g_b = ga.duplicate(g_a, "array B") g_c = ga.duplicate(g_a, "array C") if not g_b or not g_c: ga.eror("duplicate failed") if not me: print "Created Arrays B and C" # Initialize data in matrices a and b. if not me: print "Initializing matrix A and B" a = np.random.rand(*dims)*29 b = np.random.rand(*dims)*37 # Copy data to global arrays g_a and g_b. if not me: ga.put(g_a, a) ga.put(g_b, b) # Synchronize all processors to make sure everyone has data. ga.sync() # Determine which block of data is locally owned. Note that # the same block is locally owned for all GAs. lo,hi = ga.distribution(g_c) # Get the blocks from g_a and g_b needed to compute this block in # g_c and copy them into the local buffers a and b. a = ga.get(g_a, (lo[0],0), (hi[0],dims[0])) b = ga.get(g_b, (0,lo[1]), (dims[1],hi[1])) # Do local matrix multiplication and store the result in local # buffer c. Start by evaluating the transpose of b. btrns = b.transpose() # Multiply a and b to get c. c = np.dot(a,b) # Copy c back to g_c. ga.put(g_c, c, lo, hi) verify(g_a, g_b, g_c) # Deallocate arrays. ga.destroy(g_a) ga.destroy(g_b) ga.destroy(g_c)
v = np.dot(a,b) val = int(np.abs(np.sum(c-v))>0.0001) val = ga.gop_add(val) return val == 0 if __name__ == '__main__': if nproc > MULTIPLIER**3: if 0 == me: print "You must use less than %s processors" % (MULTIPLIER**3+1) else: g_a = ga.create(ga.C_DBL, [N,N]) g_b = ga.create(ga.C_DBL, [N,N]) g_c = ga.create(ga.C_DBL, [N,N]) # put some fake data into input arrays A and B if me == 0: ga.put(g_a, np.random.random(N*N)) ga.put(g_b, np.random.random(N*N)) ga.sync() if me == 0: print "srumma...", srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER) if me == 0: print "done" if me == 0: print "verifying using ga.gemm...", ok = verify_using_ga(g_a, g_b, g_c) if me == 0: if ok: print "OKAY" else: print "FAILED"
"""Use ga.access() to sum locally per SMP node.""" import mpi4py.MPI import ga import numpy as np world_id = ga.nodeid() world_nproc = ga.nnodes() node_id = ga.cluster_nodeid() node_nproc = ga.cluster_nprocs(node_id) node_me = ga.cluster_procid(node_id,ga.nodeid()) g_a = ga.create(ga.C_DBL, (3,4,5,6)) if world_id == 0: ga.put(g_a, np.arange(3*4*5*6)) ga.sync() if node_me == 0: sum = 0 for i in range(node_nproc): smp_neighbor_world_id = ga.cluster_procid(node_id,i) buffer = ga.access(g_a, proc=smp_neighbor_world_id) sum += np.sum(buffer) print sum
val = ga.gop_add(val) return val == 0 if __name__ == '__main__': if nproc > MULTIPLIER**3: if 0 == me: print "You must use less than %s processors" % (MULTIPLIER**3+1) else: g_a = ga.create(ga.C_DBL, [N,N]) g_b = ga.create(ga.C_DBL, [N,N]) g_c = ga.create(ga.C_DBL, [N,N]) g_counter = ga.create(ga.C_INT, [1]) ga.zero(g_counter) # put some fake data into input arrays A and B if me == 0: ga.put(g_a, np.random.random(N*N)) ga.put(g_b, np.random.random(N*N)) ga.sync() if me == 0: print "srumma...", srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER, g_counter) if me == 0: print "done" if me == 0: print "verifying using ga.gemm...", ok = verify_using_ga(g_a, g_b, g_c) if me == 0: if ok: print "OKAY" else: print "FAILED"
"""Use ga.access() to sum locally per SMP node.""" import mpi4py.MPI import ga import numpy as np # Okay, we create the global array g_a = ga.create(ga.C_DBL, (3, 4, 5, 6)) if world_id == 0: ga.put(g_a, np.arange(3 * 4 * 5 * 6)) ga.sync() # You're on your own!