def inv(dst, src):
    """Run one SAP cycle over both block colours, accumulating into ``dst``.

    ``dst`` is zeroed first. For each colour ``eo`` the current residual
    ``eta`` is copied into the block field ``src_blk``, ``solver[eo]`` is
    applied block-wise, and the block solution is copied back into the work
    field ``ws[0]`` and added to ``dst``. The copy plans are memoised in the
    enclosing ``cache``; ``src_blk``, ``dst_blk``, ``solver`` and ``sap`` are
    also taken from the enclosing scope.

    NOTE(review): the nesting was reconstructed from a flattened source;
    confirm that the residual update and both log messages belong at loop
    level rather than inside ``if eo == 0`` / after the loop.
    """

    def run_cached_copy(key, target, target_sel, origin, origin_sel, comm_grid):
        # Build the plan only on the first use of this key, then replay it.
        if key not in cache:
            builder = gpt.copy_plan(target, origin, embed_in_communicator=comm_grid)
            builder.destination += target.view[target_sel]
            builder.source += origin.view[origin_sel]
            cache[key] = builder()
        cache[key](target, origin)

    dst[:] = 0
    eta = gpt.copy(src)
    ws = [gpt.copy(src) for _ in range(2)]

    cache_key_base = (
        f"{dst.describe()}_{src.describe()}_{src.grid.obj}_{dst.grid.obj}"
    )

    dt_solv = dt_distr = dt_hop = 0.0
    for eo in range(2):
        ws[0][:] = 0

        # residual -> block layout
        dt_distr -= gpt.time()
        run_cached_copy(
            f"{cache_key_base}_{eo}_a", src_blk, sap.pos, eta, sap.coor[eo], eta.grid
        )
        dt_distr += gpt.time()

        # block solve
        dt_solv -= gpt.time()
        dst_blk[:] = 0  # for now
        solver[eo](dst_blk, src_blk)
        dt_solv += gpt.time()

        # block solution -> global work field
        dt_distr -= gpt.time()
        run_cached_copy(
            f"{cache_key_base}_{eo}_b", ws[0], sap.coor[eo], dst_blk, sap.pos, ws[0].grid
        )
        dt_distr += gpt.time()

        # hopping term and updates of residual / solution
        dt_hop -= gpt.time()
        if eo == 0:
            sap.op(ws[1], ws[0])
        eta -= ws[1]
        dst += ws[0]
        dt_hop += gpt.time()

        gpt.message(
            f"SAP cycle; |rho|^2 = {gpt.norm2(eta):g}; |dst|^2 = {gpt.norm2(dst):g}"
        )
        gpt.message(
            f"SAP Timings: distr {dt_distr:g} secs, blk_solver {dt_solv:g} secs, hop+update {dt_hop:g} secs"
        )
def perform(self, root):
    """Assemble the per-time-slice basis vectors into full fields and save them.

    For every ``tprime`` in ``range(T)`` the basis loaded from
    ``self.basis_fmt % (self.conf, tprime)`` is copied into ``vcj`` at the
    coordinates of the loaded field with ``tprime`` appended as an extra
    column. The result is written to ``{root}/{self.name}/basis``.

    Uses the module-level ``basis_size``, ``T`` and ``current_config``; the
    latter is (re)created when it is missing or belongs to another
    configuration file.
    """
    global basis_size, T, current_config

    # invalidate a configuration that belongs to a different file
    if current_config is not None and current_config.conf_file != self.conf_file:
        current_config = None
    if current_config is None:
        current_config = config(self.conf_file)

    c = None
    vcj = [g.vcolor(current_config.l_exact.U_grid) for jr in range(basis_size)]
    for field in vcj:
        field[:] = 0

    for tprime in range(T):
        basis_evec, basis_evals = g.load(self.basis_fmt % (self.conf, tprime))

        # one plan per time slice, shared by all basis vectors
        builder = g.copy_plan(
            vcj[0], basis_evec[0], embed_in_communicator=vcj[0].grid
        )
        c = g.coordinates(basis_evec[0])
        time_column = np.ones((len(c), 1), dtype=np.int32) * tprime
        builder.destination += vcj[0].view[np.hstack((c, time_column))]
        builder.source += basis_evec[0].view[c]
        execute = builder()

        for l in range(basis_size):
            execute(vcj[l], basis_evec[l])

    for l in range(basis_size):
        g.message("Check norm:", l, g.norm2(vcj[l]))

    g.save(f"{root}/{self.name}/basis", vcj)
def merge_indices(dst, src, st, cache=default_merge_indices_cache):
    """Copy the component lattices in ``src`` into tensor slots of ``dst``.

    ``st`` lists the tensor axes of ``dst`` to merge, followed by a callable
    returning the component object type. When that callable returns ``None``
    there is nothing to merge and ``src`` is assigned to ``dst`` directly.
    Otherwise ``src`` is indexed by index tuples and every component is
    copied into the matching slice of ``dst`` with a plan memoised in
    ``cache``.
    """
    pos = gpt.coordinates(dst)
    assert st is not None
    result_otype = st[-1]()
    if result_otype is None:
        dst @= src
        return

    ndim = dst.otype.shape[st[0]]
    rank = len(st) - 1
    tensor_slice = [slice(None, None, None)] * len(dst.otype.shape)
    cache_key = f"merge_indices_{dst.describe()}_{result_otype.__name__}_{dst.grid.obj}"

    # enumerate all index tuples; the first merged axis runs fastest
    tensor_indices = []
    components = []
    for flat in range(ndim**rank):
        remainder = flat
        digits = []
        for j in range(rank):
            remainder, digit = divmod(remainder, ndim)
            tensor_slice[st[j]] = digit
            digits.append(digit)
        components.append(src[tuple(digits)])
        tensor_indices.append(tuple(tensor_slice))

    if cache_key not in cache:
        builder = gpt.copy_plan(dst, components)
        for component, tensor_index in zip(components, tensor_indices):
            builder.destination += dst.view[(pos,) + tensor_index]
            builder.source += component.view[:]
        cache[cache_key] = builder()

    cache[cache_key](dst, components)
def promote(self, dst, src):
    """Copy ``src`` at ``self.lcoor`` into ``dst`` at ``self.gcoor``.

    The copy plan is built once per source object type name and stored in
    ``self.promote_plan``; later calls with the same type replay it.
    """
    plans = self.promote_plan
    key = src.otype.__name__
    if key not in plans:
        builder = gpt.copy_plan(dst, src, embed_in_communicator=dst.grid)
        builder.destination += dst.view[self.gcoor]
        builder.source += src.view[self.lcoor]
        plans[key] = builder()
    plans[key](dst, src)
def __setitem__(self, key, value):
    """Assign ``value`` to the sites / tensor entries selected by ``key``.

    A full-slice key with a numeric value takes a fast path that sets every
    underlying object to that number. Every other case maps the key to
    positions and tensor indices, converts ``value`` to a buffer of the
    required size (cyclically repeated if shorter) and executes a copy plan.
    Plans are memoised either in a cache packed into ``key`` (under ``"set"``)
    or, for full-slice keys, in ``lattice.cache``.
    """
    # a cache object may be packed into the key
    cache, key = unpack_cache_key(key)
    cache_key = "set" if cache is not None else None

    # fast path for whole-lattice assignment; the type test comes first so
    # that only genuine slice objects are compared
    if type(key) is slice and key == slice(None, None, None):
        if gpt.util.is_num(value):
            for obj in self.v_obj:
                cgpt.lattice_set_to_number(obj, value)
            return

        cache_key = (
            f"{self.otype.__name__}_{self.checkerboard().__name__}_{self.grid.describe()}"
        )
        cache = lattice.cache

    # general path: resolve the key
    pos, tidx, shape = gpt.map_key(self, key)
    n_pos = len(pos)

    # bring the input into a flat buffer of the lattice's complex dtype
    value = gpt.util.tensor_to_value(
        value, dtype=self.grid.precision.complex_dtype
    )
    if value is None:
        value = memoryview(bytearray())

    # bytes required on this rank; shorter inputs are repeated cyclically
    nbytes_needed = n_pos * numpy.prod(shape) * self.grid.precision.nbytes * 2
    value = cgpt.copy_cyclic_upscale(value, nbytes_needed)

    if cache_key is not None and cache_key in cache:
        xp = cache[cache_key]
    else:
        builder = gpt.copy_plan(self, value)
        builder.destination += gpt.lattice_view(self, pos, tidx)
        if value.nbytes > 0:
            blocks = [[self.grid.processor, value, 0, value.nbytes]]
        else:
            blocks = None
        builder.source += gpt.global_memory_view(self.grid, blocks)

        # a plan that is used only once is not worth optimizing
        xp = builder(
            local_only=isinstance(pos, gpt.core.local_coordinates),
            skip_optimize=cache_key is None,
        )
        if cache_key is not None:
            cache[cache_key] = xp

    xp(self, value)
def bit_flipped_lattice(self, i):
    """Return a new lattice filled from ``self.lattice`` with bit ``i`` flipped.

    The sites ``self.bit_map.not_coordinates[i]`` of ``self.lattice`` are
    copied to the sites ``self.bit_map.coordinates`` of a freshly allocated
    lattice. The copy plan is memoised per ``i`` in ``self.bit_flipped_plan``.
    """
    target_sites = self.bit_map.coordinates
    source_sites = self.bit_map.not_coordinates[i]
    flipped = g.lattice(self.lattice)

    plans = self.bit_flipped_plan
    if i not in plans:
        builder = g.copy_plan(flipped, self.lattice)
        builder.destination += flipped.view[target_sites]
        builder.source += self.lattice.view[source_sites]
        plans[i] = builder()

    plans[i](flipped, self.lattice)
    return flipped
def bit_flipped_lattice(self, i):
    """Return a new lattice filled from ``self.lattice`` with bit ``i`` flipped.

    The bit index is first mapped through ``self.bit_permutation``; the sites
    ``self.bit_map.not_coordinates[self.bit_permutation[i]]`` of
    ``self.lattice`` are copied to ``self.bit_map.coordinates`` of a freshly
    allocated lattice. The copy plan is memoised in ``self.bit_flipped_plan``
    under the unpermuted index ``i``.
    """
    target_sites = self.bit_map.coordinates
    source_sites = self.bit_map.not_coordinates[self.bit_permutation[i]]
    flipped = g.lattice(self.lattice)

    plans = self.bit_flipped_plan
    if i not in plans:
        builder = g.copy_plan(flipped, self.lattice)
        builder.destination += flipped.view[target_sites]
        builder.source += self.lattice.view[source_sites]
        plans[i] = builder()

    # TODO: it is odd that this maxes out at 22 GB/s ; focus on bandwidth
    # benchmark first, why 500GB/s for prop and only 5 for singlet?
    # (g.message(self.bit_flipped_plan[i].info()) prints the plan details)
    plans[i](flipped, self.lattice)
    return flipped
def block_extract(self, u2, U, idx):
    """Copy the 2x2 sub-block of ``U`` selected by ``idx`` into ``u2``.

    ``idx`` holds the two row/column indices of ``U`` to extract; entry
    ``(i, j)`` of ``u2`` receives entry ``(idx[i], idx[j])`` of ``U`` on all
    sites. ``u2`` must have ``Nc == 2`` and ``Ndim == 2``. The plan is
    memoised in the class-level cache under ``self.Nc`` and ``idx``.

    NOTE(review): the cache key does not mention the grid; presumably one
    plan is valid for every grid this is used with -- confirm.
    """
    assert u2.otype.Nc == 2 and u2.otype.Ndim == 2
    idx = list(idx)

    plans = ot_matrix_su_n_fundamental_group.cache
    key = f"{self.Nc}_{idx}"
    if key not in plans:
        # full slice in every grid dimension
        all_sites = (slice(None, None, None),) * u2.grid.nd
        builder = gpt.copy_plan(u2, U)
        for row in range(2):
            for col in range(2):
                builder.destination += u2.view[all_sites + (row, col)]
                builder.source += U.view[all_sites + (idx[row], idx[col])]
        plans[key] = builder()

    plans[key](u2, U)
def mk_qlat_gpt_copy_plan(ctype, total_site, multiplicity, tag):
    """Build a local copy plan between a qlat field buffer and a gpt field.

    Template fields are created from ``ctype``, ``total_site`` and
    ``multiplicity``. ``tag`` selects the direction:

    * ``"qlat_from_gpt"`` -- destination is the qlat memory view, source the
      gpt field;
    * ``"gpt_from_qlat"`` -- destination is the gpt field, source the qlat
      memory view.

    Any other tag is reported through ``q.displayln_info`` and raises
    ``Exception("mk_qlat_gpt_copy_plan")``. The returned plan is created with
    ``local_only=True``.
    """
    geo = q.Geometry(total_site, multiplicity)
    f_gpt = mk_gpt_field(ctype, geo)
    f_qlat = q.Field(ctype, geo)
    coords = g.coordinates(f_gpt)
    buf = f_qlat.mview()

    if tag not in ("qlat_from_gpt", "gpt_from_qlat"):
        q.displayln_info(tag)
        raise Exception("mk_qlat_gpt_copy_plan")

    def buffer_view():
        # whole qlat buffer, owned by this rank
        return g.global_memory_view(
            f_gpt.grid, [[f_gpt.grid.processor, buf, 0, buf.nbytes]]
        )

    if tag == "qlat_from_gpt":
        builder = g.copy_plan(buf, f_gpt)
        builder.destination += buffer_view()
        builder.source += f_gpt.view[coords]
    else:
        builder = g.copy_plan(f_gpt, buf)
        builder.source += buffer_view()
        builder.destination += f_gpt.view[coords]

    return builder(local_only=True)
def __call__(self):
    """Compile the registered displacements into a ``cshift_executer``.

    For every source ``i`` and every displacement ``x`` registered for it,
    the destination ``self.destinations[self.indices[i][x]]`` is filled from
    the source at coordinates shifted by ``x`` (via
    ``cgpt.coordinates_shift`` with the source grid's ``fdimensions``). One
    buffer description ``(grid, otype, checkerboard)`` is recorded per
    displacement.
    """
    builder = g.copy_plan(self.destinations, self.sources)
    buffer_descriptions = []

    for i, src in enumerate(self.sources):
        description = (src.grid, src.otype, src.checkerboard())
        coordinates = g.coordinates(src)
        extent = src.grid.fdimensions
        targets = self.indices[i]
        for x in self.displacements[i]:
            buffer_descriptions.append(description)
            builder.destination += self.destinations[targets[x]].view[:]
            builder.source += src.view[
                cgpt.coordinates_shift(coordinates, x, extent)
            ]

    return cshift_executer(buffer_descriptions, builder())
def assign_pos_view():
    """Copy ``l_dp`` into ``lhs`` at ``pos`` and check the plan is optimal.

    ``lhs``, ``l_dp`` and ``pos`` come from the enclosing scope. Before the
    plan is executed its ``info()`` is inspected: every entry must be a
    rank-local transfer on this processor that consists of a single block.
    """
    builder = g.copy_plan(lhs, l_dp)
    builder.destination += lhs.view[pos]
    builder.source += l_dp.view[pos]
    execute = builder()

    info = execute.info()
    for rank_pair in info:
        rank_dst, rank_src = rank_pair
        assert rank_dst == rank_src
        assert rank_dst == lhs.grid.processor
        per_rank = info[(rank_dst, rank_src)]
        for index in per_rank:
            # Make sure that after optimization only a single memcpy is needed
            assert per_rank[index]["blocks"] == 1

    execute(lhs, l_dp)
def unsplit(first, second, cache=None, group_policy=split_group_policy.separate):
    """Copy split lattices ``second`` back into the unsplit lattices ``first``.

    Parameters:
        first: destination lattice or list of ``n`` destination lattices.
        second: source lattice or list of ``N`` split lattices; their
            ``split_lcoor`` / ``split_gcoor`` attributes give the local and
            global coordinates of the transfer. ``n`` must be a multiple of
            ``N``.
        cache: optional dict in which the copy plan is memoised; without it
            the plan is rebuilt on every call.
        group_policy: with ``split_group_policy.separate`` and ``N != 1`` the
            ``N`` groups are processed one after another to save memory.

    A single (non-list) ``first`` is wrapped together with ``second`` into
    one-element lists. ``cache`` and ``group_policy`` are forwarded in that
    case as well, so a caller-supplied cache is honoured instead of being
    silently dropped.
    """
    if not isinstance(first, list):
        return unsplit([first], [second], cache, group_policy)

    n = len(first)
    N = len(second)
    Q = n // N
    assert n % N == 0

    # Save memory by performing each group separately
    if N != 1 and group_policy == split_group_policy.separate:
        for i in range(N):
            unsplit(
                [first[q * N + i] for q in range(Q)],
                [second[i]],
                cache,
                group_policy,
            )
        return

    split_grid = second[0].grid
    sranks = split_grid.sranks
    srank = split_grid.srank

    lcoor = second[0].split_lcoor
    gcoor = second[0].split_gcoor

    src_data = second
    dst_data = first

    if cache is None:
        cache = {}

    cache_key = f"unsplit_plan_{first[0].grid.obj}_{second[0].grid.obj}_{first[0].otype.__name__}_{second[0].otype.__name__}_{n}_{N}"
    if cache_key not in cache:
        plan = gpt.copy_plan(dst_data, src_data, embed_in_communicator=first[0].grid)
        # index of the group of N destinations served by this split rank
        i = srank // (sranks // Q)
        for x in first[i * N:(i + 1) * N]:
            plan.destination += x.view[gcoor]
        for x in second:
            plan.source += x.view[lcoor]
        cache[cache_key] = plan()

    cache[cache_key](dst_data, src_data)
def separate_indices(x, st, cache=default_merge_indices_cache):
    """Split tensor axes of ``x`` into a dict of component lattices.

    ``st`` lists the tensor axes of ``x`` to separate, followed by a callable
    returning the object type of the components. When that callable returns
    ``None`` nothing is separated and ``x`` itself is returned. Otherwise the
    result maps every index tuple to a new lattice on ``x.grid`` with the
    checkerboard of ``x``, filled through a copy plan memoised in ``cache``.
    """
    pos = gpt.coordinates(x)
    cb = x.checkerboard()
    assert st is not None
    result_otype = st[-1]()
    if result_otype is None:
        return x

    ndim = x.otype.shape[st[0]]
    rank = len(st) - 1
    tensor_slice = [slice(None, None, None)] * len(x.otype.shape)

    # enumerate all index tuples; the first separated axis runs fastest
    keys = []
    tensor_indices = []
    for flat in range(ndim**rank):
        remainder = flat
        digits = []
        for j in range(rank):
            remainder, digit = divmod(remainder, ndim)
            tensor_slice[st[j]] = digit
            digits.append(digit)
        keys.append(tuple(digits))
        tensor_indices.append(tuple(tensor_slice))

    # one target lattice per index tuple
    result = {}
    dst = []
    for key in keys:
        component = gpt.lattice(x.grid, result_otype)
        component.checkerboard(cb)
        result[key] = component
        dst.append(component)

    cache_key = f"separate_indices_{cb.__name__}_{result_otype.__name__}_{x.otype.__name__}_{x.grid.describe()}_{x.grid.obj}"
    if cache_key not in cache:
        builder = gpt.copy_plan(dst, x)
        for key, tensor_index in zip(keys, tensor_indices):
            builder.destination += result[key].view[pos]
            builder.source += x.view[(pos,) + tensor_index]
        cache[cache_key] = builder()

    cache[cache_key](dst, x)
    return result
def __getitem__(self, key):
    """Read the sites / tensor entries selected by ``key`` into an array.

    The key is mapped to positions and tensor indices, a target array of
    shape ``(n_pos, *shape)`` is allocated and filled through a copy plan.
    When a cache is packed into ``key`` the plan is stored there under
    ``"get"``. A single returned element with the full tensor shape is
    wrapped into a tensor of this lattice's object type.
    """
    # a cache object may be packed into the key
    cache, key = unpack_cache_key(key)
    use_cache = cache is not None

    # resolve the key
    pos, tidx, shape = gpt.map_key(self, key)
    n_pos = len(pos)

    # target buffer
    value = cgpt.ndarray((n_pos, *shape), self.grid.precision.complex_dtype)

    if use_cache and "get" in cache:
        xp = cache["get"]
    else:
        builder = gpt.copy_plan(value, self)
        if value.nbytes > 0:
            blocks = [[self.grid.processor, value, 0, value.nbytes]]
        else:
            blocks = None
        builder.destination += gpt.global_memory_view(self.grid, blocks)
        builder.source += gpt.lattice_view(self, pos, tidx)
        xp = builder()
        if use_cache:
            cache["get"] = xp

    xp(value, self)

    # if only a single element is returned and we have the full shape,
    # wrap in a tensor
    if len(value) == 1 and shape == self.otype.shape:
        return gpt.util.value_to_tensor(value[0], self.otype)

    return value
def perform(self, root):
    """Contract half-perambulators with the basis and write the correlators.

    The basis is loaded from ``{root}/{self.conf}/pm_basis/basis``. For every
    batch of ``sloppy_per_job`` source indices the stored propagators are
    loaded, one spin component (``spin_prime``) is copied into a colour
    vector, projected onto every basis vector, sliced along dimension 3 and
    written to ``{root}/{self.name}/head.dat``.

    Uses the module-level ``basis_size``, ``sloppy_per_job``, ``T`` and
    ``current_config``; the latter is (re)created when it is missing or
    belongs to another configuration file.

    NOTE(review): loop nesting was reconstructed from a flattened source;
    confirm the placement of ``g.mem_report`` and of the work-field setup.
    """
    global basis_size, sloppy_per_job, T, current_config

    # invalidate a configuration that belongs to a different file
    if current_config is not None and current_config.conf_file != self.conf_file:
        current_config = None
    if current_config is None:
        current_config = config(self.conf_file)

    output_correlator = g.corr_io.writer(f"{root}/{self.name}/head.dat")

    vcj = g.load(f"{root}/{self.conf}/pm_basis/basis")

    for i0 in range(0, basis_size, sloppy_per_job):
        # gather all propagators of this batch into one dict
        half_peramb = {}
        for chunk in g.load(
            f"{root}/{self.conf}/pm_{self.solver}_t{self.t}_i{i0}/propagators"
        ):
            for tag in chunk:
                half_peramb[tag] = chunk[tag]

        g.mem_report(details=False)

        vc = g.vcolor(vcj[0].grid)
        c = g.coordinates(vc)
        prec = {"sloppy": 0, "exact": 1}[self.solver]

        for spin_prime in range(4):
            # the plan depends on spin_prime only, so rebuild it once per value
            plan = None

            for spin in range(4):
                for i in range(i0, i0 + sloppy_per_job):
                    hp = half_peramb[f"t{self.t}s{spin}c{i}_{self.solver}"]

                    if plan is None:
                        builder = g.copy_plan(vc, hp)
                        builder.destination += vc.view[c]
                        builder.source += hp.view[c, spin_prime, :]
                        plan = builder()

                    plan(vc, hp)

                    t0 = g.time()
                    slc_j = [g(g.adj(vcj[j]) * vc) for j in range(basis_size)]
                    t1 = g.time()
                    slc = g.slice(slc_j, 3)
                    t2 = g.time()

                    for j in range(basis_size):
                        output_correlator.write(
                            f"output/peramb_prec{prec}/n_{j}_{i}_s_{spin_prime}_{spin}_t_{self.t}",
                            slc[j],
                        )

                    t3 = g.time()
                    if i % 50 == 0:
                        g.message(
                            spin_prime, spin, i, "Timing", t1 - t0, t2 - t1, t3 - t2
                        )

    output_correlator.close()
def separate(lattices, dimension=-1):
    """Cut lattices into slices along ``dimension``.

    ``lattices`` may be a single lattice / expression or a list of them; all
    must share grid, checkerboard and object type. The result is a flat list
    of ``N * batches`` lattices on the grid with ``dimension`` removed, where
    ``N`` is the full extent of that dimension; slice ``i`` of input ``j`` is
    stored at index ``j * N + i``. Negative ``dimension`` counts from the end.
    """
    # always work on a list of evaluated lattices
    if type(lattices) is not list:
        lattices = [lattices]
    lattices = [gpt.eval(x) for x in lattices]

    batches = len(lattices)
    assert batches > 0
    reference = lattices[0]
    others = lattices[1:]

    # all inputs must live on the same grid
    grid = reference.grid
    assert all(other.grid.obj == grid.obj for other in others)

    # allow negative indexing
    if dimension < 0:
        dimension += grid.nd
        assert dimension >= 0
    else:
        assert dimension < grid.nd

    # number of slices per input and in total
    N = grid.fdimensions[dimension]
    n = N * batches

    # all inputs must share checkerboard and object type
    cb = reference.checkerboard()
    assert all(other.checkerboard() is cb for other in others)
    otype = reference.otype
    assert all(other.otype.__name__ == otype.__name__ for other in others)

    # grid without the separated dimension
    separated_grid = grid.removed_dimension(dimension)
    cb_mask = grid.cb.cb_mask[dimension]

    # allocate the slices; if the removed dimension takes part in the
    # checkerboarding, odd slices carry the inverted checkerboard
    separated_lattices = [gpt.lattice(separated_grid, otype) for _ in range(n)]
    for index, slice_lattice in enumerate(separated_lattices):
        slice_index = index % N
        if cb_mask == 0 or slice_index % 2 == 0:
            slice_lattice.checkerboard(cb)
        else:
            slice_lattice.checkerboard(cb.inv())

    # coordinates for even and odd slices
    separated_gcoor_zero = gpt.coordinates(separated_lattices[0])
    if N > 1 and cb_mask == 1:
        separated_gcoor_one = gpt.coordinates(separated_lattices[1])
    else:
        separated_gcoor_one = separated_gcoor_zero
    separated_gcoor = [separated_gcoor_zero, separated_gcoor_one]

    # move data: one plan per slice, replayed for every input
    for i in range(N):
        slice_coor = separated_gcoor[i % 2]
        gcoor = cgpt.coordinates_inserted_dimension(slice_coor, dimension, [i])
        builder = gpt.copy_plan(
            separated_lattices[i], reference, embed_in_communicator=reference.grid
        )
        builder.destination += separated_lattices[i].view[slice_coor]
        builder.source += reference.view[gcoor]
        execute = builder()
        for j in range(batches):
            execute(separated_lattices[j * N + i], lattices[j])

    return separated_lattices
def load(filename, params):
    """Load a compressed eigenvector set (basis, coarse vectors, eigenvalues).

    Parameters:
        filename: directory holding ``metadata.txt``, ``eigen-values.txt`` and
            the per-rank ``*.compressed`` files.
        params: mapping that must provide ``"grids"`` (a single-precision
            ``gpt.grid`` for the fine lattice), ``"nmax"``, ``"advise_basis"``
            and ``"advise_cevec"`` (each may be ``None``); the optional
            ``"max_read_blocks"`` (default 8) bounds how many block groups
            are read per view.

    Returns:
        ``(basis, cevec, ev)``: fine-grid basis vectors, coarse-grid vectors
        and the list of eigenvalues.

    Raises:
        NotImplementedError: when ``filename`` does not contain the files
            expected for this format.

    The progress report of the coarse-vector loop is emitted roughly ten
    times; the interval is clamped to at least one so that fewer than ten
    coarse vectors no longer trigger a division by zero in verbose mode.
    """
    # first check if this is right file format
    if not os.path.exists(
        filename + "/00/0000000000.compressed"
    ) or not os.path.exists(filename + "/metadata.txt"):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert params["grids"] is not None
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata; the stored vectors keep the 5th dimension last, it is
    # moved to the front here
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    nsingleCap = min([nsingle, nbasis])

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if params["nmax"] is not None:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis) for i in range(neigen_max)]
    if params["advise_basis"] is not None:
        gpt.advise(basis, params["advise_basis"])
    if params["advise_cevec"] is not None:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing; the tiny offsets keep the rate computations below finite
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # progress interval for the coarse-vector loop, never zero
    report_every = max(1, neigen // 10)

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" % (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(
            v if v is not None else -1, mpi, fdimensions, fgrid.cb, site_cb
        )
        cvc = gpt.cartesian_view(
            v if v is not None else -1, mpi, cgrid.fdimensions, gpt.full, gpt.none
        )
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2, 24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (
            FP_16_SIZE(nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS)
        )
        coarse_vector_size = (
            coarse_block_size_part_fp32 + coarse_block_size_part_fp16
        ) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum
        crc32_comp = 0

        # file
        # NOTE(review): the handle is never closed explicitly -- confirm that
        # gpt.FILE releases it on garbage collection.
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb, "canonicalOdd")
            for b in range(blocks)
        ]

        # group blocks: merge neighbouring blocks pairwise until at most
        # max_read_blocks groups remain (or the count becomes odd)
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2
        # NOTE(review): unconditional message, placed after the grouping loop
        # when reconstructing the flattened source -- confirm.
        gpt.message("Read blocks", blocks)

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single * nsingleCap))
        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                # data: lattice0_posA lattice1_posA .... lattice0_posB lattice1_posB
                cgpt.munge_inner_outer(data_munged, data, nsingleCap, block_reduce)
                # data_munged: lattice0 lattice1 lattice2 ...
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            # the plan is built for the first vector and replayed for the others
            rhs = data_munged[0:block_data_size_single]
            distribute_plan = gpt.copy_plan(basis[0], rhs)
            distribute_plan.destination += basis[0].view[pos[b]]
            distribute_plan.source += gpt.global_memory_view(
                fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]]
            )
            rhs = None
            distribute_plan = distribute_plan()
            for i in range(nsingleCap_max):
                distribute_plan(
                    basis[i],
                    data_munged[
                        block_data_size_single * i:block_data_size_single * (i + 1)
                    ],
                )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    )
                )

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(
                        f.read(block_data_size_fp16 * (nbasis - nsingleCap))
                    )
                    globalReadGB = len(data) / 1024.0**3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_munged,
                        data_fp32,
                        nbasis - nsingleCap,
                        block_reduce,
                    )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                if nsingleCap < nbasis_max:
                    rhs = data_munged[0:block_data_size_single]
                    distribute_plan = gpt.copy_plan(basis[0], rhs)
                    distribute_plan.destination += basis[0].view[pos[b]]
                    distribute_plan.source += gpt.global_memory_view(
                        fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]]
                    )
                    rhs = None
                    distribute_plan = distribute_plan()
                    for i in range(nsingleCap, nbasis_max):
                        j = i - nsingleCap
                        distribute_plan(
                            basis[i],
                            data_munged[
                                block_data_size_single * j:block_data_size_single * (j + 1)
                            ],
                        )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        )
                    )

        # coarse grid data; every stored vector is read (to keep the file
        # position and checksum right) but only the first neigen_max are kept
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        distribute_plan = None
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            if j < neigen_max:
                if distribute_plan is None:
                    distribute_plan = gpt.copy_plan(cevec[j], data)
                    distribute_plan.destination += cevec[j].view[pos_coarse]
                    distribute_plan.source += gpt.global_memory_view(
                        cgrid, [[cgrid.processor, data, 0, data.nbytes]]
                    )
                    distribute_plan = distribute_plan()
                distribute_plan(cevec[j], data)
            dt_distr += gpt.time()

            if verbose and j % report_every == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    )
                )

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message(
            "* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB / (t1 - t0))
        )

    # eigenvalues: first non-empty line is the count, the rest are the values
    with open(filename + "/eigen-values.txt") as fev:
        evln = [line for line in fev.read().split("\n") if line != ""]
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
def save(filename, objs, params):
    """Write a compressed eigenvector set (basis, coarse vectors, eigenvalues).

    Parameters:
        filename: output directory; created by rank 0 if necessary.
        objs: sequence ``(basis, cevec, ev)`` of fine-grid basis vectors,
            coarse-grid vectors and eigenvalues. ``basis`` and ``cevec`` must
            be non-empty, the fine grid must be single precision and the
            basis must live on the odd checkerboard.
        params: mapping that must provide ``"mpi"`` (``None`` selects the
            fine grid's own layout); the optional ``"nsingle"`` gives the
            number of basis vectors kept in single precision (default: all)
            and the optional ``"max_read_blocks"`` (default 8) bounds how
            many block groups are written per view.

    Rank 0 writes ``eigen-values.txt`` and ``metadata.txt``; the checksums of
    all views are appended to the metadata at the end.

    The progress report of the coarse-vector loop is emitted roughly ten
    times; the interval is clamped to at least one so that fewer than ten
    coarse vectors no longer trigger a division by zero in verbose mode.
    """
    # split data to save
    assert len(objs) == 3
    basis = objs[0]
    cevec = objs[1]
    ev = objs[2]

    # verbosity
    verbose = gpt.default.is_verbose("io")
    if verbose:
        gpt.message(
            "Saving %d basis vectors, %d coarse-grid vectors, %d eigenvalues to %s"
            % (len(basis), len(cevec), len(ev), filename)
        )

    # create directory
    if gpt.rank() == 0:
        os.makedirs(filename, exist_ok=True)

    # now sync since only root has created directory
    gpt.barrier()

    # write eigenvalues
    if gpt.rank() == 0:
        with open("%s/eigen-values.txt" % filename, "wt") as fev:
            fev.write("%d\n" % len(ev))
            for value in ev:
                fev.write("%.15E\n" % value)

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # grids
    assert len(basis) > 0
    assert len(cevec) > 0
    fgrid = basis[0].grid
    cgrid = cevec[0].grid

    # mpi layout
    if params["mpi"] is not None:
        mpi = params["mpi"]
    else:
        mpi = fgrid.mpi
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # params
    assert basis[0].checkerboard() == site_cb
    nd = 5
    assert len(fgrid.ldimensions) == nd
    fdimensions = fgrid.fdimensions
    ldimensions = [conformDiv(fdimensions[i], mpi[i]) for i in range(nd)]
    assert fgrid.precision == gpt.single
    s = ldimensions
    b = [
        conformDiv(fgrid.fdimensions[i], cgrid.fdimensions[i]) for i in range(nd)
    ]
    nb = [conformDiv(s[i], b[i]) for i in range(nd)]
    neigen = len(cevec)
    nbasis = len(basis)
    if "nsingle" in params:
        nsingle = params["nsingle"]
        assert nsingle <= nbasis
    else:
        nsingle = nbasis
    nsingleCap = min([nsingle, nbasis])
    blocks = numpy.prod(nb)
    FP16_COEF_EXP_SHARE_FLOATS = 10

    # write metadata; the vectors are stored with the first dimension last
    if gpt.rank() == 0:
        # kept open on purpose: the checksums are appended after all views
        fmeta = open("%s/metadata.txt" % filename, "wt")
        for i in range(nd):
            fmeta.write("s[%d] = %d\n" % (i, s[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("b[%d] = %d\n" % (i, b[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("nb[%d] = %d\n" % (i, nb[(i + 1) % nd]))
        fmeta.write("neig = %d\n" % neigen)
        fmeta.write("nkeep = %d\n" % nbasis)
        fmeta.write("nkeep_single = %d\n" % nsingle)
        fmeta.write("blocks = %d\n" % blocks)
        fmeta.write(
            "FP16_COEF_EXP_SHARE_FLOATS = %d\n" % FP16_COEF_EXP_SHARE_FLOATS
        )
        fmeta.flush()  # write crc32 later

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)
    crc32 = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)

    # timing; the tiny offsets keep the rate computations below finite
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fwrite = 1e-30
    t0 = gpt.time()

    # progress interval for the coarse-vector loop, never zero
    report_every = max(1, neigen // 10)

    # load all views
    if verbose:
        gpt.message("Saving %s with %d views per node" % (filename, len(views)))

    for i, v in enumerate(views):
        cv = gpt.cartesian_view(
            v if v is not None else -1, mpi, fdimensions, fgrid.cb, site_cb
        )
        cvc = gpt.cartesian_view(
            v if v is not None else -1, mpi, cgrid.fdimensions, gpt.full, gpt.none
        )
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)
        if fn is not None:
            os.makedirs(dn, exist_ok=True)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2, 24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (
            FP_16_SIZE(nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS)
        )
        coarse_vector_size = (
            coarse_block_size_part_fp32 + coarse_block_size_part_fp16
        ) * blocks
        totalSize = (
            blocks
            * (
                block_data_size_single * nsingleCap
                + block_data_size_fp16 * (nbasis - nsingleCap)
            )
            + neigen * coarse_vector_size
        )
        totalSizeGB += totalSize / 1024.0**3.0 if v is not None else 0.0

        # checksum
        crc32_comp = 0

        # file
        # NOTE(review): the handle is never closed explicitly -- confirm that
        # gpt.FILE flushes and releases it on garbage collection.
        f = gpt.FILE(fn, "wb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb, "canonicalOdd")
            for b in range(blocks)
        ]

        # group blocks: merge neighbouring blocks pairwise until at most
        # max_read_blocks groups remain (or the count becomes odd)
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # single-precision data
        data = memoryview(bytearray(block_data_size_single * nsingleCap))
        data_munged = memoryview(bytearray(block_data_size_single * nsingleCap))

        for b in range(read_blocks):
            fgrid.barrier()
            dt_distr -= gpt.time()
            # the plan is built for the first vector and replayed for the others
            lhs_size = basis[0].otype.nfloats * 4 * len(pos[b])
            lhs = data_munged[0:lhs_size]
            distribute_plan = gpt.copy_plan(lhs, basis[0])
            distribute_plan.destination += gpt.global_memory_view(
                fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]]
            )
            distribute_plan.source += basis[0].view[pos[b]]
            distribute_plan = distribute_plan()
            lhs = None
            for i in range(nsingleCap):
                distribute_plan(
                    data_munged[
                        block_data_size_single * i:block_data_size_single * (i + 1)
                    ],
                    basis[i],
                )
            dt_distr += gpt.time()

            if f is not None:
                dt_munge -= gpt.time()
                cgpt.munge_inner_outer(
                    data,
                    data_munged,
                    block_reduce,
                    nsingleCap,
                )
                dt_munge += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                    )
                )

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data = memoryview(
                bytearray(block_data_size_fp16 * (nbasis - nsingleCap))
            )
            for b in range(read_blocks):
                fgrid.barrier()
                dt_distr -= gpt.time()
                lhs_size = basis[0].otype.nfloats * 4 * len(pos[b])
                lhs = data_munged[0:lhs_size]
                distribute_plan = gpt.copy_plan(lhs, basis[0])
                distribute_plan.destination += gpt.global_memory_view(
                    fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]]
                )
                distribute_plan.source += basis[0].view[pos[b]]
                distribute_plan = distribute_plan()
                lhs = None
                for i in range(nsingleCap, nbasis):
                    j = i - nsingleCap
                    distribute_plan(
                        data_munged[
                            j * block_data_size_single:(j + 1) * block_data_size_single
                        ],
                        basis[i],
                    )
                dt_distr += gpt.time()

                if f is not None:
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_fp32,
                        data_munged,
                        block_reduce,
                        nbasis - nsingleCap,
                    )
                    dt_munge += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp32_to_fp16(data, data_fp32, 24)
                    dt_fp16 += gpt.time()
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()

                fgrid.barrier()
                dt_fwrite -= gpt.time()
                if f is not None:
                    f.write(data)
                    globalWriteGB = len(data) / 1024.0**3.0
                else:
                    globalWriteGB = 0.0
                globalWriteGB = fgrid.globalsum(globalWriteGB)
                dt_fwrite += gpt.time()
                totalSizeGB += globalWriteGB

                if verbose:
                    gpt.message(
                        "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fwrite,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                        )
                    )

        # coarse grid data; one plan serves all coarse vectors
        data = memoryview(bytearray(coarse_vector_size))
        data_fp32 = memoryview(
            bytearray(cevec[0].otype.nfloats * 4 * len(pos_coarse))
        )
        distribute_plan = gpt.copy_plan(data_fp32, cevec[0])
        distribute_plan.destination += gpt.global_memory_view(
            cgrid, [[cgrid.processor, data_fp32, 0, data_fp32.nbytes]]
        )
        distribute_plan.source += cevec[0].view[pos_coarse]
        distribute_plan = distribute_plan()
        for j in range(neigen):
            fgrid.barrier()
            dt_distr -= gpt.time()
            distribute_plan(data_fp32, cevec[j])
            dt_distr += gpt.time()

            if f is not None:
                dt_fp16 -= gpt.time()
                cgpt.fp32_to_mixed_fp32fp16(
                    data,
                    data_fp32,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose and j % report_every == 0:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                    )
                )

        # save crc
        crc32[cv.rank] = crc32_comp

    # synchronize crc32
    fgrid.globalsum(crc32)

    # timing
    t1 = gpt.time()

    # write crc to metadata
    if gpt.rank() == 0:
        for i in range(len(crc32)):
            fmeta.write("crc32[%d] = %X\n" % (i, crc32[i]))
        fmeta.close()

    # verbosity
    if verbose:
        gpt.message(
            "* save %g GB at %g GB/s" % (totalSizeGB, totalSizeGB / (t1 - t0))
        )
def split_lattices(lattices, lcoor, gcoor, split_grid, N, cache, group_policy):
    """Copy ``lattices`` onto freshly created lattices living on ``split_grid``.

    Layout example (``n = len(lattices)``, ``Q = n // N = 2``)::

        original:  lattice1,...,latticen | lattice1,...,latticen
        new:       lattice1,...,latticeN | latticeN+1,...,lattice2N

    ``N`` is the desired number of parallel split lattices per unsplit
    lattice; the intended range is ``1 <= N <= sranks`` with
    ``sranks % N == 0``.

    Parameters:
        lattices: non-empty list of source lattices; all must share the same
            grid object, checkerboard and otype name, and ``len(lattices)``
            must be a multiple of ``N``.
        lcoor: coordinates used to index the views of the new (destination)
            lattices.
        gcoor: coordinates used to index the views of the source lattices;
            must have the same length as ``lcoor``.
        split_grid: grid on which the new lattices are created; its
            ``srank`` / ``sranks`` attributes select which group of ``N``
            source lattices is copied.
        N: number of new lattices to create.
        cache: dict in which the compiled copy plan is stored and looked up;
            ``None`` means a throw-away dict is used for this call only.
        group_policy: when equal to ``split_group_policy.separate`` (and
            ``N != 1``) each of the ``N`` strided sub-lists is processed by
            its own recursive call with ``N = 1``.

    Returns:
        A list of ``N`` new lattices, each carrying ``split_lcoor`` and
        ``split_gcoor`` attributes set to ``lcoor`` and ``gcoor``.
    """
    n = len(lattices)
    assert n > 0
    assert n % N == 0
    n_groups = n // N

    # Save memory by handling one strided sub-list at a time.
    if N != 1 and group_policy == split_group_policy.separate:
        pieces = []
        for offset in range(N):
            strided = [lattices[q * N + offset] for q in range(n_groups)]
            pieces.extend(
                split_lattices(
                    strided, lcoor, gcoor, split_grid, 1, cache, group_policy
                )
            )
        return pieces

    assert len(lcoor) == len(gcoor)

    # All inputs have to agree with the first one.
    first = lattices[0]
    others = lattices[1:]
    grid = first.grid
    assert all([x.grid.obj == grid.obj for x in others])
    cb = first.checkerboard()
    assert all([x.checkerboard() is cb for x in others])
    otype = first.otype
    assert all([x.otype.__name__ == otype.__name__ for x in others])

    # Destination lattices, tagged with the coordinates used for the split.
    l = [gpt.lattice(split_grid, otype) for _ in range(N)]
    for dst in l:
        dst.checkerboard(cb)
        dst.split_lcoor = lcoor
        dst.split_gcoor = gcoor

    sranks = split_grid.sranks
    srank = split_grid.srank

    if cache is None:
        cache = {}

    # NOTE(review): the key does not include lcoor/gcoor, so a shared cache
    # presumably relies on callers using consistent coordinates -- confirm.
    cache_key = f"split_plan_{lattices[0].grid.obj}_{l[0].grid.obj}_{lattices[0].otype.__name__}_{l[0].otype.__name__}_{n}_{N}"
    if cache_key not in cache:
        builder = gpt.copy_plan(l, lattices, embed_in_communicator=first.grid)
        # Only the group of N source lattices selected by this srank
        # contributes to the source side of the plan.
        group = srank // (sranks // n_groups)
        for src in lattices[group * N:(group + 1) * N]:
            builder.source += src.view[gcoor]
        for dst in l:
            builder.destination += dst.view[lcoor]
        cache[cache_key] = builder()

    cache[cache_key](l, lattices)

    return l
g.message(f"Test {lhs.otype.__name__}") # warmup g.copy(lhs, rhs) t0 = g.time() for n in range(N): g.copy(lhs, rhs) t1 = g.time() g.message("%-50s %g GB/s" % ("copy:", GB / (t1 - t0))) pos = g.coordinates(lhs) # create plan during first assignment, exclude from benchmark plan = g.copy_plan(lhs, rhs) plan.destination += lhs.view[pos] plan.source += rhs.view[pos] plan = plan() # plan_info = plan.info()[0, 0][0, 0] # block_size = plan_info["size"] // plan_info["blocks"] # g.message(" " * 51 + f"block_size = {block_size}") # warmup plan(lhs, rhs) t0 = g.time() for n in range(N): plan(lhs, rhs) t1 = g.time() g.message("%-50s %g GB/s %g s" % ("copy_plan:", GB / (t1 - t0),
def merge(lattices, dimension=-1, N=-1):
    """Merge a list of lattices into lattices on a grid with one more dimension.

    Parameters:
        lattices: list of lattices sharing the same grid object and otype
            name.  Anything whose type is not exactly ``list`` is returned
            unchanged.
        dimension: position at which the new dimension is inserted; negative
            values count from the end (``-1`` appends after the last
            existing dimension).  Must end up in ``[0, grid.nd]``.
        N: number of input lattices merged into each output lattice;
            ``-1`` means all of them.  ``len(lattices)`` must be a multiple
            of ``N``.

    Returns:
        A single merged lattice if there is exactly one batch, otherwise the
        list of ``len(lattices) // N`` merged lattices.  Output ``j`` is
        filled from inputs ``j * N`` ... ``j * N + N - 1``.
    """
    # Anything that is not exactly a list is handed back untouched.
    if type(lattices) is not list:
        return lattices

    n = len(lattices)
    assert n > 0

    # Number of batches; N defaults to "merge everything into one".
    if N == -1:
        N = n
    batches, leftover = divmod(n, N)
    assert leftover == 0

    # All inputs must live on the same grid.
    first = lattices[0]
    grid = first.grid
    assert all([x.grid.obj == grid.obj for x in lattices[1:]])

    # Normalise the insertion position, allowing negative indexing.
    if dimension >= 0:
        assert dimension <= grid.nd
    else:
        dimension += grid.nd + 1
        assert dimension >= 0

    # Infer the checkerboarding of the new dimension: either no input is
    # checkerboarded, or neighbours within a batch must carry inverse
    # checkerboards.
    cb = [x.checkerboard() for x in lattices]
    if cb[0] is not gpt.none:
        assert all([
            cb[j * N + i] is cb[j * N + i + 1].inv()
            for i in range(N - 1)
            for j in range(batches)
        ])
        cb_mask = 1
    else:
        assert all([c is gpt.none for c in cb[1:]])
        cb_mask = 0

    # Object types must be consistent.
    otype = first.otype
    assert all([x.otype.__name__ == otype.__name__ for x in lattices[1:]])

    # Create the merged grid and the output lattices on it.
    merged_grid = grid.inserted_dimension(dimension, N, cb_mask=cb_mask)
    merged_lattices = [gpt.lattice(merged_grid, otype) for _ in range(batches)]
    for merged in merged_lattices:
        merged.checkerboard(cb[0])

    # Source coordinates: inputs at even/odd positions only differ when the
    # inputs are checkerboarded.
    even_coor = gpt.coordinates(lattices[0])
    if N > 1 and cb_mask == 1:
        odd_coor = gpt.coordinates(lattices[1])
    else:
        odd_coor = even_coor
    gcoor = (even_coor, odd_coor)

    # Data transfer: build one plan per position i in the new dimension and
    # reuse it for every batch.
    for i in range(N):
        src_coor = gcoor[i % 2]
        dst_coor = cgpt.coordinates_inserted_dimension(src_coor, dimension, [i])
        builder = gpt.copy_plan(
            merged_lattices[0],
            lattices[i],
            embed_in_communicator=merged_lattices[0].grid,
        )
        builder.destination += merged_lattices[0].view[dst_coor]
        builder.source += lattices[i].view[src_coor]
        transfer = builder()
        for j in range(batches):
            transfer(merged_lattices[j], lattices[j * N + i])

    # A single batch is returned without the surrounding list.
    return merged_lattices[0] if len(merged_lattices) == 1 else merged_lattices