q = params["fmatrix"](U) # load basis vectors nbasis = params["nbasis"] # fg_basis,fg_cevec,fg_feval = g.load(params["basis"],{ # "grids" : q.F_grid_eo, "nmax" : nbasis, # "advise_basis" : g.infrequent_use, # "advise_cevec" : g.infrequent_use # }) rng = g.random("test") try: fg_basis = g.load("basis", {"grids": q.F_grid_eo})[0] except g.LoadError: fg_basis = g.advise( [g.vspincolor(q.F_grid_eo) for i in range(nbasis)], g.infrequent_use ) rng.zn(fg_basis) g.save("basis", [fg_basis]) # g.mem_report() # g.prefetch( fg_basis, g.to_accelerator) # g.mem_report() # w=fg_basis[-1] # g.orthogonalize(w,fg_basis[0:1]) # g.orthogonalize(w,fg_basis[0:15]) fg_basis = g.advise(fg_basis, g.infrequent_use) tg = g.block.grid(q.F_grid_eo, [12, 2, 2, 2, 2]) fg_cevec = g.advise([g.vcomplex(tg, 150) for i in range(nbasis)], g.infrequent_use)
def __call__(self, mat, src, ckpt=None):

    # verbosity
    verbose = g.default.is_verbose("irl")

    # checkpointer
    if ckpt is None:
        ckpt = g.checkpointer_none()
    ckpt.grid = src.grid
    self.ckpt = ckpt

    # first approximate largest eigenvalue
    pit = g.algorithms.eigen.power_iteration(eps=0.05, maxiter=10, real=True)
    lambda_max = pit(mat, src)[0]

    # parameters
    Nm = self.params["Nm"]
    Nu = self.params["Nu"]
    Nk = self.params["Nk"]
    Nstop = self.params["Nstop"]
    Np = Nm - Nk
    MaxIter = self.params["maxiter"]
    Np /= MaxIter
    assert Nm >= Nk and Nstop <= Nk
    print("Nm=", Nm, "Nu=", Nu, "Nk=", Nk)

    # tensors
    dtype = np.float64
    ctype = np.complex128
    lme = np.zeros((Nu, Nm), ctype)
    lmd = np.zeros((Nu, Nm), ctype)
    lme2 = np.zeros((Nu, Nm), ctype)
    lmd2 = np.empty((Nu, Nm), ctype)
    Qt = np.zeros((Nm, Nm), ctype)
    Q = np.zeros((Nm, Nm), ctype)
    ev = np.empty((Nm,), dtype)
    ev2_copy = np.empty((Nm,), dtype)

    # fields
    f = g.lattice(src)
    v = g.lattice(src)
    evec = [g.lattice(src) for i in range(Nm)]
    w = [g.lattice(src) for i in range(Nu)]
    w_copy = [g.lattice(src) for i in range(Nu)]

    # advise memory storage
    if self.params["advise"] is not None:
        g.advise(evec, self.params["advise"])

    # scalars
    k1 = 1
    k2 = Nk
    beta_k = 0.0

    rng = g.random("test")

    # set initial vectors
    # rng.zn(w)
    for i in range(Nu):
        rng.zn(w[i])
        if i > 0:
            g.orthogonalize(w[i], evec[0:i])
        evec[i] = g.copy(w[i])
        evec[i] *= 1.0 / g.norm2(evec[i]) ** 0.5
        g.message("norm(evec[%d]) = %e" % (i, g.norm2(evec[i])))
        if i > 0:
            for j in range(i):
                ip = g.inner_product(evec[j], w[i])
                if np.abs(ip) > 1e-6:
                    g.message(
                        "inner(evec[%d],w[%d]) = %e %e" % (j, i, ip.real, ip.imag)
                    )
    # evec[i] @= src[i] / g.norm2(src[i]) ** 0.5

    # initial Nk steps
    Nblock_k = int(Nk / Nu)
    for b in range(Nblock_k):
        self.blockStep(mat, lmd, lme, evec, w, w_copy, Nm, b, Nu)

    Nblock_p = int(Np / Nu)

    # restarting loop
    # for it in range(self.params["maxiter"]):
    for it in range(MaxIter):
        if verbose:
            g.message("Restart iteration %d" % it)

        Nblock_l = Nblock_k + it * Nblock_p
        Nblock_r = Nblock_l + Nblock_p
        Nl = Nblock_l * Nu
        Nr = Nblock_r * Nu
        # ev2.resize(Nr)
        ev2 = np.empty((Nr,), dtype)

        for b in range(Nblock_l, Nblock_r):
            self.blockStep(mat, lmd, lme, evec, w, w_copy, Nm, b, Nu)

        for u in range(Nu):
            for k in range(Nr):
                lmd2[u, k] = lmd[u, k]
                lme2[u, k] = lme[u, k]

        Qt = np.identity(Nr, ctype)

        # diagonalize
        t0 = g.time()
        # self.diagonalize(ev2, lme2, Nm, Qt)
        self.diagonalize(ev2, lmd2, lme2, Nu, Nr, Qt)
        # def diagonalize(self, eval, lmd, lme, Nu, Nk, Nm, Qt):
        t1 = g.time()

        if verbose:
            g.message("Diagonalization took %g s" % (t1 - t0))

        # sort
        ev2_copy = ev2.copy()
        ev2 = list(reversed(sorted(ev2)))

        for i in range(Nr):
            g.message("Rval[%d] = %e" % (i, ev2[i]))

        # rotate
        # t0 = g.time()
        # g.rotate(evec, Qt, k1 - 1, k2 + 1, 0, Nm)
        # t1 = g.time()
        # if verbose:
        #     g.message("Basis rotation took %g s" % (t1 - t0))

        # convergence test
        if it >= self.params["Nminres"]:
            if verbose:
                g.message("Rotation to test convergence")

            # diagonalize
            for k in range(Nr):
                ev2[k] = ev[k]
                # lme2[k] = lme[k]
            for u in range(Nu):
                for k in range(Nr):
                    lmd2[u, k] = lmd[u, k]
                    lme2[u, k] = lme[u, k]
            Qt = np.identity(Nm, ctype)

            t0 = g.time()
            # self.diagonalize(ev2, lme2, Nk, Qt)
            self.diagonalize(ev2, lmd2, lme2, Nu, Nr, Qt)
            t1 = g.time()

            if verbose:
                g.message("Diagonalization took %g s" % (t1 - t0))

            B = g.copy(evec[0])

            allconv = True
            if beta_k >= self.params["betastp"]:
                jj = 1
                while jj <= Nstop:
                    j = Nstop - jj
                    g.linear_combination(B, evec[0:Nr], Qt[j, 0:Nr])
                    g.message("norm = %e" % (g.norm2(B)))
                    B *= 1.0 / g.norm2(B) ** 0.5
                    if not ckpt.load(v):
                        mat(v, B)
                        ckpt.save(v)
                    ev_test = g.inner_product(B, v).real
                    eps2 = g.norm2(v - ev_test * B) / lambda_max ** 2.0
                    if verbose:
                        g.message(
                            "%-65s %-45s %-50s"
                            % (
                                "ev[ %d ] = %s" % (j, ev2_copy[j]),
                                "<B|M|B> = %s" % (ev_test),
                                "|M B - ev B|^2 / ev_max^2 = %s" % (eps2),
                            )
                        )
                    if eps2 > self.params["resid"]:
                        allconv = False
                    if jj == Nstop:
                        break
                    jj = min([Nstop, 2 * jj])

            if allconv:
                if verbose:
                    g.message("Converged in %d iterations" % it)
                break

    t0 = g.time()
    g.rotate(evec, Qt, 0, Nstop, 0, Nk)
    t1 = g.time()

    if verbose:
        g.message("Final basis rotation took %g s" % (t1 - t0))

    return (evec[0:Nstop], ev2_copy[0:Nstop])
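# Usage sketch for the block routine above -- an assumption, not part of the
# original source. It presumes the enclosing class stores the parameter
# dictionary as self.params (it reads "Nm", "Nu", "Nk", "Nstop", "maxiter",
# "Nminres", "resid", "betastp", "advise") and provides blockStep/diagonalize.
# Because the initial phase runs int(Nk / Nu) block steps and the restart phase
# int((Nm - Nk) / maxiter / Nu) steps, Nk and Nm - Nk should be chosen so these
# divisions come out evenly. The names block_irl, mat and src are placeholders.
#
# block_params = {
#     "Nm": 192,       # total Krylov space size
#     "Nu": 4,         # block size
#     "Nk": 128,       # vectors kept per restart
#     "Nstop": 120,    # converged eigenvectors requested
#     "maxiter": 16,
#     "Nminres": 0,
#     "resid": 1e-8,
#     "betastp": 0.0,
#     "advise": None,
# }
# evec, ev = block_irl(mat, src)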
def __call__(self, mat, src, ckpt=None):

    # verbosity
    verbose = g.default.is_verbose("irl")

    # checkpointer
    if ckpt is None:
        ckpt = g.checkpointer_none()
    ckpt.grid = src.grid
    self.ckpt = ckpt

    # first approximate largest eigenvalue
    pit = g.algorithms.eigen.power_iteration(eps=0.05, maxiter=10, real=True)
    lambda_max = pit(mat, src)[0]

    # parameters
    Nm = self.params["Nm"]
    Nk = self.params["Nk"]
    Nstop = self.params["Nstop"]
    assert Nm >= Nk and Nstop <= Nk

    # tensors
    dtype = np.float64
    lme = np.empty((Nm,), dtype)
    lme2 = np.empty((Nm,), dtype)
    ev = np.empty((Nm,), dtype)
    ev2 = np.empty((Nm,), dtype)
    ev2_copy = np.empty((Nm,), dtype)

    # fields
    f = g.lattice(src)
    v = g.lattice(src)
    evec = [g.lattice(src) for i in range(Nm)]

    # advise memory storage
    if self.params["advise"] is not None:
        g.advise(evec, self.params["advise"])

    # scalars
    k1 = 1
    k2 = Nk
    beta_k = 0.0

    # set initial vector
    evec[0] @= src / g.norm2(src) ** 0.5

    # initial Nk steps
    for k in range(Nk):
        self.step(mat, ev, lme, evec, f, Nm, k)

    # restarting loop
    for it in range(self.params["maxiter"]):
        if verbose:
            g.message("Restart iteration %d" % it)
        for k in range(Nk, Nm):
            self.step(mat, ev, lme, evec, f, Nm, k)
        f *= lme[Nm - 1]

        # eigenvalues
        for k in range(Nm):
            ev2[k] = ev[k + k1 - 1]
            lme2[k] = lme[k + k1 - 1]

        # diagonalize
        t0 = g.time()
        Qt = np.identity(Nm, dtype)
        self.diagonalize(ev2, lme2, Nm, Qt)
        t1 = g.time()

        if verbose:
            g.message("Diagonalization took %g s" % (t1 - t0))

        # sort
        ev2_copy = ev2.copy()
        ev2 = list(reversed(sorted(ev2)))

        # implicitly shifted QR transformations
        Qt = np.identity(Nm, dtype)
        t0 = g.time()
        for ip in range(k2, Nm):
            g.qr_decomposition(ev, lme, Nm, Nm, Qt, ev2[ip], k1, Nm)
        t1 = g.time()

        if verbose:
            g.message("QR took %g s" % (t1 - t0))

        # rotate
        t0 = g.time()
        g.rotate(evec, Qt, k1 - 1, k2 + 1, 0, Nm)
        t1 = g.time()

        if verbose:
            g.message("Basis rotation took %g s" % (t1 - t0))

        # compression
        f *= Qt[k2 - 1, Nm - 1]
        f += lme[k2 - 1] * evec[k2]
        beta_k = g.norm2(f) ** 0.5
        betar = 1.0 / beta_k
        evec[k2] @= betar * f
        lme[k2 - 1] = beta_k

        if verbose:
            g.message("beta_k = ", beta_k)

        # convergence test
        if it >= self.params["Nminres"]:
            if verbose:
                g.message("Rotation to test convergence")

            # diagonalize
            for k in range(Nm):
                ev2[k] = ev[k]
                lme2[k] = lme[k]
            Qt = np.identity(Nm, dtype)

            t0 = g.time()
            self.diagonalize(ev2, lme2, Nk, Qt)
            t1 = g.time()

            if verbose:
                g.message("Diagonalization took %g s" % (t1 - t0))

            B = g.copy(evec[0])

            allconv = True
            if beta_k >= self.params["betastp"]:
                jj = 1
                while jj <= Nstop:
                    j = Nstop - jj
                    g.linear_combination(B, evec[0:Nk], Qt[j, 0:Nk])
                    B *= 1.0 / g.norm2(B) ** 0.5
                    if not ckpt.load(v):
                        mat(v, B)
                        ckpt.save(v)
                    ev_test = g.inner_product(B, v).real
                    eps2 = g.norm2(v - ev_test * B) / lambda_max ** 2.0
                    if verbose:
                        g.message(
                            "%-65s %-45s %-50s"
                            % (
                                "ev[ %d ] = %s" % (j, ev2_copy[j]),
                                "<B|M|B> = %s" % (ev_test),
                                "|M B - ev B|^2 / ev_max^2 = %s" % (eps2),
                            )
                        )
                    if eps2 > self.params["resid"]:
                        allconv = False
                    if jj == Nstop:
                        break
                    jj = min([Nstop, 2 * jj])

            if allconv:
                if verbose:
                    g.message("Converged in %d iterations" % it)
                break

    t0 = g.time()
    g.rotate(evec, Qt, 0, Nstop, 0, Nk)
    t1 = g.time()

    if verbose:
        g.message("Final basis rotation took %g s" % (t1 - t0))

    return (evec[0:Nstop], ev2_copy[0:Nstop])
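# Usage sketch for the implicitly restarted Lanczos routine above -- an
# assumption, not part of the original source. The constructor is assumed to
# store the dictionary as self.params with exactly the keys read by __call__;
# `mat` is any Hermitian operator callable as mat(dst, src) and `src` a start
# vector on its grid (both placeholders here).
#
# irl = g.algorithms.eigen.irl(
#     {
#         "Nm": 64,        # Krylov space size
#         "Nk": 48,        # vectors kept at each restart
#         "Nstop": 40,     # converged eigenvectors requested
#         "maxiter": 20,
#         "Nminres": 0,    # restarts before convergence is first tested
#         "resid": 1e-8,
#         "betastp": 0.0,
#         "advise": None,
#     }
# )
# evec, ev = irl(mat, src)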
def load(filename, params):

    # first check if this is right file format
    if not os.path.exists(filename + "/00/0000000000.compressed") or not os.path.exists(
        filename + "/metadata.txt"
    ):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert params["grids"] is not None
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    nsingleCap = min([nsingle, nbasis])

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if params["nmax"] is not None:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis) for i in range(neigen_max)]
    if params["advise_basis"] is not None:
        gpt.advise(basis, params["advise_basis"])
    if params["advise_cevec"] is not None:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" % (filename, len(views)))

    for i, v in enumerate(views):
        cv = gpt.cartesian_view(
            v if v is not None else -1, mpi, fdimensions, fgrid.cb, site_cb
        )
        cvc = gpt.cartesian_view(
            v if v is not None else -1, mpi, cgrid.fdimensions, gpt.full, gpt.none
        )
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2, 24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (
            FP_16_SIZE(nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS)
        )
        coarse_vector_size = (
            coarse_block_size_part_fp32 + coarse_block_size_part_fp16
        ) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb, "canonicalOdd")
            for b in range(blocks)
        ]

        # group blocks
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2
        gpt.message("Read blocks", blocks)

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single * nsingleCap))

        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0 ** 3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                # data: lattice0_posA lattice1_posA .... lattice0_posB lattice1_posB
                cgpt.munge_inner_outer(data_munged, data, nsingleCap, block_reduce)
                # data_munged: lattice0 lattice1 lattice2 ...
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            rhs = data_munged[0:block_data_size_single]
            distribute_plan = gpt.copy_plan(basis[0], rhs)
            distribute_plan.destination += basis[0].view[pos[b]]
            distribute_plan.source += gpt.global_memory_view(
                fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]]
            )
            rhs = None
            distribute_plan = distribute_plan()
            for i in range(nsingleCap_max):
                distribute_plan(
                    basis[i],
                    data_munged[
                        block_data_size_single * i : block_data_size_single * (i + 1)
                    ],
                )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    )
                )

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap))
            )
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(
                        f.read(block_data_size_fp16 * (nbasis - nsingleCap))
                    )
                    globalReadGB = len(data) / 1024.0 ** 3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_munged,
                        data_fp32,
                        nbasis - nsingleCap,
                        block_reduce,
                    )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                if nsingleCap < nbasis_max:
                    rhs = data_munged[0:block_data_size_single]
                    distribute_plan = gpt.copy_plan(basis[0], rhs)
                    distribute_plan.destination += basis[0].view[pos[b]]
                    distribute_plan.source += gpt.global_memory_view(
                        fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]]
                    )
                    rhs = None
                    distribute_plan = distribute_plan()
                    for i in range(nsingleCap, nbasis_max):
                        j = i - nsingleCap
                        distribute_plan(
                            basis[i],
                            data_munged[
                                block_data_size_single * j : block_data_size_single * (j + 1)
                            ],
                        )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        )
                    )

        # coarse grid data
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        distribute_plan = None
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0 ** 3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            if j < neigen_max:
                if distribute_plan is None:
                    distribute_plan = gpt.copy_plan(cevec[j], data)
                    distribute_plan.destination += cevec[j].view[pos_coarse]
                    distribute_plan.source += gpt.global_memory_view(
                        cgrid, [[cgrid.processor, data, 0, data.nbytes]]
                    )
                    distribute_plan = distribute_plan()
                distribute_plan(cevec[j], data)
            dt_distr += gpt.time()

            if verbose and j % (neigen // 10) == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    )
                )

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message("* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB / (t1 - t0)))

    # eigenvalues
    evln = list(
        filter(lambda x: x != "", open(filename + "/eigen-values.txt").read().split("\n"))
    )
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
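# Usage sketch, mirroring the commented-out g.load call in the benchmark
# fragment at the top of this section: the loader receives the directory name
# and a parameter dictionary. "grids", "nmax", "advise_basis" and
# "advise_cevec" are all read unconditionally, so pass None where no limit or
# advice is wanted; "max_read_blocks" is optional (default 8). The path and
# the fine-grid variable below are placeholders.
#
# basis, cevec, feval = load(
#     "/path/to/compressed/evecs",
#     {
#         "grids": fgrid_eo,              # single-precision checkerboarded fine grid
#         "nmax": None,                   # or an int to load only the first nmax vectors
#         "advise_basis": gpt.infrequent_use,
#         "advise_cevec": gpt.infrequent_use,
#     },
# )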