Example no. 1
import gpt as g

# fermion matrix built from the gauge field U (U and params are defined by the surrounding script)
q = params["fmatrix"](U)

# load basis vectors
nbasis = params["nbasis"]
# fg_basis,fg_cevec,fg_feval = g.load(params["basis"],{
#    "grids" : q.F_grid_eo, "nmax" : nbasis,
#    "advise_basis" : g.infrequent_use,
#    "advise_cevec" : g.infrequent_use
# })
rng = g.random("test")

try:
    fg_basis = g.load("basis", {"grids": q.F_grid_eo})[0]
except g.LoadError:
    fg_basis = g.advise(
        [g.vspincolor(q.F_grid_eo) for i in range(nbasis)], g.infrequent_use
    )
    rng.zn(fg_basis)
    g.save("basis", [fg_basis])

# g.mem_report()
# g.prefetch( fg_basis, g.to_accelerator)
# g.mem_report()

# w=fg_basis[-1]
# g.orthogonalize(w,fg_basis[0:1])
# g.orthogonalize(w,fg_basis[0:15])

fg_basis = g.advise(fg_basis, g.infrequent_use)
tg = g.block.grid(q.F_grid_eo, [12, 2, 2, 2, 2])
fg_cevec = g.advise([g.vcomplex(tg, 150) for i in range(nbasis)], g.infrequent_use)
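
The coarse vectors fg_cevec allocated above live on the block grid tg and carry 150 components per coarse site, i.e. one coefficient per block-local basis vector. For orientation, here is a minimal NumPy sketch of that block projection/reconstruction idea, with made-up sizes and written independently of the gpt API:

import numpy as np

rng = np.random.default_rng(0)
nblocks, block_sites, nbasis = 4, 8, 3  # hypothetical sizes

# an orthonormal basis per block: basis[b, k, :] is basis vector k restricted to block b
basis = rng.standard_normal((nblocks, nbasis, block_sites))
for b in range(nblocks):
    q_mat, _ = np.linalg.qr(basis[b].T)  # orthonormalize within the block
    basis[b] = q_mat.T

# a fine vector, stored block by block
v = rng.standard_normal((nblocks, block_sites))

# "project": one coefficient per (block, basis vector), which is what a coarse vector stores
coarse = np.einsum("bks,bs->bk", basis, v)

# "promote": approximate reconstruction of the fine vector from the coarse coefficients
v_approx = np.einsum("bk,bks->bs", coarse, basis)
print("relative residual:", np.linalg.norm(v - v_approx) / np.linalg.norm(v))
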
Example no. 2
    def __call__(self, mat, src, ckpt=None):

        # verbosity
        verbose = g.default.is_verbose("irl")

        # checkpointer
        if ckpt is None:
            ckpt = g.checkpointer_none()
        ckpt.grid = src.grid
        self.ckpt = ckpt

        # first approximate largest eigenvalue
        pit = g.algorithms.eigen.power_iteration(eps=0.05, maxiter=10, real=True)
        lambda_max = pit(mat, src)[0]

        # parameters
        Nm = self.params["Nm"]
        Nu = self.params["Nu"]
        Nk = self.params["Nk"]
        Nstop = self.params["Nstop"]
        Np = Nm - Nk
        MaxIter = self.params["maxiter"]
        Np //= MaxIter
        assert Nm >= Nk and Nstop <= Nk
        print("Nm =", Nm, "Nu =", Nu, "Nk =", Nk)

        # tensors
        dtype = np.float64
        ctype = np.complex128
         
        lme = np.zeros((Nu, Nm), ctype)
        lmd = np.zeros((Nu, Nm), ctype)
        lme2 = np.zeros((Nu, Nm), ctype)
        lmd2 = np.empty((Nu, Nm), ctype)
        Qt = np.zeros((Nm, Nm), ctype)
        Q = np.zeros((Nm, Nm), ctype)
        ev = np.empty((Nm,), dtype)
        ev2_copy = np.empty((Nm,), dtype)

        # fields
        f = g.lattice(src)
        v = g.lattice(src)
        evec = [g.lattice(src) for i in range(Nm)]
        w = [g.lattice(src) for i in range(Nu)]
        w_copy = [g.lattice(src) for i in range(Nu)]

        # advice memory storage
        if self.params["advise"] is not None:
            g.advise(evec, self.params["advise"])

        # scalars
        k1 = 1
        k2 = Nk
        beta_k = 0.0

        rng = g.random("test")
        # set initial vectors
        #        rng.zn(w)
        for i in range(Nu):
            rng.zn(w[i])
            if i > 0:
                g.orthogonalize(w[i], evec[0:i])
            evec[i] = g.copy(w[i])
            evec[i] *= 1.0 / g.norm2(evec[i]) ** 0.5
            g.message("norm(evec[%d]) = %e" % (i, g.norm2(evec[i])))
            if i > 0:
                for j in range(i):
                    ip = g.innerProduct(evec[j], w[i])
                    if np.abs(ip) > 1e-6:
                        g.message("inner(evec[%d],w[%d]) = %e %e" % (j, i, ip.real, ip.imag))
            # evec[i] @= src[i] / g.norm2(src[i]) ** 0.5

        # initial Nk steps
        Nblock_k = Nk // Nu
        for b in range(Nblock_k):
            self.blockStep(mat, lmd, lme, evec, w, w_copy, Nm, b, Nu)

        Nblock_p = Np // Nu
        # restarting loop
#        for it in range(self.params["maxiter"]):
        for it in range(MaxIter):
            if verbose:
                g.message("Restart iteration %d" % it)

            Nblock_l = Nblock_k + it * Nblock_p
            Nblock_r = Nblock_l + Nblock_p
            Nl = Nblock_l * Nu
            Nr = Nblock_r * Nu
#           ev2.resize(Nr)
            ev2 = np.empty((Nr,), dtype)

            for b in range(Nblock_l, Nblock_r):
                self.blockStep(mat, lmd, lme, evec, w, w_copy, Nm, b, Nu)

            for u in range(Nu):
                for k in range(Nr):
                    lmd2[u, k] = lmd[u, k]
                    lme2[u, k] = lme[u, k]


            Qt = np.identity(Nr, ctype)
            
            # diagonalize
            t0 = g.time()
#            self.diagonalize(ev2, lme2, Nm, Qt)
            self.diagonalize(ev2, lmd2, lme2, Nu, Nr, Qt)
#    def diagonalize(self, eval, lmd, lme, Nu, Nk, Nm, Qt):
            t1 = g.time()

            if verbose:
                g.message("Diagonalization took %g s" % (t1 - t0))

            # sort
            ev2_copy = ev2.copy()
            ev2 = list(reversed(sorted(ev2)))

            for i in range(Nr):
                g.message("Rval[%d] = %e" % (i, ev2[i]))

            # rotate
#            t0 = g.time()
#            g.rotate(evec, Qt, k1 - 1, k2 + 1, 0, Nm)
#            t1 = g.time()

#            if verbose:
#                g.message("Basis rotation took %g s" % (t1 - t0))

            # convergence test
            if it >= self.params["Nminres"]:
                if verbose:
                    g.message("Rotation to test convergence")

                # diagonalize
                for k in range(Nr):
                    ev2[k] = ev[k]
                #    lme2[k] = lme[k]
                for u in range(Nu):
                    for k in range(Nr):
                        lmd2[u, k] = lmd[u, k]
                        lme2[u, k] = lme[u, k]
                Qt = np.identity(Nm, ctype)

                t0 = g.time()
#                self.diagonalize(ev2, lme2, Nk, Qt)
                self.diagonalize(ev2,lmd2,lme2,Nu,Nr,Qt)
                t1 = g.time()

                if verbose:
                    g.message("Diagonalization took %g s" % (t1 - t0))

                B = g.copy(evec[0])

                allconv = True
                if beta_k >= self.params["betastp"]:
                    jj = 1
                    while jj <= Nstop:
                        j = Nstop - jj
                        g.linear_combination(B, evec[0:Nr], Qt[j, 0:Nr])
                        g.message("norm=%e"%(g.norm2(B)))
                        B *= 1.0 / g.norm2(B) ** 0.5
                        if not ckpt.load(v):
                            mat(v, B)
                            ckpt.save(v)
                        ev_test = g.innerProduct(B, v).real
                        eps2 = g.norm2(v - ev_test * B) / lambda_max ** 2.0
                        if verbose:
                            g.message(
                                "%-65s %-45s %-50s"
                                % (
                                    "ev[ %d ] = %s" % (j, ev2_copy[j]),
                                    "<B|M|B> = %s" % (ev_test),
                                    "|M B - ev B|^2 / ev_max^2 = %s" % (eps2),
                                )
                            )
                        if eps2 > self.params["resid"]:
                            allconv = False
                        if jj == Nstop:
                            break
                        jj = min([Nstop, 2 * jj])

                if allconv:
                    if verbose:
                        g.message("Converged in %d iterations" % it)
                    break

        t0 = g.time()
        g.rotate(evec, Qt, 0, Nstop, 0, Nk)
        t1 = g.time()

        if verbose:
            g.message("Final basis rotation took %g s" % (t1 - t0))

        return (evec[0:Nstop], ev2_copy[0:Nstop])
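
The convergence test inside the restart loop (ev_test = <B|M|B>, then eps2 = |M B - ev_test B|^2 / lambda_max^2 compared with params["resid"]) can be written out in plain NumPy. A minimal sketch with a hypothetical Hermitian matrix M, not part of the example:

import numpy as np

rng = np.random.default_rng(0)
n, resid = 50, 1e-8

A = rng.standard_normal((n, n))
M = A + A.T                                   # hypothetical Hermitian operator
lambda_max = np.max(np.abs(np.linalg.eigvalsh(M)))

w, V = np.linalg.eigh(M)
B = V[:, -1] + 1e-6 * rng.standard_normal(n)  # approximate eigenvector (slightly perturbed)
B /= np.linalg.norm(B)

v = M @ B                                     # plays the role of mat(v, B)
ev_test = B @ v                               # <B|M|B>
eps2 = np.linalg.norm(v - ev_test * B) ** 2 / lambda_max ** 2
print("eps2 =", eps2, "converged:", eps2 <= resid)
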
Example no. 3
File: irl.py Project: mbruno46/gpt
    def __call__(self, mat, src, ckpt=None):

        # verbosity
        verbose = g.default.is_verbose("irl")

        # checkpointer
        if ckpt is None:
            ckpt = g.checkpointer_none()
        ckpt.grid = src.grid
        self.ckpt = ckpt

        # first approximate largest eigenvalue
        pit = g.algorithms.eigen.power_iteration(eps=0.05,
                                                 maxiter=10,
                                                 real=True)
        lambda_max = pit(mat, src)[0]

        # parameters
        Nm = self.params["Nm"]
        Nk = self.params["Nk"]
        Nstop = self.params["Nstop"]
        assert Nm >= Nk and Nstop <= Nk

        # tensors
        dtype = np.float64
        lme = np.empty((Nm, ), dtype)
        lme2 = np.empty((Nm, ), dtype)
        ev = np.empty((Nm, ), dtype)
        ev2 = np.empty((Nm, ), dtype)
        ev2_copy = np.empty((Nm, ), dtype)

        # fields
        f = g.lattice(src)
        v = g.lattice(src)
        evec = [g.lattice(src) for i in range(Nm)]

        # advice memory storage
        if self.params["advise"] is not None:
            g.advise(evec, self.params["advise"])

        # scalars
        k1 = 1
        k2 = Nk
        beta_k = 0.0

        # set initial vector
        evec[0] @= src / g.norm2(src)**0.5

        # initial Nk steps
        for k in range(Nk):
            self.step(mat, ev, lme, evec, f, Nm, k)

        # restarting loop
        for it in range(self.params["maxiter"]):
            if verbose:
                g.message("Restart iteration %d" % it)
            for k in range(Nk, Nm):
                self.step(mat, ev, lme, evec, f, Nm, k)
            f *= lme[Nm - 1]

            # eigenvalues
            for k in range(Nm):
                ev2[k] = ev[k + k1 - 1]
                lme2[k] = lme[k + k1 - 1]

            # diagonalize
            t0 = g.time()
            Qt = np.identity(Nm, dtype)
            self.diagonalize(ev2, lme2, Nm, Qt)
            t1 = g.time()

            if verbose:
                g.message("Diagonalization took %g s" % (t1 - t0))

            # sort
            ev2_copy = ev2.copy()
            ev2 = list(reversed(sorted(ev2)))

            # implicitly shifted QR transformations
            Qt = np.identity(Nm, dtype)
            t0 = g.time()
            for ip in range(k2, Nm):
                g.qr_decomposition(ev, lme, Nm, Nm, Qt, ev2[ip], k1, Nm)
            t1 = g.time()

            if verbose:
                g.message("QR took %g s" % (t1 - t0))

            # rotate
            t0 = g.time()
            g.rotate(evec, Qt, k1 - 1, k2 + 1, 0, Nm)
            t1 = g.time()

            if verbose:
                g.message("Basis rotation took %g s" % (t1 - t0))

            # compression
            f *= Qt[k2 - 1, Nm - 1]
            f += lme[k2 - 1] * evec[k2]
            beta_k = g.norm2(f)**0.5
            betar = 1.0 / beta_k
            evec[k2] @= betar * f
            lme[k2 - 1] = beta_k

            if verbose:
                g.message("beta_k = ", beta_k)

            # convergence test
            if it >= self.params["Nminres"]:
                if verbose:
                    g.message("Rotation to test convergence")

                # diagonalize
                for k in range(Nm):
                    ev2[k] = ev[k]
                    lme2[k] = lme[k]
                Qt = np.identity(Nm, dtype)

                t0 = g.time()
                self.diagonalize(ev2, lme2, Nk, Qt)
                t1 = g.time()

                if verbose:
                    g.message("Diagonalization took %g s" % (t1 - t0))

                B = g.copy(evec[0])

                allconv = True
                if beta_k >= self.params["betastp"]:
                    jj = 1
                    while jj <= Nstop:
                        j = Nstop - jj
                        g.linear_combination(B, evec[0:Nk], Qt[j, 0:Nk])
                        B *= 1.0 / g.norm2(B)**0.5
                        if not ckpt.load(v):
                            mat(v, B)
                            ckpt.save(v)
                        ev_test = g.inner_product(B, v).real
                        eps2 = g.norm2(v - ev_test * B) / lambda_max**2.0
                        if verbose:
                            g.message("%-65s %-45s %-50s" % (
                                "ev[ %d ] = %s" % (j, ev2_copy[j]),
                                "<B|M|B> = %s" % (ev_test),
                                "|M B - ev B|^2 / ev_max^2 = %s" % (eps2),
                            ))
                        if eps2 > self.params["resid"]:
                            allconv = False
                        if jj == Nstop:
                            break
                        jj = min([Nstop, 2 * jj])

                if allconv:
                    if verbose:
                        g.message("Converged in %d iterations" % it)
                    break

        t0 = g.time()
        g.rotate(evec, Qt, 0, Nstop, 0, Nk)
        t1 = g.time()

        if verbose:
            g.message("Final basis rotation took %g s" % (t1 - t0))

        return (evec[0:Nstop], ev2_copy[0:Nstop])
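
In this non-blocked variant, self.diagonalize acts on the tridiagonal Lanczos matrix whose diagonal is held in ev and whose off-diagonal in lme; its eigenvalues are the Ritz values and the resulting rotation Qt is what g.rotate and g.linear_combination consume. A minimal NumPy sketch of that step, with made-up coefficients:

import numpy as np

Nm = 6
rng = np.random.default_rng(0)
alpha = rng.uniform(1.0, 2.0, Nm)     # stands in for ev (tridiagonal diagonal)
beta = rng.uniform(0.1, 0.5, Nm - 1)  # stands in for lme[:-1] (off-diagonal)

# assemble the tridiagonal Lanczos matrix T
T = np.diag(alpha) + np.diag(beta, 1) + np.diag(beta, -1)

# Ritz values and the rotation that diagonalizes T
ritz, Q = np.linalg.eigh(T)
print("Ritz values:", ritz)

Note that np.linalg.eigh returns the eigenvectors as columns, whereas Qt above is indexed by rows (Qt[j, 0:Nk]); the sketch does not reproduce that layout convention.
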
Example no. 4
def load(filename, params):

    # first check if this is right file format
    if not os.path.exists(filename + "/00/0000000000.compressed"
                          ) or not os.path.exists(filename + "/metadata.txt"):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert params["grids"] is not None
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    nsingleCap = min([nsingle, nbasis])
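
    # Reading between the lines above, metadata.txt appears to provide (labels as read here):
    #   s, b, nb : 5d local dimensions, block size and number of blocks per dimension,
    #              reordered so that the 5th dimension comes first
    #   crc32    : one checksum per file slot, verified against crc32_comp below
    #   neig, nkeep, nkeep_single : number of coarse eigenvectors, basis size, and how many
    #              basis vectors are stored in single precision (the remainder are fp16)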

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if params["nmax"] is not None:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis) for i in range(neigen_max)]
    if params["advise_basis"] is not None:
        gpt.advise(basis, params["advise_basis"])
    if params["advise_cevec"] is not None:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" %
                    (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(v if v is not None else -1, mpi, fdimensions,
                                fgrid.cb, site_cb)
        cvc = gpt.cartesian_view(v if v is not None else -1, mpi,
                                 cgrid.fdimensions, gpt.full, gpt.none)
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2,
                                          24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (FP_16_SIZE(
            nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS))
        coarse_vector_size = (coarse_block_size_part_fp32 +
                              coarse_block_size_part_fp16) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb,
                                        "canonicalOdd") for b in range(blocks)
        ]

        # group blocks
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2
        gpt.message("Read blocks", blocks)

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single *
                                           nsingleCap))
        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                # data: lattice0_posA lattice1_posA .... lattice0_posB lattice1_posB
                cgpt.munge_inner_outer(data_munged, data, nsingleCap,
                                       block_reduce)
                # data_munged: lattice0 lattice1 lattice2 ...
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            rhs = data_munged[0:block_data_size_single]
            distribute_plan = gpt.copy_plan(basis[0], rhs)
            distribute_plan.destination += basis[0].view[pos[b]]
            distribute_plan.source += gpt.global_memory_view(
                fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]])
            rhs = None
            distribute_plan = distribute_plan()
            for i in range(nsingleCap_max):
                distribute_plan(
                    basis[i],
                    data_munged[block_data_size_single *
                                i:block_data_size_single * (i + 1)],
                )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    ))

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(
                        f.read(block_data_size_fp16 * (nbasis - nsingleCap)))
                    globalReadGB = len(data) / 1024.0**3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_munged,
                        data_fp32,
                        nbasis - nsingleCap,
                        block_reduce,
                    )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                if nsingleCap < nbasis_max:
                    rhs = data_munged[0:block_data_size_single]
                    distribute_plan = gpt.copy_plan(basis[0], rhs)
                    distribute_plan.destination += basis[0].view[pos[b]]
                    distribute_plan.source += gpt.global_memory_view(
                        fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]])
                    rhs = None
                    distribute_plan = distribute_plan()
                    for i in range(nsingleCap, nbasis_max):
                        j = i - nsingleCap
                        distribute_plan(
                            basis[i],
                            data_munged[block_data_size_single *
                                        j:block_data_size_single * (j + 1)],
                        )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        ))

        # coarse grid data
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        distribute_plan = None
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            if j < neigen_max:
                if distribute_plan is None:
                    distribute_plan = gpt.copy_plan(cevec[j], data)
                    distribute_plan.destination += cevec[j].view[pos_coarse]
                    distribute_plan.source += gpt.global_memory_view(
                        cgrid, [[cgrid.processor, data, 0, data.nbytes]])
                    distribute_plan = distribute_plan()
                distribute_plan(cevec[j], data)
            dt_distr += gpt.time()

            if verbose and j % (neigen // 10) == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    ))

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message("* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB /
                                                 (t1 - t0)))

    # eigenvalues
    evln = list(
        filter(lambda x: x != "",
               open(filename + "/eigen-values.txt").read().split("\n")))
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
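
A note on the checksum handling: gpt.crc32(data, crc32_comp) above accumulates a running CRC-32 over every chunk read from a slot file and the result is compared with the value recorded in metadata.txt. The same accumulation pattern, written with only the standard library (zlib instead of gpt.crc32), looks like this sketch:

import zlib

def file_crc32(path, chunk_size=1 << 20):
    # accumulate a CRC-32 over the file in chunks, like crc32_comp above
    crc = 0
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            crc = zlib.crc32(chunk, crc)
    return crc & 0xFFFFFFFF

# usage (hypothetical path): compare against the corresponding entry parsed from metadata.txt
# assert file_crc32(filename + "/00/0000000000.compressed") == crc32[0]
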