Example #1
        # note: src_blk, dst_blk, sap, solver, and cache are captured from the enclosing scope
        def inv(dst, src):
            dst[:] = 0
            eta = gpt.copy(src)
            ws = [gpt.copy(src) for _ in range(2)]
            cache_key_base = (
                f"{dst.describe()}_{src.describe()}_{src.grid.obj}_{dst.grid.obj}"
            )

            dt_solv = dt_distr = dt_hop = 0.0
            for eo in range(2):
                ws[0][:] = 0
                dt_distr -= gpt.time()
                cache_key = f"{cache_key_base}_{eo}_a"
                if cache_key not in cache:
                    plan = gpt.copy_plan(src_blk,
                                         eta,
                                         embed_in_communicator=eta.grid)
                    plan.destination += src_blk.view[sap.pos]
                    plan.source += eta.view[sap.coor[eo]]
                    cache[cache_key] = plan()
                cache[cache_key](src_blk, eta)
                dt_distr += gpt.time()

                dt_solv -= gpt.time()
                dst_blk[:] = 0  # for now
                solver[eo](dst_blk, src_blk)
                dt_solv += gpt.time()

                dt_distr -= gpt.time()
                cache_key = f"{cache_key_base}_{eo}_b"
                if cache_key not in cache:
                    plan = gpt.copy_plan(ws[0],
                                         dst_blk,
                                         embed_in_communicator=ws[0].grid)
                    plan.destination += ws[0].view[sap.coor[eo]]
                    plan.source += dst_blk.view[sap.pos]
                    cache[cache_key] = plan()
                cache[cache_key](ws[0], dst_blk)
                dt_distr += gpt.time()

                dt_hop -= gpt.time()
                if eo == 0:
                    sap.op(ws[1], ws[0])
                eta -= ws[1]
                dst += ws[0]
                dt_hop += gpt.time()

                gpt.message(
                    f"SAP cycle; |rho|^2 = {gpt.norm2(eta):g}; |dst|^2 = {gpt.norm2(dst):g}"
                )
                gpt.message(
                    f"SAP Timings: distr {dt_distr:g} secs, blk_solver {dt_solv:g} secs, hop+update {dt_hop:g} secs"
                )
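
All of the snippets in this collection follow the same three-step lifecycle: declare a transfer with g.copy_plan(dst, src), append destination and source views describing which elements move where, then call the plan object once to compile it into an executable that can be applied repeatedly. A minimal sketch of the bare pattern, assuming an installed gpt; the grid size, object type, and RNG seed are illustrative only:

import gpt as g

# illustrative grid and fields; any matching pair of lattices works
grid = g.grid([8, 8, 8, 16], g.double)
src = g.vspincolor(grid)
dst = g.vspincolor(grid)
g.random("example").cnormal(src)

pos = g.coordinates(src)      # coordinates owned by this rank
plan = g.copy_plan(dst, src)  # 1. declare the transfer
plan.destination += dst.view[pos]
plan.source += src.view[pos]  # 2. describe which elements move where
plan = plan()                 # 3. compile once ...
plan(dst, src)                # ... then execute as often as needed

The compilation step is the expensive part, which is why nearly every example below caches the compiled plan.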
Example #2
    def perform(self, root):
        global basis_size, T, current_config
        if current_config is not None and current_config.conf_file != self.conf_file:
            current_config = None
        if current_config is None:
            current_config = config(self.conf_file)

        c = None
        vcj = [
            g.vcolor(current_config.l_exact.U_grid) for jr in range(basis_size)
        ]
        for vcjj in vcj:
            vcjj[:] = 0

        for tprime in range(T):
            basis_evec, basis_evals = g.load(self.basis_fmt %
                                             (self.conf, tprime))

            plan = g.copy_plan(vcj[0],
                               basis_evec[0],
                               embed_in_communicator=vcj[0].grid)
            c = g.coordinates(basis_evec[0])
            plan.destination += vcj[0].view[np.hstack(
                (c, np.ones((len(c), 1), dtype=np.int32) * tprime))]
            plan.source += basis_evec[0].view[c]
            plan = plan()

            for l in range(basis_size):
                plan(vcj[l], basis_evec[l])

        for l in range(basis_size):
            g.message("Check norm:", l, g.norm2(vcj[l]))

        g.save(f"{root}/{self.name}/basis", vcj)
Example #3
def merge_indices(dst, src, st, cache=default_merge_indices_cache):
    pos = gpt.coordinates(dst)
    assert st is not None
    result_otype = st[-1]()
    if result_otype is None:
        dst @= src
        return
    ndim = dst.otype.shape[st[0]]
    rank = len(st) - 1
    islice = [slice(None, None, None) for i in range(len(dst.otype.shape))]
    ivec = [0] * rank
    cache_key = f"merge_indices_{dst.describe()}_{result_otype.__name__}_{dst.grid.obj}"

    tidx = []
    src_i = []
    for i in range(ndim**rank):
        idx = i
        for j in range(rank):
            c = idx % ndim
            islice[st[j]] = c
            ivec[j] = c
            idx //= ndim
        src_i.append(src[tuple(ivec)])
        tidx.append(tuple(islice))

    if cache_key not in cache:
        plan = gpt.copy_plan(dst, src_i)
        for i in range(ndim**rank):
            plan.destination += dst.view[(pos, ) + tidx[i]]
            plan.source += src_i[i].view[:]
        cache[cache_key] = plan()

    cache[cache_key](dst, src_i)
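
The caching idiom above recurs throughout: the key encodes everything the compiled plan depends on (lattice descriptions, grid objects, index structure), so a plan is compiled once per layout and reused on every later call. A hypothetical helper that isolates the idiom, with key fields chosen to match the examples:

# sketch of the caching idiom; cached_plan is a hypothetical name
def cached_plan(cache, dst, src, coor):
    key = f"{dst.describe()}_{src.describe()}_{dst.grid.obj}"
    if key not in cache:
        plan = gpt.copy_plan(dst, src)
        plan.destination += dst.view[coor]
        plan.source += src.view[coor]
        cache[key] = plan()
    return cache[key]

With this helper, a call site reduces to cached_plan(cache, dst, src, pos)(dst, src).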
Example #4
    def promote(self, dst, src):
        tag = src.otype.__name__
        if tag not in self.promote_plan:
            plan = gpt.copy_plan(dst, src, embed_in_communicator=dst.grid)
            plan.destination += dst.view[self.gcoor]
            plan.source += src.view[self.lcoor]
            self.promote_plan[tag] = plan()
        self.promote_plan[tag](dst, src)
Example #5
    def __setitem__(self, key, value):
        # unpack cache
        cache, key = unpack_cache_key(key)
        cache_key = None if cache is None else "set"

        # short code path to zero lattice
        if type(key) == slice and key == slice(None, None, None):

            if gpt.util.is_num(value):
                for o in self.v_obj:
                    cgpt.lattice_set_to_number(o, value)
                return

            cache_key = (
                f"{self.otype.__name__}_{self.checkerboard().__name__}_{self.grid.describe()}"
            )
            cache = lattice.cache

        # general code path, map key
        pos, tidx, shape = gpt.map_key(self, key)
        n_pos = len(pos)

        # convert input to proper numpy array
        value = gpt.util.tensor_to_value(
            value, dtype=self.grid.precision.complex_dtype)
        if value is None:
            value = memoryview(bytearray())

        # needed bytes and optional cyclic upscaling
        nbytes_needed = n_pos * numpy.prod(
            shape) * self.grid.precision.nbytes * 2
        value = cgpt.copy_cyclic_upscale(value, nbytes_needed)

        # create plan
        if cache_key is None or cache_key not in cache:
            plan = gpt.copy_plan(self, value)
            plan.destination += gpt.lattice_view(self, pos, tidx)
            plan.source += gpt.global_memory_view(
                self.grid,
                [[self.grid.processor, value, 0, value.nbytes]]
                if value.nbytes > 0 else None,
            )

            # skip optimization if we only use it once
            xp = plan(
                local_only=isinstance(pos, gpt.core.local_coordinates),
                skip_optimize=cache_key is None,
            )
            if cache_key is not None:
                cache[cache_key] = xp
        else:
            xp = cache[cache_key]

        xp(self, value)
Example #6
    def bit_flipped_lattice(self, i):
        c = self.bit_map.coordinates
        nci = self.bit_map.not_coordinates[i]
        bfl = g.lattice(self.lattice)
        if i not in self.bit_flipped_plan:
            p = g.copy_plan(bfl, self.lattice)
            p.destination += bfl.view[c]
            p.source += self.lattice.view[nci]
            self.bit_flipped_plan[i] = p()
        self.bit_flipped_plan[i](bfl, self.lattice)
        return bfl
Example #7
    def bit_flipped_lattice(self, i):
        c = self.bit_map.coordinates
        nci = self.bit_map.not_coordinates[self.bit_permutation[i]]
        bfl = g.lattice(self.lattice)
        if i not in self.bit_flipped_plan:
            p = g.copy_plan(bfl, self.lattice)
            p.destination += bfl.view[c]
            p.source += self.lattice.view[nci]
            self.bit_flipped_plan[i] = p()
            # g.message(
            #     self.bit_flipped_plan[i].info()
            # )  # TODO: it is odd that this maxes out at 22 GB/s; focus on bandwidth benchmark first, why 500 GB/s for prop and only 5 for singlet?
        self.bit_flipped_plan[i](bfl, self.lattice)
        return bfl
Example #8
    def block_extract(self, u2, U, idx):
        assert u2.otype.Nc == 2 and u2.otype.Ndim == 2
        idx = list(idx)
        cache = ot_matrix_su_n_fundamental_group.cache
        cache_key = f"{self.Nc}_{idx}"
        if cache_key not in cache:
            pos = tuple([slice(None, None, None) for i in range(u2.grid.nd)])
            plan = gpt.copy_plan(u2, U)
            for i in range(2):
                for j in range(2):
                    plan.destination += u2.view[pos + (i, j)]
                    plan.source += U.view[pos + (idx[i], idx[j])]
            cache[cache_key] = plan()
        cache[cache_key](u2, U)
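
Example #8 shows that views can address internal tensor indices, not just sites: pos + (idx[i], idx[j]) selects a single matrix component at every site, so one plan can reshuffle components between object types. A reduced sketch along the same lines, assuming a 3-color matrix field and a 3-component vector field (the diagonal extraction itself is illustrative):

import gpt

grid = gpt.grid([8, 8, 8, 16], gpt.double)
U = gpt.mcolor(grid)
v = gpt.vcolor(grid)
pos = tuple([slice(None, None, None) for _ in range(grid.nd)])

plan = gpt.copy_plan(v, U)
for i in range(3):
    # move the diagonal component U[i, i] into vector component v[i]
    plan.destination += v.view[pos + (i,)]
    plan.source += U.view[pos + (i, i)]
plan = plan()
plan(v, U)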
Example #9
def mk_qlat_gpt_copy_plan(ctype, total_site, multiplicity, tag):
    geo = q.Geometry(total_site, multiplicity)
    f_gpt = mk_gpt_field(ctype, geo)
    f_qlat = q.Field(ctype, geo)
    lexicographic_coordinates = g.coordinates(f_gpt)
    buf = f_qlat.mview()
    if tag == "qlat_from_gpt":
        qlat_from_gpt = g.copy_plan(buf, f_gpt)
        qlat_from_gpt.destination += g.global_memory_view(
            f_gpt.grid, [[f_gpt.grid.processor, buf, 0, buf.nbytes]])
        qlat_from_gpt.source += f_gpt.view[lexicographic_coordinates]
        qlat_from_gpt = qlat_from_gpt(local_only=True)
        return qlat_from_gpt
    elif tag == "gpt_from_qlat":
        gpt_from_qlat = g.copy_plan(f_gpt, buf)
        gpt_from_qlat.source += g.global_memory_view(
            f_gpt.grid, [[f_gpt.grid.processor, buf, 0, buf.nbytes]])
        gpt_from_qlat.destination += f_gpt.view[lexicographic_coordinates]
        gpt_from_qlat = gpt_from_qlat(local_only=True)
        return gpt_from_qlat
    else:
        q.displayln_info(tag)
        raise Exception("mk_qlat_gpt_copy_plan")
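
Example #9 pairs a lattice view with g.global_memory_view, which wraps a raw host buffer as a plan endpoint via [[rank, buffer, offset, nbytes]] descriptors; local_only=True then compiles a plan that performs no inter-rank communication. A sketch of the export direction, assuming the buffer-sizing convention visible in these examples (otype.nfloats real floats per site at precision.nbytes each):

import gpt as g

grid = g.grid([8, 8, 8, 16], g.single)
lat = g.vcolor(grid)
coords = g.coordinates(lat)

# host buffer large enough for this rank's sites (sizing convention assumed)
buf = memoryview(bytearray(lat.otype.nfloats * grid.precision.nbytes * len(coords)))

plan = g.copy_plan(buf, lat)
plan.destination += g.global_memory_view(
    grid, [[grid.processor, buf, 0, buf.nbytes]])
plan.source += lat.view[coords]
plan = plan(local_only=True)  # purely local copy, no communication
plan(buf, lat)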
Example #10
    def __call__(self):
        plan = g.copy_plan(self.destinations, self.sources)
        buffer_descriptions = []
        for i in range(len(self.sources)):
            src = self.sources[i]
            src_cb = src.checkerboard()
            coordinates = g.coordinates(src)
            L = src.grid.fdimensions
            for x in self.displacements[i]:
                buffer_descriptions.append((src.grid, src.otype, src_cb))
                plan.destination += self.destinations[self.indices[i][x]].view[:]
                plan.source += src.view[cgpt.coordinates_shift(coordinates, x, L)]
        return cshift_executer(buffer_descriptions, plan())
Example #11
File: core.py Project: waterret/gpt
def assign_pos_view():
    plan = g.copy_plan(lhs, l_dp)
    plan.destination += lhs.view[pos]
    plan.source += l_dp.view[pos]
    plan = plan()
    info = plan.info()
    for rank_dst, rank_src in info:
        assert rank_dst == rank_src
        assert rank_dst == lhs.grid.processor
        info_rank = info[(rank_dst, rank_src)]
        for index in info_rank:
            info_index = info_rank[index]
            # Make sure that after optimization only a single memcpy is needed
            assert info_index["blocks"] == 1
    plan(lhs, l_dp)
Example #12
File: split.py Project: wettig/gpt
def unsplit(first,
            second,
            cache=None,
            group_policy=split_group_policy.separate):
    if type(first) != list:
        # forward cache and group_policy on the scalar path as well
        return unsplit([first], [second], cache, group_policy)

    n = len(first)
    N = len(second)
    Q = n // N
    assert n % N == 0

    # Save memory by performing each group separately
    if N != 1 and group_policy == split_group_policy.separate:
        for i in range(N):
            unsplit([first[q * N + i] for q in range(Q)], [second[i]], cache,
                    group_policy)
        return

    split_grid = second[0].grid
    sranks = split_grid.sranks
    srank = split_grid.srank

    lcoor = second[0].split_lcoor
    gcoor = second[0].split_gcoor

    src_data = second
    dst_data = first

    if cache is None:
        cache = {}

    cache_key = f"unsplit_plan_{first[0].grid.obj}_{second[0].grid.obj}_{first[0].otype.__name__}_{second[0].otype.__name__}_{n}_{N}"
    if cache_key not in cache:
        plan = gpt.copy_plan(dst_data,
                             src_data,
                             embed_in_communicator=first[0].grid)
        i = srank // (sranks // Q)
        for x in first[i * N:(i + 1) * N]:
            plan.destination += x.view[gcoor]
        for x in second:
            plan.source += x.view[lcoor]
        cache[cache_key] = plan()

    cache[cache_key](dst_data, src_data)
Example #13
def separate_indices(x, st, cache=default_merge_indices_cache):
    pos = gpt.coordinates(x)
    cb = x.checkerboard()
    assert st is not None
    result_otype = st[-1]()
    if result_otype is None:
        return x
    ndim = x.otype.shape[st[0]]
    rank = len(st) - 1
    islice = [slice(None, None, None) for i in range(len(x.otype.shape))]
    ivec = [0] * rank
    result = {}

    keys = []
    tidx = []
    dst = []
    for i in range(ndim**rank):
        idx = i
        for j in range(rank):
            c = idx % ndim
            islice[st[j]] = c
            ivec[j] = c
            idx //= ndim
        keys.append(tuple(ivec))
        tidx.append(tuple(islice))

    for i in keys:
        v = gpt.lattice(x.grid, result_otype)
        v.checkerboard(cb)
        result[i] = v
        dst.append(v)

    cache_key = f"separate_indices_{cb.__name__}_{result_otype.__name__}_{x.otype.__name__}_{x.grid.describe()}_{x.grid.obj}"
    if cache_key not in cache:
        plan = gpt.copy_plan(dst, x)
        for i in range(len(tidx)):
            plan.destination += result[keys[i]].view[pos]
            plan.source += x.view[(pos, ) + tidx[i]]
        cache[cache_key] = plan()

    cache[cache_key](dst, x)

    return result
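
Note that the plan endpoints here are lists of lattices: views appended to plan.destination map, in order, into the respective list entries, and the compiled plan is executed with the full list. A reduced sketch of the list-valued form, with grid and otype chosen for illustration:

import gpt

grid = gpt.grid([8, 8, 8, 16], gpt.double)
src = gpt.vcolor(grid)
dst = [gpt.lattice(grid, src.otype) for _ in range(2)]
pos = gpt.coordinates(src)

plan = gpt.copy_plan(dst, src)  # list-valued destination
for d in dst:
    plan.destination += d.view[pos]
    plan.source += src.view[pos]  # here the source is simply duplicated
plan = plan()
plan(dst, src)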
Example #14
    def __getitem__(self, key):

        # unpack cache
        cache, key = unpack_cache_key(key)
        cache_key = None if cache is None else "get"

        # general code path, map key
        pos, tidx, shape = gpt.map_key(self, key)
        n_pos = len(pos)

        # create target
        value = cgpt.ndarray((n_pos, *shape),
                             self.grid.precision.complex_dtype)

        # create plan
        if cache_key is None or cache_key not in cache:
            plan = gpt.copy_plan(value, self)
            plan.destination += gpt.global_memory_view(
                self.grid,
                [[self.grid.processor, value, 0, value.nbytes]]
                if value.nbytes > 0 else None,
            )
            plan.source += gpt.lattice_view(self, pos, tidx)
            xp = plan()

            if cache_key is not None:
                cache[cache_key] = xp
        else:
            xp = cache[cache_key]

        xp(value, self)

        # if only a single element is returned and we have the full shape,
        # wrap in a tensor
        if len(value) == 1 and shape == self.otype.shape:
            return gpt.util.value_to_tensor(value[0], self.otype)

        return value
Example #15
    def perform(self, root):
        global basis_size, sloppy_per_job, T, current_config
        if current_config is not None and current_config.conf_file != self.conf_file:
            current_config = None
        if current_config is None:
            current_config = config(self.conf_file)

        output_correlator = g.corr_io.writer(f"{root}/{self.name}/head.dat")

        vcj = g.load(f"{root}/{self.conf}/pm_basis/basis")

        for i0 in range(0, basis_size, sloppy_per_job):
            half_peramb = {}
            for l in g.load(
                    f"{root}/{self.conf}/pm_{self.solver}_t{self.t}_i{i0}/propagators"
            ):
                for x in l:
                    half_peramb[x] = l[x]

            g.mem_report(details=False)

            vc = g.vcolor(vcj[0].grid)
            c = g.coordinates(vc)
            prec = {"sloppy": 0, "exact": 1}[self.solver]

            for spin_prime in range(4):

                plan = None

                for spin in range(4):

                    for i in range(i0, i0 + sloppy_per_job):
                        hp = half_peramb[f"t{self.t}s{spin}c{i}_{self.solver}"]

                        if plan is None:
                            plan = g.copy_plan(vc, hp)
                            plan.destination += vc.view[c]
                            plan.source += hp.view[c, spin_prime, :]
                            plan = plan()

                        plan(vc, hp)

                        t0 = g.time()
                        slc_j = [
                            g(g.adj(vcj[j]) * vc) for j in range(basis_size)
                        ]
                        t1 = g.time()
                        slc = g.slice(slc_j, 3)
                        t2 = g.time()

                        for j in range(basis_size):
                            output_correlator.write(
                                f"output/peramb_prec{prec}/n_{j}_{i}_s_{spin_prime}_{spin}_t_{self.t}",
                                slc[j],
                            )

                        t3 = g.time()
                        if i % 50 == 0:
                            g.message(spin_prime, spin, i, "Timing", t1 - t0,
                                      t2 - t1, t3 - t2)

        output_correlator.close()
Example #16
def separate(lattices, dimension=-1):

    # expect list below
    if type(lattices) != list:
        lattices = [lattices]

    # evaluate in case it is an expression
    lattices = [gpt.eval(x) for x in lattices]

    # number of batches to separate
    batches = len(lattices)
    assert batches > 0

    # make sure all have the same grid
    grid = lattices[0].grid
    assert all([lattices[i].grid.obj == grid.obj for i in range(1, batches)])

    # allow negative indexing
    if dimension < 0:
        dimension += grid.nd
        assert dimension >= 0
    else:
        assert dimension < grid.nd

    # number of slices (per batch)
    N = grid.fdimensions[dimension]
    n = N * batches

    # all lattices need to have same checkerboard
    cb = lattices[0].checkerboard()
    assert all([lattices[i].checkerboard() is cb for i in range(1, batches)])

    # all lattices need to have same otype
    otype = lattices[0].otype
    assert all([
        lattices[i].otype.__name__ == otype.__name__
        for i in range(1, batches)
    ])

    # create grid with dimension removed
    separated_grid = grid.removed_dimension(dimension)
    cb_mask = grid.cb.cb_mask[dimension]

    # create separate lattices and set their checkerboard
    separated_lattices = [gpt.lattice(separated_grid, otype) for i in range(n)]
    for i, x in enumerate(separated_lattices):
        j = i % N
        if cb_mask == 0 or j % 2 == 0:
            x.checkerboard(cb)
        else:
            x.checkerboard(cb.inv())

    # construct coordinates
    separated_gcoor_zero = gpt.coordinates(separated_lattices[0])
    separated_gcoor_one = (gpt.coordinates(separated_lattices[1])
                           if N > 1 and cb_mask == 1 else separated_gcoor_zero)
    separated_gcoor = [separated_gcoor_zero, separated_gcoor_one]

    # move data
    for i in range(N):
        gcoor = cgpt.coordinates_inserted_dimension(separated_gcoor[i % 2],
                                                    dimension, [i])

        plan = gpt.copy_plan(separated_lattices[i],
                             lattices[0],
                             embed_in_communicator=lattices[0].grid)
        plan.destination += separated_lattices[i].view[separated_gcoor[i % 2]]
        plan.source += lattices[0].view[gcoor]
        plan = plan()

        for j in range(batches):
            plan(separated_lattices[j * N + i], lattices[j])

    # return
    return separated_lattices
Example #17
def load(filename, params):

    # first check if this is right file format
    if not os.path.exists(filename + "/00/0000000000.compressed"
                          ) or not os.path.exists(filename + "/metadata.txt"):
        raise NotImplementedError()

    # verbosity
    verbose = gpt.default.is_verbose("io")

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # need grids parameter
    assert params["grids"] is not None
    assert type(params["grids"]) == gpt.grid
    fgrid = params["grids"]
    assert fgrid.precision == gpt.single
    fdimensions = fgrid.fdimensions

    # read metadata
    metadata = read_metadata(filename + "/metadata.txt")
    s = get_ivec(metadata, "s")
    ldimensions = [s[4]] + s[:4]
    blocksize = get_ivec(metadata, "b")
    blocksize = [blocksize[4]] + blocksize[:4]
    nb = get_ivec(metadata, "nb")
    nb = [nb[4]] + nb[:4]
    crc32 = get_xvec(metadata, "crc32")
    neigen = int(metadata["neig"])
    nbasis = int(metadata["nkeep"])
    nsingle = int(metadata["nkeep_single"])
    blocks = int(metadata["blocks"])
    FP16_COEF_EXP_SHARE_FLOATS = int(metadata["FP16_COEF_EXP_SHARE_FLOATS"])
    nsingleCap = min([nsingle, nbasis])

    # check
    nd = len(ldimensions)
    assert nd == 5
    assert nd == len(fdimensions)
    assert nd == len(blocksize)
    assert fgrid.cb.n == 2
    assert fgrid.cb.cb_mask == [0, 1, 1, 1, 1]

    # create coarse grid
    cgrid = gpt.block.grid(fgrid, blocksize)

    # allow for partial loading of data
    if params["nmax"] is not None:
        nmax = params["nmax"]
        nbasis_max = min([nmax, nbasis])
        neigen_max = min([nmax, neigen])
        nsingleCap_max = min([nmax, nsingleCap])
    else:
        nbasis_max = nbasis
        neigen_max = neigen
        nsingleCap_max = nsingleCap

    # allocate all lattices
    basis = [gpt.vspincolor(fgrid) for i in range(nbasis_max)]
    cevec = [gpt.vcomplex(cgrid, nbasis) for i in range(neigen_max)]
    if params["advise_basis"] is not None:
        gpt.advise(basis, params["advise_basis"])
    if params["advise_cevec"] is not None:
        gpt.advise(cevec, params["advise_cevec"])

    # fix checkerboard of basis
    for i in range(nbasis_max):
        basis[i].checkerboard(site_cb)

    # mpi layout
    mpi = []
    for i in range(nd):
        assert fdimensions[i] % ldimensions[i] == 0
        mpi.append(fdimensions[i] // ldimensions[i])
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # timing
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fread = 1e-30
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Loading %s with %d views per node" %
                    (filename, len(views)))
    for i, v in enumerate(views):
        cv = gpt.cartesian_view(v if v is not None else -1, mpi, fdimensions,
                                fgrid.cb, site_cb)
        cvc = gpt.cartesian_view(v if v is not None else -1, mpi,
                                 cgrid.fdimensions, gpt.full, gpt.none)
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2,
                                          24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (FP_16_SIZE(
            nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS))
        coarse_vector_size = (coarse_block_size_part_fp32 +
                              coarse_block_size_part_fp16) * blocks
        coarse_fp32_vector_size = 2 * (4 * nbasis) * blocks

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "rb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb,
                                        "canonicalOdd") for b in range(blocks)
        ]

        # group blocks
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2
        gpt.message("Read blocks", blocks)

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # dummy buffer
        data0 = memoryview(bytes())

        # single-precision data
        data_munged = memoryview(bytearray(block_data_size_single *
                                           nsingleCap))
        for b in range(read_blocks):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(block_data_size_single * nsingleCap))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_munge -= gpt.time()
                # data: lattice0_posA lattice1_posA .... lattice0_posB lattice1_posB
                cgpt.munge_inner_outer(data_munged, data, nsingleCap,
                                       block_reduce)
                # data_munged: lattice0 lattice1 lattice2 ...
                dt_munge += gpt.time()
            else:
                data_munged = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            rhs = data_munged[0:block_data_size_single]
            distribute_plan = gpt.copy_plan(basis[0], rhs)
            distribute_plan.destination += basis[0].view[pos[b]]
            distribute_plan.source += gpt.global_memory_view(
                fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]])
            rhs = None
            distribute_plan = distribute_plan()
            for i in range(nsingleCap_max):
                distribute_plan(
                    basis[i],
                    data_munged[block_data_size_single *
                                i:block_data_size_single * (i + 1)],
                )
            dt_distr += gpt.time()

            if verbose:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        mem_avail(),
                    ))

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            for b in range(read_blocks):
                fgrid.barrier()
                dt_fread -= gpt.time()
                if f is not None:
                    data = memoryview(
                        f.read(block_data_size_fp16 * (nbasis - nsingleCap)))
                    globalReadGB = len(data) / 1024.0**3.0
                else:
                    globalReadGB = 0.0
                globalReadGB = fgrid.globalsum(globalReadGB)
                dt_fread += gpt.time()
                totalSizeGB += globalReadGB

                if f is not None:
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp16_to_fp32(data_fp32, data, 24)
                    dt_fp16 += gpt.time()
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_munged,
                        data_fp32,
                        nbasis - nsingleCap,
                        block_reduce,
                    )
                    dt_munge += gpt.time()
                else:
                    data_munged = data0

                fgrid.barrier()
                dt_distr -= gpt.time()
                if nsingleCap < nbasis_max:
                    rhs = data_munged[0:block_data_size_single]
                    distribute_plan = gpt.copy_plan(basis[0], rhs)
                    distribute_plan.destination += basis[0].view[pos[b]]
                    distribute_plan.source += gpt.global_memory_view(
                        fgrid, [[fgrid.processor, rhs, 0, rhs.nbytes]])
                    rhs = None
                    distribute_plan = distribute_plan()
                    for i in range(nsingleCap, nbasis_max):
                        j = i - nsingleCap
                        distribute_plan(
                            basis[i],
                            data_munged[block_data_size_single *
                                        j:block_data_size_single * (j + 1)],
                        )
                dt_distr += gpt.time()

                if verbose:
                    gpt.message(
                        "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fread,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                            mem_avail(),
                        ))

        # coarse grid data
        data_fp32 = memoryview(bytearray(coarse_fp32_vector_size))
        distribute_plan = None
        for j in range(neigen):
            fgrid.barrier()
            dt_fread -= gpt.time()
            if f is not None:
                data = memoryview(f.read(coarse_vector_size))
                globalReadGB = len(data) / 1024.0**3.0
            else:
                globalReadGB = 0.0
            globalReadGB = fgrid.globalsum(globalReadGB)
            dt_fread += gpt.time()
            totalSizeGB += globalReadGB

            if f is not None:
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()
                dt_fp16 -= gpt.time()
                cgpt.mixed_fp32fp16_to_fp32(
                    data_fp32,
                    data,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                data = data_fp32
            else:
                data = data0

            fgrid.barrier()
            dt_distr -= gpt.time()
            if j < neigen_max:
                if distribute_plan is None:
                    distribute_plan = gpt.copy_plan(cevec[j], data)
                    distribute_plan.destination += cevec[j].view[pos_coarse]
                    distribute_plan.source += gpt.global_memory_view(
                        cgrid, [[cgrid.processor, data, 0, data.nbytes]])
                    distribute_plan = distribute_plan()
                distribute_plan(cevec[j], data)
            dt_distr += gpt.time()

            if verbose and j % (neigen // 10) == 0:
                gpt.message(
                    "* read %g GB: fread at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s; available = %g GB"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fread,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                        mem_avail(),
                    ))

        # crc checks
        if f is not None:
            assert crc32_comp == crc32[cv.rank]

    # timing
    t1 = gpt.time()

    # verbosity
    if verbose:
        gpt.message("* load %g GB at %g GB/s" % (totalSizeGB, totalSizeGB /
                                                 (t1 - t0)))

    # eigenvalues
    evln = list(
        filter(lambda x: x != "",
               open(filename + "/eigen-values.txt").read().split("\n")))
    nev = int(evln[0])
    ev = [float(x) for x in evln[1:]]
    assert len(ev) == nev
    return (basis, cevec, ev)
Example #18
def save(filename, objs, params):

    # split data to save
    assert len(objs) == 3
    basis = objs[0]
    cevec = objs[1]
    ev = objs[2]

    # verbosity
    verbose = gpt.default.is_verbose("io")
    if verbose:
        gpt.message(
            "Saving %d basis vectors, %d coarse-grid vectors, %d eigenvalues to %s"
            % (len(basis), len(cevec), len(ev), filename))

    # create directory
    if gpt.rank() == 0:
        os.makedirs(filename, exist_ok=True)

    # now sync since only root has created directory
    gpt.barrier()

    # write eigenvalues
    if gpt.rank() == 0:
        f = open("%s/eigen-values.txt" % filename, "wt")
        f.write("%d\n" % len(ev))
        for v in ev:
            f.write("%.15E\n" % v)
        f.close()

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # grids
    assert len(basis) > 0
    assert len(cevec) > 0
    fgrid = basis[0].grid
    cgrid = cevec[0].grid

    # mpi layout
    if params["mpi"] is not None:
        mpi = params["mpi"]
    else:
        mpi = fgrid.mpi
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # params
    assert basis[0].checkerboard() == site_cb
    nd = 5
    assert len(fgrid.ldimensions) == nd
    fdimensions = fgrid.fdimensions
    ldimensions = [conformDiv(fdimensions[i], mpi[i]) for i in range(nd)]
    assert fgrid.precision == gpt.single
    s = ldimensions
    b = [
        conformDiv(fgrid.fdimensions[i], cgrid.fdimensions[i])
        for i in range(nd)
    ]
    nb = [conformDiv(s[i], b[i]) for i in range(nd)]
    neigen = len(cevec)
    nbasis = len(basis)
    if "nsingle" in params:
        nsingle = params["nsingle"]
        assert nsingle <= nbasis
    else:
        nsingle = nbasis
    nsingleCap = min([nsingle, nbasis])
    blocks = numpy.prod(nb)
    FP16_COEF_EXP_SHARE_FLOATS = 10

    # write metadata
    if gpt.rank() == 0:
        fmeta = open("%s/metadata.txt" % filename, "wt")
        for i in range(nd):
            fmeta.write("s[%d] = %d\n" % (i, s[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("b[%d] = %d\n" % (i, b[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("nb[%d] = %d\n" % (i, nb[(i + 1) % nd]))
        fmeta.write("neig = %d\n" % neigen)
        fmeta.write("nkeep = %d\n" % nbasis)
        fmeta.write("nkeep_single = %d\n" % nsingle)
        fmeta.write("blocks = %d\n" % blocks)
        fmeta.write("FP16_COEF_EXP_SHARE_FLOATS = %d\n" %
                    FP16_COEF_EXP_SHARE_FLOATS)
        fmeta.flush()  # write crc32 later

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)
    crc32 = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)
    # timing
    t0 = gpt.time()
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fwrite = 1e-30
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Saving %s with %d views per node" %
                    (filename, len(views)))

    for i, v in enumerate(views):
        cv = gpt.cartesian_view(v if v is not None else -1, mpi, fdimensions,
                                fgrid.cb, site_cb)
        cvc = gpt.cartesian_view(v if v is not None else -1, mpi,
                                 cgrid.fdimensions, gpt.full, gpt.none)
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)
        if fn is not None:
            os.makedirs(dn, exist_ok=True)

        # sizes
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2,
                                          24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (FP_16_SIZE(
            nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS))
        coarse_vector_size = (coarse_block_size_part_fp32 +
                              coarse_block_size_part_fp16) * blocks
        totalSize = (
            blocks *
            (block_data_size_single * nsingleCap + block_data_size_fp16 *
             (nbasis - nsingleCap)) + neigen * coarse_vector_size)
        totalSizeGB += totalSize / 1024.0**3.0 if v is not None else 0.0

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "wb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb,
                                        "canonicalOdd") for b in range(blocks)
        ]

        # group blocks
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # single-precision data
        data = memoryview(bytearray(block_data_size_single * nsingleCap))
        data_munged = memoryview(bytearray(block_data_size_single *
                                           nsingleCap))

        for b in range(read_blocks):
            fgrid.barrier()
            dt_distr -= gpt.time()
            lhs_size = basis[0].otype.nfloats * 4 * len(pos[b])
            lhs = data_munged[0:lhs_size]
            distribute_plan = gpt.copy_plan(lhs, basis[0])
            distribute_plan.destination += gpt.global_memory_view(
                fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]])
            distribute_plan.source += basis[0].view[pos[b]]
            distribute_plan = distribute_plan()
            lhs = None
            for i in range(nsingleCap):
                distribute_plan(
                    data_munged[block_data_size_single *
                                i:block_data_size_single * (i + 1)],
                    basis[i],
                )
            dt_distr += gpt.time()

            if f is not None:
                dt_munge -= gpt.time()
                cgpt.munge_inner_outer(
                    data,
                    data_munged,
                    block_reduce,
                    nsingleCap,
                )
                dt_munge += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                    ))

        # fp16 data
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data_munged = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data = memoryview(
                bytearray(block_data_size_fp16 * (nbasis - nsingleCap)))
            for b in range(read_blocks):
                fgrid.barrier()
                dt_distr -= gpt.time()
                lhs_size = basis[0].otype.nfloats * 4 * len(pos[b])
                lhs = data_munged[0:lhs_size]
                distribute_plan = gpt.copy_plan(lhs, basis[0])
                distribute_plan.destination += gpt.global_memory_view(
                    fgrid, [[fgrid.processor, lhs, 0, lhs.nbytes]])
                distribute_plan.source += basis[0].view[pos[b]]
                distribute_plan = distribute_plan()
                lhs = None
                for i in range(nsingleCap, nbasis):
                    j = i - nsingleCap
                    distribute_plan(
                        data_munged[j * block_data_size_single:(j + 1) *
                                    block_data_size_single],
                        basis[i],
                    )
                dt_distr += gpt.time()

                if f is not None:
                    dt_munge -= gpt.time()
                    cgpt.munge_inner_outer(
                        data_fp32,
                        data_munged,
                        block_reduce,
                        nbasis - nsingleCap,
                    )
                    dt_munge += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp32_to_fp16(data, data_fp32, 24)
                    dt_fp16 += gpt.time()
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()

                fgrid.barrier()
                dt_fwrite -= gpt.time()
                if f is not None:
                    f.write(data)
                    globalWriteGB = len(data) / 1024.0**3.0
                else:
                    globalWriteGB = 0.0
                globalWriteGB = fgrid.globalsum(globalWriteGB)
                dt_fwrite += gpt.time()
                totalSizeGB += globalWriteGB

                if verbose:
                    gpt.message(
                        "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fwrite,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                        ))

        # coarse grid data
        data = memoryview(bytearray(coarse_vector_size))
        data_fp32 = memoryview(
            bytearray(cevec[0].otype.nfloats * 4 * len(pos_coarse)))
        distribute_plan = gpt.copy_plan(data_fp32, cevec[0])
        distribute_plan.destination += gpt.global_memory_view(
            cgrid, [[cgrid.processor, data_fp32, 0, data_fp32.nbytes]])
        distribute_plan.source += cevec[0].view[pos_coarse]
        distribute_plan = distribute_plan()
        for j in range(neigen):
            fgrid.barrier()
            dt_distr -= gpt.time()
            distribute_plan(data_fp32, cevec[j])
            dt_distr += gpt.time()

            if f is not None:
                dt_fp16 -= gpt.time()
                cgpt.fp32_to_mixed_fp32fp16(
                    data,
                    data_fp32,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose and j % (neigen // 10) == 0:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                    ))

        # save crc
        crc32[cv.rank] = crc32_comp

    # synchronize crc32
    fgrid.globalsum(crc32)

    # timing
    t1 = gpt.time()

    # write crc to metadata
    if gpt.rank() == 0:
        for i in range(len(crc32)):
            fmeta.write("crc32[%d] = %X\n" % (i, crc32[i]))
        fmeta.close()

    # verbosity
    if verbose:
        gpt.message("* save %g GB at %g GB/s" % (totalSizeGB, totalSizeGB /
                                                 (t1 - t0)))
Example #19
File: split.py Project: wettig/gpt
def split_lattices(lattices, lcoor, gcoor, split_grid, N, cache, group_policy):
    # Example:
    #
    # Original
    #
    # lattice1,...,latticen | lattice1,...,latticen
    #
    # New
    #
    # lattice1,...,latticeN | latticeN+1,...,lattice2N
    #
    # Q = n // N = 2

    # N is desired number of parallel split lattices per unsplit lattice
    # 1 <= N <= sranks, sranks % N == 0

    n = len(lattices)
    assert n > 0
    assert n % N == 0
    Q = n // N

    # Save memory by performing each group separately
    if N != 1 and group_policy == split_group_policy.separate:
        res = []
        for i in range(N):
            res += split_lattices(
                [lattices[q * N + i] for q in range(Q)],
                lcoor,
                gcoor,
                split_grid,
                1,
                cache,
                group_policy,
            )
        return res

    assert len(lcoor) == len(gcoor)
    grid = lattices[0].grid
    assert all([lattices[i].grid.obj == grid.obj for i in range(1, n)])
    cb = lattices[0].checkerboard()
    assert all([lattices[i].checkerboard() is cb for i in range(1, n)])
    otype = lattices[0].otype
    assert all(
        [lattices[i].otype.__name__ == otype.__name__ for i in range(1, n)])

    l = [gpt.lattice(split_grid, otype) for i in range(N)]

    for x in l:
        x.checkerboard(cb)
        x.split_lcoor = lcoor
        x.split_gcoor = gcoor
    sranks = split_grid.sranks
    srank = split_grid.srank

    src_data = lattices
    dst_data = l

    # build views
    if cache is None:
        cache = {}

    cache_key = f"split_plan_{lattices[0].grid.obj}_{l[0].grid.obj}_{lattices[0].otype.__name__}_{l[0].otype.__name__}_{n}_{N}"
    if cache_key not in cache:
        plan = gpt.copy_plan(dst_data,
                             src_data,
                             embed_in_communicator=lattices[0].grid)
        i = srank // (sranks // Q)
        for x in lattices[i * N:(i + 1) * N]:
            plan.source += x.view[gcoor]
        for x in l:
            plan.destination += x.view[lcoor]
        cache[cache_key] = plan()

    cache[cache_key](dst_data, src_data)

    return l
Example #20
    g.message(f"Test {lhs.otype.__name__}")

    # warmup
    g.copy(lhs, rhs)

    t0 = g.time()
    for n in range(N):
        g.copy(lhs, rhs)
    t1 = g.time()
    g.message("%-50s %g GB/s" % ("copy:", GB / (t1 - t0)))

    pos = g.coordinates(lhs)

    # create plan during first assignment, exclude from benchmark
    plan = g.copy_plan(lhs, rhs)
    plan.destination += lhs.view[pos]
    plan.source += rhs.view[pos]
    plan = plan()
    # plan_info = plan.info()[0, 0][0, 0]
    # block_size = plan_info["size"] // plan_info["blocks"]
    # g.message(" " * 51 + f"block_size = {block_size}")

    # warmup
    plan(lhs, rhs)

    t0 = g.time()
    for n in range(N):
        plan(lhs, rhs)
    t1 = g.time()
    g.message("%-50s %g GB/s %g s" % ("copy_plan:", GB / (t1 - t0),
Example #21
def merge(lattices, dimension=-1, N=-1):

    # if only one lattice is given, return immediately
    if type(lattices) != list:
        return lattices

    # number of lattices
    n = len(lattices)
    assert n > 0

    # number of batches
    if N == -1:
        N = n
    batches = n // N
    assert n % N == 0

    # all grids need to be the same
    grid = lattices[0].grid
    assert all([lattices[i].grid.obj == grid.obj for i in range(1, n)])

    # allow negative indexing
    if dimension < 0:
        dimension += grid.nd + 1
        assert dimension >= 0
    else:
        assert dimension <= grid.nd

    # infer checkerboarding of new dimension
    cb = [x.checkerboard() for x in lattices]
    if cb[0] is gpt.none:
        assert all([x is gpt.none for x in cb[1:]])
        cb_mask = 0
    else:
        assert all([
            cb[j * N + i] is cb[j * N + i + 1].inv() for i in range(N - 1)
            for j in range(batches)
        ])
        cb_mask = 1

    # otypes must be consistent
    otype = lattices[0].otype
    assert all(
        [lattices[i].otype.__name__ == otype.__name__ for i in range(1, n)])

    # create merged grid
    merged_grid = grid.inserted_dimension(dimension, N, cb_mask=cb_mask)

    # create merged lattices and set checkerboard
    merged_lattices = [gpt.lattice(merged_grid, otype) for i in range(batches)]
    for x in merged_lattices:
        x.checkerboard(cb[0])

    # coordinates of source lattices
    gcoor_zero = gpt.coordinates(lattices[0])
    gcoor_one = gpt.coordinates(
        lattices[1]) if N > 1 and cb_mask == 1 else gcoor_zero
    gcoor = [gcoor_zero, gcoor_one]

    # data transfer
    for i in range(N):
        merged_gcoor = cgpt.coordinates_inserted_dimension(
            gcoor[i % 2], dimension, [i])

        plan = gpt.copy_plan(
            merged_lattices[0],
            lattices[i],
            embed_in_communicator=merged_lattices[0].grid,
        )
        plan.destination += merged_lattices[0].view[merged_gcoor]
        plan.source += lattices[i].view[gcoor[i % 2]]
        plan = plan()

        for j in range(batches):
            plan(merged_lattices[j], lattices[j * N + i])

    # if only one batch, remove list
    if len(merged_lattices) == 1:
        return merged_lattices[0]

    # return
    return merged_lattices
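
A final point that merge and separate both exploit: a compiled plan is bound to a layout, not to the specific lattices it was built from, so it can be re-executed on any other pair with the same grids, otypes, and checkerboards. A closing sketch of that reuse, with sizes chosen for illustration:

import gpt as g

grid = g.grid([8, 8, 8, 16], g.double)
srcs = [g.vcolor(grid) for _ in range(4)]
dsts = [g.vcolor(grid) for _ in range(4)]
pos = g.coordinates(srcs[0])

# compile once against a representative pair
plan = g.copy_plan(dsts[0], srcs[0])
plan.destination += dsts[0].view[pos]
plan.source += srcs[0].view[pos]
plan = plan()

# reuse for every identically laid-out pair
for d, s in zip(dsts, srcs):
    plan(d, s)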