示例#1
0
def test_all_counters_parallel_matmul():
    """Check sync, op, and memory-access counters on a tiled parallel matmul."""
    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    # 16x16 work groups, with both input tiles prefetched into local memory.
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    size_n, size_m, size_l = 512, 256, 128
    params = {'n': size_n, 'm': size_m, 'l': size_l}

    # One launch; two local barriers (one per prefetch) per k-tile.
    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2 * size_m / 16

    op_map = lp.get_op_map(knl)
    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    i32ops = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), 'mul')].eval_with_dict(params)

    # One multiply plus one add per (i, j, k) triple.
    assert f32mul + f32add == 2 * size_n * size_m * size_l

    mem_map = lp.get_mem_access_map(knl)

    # Coalesced (stride-1) global loads of both inputs.
    f32coal = sum(
        mem_map[lp.MemAccess('global',
                             np.float32,
                             stride=1,
                             direction='load',
                             variable=var)].eval_with_dict(params)
        for var in ('b', 'a'))

    assert f32coal == size_n * size_m + size_m * size_l

    # Coalesced global stores of the result.
    f32coal = mem_map[lp.MemAccess('global',
                                   np.float32,
                                   stride=1,
                                   direction='store',
                                   variable='c')].eval_with_dict(params)

    assert f32coal == size_n * size_l

    # Every multiply reads both operands from the prefetched local tiles.
    local_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
    local_loads = local_map[lp.MemAccess(
        'local', np.dtype(np.float32),
        direction='load')].eval_with_dict(params)
    assert local_loads == 2 * size_n * size_m * size_l
示例#2
0
def test_mem_access_counter_logic():
    """Count global loads/stores for a kernel with a logical if() expression."""
    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [
        """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
    ],
                         name="logic",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    mem_map = lp.get_mem_access_map(knl)
    dims = {'n': 512, 'm': 256, 'l': 128}

    # Collapse over stride/variable: keep only mtype, dtype, direction.
    by_kind = mem_map.group_by('mtype', 'dtype', 'direction')

    def count(dtype, direction):
        # Total global accesses of one dtype in one direction.
        return by_kind[lp.MemAccess('global', to_loopy_type(dtype),
                                    direction=direction)].eval_with_dict(dims)

    num_n, num_m = dims['n'], dims['m']
    # g is read in both branches of the if(); h only in the else branch.
    assert count(np.float32, 'load') == 2 * num_n * num_m
    assert count(np.float64, 'load') == num_n * num_m
    assert count(np.float64, 'store') == num_n * num_m
示例#3
0
def test_mem_access_counter_specialops():
    """Verify access counts for kernels using %, ** and division."""
    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [
        """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
    ],
                         name="specialops",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    mem_map = lp.get_mem_access_map(knl)
    num_n, num_m, num_l = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'l': num_l}

    def n_accesses(dtype, direction, var):
        # Uniform-stride (stride=0) global accesses for one variable.
        return mem_map[lp.MemAccess('global', dtype, stride=0,
                                    direction=direction,
                                    variable=var)].eval_with_dict(params)

    f32_loads = n_accesses(np.float32, 'load', 'a') \
        + n_accesses(np.float32, 'load', 'b')
    f64_loads = n_accesses(np.dtype(np.float64), 'load', 'g') \
        + n_accesses(np.dtype(np.float64), 'load', 'h')
    assert f32_loads == 2 * num_n * num_m * num_l
    assert f64_loads == 2 * num_n * num_m

    assert n_accesses(np.float32, 'store', 'c') == num_n * num_m * num_l
    assert n_accesses(np.float64, 'store', 'e') == num_n * num_m

    # filter_by keeps only the requested loads; eval_and_sum totals them.
    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'])
    assert filtered_map.eval_and_sum(params) == num_n * num_m * num_l \
        + num_n * num_m
示例#4
0
def test_mem_access_counter_nonconsec():
    """Check symbolic (non-unit) access strides after splitting iname i."""
    knl = lp.make_kernel(
        "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}", [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
        ],
        name="nonconsec",
        assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
    num_n, num_m, num_ell = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'ell': num_ell}

    # Parallelizing i makes the innermost-varying index i, so the 2-d
    # arrays stride by m and the 3-d arrays by m*ell.
    stride_m = Variable('m')
    stride_m_ell = Variable('m') * Variable('ell')

    def count(dtype, stride, direction, var):
        # Accesses with the given symbolic stride for one variable.
        return mem_map[lp.MemAccess('global', dtype, stride=stride,
                                    direction=direction,
                                    variable=var)].eval_with_dict(params)

    f64nonconsec = count(np.float64, stride_m, 'load', 'g') \
        + count(np.float64, stride_m, 'load', 'h')
    f32nonconsec = count(np.dtype(np.float32), stride_m_ell, 'load', 'a') \
        + count(np.dtype(np.float32), stride_m_ell, 'load', 'b')
    assert f64nonconsec == 2 * num_n * num_m
    assert f32nonconsec == 3 * num_n * num_m * num_ell

    assert count(np.float64, stride_m, 'store', 'e') == num_n * num_m
    assert count(np.float32, stride_m_ell, 'store', 'c') \
        == num_n * num_m * num_ell
示例#5
0
def test_mem_access_counter_consec():
    """Check unit-stride (consecutive) access counts with k tagged l.0."""
    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
                         ],
                         name="consec",
                         assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    # k varies fastest across work items -> unit stride for every array.
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    mem_map = lp.get_mem_access_map(knl)
    num_n, num_m, num_l = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'l': num_l}

    def unit_stride(dtype, direction, var):
        # Stride-1 global accesses for one variable.
        return mem_map[lp.MemAccess('global', dtype, stride=1,
                                    direction=direction,
                                    variable=var)].eval_with_dict(params)

    f64consec = unit_stride(np.float64, 'load', 'g') \
        + unit_stride(np.float64, 'load', 'h')
    f32consec = unit_stride(np.float32, 'load', 'a') \
        + unit_stride(np.dtype(np.float32), 'load', 'b')
    assert f64consec == 2 * num_n * num_m
    assert f32consec == 3 * num_n * num_m * num_l

    assert unit_stride(np.float64, 'store', 'e') == num_n * num_m
    assert unit_stride(np.float32, 'store', 'c') == num_n * num_m * num_l
示例#6
0
def test_mem_access_counter_basic():
    """Count stride-0 global loads/stores for a simple two-statement kernel."""
    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
                         ],
                         name="basic",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    mem_map = lp.get_mem_access_map(knl)
    num_n, num_m, num_l = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'l': num_l}

    def count(dtype, direction, var):
        # Stride-0 (no parallel inames) global accesses of one variable.
        return mem_map[lp.MemAccess('global', dtype, stride=0,
                                    direction=direction,
                                    variable=var)].eval_with_dict(params)

    # a is read twice, b once; g and h once each.
    assert count(np.float32, 'load', 'a') + count(np.float32, 'load', 'b') \
        == 3 * num_n * num_m * num_l
    assert count(np.float64, 'load', 'g') + count(np.float64, 'load', 'h') \
        == 2 * num_n * num_m

    assert count(np.dtype(np.float32), 'store', 'c') == num_n * num_m * num_l
    assert count(np.dtype(np.float64), 'store', 'e') == num_n * num_m
示例#7
0
def test_mem_access_counter_reduction():
    """Count subgroup-granularity accesses for a matmul reduction kernel."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    subgroup_size = 32
    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)

    num_n, num_m, num_ell = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'ell': num_ell}

    # Single one-work-item group -> one (partial) subgroup.
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)

    def per_subgroup(dtype, direction, var):
        # Accesses counted once per subgroup (empty lid/gid strides).
        return mem_map[lp.MemAccess('global', dtype,
                                    lid_strides={}, gid_strides={},
                                    direction=direction, variable=var,
                                    count_granularity=CG.SUBGROUP)
                       ].eval_with_dict(params)

    f32l = per_subgroup(np.float32, 'load', 'a') \
        + per_subgroup(np.float32, 'load', 'b')

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32l == (2*num_n*num_m*num_ell)*n_workgroups*subgroups_per_group

    f32s = per_subgroup(np.dtype(np.float32), 'store', 'c')

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32s == (num_n*num_ell)*n_workgroups*subgroups_per_group

    # Every access is a 4-byte float32, so bytes = 4 * access count.
    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                 ).to_bytes().eval_and_sum(params)
    assert ld_bytes == 4*f32l
    assert st_bytes == 4*f32s
示例#8
0
def test_mem_access_counter_bitwise():
    """Count int32 accesses in a kernel full of bitwise operators."""
    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [
        """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
    ],
                         name="bitwise",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.int32, b=np.int32, g=np.int32, h=np.int32))

    mem_map = lp.get_mem_access_map(knl)
    num_n, num_m, num_l = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'l': num_l}

    def i32_count(direction, var):
        # Stride-0 int32 global accesses of one variable.
        return mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0,
                                    direction=direction,
                                    variable=var)].eval_with_dict(params)

    # g and h are each read twice in the second statement (n*m domain);
    # a and b once each in the first (n*m*l domain).
    loads = sum(i32_count('load', var) for var in ('a', 'b', 'g', 'h'))
    assert loads == 4 * num_n * num_m + 2 * num_n * num_m * num_l

    stores = i32_count('store', 'c') + i32_count('store', 'e')
    assert stores == num_n * num_m + num_n * num_m * num_l
示例#9
0
def test_mem_access_counter_logic():
    """Count subgroup-granularity accesses for a kernel with if() logic."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell,
                    g[i,k]*2,
                    g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    subgroup_size = 32
    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)

    num_n, num_m, num_ell = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'ell': num_ell}

    # Single one-work-item group -> one (partial) subgroup.
    n_subgroups = 1 * div_ceil(1, subgroup_size)

    # Collapse over stride/variable: keep only mtype, dtype, direction.
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    def grouped(dtype, direction):
        return reduced_map[lp.MemAccess('global', to_loopy_type(dtype),
                                        direction=direction)
                           ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert grouped(np.float32, 'load') == (2*num_n*num_m)*n_subgroups
    assert grouped(np.float64, 'load') == (num_n*num_m)*n_subgroups
    assert grouped(np.float64, 'store') == (num_n*num_m)*n_subgroups
示例#10
0
def test_mem_access_counter_reduction():
    """Count stride-0 loads/stores for a plain matmul reduction."""
    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    mem_map = lp.get_mem_access_map(knl)
    num_n, num_m, num_l = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'l': num_l}

    def f32_count(dtype, direction, var):
        # Stride-0 global accesses of one variable.
        return mem_map[lp.MemAccess('global', dtype, stride=0,
                                    direction=direction,
                                    variable=var)].eval_with_dict(params)

    f32l = f32_count(np.float32, 'load', 'a') \
        + f32_count(np.float32, 'load', 'b')
    assert f32l == 2 * num_n * num_m * num_l

    # c is written once per (i, j) after the k-reduction.
    f32s = f32_count(np.dtype(np.float32), 'store', 'c')
    assert f32s == num_n * num_l

    # All accesses are 4-byte float32, so bytes = 4 * access count.
    loaded = mem_map.filter_by(mtype=['global'], direction=['load'])
    stored = mem_map.filter_by(mtype=['global'], direction=['store'])
    assert loaded.to_bytes().eval_and_sum(params) == 4 * f32l
    assert stored.to_bytes().eval_and_sum(params) == 4 * f32s
示例#11
0
def test_summations_and_filters():
    """Exercise count-map aggregation: filter_by, group_by, to_bytes,
    eval_and_sum, and filter_by_func on op and memory-access maps."""

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                    dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    # Single one-work-item group -> one (partial) subgroup.
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
                                count_granularity=[CG.SUBGROUP]
                                ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    # 'a' appears twice in the first statement, hence the factor of 2.
    assert loads_a == (2*n*m*ell)*n_subgroups

    global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                      count_granularity=[CG.SUBGROUP]
                                      ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert global_stores == (n*m*ell + n*m)*n_subgroups

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    # float32 accesses are 4 bytes each, float64 accesses 8 bytes.
    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
                          ].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                          ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32lall == (3*n*m*ell)*n_subgroups
    assert f64lall == (2*n*m)*n_subgroups

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    #for k, v in op_map.items():
    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)

    # Collapse the op map down to dtype alone.
    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n*m*ell*3
    assert f64 == n*m
    assert i32 == n*m*2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n*m*ell + n*m*2
    assert f32ops_all == n*m*ell*3

    # Filtering on a nonexistent field matches nothing and sums to zero.
    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n*m*ell + n*m
    assert f64ops_all == n*m

    def func_filter(key):
        # Keep only f64 loads whose index has no local-id dependence.
        return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \
               key.direction == 'load'
    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups
示例#12
0
# peek at generated code
# NOTE(review): `knl`, `queue`, and the *_vec_* arrays come from earlier in
# the script (outside this excerpt) — this call reuses the previous kernel.
evt, (out, ) = knl(queue, a=x_vec_host)

knl = lp.make_kernel("{ [i]: 0<=i<n }", "a[i] = 0", assumptions="n>=1")
knl = lp.split_iname(knl, "i", 16)  # split loop variable
knl = lp.prioritize_loops(knl, "i_outer,i_inner")
knl = lp.set_options(knl, "write_cl")  # print generated device code on run
evt, (out, ) = knl(queue, a=x_vec_dev)

knl = lp.make_kernel("{ [i]: 0<=i<n }",
                     "a[i] = a[i] * b[i] + c[i]",
                     assumptions="n>=0 and n mod 4 = 0")
orig_knl = knl  # copy kernel, test assumptions, and unrolling
knl = lp.split_iname(knl, "i", 4)
knl = lp.tag_inames(knl, dict(i_inner="unr"))  # fully unroll the inner loop
knl = lp.prioritize_loops(knl, "i_outer,i_inner")
knl = lp.set_options(knl, "write_cl")
evt, (out, ) = knl(queue, a=x_vec_dev, b=y_vec_dev, c=z_vec_dev)

from warnings import resetwarnings, filterwarnings
resetwarnings()  # suppress some warnings during stats gathering
filterwarnings('ignore', category=Warning)

# Dtypes must be known before the counters can classify operations.
knl = lp.add_and_infer_dtypes(knl,
                              dict(a=np.float32, b=np.float32, c=np.float32))
op_map = lp.get_op_map(knl)  # get operations counting
print(lp.stringify_stats_mapping(op_map))

mem_map = lp.get_mem_access_map(knl)  # get memory access(load, store) counting
print(lp.stringify_stats_mapping(mem_map))
示例#13
0
    def _cache_kernel_stats(self, t_unit: lp.TranslationUnit, kwargs: dict) \
      -> tuple:
        """Generate the kernel stats for a program with its args.

        Returns the hashable args tuple that keys the cached entry in
        ``self.kernel_stats[t_unit]``; on a cache miss the flop/byte stats
        are computed and stored first.
        """
        # Key array arguments by shape (not contents) so calls with
        # same-shaped data share one cache entry.
        args_tuple = tuple(
            (key, value.shape) if hasattr(value, "shape") else (key, value)
            for key, value in kwargs.items())

        # Are kernel stats already in the cache?
        try:
            self.kernel_stats[t_unit][args_tuple]
            return args_tuple
        except KeyError:
            # If not, calculate and cache the stats
            ep_name = t_unit.default_entrypoint.name
            executor = t_unit.target.get_kernel_executor(t_unit,
                                                         self.queue,
                                                         entrypoint=ep_name)
            info = executor.translation_unit_info(
                ep_name, executor.arg_to_dtype_set(kwargs))

            # Specialize to the caller's dtypes so the counters see a fully
            # typed, scheduled kernel.
            typed_t_unit = executor.get_typed_and_scheduled_translation_unit(
                ep_name, executor.arg_to_dtype_set(kwargs))
            kernel = typed_t_unit[ep_name]

            idi = info.implemented_data_info

            # Give every kernel argument an entry (None for those the caller
            # did not supply) so the generated wrapper can be invoked.
            param_dict = kwargs.copy()
            param_dict.update({
                k: None
                for k in kernel.arg_dict.keys() if k not in param_dict
            })

            param_dict.update(
                {d.name: None
                 for d in idi if d.name not in param_dict})

            # Generate the wrapper code
            wrapper = executor.get_wrapper_generator()

            gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                          list(param_dict))

            # Emit code that derives the integer domain parameters from the
            # shapes/offsets/strides of the passed arrays.
            wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
            wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
            wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

            param_names = kernel.all_params()
            gen("return {%s}" % ", ".join(f"{repr(name)}: {name}"
                                          for name in param_names))

            # Run the wrapper code, save argument values in domain_params
            domain_params = gen.get_picklable_function()(**param_dict)

            # Get flops/memory statistics
            op_map = lp.get_op_map(typed_t_unit, subgroup_size="guess")
            bytes_accessed = lp.get_mem_access_map(
                typed_t_unit, subgroup_size="guess") \
                            .to_bytes().eval_and_sum(domain_params)

            # Only floating-point ops count toward the flop total.
            flops = op_map.filter_by(
                dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

            # Footprint gathering is not yet available in loopy with
            # kernel callables:
            # https://github.com/inducer/loopy/issues/399
            if 0:
                try:
                    footprint = lp.gather_access_footprint_bytes(typed_t_unit)
                    footprint_bytes = sum(
                        footprint[k].eval_with_dict(domain_params)
                        for k in footprint)

                except lp.symbolic.UnableToDetermineAccessRange:
                    footprint_bytes = None
            else:
                footprint_bytes = None

            # time=0: execution timing is recorded elsewhere; this entry
            # only carries the static counts.
            res = SingleCallKernelProfile(time=0,
                                          flops=flops,
                                          bytes_accessed=bytes_accessed,
                                          footprint_bytes=footprint_bytes)

            self.kernel_stats.setdefault(t_unit, {})[args_tuple] = res

            if self.logmgr:
                if f"{ep_name}_time" not in self.logmgr.quantity_data:
                    self.logmgr.add_quantity(KernelProfile(self, ep_name))

            return args_tuple
示例#14
0
def test_mem_access_counter_consec():
    """Check lid/gid stride classification with every iname parallelized."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
            ],
            name="consec", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(
                a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    # k varies fastest across work items -> unit local stride everywhere.
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size='guess')
    num_n, num_m, num_ell = 512, 256, 128
    params = {'n': num_n, 'm': num_m, 'ell': num_ell}

    # Group-id strides: 2-d arrays stride by m; 3-d arrays by m*ell and m.
    f64_gid = {0: Variable('m')}
    f32_gid = {0: Variable('m')*Variable('ell'), 1: Variable('m')}

    def per_workitem(dtype, gid_strides, direction, var):
        # Per-work-item counted accesses with unit local stride.
        return mem_map[lp.MemAccess(
                        'global', dtype,
                        lid_strides={0: 1}, gid_strides=gid_strides,
                        direction=direction, variable=var,
                        count_granularity=CG.WORKITEM)
                        ].eval_with_dict(params)

    f64consec = per_workitem(np.float64, f64_gid, 'load', 'g') \
        + per_workitem(np.float64, f64_gid, 'load', 'h')
    f32consec = per_workitem(np.float32, f32_gid, 'load', 'a') \
        + per_workitem(np.dtype(np.float32), f32_gid, 'load', 'b')
    assert f64consec == 2*num_n*num_m*num_ell
    assert f32consec == 3*num_n*num_m*num_ell

    assert per_workitem(np.float64, f64_gid, 'store', 'e') \
        == num_n*num_m*num_ell
    assert per_workitem(np.float32, f32_gid, 'store', 'c') \
        == num_n*num_m*num_ell
示例#15
0
def test_mem_access_counter_bitwise():
    """Count global int32 loads/stores for a kernel made of bitwise ops."""

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int32, h=np.int32))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    # a single work-group holding one work-item -> one (partial) sub-group
    n_subgroups = 1 * div_ceil(1, SGS)

    def access_count(direction, variable):
        # every access in this kernel is thread-uniform (empty stride dicts)
        return mem_map[lp.MemAccess('global', np.int32,
                            lid_strides={}, gid_strides={},
                            direction=direction, variable=variable,
                            count_granularity=CG.SUBGROUP)
                       ].eval_with_dict(params)

    i32 = sum(access_count('load', var) for var in ('a', 'b', 'g', 'h'))

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (4*n*m+2*n*m*ell)*n_subgroups

    i32 = sum(access_count('store', var) for var in ('c', 'e'))

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (n*m+n*m*ell)*n_subgroups
示例#16
0
def test_mem_access_counter_basic():
    """Count per-dtype global loads and stores on an unparallelized kernel."""

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                    dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    # a single work-group holding one work-item -> one (partial) sub-group
    n_subgroups = 1 * div_ceil(1, SGS)

    def access_count(dtype, direction, variable):
        # all accesses here are thread-uniform (empty stride dicts)
        return mem_map[lp.MemAccess('global', dtype,
                            lid_strides={}, gid_strides={},
                            direction=direction, variable=variable,
                            count_granularity=CG.SUBGROUP)
                       ].eval_with_dict(params)

    f32l = access_count(np.float32, 'load', 'a') \
        + access_count(np.float32, 'load', 'b')
    f64l = access_count(np.float64, 'load', 'g') \
        + access_count(np.float64, 'load', 'h')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32l == (3*n*m*ell)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups

    f32s = access_count(np.dtype(np.float32), 'store', 'c')
    f64s = access_count(np.dtype(np.float64), 'store', 'e')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s == (n*m*ell)*n_subgroups
    assert f64s == (n*m)*n_subgroups
示例#17
0
def test_mem_access_counter_mixed():
    """Count uniform and nonconsecutive global accesses (legacy stride API)."""

    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
                         ],
                         name="mixed",
                         assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
        knl,
        dict(a=np.float32,
             b=np.float32,
             g=np.float64,
             h=np.float64,
             x=np.float32))
    threads = 16
    knl = lp.split_iname(knl, "j", threads)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl)  # noqa
    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}

    def access_count(dtype, stride, direction, variable):
        return mem_map[lp.MemAccess('global', dtype, stride=stride,
                                    direction=direction, variable=variable)
                       ].eval_with_dict(params)

    f64uniform = access_count(np.float64, 0, 'load', 'g') \
        + access_count(np.float64, 0, 'load', 'h')
    f32uniform = access_count(np.float32, 0, 'load', 'x')
    f32nonconsec = access_count(np.dtype(np.float32), Variable('m'),
                                'load', 'a') \
        + access_count(np.dtype(np.float32), Variable('m'), 'load', 'b')
    assert f64uniform == 2 * n * m
    assert f32uniform == n * m * l / threads
    assert f32nonconsec == 3 * n * m * l

    f64uniform = access_count(np.float64, 0, 'store', 'e')
    f32nonconsec = access_count(np.float32, Variable('m'), 'store', 'c')
    assert f64uniform == n * m
    assert f32nonconsec == n * m * l
示例#18
0
def test_mem_access_counter_specialops():
    """Count accesses in a kernel that uses modulo and power operations."""

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    # a single work-group holding one work-item -> one (partial) sub-group
    n_subgroups = 1 * div_ceil(1, SGS)

    def access_count(dtype, direction, variable):
        # all accesses here are thread-uniform (empty stride dicts)
        return mem_map[lp.MemAccess('global', dtype,
                            lid_strides={}, gid_strides={},
                            direction=direction, variable=variable,
                            count_granularity=CG.SUBGROUP)
                       ].eval_with_dict(params)

    f32 = access_count(np.float32, 'load', 'a') \
        + access_count(np.float32, 'load', 'b')
    f64 = access_count(np.dtype(np.float64), 'load', 'g') \
        + access_count(np.dtype(np.float64), 'load', 'h')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (2*n*m*ell)*n_subgroups
    assert f64 == (2*n*m)*n_subgroups

    f32 = access_count(np.float32, 'store', 'c')
    f64 = access_count(np.float64, 'store', 'e')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (n*m*ell)*n_subgroups
    assert f64 == (n*m)*n_subgroups

    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
                         count_granularity=CG.SUBGROUP)
    tot = filtered_map.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert tot == (n*m*ell + n*m)*n_subgroups
示例#19
0
def test_summations_and_filters():
    """Exercise filter_by, group_by, to_bytes and filter_by_func on count maps."""

    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
                         ],
                         name="basic",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}

    mem_map = lp.get_mem_access_map(knl)

    loads_a = mem_map.filter_by(direction=['load'],
                                variable=['a']).eval_and_sum(params)
    assert loads_a == 2 * n * m * l

    global_stores = mem_map.filter_by(mtype=['global'],
                                      direction=['store']).eval_and_sum(params)
    assert global_stores == n * m * l + n * m

    def global_bytes(direction):
        # total byte traffic for global accesses in the given direction
        return mem_map.filter_by(
            mtype=['global'], direction=[direction]
            ).to_bytes().eval_and_sum(params)

    assert global_bytes('load') == 4 * n * m * l * 3 + 8 * n * m * 2
    assert global_bytes('store') == 4 * n * m * l + 8 * n * m

    # collapse stride and variable-name distinctions
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32lall = reduced_map[lp.MemAccess(
        'global', np.float32, direction='load')].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess(
        'global', np.float64, direction='load')].eval_with_dict(params)
    assert f32lall == 3 * n * m * l
    assert f64lall == 2 * n * m

    op_map = lp.get_op_map(knl)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n * m * l * 3
    assert f64 == n * m
    assert i32 == n * m * 2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n * m * l + n * m * 2
    assert f32ops_all == n * m * l * 3

    # filtering on a nonexistent field matches nothing
    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n * m * l + n * m
    assert f64ops_all == n * m

    def func_filter(key):
        # select thread-uniform (stride < 1) float64 loads
        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
               key.direction == 'load'

    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
    assert s1f64l == 2 * n * m
示例#20
0
def test_mem_access_counter_mixed():
    """Count uniform vs. nonconsecutive accesses with a parallel j-split."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
            ],
            name="mixed", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(
                a=np.float32, b=np.float32, g=np.float64, h=np.float64,
                x=np.float32))

    group_size_0 = 65

    knl = lp.split_iname(knl, "j", group_size_0)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = div_ceil(ell, group_size_0)
    subgroups_per_group = div_ceil(group_size_0, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    def uniform_count(dtype, direction, variable):
        # thread-uniform access, counted once per sub-group
        return mem_map[lp.MemAccess('global', dtype,
                                lid_strides={}, gid_strides={},
                                direction=direction, variable=variable,
                                count_granularity=CG.SUBGROUP)
                       ].eval_with_dict(params)

    def nonconsec_count(dtype, direction, variable):
        # work-item index strides by m -> counted per work-item
        return mem_map[lp.MemAccess('global', dtype,
                                lid_strides={0: Variable('m')},
                                gid_strides={0: Variable('m')*group_size_0},
                                direction=direction, variable=variable,
                                count_granularity=CG.WORKITEM)
                       ].eval_with_dict(params)

    f64uniform = uniform_count(np.float64, 'load', 'g') \
        + uniform_count(np.float64, 'load', 'h')
    f32uniform = uniform_count(np.float32, 'load', 'x')
    f32nonconsec = nonconsec_count(np.dtype(np.float32), 'load', 'a') \
        + nonconsec_count(np.dtype(np.float32), 'load', 'b')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == (2*n*m)*n_subgroups
    assert f32uniform == (m*n)*n_subgroups

    import islpy as isl
    # without isl's card() feature, loopy falls back to a coarser count
    expect_fallback = not hasattr(isl.BasicSet, "card")

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == 3*n*m*ell*n_workgroups
        else:
            assert f32nonconsec == 3*n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == 3*n*m*ell

    f64uniform = uniform_count(np.float64, 'store', 'e')
    f32nonconsec = nonconsec_count(np.float32, 'store', 'c')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == m*n*n_subgroups

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == n*m*ell*n_workgroups
        else:
            assert f32nonconsec == n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == n*m*ell
示例#21
0
    if mesh.layers:
        cells = cells * (mesh.layers - 1)
    print("CELLS= {0}".format(cells))
    print("DOFS= {0}".format(dofs))

    from loopy.program import make_program

    knl = compile_form(y_form, coffee=False)[0].ast
    warnings = list(knl.silenced_warnings)
    warnings.extend(["insn_count_subgroups_upper_bound", "no_lid_found"])
    knl = knl.copy(silenced_warnings=warnings)
    knl.options.ignore_boostable_into = True

    program = make_program(knl)
    op_map = lp.get_op_map(program, subgroup_size=1)
    mem_map = lp.get_mem_access_map(program, subgroup_size=1)

    for op in ['add', 'sub', 'mul', 'div']:
        print("{0}S= {1}".format(
            op.upper(),
            op_map.filter_by(name=[op], dtype=[np.float64]).eval_and_sum({})))
    print("MEMS= {0}".format(
        mem_map.filter_by(mtype=['global'],
                          dtype=[np.float64]).eval_and_sum({})))
    print("INSTRUCTIONS= {0:d}".format(len(knl.instructions)))
    print("LOOPS= {0:d}".format(len(knl.all_inames())))
    for domain in knl.domains:
        if domain.get_dim_name(3, 0)[0] == "j":
            print("DOF_LOOP_EXTENT= {0:d}".format(
                int(domain.dim_max_val(0).to_str()) + 1))
            break
示例#22
0
def test_all_counters_parallel_matmul():
    """Sync, op and memory-access counting on a blocked, prefetched matmul."""
    bsize = 16
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(bsize*bsize, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    def op_count(dtype, name):
        return op_map[lp.Op(dtype, name, CG.SUBGROUP)].eval_with_dict(params)

    f32mul = op_count(np.float32, 'mul')
    f32add = op_count(np.float32, 'add')
    i32ops = op_count(np.int32, 'add')
    i32ops += op_count(np.dtype(np.int32), 'mul')

    # (count-per-sub-group)*n_subgroups
    assert f32mul+f32add == m*2*n_subgroups

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={1: bsize},
                             direction='load', variable='b',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('m')},
                             gid_strides={0: Variable('m')*bsize},
                             direction='load', variable='a',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    # prefetching means each element is loaded once per k-block
    assert f32s1lb == n*m*ell/bsize
    assert f32s1la == n*m*ell/bsize

    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
                             direction='store', variable='c',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    assert f32coal == n*ell

    local_mem_map = lp.get_mem_access_map(knl,
                        count_redundant_work=True,
                        subgroup_size=SGS).filter_by(mtype=['local'])

    local_mem_l = local_mem_map.filter_by(
        direction=['load']).eval_and_sum(params)
    # (count-per-sub-group)*n_subgroups
    assert local_mem_l == m*2*n_subgroups

    def local_load_count(variable, lid_strides):
        return local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                          direction='load',
                                          lid_strides=lid_strides,
                                          gid_strides={},
                                          variable=variable,
                                          count_granularity=CG.SUBGROUP)
                             ].eval_with_dict(params)

    local_mem_l_a = local_load_count('a_fetch', {1: 16})
    local_mem_l_b = local_load_count('b_fetch', {0: 1})

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l_a == local_mem_l_b == m*n_subgroups

    local_mem_s = local_mem_map.filter_by(
        direction=['store']).eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_s == m*2/bsize*n_subgroups
示例#23
0
def test_mem_access_tagged_variables():
    """Count accesses distinguished by variable tags (a$mmaload etc.)."""
    bsize = 16
    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        ["c$mmresult[i, j] = sum(k, a$mmaload[i, k]*b$mmbload[k, j])"],
        name="matmul",
        assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}
    n_workgroups = div_ceil(n, bsize) * div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(bsize * bsize, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    mem_access_map = lp.get_mem_access_map(knl,
                                           count_redundant_work=True,
                                           subgroup_size=SGS)

    def access_count(**access_kwargs):
        # all counted accesses in this kernel are global float32
        return mem_access_map[
            lp.MemAccess('global', np.float32, **access_kwargs)
            ].eval_with_dict(params)

    f32s1lb = access_count(
        lid_strides={0: 1},
        gid_strides={1: bsize},
        direction='load',
        variable='b',
        variable_tag='mmbload',
        count_granularity=CG.WORKITEM)
    f32s1la = access_count(
        lid_strides={1: Variable('m')},
        gid_strides={0: Variable('m') * bsize},
        direction='load',
        variable='a',
        variable_tag='mmaload',
        count_granularity=CG.SUBGROUP)

    assert f32s1lb == n * m * ell

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s1la == m * n_subgroups

    f32coal = access_count(
        lid_strides={0: 1, 1: Variable('ell')},
        gid_strides={0: Variable('ell') * bsize, 1: bsize},
        direction='store',
        variable='c',
        variable_tag='mmresult',
        count_granularity=CG.WORKITEM)

    assert f32coal == n * ell
示例#24
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    """Fuse the r- and s-direction GNUMA strong-volume kernels and validate
    a staged optimization pipeline against the unoptimized reference.

    The Fortran source is parsed into two kernels which are fused, then a
    sequence of loopy transformations is applied.  After each stage the
    kernel is snapshotted into ``tap_hsv`` when ``opt_level`` selects that
    stage; the snapshot is what ultimately gets auto-tested against the
    reference kernel.

    :arg ctx_factory: callable producing a PyOpenCL context (pytest fixture).
    :arg ilp_multiple: ILP factor; values > 1 split the "k" iname and tag
        the inner piece "ilp".
    :arg Nq: number of quadrature points, fixed into the kernel.
    :arg opt_level: index of the optimization stage to snapshot and test.
    """
    pytest.importorskip("fparser")
    ctx = ctx_factory()

    # Read the Fortran source and force single precision.
    filename = os.path.join(os.path.dirname(__file__),
                            "strongVolumeKernels.f90")
    with open(filename) as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    program = lp.parse_fortran(source, filename, seq_dependencies=False)

    hsv_r, hsv_s = program["strongVolumeKernelR"], program[
        "strongVolumeKernelS"]

    # Tag each kernel's instructions so they can be addressed separately
    # after fusion.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s
    # The two fused halves write rhsQ independently; suppress the
    # write-write dependency between them.
    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    # Basic parallelization: elements over groups, (i, j) over work items.
    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    from loopy.frontend.fortran.translator import specialize_fortran_division
    hsv = specialize_fortran_division(hsv)

    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Keep the unoptimized kernel as the correctness reference.
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    # Stage 1: prefetch the differentiation matrix D once per element.
    hsv = lp.add_prefetch(hsv,
                          "D[:,:]",
                          fetch_outer_inames="e",
                          default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Temporaries created for the r- and s-direction flux stores; collected
    # so that each group can later be aliased into one allocation.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Stage 2: for each matched pair of r/s flux instructions, convert the
    # flux assignment into a subst rule and precompute it into a temporary.
    # Note the flipped precompute inames ("jj","ii") vs ("ii","jj") between
    # the r and s kernels.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction in this half-kernel reads the flux;
            # it determines where the "n" iname gets renamed below.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            # Transpose the storage layout for the s-direction stores.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give each flux its own copy of the reduction iname "n"
            # (e.g. n_rhsu) within its reader instruction.
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # Share storage among the r-direction temporaries and, separately,
    # among the s-direction temporaries.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Stage 3: precompute the remaining local-prep substitutions
    # (skipping Jinv* and the s-direction variants).
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                            lp.find_one_rule_matching(
                                hsv, prep_var_name + "_*subst*"),
                            default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    # Stage 4: prefetch the state array Q for the current work item.
    hsv = lp.add_prefetch(hsv,
                          "Q[ii,jj,k,:,:,e]",
                          sweep_inames=ilp_inames,
                          default_tag="l.auto")

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # Stage 5: accumulate rhsQ in a buffer, initialized to 0 and written
    # back as base + buffer.
    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # Stage 7: switch the inner field axes from unrolled to vectorized.
    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Test the snapshot selected by opt_level, not the fully optimized
    # kernel.
    hsv = tap_hsv

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero", "-cl-fast-relaxed-math",
                             "-cl-finite-math-only", "-cl-mad-enable",
                             "-cl-no-signed-zeros"
                         ])

    # Print op and memory-access statistics for manual inspection.
    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv, subgroup_size=32)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    # FIXME: renaming's a bit tricky in this program model.
    # add a simple transformation for it
    # hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
# ---- Example 25 (scraped-sample separator) ----
    def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \
      -> tuple:
        """Generate the kernel stats for a program with its args.

        Builds a cache key from the argument names and shapes in *kwargs*,
        probes ``self.kernel_stats``, and on a miss computes flop/byte
        statistics for *program* and stores them under the key.  Returns
        the cache key in either case.
        """
        # Key on (name, shape) for array-like args, (name, value) otherwise.
        args_tuple = tuple(
            (name, arg.shape) if hasattr(arg, "shape") else (name, arg)
            for name, arg in kwargs.items())

        # EAFP cache probe: a KeyError (unknown program or unknown args)
        # means we have to compute the stats below.
        try:
            self.kernel_stats[program][args_tuple]
        except KeyError:
            pass
        else:
            return args_tuple

        executor = program.target.get_kernel_executor(program, self.queue)
        info = executor.kernel_info(executor.arg_to_dtype_set(kwargs))

        kernel = executor.get_typed_and_scheduled_kernel(
            executor.arg_to_dtype_set(kwargs))

        idi = info.implemented_data_info

        # dtype-carrying arguments (object arrays excluded) feed type
        # inference further down.
        types = {}
        for name, arg in kwargs.items():
            if hasattr(arg, "dtype") and not arg.dtype == object:
                types[name] = arg

        # Every kernel argument and every implemented-data entry must have
        # an entry in param_dict; fill missing ones with None.
        param_dict = kwargs.copy()
        for arg_name in kernel.arg_dict.keys():
            if arg_name not in param_dict:
                param_dict[arg_name] = None
        for d in idi:
            if d.name not in param_dict:
                param_dict[d.name] = None

        # Generate the wrapper code that derives integer parameters
        # (shapes, offsets, strides) from the concrete arguments.
        wrapper = executor.get_wrapper_generator()

        gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                      list(param_dict))

        wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

        param_names = program.all_params()
        gen("return {%s}" % ", ".join(f"{name!r}: {name}"
                                      for name in param_names))

        # Run the wrapper code, save argument values in domain_params
        domain_params = gen.get_picklable_function()(**param_dict)

        # Get flops/memory statistics
        kernel = lp.add_and_infer_dtypes(kernel, types)
        op_map = lp.get_op_map(kernel, subgroup_size="guess")
        bytes_accessed = lp.get_mem_access_map(kernel, subgroup_size="guess") \
          .to_bytes().eval_and_sum(domain_params)

        flops = op_map.filter_by(
            dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

        # The access footprint may be undeterminable (e.g. data-dependent
        # indexing); record None in that case.
        try:
            footprint = lp.gather_access_footprint_bytes(kernel)
            footprint_bytes = sum(
                footprint[key].eval_with_dict(domain_params)
                for key in footprint)
        except lp.symbolic.UnableToDetermineAccessRange:
            footprint_bytes = None

        res = ProfileResult(time=0,
                            flops=flops,
                            bytes_accessed=bytes_accessed,
                            footprint_bytes=footprint_bytes)

        self.kernel_stats.setdefault(program, {})[args_tuple] = res
        return args_tuple