Exemplo n.º 1
0
def write_straprim_caller(myfile,
                          index,
                          a_coeffs,
                          b_coeffs,
                          c_coeffs,
                          dims,
                          num_multiplies,
                          level=1):
    """Emit the generated code for one intermediate product ``M<index>``.

    Writes, in order: a ``//`` comment spelling out the product and the
    ``C`` block updates, the three ``write_stra_mat`` operand set-ups, a
    ``straprim_ab`` call, a ``comm.barrier();`` line and a break.

    Args:
        myfile: output handle passed through to the ``write_*`` helpers.
        index: number of the product; used in ``M<index>`` and in the
            ``Av/Bv/Cv<index>`` names.
        a_coeffs, b_coeffs, c_coeffs: coefficient sequences for the A, B
            and C operands; entries rejected by ``is_nonzero`` are left
            out of the comment.
        dims: forwarded to ``getBlockName`` and ``write_stra_mat``.
        num_multiplies: accepted but not used by this function.
        level: forwarded to ``getBlockName`` and ``write_stra_mat``.
    """

    def linear_terms(operand, coeffs):
        # "<coeff> * <block name>" for every coefficient kept by is_nonzero.
        return ' + '.join(
            str(coeff) + ' * %s' % getBlockName(operand, pos, dims, level)
            for pos, coeff in enumerate(coeffs) if is_nonzero(coeff))

    pieces = [
        '// M%d = (' % (index),
        linear_terms(0, a_coeffs),
        ') * (',
        linear_terms(1, b_coeffs),
        '); ',
        '; '.join(
            ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff,
                                 index)
            for pos, coeff in enumerate(c_coeffs) if is_nonzero(coeff)),
        ';',
    ]
    write_line(myfile, 1, ''.join(pieces))

    # Operand id, its coefficients and the extent string handed to
    # write_stra_mat, in A, B, C order.
    operands = (
        (0, a_coeffs, 'ms, ks'),
        (1, b_coeffs, 'ks, ns'),
        (2, c_coeffs, 'ms, ns'),
    )
    for operand, coeffs, extents in operands:
        write_stra_mat(myfile, operand, coeffs, index, extents, dims, level)

    # Earlier revisions emitted stra_gemm / straprim_naive here instead.
    call = 'straprim_ab(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});'
    write_line(myfile, 1, call.format(index))

    write_line(myfile, 1, 'comm.barrier();')

    write_break(myfile)
Exemplo n.º 2
0
def write_divisor_initializer(myfile, dims, level):
    """Emit the ``A_divisor``/``B_divisor``/``C_divisor`` array definitions.

    The three entries of ``exp_dim(dims, level)`` are used as the
    partition counts along m, k and n (assumed ordering -- confirm against
    ``exp_dim``). A is m x k, B is k x n and C is m x n, so the divisors
    are ``{m, k}``, ``{k, n}`` and ``{m, n}`` respectively.

    Fix: ``C_divisor`` used to be written with the same ``{k, n}`` pair as
    ``B_divisor``; it now uses ``{m, n}``. The two only coincide when the
    m and k partition counts are equal.

    Args:
        myfile: output handle passed through to the ``write_*`` helpers.
        dims: forwarded to ``exp_dim``.
        level: forwarded to ``exp_dim``.
    """
    level_dim = exp_dim(dims, level)
    template = 'const std::array<unsigned,2> %s_divisor={%d,%d};'
    # (matrix name, row index into level_dim, column index into level_dim)
    layout = (
        ('A', 0, 1),
        ('B', 1, 2),
        ('C', 0, 2),
    )
    for matrix, row, col in layout:
        write_line(myfile, 1,
                   template % (matrix, level_dim[row], level_dim[col]))
    write_break(myfile)
Exemplo n.º 3
0
def write_straprim_caller(myfile,
                          index,
                          a_coeffs,
                          b_coeffs,
                          c_coeffs,
                          dims,
                          num_multiplies,
                          level=1):
    """Emit the generated code for one intermediate product ``M<index>``.

    Writes, in order: a ``//`` comment spelling out the product and the
    ``C`` block updates, the three ``write_stra_mat`` operand set-ups, an
    ``if``/``else`` that calls ``stra_gemm`` either on transposed views
    with A and B swapped or on the views as they are, a
    ``comm.barrier();`` line, two commented-out debug statements and a
    break.

    Args:
        myfile: output handle; used both through the ``write_*`` helpers
            and directly via ``myfile.write``.
        index: number of the product; used in ``M<index>`` and in the
            ``Av/Bv/Cv<index>`` names.
        a_coeffs, b_coeffs, c_coeffs: coefficient sequences for the A, B
            and C operands; entries rejected by ``is_nonzero`` are left
            out of the comment.
        dims: forwarded to ``getBlockName`` and ``write_stra_mat``.
        num_multiplies: accepted but not used by this function.
        level: forwarded to ``getBlockName`` and ``write_stra_mat``.
    """

    def linear_terms(operand, coeffs):
        # "<coeff> * <block name>" for every coefficient kept by is_nonzero.
        return ' + '.join(
            str(coeff) + ' * %s' % getBlockName(operand, pos, dims, level)
            for pos, coeff in enumerate(coeffs) if is_nonzero(coeff))

    pieces = [
        '// M%d = (' % (index),
        linear_terms(0, a_coeffs),
        ') * (',
        linear_terms(1, b_coeffs),
        '); ',
        '; '.join(
            ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff,
                                 index)
            for pos, coeff in enumerate(c_coeffs) if is_nonzero(coeff)),
        ';',
    ]
    write_line(myfile, 1, ''.join(pieces))

    # Operand id, its coefficients and the pair of index-group labels
    # handed to write_stra_mat, in A, B, C order.
    operands = (
        (0, a_coeffs, ['AC', 'AB']),
        (1, b_coeffs, ['AB', 'BC']),
        (2, c_coeffs, ['AC', 'BC']),
    )
    for operand, coeffs, labels in operands:
        write_stra_mat(myfile, operand, coeffs, index, labels, dims, level)

    # Doubled braces are literal braces in the output; {0} is the index.
    gemm_dispatch = '''\
    if (Cv{0}.stride(!row_major) == 1)
    {{
        Av{0}.transpose();
        Bv{0}.transpose();
        Cv{0}.transpose();
        stra_gemm(comm, cfg, alpha, Bv{0}, Av{0}, beta, Cv{0});
    }} else {{
        stra_gemm(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});
    }}
'''
    myfile.write(gemm_dispatch.format(index))

    write_line(myfile, 1, 'comm.barrier();')

    # Debug output is emitted as C++ comments so it is easy to re-enable
    # in the generated file.
    debug_banner = '//std::cout << "stra_internal/stra_mult_M{0}:" << std::endl;'
    write_line(myfile, 1, debug_banner.format(index))
    write_line(myfile, 1, '//print_tensor_matrix( ct );')

    write_break(myfile)
Exemplo n.º 4
0
def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims, num_multiplies, level=1):
    """Emit the generated code for one intermediate product ``M<index>``.

    Writes, in order: a ``//`` comment spelling out the product and the
    ``C`` block updates, the three ``write_stra_mat`` operand set-ups, a
    ``straprim_naive<T,...>`` call (preceded by a commented-out transposed
    variant), a ``comm.barrier();`` line, two commented-out debug
    statements and a break.

    Args:
        myfile: output handle; used both through the ``write_*`` helpers
            and directly via ``myfile.write``.
        index: number of the product; used in ``M<index>`` and in the
            ``A/B/C<index>_list`` names.
        a_coeffs, b_coeffs, c_coeffs: coefficient sequences for the A, B
            and C operands; entries rejected by ``is_nonzero`` are left
            out of the comment, and ``getNNZ`` of each one becomes a
            template argument of the emitted call.
        dims: forwarded to ``getBlockName`` and ``write_stra_mat``.
        num_multiplies: accepted but not used by this function.
        level: forwarded to ``getBlockName`` and ``write_stra_mat``.
    """

    def linear_terms(operand, coeffs):
        # "<coeff> * <block name>" for every coefficient kept by is_nonzero.
        return ' + '.join(
            str(coeff) + ' * %s' % getBlockName(operand, pos, dims, level)
            for pos, coeff in enumerate(coeffs) if is_nonzero(coeff))

    pieces = [
        '// M%d = (' % (index),
        linear_terms(0, a_coeffs),
        ') * (',
        linear_terms(1, b_coeffs),
        '); ',
        '; '.join(
            ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff,
                                 index)
            for pos, coeff in enumerate(c_coeffs) if is_nonzero(coeff)),
        ';',
    ]
    write_line(myfile, 1, ''.join(pieces))

    # Operand id, its coefficients and the pair of index-group labels
    # handed to write_stra_mat, in A, B, C order.
    operands = (
        (0, a_coeffs, ['AC', 'AB']),
        (1, b_coeffs, ['AB', 'BC']),
        (2, c_coeffs, ['AC', 'BC']),
    )
    for operand, coeffs, labels in operands:
        write_stra_mat(myfile, operand, coeffs, index, labels, dims, level)

    # {0} is the product index, {1}/{2}/{3} the getNNZ counts of A/B/C;
    # doubled braces are literal braces in the output.
    naive_call = '''\
    //if (ct.stride(!row_major) == 1)
    //{{
    //    Av{0}.transpose();
    //    Bv{0}.transpose();
    //    Cv{0}.transpose();
    //    straprim_naive<T,{1},{2},{3}>(comm, cfg, my_sub_len_AB, my_sub_len_AC, my_sub_len_BC,
    //             alpha,
    //             B{0}_list, B{0}_coeff_list, my_stride_B_AB, my_stride_B_BC,
    //             A{0}_list, A{0}_coeff_list, my_stride_A_AB, my_stride_A_AC,
    //             beta,
    //             C{0}_list, C{0}_coeff_list, my_stride_C_AC, my_stride_C_BC);
    //}} else {{
        straprim_naive<T,{1},{2},{3}>(comm, cfg, my_sub_len_AB, my_sub_len_AC, my_sub_len_BC,
                 alpha,
                 A{0}_list, A{0}_coeff_list, my_stride_A_AB, my_stride_A_AC,
                 B{0}_list, B{0}_coeff_list, my_stride_B_AB, my_stride_B_BC,
                 beta,
                 C{0}_list, C{0}_coeff_list, my_stride_C_AC, my_stride_C_BC);
    //}}
'''
    nnz_a = getNNZ(a_coeffs)
    nnz_b = getNNZ(b_coeffs)
    nnz_c = getNNZ(c_coeffs)
    myfile.write(naive_call.format(index, nnz_a, nnz_b, nnz_c))

    write_line(myfile, 1, 'comm.barrier();')

    # Debug output is emitted as C++ comments so it is easy to re-enable
    # in the generated file.
    debug_banner = '//std::cout << "stra_internal/stra_mult_M{0}:" << std::endl;'
    write_line(myfile, 1, debug_banner.format(index))
    write_line(myfile, 1, '//print_tensor_matrix( ct );')

    write_break(myfile)
Exemplo n.º 5
0
def create_kernel_header(myfile, coeffs):
    """Write the kernel header: one declaration per non-empty coefficient set.

    Emits a break, the header prologue, one
    ``abc_micro_kernel_gen.generate_kernel_header`` call (followed by a
    break) for every non-empty entry of ``transpose(coeffs[2])``, and
    finally the header epilogue.

    Args:
        myfile: output handle passed through to the writer helpers.
        coeffs: indexable collection whose element ``[2]`` is transposed
            and iterated; presumably the C-operand coefficients -- confirm
            with the caller.
    """
    write_break(myfile)
    abc_micro_kernel_gen.write_header_start(myfile)
    for i, coeff_set in enumerate(transpose(coeffs[2])):
        if len(coeff_set) > 0:
            # Only the coefficients accepted by is_nonzero reach the
            # generator; the index i still counts every set.
            nonzero_coeffs = [
                coeff for coeff in coeff_set if is_nonzero(coeff)
            ]
            abc_micro_kernel_gen.generate_kernel_header(
                myfile, nonzero_coeffs, i)
            write_break(myfile)
    abc_micro_kernel_gen.write_header_end(myfile)
Exemplo n.º 6
0
def create_micro_functions(myfile, coeffs, kernel_header_filename):
    """Write the micro-kernel source file.

    Emits an ``#include`` of the kernel header, the shared rank-k and
    initialisation assembly macros, and then one micro kernel per
    non-empty entry of ``transpose(coeffs[2])``.

    Args:
        myfile: output handle passed through to the writer helpers.
        coeffs: indexable collection whose element ``[2]`` is transposed
            and iterated.
        kernel_header_filename: text placed between the quotes of the
            emitted ``#include`` line.
    """
    write_line(myfile, 0, '#include "%s"' % kernel_header_filename)
    write_break(myfile)
    abc_micro_kernel_gen.write_common_rankk_macro_assembly(myfile)
    write_break(myfile)
    abc_micro_kernel_gen.macro_initialize_assembly(myfile)
    # The xor0 / loopkiter / loopkleft / postaccum macro generators were
    # disabled in the original and are not emitted.
    write_break(myfile)

    for kernel_id, column in enumerate(transpose(coeffs[2])):
        if len(column) == 0:
            continue

        kept = [value for value in column if is_nonzero(value)]
        nnz = len(kept)

        # Sets with more than 23 non-zero coefficients get no micro kernel
        # but still produce the trailing break.
        # NOTE(review): the reason for the limit of 23 is not visible here.
        if nnz <= 23:
            abc_micro_kernel_gen.generate_micro_kernel(myfile, kept,
                                                       kernel_id)

        write_break(myfile)
Exemplo n.º 7
0
def write_common_start_assembly(myfile, nnz):
    """Emit the opening of a micro-kernel body up to the ``cs_c`` scaling.

    Writes the ``b_next`` / ``k_iter`` / ``k_left`` declarations, one
    ``double`` declaration line with ``nnz`` ``coeff<i>`` pointers and one
    with ``nnz`` ``c<i>`` pointers, a break, and then the first part of an
    ``__asm__ volatile`` block.

    Args:
        myfile: output handle; used both through the ``write_*`` helpers
            and directly via ``myfile.write``.
        nnz: number of ``coeff<i>`` / ``c<i>`` pointer declarations to
            generate.
    """
    c_prologue = '''\
    void*   b_next = bli_auxinfo_next_b( data );

    uint64_t k_iter = k / 4;
    uint64_t k_left = k % 4;
'''
    myfile.write(c_prologue)

    coeff_decls = ', '.join(
        '*coeff%d = &coeff_list[%d]' % (slot, slot) for slot in range(nnz))
    write_line(myfile, 1, 'double ' + coeff_decls + ';')

    c_decls = ', '.join(
        '*c%d = c_list[%d]' % (slot, slot) for slot in range(nnz))
    write_line(myfile, 1, 'double ' + c_decls + ';')

    write_break(myfile)

    # Doubled backslashes and percent signs are emitted verbatim, so the
    # generated C source itself contains "\n\t" and "%%rax".
    asm_prologue = '''\
	__asm__ volatile
	(
	"                                            \\n\\t"
	"                                            \\n\\t"
    "movq                %[a], %%rax             \\n\\t" // load address of a.              ( v )
    "movq                %[b], %%rbx             \\n\\t" // load address of b.              ( v )
    "movq                %[b_next], %%r15        \\n\\t" // load address of b_next.         ( v )
    "addq          $-4 * 64, %%r15               \\n\\t" //                                 ( ? )
    "                                            \\n\\t"
    "vmovapd   0 * 32(%%rax), %%ymm0             \\n\\t" // initialize loop by pre-loading
    "vmovapd   0 * 32(%%rbx), %%ymm2             \\n\\t" // elements of a and b.
    "vpermilpd  $0x5, %%ymm2, %%ymm3             \\n\\t"
    "                                            \\n\\t"
    "                                            \\n\\t"
    "movq                %[cs_c], %%rdi          \\n\\t" // load cs_c
    "leaq        (,%%rdi,8), %%rdi               \\n\\t" // cs_c * sizeof(double)
'''
    myfile.write(asm_prologue)
Exemplo n.º 8
0
def gen_abc_fmm(coeff_filename, dims, level, outfilename):
    """Generate the partition code and the straprim calls into ``outfilename``.

    Reads the coefficients from ``coeff_filename``, expands them with
    ``generateCoeffs`` for the requested ``level`` and writes the
    partitioning code, a break and the straprim callers to the output
    file, which is created or truncated.

    Args:
        coeff_filename: path handed to ``read_coeffs``.
        dims: forwarded to ``writePartition`` and ``create_straprim_caller``.
        level: forwarded to ``generateCoeffs``, ``writePartition`` and
            ``create_straprim_caller``.
        outfilename: path of the file to write.
    """
    base_coeffs = read_coeffs(coeff_filename)

    with open(outfilename, 'w') as out:
        level_coeffs = generateCoeffs(base_coeffs, level)

        # The length of the first entry of the first coefficient group is
        # used as the number of products to generate.
        product_count = len(level_coeffs[0][0])

        writePartition(out, dims, level)
        write_break(out)
        create_straprim_caller(out, level_coeffs, dims, product_count, level)
Exemplo n.º 9
0
def write_abc_strassen_header(myfile):
    """Emit the declarations at the top of the generated driver body.

    Writes the ``packA``/``packB`` declaration, the ``bl_ic_nt`` lookup,
    two commented-out ``bl_malloc_aligned`` calls and the
    ``bl_malloc_packing_pool`` call, all at indent level 1, with breaks in
    between.

    Args:
        myfile: output handle passed through to the ``write_*`` helpers.
    """
    # Emission script: a string is written at indent level 1, None stands
    # for a break.
    script = (
        'double *packA, *packB;',
        None,
        'int bl_ic_nt = bl_read_nway_from_env( "BLISLAB_IC_NT" );',
        None,
        '//// Allocate packing buffers',
        '//packA  = bl_malloc_aligned( DGEMM_KC, ( DGEMM_MC + 1 ) * bl_ic_nt, sizeof(double) );',
        '//packB  = bl_malloc_aligned( DGEMM_KC, ( DGEMM_NC + 1 )           , sizeof(double) );',
        'bl_malloc_packing_pool( &packA, &packB, n, bl_ic_nt );',
        None,
    )
    for entry in script:
        if entry is None:
            write_break(myfile)
        else:
            write_line(myfile, 1, entry)
Exemplo n.º 10
0
def create_macro_functions(myfile, coeffs):
    """Write one macro function per non-empty set of ``transpose(coeffs[2])``.

    Each non-empty set is passed to ``write_macro_func`` together with its
    position and the tag ``'C'``, followed by a break; empty sets are
    skipped but still consume an index.

    Args:
        myfile: output handle passed through to the writer helpers.
        coeffs: indexable collection whose element ``[2]`` is transposed
            and iterated.
    """
    for kernel_id, column in enumerate(transpose(coeffs[2])):
        if len(column) == 0:
            continue
        write_macro_func(myfile, column, kernel_id, 'C')
        write_break(myfile)
Exemplo n.º 11
0
 def all_adds(coeffs, name):
     """Write one packing function per non-empty entry of ``coeffs``.

     ``myfile`` is not a parameter; it is taken from the surrounding
     scope, which is not visible in this snippet. Empty entries are
     skipped but still consume an index.
     """
     for pack_id, row in enumerate(coeffs):
         if len(row) == 0:
             continue
         write_packm_func(myfile, row, pack_id, name)
         write_break(myfile)
Exemplo n.º 12
0
def gen_abc_fmm(coeff_filename, dims, level, outfilename,
                micro_kernel_filename, kernel_header_filename):
    """Generate the ABC Strassen driver, its micro kernels and their header.

    Three files are written: ``outfilename`` (packing functions, macro
    functions, the per-product straprim functions and the
    ``bl_dgemm_strassen_abc`` driver), ``micro_kernel_filename`` (micro
    kernels) and ``kernel_header_filename`` (kernel declarations).

    The micro-kernel and header files are now opened with ``with`` so they
    are flushed and closed as soon as they have been written; previously
    both handles were left open.

    Args:
        coeff_filename: path handed to ``read_coeffs``.
        dims: forwarded to the partition / straprim writers and
            ``exp_dim``.
        level: forwarded to ``generateCoeffs``, the writers and
            ``exp_dim``.
        outfilename: path of the driver source file to write.
        micro_kernel_filename: path of the micro-kernel source file to
            write.
        kernel_header_filename: path of the kernel header to write. The
            name used in ``#include`` lines is this path with its first
            10 characters removed.
    """
    coeffs = read_coeffs(coeff_filename)

    # NOTE(review): the [10:] slice presumably strips a fixed-length
    # directory prefix so the include is relative -- confirm with callers.
    header_include = kernel_header_filename[10:]

    with open(outfilename, 'w') as myfile:
        write_line(myfile, 0, '#include "%s"' % header_include)
        write_line(myfile, 0, '#include "bl_dgemm.h"')
        write_break(myfile)

        cur_coeffs = generateCoeffs(coeffs, level)

        # The length of the first entry of the first coefficient group is
        # used as the number of products to generate.
        num_multiplies = len(cur_coeffs[0][0])

        create_packm_functions(myfile, cur_coeffs)

        # Side files: each one is closed before the next step runs.
        with open(micro_kernel_filename, 'w') as my_micro_file:
            create_micro_functions(my_micro_file, cur_coeffs, header_include)

        with open(kernel_header_filename, 'w') as my_kernel_header:
            create_kernel_header(my_kernel_header, cur_coeffs)

        create_macro_functions(myfile, cur_coeffs)

        create_straprim_abc_functions(myfile, cur_coeffs, dims, level)

        write_line(
            myfile, 0,
            'void bl_dgemm_strassen_abc( int m, int n, int k, double *XA, int lda, double *XB, int ldb, double *XC, int ldc )'
        )
        write_line(myfile, 0, '{')

        write_abc_strassen_header(myfile)

        writePartition(myfile, dims, level)

        write_break(myfile)

        # The straprim calls sit inside a brace block that becomes an
        # OpenMP parallel region when _PARALLEL_ is defined.
        write_line(myfile, 0, '#ifdef _PARALLEL_')
        write_line(myfile, 1, '#pragma omp parallel num_threads( bl_ic_nt )')
        write_line(myfile, 0, '#endif')
        write_line(myfile, 1, '{')
        create_straprim_caller(myfile, cur_coeffs, dims, num_multiplies, level)
        write_line(myfile, 1, '}')

        write_break(myfile)
        level_dim = exp_dim(dims, level)
        write_line(
            myfile, 1,
            'bl_dynamic_peeling( m, n, k, XA, lda, XB, ldb, XC, ldc, %d * DGEMM_MR, %d, %d * DGEMM_NR );'
            % (level_dim[0], level_dim[1], level_dim[2]))

        write_break(myfile)
        write_line(myfile, 1, '//free( packA );')
        write_line(myfile, 1, '//free( packB );')

        write_line(myfile, 0, '}')
Exemplo n.º 13
0
def write_straprim_abc_function(myfile, index, a_coeffs, b_coeffs, c_coeffs,
                                dims, level):
    """Emit the C function ``bl_dgemm_straprim_abc<index>``.

    The generated function is preceded by a ``//`` comment describing the
    product ``M<index>``. Its body loops over ``jc`` and ``pc`` blocks;
    inside, one brace block packs B, a second one packs A and calls the
    macro kernel, and ``#pragma omp barrier`` lines guarded by
    ``_PARALLEL_`` are written after each of them and after the outer
    loops.

    Args:
        myfile: output handle passed through to the ``write_*`` helpers.
        index: number of the product; appended to the generated function,
            packing-routine and macro-kernel names.
        a_coeffs, b_coeffs, c_coeffs: coefficient sequences for A, B and
            C; ``getNNZ`` of each decides how many ``a<i>`` / ``b<i>`` /
            ``c<i>`` pointer arguments are generated.
        dims: forwarded to ``getBlockName``.
        level: forwarded to ``getBlockName``.
    """

    def emit(depth, text):
        write_line(myfile, depth, text)

    def emit_barrier():
        # The preprocessor lines are always written at indent level 0.
        for directive in ('#ifdef _PARALLEL_', '#pragma omp barrier',
                          '#endif'):
            emit(0, directive)

    def linear_terms(operand, coeffs):
        # "<coeff> * <block name>" for every coefficient kept by is_nonzero.
        return ' + '.join(
            str(coeff) + ' * %s' % getBlockName(operand, pos, dims, level)
            for pos, coeff in enumerate(coeffs) if is_nonzero(coeff))

    def per_slot(template, coeffs):
        # One comma-separated item per slot 0 .. getNNZ(coeffs) - 1.
        return ', '.join(template % slot for slot in range(getNNZ(coeffs)))

    # --- descriptive comment -------------------------------------------
    pieces = [
        '// M%d = (' % (index),
        linear_terms(0, a_coeffs),
        ') * (',
        linear_terms(1, b_coeffs),
        '); ',
        '; '.join(
            ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff,
                                 index)
            for pos, coeff in enumerate(c_coeffs) if is_nonzero(coeff)),
        ';',
    ]
    emit(0, ''.join(pieces))

    # --- signature and outer loops ---------------------------------------
    signature = ''.join([
        'void bl_dgemm_straprim_abc%d( int m, int n, int k, ' % index,
        per_slot('double* a%d', a_coeffs),
        ', int lda, ',
        per_slot('double* b%d', b_coeffs),
        ', int ldb, ',
        per_slot('double* c%d', c_coeffs),
        ', int ldc, double *packA, double *packB, int bl_ic_nt ) {',
    ])
    emit(0, signature)
    emit(1, 'int i, j, p, ic, ib, jc, jb, pc, pb;')
    emit(1, 'for ( jc = 0; jc < n; jc += DGEMM_NC ) {')
    emit(2, 'jb = min( n - jc, DGEMM_NC );')
    emit(2, 'for ( pc = 0; pc < k; pc += DGEMM_KC ) {')
    emit(3, 'pb = min( k - pc, DGEMM_KC );')

    # --- pack B: each thread handles its own range of jb ----------------
    emit(3, '{')
    emit(4, 'int tid = omp_get_thread_num();')
    emit(4, 'int my_start;')
    emit(4, 'int my_end;')
    emit(4, 'bl_get_range( jb, DGEMM_NR, &my_start, &my_end );')
    emit(4, 'for ( j = my_start; j < my_end; j += DGEMM_NR ) {')
    pack_b = ''.join([
        'packB_add_stra_abc%d( min( jb - j, DGEMM_NR ), pb, ' % index,
        per_slot('&b%d[ pc + (jc+j)*ldb ]', b_coeffs),
        ', ldb, &packB[ j * pb ] );',
    ])
    emit(5, pack_b)
    emit(4, '}')
    emit(3, '}')

    emit_barrier()

    # --- pack A and run the macro kernel on each thread's range of m ----
    emit(3, '{')
    emit(4, 'int tid = omp_get_thread_num();')
    emit(4, 'int my_start;')
    emit(4, 'int my_end;')
    emit(4, 'bl_get_range( m, DGEMM_MR, &my_start, &my_end );')
    emit(4, 'for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {')
    emit(5, 'ib = min( my_end - ic, DGEMM_MC );')
    emit(5, 'for ( i = 0; i < ib; i += DGEMM_MR ) {')
    pack_a = ''.join([
        'packA_add_stra_abc%d( min( ib - i, DGEMM_MR ), pb, ' % index,
        per_slot('&a%d[ pc*lda + (ic+i) ]', a_coeffs),
        ', lda, &packA[ tid * DGEMM_MC * pb + i * pb ] );',
    ])
    emit(6, pack_a)
    emit(5, '}')

    macro_kernel = ''.join([
        'bl_macro_kernel_stra_abc%d( ib, jb, pb, packA + tid * DGEMM_MC * pb, packB, ' % index,
        per_slot('&c%d[ jc * ldc + ic ]', c_coeffs),
        ', ldc );',
    ])
    emit(5, macro_kernel)

    emit(4, '}')
    emit(3, '}')
    emit_barrier()

    # --- close the pc and jc loops, then the function --------------------
    emit(2, '}')
    emit(1, '}')

    emit_barrier()
    emit(0, '}')
    write_break(myfile)