def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims, num_multiplies, level=1):
    """Emit the generated-code fragment that evaluates product M<index>.

    The fragment consists of, in order:
      * a ``// M<index> = (...) * (...); ...`` comment describing the
        linear combinations taken from ``a_coeffs``, ``b_coeffs`` and
        ``c_coeffs`` (zero coefficients are skipped),
      * three ``write_stra_mat`` calls, one per operand,
      * a ``straprim_ab(...)`` call followed by ``comm.barrier();``.

    ``num_multiplies`` is accepted for interface compatibility but is not
    used here.
    """
    def operand_sum(operand_id, coeffs):
        # "<coeff> * <block>" terms for every nonzero coefficient.
        terms = []
        for pos, coeff in enumerate(coeffs):
            if is_nonzero(coeff):
                terms.append(
                    str(coeff) + ' * %s' % getBlockName(operand_id, pos, dims, level))
        return ' + '.join(terms)

    lhs_text = operand_sum(0, a_coeffs)
    rhs_text = operand_sum(1, b_coeffs)

    updates = []
    for pos, coeff in enumerate(c_coeffs):
        if is_nonzero(coeff):
            updates.append(
                ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff, index))

    header = ''.join([
        '// M%d = (' % (index),
        lhs_text,
        ') * (',
        rhs_text,
        '); ',
        '; '.join(updates),
        ';',
    ])
    write_line(myfile, 1, header)

    # One operand description per matrix: 0 -> A, 1 -> B, 2 -> C.
    for operand_id, coeffs, extents in (
            (0, a_coeffs, 'ms, ks'),
            (1, b_coeffs, 'ks, ns'),
            (2, c_coeffs, 'ms, ns')):
        write_stra_mat(myfile, operand_id, coeffs, index, extents, dims, level)

    write_line(
        myfile, 1,
        'straprim_ab(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});'.format(index))
    write_line(myfile, 1, 'comm.barrier();')
    write_break(myfile)
def write_divisor_initializer(myfile, dims, level):
    """Emit the ``A_divisor`` / ``B_divisor`` / ``C_divisor`` declarations.

    ``exp_dim(dims, level)`` supplies three per-level partition counts.
    A uses entries (0, 1) and B uses entries (1, 2), i.e. entry 1 is the
    dimension shared by A and B.  C is the product of A and B, so its
    divisor is built from the two remaining entries, (0, 2).

    The previous implementation emitted (1, 2) for C -- a copy of the B
    line -- which only happens to be right when all three partition counts
    are equal (e.g. <2,2,2>).
    """
    level_dim = exp_dim(dims, level)
    m_parts, k_parts, n_parts = level_dim[0], level_dim[1], level_dim[2]

    declaration = 'const std::array<unsigned,2> %s_divisor={%d,%d};'
    write_line(myfile, 1, declaration % ('A', m_parts, k_parts))
    write_line(myfile, 1, declaration % ('B', k_parts, n_parts))
    # C is (rows of A) x (columns of B), not a second copy of B's shape.
    write_line(myfile, 1, declaration % ('C', m_parts, n_parts))
    write_break(myfile)
def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims, num_multiplies, level=1):
    """Emit the generated-code fragment that evaluates product M<index>.

    Output, in order: a descriptive ``// M<index> = ...`` comment, three
    ``write_stra_mat`` operand descriptions (index groups AC/AB, AB/BC and
    AC/BC), a ``stra_gemm`` call that is issued on the transposed operands
    when ``Cv<index>.stride(!row_major) == 1``, a ``comm.barrier();`` and
    two commented-out debug lines.

    ``num_multiplies`` is accepted for interface compatibility but is not
    used here.
    """
    def operand_sum(operand_id, coeffs):
        # "<coeff> * <block>" terms for every nonzero coefficient.
        return ' + '.join([
            str(coeff) + ' * %s' % getBlockName(operand_id, pos, dims, level)
            for pos, coeff in enumerate(coeffs)
            if is_nonzero(coeff)
        ])

    pieces = ['// M%d = (' % (index)]
    pieces.append(operand_sum(0, a_coeffs))
    pieces.append(') * (')
    pieces.append(operand_sum(1, b_coeffs))
    pieces.append('); ')
    pieces.append('; '.join([
        ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff, index)
        for pos, coeff in enumerate(c_coeffs)
        if is_nonzero(coeff)
    ]))
    pieces.append(';')
    write_line(myfile, 1, ''.join(pieces))

    write_stra_mat(myfile, 0, a_coeffs, index, ['AC', 'AB'], dims, level)
    write_stra_mat(myfile, 1, b_coeffs, index, ['AB', 'BC'], dims, level)
    write_stra_mat(myfile, 2, c_coeffs, index, ['AC', 'BC'], dims, level)

    # Doubled braces are literal braces in the emitted code; {0} is the
    # product index.
    gemm_template = '''\
    if (Cv{0}.stride(!row_major) == 1)
    {{
        Av{0}.transpose();
        Bv{0}.transpose();
        Cv{0}.transpose();
        stra_gemm(comm, cfg, alpha, Bv{0}, Av{0}, beta, Cv{0});
    }}
    else
    {{
        stra_gemm(comm, cfg, alpha, Av{0}, Bv{0}, beta, Cv{0});
    }}
'''
    myfile.write(gemm_template.format(index))

    write_line(myfile, 1, 'comm.barrier();')
    debug_banner = '//std::cout << "stra_internal/stra_mult_M{0}:" << std::endl;'
    write_line(myfile, 1, debug_banner.format(index))
    write_line(myfile, 1, '//print_tensor_matrix( ct );')
    write_break(myfile)
def write_straprim_caller(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims, num_multiplies, level=1):
    """Emit the generated-code fragment that evaluates product M<index>.

    Output, in order: a descriptive ``// M<index> = ...`` comment, three
    ``write_stra_mat`` operand descriptions (index groups AC/AB, AB/BC and
    AC/BC), a ``straprim_naive<T,nA,nB,nC>`` call whose template arguments
    are the nonzero counts of the three coefficient lists (the transposed
    variant is emitted commented out), a ``comm.barrier();`` and two
    commented-out debug lines.

    ``num_multiplies`` is accepted for interface compatibility but is not
    used here.
    """
    lhs_terms = []
    for pos, coeff in enumerate(a_coeffs):
        if is_nonzero(coeff):
            lhs_terms.append(str(coeff) + ' * %s' % getBlockName(0, pos, dims, level))

    rhs_terms = []
    for pos, coeff in enumerate(b_coeffs):
        if is_nonzero(coeff):
            rhs_terms.append(str(coeff) + ' * %s' % getBlockName(1, pos, dims, level))

    out_terms = []
    for pos, coeff in enumerate(c_coeffs):
        if is_nonzero(coeff):
            out_terms.append(
                ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff, index))

    summary = '// M%d = (' % (index)
    summary += ' + '.join(lhs_terms) + ') * ('
    summary += ' + '.join(rhs_terms) + '); '
    summary += '; '.join(out_terms) + ';'
    write_line(myfile, 1, summary)

    operand_specs = (
        (0, a_coeffs, ['AC', 'AB']),
        (1, b_coeffs, ['AB', 'BC']),
        (2, c_coeffs, ['AC', 'BC']),
    )
    for operand_id, coeffs, index_groups in operand_specs:
        write_stra_mat(myfile, operand_id, coeffs, index, index_groups, dims, level)

    # {0} = product index, {1}/{2}/{3} = nonzero counts of A/B/C coefficients.
    # Doubled braces are literal braces in the emitted code.
    naive_template = '''\
    //if (ct.stride(!row_major) == 1)
    //{{
    //    Av{0}.transpose();
    //    Bv{0}.transpose();
    //    Cv{0}.transpose();
    //    straprim_naive<T,{1},{2},{3}>(comm, cfg, my_sub_len_AB, my_sub_len_AC, my_sub_len_BC,
    //             alpha,
    //             B{0}_list, B{0}_coeff_list, my_stride_B_AB, my_stride_B_BC,
    //             A{0}_list, A{0}_coeff_list, my_stride_A_AB, my_stride_A_AC,
    //             beta,
    //             C{0}_list, C{0}_coeff_list, my_stride_C_AC, my_stride_C_BC);
    //}} else {{
        straprim_naive<T,{1},{2},{3}>(comm, cfg, my_sub_len_AB, my_sub_len_AC, my_sub_len_BC,
                 alpha,
                 A{0}_list, A{0}_coeff_list, my_stride_A_AB, my_stride_A_AC,
                 B{0}_list, B{0}_coeff_list, my_stride_B_AB, my_stride_B_BC,
                 beta,
                 C{0}_list, C{0}_coeff_list, my_stride_C_AC, my_stride_C_BC);
    //}}
'''
    myfile.write(naive_template.format(
        index, getNNZ(a_coeffs), getNNZ(b_coeffs), getNNZ(c_coeffs)))

    write_line(myfile, 1, 'comm.barrier();')
    debug_banner = '//std::cout << "stra_internal/stra_mult_M{0}:" << std::endl;'
    write_line(myfile, 1, debug_banner.format(index))
    write_line(myfile, 1, '//print_tensor_matrix( ct );')
    write_break(myfile)
def create_kernel_header(myfile, coeffs):
    """Write the micro-kernel header file.

    Emits a blank line, the header prologue, one kernel declaration per
    non-empty coefficient set of ``transpose(coeffs[2])`` (passing only the
    nonzero coefficients and the set's position), and the header epilogue.

    The unused local ``nnz`` from the previous version has been dropped;
    the emitted output is unchanged.
    """
    write_break(myfile)
    abc_micro_kernel_gen.write_header_start(myfile)
    for i, coeff_set in enumerate(transpose(coeffs[2])):
        if len(coeff_set) > 0:
            nonzero_coeffs = [
                coeff for coeff in coeff_set if is_nonzero(coeff)
            ]
            abc_micro_kernel_gen.generate_kernel_header(
                myfile, nonzero_coeffs, i)
            write_break(myfile)
    abc_micro_kernel_gen.write_header_end(myfile)
def create_micro_functions(myfile, coeffs, kernel_header_filename):
    """Write the micro-kernel source file.

    Emits an ``#include`` of ``kernel_header_filename``, the shared rank-k
    macro assembly and the initialisation macro, then one micro kernel per
    non-empty coefficient set of ``transpose(coeffs[2])``.
    """
    write_line(myfile, 0, '#include "%s"' % kernel_header_filename)
    write_break(myfile)

    abc_micro_kernel_gen.write_common_rankk_macro_assembly(myfile)
    write_break(myfile)

    abc_micro_kernel_gen.macro_initialize_assembly(myfile)
    write_break(myfile)

    for position, column in enumerate(transpose(coeffs[2])):
        if not len(column) > 0:
            continue
        active = [value for value in column if is_nonzero(value)]
        # Kernels are only generated for at most 23 nonzero coefficients.
        # NOTE(review): the limit presumably comes from the micro-kernel
        # generator -- confirm before changing it.
        if len(active) <= 23:
            abc_micro_kernel_gen.generate_micro_kernel(myfile, active, position)
        write_break(myfile)
def write_common_start_assembly(myfile, nnz):
    """Emit the opening part of a generated micro-kernel body.

    Writes, in order:
      * local declarations for ``b_next``, ``k_iter`` and ``k_left``,
      * one ``double`` declaration line binding ``coeff<i>`` to
        ``&coeff_list[i]`` and one binding ``c<i>`` to ``c_list[i]`` for
        ``i`` in ``range(nnz)``,
      * the start of an ``__asm__ volatile (`` block that loads the
        addresses of a, b and b_next, pre-loads the first elements and
        scales ``cs_c`` by 8.

    The asm block is left open; this function does not emit its closing
    part.
    """
    myfile.write( \
'''\
    void* b_next = bli_auxinfo_next_b( data );
    uint64_t k_iter = k / 4;
    uint64_t k_left = k % 4;
''' )
    # Pointer to each scaling coefficient: *coeff0 = &coeff_list[0], ...
    add = 'double '
    add += ', '.join(
        ['*coeff%d = &coeff_list[%d]' % (i, i) for i in range(nnz)])
    add += ';'
    write_line(myfile, 1, add)
    # One pointer per entry of c_list: *c0 = c_list[0], ...
    add = 'double '
    add += ', '.join(['*c%d = c_list[%d]' % (i, i) for i in range(nnz)])
    add += ';'
    write_line(myfile, 1, add)
    write_break(myfile)
    # The template is written verbatim (no % or .format substitution), so
    # "%%" and "\\n\\t" reach the output as "%%" and "\n\t".
    myfile.write( \
'''\
    __asm__ volatile
    (
    " \\n\\t"
    " \\n\\t"
    "movq %[a], %%rax \\n\\t" // load address of a. ( v )
    "movq %[b], %%rbx \\n\\t" // load address of b. ( v )
    "movq %[b_next], %%r15 \\n\\t" // load address of b_next. ( v )
    "addq $-4 * 64, %%r15 \\n\\t" // ( ? )
    " \\n\\t"
    "vmovapd 0 * 32(%%rax), %%ymm0 \\n\\t" // initialize loop by pre-loading
    "vmovapd 0 * 32(%%rbx), %%ymm2 \\n\\t" // elements of a and b.
    "vpermilpd $0x5, %%ymm2, %%ymm3 \\n\\t"
    " \\n\\t"
    " \\n\\t"
    "movq %[cs_c], %%rdi \\n\\t" // load cs_c
    "leaq (,%%rdi,8), %%rdi \\n\\t" // cs_c * sizeof(double)
''' )
def gen_abc_fmm(coeff_filename, dims, level, outfilename):
    """Generate the output file for one algorithm.

    Reads the coefficients from ``coeff_filename``, expands them for
    ``level`` via ``generateCoeffs``, and writes the partitioning code
    followed by the per-product caller code into ``outfilename``.
    """
    base_coeffs = read_coeffs(coeff_filename)

    with open(outfilename, 'w') as out_file:
        level_coeffs = generateCoeffs(base_coeffs, level)
        # Length of the first row of the first coefficient matrix.
        product_count = len(level_coeffs[0][0])

        writePartition(out_file, dims, level)
        write_break(out_file)

        create_straprim_caller(
            out_file, level_coeffs, dims, product_count, level)
def write_abc_strassen_header(myfile):
    """Emit the local declarations at the top of the generated driver.

    Declares ``packA``/``packB``, reads ``bl_ic_nt`` from the
    ``BLISLAB_IC_NT`` environment setting, and allocates the packing
    buffers through ``bl_malloc_packing_pool`` (the per-buffer
    ``bl_malloc_aligned`` calls are emitted commented out).
    """
    blank = None  # marker for a write_break between statements
    script = (
        'double *packA, *packB;',
        blank,
        'int bl_ic_nt = bl_read_nway_from_env( "BLISLAB_IC_NT" );',
        blank,
        '//// Allocate packing buffers',
        '//packA = bl_malloc_aligned( DGEMM_KC, ( DGEMM_MC + 1 ) * bl_ic_nt, sizeof(double) );',
        '//packB = bl_malloc_aligned( DGEMM_KC, ( DGEMM_NC + 1 ) , sizeof(double) );',
        'bl_malloc_packing_pool( &packA, &packB, n, bl_ic_nt );',
        blank,
    )
    for entry in script:
        if entry is blank:
            write_break(myfile)
        else:
            write_line(myfile, 1, entry)
def create_macro_functions(myfile, coeffs):
    """Emit one macro-kernel function per non-empty C coefficient set.

    Iterates over ``transpose(coeffs[2])`` and, for every non-empty set,
    calls ``write_macro_func`` with the set, its position and the tag
    ``'C'``, followed by a blank line.
    """
    for position, column in enumerate(transpose(coeffs[2])):
        if not len(column) > 0:
            continue
        write_macro_func(myfile, column, position, 'C')
        write_break(myfile)
def all_adds(coeffs, name):
    """Emit one packing function per non-empty coefficient set.

    For every non-empty entry of ``coeffs`` this calls
    ``write_packm_func`` with the entry, its position and ``name``,
    followed by a blank line.

    NOTE(review): ``myfile`` is not a parameter; it is resolved from the
    enclosing/module scope at call time.  This looks like a helper that was
    meant to live inside a function that owns ``myfile`` -- confirm.
    """
    for position, column in enumerate(coeffs):
        if not len(column) > 0:
            continue
        write_packm_func(myfile, column, position, name)
        write_break(myfile)
def gen_abc_fmm(coeff_filename, dims, level, outfilename, micro_kernel_filename, kernel_header_filename):
    """Generate the driver, micro-kernel and kernel-header files.

    Args:
        coeff_filename: file the algorithm coefficients are read from.
        dims: base partition dimensions, forwarded to the helpers.
        level: recursion level used to expand the coefficients.
        outfilename: path of the generated driver source.
        micro_kernel_filename: path of the generated micro-kernel source.
        kernel_header_filename: path of the generated kernel header.

    The micro-kernel and header files were previously opened with bare
    ``open()`` and never closed, so their contents were only flushed
    whenever the interpreter happened to finalise the file objects.  They
    are now written inside ``with`` blocks and closed deterministically,
    also when a helper raises.
    """
    coeffs = read_coeffs(coeff_filename)

    # The include uses the header path with its first 10 characters removed.
    # NOTE(review): presumably strips a fixed directory prefix -- confirm
    # against the callers before changing.
    header_include = kernel_header_filename[10:]

    with open(outfilename, 'w') as myfile:
        write_line(myfile, 0, '#include "%s"' % header_include)
        write_line(myfile, 0, '#include "bl_dgemm.h"')
        write_break(myfile)

        cur_coeffs = generateCoeffs(coeffs, level)
        num_multiplies = len(cur_coeffs[0][0])

        create_packm_functions(myfile, cur_coeffs)

        with open(micro_kernel_filename, 'w') as my_micro_file:
            create_micro_functions(my_micro_file, cur_coeffs, header_include)

        with open(kernel_header_filename, 'w') as my_kernel_header:
            create_kernel_header(my_kernel_header, cur_coeffs)

        create_macro_functions(myfile, cur_coeffs)
        create_straprim_abc_functions(myfile, cur_coeffs, dims, level)

        write_line(
            myfile, 0,
            'void bl_dgemm_strassen_abc( int m, int n, int k, double *XA, int lda, double *XB, int ldb, double *XC, int ldc )'
        )
        write_line(myfile, 0, '{')
        write_abc_strassen_header(myfile)
        writePartition(myfile, dims, level)
        write_break(myfile)

        # The per-product calls run inside an (optional) OpenMP parallel region.
        write_line(myfile, 0, '#ifdef _PARALLEL_')
        write_line(myfile, 1, '#pragma omp parallel num_threads( bl_ic_nt )')
        write_line(myfile, 0, '#endif')
        write_line(myfile, 1, '{')
        create_straprim_caller(myfile, cur_coeffs, dims, num_multiplies, level)
        write_line(myfile, 1, '}')
        write_break(myfile)

        level_dim = exp_dim(dims, level)
        write_line(
            myfile, 1,
            'bl_dynamic_peeling( m, n, k, XA, lda, XB, ldb, XC, ldc, %d * DGEMM_MR, %d, %d * DGEMM_NR );'
            % (level_dim[0], level_dim[1], level_dim[2]))
        write_break(myfile)

        write_line(myfile, 1, '//free( packA );')
        write_line(myfile, 1, '//free( packB );')
        write_line(myfile, 0, '}')
def write_straprim_abc_function(myfile, index, a_coeffs, b_coeffs, c_coeffs, dims, level):
    """Emit the C function ``bl_dgemm_straprim_abc<index>``.

    The generated function takes one pointer per nonzero coefficient of
    A, B and C (counts come from ``getNNZ``) and contains a ``jc``/``pc``
    loop nest that packs B, packs A, and calls
    ``bl_macro_kernel_stra_abc<index>``, with ``#pragma omp barrier``
    lines guarded by ``_PARALLEL_`` between the phases.  A descriptive
    ``// M<index> = ...`` comment precedes the function.
    """
    def emit(depth, text):
        write_line(myfile, depth, text)

    def emit_all(entries):
        for depth, text in entries:
            emit(depth, text)

    def operand_sum(operand_id, coeffs):
        # "<coeff> * <block>" terms for every nonzero coefficient.
        return ' + '.join([
            str(coeff) + ' * %s' % getBlockName(operand_id, pos, dims, level)
            for pos, coeff in enumerate(coeffs)
            if is_nonzero(coeff)
        ])

    # ---- descriptive comment ------------------------------------------
    summary = '// M%d = (' % (index)
    summary += operand_sum(0, a_coeffs)
    summary += ') * ('
    summary += operand_sum(1, b_coeffs)
    summary += '); '
    summary += '; '.join([
        ' %s += %s * M%d' % (getBlockName(2, pos, dims, level), coeff, index)
        for pos, coeff in enumerate(c_coeffs)
        if is_nonzero(coeff)
    ])
    summary += ';'
    emit(0, summary)

    # Number of operand pointers per matrix.
    a_count = getNNZ(a_coeffs)
    b_count = getNNZ(b_coeffs)
    c_count = getNNZ(c_coeffs)

    omp_barrier = (
        (0, '#ifdef _PARALLEL_'),
        (0, '#pragma omp barrier'),
        (0, '#endif'),
    )

    # ---- signature ----------------------------------------------------
    signature = 'void bl_dgemm_straprim_abc%d( int m, int n, int k, ' % index
    signature += ', '.join(['double* a%d' % k for k in range(a_count)])
    signature += ', int lda, '
    signature += ', '.join(['double* b%d' % k for k in range(b_count)])
    signature += ', int ldb, '
    signature += ', '.join(['double* c%d' % k for k in range(c_count)])
    signature += ', int ldc, double *packA, double *packB, int bl_ic_nt ) {'
    emit(0, signature)

    # ---- outer loops and B packing ------------------------------------
    emit_all((
        (1, 'int i, j, p, ic, ib, jc, jb, pc, pb;'),
        (1, 'for ( jc = 0; jc < n; jc += DGEMM_NC ) {'),
        (2, 'jb = min( n - jc, DGEMM_NC );'),
        (2, 'for ( pc = 0; pc < k; pc += DGEMM_KC ) {'),
        (3, 'pb = min( k - pc, DGEMM_KC );'),
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( jb, DGEMM_NR, &my_start, &my_end );'),
        (4, 'for ( j = my_start; j < my_end; j += DGEMM_NR ) {'),
    ))

    pack_b = 'packB_add_stra_abc%d( min( jb - j, DGEMM_NR ), pb, ' % index
    pack_b += ', '.join(
        ['&b%d[ pc + (jc+j)*ldb ]' % k for k in range(b_count)])
    pack_b += ', ldb, &packB[ j * pb ] );'
    emit(5, pack_b)

    emit_all((
        (4, '}'),
        (3, '}'),
    ))
    emit_all(omp_barrier)

    # ---- A packing and macro kernel -----------------------------------
    emit_all((
        (3, '{'),
        (4, 'int tid = omp_get_thread_num();'),
        (4, 'int my_start;'),
        (4, 'int my_end;'),
        (4, 'bl_get_range( m, DGEMM_MR, &my_start, &my_end );'),
        (4, 'for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {'),
        (5, 'ib = min( my_end - ic, DGEMM_MC );'),
        (5, 'for ( i = 0; i < ib; i += DGEMM_MR ) {'),
    ))

    pack_a = 'packA_add_stra_abc%d( min( ib - i, DGEMM_MR ), pb, ' % index
    pack_a += ', '.join(
        ['&a%d[ pc*lda + (ic+i) ]' % k for k in range(a_count)])
    pack_a += ', lda, &packA[ tid * DGEMM_MC * pb + i * pb ] );'
    emit(6, pack_a)
    emit(5, '}')

    macro_call = 'bl_macro_kernel_stra_abc%d( ib, jb, pb, packA + tid * DGEMM_MC * pb, packB, ' % index
    macro_call += ', '.join(
        ['&c%d[ jc * ldc + ic ]' % k for k in range(c_count)])
    macro_call += ', ldc );'
    emit(5, macro_call)

    emit_all((
        (4, '}'),
        (3, '}'),
    ))
    emit_all(omp_barrier)

    # ---- close loops and function -------------------------------------
    emit_all((
        (2, '}'),
        (1, '}'),
    ))
    emit_all(omp_barrier)
    emit(0, '}')
    write_break(myfile)