def transform(self, py_ast, program_config):
    """
    Lower the Python AST to a C AST as directed by ``program_config``.

    ``program_config`` is unpacked as ``(arg_config, tuner_config)``; only
    ``arg_config`` is consulted here. The generated file contains
    ``py_ast.body[0]`` (located after conversion as the function named
    ``apply``) followed by a driver ``apply_all(A, B)`` that stores
    ``apply(A[len_A*(k-1) + i], B[i])`` into ``A[len_A*k + i]`` for
    ``k`` in ``[1, A_powers)`` and ``i`` in ``[0, len_A)``.

    Returns a pair: the ``Project`` wrapping the generated file and the
    ctypes signature obtained from the ``apply_all`` declaration.
    """
    arg_config, _tuner_config = program_config

    # Same keys, same lookup order as before (a missing key raises KeyError).
    n_items = arg_config['A_len']
    dtype_a = arg_config['A_dtype']
    ndim_a = arg_config['A_ndim']
    shape_a = arg_config['A_shape']
    n_powers = arg_config['A_powers']
    _n_items_b = arg_config['B_len']  # looked up but not used below
    dtype_b = arg_config['B_dtype']
    ndim_b = arg_config['B_ndim']
    shape_b = arg_config['B_shape']

    scalar_a = get_ctree_type(dtype_a)
    pointer_a = NdPointer(dtype_a, ndim_a, shape_a)
    scalar_b = get_ctree_type(dtype_b)
    pointer_b = NdPointer(dtype_b, ndim_b, shape_b)

    kernel_typesig = FuncType(scalar_a, [scalar_a, scalar_b])

    kernel_def = py_ast.body[0]

    def flat_index(row):
        # Builds the expression  len_A * row + i  from fresh nodes each call.
        return Add(Mul(Constant(n_items), row), SymbolRef("i"))

    destination = ArrayRef(SymbolRef("A"), flat_index(SymbolRef("k")))
    previous = ArrayRef(
        SymbolRef("A"),
        flat_index(Sub(SymbolRef("k"), Constant(1))),
    )
    operand_b = ArrayRef(SymbolRef("B"), SymbolRef("i"))
    update = Assign(
        destination,
        FunctionCall(SymbolRef("apply"), [previous, operand_b]),
    )

    element_loop = For(
        Assign(SymbolRef("i", Int()), Constant(0)),
        Lt(SymbolRef("i"), Constant(n_items)),
        PostInc(SymbolRef("i")),
        [update],
    )
    power_loop = For(
        Assign(SymbolRef("k", Int()), Constant(1)),
        Lt(SymbolRef("k"), Constant(n_powers)),
        PostInc(SymbolRef("k")),
        [element_loop],
    )
    driver = FunctionDecl(
        Void(),
        "apply_all",
        params=[SymbolRef("A", pointer_a), SymbolRef("B", pointer_b)],
        defn=[power_loop],
    )

    tree = CFile("generated", [kernel_def, driver])
    tree = PyBasicConversions().visit(tree)

    kernel = tree.find(FunctionDecl, name="apply")
    kernel.set_static().set_inline()
    kernel.set_typesig(kernel_typesig)

    driver_decl = tree.find(FunctionDecl, name="apply_all")
    entry_point_typesig = driver_decl.get_type().as_ctype()

    return Project([tree]), entry_point_typesig
def transform(self, py_ast, program_config):
    """
    Convert the Python AST to a C AST according to the directions
    given in program_config.

    ``program_config`` is unpacked as ``(arg_config, tuner_config)``; only
    ``arg_config`` is read (keys ``A_len``, ``A_dtype``, ``A_ndim`` and
    ``A_shape``). The generated file contains ``py_ast.body[0]`` (located
    after conversion as the function named ``apply``) and a templated
    driver ``apply_all`` that overwrites each of the first ``A_len``
    elements with ``apply`` of that element.

    Side effect: writes ``to_dot(tree)`` to ``graph.dot`` in the current
    working directory.

    Returns a pair: the ``Project`` wrapping the generated file and the
    ctypes form of ``void (array_type)``.
    """
    arg_config, tuner_config = program_config
    len_A = arg_config['A_len']
    A_dtype = arg_config['A_dtype']
    A_ndim = arg_config['A_ndim']
    A_shape = arg_config['A_shape']

    inner_type = get_ctree_type(A_dtype)
    array_type = NdPointer(A_dtype, A_ndim, A_shape)
    apply_one_typesig = FuncType(inner_type, [inner_type])

    template_entries = {
        'array_decl': SymbolRef("A", array_type),
        'array_ref': SymbolRef("A"),
        'num_items': Constant(len_A),
    }

    # The backslash must be the last character on its line: it joins the
    # opening quotes to the first line of C. Followed by anything else it
    # would be emitted into the generated source as a literal backslash.
    tree = CFile("generated", [
        py_ast.body[0],
        StringTemplate("""\
    void apply_all($array_decl) {
        for (int i = 0; i < $num_items; i++) {
            $array_ref[i] = apply( $array_ref[i] );
        }
    }
    """, template_entries),
    ])

    tree = PyBasicConversions().visit(tree)

    apply_one = tree.find(FunctionDecl, name="apply")
    apply_one.set_static().set_inline()
    apply_one.set_typesig(apply_one_typesig)

    # Dump the converted tree for inspection.
    with open("graph.dot", 'w') as f:
        f.write(to_dot(tree))

    entry_point_typesig = FuncType(Void(), [array_type]).as_ctype()

    return Project([tree]), entry_point_typesig
def transform(self, py_ast, program_config):
    """
    Convert the Python AST to a C AST according to the directions
    given in program_config.

    ``program_config`` is unpacked as ``(arg_config, tuner_config)`` and is
    also stored on ``self._current_config``. ``arg_config`` supplies ``n``
    and ``dtype``; ``tuner_config`` supplies ``rx``, ``ry``, ``cx`` and
    ``cy``, each multiplied by 4 before use. The unroll factor is taken
    from ``tuner_config['ry'] * 4``. ``py_ast`` is not consulted: the whole
    C file is assembled from string templates.

    Returns a pair: the ``Project`` wrapping the generated file and the
    ctypes form of ``void (array, array, array, double *)``, matching the
    templated ``dgemm(C, A, B, duration)`` entry point.
    """
    self._current_config = program_config

    arg_config, tuner_config = program_config
    n, dtype = arg_config['n'], arg_config['dtype']
    rx, ry = tuner_config['rx'] * 4, tuner_config['ry'] * 4
    cx, cy = tuner_config['cx'] * 4, tuner_config['cy'] * 4
    unroll = tuner_config['ry'] * 4

    # NOTE(review): elem_type is not referenced below; the call is kept in
    # case callers rely on get_ctree_type rejecting the dtype here.
    elem_type = get_ctree_type(dtype)
    array_type = NdPointer(dtype, 2, (n, n))
    dgemm_typesig = FuncType(
        Void(), [array_type, array_type, array_type, Ptr(Double())])

    A = SymbolRef("A", array_type)
    B = SymbolRef("B", array_type)
    C = SymbolRef("C", array_type)

    N = Constant(n)
    RX, RY = Constant(rx), Constant(ry)
    CX, CY = Constant(cx), Constant(cy)
    UNROLL = Constant(unroll)

    template_args = {
        "A_decl": A.copy(declare=True),
        "B_decl": B.copy(declare=True),
        "C_decl": C.copy(declare=True),

        "RX": RX,
        "RY": RY,
        "CX": CX,
        "CY": CY,
        "UNROLL": UNROLL,
        "lda": N,
    }

    # Every template below receives its own deep copy of template_args so
    # that no AST node instance is handed to two templates.
    #
    # The C text must keep its line structure: preprocessor directives and
    # `//` comments both extend to the end of their line.
    preamble = StringTemplate("""
    #include <immintrin.h>
    #define min(x,y) (((x)<(y))?(x):(y))
    """, copy.deepcopy(template_args))

    reg_template_args = {
        'load_c_block': self._gen_load_c_block(rx, ry, n),
        'store_c_block': self._gen_store_c_block(rx, ry, n),
        'k_rank1_updates': self._gen_k_rank1_updates(
            rx, ry, cx, cy, unroll, n),
    }
    reg_template_args.update(copy.deepcopy(template_args))

    register_dgemm = StringTemplate("""
    void register_dgemm( $A_decl, $B_decl, $C_decl, int K ) {
        __m256d c[$RY/4][$RX];

        $load_c_block

        while ( K >= $UNROLL ) {
            $k_rank1_updates

            A += $UNROLL*$CY;
            B += $UNROLL;
            K -= $UNROLL;
        }

        $store_c_block
    }
    """, reg_template_args)

    fast_dgemm = StringTemplate("""
    void fast_dgemm( int M, int N, int K, $A_decl, $B_decl, $C_decl ) {
        static double a[$CX*$CY] __attribute__ ((aligned (16)));

        // make a local aligned copy of A's block
        for( int j = 0; j < K; j++ )
            for( int i = 0; i < M; i++ )
                a[i+j*$CY] = A[i+j*$lda];

        // multiply using the copy
        for( int j = 0; j < N; j += $RX )
            for( int i = 0; i < M; i += $RY )
                register_dgemm( a + i, B + j*$lda, C + i + j*$lda, K );
    }""", copy.deepcopy(template_args))

    fringe_dgemm = StringTemplate("""
    void fringe_dgemm( int M, int N, int K, $A_decl, $B_decl, $C_decl ) {
        for( int j = 0; j < N; j++ )
            for( int i = 0; i < M; i++ )
                for( int k = 0; k < K; k++ )
                    C[i+j*$lda] += A[i+k*$lda] * B[k+j*$lda];
    }
    """, copy.deepcopy(template_args))

    wall_time = StringTemplate("""
    #include <sys/time.h>

    double wall_time () {
        struct timeval t;
        gettimeofday (&t, NULL);
        return 1.*t.tv_sec + 1.e-6*t.tv_usec;
    }
    """, {})

    dgemm = StringTemplate("""
    int align( int x, int y ) { return x <= y ? x : (x/y)*y; }

    void dgemm($C_decl, $A_decl, $B_decl, double *duration) {
        double start_time = wall_time();

        for( int i = 0; i < $lda; ) {
            int I = align( min( $lda-i, $CY ), $RY );
            for( int j = 0; j < $lda; ) {
                int J = align( $lda-j, $RX );
                for( int k = 0; k < $lda; ) {
                    int K = align( min( $lda-k, $CX ), $UNROLL );

                    if( (I%$RY) == 0 && (J%$RX) == 0 && (K%$UNROLL) == 0 )
                        fast_dgemm ( I, J, K, A + i + k*$lda, B + k + j*$lda, C + i + j*$lda );
                    else
                        fringe_dgemm( I, J, K, A + i + k*$lda, B + k + j*$lda, C + i + j*$lda );

                    k += K;
                }
                j += J;
            }
            i += I;
        }

        // report time back for tuner
        *duration = wall_time() - start_time;
    }
    """, copy.deepcopy(template_args))

    tree = CFile("generated", [
        preamble,
        wall_time,
        register_dgemm,
        fast_dgemm,
        fringe_dgemm,
        dgemm,
    ])

    return Project([tree]), dgemm_typesig.as_ctype()
def args_to_subconfig(self, args):
    """
    Build the argument sub-configuration for a call.

    Returns a dict whose ``'arg_typesig'`` entry is a tuple holding
    ``get_ctree_type`` of each element of ``args``, in order.
    """
    typesig = tuple(map(get_ctree_type, args))
    return {'arg_typesig': typesig}
def transform(self, py_ast, program_config):
    """
    Convert the Python AST to a C AST according to the directions
    given in program_config.

    ``program_config`` is unpacked as ``(arg_config, tuner_config)`` and is
    also stored on ``self._current_config``. ``arg_config`` supplies ``n``
    and ``dtype``; ``tuner_config`` supplies ``rx``, ``ry``, ``cx`` and
    ``cy``, each multiplied by 4 before use. The unroll factor is taken
    from ``tuner_config['ry'] * 4``. ``py_ast`` is not consulted: the whole
    C file is assembled from string templates.

    Returns a pair: the ``Project`` wrapping the generated file and the
    ctypes form of ``void (array, array, array, double *)``, matching the
    templated ``dgemm(C, A, B, duration)`` entry point.
    """
    self._current_config = program_config

    arg_config, tuner_config = program_config
    n, dtype = arg_config['n'], arg_config['dtype']
    rx, ry = tuner_config['rx'] * 4, tuner_config['ry'] * 4
    cx, cy = tuner_config['cx'] * 4, tuner_config['cy'] * 4
    unroll = tuner_config['ry'] * 4

    # NOTE(review): elem_type is not referenced below; the call is kept in
    # case callers rely on get_ctree_type rejecting the dtype here.
    elem_type = get_ctree_type(dtype)
    array_type = NdPointer(dtype, 2, (n, n))
    dgemm_typesig = FuncType(
        Void(), [array_type, array_type, array_type, Ptr(Double())])

    A = SymbolRef("A", array_type)
    B = SymbolRef("B", array_type)
    C = SymbolRef("C", array_type)

    N = Constant(n)
    RX, RY = Constant(rx), Constant(ry)
    CX, CY = Constant(cx), Constant(cy)
    UNROLL = Constant(unroll)

    template_args = {
        "A_decl": A.copy(declare=True),
        "B_decl": B.copy(declare=True),
        "C_decl": C.copy(declare=True),

        "RX": RX,
        "RY": RY,
        "CX": CX,
        "CY": CY,
        "UNROLL": UNROLL,
        "lda": N,
    }

    # Each template is given a private deep copy of template_args, so the
    # same AST node object never ends up under two different templates.
    #
    # Line breaks inside the C text are significant: `#include`/`#define`
    # and `//` comments each run to the end of their line.
    preamble = StringTemplate(
        """
    #include <immintrin.h>
    #define min(x,y) (((x)<(y))?(x):(y))
    """,
        copy.deepcopy(template_args))

    reg_template_args = {
        'load_c_block': self._gen_load_c_block(rx, ry, n),
        'store_c_block': self._gen_store_c_block(rx, ry, n),
        'k_rank1_updates': self._gen_k_rank1_updates(
            rx, ry, cx, cy, unroll, n),
    }
    reg_template_args.update(copy.deepcopy(template_args))

    register_dgemm = StringTemplate(
        """
    void register_dgemm( $A_decl, $B_decl, $C_decl, int K ) {
        __m256d c[$RY/4][$RX];

        $load_c_block

        while ( K >= $UNROLL ) {
            $k_rank1_updates

            A += $UNROLL*$CY;
            B += $UNROLL;
            K -= $UNROLL;
        }

        $store_c_block
    }
    """,
        reg_template_args)

    fast_dgemm = StringTemplate(
        """
    void fast_dgemm( int M, int N, int K, $A_decl, $B_decl, $C_decl ) {
        static double a[$CX*$CY] __attribute__ ((aligned (16)));

        // make a local aligned copy of A's block
        for( int j = 0; j < K; j++ )
            for( int i = 0; i < M; i++ )
                a[i+j*$CY] = A[i+j*$lda];

        // multiply using the copy
        for( int j = 0; j < N; j += $RX )
            for( int i = 0; i < M; i += $RY )
                register_dgemm( a + i, B + j*$lda, C + i + j*$lda, K );
    }""",
        copy.deepcopy(template_args))

    fringe_dgemm = StringTemplate(
        """
    void fringe_dgemm( int M, int N, int K, $A_decl, $B_decl, $C_decl ) {
        for( int j = 0; j < N; j++ )
            for( int i = 0; i < M; i++ )
                for( int k = 0; k < K; k++ )
                    C[i+j*$lda] += A[i+k*$lda] * B[k+j*$lda];
    }
    """,
        copy.deepcopy(template_args))

    wall_time = StringTemplate(
        """
    #include <sys/time.h>

    double wall_time () {
        struct timeval t;
        gettimeofday (&t, NULL);
        return 1.*t.tv_sec + 1.e-6*t.tv_usec;
    }
    """,
        {})

    dgemm = StringTemplate(
        """
    int align( int x, int y ) { return x <= y ? x : (x/y)*y; }

    void dgemm($C_decl, $A_decl, $B_decl, double *duration) {
        double start_time = wall_time();

        for( int i = 0; i < $lda; ) {
            int I = align( min( $lda-i, $CY ), $RY );
            for( int j = 0; j < $lda; ) {
                int J = align( $lda-j, $RX );
                for( int k = 0; k < $lda; ) {
                    int K = align( min( $lda-k, $CX ), $UNROLL );

                    if( (I%$RY) == 0 && (J%$RX) == 0 && (K%$UNROLL) == 0 )
                        fast_dgemm ( I, J, K, A + i + k*$lda, B + k + j*$lda, C + i + j*$lda );
                    else
                        fringe_dgemm( I, J, K, A + i + k*$lda, B + k + j*$lda, C + i + j*$lda );

                    k += K;
                }
                j += J;
            }
            i += I;
        }

        // report time back for tuner
        *duration = wall_time() - start_time;
    }
    """,
        copy.deepcopy(template_args))

    tree = CFile("generated", [
        preamble,
        wall_time,
        register_dgemm,
        fast_dgemm,
        fringe_dgemm,
        dgemm,
    ])

    return Project([tree]), dgemm_typesig.as_ctype()
def get_base_type(self):
    """Return ``get_ctree_type`` applied to ``self.ptr._dtype_``."""
    dtype = self.ptr._dtype_
    return get_ctree_type(dtype)
def visit_Constant(self, node):
    """Return ``get_ctree_type`` applied to the constant's ``value``."""
    literal = node.value
    return get_ctree_type(literal)
def args_to_subconfig(self, args):
    """
    Build the argument sub-configuration for a call.

    Only the first element of ``args`` is inspected; the result is a dict
    whose ``'arg_type'`` entry is ``get_ctree_type`` of that element.
    """
    first_arg = args[0]
    subconfig = {'arg_type': get_ctree_type(first_arg)}
    return subconfig