def get_angle_function(dtypein, dtypeout, pitch=True):
    type_src = dtype_to_ctype(dtypein)
    type_dest = dtype_to_ctype(dtypeout)
    name = "angle_function"
    if dtypeout == np.float32:
        fletter = "f"
    else:
        fletter = ""
    if pitch:
        func = SourceModule(pitch_angle_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "fletter": fletter,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')  # int32, int32, intp, int32, intp, int32
    else:
        func = SourceModule(non_pitch_angle_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "fletter": fletter,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  # intp, intp, int32
    return func
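# --- Illustrative usage sketch (not part of the original source) ---
# All the get_*_function() wrappers below follow the same pattern: render a
# kernel template, compile it with SourceModule, and return a prepared
# function. A prepared function is launched with pycuda's
# prepared_call(grid, block, *args). The argument order (dst, src, size) for
# the non-pitched 'PPi' signature is an assumption, since the *_template
# kernel sources are defined elsewhere in this module and not shown here.
def _example_angle_call(src_gpu, dst_gpu, grid, block):
    func = get_angle_function(src_gpu.dtype, dst_gpu.dtype, pitch=False)
    func.prepared_call(grid, block,
                       dst_gpu.gpudata, src_gpu.gpudata,
                       np.int32(src_gpu.size))
    return dst_gpu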
def integrate(stepsize=.01, stores=5, steps=10000, number_of_particles=2**10):
    gpu_r, gpu_v, gpu_mass = create_particles(number_of_particles)
    number_of_particles = np.int32(number_of_particles)
    gpu_rs, gpu_vs = [gpu_r], [gpu_v]
    for i in xrange(stores - 1):
        gpu_rs.append(gpuarray.empty_like(gpu_r))
        gpu_vs.append(gpuarray.empty_like(gpu_v))
    advance = SourceModule(advance_kernel).get_function("advance")
    advance.prepare([np.intp, np.intp, np.intp, np.intp, np.intp, np.int32])
    # CUDA launch dimensions must be at least 1 in every axis
    block_size = (32, 1, 1)
    grid_size = (int(number_of_particles) // 32, 1, 1)
    # prepared_call takes the grid first, then the block
    advance.prepared_call(grid_size, block_size,
                          gpu_rs[0].gpudata, gpu_vs[0].gpudata, gpu_mass.gpudata,
                          gpu_rs[1].gpudata, gpu_vs[1].gpudata,
                          number_of_particles)
    old, new = 1, 2
    for i in xrange(steps):
        # copy the last completed step to the host while the next step
        # is launched asynchronously
        r = gpu_rs[old].get_async()
        v = gpu_vs[old].get_async()
        advance.prepared_async_call(grid_size, block_size, None,
                                    gpu_rs[old].gpudata, gpu_vs[old].gpudata,
                                    gpu_mass.gpudata,
                                    gpu_rs[new].gpudata, gpu_vs[new].gpudata,
                                    number_of_particles)
        np.savetxt("step{0:4}_r.dat".format(i * stepsize), r)
        np.savetxt("step{0:4}_v.dat".format(i * stepsize), v)
        old, new = new, (new + 1) % stores
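# Hypothetical sketch of the `advance_kernel` source referenced above (the
# real kernel is defined elsewhere in this module). It only illustrates a
# signature consistent with prepare([intp, intp, intp, intp, intp, int32]);
# the brute-force O(N^2) gravity update, the flat xyz-per-particle layout,
# and the hard-coded time step are assumptions.
advance_kernel_sketch = """
__global__ void advance(const float *r, const float *v, const float *mass,
                        float *r_new, float *v_new, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float ax = 0.f, ay = 0.f, az = 0.f;
    for (int j = 0; j < n; ++j) {            /* brute-force pairwise forces */
        float dx = r[3*j]   - r[3*i];
        float dy = r[3*j+1] - r[3*i+1];
        float dz = r[3*j+2] - r[3*i+2];
        float inv = rsqrtf(dx*dx + dy*dy + dz*dz + 1e-9f);
        float s = mass[j] * inv * inv * inv;
        ax += s * dx; ay += s * dy; az += s * dz;
    }
    const float dt = 0.01f;                  /* must match `stepsize` */
    v_new[3*i]   = v[3*i]   + dt * ax;
    v_new[3*i+1] = v[3*i+1] + dt * ay;
    v_new[3*i+2] = v[3*i+2] + dt * az;
    r_new[3*i]   = r[3*i]   + dt * v_new[3*i];
    r_new[3*i+1] = r[3*i+1] + dt * v_new[3*i+1];
    r_new[3*i+2] = r[3*i+2] + dt * v_new[3*i+2];
}
"""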
def get_divscalar_function(src_type, dest_type, pitch=True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    name = "divscalar"
    operation = "/"
    if pitch:
        func = SourceModule(pitch_left_scalar_op_template % {
            "name": name, "src_type": type_src,
            "dest_type": type_dest, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare([np.int32, np.int32, np.intp, np.int32,
                      np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(non_pitch_left_scalar_op_template % {
            "name": name, "src_type": type_src,
            "dest_type": type_dest, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare([np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
def get_complex_function(real_type, imag_type, result_type, pitch=True):
    type_real = dtype_to_ctype(real_type)
    type_imag = dtype_to_ctype(imag_type)
    type_result = dtype_to_ctype(result_type)
    name = "makecomplex"
    if pitch:
        func = SourceModule(pitch_complex_template % {
            "name": name, "real_type": type_real,
            "imag_type": type_imag, "result_type": type_result,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')  # int32, int32, intp, int32, intp, intp, int32
    else:
        func = SourceModule(non_pitch_complex_template % {
            "name": name, "real_type": type_real,
            "imag_type": type_imag, "result_type": type_result,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')  # intp, intp, intp, int32
    return func
def get_astype_function(dtype_dest, dtype_src, pitch=True):
    type_dest = dtype_to_ctype(dtype_dest)
    type_src = dtype_to_ctype(dtype_src)
    name = "astype"
    operation = ""
    if pitch:
        func = SourceModule(pitch_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')  # int32, int32, intp, int32, intp, int32
    else:
        func = SourceModule(non_pitch_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  # intp, intp, int32
    return func
def get_conj_function(dtype, pitch=True):
    type_src = dtype_to_ctype(dtype)
    if dtype in (np.complex64, np.complex128):
        operation = "pycuda::conj"
    else:
        raise TypeError("Only complex arrays are allowed "
                        "to perform conjugation")
    name = "conj"
    if pitch:
        func = SourceModule(pitch_template % {
            "name": name, "dest_type": type_src,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')  # int32, int32, intp, int32, intp, int32
    else:
        func = SourceModule(non_pitch_template % {
            "name": name, "dest_type": type_src,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  # intp, intp, int32
    return func
def get_complex_from_amp_function(in_type, result_type, pitch=True):
    type_in = dtype_to_ctype(in_type)
    type_result = dtype_to_ctype(result_type)
    name = "makecomplex_amp_phase"
    if pitch:
        func = SourceModule(pitch_complex_amp_template % {
            "name": name, "in_type": type_in, "result_type": type_result,
            "fletter": 'f' if in_type == np.float32 else '',
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')  # int32, int32, intp, int32, intp, intp, int32
    else:
        func = SourceModule(non_pitch_complex_amp_template % {
            "name": name, "in_type": type_in, "result_type": type_result,
            "fletter": 'f' if in_type == np.float32 else '',
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')  # intp, intp, intp, int32
    return func
def get_divarray_function(left_dtype, right_dtype, rslt_dtype, pitch=True):
    type_left = dtype_to_ctype(left_dtype)
    type_right = dtype_to_ctype(right_dtype)
    type_rslt = dtype_to_ctype(rslt_dtype)
    name = "divarray"
    operation = "/"
    if pitch:
        func = SourceModule(pitch_array_op_template % {
            "name": name, "dest_type": type_rslt, "left_type": type_left,
            "right_type": type_right, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        # int32, int32, intp, int32, intp, int32, intp, int32
        func.prepare('iiPiPiPi')
    else:
        func = SourceModule(non_pitch_array_op_template % {
            "name": name, "dest_type": type_rslt, "left_type": type_left,
            "right_type": type_right, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')  # intp, intp, intp, int32
    return func
def get_scalardiv_function(src_type, dest_type, pitch=True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    name = "scalardiv"
    operation = "/"
    if pitch:
        func = SourceModule(pitch_right_scalar_op_template % {
            "name": name, "src_type": type_src,
            "dest_type": type_dest, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        # int32, int32, intp, int32, intp, int32, <scalar of dest_type>
        func.prepare('iiPiPi' + np.dtype(dest_type).char)
    else:
        func = SourceModule(non_pitch_right_scalar_op_template % {
            "name": name, "src_type": type_src,
            "dest_type": type_dest, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        # intp, intp, <scalar of dest_type>, int32
        func.prepare('PP' + np.dtype(dest_type).char + 'i')
    return func
def get_powscalar_function(src_type, dest_type, pitch=True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    name = "powscalar"
    operation = "pow"
    if pitch:
        func = SourceModule(pitch_left_scalar_func_template % {
            "name": name, "src_type": type_src,
            "dest_type": type_dest, "operation": operation,
            "fletter": 'f' if src_type == np.float32 else '',
        }, options=["--ptxas-options=-v"]).get_function(name)
        # int32, int32, intp, int32, intp, int32, <scalar of dest_type>
        func.prepare('iiPiPi' + np.dtype(dest_type).char)
    else:
        func = SourceModule(non_pitch_left_scalar_func_template % {
            "name": name, "src_type": type_src,
            "dest_type": type_dest, "operation": operation,
            "fletter": 'f' if src_type == np.float32 else '',
        }, options=["--ptxas-options=-v"]).get_function(name)
        # intp, intp, <scalar of dest_type>, int32
        func.prepare('PP' + np.dtype(dest_type).char + 'i')
    return func
def get_resize_function(dtype):
    type_src = dtype_to_ctype(dtype)
    name = "resize"
    func = SourceModule(reshape_template % {
        "name": name, "dest_type": type_src,
        "src_type": type_src, "operation": "",
    }, options=["--ptxas-options=-v"]).get_function(name)
    # int32, int32, int32, int32, intp, int32, intp, int32
    func.prepare('iiiiPiPi')
    return func
def get_realimag_function(dtype, real=True, pitch=True):
    type_src = dtype_to_ctype(dtype)
    if dtype == np.complex64:
        type_dest = "float"
    elif dtype == np.complex128:
        type_dest = "double"
    else:
        raise TypeError("only complex inputs (numpy.complex64 or "
                        "numpy.complex128) are supported")
    if real:
        operation = "pycuda::real"
        name = "real"
    else:
        operation = "pycuda::imag"
        name = "imag"
    if pitch:
        func = SourceModule(pitch_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')  # int32, int32, intp, int32, intp, int32
    else:
        func = SourceModule(non_pitch_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  # intp, intp, int32
    return func
def get_transpose_function(dtype, conj=False):
    src_type = dtype_to_ctype(dtype)
    name = "trans"
    operation = ""
    if conj and dtype in (np.complex64, np.complex128):
        operation = "pycuda::conj"
    func = SourceModule(transpose_template % {
        "name": name, "type": src_type, "operation": operation,
    }, options=["--ptxas-options=-v"]).get_function(name)
    func.prepare('iiPiPi')  # int32, int32, intp, int32, intp, int32
    return func
def get_abs_function(dtype, pitch=True):
    type_src = dtype_to_ctype(dtype)
    if dtype == np.complex128:
        operation = "pycuda::abs"
        type_dest = "double"
    elif dtype == np.complex64:
        operation = "pycuda::abs"
        type_dest = "float"
    elif dtype == np.float64:
        operation = "fabs"
        type_dest = "double"
    elif dtype == np.float32:
        operation = "fabsf"
        type_dest = "float"
    else:
        operation = "abs"
        type_dest = dtype_to_ctype(dtype)
    name = "abs_function"
    if pitch:
        func = SourceModule(pitch_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')  # int32, int32, intp, int32, intp, int32
    else:
        func = SourceModule(non_pitch_template % {
            "name": name, "dest_type": type_dest,
            "src_type": type_src, "operation": operation,
        }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  # intp, intp, int32
    return func
def get_fill_function(dtype, pitch=True):
    type_dst = dtype_to_ctype(dtype)
    name = "fill"
    if pitch:
        func = SourceModule(fill_pitch_template % {
            "name": name, "type_dst": type_dst,
        }, options=["--ptxas-options=-v"]).get_function(name)
        # int32, int32, intp, int32, <scalar of dtype>
        func.prepare('iiPi' + np.dtype(dtype).char)
    else:
        func = SourceModule(fill_nonpitch_template % {
            "name": name, "type_dst": type_dst,
        }, options=["--ptxas-options=-v"]).get_function(name)
        # int32, intp, <scalar of dtype>
        func.prepare('iP' + np.dtype(dtype).char)
    return func
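# Illustrative sketch (not from the original source): the trailing
# np.dtype(dtype).char in the prepare string lets the fill value be passed
# directly as a numpy scalar of the destination dtype. The non-pitched
# argument order (size, dst, value) is an assumption, since
# fill_nonpitch_template is not shown here.
def _example_fill(dst_gpu, value, grid, block):
    func = get_fill_function(dst_gpu.dtype, pitch=False)
    func.prepared_call(grid, block,
                       np.int32(dst_gpu.size), dst_gpu.gpudata,
                       dst_gpu.dtype.type(value))
    return dst_gpu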
def setup(self, bottom, top):
    assert len(bottom) == 3
    assert len(top) == 1
    # parameters
    param = eval(self.param_str_)
    self.lambda_ = param['lambda']
    self.clip_gradient_ = param.get('clip_gradient', None)
    # Create CUDA functions
    with pu.caffe_cuda_context():
        self.k_masked_diff_ = ElementwiseKernel(
            "float *diff, float *pred, float *label, float *mask",
            "diff[i] = (pred[i] - label[i]) * mask[i]", 'masked_diff')
        self.k_squared_ = ElementwiseKernel(
            "float *diff, float *diff2",
            "diff2[i] = diff[i] * diff[i]", 'squared')
        self.k_ensure_mask_sum_ = ElementwiseKernel(
            "float *mask_sum",
            "mask_sum[i] = max(mask_sum[i], 1.0f)", 'ensure_mask_sum')
        if self.clip_gradient_ is not None:
            self.k_clip_gradient = ElementwiseKernel(
                "float *diff",
                "diff[i] = fmaxf(-{0}, fminf(diff[i], {0}))".format(
                    self.clip_gradient_), 'clip_gradient')
        # These reductions could be computed faster with cublasSdot.
        self.k_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b", map_expr="d[i]",
            arguments="float *d")
        self.k_squred_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b", map_expr="d[i] * d[i]",
            arguments="float *d")
        self.k_div_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b", map_expr="d[i] / m[i]",
            arguments="float *d, float *m")
        self.k_div_squared_sum_ = ReductionKernel(
            dtype, neutral="0", reduce_expr="a+b",
            map_expr="d[i] * d[i] / (m[i] * m[i])",
            arguments="float *d, float *m")
        func_backward = SourceModule(
            """
            #include <caffe/util/device_alternate.hpp>
            __global__ void backward(float *pred, float *label, float *mask,
                                     float *diff_sum, float *mask_sum,
                                     int count, int stride, int sgn,
                                     int batch_size, float lambda,
                                     float loss_weight, float *diff) {
              CUDA_KERNEL_LOOP(i, count) {
                diff[i] = loss_weight * mask[i] * 2.0f * sgn
                    / mask_sum[i / stride] / batch_size
                    * ((pred[i] - label[i])
                       - lambda / mask_sum[i / stride] * diff_sum[i / stride]);
              }
            }
            """,
            include_dirs=pu.caffe_include_dirs).get_function("backward")
        func_backward.prepare("PPPPPiiiiffP")

        def _func_backward(pred, label, mask, ds, ms, sgn, loss_weight, diff):
            bg = pu.block_and_grid(pred.size)
            batch_size = pred.shape[0]
            count = pred.size
            stride = pred.size / pred.shape[0]
            func_backward.prepared_call(
                bg['grid'], bg['block'],
                pred.gpudata, label.gpudata, mask.gpudata,
                ds.gpudata, ms.gpudata,
                count, stride, sgn, batch_size,
                self.lambda_, loss_weight, diff.gpudata)

        self.k_backward_ = _func_backward
    self.batch_size_ = 0
    self.dim_ = 0
    self.reshape(bottom, top)
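# Illustrative sketch (not from the original source): a NumPy reference for
# the per-image loss whose gradient the `backward` kernel above appears to
# implement -- a masked, scale-invariant L2 loss of the form
#   L = (1/m) * sum(d_i^2) - (lambda/m^2) * (sum(d_i))^2,
#   d = (pred - label) * mask,
# where m is the per-image mask sum (clamped to at least 1, as in
# k_ensure_mask_sum_). The exact forward normalization used by the layer is
# an assumption.
def reference_loss(pred, label, mask, lam):
    # pred, label, mask: arrays of shape (batch, ...), mask entries in {0, 1}
    batch = pred.shape[0]
    d = ((pred - label) * mask).reshape(batch, -1)
    m = np.maximum(mask.reshape(batch, -1).sum(axis=1), 1.0)
    per_image = (d ** 2).sum(axis=1) / m - lam * d.sum(axis=1) ** 2 / m ** 2
    return per_image.mean()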
class SLFNSkCUDA(SLFN):
    """Single Layer Feed-forward Network (SLFN) implementation on GPU with pyCUDA.

    To choose a specific GPU, use the environment variable ``CUDA_DEVICE``,
    for example ``CUDA_DEVICE=0 python myscript1.py & CUDA_DEVICE=1 python myscript2.py``.

    In single precision, only the upper triangular part of the HH matrix is
    computed to speed up the method.
    """

    def __init__(self, inputs, outputs, norm=None, precision=np.float64):
        super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

        # startup GPU
        #self.ctx = misc.init_context(misc.init_device(nDevice))
        # NO NO NO, crashes and does not release memory
        # use CUDA_DEVICE=0 python my-script.py
        try:
            linalg.init()
        except OSError as e:
            pass  # no 'cusolver' library, which is paid and not needed
            # print "error initializing scikit-cuda: %s" % e
            # print "ignore if toolbox works"

        # precision-dependent stuff
        if precision is np.float64:
            self.posv = lapack.dposv
        else:
            self.posv = lapack.sposv
        self.handle = cublas.cublasCreate()

        # prepare GPU function kernels
        kernel = """
            __global__ void dev_sigm(%s *a) {
                unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
                a[idx] = 1.0 / ( exp(a[idx]) + 1 );
            }
            """
        kernel = kernel % "double" if self.precision is np.float64 else kernel % "float"
        self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
        self.dev_sigm.prepare("P")

        # GPU transformation functions
        self.func["lin"] = self._dev_lin
        self.func["sigm"] = self._dev_sigm
        self.func["tanh"] = self._dev_tanh
        self.func["rbf_l1"] = self._dev_rbfl1
        self.func["rbf_l2"] = self._dev_rbfl2
        self.func["rbf_linf"] = self._dev_rbflinf

    def _dev_lin(self, devX, devW, devB):
        """Linear function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        return devH

    def _dev_sigm(self, devX, devW, devB):
        """Compute sigmoid on GPU for a given array and return the array."""
        # def sigm(a):
        #     block = a._block
        #     grid = (int(np.ceil(1.0 * np.prod(a.shape) / block[0])), 1)
        #     dev_sigm.prepared_call(grid, block, a.gpudata)
        #     return a
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        block = devH._block
        grid = (int(np.ceil(1.0 * np.prod(devH.shape) / block[0])), 1)
        self.dev_sigm.prepared_call(grid, block, devH.gpudata)
        return devH

    def _dev_tanh(self, devX, devW, devB):
        """Hyperbolic tangent function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        cumath.tanh(devH, out=devH)
        return devH

    def _dev_rbfl1(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_L1
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "cityblock")**2 / B))
        return devH

    def _dev_rbfl2(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_L2
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "euclidean")**2 / B))
        return devH

    def _dev_rbflinf(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_Linf
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "chebyshev")**2 / B))
        return devH

    def add_neurons(self, number, func, W, B):
        """Add prepared neurons to the SLFN, merging them with existing ones.

        Adds a number of specific neurons to the SLFN network. Weights and
        biases must be provided for that function. If neurons of this type
        already exist, they are merged together.

        Args:
            number (int): the number of new neurons to add
            func (str): transformation function of the hidden layer; a linear
                function creates a linear model
            W (matrix): a 2-D matrix of neuron weights, size (`inputs` * `number`)
            B (vector): a 1-D vector of neuron biases, size (`number` * 1)
        """
        ntypes = [nr[1] for nr in self.neurons]  # existing types of neurons
        if func in ntypes:
            # add to an existing neuron type
            i = ntypes.index(func)
            nn0, _, devW, devB = self.neurons[i]
            number = nn0 + number
            devW = gpuarray.to_gpu(np.hstack((devW.get(), W)))
            devB = gpuarray.to_gpu(np.hstack((devB.get(), B)))
            self.neurons[i] = (number, func, devW, devB)
        else:
            # create a new neuron type
            devW = gpuarray.to_gpu(W)
            devB = gpuarray.to_gpu(B)
            self.neurons.append((number, func, devW, devB))
        self.reset()
        self.B = None

    def reset(self):
        """Reset intermediate training results and release the memory they use.

        Keeps the ELM solution, so a trained ELM remains operational.
        Can be called to free memory after an ELM is trained.
        """
        self.L = sum([n[0] for n in self.neurons])  # get number of neurons
        self.HH = None
        self.HT = None

    def _project(self, X, dev=False):
        """Project X to H; an auxiliary function that implements a particular projection.

        For the actual projection, use `ELM.project()` instead.

        Args:
            X (matrix): an input data matrix, size (N * `inputs`)
            dev (bool, optional): whether to leave the result in GPU memory

        Returns:
            H (matrix): an SLFN hidden layer representation, size (N * `L`)
                where `L` is the number of neurons
        """
        assert self.neurons is not None, "ELM has no neurons"
        X = np.array(X, order="C", dtype=self.precision)
        devX = gpuarray.to_gpu(X)
        devH = gpuarray.empty((X.shape[0], self.L), dtype=self.precision)
        i = 0
        for nn, ftype, devW, devB in self.neurons:
            devH[:, i:i+nn] = self.func[ftype](devX, devW, devB)
            i += nn
        H = devH if dev else devH.get()
        return H

    def _predict(self, X, dev=False):
        """Predict a batch of data; an auxiliary function that implements a particular prediction.

        For prediction, use `ELM.predict()` instead.

        Args:
            X (matrix): input data, size (N * `inputs`)
            dev (bool, optional): whether to leave the result in GPU memory

        Returns:
            Y (matrix): predicted outputs, size (N * `outputs`), always in
                float/double format.
        """
        assert self.B is not None, "Solve the task before predicting"
        devH = self._project(X, dev=True)
        devY = linalg.dot(devH, self.B)
        Y = devY if dev else devY.get()
        return Y

    def add_batch(self, X, T, wc=None):
        """Add a batch of training data to the iterative solution, weighted if needed.

        The batch is processed as a whole; the training data is split in the
        `ELM.add_data()` method. With parameters HH_out, HT_out, the output
        is put into these matrices instead of the model.

        Args:
            X (matrix): input data matrix, size (N * `inputs`)
            T (matrix): output data matrix, size (N * `outputs`)
            wc (vector): weights for data samples, one weight per sample, size (N * 1)
            HH_out, HT_out (matrix, optional): output matrices to add the batch
                result into, always given together
        """
        devH = self._project(X, dev=True)
        T = np.array(T, order="C", dtype=self.precision)
        devT = gpuarray.to_gpu(T)
        if wc is not None:  # apply weights if given
            w = np.array(wc**0.5, dtype=self.precision)[:, None]  # reshape to a column matrix
            devWC = gpuarray.to_gpu(w)
            misc.mult_matvec(devH, devWC, axis=0, out=devH)
            misc.mult_matvec(devT, devWC, axis=0, out=devT)
        if self.HH is None:  # initialize space for self.HH, self.HT
            self.HT = misc.zeros((self.L, self.outputs), dtype=self.precision)
            self.HH = linalg.eye(self.L, dtype=self.precision)
            self.HH *= self.norm
        linalg.add_dot(devH, devT, self.HT, transa='T')
        if self.precision is np.float64:
            linalg.add_dot(devH, devH, self.HH, transa='T')
        else:
            cublas.cublasSsyrk(self.handle, 'L', 'N', self.L, X.shape[0],
                               1, devH.ptr, self.L, 1, self.HH.ptr, self.L)
        # self.ctx.synchronize()  # GPU runs asynchronously without that

    def solve(self):
        """Compute output weights B, with a fix for an unstable solution."""
        HH = self.HH.get()
        HT = self.HT.get()
        B = self.solve_corr(HH, HT)
        self.B = gpuarray.to_gpu(B)

    def solve_corr(self, HH, HT):
        """Compute output weights B for given HH and HT.

        Simple but inefficient version; see a better one in solver_python.

        Args:
            HH (matrix): covariance matrix of the hidden layer representation H,
                size (`L` * `L`)
            HT (matrix): correlation matrix between H and outputs T,
                size (`L` * `outputs`)
        """
        _, B, info = self.posv(HH, HT)
        if info > 0:
            print "ELM covariance matrix is not full rank; solving with SVD (slow)"
            print "This happened because you have duplicated or too many neurons"
            HH = np.triu(HH) + np.triu(HH, k=1).T
            B = np.linalg.lstsq(HH, HT)[0]
        B = np.array(B, order='C', dtype=self.precision)
        return B

    def _prune(self, idx):
        """Leave only neurons with the given indexes."""
        idx = list(idx)
        neurons = []
        for k, func, devW, devB in self.neurons:
            ix1 = [i for i in idx if i < k]  # indexes for the current neuron type
            idx = [i - k for i in idx if i >= k]
            number = len(ix1)
            W = devW.get()
            W = np.array(W[:, ix1], order='C')
            devW = gpuarray.to_gpu(W)
            B = devB.get()
            B = np.array(B[ix1], order='C')
            devB = gpuarray.to_gpu(B)
            neurons.append((number, func, devW, devB))
        self.neurons = neurons
        # reset invalid parameters
        self.reset()
        self.B = None

    def get_B(self):
        """Return B as a numpy array."""
        if self.B is None:
            B = None
        else:
            B = self.B.get()
        return B

    def set_B(self, B):
        """Set B as a numpy array.

        Args:
            B (matrix): output layer weights matrix, size (`L` * `outputs`)
        """
        assert B.shape[0] == self.L, \
            "Incorrect first dimension: %d expected, %d found" % (self.L, B.shape[0])
        assert B.shape[1] == self.outputs, \
            "Incorrect output dimension: %d expected, %d found" % (self.outputs, B.shape[1])
        self.B = gpuarray.to_gpu(B.astype(self.precision))

    def get_corr(self):
        """Return the current correlation matrices."""
        if self.HH is None:
            HH = None
            HT = None
        else:
            HH = self.HH.get()
            HT = self.HT.get()
            HH = np.triu(HH) + np.triu(HH, k=1).T
        return HH, HT

    def set_corr(self, HH, HT):
        """Set pre-computed correlation matrices.

        Args:
            HH (matrix): covariance matrix of the hidden layer representation H,
                size (`L` * `L`)
            HT (matrix): correlation matrix between H and outputs T,
                size (`L` * `outputs`)
        """
        assert self.neurons is not None, "Add or load neurons before using ELM"
        assert HH.shape[0] == HH.shape[1], "HH must be a square matrix"
        msg = "Wrong HH dimension: (%d, %d) expected, %s found" % (self.L, self.L, HH.shape)
        assert HH.shape[0] == self.L, msg
        assert HH.shape[0] == HT.shape[0], \
            "HH and HT must have the same number of rows (%d)" % self.L
        assert HT.shape[1] == self.outputs, \
            "Number of columns in HT must equal the number of outputs (%d)" % self.outputs
        self.HH = gpuarray.to_gpu(HH.astype(self.precision))
        self.HT = gpuarray.to_gpu(HT.astype(self.precision))

    def get_neurons(self):
        """Return the current neurons.

        Returns:
            neurons (list of tuples (number/int, func/string, W/matrix, B/vector)):
                current neurons in the model
        """
        neurons = []
        for number, func, devW, devB in self.neurons:
            neurons.append((number, func, devW.get(), devB.get()))
        return neurons
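# Illustrative usage sketch (not from the original source): training an ELM
# with the GPU solver above. The input/output sizes, neuron count, norm value,
# and random initialization of W and B are arbitrary example values.
if __name__ == "__main__":
    N, n_inputs, n_outputs, L = 1000, 10, 1, 64
    X = np.random.rand(N, n_inputs)
    T = np.random.rand(N, n_outputs)
    slfn = SLFNSkCUDA(n_inputs, n_outputs, norm=1e-6, precision=np.float64)
    slfn.add_neurons(L, "sigm", np.random.randn(n_inputs, L), np.random.randn(L))
    slfn.add_batch(X, T)   # accumulate HH and HT on the GPU
    slfn.solve()           # compute output weights B
    Y = slfn._predict(X)   # predictions as a numpy array
    print "training MSE: %f" % np.mean((Y - T) ** 2)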