Example #1
def get_angle_function(dtypein, dtypeout, pitch=True):
    type_src = dtype_to_ctype(dtypein)
    type_dest = dtype_to_ctype(dtypeout)
    name = "angle_function"
    if dtypeout == np.float32:
        fletter = "f"
    else:
        fletter = ""

    if pitch:
        func = SourceModule(pitch_angle_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "fletter": fletter,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_angle_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "fletter": fletter,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  #[np.intp, np.intp, np.int32])
    return func
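
A minimal usage sketch for the non-pitched case (a sketch only: it assumes non_pitch_angle_template and dtype_to_ctype are importable so that get_angle_function itself works, and the destination-first pointer order is inferred from the commented dtype lists, not verified):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.gpuarray as gpuarray

z = gpuarray.to_gpu((np.random.randn(4096)
                     + 1j * np.random.randn(4096)).astype(np.complex64))
theta = gpuarray.empty(4096, dtype=np.float32)

angle = get_angle_function(np.complex64, np.float32, pitch=False)
block = (256, 1, 1)
grid = ((z.size + block[0] - 1) // block[0], 1)
# prepared signature 'PPi': destination pointer, source pointer, element count
angle.prepared_call(grid, block, theta.gpudata, z.gpudata, np.int32(z.size))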
Example #2
def integrate(stepsize=.01, stores=5, steps=10000, number_of_particles=2**10):
    # create_particles() and advance_kernel are assumed to be defined elsewhere
    gpu_r, gpu_v, gpu_mass = create_particles(number_of_particles)
    number_of_particles = np.int32(number_of_particles)
    gpu_rs, gpu_vs = [gpu_r], [gpu_v]

    # ring buffer of position/velocity arrays to rotate through
    for i in xrange(stores - 1):
        gpu_rs.append(gpuarray.empty_like(gpu_r))
        gpu_vs.append(gpuarray.empty_like(gpu_v))

    advance = SourceModule(advance_kernel).get_function("advance")
    advance.prepare([np.intp, np.intp, np.intp, np.intp, np.intp, np.int32])

    # block and grid dimensions must be at least 1 in every axis
    block_size = (32, 1, 1)
    grid_size = (int(number_of_particles) // 32, 1)

    # prepared_call takes the grid first, then the block; device pointers are
    # passed via .gpudata
    advance.prepared_call(grid_size, block_size,
                          gpu_rs[0].gpudata, gpu_vs[0].gpudata, gpu_mass.gpudata,
                          gpu_rs[1].gpudata, gpu_vs[1].gpudata,
                          number_of_particles)

    old, new = 1, 2
    for i in xrange(steps):
        r = gpu_rs[old].get_async()
        v = gpu_vs[old].get_async()
        advance.prepared_async_call(grid_size, block_size, None,
                                    gpu_rs[old].gpudata, gpu_vs[old].gpudata,
                                    gpu_mass.gpudata,
                                    gpu_rs[new].gpudata, gpu_vs[new].gpudata,
                                    number_of_particles)

        np.savetxt("step{0:09.3f}_r.dat".format(i * stepsize), r)
        np.savetxt("step{0:09.3f}_v.dat".format(i * stepsize), v)

        old, new = new, (new + 1) % stores
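
The get_async calls above still go through the default stream, so the downloads do not actually overlap the kernel launches unless an explicit stream and page-locked host buffers are used. A hedged sketch of that variant, reusing names from the function above (an assumption about how the loop could be extended, not part of the original code):

import pycuda.driver as drv

stream = drv.Stream()
host_r = drv.pagelocked_empty(gpu_rs[old].shape, gpu_rs[old].dtype)

# queue the copy and the next step on the same stream, then do host-side work
# (e.g. writing the previous snapshot to disk) while both run on the device
gpu_rs[old].get_async(stream=stream, ary=host_r)
advance.prepared_async_call(grid_size, block_size, stream,
                            gpu_rs[old].gpudata, gpu_vs[old].gpudata,
                            gpu_mass.gpudata,
                            gpu_rs[new].gpudata, gpu_vs[new].gpudata,
                            number_of_particles)
stream.synchronize()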
Example #3
def get_divscalar_function(src_type, dest_type, pitch = True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    
    name = "divscalar"
    operation = "/"
    
    if pitch:
        func = SourceModule(
                pitch_left_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare([np.int32, np.int32, np.intp, np.int32,
                      np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(
                non_pitch_left_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare([np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
Example #4
def get_complex_function(real_type, imag_type, result_type, pitch=True):
    type_real = dtype_to_ctype(real_type)
    type_imag = dtype_to_ctype(imag_type)
    type_result = dtype_to_ctype(result_type)

    name = "makecomplex"

    if pitch:
        func = SourceModule(pitch_complex_template % {
            "name": name,
            "real_type": type_real,
            "imag_type": type_imag,
            "result_type": type_result
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')  #[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_complex_template % {
            "name": name,
            "real_type": type_real,
            "imag_type": type_imag,
            "result_type": type_result
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')  #[np.intp, np.intp, np.intp, np.int32])
    return func
Example #5
def get_astype_function(dtype_dest, dtype_src, pitch = True):
    type_dest = dtype_to_ctype(dtype_dest)
    type_src = dtype_to_ctype(dtype_src)
    name = "astype"
    operation = ""
    
    if pitch:
        func = SourceModule(
                pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')#[np.intp, np.intp, np.int32])
    return func
Example #6
def get_conj_function(dtype, pitch=True):
    type_src = dtype_to_ctype(dtype)
    if dtype == np.complex128:
        operation = "pycuda::conj"
    elif dtype == np.complex64:
        operation = "pycuda::conj"
    else:
        raise TypeError("Only complex arrays are allowed "
                        "to perform conjugation")
    name = "conj"

    if pitch:
        func = SourceModule(pitch_template % {
            "name": name,
            "dest_type": type_src,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_template % {
            "name": name,
            "dest_type": type_src,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  #[np.intp, np.intp, np.int32])
    return func
Example #7
def get_complex_from_amp_function(in_type, result_type, pitch = True):
    type_in = dtype_to_ctype(in_type)
    type_result = dtype_to_ctype(result_type)
    
    name = "makecomplex_amp_phase"
    
    if pitch:
        func = SourceModule(
                pitch_complex_amp_template % {
                    "name": name,
                    "in_type": type_in,
                    "result_type": type_result,
                    "fletter": 'f' if in_type == np.float32 else ''
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_complex_amp_template % {
                    "name": name,
                    "in_type": type_in,
                    "result_type": type_result,
                    "fletter": 'f' if in_type == np.float32 else ''
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32])
    return func
Example #8
def integrate(stepsize=0.01, stores=5, steps=10000, number_of_particles=2 ** 10):
    # create_particles() and advance_kernel are assumed to be defined elsewhere
    gpu_r, gpu_v, gpu_mass = create_particles(number_of_particles)
    number_of_particles = np.int32(number_of_particles)
    gpu_rs, gpu_vs = [gpu_r], [gpu_v]

    # ring buffer of position/velocity arrays to rotate through
    for i in xrange(stores - 1):
        gpu_rs.append(gpuarray.empty_like(gpu_r))
        gpu_vs.append(gpuarray.empty_like(gpu_v))

    advance = SourceModule(advance_kernel).get_function("advance")
    advance.prepare([np.intp, np.intp, np.intp, np.intp, np.intp, np.int32])

    # block and grid dimensions must be at least 1 in every axis
    block_size = (32, 1, 1)
    grid_size = (int(number_of_particles) // 32, 1)

    # prepared_call takes the grid first, then the block; device pointers are
    # passed via .gpudata
    advance.prepared_call(grid_size, block_size,
                          gpu_rs[0].gpudata, gpu_vs[0].gpudata, gpu_mass.gpudata,
                          gpu_rs[1].gpudata, gpu_vs[1].gpudata,
                          number_of_particles)

    old, new = 1, 2
    for i in xrange(steps):
        r = gpu_rs[old].get_async()
        v = gpu_vs[old].get_async()
        advance.prepared_async_call(grid_size, block_size, None,
                                    gpu_rs[old].gpudata, gpu_vs[old].gpudata,
                                    gpu_mass.gpudata,
                                    gpu_rs[new].gpudata, gpu_vs[new].gpudata,
                                    number_of_particles)

        np.savetxt("step{0:09.3f}_r.dat".format(i * stepsize), r)
        np.savetxt("step{0:09.3f}_v.dat".format(i * stepsize), v)

        old, new = new, (new + 1) % stores
Example #9
def get_conj_function(dtype, pitch = True):
    type_src = dtype_to_ctype(dtype)
    if dtype == np.complex128:
        operation = "pycuda::conj"
    elif dtype == np.complex64:
        operation = "pycuda::conj"
    else:
        raise TypeError("Only complex arrays are allowed "
                        "to perform conjugation")
    name = "conj"
    
    if pitch:
        func = SourceModule(
                pitch_template % {
                    "name": name,
                    "dest_type": type_src,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_template % {
                    "name": name,
                    "dest_type": type_src,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')#[np.intp, np.intp, np.int32])
    return func
Example #10
def get_divarray_function(left_dtype, right_dtype, rslt_dtype, pitch = True):
    type_left = dtype_to_ctype(left_dtype)
    type_right = dtype_to_ctype(right_dtype)
    type_rslt = dtype_to_ctype(rslt_dtype)

    name = "divarray"
    operation = "/"
    
    if pitch:
        func = SourceModule(
                pitch_array_op_template % {
                    "name": name,
                    "dest_type": type_rslt,
                    "left_type": type_left,
                    "right_type": type_right,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPiPi')#[np.int32, np.int32, np.intp, np.int32,
        #             np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_array_op_template % {
                    "name": name,
                    "dest_type": type_rslt,
                    "left_type": type_left,
                    "right_type": type_right,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32])
    return func
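
A hedged usage sketch for the non-pitched kernel; the argument order (result, left operand, right operand, element count) is only an assumption read off the commented dtype list:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

a = gpuarray.to_gpu(np.arange(1, 513, dtype=np.float64))
b = gpuarray.to_gpu(np.full(512, 4.0))
out = gpuarray.empty_like(a)

div = get_divarray_function(np.float64, np.float64, np.float64, pitch=False)
block = (256, 1, 1)
grid = ((a.size + block[0] - 1) // block[0], 1)
div.prepared_call(grid, block, out.gpudata, a.gpudata, b.gpudata,
                  np.int32(a.size))  # presumably out = a / b element-wise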
Example #11
def get_angle_function(dtypein, dtypeout, pitch = True):
    type_src = dtype_to_ctype(dtypein)
    type_dest = dtype_to_ctype(dtypeout)
    name = "angle_function"
    if dtypeout == np.float32:
        fletter = "f"
    else:
        fletter = ""
    
    if pitch:
        func = SourceModule(
                pitch_angle_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "fletter": fletter,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_angle_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "fletter": fletter,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')#[np.intp, np.intp, np.int32])
    return func
Example #12
def get_complex_from_amp_function(in_type, result_type, pitch=True):
    type_in = dtype_to_ctype(in_type)
    type_result = dtype_to_ctype(result_type)

    name = "makecomplex_amp_phase"

    if pitch:
        func = SourceModule(pitch_complex_amp_template % {
            "name": name,
            "in_type": type_in,
            "result_type": type_result,
            "fletter": 'f' if in_type == np.float32 else ''
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')  #[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_complex_amp_template % {
            "name": name,
            "in_type": type_in,
            "result_type": type_result,
            "fletter": 'f' if in_type == np.float32 else ''
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')  #[np.intp, np.intp, np.intp, np.int32])
    return func
Example #13
def get_scalardiv_function(src_type, dest_type, pitch = True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    
    name = "scalardiv"
    operation = "/"
    
    if pitch:
        func = SourceModule(
                pitch_right_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi'+np.dtype(dest_type).char)#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(
                non_pitch_right_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PP'+np.dtype(dest_type).char+'i')#[np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
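
The only difference from the purely pointer-based kernels is that np.dtype(dest_type).char is appended to the prepare string, so the scalar is passed by value at launch time. A hedged usage sketch for the non-pitched variant, with the argument order (dest, src, scalar, count) taken from the commented dtype list and the operation assumed to be dest = src / scalar:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.arange(1024, dtype=np.float32))
y = gpuarray.empty_like(x)

div = get_scalardiv_function(np.float32, np.float32, pitch=False)  # 'PPfi'
block = (256, 1, 1)
grid = ((x.size + block[0] - 1) // block[0], 1)
div.prepared_call(grid, block, y.gpudata, x.gpudata,
                  np.float32(2.0), np.int32(x.size))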
Example #14
def get_scalardiv_function(src_type, dest_type, pitch=True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)

    name = "scalardiv"
    operation = "/"

    if pitch:
        func = SourceModule(pitch_right_scalar_op_template % {
            "name": name,
            "src_type": type_src,
            "dest_type": type_dest,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare(
            'iiPiPi' +
            np.dtype(dest_type).char)  #[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(non_pitch_right_scalar_op_template % {
            "name": name,
            "src_type": type_src,
            "dest_type": type_dest,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PP' + np.dtype(dest_type).char +
                     'i')  #[np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
Example #15
def get_powscalar_function(src_type, dest_type, pitch = True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    name = "powscalar"
    operation = "pow"
    
    if pitch:
        func = SourceModule(
                pitch_left_scalar_func_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                    "fletter": 'f' if src_type == np.float32 else '',
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi'+np.dtype(dest_type).char)#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(
                non_pitch_left_scalar_func_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                    "fletter": 'f' if src_type == np.float32 else '',
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PP'+np.dtype(dest_type).char+'i')#[np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
Example #16
def get_divarray_function(left_dtype, right_dtype, rslt_dtype, pitch=True):
    type_left = dtype_to_ctype(left_dtype)
    type_right = dtype_to_ctype(right_dtype)
    type_rslt = dtype_to_ctype(rslt_dtype)

    name = "divarray"
    operation = "/"

    if pitch:
        func = SourceModule(pitch_array_op_template % {
            "name": name,
            "dest_type": type_rslt,
            "left_type": type_left,
            "right_type": type_right,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPiPi')  #[np.int32, np.int32, np.intp, np.int32,
        #             np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_array_op_template % {
            "name": name,
            "dest_type": type_rslt,
            "left_type": type_left,
            "right_type": type_right,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')  #[np.intp, np.intp, np.intp, np.int32])
    return func
Example #17
def get_astype_function(dtype_dest, dtype_src, pitch=True):
    type_dest = dtype_to_ctype(dtype_dest)
    type_src = dtype_to_ctype(dtype_src)
    name = "astype"
    operation = ""

    if pitch:
        func = SourceModule(pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  #[np.intp, np.intp, np.int32])
    return func
Example #18
def get_powscalar_function(src_type, dest_type, pitch=True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    name = "powscalar"
    operation = "pow"

    if pitch:
        func = SourceModule(pitch_left_scalar_func_template % {
            "name": name,
            "src_type": type_src,
            "dest_type": type_dest,
            "operation": operation,
            "fletter": 'f' if src_type == np.float32 else '',
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare(
            'iiPiPi' +
            np.dtype(dest_type).char)  #[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(non_pitch_left_scalar_func_template % {
            "name": name,
            "src_type": type_src,
            "dest_type": type_dest,
            "operation": operation,
            "fletter": 'f' if src_type == np.float32 else '',
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PP' + np.dtype(dest_type).char +
                     'i')  #[np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
Example #19
def get_complex_function(real_type, imag_type, result_type, pitch = True):
    type_real = dtype_to_ctype(real_type)
    type_imag = dtype_to_ctype(imag_type)
    type_result = dtype_to_ctype(result_type)
    
    name = "makecomplex"
    
    if pitch:
        func = SourceModule(
                pitch_complex_template % {
                    "name": name,
                    "real_type": type_real,
                    "imag_type": type_imag,
                    "result_type": type_result
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_complex_template % {
                    "name": name,
                    "real_type": type_real,
                    "imag_type": type_imag,
                    "result_type": type_result
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32])
    return func
Example #20
def get_resize_function(dtype):
    type_src = dtype_to_ctype(dtype)
    name = "resize"
    func = SourceModule(reshape_template % {
        "name": name,
        "dest_type": type_src,
        "src_type": type_src,
        "operation": "",
    },
                        options=["--ptxas-options=-v"]).get_function(name)
    func.prepare('iiiiPiPi')  #[np.int32, np.int32, np.int32, np.int32,
    #              np.intp, np.int32, np.intp, np.int32])
    return func
Example #21
def get_resize_function(dtype):
    type_src = dtype_to_ctype(dtype)
    name = "resize"
    func = SourceModule(
            reshape_template % {
                "name": name,
                "dest_type": type_src,
                "src_type": type_src,
                "operation": "",
            },
            options=["--ptxas-options=-v"]).get_function(name)
    func.prepare('iiiiPiPi')#[np.int32, np.int32, np.int32, np.int32,
    #              np.intp, np.int32, np.intp, np.int32])
    return func
Example #22
def get_realimag_function(dtype, real = True, pitch = True):
    type_src = dtype_to_ctype(dtype)
    
    if dtype == np.complex64:
        type_dest = "float"
        if real:
            operation = "pycuda::real"
            name = "real"
        else:
            operation = "pycuda::imag"
            name = "imag"
    elif dtype == np.complex128:
        type_dest = "double"
        if real:
            operation = "pycuda::real"
            name = "real"
        else:
            operation = "pycuda::imag"
            name = "imag"
    else:
        raise TypeError("only support complex inputs are "
                        "numpy.complex64 or numpy.complex128")
    
    if pitch:
        func = SourceModule(
                pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')#[np.intp, np.intp, np.int32])
    return func
Example #23
def get_realimag_function(dtype, real=True, pitch=True):
    type_src = dtype_to_ctype(dtype)

    if dtype == np.complex64:
        type_dest = "float"
        if real:
            operation = "pycuda::real"
            name = "real"
        else:
            operation = "pycuda::imag"
            name = "imag"
    elif dtype == np.complex128:
        type_dest = "double"
        if real:
            operation = "pycuda::real"
            name = "real"
        else:
            operation = "pycuda::imag"
            name = "imag"
    else:
        raise TypeError("only support complex inputs are "
                        "numpy.complex64 or numpy.complex128")

    if pitch:
        func = SourceModule(pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  #[np.intp, np.intp, np.int32])
    return func
Example #24
def get_transpose_function(dtype, conj=False):
    src_type = dtype_to_ctype(dtype)
    name = "trans"
    operation = ""

    if conj:
        if dtype == np.complex128:
            operation = "pycuda::conj"
        elif dtype == np.complex64:
            operation = "pycuda::conj"

    func = SourceModule(transpose_template % {
        "name": name,
        "type": src_type,
        "operation": operation
    },
                        options=["--ptxas-options=-v"]).get_function(name)
    func.prepare('iiPiPi')  #[np.int32, np.int32, np.intp,
    #              np.int32, np.intp, np.int32])
    return func
Example #25
def get_abs_function(dtype, pitch = True):
    type_src = dtype_to_ctype(dtype)
    if dtype == np.complex128:
        operation = "pycuda::abs"
        type_dest = "double"
    elif dtype == np.complex64:
        operation = "pycuda::abs"
        type_dest = "float"
    elif dtype == np.float64:
        operation = "fabs"
        type_dest = "double"
    elif dtype == np.float32:
        operation = "fabsf"
        type_dest = "float"
    else:
        operation = "abs"
        type_dest = dtype_to_ctype(dtype)
    name = "abs_function"
    
    if pitch:
        func = SourceModule(
                pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')#[np.intp, np.intp, np.int32])
    return func
Example #26
def get_transpose_function(dtype, conj = False):
    src_type = dtype_to_ctype(dtype)
    name = "trans"
    operation = ""
    
    if conj:
        if dtype == np.complex128:
            operation = "pycuda::conj"
        elif dtype == np.complex64:
            operation = "pycuda::conj"
    
    func = SourceModule(
            transpose_template % {
                "name": name,
                "type": src_type,
                "operation": operation
            },
            options=["--ptxas-options=-v"]).get_function(name)
    func.prepare('iiPiPi')#[np.int32, np.int32, np.intp,
    #              np.int32, np.intp, np.int32])
    return func
Example #27
def get_fill_function(dtype, pitch = True):
    type_dst = dtype_to_ctype(dtype)
    name = "fill"
    
    if pitch:
        func = SourceModule(
            fill_pitch_template % {
                    "name": name,
                    "type_dst": type_dst
            }, options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPi'+np.dtype(dtype).char)
        #    [np.int32, np.int32, np.intp, np.int32, _get_type(dtype)])
    else:
        func = SourceModule(
                fill_nonpitch_template % {
                    "name": name,
                    "type_dst": type_dst
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iP'+np.dtype(dtype).char)#[np.int32, np.intp, _get_type(dtype)])
    return func
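
A hedged usage sketch for the non-pitched fill kernel; the argument order (element count, destination pointer, fill value) follows the commented dtype list, and the prepare string becomes 'iPd' for float64:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

out = gpuarray.empty(1000, dtype=np.float64)
fill = get_fill_function(np.float64, pitch=False)
block = (256, 1, 1)
grid = ((out.size + block[0] - 1) // block[0], 1)
fill.prepared_call(grid, block, np.int32(out.size), out.gpudata,
                   np.float64(3.14))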
Example #28
def get_fill_function(dtype, pitch=True):
    type_dst = dtype_to_ctype(dtype)
    name = "fill"

    if pitch:
        func = SourceModule(fill_pitch_template % {
            "name": name,
            "type_dst": type_dst
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPi' + np.dtype(dtype).char)
        #    [np.int32, np.int32, np.intp, np.int32, _get_type(dtype)])
    else:
        func = SourceModule(fill_nonpitch_template % {
            "name": name,
            "type_dst": type_dst
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare(
            'iP' +
            np.dtype(dtype).char)  #[np.int32, np.intp, _get_type(dtype)])
    return func
Example #29
def get_abs_function(dtype, pitch=True):
    type_src = dtype_to_ctype(dtype)
    if dtype == np.complex128:
        operation = "pycuda::abs"
        type_dest = "double"
    elif dtype == np.complex64:
        operation = "pycuda::abs"
        type_dest = "float"
    elif dtype == np.float64:
        operation = "fabs"
        type_dest = "double"
    elif dtype == np.float32:
        operation = "fabsf"
        type_dest = "float"
    else:
        operation = "abs"
        type_dest = dtype_to_ctype(dtype)
    name = "abs_function"

    if pitch:
        func = SourceModule(pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  #[np.intp, np.intp, np.int32])
    return func
Example #30
    def setup(self, bottom, top):
        assert len(bottom) == 3
        assert len(top) == 1
        # parameter
        param = eval(self.param_str_)
        self.lambda_ = param['lambda']
        self.clip_gradient_ = param.get('clip_gradient', None)
        # Create CUDA function
        with pu.caffe_cuda_context():
            self.k_masked_diff_ = ElementwiseKernel(
                "float *diff, float *pred, float *label, float *mask",
                "diff[i] = (pred[i] - label[i]) * mask[i]", 'masked_diff')
            self.k_squared_ = ElementwiseKernel(
                "float *diff, float *diff2",
                "diff2[i] = diff[i] * diff[i]", 'squared')
            self.k_ensure_mask_sum_ = ElementwiseKernel(
                "float *mask_sum",
                "mask_sum[i] = max(mask_sum[i], 1.0f)", 'ensure_mask_sum')
            if self.clip_gradient_ is not None:
                self.k_clip_gradient = ElementwiseKernel(
                    "float *diff",
                    "diff[i] = fmaxf(-{0}, fminf(diff[i], {0}))".format(
                        self.clip_gradient_),
                    'clip_gradient')
            # This could be computed faster with cublasSdot
            self.k_sum_ = ReductionKernel(
                dtype, neutral="0",
                reduce_expr="a+b", map_expr="d[i]",
                arguments="float *d")
            self.k_squred_sum_ = ReductionKernel(
                dtype, neutral="0",
                reduce_expr="a+b", map_expr="d[i] * d[i]",
                arguments="float *d")
            self.k_div_sum_ = ReductionKernel(
                dtype, neutral="0",
                reduce_expr="a+b",
                map_expr="d[i] / m[i]",
                arguments="float *d, float *m")
            self.k_div_squared_sum_ = ReductionKernel(
                dtype, neutral="0",
                reduce_expr="a+b",
                map_expr="d[i] * d[i] / (m[i] * m[i])",
                arguments="float *d, float *m")
            func_backward = SourceModule(
                """
#include <caffe/util/device_alternate.hpp>
__global__ void backward(float *pred, float *label, float *mask,
  float *diff_sum, float *mask_sum, int count, int stride, int sgn,
  int batch_size, float lambda, float loss_weight, float *diff) {
  CUDA_KERNEL_LOOP(i, count) {
    diff[i] = loss_weight * mask[i] * 2.0f * sgn / mask_sum[i / stride]
         / batch_size * ((pred[i] - label[i])
            - lambda / mask_sum[i / stride] * diff_sum[i / stride]);
  }
}
""", include_dirs=pu.caffe_include_dirs).get_function("backward")
            func_backward.prepare("PPPPPiiiiffP")

            def _func_backward(pred, label, mask, ds, ms, sgn, loss_weight,
                               diff):
                bg = pu.block_and_grid(pred.size)
                batch_size = pred.shape[0]
                count = pred.size
                stride = pred.size / pred.shape[0]
                func_backward.prepared_call(
                    bg['grid'], bg['block'],
                    pred.gpudata, label.gpudata, mask.gpudata, ds.gpudata,
                    ms.gpudata, count, stride, sgn, batch_size,
                    self.lambda_, loss_weight,
                    diff.gpudata)
            self.k_backward_ = _func_backward
        self.batch_size_ = 0
        self.dim_ = 0
        self.reshape(bottom, top)
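
The helper pu.block_and_grid is not shown in this example. A minimal hypothetical stand-in with the same call shape (a dict with 'block' and 'grid' keys sized to cover count threads) might look like the following; it is an assumption for readers who do not have the pu module, not the actual implementation:

def block_and_grid(count, threads_per_block=512):
    """Hypothetical stand-in for pu.block_and_grid used above."""
    block = (threads_per_block, 1, 1)
    grid = ((count + threads_per_block - 1) // threads_per_block, 1)
    return {'block': block, 'grid': grid}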
Example #31
class SLFNSkCUDA(SLFN):
    """Single Layer Feed-forward Network (SLFN) implementation on GPU with pyCUDA.

    To choose a specific GPU, use the environment variable ``CUDA_DEVICE``, for example
    ``CUDA_DEVICE=0 python myscript1.py & CUDA_DEVICE=1 python myscript2.py``.

    In single precision, only the upper triangular part of the HH matrix is computed to speed up the method.
    """

    def __init__(self, inputs, outputs, norm=None, precision=np.float64):
        super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

        # startup GPU
        #self.ctx = misc.init_context(misc.init_device(nDevice))  # NO NO NO, crashes and does not release memory
        # use CUDA_DEVICE=0 python my-script.py
        try:
            linalg.init()
        except OSError as e:
            pass  # the 'cusolver' library is missing; it is proprietary and not needed here
            # print "error initializing scikit-cuda: %s" % e
            # print "ignore if toolbox works"

        # precision-dependent stuff
        if precision is np.float64:
            self.posv = lapack.dposv
        else:
            self.posv = lapack.sposv
            self.handle = cublas.cublasCreate()

        # prepare GPU function kernels
        kernel = """
            __global__ void dev_sigm(%s *a) {
                unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
                a[idx] = 1.0 / ( exp(a[idx]) + 1 );
            }
            """
        kernel = kernel % "double" if self.precision is np.float64 else kernel % "float"
        self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
        self.dev_sigm.prepare("P")

        # GPU transformation functions
        self.func["lin"] = self._dev_lin
        self.func["sigm"] = self._dev_sigm
        self.func["tanh"] = self._dev_tanh
        self.func["rbf_l1"] = self._dev_rbfl1
        self.func["rbf_l2"] = self._dev_rbfl2
        self.func["rbf_linf"] = self._dev_rbflinf

    def _dev_lin(self, devX, devW, devB):
        """Linear function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        return devH

    def _dev_sigm(self, devX, devW, devB):
        """Compute Sigmoid on GPU for a given array and return array."""

#        def sigm(a):
#            block = a._block
#            grid = (int(np.ceil(1.0 * np.prod(a.shape) / block[0])), 1)
#            dev_sigm.prepared_call(grid, block, a.gpudata)
#            return a

        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        block = devH._block
        grid = (int(np.ceil(1.0 * np.prod(devH.shape) / block[0])), 1)
        self.dev_sigm.prepared_call(grid, block, devH.gpudata)
        return devH

    def _dev_tanh(self, devX, devW, devB):
        """Hyperbolic tangent function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        cumath.tanh(devH, out=devH)
        return devH

    def _dev_rbfl1(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_L1
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "cityblock")**2 / B))
        return devH

    def _dev_rbfl2(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_L2
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "euclidean")**2 / B))
        return devH

    def _dev_rbflinf(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_Linf
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "chebyshev")**2 / B))
        return devH

    def add_neurons(self, number, func, W, B):
        """Add prepared neurons to the SLFN, merge with existing ones.

        Adds a number of specific neurons to SLFN network. Weights and biases
        must be provided for that function.

        If neurons of such type already exist, they are merged together.

        Args:
            number (int): the number of new neurons to add
            func (str): transformation function of hidden layer. Linear function creates a linear model.
            W (matrix): a 2-D matrix of neuron weights, size (`inputs` * `number`)
            B (vector): a 1-D vector of neuron biases, size (`number` * 1)
        """
        ntypes = [nr[1] for nr in self.neurons]  # existing types of neurons
        if func in ntypes:
            # add to an existing neuron type
            i = ntypes.index(func)
            nn0, _, devW, devB = self.neurons[i]
            number = nn0 + number
            devW = gpuarray.to_gpu(np.hstack((devW.get(), W)))
            devB = gpuarray.to_gpu(np.hstack((devB.get(), B)))
            self.neurons[i] = (number, func, devW, devB)
        else:
            # create a new neuron type
            devW = gpuarray.to_gpu(W)
            devB = gpuarray.to_gpu(B)
            self.neurons.append((number, func, devW, devB))
        self.reset()
        self.B = None

    def reset(self):
        """ Resets intermediate training results, releases memory that they use.

        Keeps solution of ELM, so a trained ELM remains operational.
        Can be called to free memory after an ELM is trained.
        """
        self.L = sum([n[0] for n in self.neurons])  # get number of neurons
        self.HH = None
        self.HT = None

    def _project(self, X, dev=False):
        """Projects X to H, an auxiliary function that implements a particular projection.

        For actual projection, use `ELM.project()` instead.

        Args:
            X (matrix): an input data matrix, size (N * `inputs`)
            dev (bool, optional): whether to leave the result in GPU memory

        Returns:
            H (matrix): an SLFN hidden layer representation, size (N * `L`) where 'L' is number of neurons
        """
        assert self.neurons is not None, "ELM has no neurons"
        X = np.array(X, order="C", dtype=self.precision)
        devX = gpuarray.to_gpu(X)
        devH = gpuarray.empty((X.shape[0], self.L), dtype=self.precision)
        i = 0
        for nn, ftype, devW, devB in self.neurons:
            devH[:, i:i+nn] = self.func[ftype](devX, devW, devB)
            i += nn

        H = devH if dev else devH.get()
        return H

    def _predict(self, X, dev=False):
        """Predict a batch of data. Auxiliary function that implements a particular prediction.

        For prediction, use `ELM.predict()` instead.

        Args:
            X (matrix): input data size (N * `inputs`)
            dev (bool, optional): whether to leave the result in GPU memory

        Returns:
            Y (matrix): predicted outputs size (N * `outputs`), always in float/double format.
        """
        assert self.B is not None, "Solve the task before predicting"
        devH = self._project(X, dev=True)
        devY = linalg.dot(devH, self.B)
        Y = devY if dev else devY.get()
        return Y

    def add_batch(self, X, T, wc=None):
        """Add a batch of training data to an iterative solution, weighted if neeed.

        The batch is processed as a whole, the training data is splitted in `ELM.add_data()` method.
        With parameters HH_out, HT_out, the output will be put into these matrices instead of model.

        Args:
            X (matrix): input data matrix size (N * `inputs`)
            T (matrix): output data matrix size (N * `outputs`)
            wc (vector): vector of weights for data samples, one weight per sample, size (N * 1)
            HH_out, HT_out (matrix, optional): output matrices to add batch result into, always given together
        """
        devH = self._project(X, dev=True)
        T = np.array(T, order="C", dtype=self.precision)
        devT = gpuarray.to_gpu(T)
        if wc is not None:  # apply weights if given
            w = np.array(wc**0.5, dtype=self.precision)[:, None]  # re-shape to column matrix
            devWC = gpuarray.to_gpu(w)
            misc.mult_matvec(devH, devWC, axis=0, out=devH)
            misc.mult_matvec(devT, devWC, axis=0, out=devT)

        if self.HH is None:  # initialize space for self.HH, self.HT
            self.HT = misc.zeros((self.L, self.outputs), dtype=self.precision)
            self.HH = linalg.eye(self.L, dtype=self.precision)
            self.HH *= self.norm

        linalg.add_dot(devH, devT, self.HT, transa='T')
        if self.precision is np.float64:
            linalg.add_dot(devH, devH, self.HH, transa='T')
        else:
            cublas.cublasSsyrk(self.handle, 'L', 'N', self.L, X.shape[0], 1, devH.ptr, self.L, 1, self.HH.ptr, self.L)
#        self.ctx.synchronize()  # GPU runs asynchronously without that

    def solve(self):
        """Compute output weights B, with fix for unstable solution.
        """
        HH = self.HH.get()
        HT = self.HT.get()
        B = self.solve_corr(HH, HT)
        self.B = gpuarray.to_gpu(B)

    def solve_corr(self, HH, HT):
        """Compute output weights B for given HH and HT.

        Simple but inefficient version, see a better one in solver_python.

        Args:
            HH (matrix): covariance matrix of the hidden layer representation H, size (`L` * `L`)
            HT (matrix): correlation matrix between H and outputs T, size (`L` * `outputs`)
        """
        _, B, info = self.posv(HH, HT)
        if info > 0:
            print "ELM covariance matrix is not full rank; solving with SVD (slow)"
            print "This happened because you have duplicated or too many neurons"
            HH = np.triu(HH) + np.triu(HH, k=1).T
            B = np.linalg.lstsq(HH, HT)[0]
        B = np.array(B, order='C', dtype=self.precision)
        return B

    def _prune(self, idx):
        """Leave only neurons with the given indexes.
        """
        idx = list(idx)
        neurons = []
        for k, func, devW, devB in self.neurons:
            ix1 = [i for i in idx if i < k]  # index for current neuron type
            idx = [i-k for i in idx if i >= k]
            number = len(ix1)
            W = devW.get()
            W = np.array(W[:, ix1], order='C')
            devW = gpuarray.to_gpu(W)
            B = devB.get()
            B = np.array(B[ix1], order='C')
            devB = gpuarray.to_gpu(B)
            neurons.append((number, func, devW, devB))
        self.neurons = neurons
        # reset invalid parameters
        self.reset()
        self.B = None

    def get_B(self):
        """Return B as a numpy array.
        """
        if self.B is None:
            B = None
        else:
            B = self.B.get()
        return B

    def set_B(self, B):
        """Set B as a numpy array.

        Args:
            B (matrix): output layer weights matrix, size (`L` * `outputs`)
        """
        assert B.shape[0] == self.L, "Incorrect first dimension: %d expected, %d found" % (self.L, B.shape[0])
        assert B.shape[1] == self.outputs, "Incorrect output dimension: %d expected, %d found" % (self.outputs, B.shape[1])
        self.B = gpuarray.to_gpu(B.astype(self.precision))

    def get_corr(self):
        """Return current correlation matrices.
        """
        if self.HH is None:
            HH = None
            HT = None
        else:
            HH = self.HH.get()
            HT = self.HT.get()
            HH = np.triu(HH) + np.triu(HH, k=1).T
        return HH, HT

    def set_corr(self, HH, HT):
        """Set pre-computed correlation matrices.

        Args:
            HH (matrix): covariance matrix of the hidden layer representation H, size (`L` * `L`)
            HT (matrix): correlation matrix between H and outputs T, size (`L` * `outputs`)
        """
        assert self.neurons is not None, "Add or load neurons before using ELM"
        assert HH.shape[0] == HH.shape[1], "HH must be a square matrix"
        msg = "Wrong HH dimension: (%d, %d) expected, %s found" % (self.L, self.L, HH.shape)
        assert HH.shape[0] == self.L, msg
        assert HH.shape[0] == HT.shape[0], "HH and HT must have the same number of rows (%d)" % self.L
        assert HT.shape[1] == self.outputs, "Number of columns in HT must equal number of outputs (%d)" % self.outputs
        self.HH = gpuarray.to_gpu(HH.astype(self.precision))
        self.HT = gpuarray.to_gpu(HT.astype(self.precision))

    def get_neurons(self):
        """Return current neurons.

        Returns:
            neurons (list of tuples (number/int, func/string, W/matrix, B/vector)): current neurons in the model
        """
        neurons = []
        for number, func, devW, devB in self.neurons:
            neurons.append((number, func, devW.get(), devB.get()))
        return neurons
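
A rough end-to-end usage sketch, assuming the scikit-cuda / PyCUDA dependencies are installed, a CUDA context is available, and the SLFN base class provides self.func, self.neurons and self.norm; the data sizes and neuron count are illustrative only:

import numpy as np

X = np.random.randn(1000, 10)        # 1000 samples, 10 inputs
T = np.random.randn(1000, 1)         # 1 output

nn = SLFNSkCUDA(inputs=10, outputs=1, norm=1e-6, precision=np.float64)
W = np.random.randn(10, 64)          # (inputs x number) neuron weights
B = np.random.rand(64)               # (number,) neuron biases
nn.add_neurons(64, "sigm", W, B)

nn.add_batch(X, T)                   # accumulate HH and HT on the GPU
nn.solve()                           # compute output weights
Y = nn._predict(X)                   # predictions as a numpy array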
Example #32
    def setup(self, bottom, top):
        assert len(bottom) == 3
        assert len(top) == 1
        # parameter
        param = eval(self.param_str_)
        self.lambda_ = param['lambda']
        self.clip_gradient_ = param.get('clip_gradient', None)
        # Create CUDA function
        with pu.caffe_cuda_context():
            self.k_masked_diff_ = ElementwiseKernel(
                "float *diff, float *pred, float *label, float *mask",
                "diff[i] = (pred[i] - label[i]) * mask[i]", 'masked_diff')
            self.k_squared_ = ElementwiseKernel(
                "float *diff, float *diff2", "diff2[i] = diff[i] * diff[i]",
                'squared')
            self.k_ensure_mask_sum_ = ElementwiseKernel(
                "float *mask_sum", "mask_sum[i] = max(mask_sum[i], 1.0f)",
                'ensure_mask_sum')
            if self.clip_gradient_ is not None:
                self.k_clip_gradient = ElementwiseKernel(
                    "float *diff",
                    "diff[i] = fmaxf(-{0}, fminf(diff[i], {0}))".format(
                        self.clip_gradient_), 'clip_gradient')
            # This could be computed faster with cublasSdot
            self.k_sum_ = ReductionKernel(dtype,
                                          neutral="0",
                                          reduce_expr="a+b",
                                          map_expr="d[i]",
                                          arguments="float *d")
            self.k_squred_sum_ = ReductionKernel(dtype,
                                                 neutral="0",
                                                 reduce_expr="a+b",
                                                 map_expr="d[i] * d[i]",
                                                 arguments="float *d")
            self.k_div_sum_ = ReductionKernel(dtype,
                                              neutral="0",
                                              reduce_expr="a+b",
                                              map_expr="d[i] / m[i]",
                                              arguments="float *d, float *m")
            self.k_div_squared_sum_ = ReductionKernel(
                dtype,
                neutral="0",
                reduce_expr="a+b",
                map_expr="d[i] * d[i] / (m[i] * m[i])",
                arguments="float *d, float *m")
            func_backward = SourceModule(
                """
#include <caffe/util/device_alternate.hpp>
__global__ void backward(float *pred, float *label, float *mask,
  float *diff_sum, float *mask_sum, int count, int stride, int sgn,
  int batch_size, float lambda, float loss_weight, float *diff) {
  CUDA_KERNEL_LOOP(i, count) {
    diff[i] = loss_weight * mask[i] * 2.0f * sgn / mask_sum[i / stride]
         / batch_size * ((pred[i] - label[i])
            - lambda / mask_sum[i / stride] * diff_sum[i / stride]);
  }
}
""",
                include_dirs=pu.caffe_include_dirs).get_function("backward")
            func_backward.prepare("PPPPPiiiiffP")

            def _func_backward(pred, label, mask, ds, ms, sgn, loss_weight,
                               diff):
                bg = pu.block_and_grid(pred.size)
                batch_size = pred.shape[0]
                count = pred.size
                stride = pred.size / pred.shape[0]
                func_backward.prepared_call(bg['grid'], bg['block'],
                                            pred.gpudata, label.gpudata,
                                            mask.gpudata, ds.gpudata,
                                            ms.gpudata, count, stride, sgn,
                                            batch_size, self.lambda_,
                                            loss_weight, diff.gpudata)

            self.k_backward_ = _func_backward
        self.batch_size_ = 0
        self.dim_ = 0
        self.reshape(bottom, top)
Example #33
class SLFNSkCUDA(SLFN):
    """Single Layer Feed-forward Network (SLFN) implementation on GPU with pyCUDA.

    To choose a specific GPU, use the environment variable ``CUDA_DEVICE``, for example
    ``CUDA_DEVICE=0 python myscript1.py & CUDA_DEVICE=1 python myscript2.py``.

    In single precision, only the upper triangular part of the HH matrix is computed to speed up the method.
    """
    def __init__(self, inputs, outputs, norm=None, precision=np.float64):
        super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

        # startup GPU
        #self.ctx = misc.init_context(misc.init_device(nDevice))  # NO NO NO, crashes and does not release memory
        # use CUDA_DEVICE=0 python my-script.py
        try:
            linalg.init()
        except OSError as e:
            pass  # the 'cusolver' library is missing; it is proprietary and not needed here
            # print "error initializing scikit-cuda: %s" % e
            # print "ignore if toolbox works"

        # precision-dependent stuff
        if precision is np.float64:
            self.posv = lapack.dposv
        else:
            self.posv = lapack.sposv
            self.handle = cublas.cublasCreate()

        # prepare GPU function kernels
        kernel = """
            __global__ void dev_sigm(%s *a) {
                unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
                a[idx] = 1.0 / ( exp(a[idx]) + 1 );
            }
            """
        kernel = kernel % "double" if self.precision is np.float64 else kernel % "float"
        self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
        self.dev_sigm.prepare("P")

        # GPU transformation functions
        self.func["lin"] = self._dev_lin
        self.func["sigm"] = self._dev_sigm
        self.func["tanh"] = self._dev_tanh
        self.func["rbf_l1"] = self._dev_rbfl1
        self.func["rbf_l2"] = self._dev_rbfl2
        self.func["rbf_linf"] = self._dev_rbflinf

    def _dev_lin(self, devX, devW, devB):
        """Linear function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        return devH

    def _dev_sigm(self, devX, devW, devB):
        """Compute Sigmoid on GPU for a given array and return array."""

        #        def sigm(a):
        #            block = a._block
        #            grid = (int(np.ceil(1.0 * np.prod(a.shape) / block[0])), 1)
        #            dev_sigm.prepared_call(grid, block, a.gpudata)
        #            return a

        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        block = devH._block
        grid = (int(np.ceil(1.0 * np.prod(devH.shape) / block[0])), 1)
        self.dev_sigm.prepared_call(grid, block, devH.gpudata)
        return devH

    def _dev_tanh(self, devX, devW, devB):
        """Hyperbolic tangent function on GPU.

        Returns:
            devH (gpuarray): GPU matrix with the result.
        """
        devH = misc.add_matvec(linalg.dot(devX, devW), devB, axis=1)
        cumath.tanh(devH, out=devH)
        return devH

    def _dev_rbfl1(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_L1
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "cityblock")**2 / B))
        return devH

    def _dev_rbfl2(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_L2
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "euclidean")**2 / B))
        return devH

    def _dev_rbflinf(self, devX, devW, devB):
        # TODO: make proper GPU implementation of RBF_Linf
        X = devX.get()
        W = devW.get()
        B = devB.get()
        devH = gpuarray.to_gpu(np.exp(-cdist(X, W.T, "chebyshev")**2 / B))
        return devH

    def add_neurons(self, number, func, W, B):
        """Add prepared neurons to the SLFN, merge with existing ones.

        Adds a number of specific neurons to SLFN network. Weights and biases
        must be provided for that function.

        If neurons of such type already exist, they are merged together.

        Args:
            number (int): the number of new neurons to add
            func (str): transformation function of hidden layer. Linear function creates a linear model.
            W (matrix): a 2-D matrix of neuron weights, size (`inputs` * `number`)
            B (vector): a 1-D vector of neuron biases, size (`number` * 1)
        """
        ntypes = [nr[1] for nr in self.neurons]  # existing types of neurons
        if func in ntypes:
            # add to an existing neuron type
            i = ntypes.index(func)
            nn0, _, devW, devB = self.neurons[i]
            number = nn0 + number
            devW = gpuarray.to_gpu(np.hstack((devW.get(), W)))
            devB = gpuarray.to_gpu(np.hstack((devB.get(), B)))
            self.neurons[i] = (number, func, devW, devB)
        else:
            # create a new neuron type
            devW = gpuarray.to_gpu(W)
            devB = gpuarray.to_gpu(B)
            self.neurons.append((number, func, devW, devB))
        self.reset()
        self.B = None

    def reset(self):
        """ Resets intermediate training results, releases memory that they use.

        Keeps solution of ELM, so a trained ELM remains operational.
        Can be called to free memory after an ELM is trained.
        """
        self.L = sum([n[0] for n in self.neurons])  # get number of neurons
        self.HH = None
        self.HT = None

    def _project(self, X, dev=False):
        """Projects X to H, an auxiliary function that implements a particular projection.

        For actual projection, use `ELM.project()` instead.

        Args:
            X (matrix): an input data matrix, size (N * `inputs`)
            dev (bool, optional): whether leave result in the GPU memory

        Returns:
            H (matrix): an SLFN hidden layer representation, size (N * `L`) where 'L' is number of neurons
        """
        assert self.neurons is not None, "ELM has no neurons"
        X = np.array(X, order="C", dtype=self.precision)
        devX = gpuarray.to_gpu(X)
        devH = gpuarray.empty((X.shape[0], self.L), dtype=self.precision)
        i = 0
        for nn, ftype, devW, devB in self.neurons:
            devH[:, i:i + nn] = self.func[ftype](devX, devW, devB)
            i += nn

        H = devH if dev else devH.get()
        return H

    def _predict(self, X, dev=False):
        """Predict a batch of data. Auxiliary function that implements a particular prediction.

        For prediction, use `ELM.predict()` instead.

        Args:
            X (matrix): input data size (N * `inputs`)
            dev (bool, optional): whether leave result in the GPU memory

        Returns:
            Y (matrix): predicted outputs size (N * `outputs`), always in float/double format.
        """
        assert self.B is not None, "Solve the task before predicting"
        devH = self._project(X, dev=True)
        devY = linalg.dot(devH, self.B)
        Y = devY if dev else devY.get()
        return Y
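
    # Prediction is just H.dot(B) on the GPU: _project() builds the hidden
    # layer representation H and linalg.dot multiplies it by the stored output
    # weights B. An equivalent CPU sketch, assuming NumPy arrays:
    #
    #     Y = H.dot(B)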

    def add_batch(self, X, T, wc=None):
        """Add a batch of training data to an iterative solution, weighted if neeed.

        The batch is processed as a whole, the training data is splitted in `ELM.add_data()` method.
        With parameters HH_out, HT_out, the output will be put into these matrices instead of model.

        Args:
            X (matrix): input data matrix size (N * `inputs`)
            T (matrix): output data matrix size (N * `outputs`)
            wc (vector): vector of weights for data samples, one weight per sample, size (N * 1)
            HH_out, HT_out (matrix, optional): output matrices to add batch result into, always given together
        """
        devH = self._project(X, dev=True)
        T = np.array(T, order="C", dtype=self.precision)
        devT = gpuarray.to_gpu(T)
        if wc is not None:  # apply weights if given
            w = np.array(wc**0.5, dtype=self.precision)[:, None]  # reshape to a column matrix
            devWC = gpuarray.to_gpu(w)
            misc.mult_matvec(devH, devWC, axis=0, out=devH)
            misc.mult_matvec(devT, devWC, axis=0, out=devT)

        if self.HH is None:  # initialize space for self.HH, self.HT
            self.HT = misc.zeros((self.L, self.outputs), dtype=self.precision)
            self.HH = linalg.eye(self.L, dtype=self.precision)
            self.HH *= self.norm

        linalg.add_dot(devH, devT, self.HT, transa='T')
        if self.precision is np.float64:
            linalg.add_dot(devH, devH, self.HH, transa='T')
        else:
            cublas.cublasSsyrk(self.handle, 'L', 'N', self.L, X.shape[0], 1,
                               devH.ptr, self.L, 1, self.HH.ptr, self.L)
#        self.ctx.synchronize()  # GPU runs asynchronously without this call
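
    # Each batch updates the running correlation matrices in place:
    #
    #     HH += H.T.dot(H)   # (L x L), via add_dot or cublasSsyrk
    #     HT += H.T.dot(T)   # (L x outputs), via add_dot with transa='T'
    #
    # HH starts as norm * I, so the later solve step yields a ridge-regularised
    # least-squares solution. In single precision the rank-k update
    # (cublasSsyrk) fills only one triangle of HH, which is why get_corr() and
    # the SVD fallback in solve_corr() symmetrise HH from its upper triangle.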

    def solve(self):
        """Compute output weights B, with fix for unstable solution.
        """
        HH = self.HH.get()
        HT = self.HT.get()
        B = self.solve_corr(HH, HT)
        self.B = gpuarray.to_gpu(B)

    def solve_corr(self, HH, HT):
        """Compute output weights B for given HH and HT.

        Simple but inefficient version, see a better one in solver_python.

        Args:
            HH (matrix): covariance matrix of hidden layer representation H, size (`L` * `L`)
            HT (matrix): correlation matrix between H and outputs T, size (`L` * `outputs`)
        """
        _, B, info = self.posv(HH, HT)
        if info > 0:
            print("ELM covariance matrix is not full rank; solving with SVD (slow)")
            print("This happened because you have duplicated or too many neurons")
            HH = np.triu(HH) + np.triu(HH, k=1).T
            B = np.linalg.lstsq(HH, HT)[0]
        B = np.array(B, order='C', dtype=self.precision)
        return B
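
    # The regular path solves the symmetric positive-definite system
    # HH.dot(B) = HT with self.posv, assumed here to wrap LAPACK's ?posv
    # (Cholesky) solver. When HH is not full rank (info > 0), HH is symmetrised
    # from its upper triangle and the slower numpy.linalg.lstsq, which handles
    # singular systems, is used instead.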

    def _prune(self, idx):
        """Leave only neurons with the given indexes.
        """
        idx = list(idx)
        neurons = []
        for k, func, devW, devB in self.neurons:
            ix1 = [i for i in idx if i < k]  # index for current neuron type
            idx = [i - k for i in idx if i >= k]
            number = len(ix1)
            W = devW.get()
            W = np.array(W[:, ix1], order='C')
            devW = gpuarray.to_gpu(W)
            B = devB.get()
            B = np.array(B[ix1], order='C')
            devB = gpuarray.to_gpu(B)
            neurons.append((number, func, devW, devB))
        self.neurons = neurons
        # reset invalid parameters
        self.reset()
        self.B = None
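
    # A hedged sketch (the index set is illustrative): _prune(range(5)) keeps
    # only the first 5 hidden neurons overall. Indexes are consumed per neuron
    # type in storage order, and the W columns / B entries outside the kept set
    # are dropped before the pruned arrays are re-uploaded to the GPU.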

    def get_B(self):
        """Return B as a numpy array.
        """
        if self.B is None:
            B = None
        else:
            B = self.B.get()
        return B

    def set_B(self, B):
        """Set B as a numpy array.

        Args:
            B (matrix): output layer weights matrix, size (`L` * `outputs`)
        """
        assert B.shape[0] == self.L, \
            "Incorrect first dimension: %d expected, %d found" % (self.L, B.shape[0])
        assert B.shape[1] == self.outputs, \
            "Incorrect output dimension: %d expected, %d found" % (self.outputs, B.shape[1])
        self.B = gpuarray.to_gpu(B.astype(self.precision))

    def get_corr(self):
        """Return current correlation matrices.
        """
        if self.HH is None:
            HH = None
            HT = None
        else:
            HH = self.HH.get()
            HT = self.HT.get()
            HH = np.triu(HH) + np.triu(HH, k=1).T
        return HH, HT

    def set_corr(self, HH, HT):
        """Set pre-computed correlation matrices.

        Args:
            HH (matrix): covariance matrix of hidden layer representation H, size (`L` * `L`)
            HT (matrix): correlation matrix between H and outputs T, size (`L` * `outputs`)
        """
        assert self.neurons is not None, "Add or load neurons before using ELM"
        assert HH.shape[0] == HH.shape[1], "HH must be a square matrix"
        msg = "Wrong HH dimension: (%d, %d) expected, %s found" % (
            self.L, self.L, HH.shape)
        assert HH.shape[0] == self.L, msg
        assert HH.shape[0] == HT.shape[
            0], "HH and HT must have the same number of rows (%d)" % self.L
        assert HT.shape[
            1] == self.outputs, "Number of columns in HT must equal number of outputs (%d)" % self.outputs
        self.HH = gpuarray.to_gpu(HH.astype(self.precision))
        self.HT = gpuarray.to_gpu(HT.astype(self.precision))
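
    # get_corr()/set_corr() make the iterative solution restartable. A hedged
    # checkpointing sketch (the file name is illustrative):
    #
    #     HH, HT = slfn.get_corr()
    #     np.savez("elm_state.npz", HH=HH, HT=HT)
    #     ...
    #     state = np.load("elm_state.npz")
    #     slfn.set_corr(state["HH"], state["HT"])
    #     slfn.solve()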

    def get_neurons(self):
        """Return current neurons.

        Returns:
            neurons (list of tuples (number/int, func/string, W/matrix, B/vector)): current neurons in the model
        """
        neurons = []
        for number, func, devW, devB in self.neurons:
            neurons.append((number, func, devW.get(), devB.get()))
        return neurons