Exemplos de memcpy_peer em Python, exemplos de pycuda.driver.memcpy_peer em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: proc_load.py Projeto: mesnilgr/theano_alexnet

def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_randproc = not config['use_data_layer']
    flag_batch = config['batch_crop_mirror']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print 'shared_x information received', shape, dtype
    shape = (3, 255, 255, 256) # TODO remove fix

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading

    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()

        # print hkl_name
        #data = pickle.load(open(hkl_name)) - img_mean
        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time
        if flag_randproc:
            param_rand = recv_queue.get()

            data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)
        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr,
                        gpu_data.ptr,
                        gpu_data.dtype.itemsize *
                        gpu_data.size,
                        ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')

Exemplo n.º 2

0

Exibir arquivo

def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_randproc = not config['use_data_layer']
    flag_batch = config['batch_crop_mirror']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print 'shared_x information received'

    gpu_data_remote = gpuarray.GPUArray(shape,
                                        dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading

    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()

        # print hkl_name
        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        if flag_randproc:
            param_rand = recv_queue.get()

            data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr, gpu_data.ptr,
                        gpu_data.dtype.itemsize * gpu_data.size, ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')

Exemplo n.º 3

0

Exibir arquivo

Arquivo: proc_load.py Projeto: lanlianhuaer/Recurrent-Pose-Attention

def fun_load(config, sock_data_2=5001):
    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    num_timesteps = config['num_timesteps']
    num_seq = config['num_seq']
    img_scale_x = config['img_scale_x']
    img_scale_y = config['img_scale_y']
    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx_2 = dev.make_context()

    sock_2 = zmq.Context().socket(zmq.PAIR)
    sock_2.bind('tcp://*:{0}'.format(sock_data_2))
    shape_temporal, dtype_temporal, h_temporal = sock_2.recv_pyobj()
    print 'shared_x information received', shape_temporal
    gpu_data_remote_temporal = gpuarray.GPUArray(
        shape_temporal,
        dtype_temporal,
        gpudata=drv.IPCMemoryHandle(h_temporal))
    gpu_data_temporal = gpuarray.GPUArray(shape_temporal, dtype_temporal)
    # print 'img_mean received'
    # The first time, do the set ups and other stuff
    # receive information for loading
    while True:
        video_name_temporal = recv_queue.get()
        rand_param = recv_queue.get()
        if config['modal'] == 'rgb':
            data_temporal = prepare_data_rgb(video_name_temporal,
                                             num_timesteps,
                                             num_seq,
                                             rand_param,
                                             data_shape=(img_scale_x,
                                                         img_scale_y, 3))
        else:
            data_temporal = prepare_data_flow(video_name_temporal,
                                              num_timesteps,
                                              num_seq,
                                              rand_param,
                                              data_shape=(img_scale_x,
                                                          img_scale_y))

        gpu_data_temporal.set(data_temporal)
        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'
        drv.memcpy_peer(
            gpu_data_remote_temporal.ptr, gpu_data_temporal.ptr,
            gpu_data_temporal.dtype.itemsize * gpu_data_temporal.size, ctx_2,
            ctx_2)

        ctx_2.synchronize()
        send_queue.put('copy_finished')

Exemplo n.º 4

0

Exibir arquivo

Arquivo: proc_load.py Projeto: tvijay333/theano_alexnet

def fun_load(config, sock_data=5000):

    send_queue = config["queue_l2t"]
    recv_queue = config["queue_t2l"]
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_batch = config["batch_crop_mirror"]

    drv.init()
    dev = drv.Device(int(config["gpu"][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind("tcp://*:{0}".format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print "shared_x information received"

    gpu_data_remote = gpuarray.GPUArray(shape, dtype, gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print "img_mean received"

    # The first time, do the set ups and other stuff

    # receive information for loading

    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()

        # print hkl_name
        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        param_rand = recv_queue.get()

        data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == "calc_finished"

        drv.memcpy_peer(gpu_data_remote.ptr, gpu_data.ptr, gpu_data.dtype.itemsize * gpu_data.size, ctx, ctx)

        ctx.synchronize()

        send_queue.put("copy_finished")

Exemplo n.º 5

0

Exibir arquivo

Arquivo: cuda.py Projeto: skallumadi/chainer

def copy(array, out=None, out_device=None):
    """Copies a GPUArray object using the default stream.

    The destination may live on a different device than the source.

    Args:
        array (~pycuda.gpuarray.GPUArray): Source array.
        out (~pycuda.gpuarray.GPUArray): Destination array. When given,
            ``out_device`` is ignored and the device of ``out`` is used.
        out_device: Destination device specifier, resolved through
            :func:`get_device`. Only consulted when ``out`` is ``None``;
            ``None`` means "same device as ``array``".

    Returns:
        ~pycuda.gpuarray.GPUArray: The destination array. It is ``out`` when
        that was supplied, otherwise a new array created with
        ``empty_like(array)`` on the destination device.

    """
    src_device = get_device(array)

    if out is not None:
        dst_device = get_device(out)
    else:
        if out_device is None:
            dst_device = src_device
        else:
            dst_device = get_device(out_device)
        # Allocate the result while the destination device is active.
        with using_device(dst_device):
            out = empty_like(array)

    # The transfer itself is issued with the source device active.
    with using_device(src_device):
        same_device = src_device == dst_device
        if same_device:
            drv.memcpy_dtod(out.ptr, array.ptr, out.nbytes)
        else:
            drv.memcpy_peer(out.ptr, array.ptr, out.nbytes,
                            dst_device, src_device)

    return out

Exemplo n.º 6

0

Exibir arquivo

Arquivo: cuda.py Projeto: kuwa32/chainer

def copy(array, out=None, out_device=None):
    """Copies a GPUArray object using the default stream.

    Source and destination are allowed to be on different devices.

    Args:
        array (~pycuda.gpuarray.GPUArray): Array to read from.
        out (~pycuda.gpuarray.GPUArray): Array to write into. If it is not
            ``None``, the ``out_device`` argument is ignored.
        out_device: Specifier of the destination device; it is passed to
            :func:`get_device` to obtain the actual device object.

    Returns:
        ~pycuda.gpuarray.GPUArray: Copied array.

        Without ``out``, the result is allocated on the device given by
        ``out_device``, or on the device of ``array`` when ``out_device``
        is ``None`` as well.

    """
    origin = get_device(array)

    if out is None:
        # Fall back to the source device when no destination is requested.
        target = origin if out_device is None else get_device(out_device)
        with using_device(target):
            out = empty_like(array)
    else:
        target = get_device(out)

    size_in_bytes_args = (out.ptr, array.ptr)
    with using_device(origin):
        if origin == target:
            # Plain device-to-device copy within one device.
            drv.memcpy_dtod(size_in_bytes_args[0], size_in_bytes_args[1],
                            out.nbytes)
        else:
            # Cross-device copy: destination first, then source.
            drv.memcpy_peer(size_in_bytes_args[0], size_in_bytes_args[1],
                            out.nbytes, target, origin)

    return out

Exemplo n.º 7

0

Exibir arquivo

def train_net(config, private_config):

    # UNPACK CONFIGS
    (train_videos_spatial_jhmdb,val_videos_spatial_jhmdb,train_videos_temporal_jhmdb,val_videos_temporal_jhmdb,
     train_targets,val_targets,
           train_labels_jhmdb,val_labels_jhmdb) = unpack_configs_jhmdb(config,gpu_id=private_config['gpu_id'])
    # print('val_len',len(val_videos_spatial_jhmdb),'train_len',len(train_videos_spatial_jhmdb))
    if config['modal']=='rgb':
        train_videos = list(train_videos_spatial_jhmdb)
        test_videos = list(val_videos_spatial_jhmdb)
    else:
        train_videos = list(train_videos_temporal_jhmdb)
        test_videos = list(val_videos_temporal_jhmdb)
    print('jhmdb_len',len(train_videos),len(train_labels_jhmdb))#,len(tr_video_length_jhmdb))
    flag_para_load =config['para_load']
    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter
                        print 'training cost:', cost_ij,'cost_nll:',cost_nll,'cost_attention:',cost_att

                    if config['print_train_error']:
                        error_ij = train_error()

                        gpu_send_queue.put(error_ij)
                        that_error = gpu_recv_queue.get()
                        error_ij = (error_ij + that_error) / 2.

                        if private_config['flag_verbose']:
                            print 'training error rate:', error_ij

                if flag_para_load and (count < len(minibatch_range)):
                    load_send_queue.put('calc_finished')

                if count%20 == 0:
                    e = time.time()
                    print "time per 20 iter:", (e - s)
            # ############### Test on Validation Set ##################
            DropoutLayer.SetDropoutOff()
            this_val_error, this_val_loss = get_test_error(config,
                 shared_x, shared_mask, shared_y,shared_target,shared_use_noise,
                 shared_conv,test_videos,  val_labels_jhmdb,
                flag_para_load,
                batch_size,num_seq, validate_model_lstm,train_model,
                send_queue=load_send_queue, recv_queue=load_recv_queue)

            # report validation stats
            gpu_send_queue.put(this_val_error)
            that_val_error = gpu_recv_queue.get()
            this_val_error = (this_val_error + that_val_error) / 2.

            gpu_send_queue.put(this_val_loss)
            that_val_loss = gpu_recv_queue.get()
            this_val_loss = (this_val_loss + that_val_loss) / 2.

            if private_config['flag_verbose']:
                print('epoch %i: test loss of jhmdb %f ' %
                      (epoch, this_val_loss))
                print('epoch %i: test error of jhmdb %f %%' %
                      (epoch, this_val_error * 100.))
            val_record.append([this_val_error, this_val_loss])
            if private_config['flag_save']:
                np.save(config['weights_dir'] + 'test_record_jhmdb.npy', val_record)

            DropoutLayer.SetDropoutOn()
            ###########################################
            # Adapt Learning Rate
            step_idx = adjust_learning_rate(config, epoch, step_idx,
                                            val_record, learning_rate)
            # Save Weights, only one of them will do
            if private_config['flag_save'] :
                if epoch % config['snapshot_freq'] == 0:
                    save_weights(layers, config['weights_dir'], epoch)
                    np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                            learning_rate.get_value())
                    save_momentums(vels, config['weights_dir'], epoch)
        print('Optimization complete.')

Exemplo n.º 8

0

Exibir arquivo

Arquivo: dual_mlp.py Projeto: ZhangAustin/theano_multi_gpu

def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    Train an MLP in one of two cooperating processes, exchanging and
    averaging parameters with the peer process through GPU peer copies.

    shared_args
    contains neural network parameters; keys read here are
    'learning_rate', 'n_epochs', 'dataset', 'batch_size', 'L1_reg',
    'L2_reg' and 'n_hidden'.

    private_args
    contains parameters for process run on each gpu; keys read here are
    'ind_gpu' (device index for pycuda), 'flag_client' (connect instead
    of bind the zmq socket), 'gpu' (passed to theano.sandbox.cuda.use),
    'mod' (parity of the minibatch indices this process trains on) and
    'verbose'.

    this_queue and that_queue are used for synchronization between processes.
    Every barrier is a `this_queue.put('')` followed by `that_queue.get()`.

    Returns nothing; results are only printed when 'verbose' is set.
    '''

    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)

    # One process connects and the other binds, both on port 5000.
    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    # NOTE(review): these imports are deliberately placed after the pycuda
    # context is created; presumably the order matters for which device
    # theano ends up using -- confirm before moving them to module level.
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    from logistic_sgd import load_data
    from mlp import MLP

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    ####


    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    # The test split is unpacked but not used anywhere in this function.
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    # These counts are later passed to xrange(), so `/` is expected to be
    # Python 2 integer division here.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    # Fixed seed: both processes build the classifier from the same RNG state.
    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng, input=x, n_in=28 * 28,
                     n_hidden=n_hidden, n_out=10)

    # Negative log-likelihood plus L1 and squared-L2 regularisation terms.
    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)

    # Returns classifier.errors(y) for the validation minibatch `index`.
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]}
    )

    gparams = [T.grad(cost, param) for param in classifier.params]

    # Plain gradient-descent step for every parameter.
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # Returns the cost for training minibatch `index` and applies `updates`.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]})
    ####
    # setting pycuda and
    # pass handles, only done once

    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu

    param_other_list = []
    # a list of theano shared variables that are used to store values of theano shared variable from the other gpu

    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in param_other_list

    h_list = []
    # a list of pycuda IPC handles

    shape_list = []
    # a list containing shapes of variables in param_ga_list

    dtype_list = []
    # a list containing dtypes of variables in param_ga_list

    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        # Local shared variable, initialised from this process's own value,
        # that will receive the peer's copy of the parameter.
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        # IPC handle for this process's parameter memory; sent to the peer
        # below together with shape and dtype.
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        # No-input function that replaces param with the mean of param and
        # param_other.
        average_fun = \
            theano.function([], updates=[(param,
                                          (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    # Both processes send first and then receive on the PAIR socket.
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []

    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list,
                                                 dtype_other_list,
                                                 h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other,
                              gpudata=drv.IPCMemoryHandle(h_other))

        param_ga_remote_list.append(param_ga_remote)
    ####


    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # Barrier: start timing only once the peer has reached this point too.
    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0

    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            # Each process trains only on minibatches whose index parity
            # equals its 'mod' value.
            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

                # Barrier before reading the peer's parameters.
                this_queue.put('')
                that_queue.get()

                # exchanging weights
                # Copy each remote parameter into the local "other" array;
                # param_ga itself is not used inside this loop.
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list,
                            param_ga_remote_list):

                    drv.memcpy_peer(param_ga_other.ptr,
                                    param_ga_remote.ptr,
                                    param_ga_remote.dtype.itemsize *
                                    param_ga_remote.size,
                                    ctx, ctx)

                ctx.synchronize()
                # Barrier after the copies and before averaging, which
                # overwrites the parameters the peer copies from.
                this_queue.put('')
                that_queue.get()

                for average_fun in average_fun_list:
                    average_fun()



        if private_args['verbose']:
            validation_losses = [validate_model(i) for i
                                 in xrange(n_valid_batches)]
            this_validation_loss = np.mean(validation_losses)

            # minibatch_index still holds the last index of the for loop
            # above; it is undefined if n_train_batches is 0.
            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   this_validation_loss * 100.))

    end_time = time.time()

    # Final barrier so that neither process proceeds before the other is done.
    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.1fs' % ((end_time - start_time)))

Exemplo n.º 9

0

Exibir arquivo

Arquivo: proc_load.py Projeto: myt00seven/svrg

def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_batch = config['batch_crop_mirror']

    cropsize = config['cropsize']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print 'shared_x information received'

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading

    while True:
        # print '... ... in the loop of generating data'
        # getting the hkl file name to load
        hkl_name = recv_queue.get()

        # print hkl_name
        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        param_rand = recv_queue.get()
        # print '... ... get the original hkl size:',data.size

        data = crop_and_mirror(data, param_rand, flag_batch=flag_batch, cropsize=cropsize)
        # print '... ... get the cropped size:',data.size
    
        # raw_input("Press Enter to continue...")
        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr,
                        gpu_data.ptr,
                        gpu_data.dtype.itemsize *
                        gpu_data.size,
                        ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')

Exemplo n.º 10

0

Exibir arquivo

def train_net(config, private_config):

    # UNPACK CONFIGS
    (flag_para_load, flag_datalayer, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(config, ext_data=private_config['ext_data'],
                       ext_label=private_config['ext_label'])

    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter
                    print 'training cost:', cost_ij

                if config['print_train_error']:
                    error_ij = train_error()

                    gpu_send_queue.put(error_ij)
                    that_error = gpu_recv_queue.get()
                    error_ij = (error_ij + that_error) / 2.

                    if private_config['flag_verbose']:
                        print 'training error rate:', error_ij

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_val_error_loss(
            rand_arr,
            shared_x,
            shared_y,
            val_filenames,
            val_labels,
            flag_datalayer,
            flag_para_load,
            batch_size,
            validate_model,
            send_queue=load_send_queue,
            recv_queue=load_recv_queue)

        # report validation stats
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: validation loss %f ' % (epoch, this_val_loss))
            print('epoch %i: validation error %f %%' %
                  (epoch, this_val_error * 100.))
        val_record.append([this_val_error, this_val_loss])

        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'val_record.npy', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx, val_record,
                                        learning_rate)

        # Save Weights, only one of them will do
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                        learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')

Exemplo n.º 11

0

Exibir arquivo

def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    Train an MLP in one of two cooperating processes, exchanging and
    averaging parameters with the peer process through GPU peer copies.

    shared_args
    contains neural network parameters; keys read here are
    'learning_rate', 'n_epochs', 'dataset', 'batch_size', 'L1_reg',
    'L2_reg' and 'n_hidden'.

    private_args
    contains parameters for process run on each gpu; keys read here are
    'ind_gpu' (device index for pycuda), 'flag_client' (connect instead
    of bind the zmq socket), 'gpu' (passed to theano.sandbox.cuda.use),
    'mod' (parity of the minibatch indices this process trains on) and
    'verbose'.

    this_queue and that_queue are used for synchronization between processes.
    Every barrier is a `this_queue.put('')` followed by `that_queue.get()`.

    Returns nothing; results are only printed when 'verbose' is set.
    '''

    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)

    # One process connects and the other binds, both on port 5000.
    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    # NOTE(review): these imports are deliberately placed after the pycuda
    # context is created; presumably the order matters for which device
    # theano ends up using -- confirm before moving them to module level.
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    from logistic_sgd import load_data
    from mlp import MLP

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    ####

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    # The test split is unpacked but not used anywhere in this function.
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    # These counts are later passed to xrange(), so `/` is expected to be
    # Python 2 integer division here.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    # Fixed seed: both processes build the classifier from the same RNG state.
    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng,
                     input=x,
                     n_in=28 * 28,
                     n_hidden=n_hidden,
                     n_out=10)

    # Negative log-likelihood plus L1 and squared-L2 regularisation terms.
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)

    # Returns classifier.errors(y) for the validation minibatch `index`.
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    gparams = [T.grad(cost, param) for param in classifier.params]

    # Plain gradient-descent step for every parameter.
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # Returns the cost for training minibatch `index` and applies `updates`.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    ####
    # setting pycuda and
    # pass handles, only done once

    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu

    param_other_list = []
    # a list of theano shared variables that are used to store values of theano shared variable from the other gpu

    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in param_other_list

    h_list = []
    # a list of pycuda IPC handles

    shape_list = []
    # a list containing shapes of variables in param_ga_list

    dtype_list = []
    # a list containing dtypes of variables in param_ga_list

    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        # Local shared variable, initialised from this process's own value,
        # that will receive the peer's copy of the parameter.
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        # IPC handle for this process's parameter memory; sent to the peer
        # below together with shape and dtype.
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        # No-input function that replaces param with the mean of param and
        # param_other.
        average_fun = \
            theano.function([], updates=[(param,
                                          (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    # Both processes send first and then receive on the PAIR socket.
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []

    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list,
                                                 dtype_other_list,
                                                 h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other,
                              gpudata=drv.IPCMemoryHandle(h_other))

        param_ga_remote_list.append(param_ga_remote)
    ####

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # Barrier: start timing only once the peer has reached this point too.
    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0

    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            # Each process trains only on minibatches whose index parity
            # equals its 'mod' value.
            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

                # Barrier before reading the peer's parameters.
                this_queue.put('')
                that_queue.get()

                # exchanging weights
                # Copy each remote parameter into the local "other" array;
                # param_ga itself is not used inside this loop.
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list,
                            param_ga_remote_list):

                    drv.memcpy_peer(
                        param_ga_other.ptr, param_ga_remote.ptr,
                        param_ga_remote.dtype.itemsize * param_ga_remote.size,
                        ctx, ctx)

                ctx.synchronize()
                # Barrier after the copies and before averaging, which
                # overwrites the parameters the peer copies from.
                this_queue.put('')
                that_queue.get()

                for average_fun in average_fun_list:
                    average_fun()

        if private_args['verbose']:
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_batches)
            ]
            this_validation_loss = np.mean(validation_losses)

            # minibatch_index still holds the last index of the for loop
            # above; it is undefined if n_train_batches is 0.
            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   this_validation_loss * 100.))

    end_time = time.time()

    # Final barrier so that neither process proceeds before the other is done.
    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.1fs' %
                              ((end_time - start_time)))

Exemplo n.º 12

0

Exibir arquivo

Arquivo: train_2gpu.py Projeto: myt00seven/svrg

def train_net(config, private_config):
    # Training routine for one GPU worker (the source file is named
    # train_2gpu.py). The visible code creates a CUDA context and a ZeroMQ
    # PAIR socket, logs training statistics, evaluates on the validation
    # set, averages its metrics with values received on gpu_recv_queue,
    # adjusts the learning rate and periodically snapshots weights.
    #
    # config         -- dict of shared settings; keys read in the visible
    #                   code: 'print_train_error', 'weights_dir',
    #                   'snapshot_freq' (it is also passed to helpers).
    # private_config -- dict of per-worker settings; keys read in the
    #                   visible code: 'ext_data', 'ext_label',
    #                   'queue_gpu_send', 'queue_gpu_recv', 'gpu',
    #                   'flag_client', 'flag_verbose', 'flag_save'.
    #
    # NOTE(review): this copy of the function is truncated in the middle
    # (see the sock_gpu.connect line below). The loop headers and the
    # definitions of many names used later (num_iter, cost_ij, count, s,
    # minibatch_range, epoch, layers, vels, learning_rate, ...) are not
    # visible here, so the function does not parse as shown.

    # UNPACK CONFIGS
    (flag_para_load, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(config, ext_data=private_config['ext_data'],
                       ext_label=private_config['ext_label'])


    # Queues used below to swap scalar metrics (errors, losses) with a
    # counterpart -- presumably the other GPU's worker; confirm with caller.
    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    # The device index is taken from the last character of the 'gpu' string,
    # so only a single trailing digit is considered.
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    # NOTE(review): the next statement is corrupted in this copy -- the
    # address string appears redacted and the text jumps straight into the
    # body of the training loop. Everything between socket set up and the
    # logging code below is missing.
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter
                    # Append the iteration number and the training cost to
                    # their log files, flushing after every write.
                    log_iter.write("%d\n" % num_iter)
                    log_iter.flush()
                    print 'training cost:', cost_ij
                    log_err_cost.write("%f\n" % cost_ij)
                    log_err_cost.flush()

                if config['print_train_error']:
                    error_ij = train_error()

                    # Send the local error, receive the counterpart's, and
                    # report the mean of the two.
                    gpu_send_queue.put(error_ij)
                    that_error = gpu_recv_queue.get()
                    error_ij = (error_ij + that_error) / 2.

                    if private_config['flag_verbose']:
                        print 'training error rate:', error_ij
                        log_err_rate.write("%f\n" % error_ij)
                        log_err_rate.flush()


            # Post 'calc_finished' on the loading queue unless this was the
            # last minibatch -- presumably it tells a parallel loader process
            # that the current batch has been consumed; confirm with loader.
            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

            # Every 20th count, print the time elapsed since `s`, which is
            # set in the part of the function missing from this copy.
            if count%20 == 0:
                e = time.time()
                print "time per 20 iter:", (e - s)
                
        ############### Test on Validation Set ##################

        # Dropout is switched off for validation and back on afterwards.
        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_val_error_loss(
            rand_arr, shared_x, shared_y,
            val_filenames, val_labels,
            flag_para_load, img_mean,
            batch_size, validate_model,
            send_queue=load_send_queue, recv_queue=load_recv_queue)

        # report validation stats
        # Each metric is exchanged over the gpu queues and replaced by the
        # mean of the local and the received value.
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: validation loss %f ' %
                  (epoch, this_val_loss))
            print('epoch %i: validation error %f %%' %
                  (epoch, this_val_error * 100.))
        # val_record collects one [error, loss] pair per validation pass.
        val_record.append([this_val_error, this_val_loss])

        # The whole record is rewritten to disk (binary and text) each time.
        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'val_record.npy', val_record)
            np.savetxt(config['weights_dir'] + 'val_record_txt.txt', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx,
                                        val_record, learning_rate)

        # Save Weights, only one of them will do
        # A snapshot (weights, current learning rate, momentums) is written
        # whenever epoch is a multiple of config['snapshot_freq'].
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                        learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')