# Example 1
    def __init__(self, send_nodes, recv_nodes,
                 scatter_send_nodes, scatter_recv_nodes,
                 gather_send_nodes, gather_recv_nodes,
                 allreduce_nodes, broadcast_send_nodes,
                 broadcast_recv_nodes, **kwargs):
        """Record the communication-op lookup tables and set up MLSL/MPI state.

        Each *_nodes argument maps an op id to its communication op object.
        """
        super(HetrLocals, self).__init__(**kwargs)

        # Per-collective op tables, stored under their parameter names.
        op_tables = {
            'send_nodes': send_nodes,
            'recv_nodes': recv_nodes,
            'scatter_send_nodes': scatter_send_nodes,
            'scatter_recv_nodes': scatter_recv_nodes,
            'gather_send_nodes': gather_send_nodes,
            'gather_recv_nodes': gather_recv_nodes,
            'allreduce_nodes': allreduce_nodes,
            'broadcast_send_nodes': broadcast_send_nodes,
            'broadcast_recv_nodes': broadcast_recv_nodes,
        }
        for attr_name, table in op_tables.items():
            setattr(self, attr_name, table)

        # MLSL-specific: per-instance session handle and process layout.
        self.mlsl_obj = mlsl.MLSL()
        self.mlsl_obj.init()
        self.process_count = self.mlsl_obj.get_process_count()
        self.process_idx = self.mlsl_obj.get_process_idx()
        # data parallelism: distribution is created lazily elsewhere.
        self.distribution = None

        # MPI-specific communicator for point-to-point send/recv.
        self.comm = MPI.COMM_WORLD
# Example 2
def align_ndarray(element_count, alignment, dtype):
    """Return a 1-D numpy array of ``element_count`` items of ``dtype`` whose
    data pointer is aligned to ``alignment`` bytes.

    Uses the MLSL allocator when the ``mlsl`` package is importable;
    otherwise over-allocates with numpy and returns an aligned view.

    Raises:
        ValueError: if MLSL is available but ``dtype`` is not float32/float64
            (the MLSL path only knows how to map those to ctypes).
    """
    try:
        import mlsl
        import ctypes
        mlsl_obj = mlsl.MLSL()
        if dtype.name == 'float32':
            c_type_name = 'c_float'
        elif dtype.name == 'float64':
            c_type_name = 'c_double'
        else:
            # Fail with a clear message instead of the original
            # getattr(ctypes, None) TypeError.
            raise ValueError(
                'align_ndarray: unsupported dtype {} for MLSL allocation '
                '(only float32/float64 are supported)'.format(dtype.name))
        type_size = ctypes.sizeof(getattr(ctypes, c_type_name)(1))
        mlsl_buf = mlsl_obj.alloc(element_count * type_size, alignment)
        array = ctypes.cast(
            mlsl_buf,
            ctypes.POINTER(getattr(ctypes, c_type_name) * element_count))
        np_array = np.frombuffer(array.contents, dtype)
        return np_array
    except ImportError:
        # Fallback: over-allocate by (alignment - 1) elements and slice off
        # an aligned prefix.  `padding` is counted in elements; the math
        # works out when dtype.itemsize divides alignment and numpy hands
        # back itemsize-aligned memory.  The slice keeps the base buffer
        # alive through the view's reference.
        x = np.empty(element_count + (alignment - 1), dtype)
        offset = (x.ctypes.data % alignment) // dtype.itemsize
        padding = 0 if offset == 0 else (alignment - offset)
        return x[padding:padding + element_count]
# Example 3
class HetrLocals(object):
    """Communication helpers for a heterogeneous transformer process.

    Holds lookup tables of communication ops (send/recv, scatter, gather,
    allreduce, broadcast) keyed by op id, and executes them using Intel MLSL
    for collectives and mpi4py (``MPI.COMM_WORLD``) for point-to-point
    send/recv.
    """

    # Class-level MLSL session, created at import time.
    # NOTE(review): __init__ also creates a per-instance mlsl.MLSL() that
    # shadows these class attributes; close() finalizes the instance handle
    # while close_mlsl() finalizes this class-level one — confirm which
    # handle is meant to own the session.
    mlsl_obj = mlsl.MLSL()
    mlsl_obj.init()
    process_count = mlsl_obj.get_process_count()
    process_idx = mlsl_obj.get_process_idx()

    def __init__(self, send_nodes, recv_nodes,
                 scatter_send_nodes, scatter_recv_nodes,
                 gather_send_nodes, gather_recv_nodes,
                 allreduce_nodes, broadcast_send_nodes,
                 broadcast_recv_nodes, **kwargs):
        """Store the op tables and initialize per-instance MLSL/MPI state.

        Each *_nodes argument maps an op id to the corresponding
        communication op object, indexed by the mlsl_* methods below.
        """
        super(HetrLocals, self).__init__(**kwargs)
        self.send_nodes = send_nodes
        self.recv_nodes = recv_nodes
        self.scatter_send_nodes = scatter_send_nodes
        self.scatter_recv_nodes = scatter_recv_nodes
        self.gather_send_nodes = gather_send_nodes
        self.gather_recv_nodes = gather_recv_nodes
        self.allreduce_nodes = allreduce_nodes
        self.broadcast_send_nodes = broadcast_send_nodes
        self.broadcast_recv_nodes = broadcast_recv_nodes

        # MLSL-specific
        self.mlsl_obj = mlsl.MLSL()
        self.mlsl_obj.init()
        self.process_count = self.mlsl_obj.get_process_count()
        self.process_idx = self.mlsl_obj.get_process_idx()
        # data parallelism
        self.distribution = None

        # MPI-specific
        self.comm = MPI.COMM_WORLD

    def create_distribution(self):
        """Lazily create the MLSL distribution (process_count x 1)."""
        if not self.distribution:
            self.distribution = self.mlsl_obj.create_distribution(self.process_count, 1)

    def close(self):
        """Tear down the per-instance distribution and MLSL session."""
        if self.distribution:
            self.mlsl_obj.delete_distribution(self.distribution)
        self.mlsl_obj.finalize()

    @staticmethod
    def close_mlsl():
        """Finalize the class-level MLSL session and close the library."""
        HetrLocals.mlsl_obj.finalize()
        mlsl.close()

    @staticmethod
    def mlsl_alloc(element_count, alignment, dtype):
        """Allocate an aligned numpy array backed by MLSL memory.

        Only float32/float64 dtypes are mapped to ctypes here.
        NOTE(review): any other dtype leaves c_type_name = None and the
        getattr below raises TypeError — confirm an explicit error is wanted.
        """
        if dtype.name == 'float32':
            c_type_name = 'c_float'
        elif dtype.name == 'float64':
            c_type_name = 'c_double'
        else:
            c_type_name = None
        type_size = ctypes.sizeof(getattr(ctypes, c_type_name)(1))
        mlsl_buf = HetrLocals.mlsl_obj.alloc(element_count * type_size, alignment)
        array = ctypes.cast(mlsl_buf, ctypes.POINTER(getattr(ctypes, c_type_name) * element_count))
        np_array = np.frombuffer(array.contents, dtype)
        return np_array

    @staticmethod
    def mlsl_free(array):
        """Free MLSL memory previously returned by mlsl_alloc."""
        HetrLocals.mlsl_obj.free(array.__array_interface__['data'][0])

    def as_buffer(self, array):
        """Expose a numpy array as a ctypes buffer for MLSL calls."""
        # array.shape is () for scalar
        if not array.shape:
            array = np.atleast_1d(array)
        return np.ctypeslib.as_ctypes(array)

    def mlsl_send(self, send_id, x_nparr):
        """Blocking MPI send of x_nparr to the op's peer process."""
        send_op = self.send_nodes[send_id]
        self.comm.Send(x_nparr, dest=send_op.metadata['peer_id'], tag=USER_TAG)

    def recv_from_mlsl_send(self, recv_id, out):
        """Blocking MPI receive into ``out`` from the op's peer process."""
        recv_op = self.recv_nodes[recv_id]
        self.comm.Recv(out, source=recv_op.metadata['peer_id'], tag=USER_TAG)
        return out

    def mlsl_gather_send(self, gather_send_id, x_nparr):
        """Contribute x_nparr to a gather (or reduce) rooted at root_idx.

        On the root process the data is merely stashed on the op (the root
        side of the collective runs in gather_recv_from_mlsl_gather_send).
        """
        gather_send_op = self.gather_send_nodes[gather_send_id]

        # todo: get real root_idx
        root_idx = 0
        # np.atleast_1d is used in cases when we need to reduce to a scalar value
        x_nparr = np.atleast_1d(x_nparr)
        if self.process_idx == root_idx:
            # todo: remove that workaround for non-symmetric case
            gather_send_op.arr = x_nparr
        else:
            send_buf = self.as_buffer(x_nparr)
            send_count = x_nparr.size
            recv_buf = None
            if gather_send_op.use_reduce:
                req = self.distribution.reduce(send_buf, send_buf, send_count,
                                               mlsl.DataType.FLOAT, mlsl.ReductionType.SUM,
                                               root_idx, mlsl.GroupType.DATA)
            else:
                req = self.distribution.gather(send_buf, send_count, recv_buf,
                                               mlsl.DataType.FLOAT, root_idx,
                                               mlsl.GroupType.DATA)
            self.mlsl_obj.wait(req)

    def gather_recv_from_mlsl_gather_send(self, gather_recv_id, out):
        """Root side of the gather/reduce: collect peers' data into ``out``.

        When use_reduce is set the SUM result is averaged over the process
        count afterwards (mean emulation).
        """
        gather_recv_op = self.gather_recv_nodes[gather_recv_id]

        # todo: get real root_idx
        root_idx = 0

        # todo: remove that workaround for non-symmetric case
        if self.process_idx == root_idx:
            send_node = gather_recv_op.send_nodes[0]
            send_buf = self.as_buffer(send_node.arr)
            send_count = send_node.arr.size
            recv_buf = self.as_buffer(out)

            if gather_recv_op.use_reduce:
                req = self.distribution.reduce(send_buf, recv_buf, send_count,
                                               mlsl.DataType.FLOAT, mlsl.ReductionType.SUM,
                                               root_idx, mlsl.GroupType.DATA)
            else:
                req = self.distribution.gather(send_buf, send_count, recv_buf,
                                               mlsl.DataType.FLOAT, root_idx,
                                               mlsl.GroupType.DATA)
            self.mlsl_obj.wait(req)

            # todo: replace by real reduce operation
            if gather_recv_op.use_reduce:
                out /= self.process_count

        return out

    def mlsl_scatter_send(self, scatter_send_id, x_nparr):
        """Root side of a scatter: stash the full array on the op.

        The actual MLSL scatter happens in
        scatter_recv_from_mlsl_scatter_send on every process.
        """
        scatter_send_op = self.scatter_send_nodes[scatter_send_id]

        # todo: get real root_idx
        root_idx = 0

        # todo: remove that workaround for non-symmetric case
        if self.process_idx == root_idx:
            scatter_send_op.arr = x_nparr

    def scatter_recv_from_mlsl_scatter_send(self, scatter_recv_id, out):
        """Receive this process's slice of a scatter into ``out``."""
        scatter_recv_op = self.scatter_recv_nodes[scatter_recv_id]

        # todo: get real root_idx
        root_idx = 0

        # todo: remove that workaround for non-symmetric case
        send_buf = None
        if self.process_idx == root_idx:
            send_node = scatter_recv_op.send_node()
            send_buf = self.as_buffer(send_node.arr)
        recv_buf = self.as_buffer(out)
        recv_count = out.size

        req = self.distribution.scatter(send_buf, recv_buf, recv_count,
                                        mlsl.DataType.FLOAT, root_idx,
                                        mlsl.GroupType.DATA)
        self.mlsl_obj.wait(req)
        return out

    def mlsl_allreduce_start(self, allreduce_id, out, x_nparr):
        """Kick off a non-blocking SUM allreduce of x_nparr into ``out``.

        The request handle is stored on the op; mlsl_allreduce_wait
        completes it.  'mean' is emulated there by dividing the SUM result.
        """
        allreduce_op = self.allreduce_nodes[allreduce_id]
        # NOTE(review): _req is initialized here but only the `req`
        # attribute is ever assigned and read (see mlsl_allreduce_wait);
        # _req looks like dead state — confirm before removing.
        if not hasattr(allreduce_op, '_req'):
            allreduce_op._req = [None]
        if allreduce_op.reduce_func == 'sum' or allreduce_op.reduce_func == 'mean':
            allreduce_op.arr = out
            send_buf = self.as_buffer(x_nparr)
            send_count = x_nparr.size
            recv_buf = self.as_buffer(out)
            allreduce_op.req = self.distribution.all_reduce(send_buf, recv_buf, send_count,
                                                            mlsl.DataType.FLOAT,
                                                            mlsl.ReductionType.SUM,
                                                            mlsl.GroupType.DATA)
        else:
            raise RuntimeError('Reduce function {} is not supported.'
                               .format(allreduce_op.reduce_func))

    def mlsl_allreduce_wait(self, allreduce_id):
        """Complete the allreduce started by mlsl_allreduce_start."""
        allreduce_op = self.allreduce_nodes[allreduce_id]
        # Find the matching start op among this op's control dependencies.
        start_node = next(op for op in allreduce_op.control_deps
                          if isinstance(op, CPUMlslAllReduceStartOp))
        self.mlsl_obj.wait(start_node.req)

        if allreduce_op.reduce_func == 'sum':
            # sum reduction is performed inside MLSL
            pass
        elif allreduce_op.reduce_func == 'mean':
            start_node.arr /= self.process_count
        else:
            raise RuntimeError('Reduce function {} is not supported.'
                               .format(allreduce_op.reduce_func))

    def mlsl_broadcast_send(self, broadcast_send_id, x_nparr):
        """Root side of a broadcast: stash the array on the op.

        The MLSL bcast itself runs in
        broadcast_recv_from_mlsl_broadcast_send on every process.
        """
        broadcast_send_op = self.broadcast_send_nodes[broadcast_send_id]

        # todo: get real root_idx
        root_idx = 0

        # todo: remove that workaround for non-symmetric case
        if self.process_idx == root_idx:
            broadcast_send_op.arr = x_nparr

    def broadcast_recv_from_mlsl_broadcast_send(self, broadcast_recv_id, out):
        """Run the broadcast; every process ends up with the root's data in ``out``."""
        broadcast_recv_op = self.broadcast_recv_nodes[broadcast_recv_id]

        # todo: get real root_idx
        root_idx = 0

        # todo: remove that workaround for non-symmetric case
        req = None
        if self.process_idx == root_idx:
            send_buf = None
            send_node = broadcast_recv_op.send_node()
            send_buf = self.as_buffer(send_node.arr)
            count = send_node.arr.size
            req = self.distribution.bcast(send_buf, count,
                                          mlsl.DataType.FLOAT, root_idx,
                                          mlsl.GroupType.DATA)
            out[...] = send_node.arr
        else:
            recv_buf = self.as_buffer(out)
            count = out.size
            req = self.distribution.bcast(recv_buf, count,
                                          mlsl.DataType.FLOAT, root_idx,
                                          mlsl.GroupType.DATA)
        self.mlsl_obj.wait(req)
        return out
# Example 4
def main(configuration, ps_device=None, devices=None):
    """Train an encoder-decoder NMT model, optionally multi-node via MLSL.

    Builds the trainer in one of three modes depending on ``devices`` and
    the 'prefer_to_model_parallel' flag: model-parallel, data-parallel, or
    single-device.  When configuration['mkl_multinode'] is true an MLSL
    session and distribution are created and torn down around training.

    NOTE(review): this function uses Python 2 syntax (print statements,
    unbuffered open(fn, 'w', 0)); it will not run under Python 3 as-is.
    """
    mkl_multinode = configuration['mkl_multinode']
    if mkl_multinode == True:
        mlsl_obj = mlsl.MLSL()
        mlsl_obj.init()
        node_idx = mlsl_obj.get_process_idx()
        node_num = mlsl_obj.get_process_count()
        print 'rank ', node_idx
        print 'nodes ', node_num
        dist = mlsl_obj.create_distribution(node_num, 1)
    else:
        mlsl_obj = None
        dist = None
    prefer_to_model_parallel = configuration['prefer_to_model_parallel']
    l1_reg_weight = configuration['l1_reg_weight']
    l2_reg_weight = configuration['l2_reg_weight']

    #  time_steps*nb_samples
    src = K.placeholder(shape=(None, None), dtype='int32')
    src_mask = K.placeholder(shape=(None, None))
    trg = K.placeholder(shape=(None, None), dtype='int32')
    trg_mask = K.placeholder(shape=(None, None))

    # for fast training of new parameters
    ite = K.placeholder(ndim=0)

    enc_dec = EncoderDecoder(**configuration)

    softmax_output_num_sampled = configuration['softmax_output_num_sampled']
    if devices:
        if prefer_to_model_parallel:
            enc_dec.build_trainer_with_model_parallel(
                src,
                src_mask,
                trg,
                trg_mask,
                ite,
                ps_device,
                devices,
                l1_reg_weight=l1_reg_weight,
                l2_reg_weight=l2_reg_weight)
        else:
            # clone the input
            src = [
                K.placeholder(shape=(None, None), dtype='int32')
                for _ in devices
            ]
            src_mask = [K.placeholder(shape=(None, None)) for _ in devices]
            trg = [
                K.placeholder(shape=(None, None), dtype='int32')
                for _ in devices
            ]
            trg_mask = [K.placeholder(shape=(None, None)) for _ in devices]

            enc_dec.build_trainer_with_data_parallel(
                src,
                src_mask,
                trg,
                trg_mask,
                ite,
                devices,
                l1_reg_weight=l1_reg_weight,
                l2_reg_weight=l2_reg_weight,
                softmax_output_num_sampled=softmax_output_num_sampled)
    else:
        enc_dec.build_trainer(
            src,
            src_mask,
            trg,
            trg_mask,
            ite,
            l1_reg_weight=l1_reg_weight,
            l2_reg_weight=l2_reg_weight,
            softmax_output_num_sampled=softmax_output_num_sampled,
            mlsl_obj=mlsl_obj,
            dist=dist)

    enc_dec.build_sampler()

    if configuration['reload']:
        enc_dec.load()
    '''
    # comment for fast training
    sample_search = BeamSearch(enc_dec=enc_dec,
                               configuration=configuration,
                               beam_size=1,
                               maxlen=configuration['seq_len_src'], stochastic=True)
    valid_search = BeamSearch(enc_dec=enc_dec,
                              configuration=configuration,
                              beam_size=configuration['beam_size'],
                              maxlen=3 * configuration['seq_len_src'], stochastic=False)

    sampler = Sampler(sample_search, **configuration)
    bleuvalidator = BleuValidator(valid_search, **configuration)
    '''

    # train function
    train_fn = enc_dec.train_fn

    # train data
    ds = DStream(**configuration)

    # valid data
    '''
    # comment for fast training
    vs = get_devtest_stream(data_type='valid', input_file=None, **configuration)
    '''

    # NOTE(review): `args` and `logger` are module-level names not visible
    # in this chunk — presumably set up by the script entry point.
    iters = args.start
    valid_bleu_best = -1
    epoch_best = -1
    iters_best = -1
    max_epochs = configuration['finish_after']
    logger.info("epochs %d" % (max_epochs))
    fn = 'nmt_mkl_log'
    # NOTE(review): `file` shadows the Python 2 builtin; open(fn, 'w', 0)
    # requests unbuffered text writes (Python 2 only).  Only rank 0 opens
    # the log in multinode mode.
    if mkl_multinode == True:
        if node_idx == 0:
            file = open(fn, 'w', 0)
            last_time = time.time()
            print('mkl multinode')
    else:
        file = open(fn, 'w', 0)
        last_time = time.time()
        print('mkl single node')
    for epoch in range(max_epochs):
        for x, x_mask, y, y_mask in ds.get_iterator():
            # NOTE(review): iter_count is reset to 0 on every mini-batch and
            # incremented once below, so it never exceeds 1 — looks vestigial.
            iter_count = 0
            #last_time = time.time()
            # for data parallel, we need to split the data into #num devices part
            if devices and not prefer_to_model_parallel:
                # ignore the case that the number of samples is less than the number of devices
                num_devices = len(devices)
                num_samples = len(x)

                if num_samples < num_devices:
                    logger.warn(
                        'epoch %d \t updates %d ignored current mini-batch, since its number of samples (%d) < the number of devices (%d)'
                        % (epoch, iters, num_samples, num_devices))
                    continue

                inputs = []
                for data in (x, x_mask, y, y_mask):
                    parts = split(data, num_devices)
                    parts = [item.T for item in parts]
                    inputs.extend(parts)
            else:
                inputs = [x.T, x_mask.T, y.T, y_mask.T]
            #print('train start')
            tc = train_fn(inputs)
            #print('train finish')

            iters += 1

            #cur_time = time.time()
            #duration = cur_time - last_time
            #num_of_words = np.prod(x.shape)
            #words_per_sec = int(num_of_words / duration)
            #logger.info('epoch %d \t updates %d train cost %.4f use time %.4f sec, %d words/sec, data x %s, data y %s'
            #            % (epoch, iters, tc[0], duration, words_per_sec, x.shape, y.shape))
            '''
            # Commented for fast training
            if iters % configuration['save_freq'] == 0:
                enc_dec.save()

            if iters % configuration['sample_freq'] == 0:
                sampler.apply(x, y)

            if iters < configuration['val_burn_in']:
                continue

            if (iters <= configuration['val_burn_in_fine'] and iters % configuration['valid_freq'] == 0) \
               or (iters > configuration['val_burn_in_fine'] and iters % configuration['valid_freq_fine'] == 0):
                valid_bleu = bleuvalidator.apply(vs, configuration['valid_out'])
                os.system('mkdir -p results/%d' % iters)
                os.system('mv %s* %s results/%d' % (configuration['valid_out'], configuration['saveto'], iters))
                logger.info('valid_test \t epoch %d \t updates %d valid_bleu %.4f'
                        % (epoch, iters, valid_bleu))
                if valid_bleu > valid_bleu_best:
                    valid_bleu_best = valid_bleu
                    epoch_best = epoch
                    iters_best = iters
                    enc_dec.save(path=configuration['saveto_best'])
            '''
            '''
            if mkl_multinode and node_idx == 0:
                file.write(str(tc[0])+'\n')
            else:
                file.write(str(tc[0])+'\n')
            '''
            iter_count += 1
    # Close the log and report wall time (rank 0 only in multinode mode).
    if mkl_multinode == True:
        if node_idx == 0:
            file.close()
            cur_time = time.time()
            duration = cur_time - last_time
            print('time one epoch ', duration)
    else:
        file.close()
        cur_time = time.time()
        duration = cur_time - last_time
        print('time one epoch ', duration)
    # Tear down the MLSL distribution and session.
    if mkl_multinode == True:
        mlsl_obj.delete_distribution(dist)
        mlsl_obj.finalize()
# Example 5
def test_for_multiple_layers(print_graph=False,
                             max_depth=0,
                             informative_features=None):
    votes = 3
    g = create_synthetic_graph_with_informative_extra_feature(
        no_of_items=3000,
        no_of_users=3000,
        no_of_votes=votes,
        min_grade=0,
        max_grade=10,
        min_delay=0.0,
        max_delay=1000.0,
        threshold=700.0)
    random.seed(940)
    itemList = list(g.items)
    if print_graph:
        for u in g.users:
            print "User ", u.name, "voted items:"
            for i in u.reviews:
                print "Item ", i.id, "Inherent grade:", i.inherent, "User grade:", u.reviews[
                    i].grade, "Extra feature: ", u.reviews[
                        i].extra_informative_feature
    instance_list = []
    counter = 0
    for i in itemList:
        new_root = ml.InstanceNode(label=i.inherent)
        build_unfolding(0, max_depth, i, new_root, informative_features)
        new_root.set_label(i.inherent)
        instance_list.append(new_root)
        counter += 1
        if counter % 200 == 0:
            print "Created unfolding for ", counter, "items."
    OUTPUT_SIZES = [11, 2, 2]
    INPUT_SIZES = [
        11 + (1 if informative_features[0] == "include" else 0),
        11 + (1 if informative_features[1] == "include" else 0),
        11 + (1 if informative_features[2] == "include" else 0)
    ]
    LEARNING_RATE_VECTOR = [0.05, 0.1, 4.5]
    LEARNING_METHOD_VECTOR = ["steady_rate", "steady_rate", "steady_rate"]
    #LEARNING_METHOD_VECTOR = ["momentum", "momentum", "momentum"]
    #LEARNING_METHOD_VECTOR = ["adadelta", "adadelta", "adadelta"]
    MOMENTUM_VECTOR = [0.01, 0.01, 0.01]
    ADADELTA_VECTOR = [{
        "learning_factor": 1.0,
        "epsilon": 0.001,
        "decay": 0.95
    }, {
        "learning_factor": 1.0,
        "epsilon": 0.001,
        "decay": 0.95
    }, {
        "learning_factor": 1.0,
        "epsilon": 0.001,
        "decay": 0.95
    }]
    OBJECTIVE_FUNCTION = "softmax_classification"
    mlsl_model = ml.MLSL(
        max_depth + 1,
        output_sizes=OUTPUT_SIZES[:max_depth + 1],
        node_feature_sizes=INPUT_SIZES[:max_depth + 1],
        learning_rate_vector=LEARNING_RATE_VECTOR[:max_depth + 1],
        learning_method_vector=LEARNING_METHOD_VECTOR[:max_depth + 1])
    random.shuffle(instance_list)
    training_set = instance_list[0:2000]
    test_set = instance_list[2000:3000]
    print "Training starts for ", max_depth + 1, " levels"
    train_model_force_balance(mlsl_model,
                              training_set,
                              num_instances=50000,
                              max_depth=max_depth,
                              objective_function=OBJECTIVE_FUNCTION,
                              learning_rate_vector=LEARNING_RATE_VECTOR,
                              learning_method_vector=LEARNING_METHOD_VECTOR,
                              momentum_vector=MOMENTUM_VECTOR,
                              adadelta_parameters=ADADELTA_VECTOR)
    return test_model(mlsl_model, test_set)
# Example 6
# reproduced, stored in a retrieval system, transmitted in any form, or
# distributed by any means without the express written consent of
# Intel Corporation.
#

# MLSL library API usage example and correctness check test

from builtins import range
from collections import namedtuple
import mlsl
import numpy as np
from math import fabs
import sys
import ctypes

# Process-wide MLSL session handle for this example/test.
mlsl_obj = mlsl.MLSL()

# Element size in bytes selects the working precision everywhere below:
# 4 -> float32 / mlsl.DataType.FLOAT, 8 -> float64 / mlsl.DataType.DOUBLE.
dtype_size = 8
np_type = "float32" if dtype_size == 4 else "float64"
mlsl_dtype = mlsl.DataType.FLOAT if dtype_size == 4 else mlsl.DataType.DOUBLE
# 64 bytes — presumably used as the alignment for buffer allocations
# (matches common CPU cacheline size).
cacheline_size = 64
# Maximum number of correctness failures tolerated/reported.
fail_counter_max = 5

# Toy training configuration for the correctness check.
global_minibatch_size = 16
layer_count = 2
epoch_count = 2
minibatch_per_epoch = 1

# Filled in after MLSL initialization (assignment not visible in this chunk).
process_idx = None
process_count = 0