def __init__(self, send_nodes, recv_nodes, scatter_send_nodes,
             scatter_recv_nodes, gather_send_nodes, gather_recv_nodes,
             allreduce_nodes, broadcast_send_nodes, broadcast_recv_nodes,
             **kwargs):
    """Store the communication-op lookup tables and set up MLSL/MPI state.

    Each ``*_nodes`` argument maps an op identifier to the corresponding
    communication op; they are stored verbatim for later lookup by the
    ``mlsl_*`` methods.

    NOTE(review): this constructs and initializes a fresh MLSL instance per
    object; presumably MLSL tolerates repeated init in one process — confirm.
    """
    super(HetrLocals, self).__init__(**kwargs)
    self.send_nodes = send_nodes
    self.recv_nodes = recv_nodes
    self.scatter_send_nodes = scatter_send_nodes
    self.scatter_recv_nodes = scatter_recv_nodes
    self.gather_send_nodes = gather_send_nodes
    self.gather_recv_nodes = gather_recv_nodes
    self.allreduce_nodes = allreduce_nodes
    self.broadcast_send_nodes = broadcast_send_nodes
    self.broadcast_recv_nodes = broadcast_recv_nodes

    # MLSL-specific: init() must precede the process-count/idx queries.
    self.mlsl_obj = mlsl.MLSL()
    self.mlsl_obj.init()
    self.process_count = self.mlsl_obj.get_process_count()
    self.process_idx = self.mlsl_obj.get_process_idx()
    # data parallelism: distribution is created lazily (see create_distribution).
    self.distribution = None
    # MPI-specific: point-to-point send/recv go over COMM_WORLD.
    self.comm = MPI.COMM_WORLD
def align_ndarray(element_count, alignment, dtype):
    """Return a 1-D numpy array of ``element_count`` items whose data pointer
    is aligned to ``alignment`` bytes.

    When the ``mlsl`` package is importable its aligned allocator is used;
    otherwise the array is over-allocated with numpy and a view starting at
    an aligned element offset is returned.

    Arguments:
        element_count: number of elements in the returned array.
        alignment: required byte alignment; assumed to be a multiple of
            ``dtype.itemsize`` (true for the cache-line alignments this is
            used with).
        dtype: numpy dtype. Only float32/float64 are supported on the MLSL
            path; the numpy fallback accepts any dtype.

    Raises:
        ValueError: if ``dtype`` is unsupported on the MLSL path.
    """
    try:
        import mlsl
        import ctypes
        mlsl_obj = mlsl.MLSL()
        if dtype.name == 'float32':
            c_type_name = 'c_float'
        elif dtype.name == 'float64':
            c_type_name = 'c_double'
        else:
            # Previously this fell through with c_type_name=None and died in
            # getattr() with an obscure TypeError; fail explicitly instead.
            raise ValueError(
                'align_ndarray: unsupported dtype {} for MLSL allocation'
                .format(dtype.name))
        c_type = getattr(ctypes, c_type_name)
        type_size = ctypes.sizeof(c_type(1))
        mlsl_buf = mlsl_obj.alloc(element_count * type_size, alignment)
        array = ctypes.cast(mlsl_buf, ctypes.POINTER(c_type * element_count))
        return np.frombuffer(array.contents, dtype)
    except ImportError:
        # numpy fallback: over-allocate by alignment-1 elements (always enough
        # room for the worst-case shift), then slice at the first element whose
        # address is alignment-aligned.
        x = np.empty(element_count + (alignment - 1), dtype)
        byte_offset = x.ctypes.data % alignment
        # numpy data pointers are itemsize-aligned, so when alignment is a
        # multiple of itemsize the division below is exact and the slice start
        # lands precisely on an aligned address.
        padding = (0 if byte_offset == 0
                   else (alignment - byte_offset) // dtype.itemsize)
        return x[padding:padding + element_count]
class HetrLocals(object):
    """Per-worker helpers for HeTr communication ops (send/recv, scatter,
    gather, allreduce, broadcast) implemented on top of MLSL collectives and
    MPI point-to-point messages.

    NOTE(review): the class-level statements below run at class-definition
    time, so importing this module initializes MLSL.
    """

    # Shared MLSL handle and process topology, established once per process.
    mlsl_obj = mlsl.MLSL()
    mlsl_obj.init()
    process_count = mlsl_obj.get_process_count()
    process_idx = mlsl_obj.get_process_idx()

    def __init__(self, send_nodes, recv_nodes, scatter_send_nodes,
                 scatter_recv_nodes, gather_send_nodes, gather_recv_nodes,
                 allreduce_nodes, broadcast_send_nodes, broadcast_recv_nodes,
                 **kwargs):
        """Store communication-op lookup tables and per-instance MLSL/MPI state.

        NOTE(review): this shadows the class-level mlsl_obj/process_* with
        per-instance equivalents and calls init() again — presumably MLSL
        tolerates repeated init; confirm.
        """
        super(HetrLocals, self).__init__(**kwargs)
        self.send_nodes = send_nodes
        self.recv_nodes = recv_nodes
        self.scatter_send_nodes = scatter_send_nodes
        self.scatter_recv_nodes = scatter_recv_nodes
        self.gather_send_nodes = gather_send_nodes
        self.gather_recv_nodes = gather_recv_nodes
        self.allreduce_nodes = allreduce_nodes
        self.broadcast_send_nodes = broadcast_send_nodes
        self.broadcast_recv_nodes = broadcast_recv_nodes

        # MLSL-specific
        self.mlsl_obj = mlsl.MLSL()
        self.mlsl_obj.init()
        self.process_count = self.mlsl_obj.get_process_count()
        self.process_idx = self.mlsl_obj.get_process_idx()
        # data parallelism: created lazily by create_distribution()
        self.distribution = None
        # MPI-specific
        self.comm = MPI.COMM_WORLD

    def create_distribution(self):
        """Lazily create the data-parallel MLSL distribution (idempotent)."""
        if not self.distribution:
            self.distribution = self.mlsl_obj.create_distribution(self.process_count, 1)

    def close(self):
        """Release this instance's distribution (if any) and finalize MLSL."""
        if self.distribution:
            self.mlsl_obj.delete_distribution(self.distribution)
        self.mlsl_obj.finalize()

    @staticmethod
    def close_mlsl():
        """Finalize the class-level MLSL handle and shut the library down."""
        HetrLocals.mlsl_obj.finalize()
        mlsl.close()

    @staticmethod
    def mlsl_alloc(element_count, alignment, dtype):
        """Allocate an aligned numpy array backed by MLSL-owned memory.

        Only float32/float64 dtypes are mapped to ctypes here; any other
        dtype leaves c_type_name=None and getattr() below raises TypeError.
        Free the result with mlsl_free().
        """
        if dtype.name == 'float32':
            c_type_name = 'c_float'
        elif dtype.name == 'float64':
            c_type_name = 'c_double'
        else:
            c_type_name = None
        type_size = ctypes.sizeof(getattr(ctypes, c_type_name)(1))
        mlsl_buf = HetrLocals.mlsl_obj.alloc(element_count * type_size, alignment)
        array = ctypes.cast(mlsl_buf, ctypes.POINTER(getattr(ctypes, c_type_name) * element_count))
        np_array = np.frombuffer(array.contents, dtype)
        return np_array

    @staticmethod
    def mlsl_free(array):
        """Return an mlsl_alloc()'d array's underlying buffer to MLSL."""
        HetrLocals.mlsl_obj.free(array.__array_interface__['data'][0])

    def as_buffer(self, array):
        """Expose a numpy array as a ctypes buffer for MLSL calls."""
        # array.shape is () for scalar
        if not array.shape:
            array = np.atleast_1d(array)
        return np.ctypeslib.as_ctypes(array)

    def mlsl_send(self, send_id, x_nparr):
        """Blocking MPI send of x_nparr to the op's peer process."""
        send_op = self.send_nodes[send_id]
        self.comm.Send(x_nparr, dest=send_op.metadata['peer_id'], tag=USER_TAG)

    def recv_from_mlsl_send(self, recv_id, out):
        """Blocking MPI receive into ``out`` from the op's peer process."""
        recv_op = self.recv_nodes[recv_id]
        self.comm.Recv(out, source=recv_op.metadata['peer_id'], tag=USER_TAG)
        return out

    def mlsl_gather_send(self, gather_send_id, x_nparr):
        """Non-root side of a gather (or reduce, when use_reduce is set)."""
        gather_send_op = self.gather_send_nodes[gather_send_id]
        # todo: get real root_idx
        root_idx = 0
        # np.atleast_1d is used in cases when we need to reduce to a scalar value
        x_nparr = np.atleast_1d(x_nparr)
        if self.process_idx == root_idx:
            # todo: remove that workaround for non-symmetric case
            # Root stashes its local contribution on the op; the matching
            # gather_recv op picks it up.
            gather_send_op.arr = x_nparr
        else:
            send_buf = self.as_buffer(x_nparr)
            send_count = x_nparr.size
            recv_buf = None
            if gather_send_op.use_reduce:
                req = self.distribution.reduce(send_buf, send_buf, send_count,
                                               mlsl.DataType.FLOAT,
                                               mlsl.ReductionType.SUM,
                                               root_idx, mlsl.GroupType.DATA)
            else:
                req = self.distribution.gather(send_buf, send_count, recv_buf,
                                               mlsl.DataType.FLOAT,
                                               root_idx, mlsl.GroupType.DATA)
            self.mlsl_obj.wait(req)

    def gather_recv_from_mlsl_gather_send(self, gather_recv_id, out):
        """Root side of a gather/reduce; fills ``out`` with gathered data."""
        gather_recv_op = self.gather_recv_nodes[gather_recv_id]
        # todo: get real root_idx
        root_idx = 0
        # todo: remove that workaround for non-symmetric case
        if self.process_idx == root_idx:
            send_node = gather_recv_op.send_nodes[0]
            send_buf = self.as_buffer(send_node.arr)
            send_count = send_node.arr.size
            recv_buf = self.as_buffer(out)
            if gather_recv_op.use_reduce:
                req = self.distribution.reduce(send_buf, recv_buf, send_count,
                                               mlsl.DataType.FLOAT,
                                               mlsl.ReductionType.SUM,
                                               root_idx, mlsl.GroupType.DATA)
            else:
                req = self.distribution.gather(send_buf, send_count, recv_buf,
                                               mlsl.DataType.FLOAT,
                                               root_idx, mlsl.GroupType.DATA)
            self.mlsl_obj.wait(req)
            # todo: replace by real reduce operation
            # SUM followed by division approximates a mean reduction.
            if gather_recv_op.use_reduce:
                out /= self.process_count
        return out

    def mlsl_scatter_send(self, scatter_send_id, x_nparr):
        """Root side of a scatter: stash the full array for scatter_recv."""
        scatter_send_op = self.scatter_send_nodes[scatter_send_id]
        # todo: get real root_idx
        root_idx = 0
        # todo: remove that workaround for non-symmetric case
        if self.process_idx == root_idx:
            scatter_send_op.arr = x_nparr

    def scatter_recv_from_mlsl_scatter_send(self, scatter_recv_id, out):
        """All-process side of a scatter; each process receives its slice in ``out``."""
        scatter_recv_op = self.scatter_recv_nodes[scatter_recv_id]
        # todo: get real root_idx
        root_idx = 0
        # todo: remove that workaround for non-symmetric case
        # Only the root supplies a send buffer; others pass None.
        send_buf = None
        if self.process_idx == root_idx:
            send_node = scatter_recv_op.send_node()
            send_buf = self.as_buffer(send_node.arr)
        recv_buf = self.as_buffer(out)
        recv_count = out.size
        req = self.distribution.scatter(send_buf, recv_buf, recv_count,
                                        mlsl.DataType.FLOAT, root_idx,
                                        mlsl.GroupType.DATA)
        self.mlsl_obj.wait(req)
        return out

    def mlsl_allreduce_start(self, allreduce_id, out, x_nparr):
        """Kick off a non-blocking SUM allreduce; completion in mlsl_allreduce_wait.

        NOTE(review): allreduce_op._req is set once but never read again in
        this class — the request actually used is allreduce_op.req; looks
        like dead code, confirm before removing.
        """
        allreduce_op = self.allreduce_nodes[allreduce_id]
        if not hasattr(allreduce_op, '_req'):
            allreduce_op._req = [None]
        if allreduce_op.reduce_func == 'sum' or allreduce_op.reduce_func == 'mean':
            # 'mean' is implemented as SUM here; the division happens in
            # mlsl_allreduce_wait().
            allreduce_op.arr = out
            send_buf = self.as_buffer(x_nparr)
            send_count = x_nparr.size
            recv_buf = self.as_buffer(out)
            allreduce_op.req = self.distribution.all_reduce(send_buf, recv_buf, send_count,
                                                            mlsl.DataType.FLOAT,
                                                            mlsl.ReductionType.SUM,
                                                            mlsl.GroupType.DATA)
        else:
            raise RuntimeError('Reduce function {} is not supported.'
                               .format(allreduce_op.reduce_func))

    def mlsl_allreduce_wait(self, allreduce_id):
        """Wait for the matching allreduce-start op and finish 'mean' by dividing."""
        allreduce_op = self.allreduce_nodes[allreduce_id]
        # Locate the start op this wait pairs with via control dependencies.
        start_node = next(op for op in allreduce_op.control_deps
                          if isinstance(op, CPUMlslAllReduceStartOp))
        self.mlsl_obj.wait(start_node.req)
        if allreduce_op.reduce_func == 'sum':
            # sum reduction is performed inside MLSL
            pass
        elif allreduce_op.reduce_func == 'mean':
            start_node.arr /= self.process_count
        else:
            raise RuntimeError('Reduce function {} is not supported.'
                               .format(allreduce_op.reduce_func))

    def mlsl_broadcast_send(self, broadcast_send_id, x_nparr):
        """Root side of a broadcast: stash the array for broadcast_recv."""
        broadcast_send_op = self.broadcast_send_nodes[broadcast_send_id]
        # todo: get real root_idx
        root_idx = 0
        # todo: remove that workaround for non-symmetric case
        if self.process_idx == root_idx:
            broadcast_send_op.arr = x_nparr

    def broadcast_recv_from_mlsl_broadcast_send(self, broadcast_recv_id, out):
        """All-process side of a broadcast; every process ends with the data in ``out``."""
        broadcast_recv_op = self.broadcast_recv_nodes[broadcast_recv_id]
        # todo: get real root_idx
        root_idx = 0
        # todo: remove that workaround for non-symmetric case
        req = None
        if self.process_idx == root_idx:
            send_buf = None
            send_node = broadcast_recv_op.send_node()
            send_buf = self.as_buffer(send_node.arr)
            count = send_node.arr.size
            req = self.distribution.bcast(send_buf, count, mlsl.DataType.FLOAT,
                                          root_idx, mlsl.GroupType.DATA)
            # Root copies its own data into out directly; bcast fills the rest.
            out[...] = send_node.arr
        else:
            recv_buf = self.as_buffer(out)
            count = out.size
            req = self.distribution.bcast(recv_buf, count, mlsl.DataType.FLOAT,
                                          root_idx, mlsl.GroupType.DATA)
        self.mlsl_obj.wait(req)
        return out
def main(configuration, ps_device=None, devices=None):
    """Train an encoder-decoder NMT model, optionally across MLSL nodes.

    Python 2 code (bare ``print`` statements). Builds the trainer in one of
    three modes — model-parallel, data-parallel (one input clone per device),
    or single-device — then loops over epochs feeding minibatches to
    ``enc_dec.train_fn``. When configuration['mkl_multinode'] is true, MLSL
    is initialized and an MLSL distribution is passed to the trainer.

    NOTE(review): ``file`` shadows the builtin, and per-iteration timing /
    validation code is deliberately commented out ("for fast training").
    """
    mkl_multinode = configuration['mkl_multinode']
    if mkl_multinode == True:
        mlsl_obj = mlsl.MLSL()
        mlsl_obj.init()
        node_idx = mlsl_obj.get_process_idx()
        node_num = mlsl_obj.get_process_count()
        print 'rank ', node_idx
        print 'nodes ', node_num
        dist = mlsl_obj.create_distribution(node_num, 1)
    else:
        mlsl_obj = None
        dist = None
    prefer_to_model_parallel = configuration['prefer_to_model_parallel']
    l1_reg_weight = configuration['l1_reg_weight']
    l2_reg_weight = configuration['l2_reg_weight']
    # time_steps*nb_samples
    src = K.placeholder(shape=(None, None), dtype='int32')
    src_mask = K.placeholder(shape=(None, None))
    trg = K.placeholder(shape=(None, None), dtype='int32')
    trg_mask = K.placeholder(shape=(None, None))
    # for fast training of new parameters
    ite = K.placeholder(ndim=0)

    enc_dec = EncoderDecoder(**configuration)
    softmax_output_num_sampled = configuration['softmax_output_num_sampled']
    if devices:
        if prefer_to_model_parallel:
            enc_dec.build_trainer_with_model_parallel(
                src, src_mask, trg, trg_mask, ite, ps_device, devices,
                l1_reg_weight=l1_reg_weight,
                l2_reg_weight=l2_reg_weight)
        else:
            # clone the input: data-parallel mode needs one placeholder set
            # per device.
            src = [
                K.placeholder(shape=(None, None), dtype='int32')
                for _ in devices
            ]
            src_mask = [K.placeholder(shape=(None, None)) for _ in devices]
            trg = [
                K.placeholder(shape=(None, None), dtype='int32')
                for _ in devices
            ]
            trg_mask = [K.placeholder(shape=(None, None)) for _ in devices]
            enc_dec.build_trainer_with_data_parallel(
                src, src_mask, trg, trg_mask, ite, devices,
                l1_reg_weight=l1_reg_weight,
                l2_reg_weight=l2_reg_weight,
                softmax_output_num_sampled=softmax_output_num_sampled)
    else:
        enc_dec.build_trainer(
            src, src_mask, trg, trg_mask, ite,
            l1_reg_weight=l1_reg_weight,
            l2_reg_weight=l2_reg_weight,
            softmax_output_num_sampled=softmax_output_num_sampled,
            mlsl_obj=mlsl_obj, dist=dist)
    enc_dec.build_sampler()

    if configuration['reload']:
        enc_dec.load()

    '''
    # comment for fast training
    sample_search = BeamSearch(enc_dec=enc_dec,
                               configuration=configuration,
                               beam_size=1,
                               maxlen=configuration['seq_len_src'],
                               stochastic=True)
    valid_search = BeamSearch(enc_dec=enc_dec,
                              configuration=configuration,
                              beam_size=configuration['beam_size'],
                              maxlen=3 * configuration['seq_len_src'],
                              stochastic=False)
    sampler = Sampler(sample_search, **configuration)
    bleuvalidator = BleuValidator(valid_search, **configuration)
    '''

    # train function
    train_fn = enc_dec.train_fn
    # train data
    ds = DStream(**configuration)
    # valid data
    '''
    # comment for fast training
    vs = get_devtest_stream(data_type='valid', input_file=None, **configuration)
    '''

    iters = args.start
    valid_bleu_best = -1
    epoch_best = -1
    iters_best = -1
    max_epochs = configuration['finish_after']
    logger.info("epochs %d" % (max_epochs))

    # Per-node training-cost log; only rank 0 writes in multinode mode.
    fn = 'nmt_mkl_log'
    if mkl_multinode == True:
        if node_idx == 0:
            file = open(fn, 'w', 0)
            last_time = time.time()
            print('mkl multinode')
    else:
        file = open(fn, 'w', 0)
        last_time = time.time()
        print('mkl single node')

    for epoch in range(max_epochs):
        for x, x_mask, y, y_mask in ds.get_iterator():
            iter_count = 0
            #last_time = time.time()
            # for data parallel, we need to split the data into #num devices part
            if devices and not prefer_to_model_parallel:
                # ignore the case that the number of samples is less than the number of devices
                num_devices = len(devices)
                num_samples = len(x)
                if num_samples < num_devices:
                    logger.warn(
                        'epoch %d \t updates %d ignored current mini-batch, since its number of samples (%d) < the number of devices (%d)'
                        % (epoch, iters, num_samples, num_devices))
                    continue
                inputs = []
                for data in (x, x_mask, y, y_mask):
                    parts = split(data, num_devices)
                    parts = [item.T for item in parts]
                    inputs.extend(parts)
            else:
                inputs = [x.T, x_mask.T, y.T, y_mask.T]
            #print('train start')
            tc = train_fn(inputs)
            #print('train finish')
            iters += 1
            #cur_time = time.time()
            #duration = cur_time - last_time
            #num_of_words = np.prod(x.shape)
            #words_per_sec = int(num_of_words / duration)
            #logger.info('epoch %d \t updates %d train cost %.4f use time %.4f sec, %d words/sec, data x %s, data y %s'
            #            % (epoch, iters, tc[0], duration, words_per_sec, x.shape, y.shape))

            ''' # Commented for fast training
            if iters % configuration['save_freq'] == 0:
                enc_dec.save()
            if iters % configuration['sample_freq'] == 0:
                sampler.apply(x, y)
            if iters < configuration['val_burn_in']:
                continue
            if (iters <= configuration['val_burn_in_fine'] and iters % configuration['valid_freq'] == 0) \
                    or (iters > configuration['val_burn_in_fine'] and iters % configuration['valid_freq_fine'] == 0):
                valid_bleu = bleuvalidator.apply(vs, configuration['valid_out'])
                os.system('mkdir -p results/%d' % iters)
                os.system('mv %s* %s results/%d' % (configuration['valid_out'], configuration['saveto'], iters))
                logger.info('valid_test \t epoch %d \t updates %d valid_bleu %.4f'
                            % (epoch, iters, valid_bleu))
                if valid_bleu > valid_bleu_best:
                    valid_bleu_best = valid_bleu
                    epoch_best = epoch
                    iters_best = iters
                    enc_dec.save(path=configuration['saveto_best'])
            '''
            '''
            if mkl_multinode and node_idx == 0:
                file.write(str(tc[0])+'\n')
            else:
                file.write(str(tc[0])+'\n')
            '''
            iter_count += 1

    # Report wall-clock time and close the log (rank 0 only in multinode mode).
    if mkl_multinode == True:
        if node_idx == 0:
            file.close()
            cur_time = time.time()
            duration = cur_time - last_time
            print('time one epoch ', duration)
    else:
        file.close()
        cur_time = time.time()
        duration = cur_time - last_time
        print('time one epoch ', duration)

    # Tear down the MLSL distribution and library before exit.
    if mkl_multinode == True:
        mlsl_obj.delete_distribution(dist)
        mlsl_obj.finalize()
def test_for_multiple_layers(print_graph=False, max_depth=0, informative_features=None):
    """Build a synthetic voting graph, unfold each item into an instance tree,
    train a multi-level MLSL model on 2000 items and evaluate on the other 1000.

    Python 2 code (bare ``print`` statements). Returns whatever
    ``test_model`` reports (presumably an accuracy/score — confirm there).

    NOTE(review): informative_features defaults to None but is indexed
    ([0]..[2]) below, so callers must pass a 3-element sequence of
    "include"/"exclude" flags; passing None raises TypeError.
    """
    votes = 3
    g = create_synthetic_graph_with_informative_extra_feature(
        no_of_items=3000, no_of_users=3000, no_of_votes=votes, min_grade=0,
        max_grade=10, min_delay=0.0, max_delay=1000.0, threshold=700.0)
    # Fixed seed keeps the shuffle below reproducible across runs.
    random.seed(940)
    itemList = list(g.items)
    if print_graph:
        for u in g.users:
            print "User ", u.name, "voted items:"
            for i in u.reviews:
                print "Item ", i.id, "Inherent grade:", i.inherent, "User grade:", u.reviews[
                    i].grade, "Extra feature: ", u.reviews[
                        i].extra_informative_feature
    # One unfolding tree per item, labeled with the item's inherent grade.
    instance_list = []
    counter = 0
    for i in itemList:
        new_root = ml.InstanceNode(label=i.inherent)
        build_unfolding(0, max_depth, i, new_root, informative_features)
        new_root.set_label(i.inherent)
        instance_list.append(new_root)
        counter += 1
        if counter % 200 == 0:
            print "Created unfolding for ", counter, "items."
    # Per-level layer sizes; each input widens by one when that level's
    # extra informative feature is included.
    OUTPUT_SIZES = [11, 2, 2]
    INPUT_SIZES = [
        11 + (1 if informative_features[0] == "include" else 0),
        11 + (1 if informative_features[1] == "include" else 0),
        11 + (1 if informative_features[2] == "include" else 0)
    ]
    LEARNING_RATE_VECTOR = [0.05, 0.1, 4.5]
    LEARNING_METHOD_VECTOR = ["steady_rate", "steady_rate", "steady_rate"]
    #LEARNING_METHOD_VECTOR = ["momentum", "momentum", "momentum"]
    #LEARNING_METHOD_VECTOR = ["adadelta", "adadelta", "adadelta"]
    MOMENTUM_VECTOR = [0.01, 0.01, 0.01]
    ADADELTA_VECTOR = [{
        "learning_factor": 1.0,
        "epsilon": 0.001,
        "decay": 0.95
    }, {
        "learning_factor": 1.0,
        "epsilon": 0.001,
        "decay": 0.95
    }, {
        "learning_factor": 1.0,
        "epsilon": 0.001,
        "decay": 0.95
    }]
    OBJECTIVE_FUNCTION = "softmax_classification"
    # Model depth is max_depth+1 levels; the per-level vectors are truncated
    # to that depth.
    mlsl_model = ml.MLSL(
        max_depth + 1,
        output_sizes=OUTPUT_SIZES[:max_depth + 1],
        node_feature_sizes=INPUT_SIZES[:max_depth + 1],
        learning_rate_vector=LEARNING_RATE_VECTOR[:max_depth + 1],
        learning_method_vector=LEARNING_METHOD_VECTOR[:max_depth + 1])
    # Fixed 2000/1000 train/test split after a seeded shuffle.
    random.shuffle(instance_list)
    training_set = instance_list[0:2000]
    test_set = instance_list[2000:3000]
    print "Training starts for ", max_depth + 1, " levels"
    train_model_force_balance(mlsl_model, training_set, num_instances=50000,
                              max_depth=max_depth,
                              objective_function=OBJECTIVE_FUNCTION,
                              learning_rate_vector=LEARNING_RATE_VECTOR,
                              learning_method_vector=LEARNING_METHOD_VECTOR,
                              momentum_vector=MOMENTUM_VECTOR,
                              adadelta_parameters=ADADELTA_VECTOR)
    return test_model(mlsl_model, test_set)
# reproduced, stored in a retrieval system, transmitted in any form, or # distributed by any means without the express written consent of # Intel Corporation. # # MLSL library API usage example and correctness check test from builtins import range from collections import namedtuple import mlsl import numpy as np from math import fabs import sys import ctypes mlsl_obj = mlsl.MLSL() dtype_size = 8 np_type = "float32" if dtype_size == 4 else "float64" mlsl_dtype = mlsl.DataType.FLOAT if dtype_size == 4 else mlsl.DataType.DOUBLE cacheline_size = 64 fail_counter_max = 5 global_minibatch_size = 16 layer_count = 2 epoch_count = 2 minibatch_per_epoch = 1 process_idx = None process_count = 0