import mxnet as mx
from mxnet.module.executor_group import DataParallelExecutorGroup


def check_shared_exec_group(sparse_embedding):
    # generate an rnn sym with #layers=3
    sym = get_rnn_sym(num_layers=3, num_words=num_words,
                      num_hidden=num_hidden, num_embed=num_embed,
                      seq_len=max_bucket_size,
                      sparse_embedding=sparse_embedding)
    arg_names1 = sym.list_arguments()
    input_names = [name[0] for name in data_shapes] + [name[0] for name in label_shapes]
    shared_arg_names = [name for name in arg_names1 if name not in input_names]
    exec_group1 = DataParallelExecutorGroup(symbol=sym, contexts=contexts,
                                            workload=workload,
                                            data_shapes=data_shapes,
                                            label_shapes=label_shapes,
                                            param_names=shared_arg_names,
                                            for_training=True,
                                            inputs_need_grad=False)

    # shared_data_arrays should only have input "data" and "softmax_label" arrays
    for i in range(len(contexts)):
        assert len(exec_group1.shared_data_arrays[i]) == len(input_names),\
            "exec_group1.shared_data_arrays[%d] should have the same number of names as in input_names" % i
        for name in input_names:
            assert name in exec_group1.shared_data_arrays[i],\
                "arg %s should be in exec_group1.shared_data_arrays[%d]" % (name, i)

    # generate an rnn sym with #layers=5
    sym = get_rnn_sym(num_layers=5, num_words=num_words,
                      num_hidden=num_hidden, num_embed=num_embed,
                      seq_len=max_bucket_size,
                      sparse_embedding=sparse_embedding)
    arg_names2 = sym.list_arguments()
    exec_group2 = DataParallelExecutorGroup(symbol=sym, contexts=contexts,
                                            workload=workload,
                                            data_shapes=data_shapes,
                                            label_shapes=label_shapes,
                                            param_names=shared_arg_names,
                                            for_training=True,
                                            inputs_need_grad=False,
                                            shared_group=exec_group1)
    extra_args = [name for name in arg_names2 if name not in shared_arg_names]
    check_shared_grad = not sparse_embedding
    test_shared_exec_group(exec_grp_shared=exec_group1,
                           exec_grp_created=exec_group2,
                           shared_arg_names=shared_arg_names,
                           extra_args=extra_args,
                           check_shared_grad=check_shared_grad)
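# Hedged usage sketch (assumption, not from the original excerpt): the helper above
# expects get_rnn_sym, contexts, workload, data_shapes, label_shapes and the size
# constants to be defined in the enclosing test (see test_executor_group below),
# and is then exercised once per embedding storage type.
for sparse_embedding in [True, False]:
    check_shared_exec_group(sparse_embedding)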
def bind(modQ, data_shapes, label_shapes=None, for_training=True,
         inputs_need_grad=False, force_rebind=False, shared_module=None,
         grad_req='write'):
    if force_rebind:
        modQ._reset_bind()

    if modQ.binded:
        modQ.logger.warning('Already bound, ignoring bind()')
        return

    modQ.for_training = for_training
    modQ.inputs_need_grad = inputs_need_grad
    modQ.binded = True
    modQ._grad_req = grad_req

    if not for_training:
        assert not inputs_need_grad
    else:
        pass
        # this is not True, as some module might not contain a loss function
        # that consumes the labels
        # assert label_shapes is not None

    modQ._data_shapes, modQ._label_shapes = _parse_data_desc(
        modQ.data_names, modQ.label_names, data_shapes, label_shapes)

    if shared_module is not None:
        assert isinstance(shared_module, Module) and \
            shared_module.binded and shared_module.params_initialized
        shared_group = shared_module._exec_group
    else:
        shared_group = None

    modQ._exec_group = DataParallelExecutorGroup(
        modQ._symbol, modQ._context, modQ._work_load_list,
        modQ._data_shapes, modQ._label_shapes, modQ._param_names,
        for_training, inputs_need_grad, shared_group, logger=modQ.logger,
        fixed_param_names=modQ._fixed_param_names, grad_req=grad_req,
        state_names=modQ._state_names)
    modQ._total_exec_bytes = modQ._exec_group._total_exec_bytes

    if shared_module is not None:
        modQ.params_initialized = True
        modQ._arg_params = shared_module._arg_params
        modQ._aux_params = shared_module._aux_params
    elif modQ.params_initialized:
        # if the parameters are already initialized, we are re-binding
        # so automatically copy the already initialized params
        modQ._exec_group.set_params(modQ._arg_params, modQ._aux_params)
    else:
        assert modQ._arg_params is None and modQ._aux_params is None
        param_arrays = [
            nd.zeros(x[0].shape, dtype=x[0].dtype, ctx=x[0][0].context)
            for x in modQ._exec_group.param_arrays
        ]
        modQ._arg_params = {name: arr for name, arr in zip(modQ._param_names, param_arrays)}

        aux_arrays = [
            nd.zeros(x[0].shape, dtype=x[0].dtype, ctx=x[0][0].context)
            for x in modQ._exec_group.aux_arrays
        ]
        modQ._aux_params = {name: arr for name, arr in zip(modQ._aux_names, aux_arrays)}

    if shared_module is not None and shared_module.optimizer_initialized:
        modQ.borrow_optimizer(shared_module)
def test_executor_group():
    def get_rnn_sym(num_layers, num_words, num_hidden, num_embed, seq_len):
        stack = mx.rnn.SequentialRNNCell()
        for i in range(num_layers):
            stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_' % i))
        data = mx.sym.Variable('data')
        label = mx.sym.Variable('softmax_label')
        embed = mx.sym.Embedding(data=data, input_dim=num_words,
                                 output_dim=num_embed, name='embed')

        stack.reset()
        outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True)

        pred = mx.sym.Reshape(outputs, shape=(-1, num_hidden))
        pred = mx.sym.FullyConnected(data=pred, num_hidden=num_words, name='pred')

        label = mx.sym.Reshape(label, shape=(-1,))
        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')
        return pred

    def test_shared_exec_group(exec_grp_shared, exec_grp_created,
                               shared_arg_names=None, extra_args=None):
        # Test shared data arrays
        for i in range(len(exec_grp_shared.execs)):
            # test same shared_data_arrays for two exec groups
            shared_data_array1 = exec_grp_shared.shared_data_arrays[i]
            shared_data_array2 = exec_grp_created.shared_data_arrays[i]
            if extra_args is not None:
                assert len(shared_data_array1) == len(extra_args),\
                    "exec_grp_shared.shared_data_arrays[%d] should have same number of args as extra_args" % i
            assert len(shared_data_array1) == len(shared_data_array2),\
                "length of shared_data_array of the shared executor group not equal to the created executor group"
            for k, v in shared_data_array1.items():
                if extra_args is not None:
                    assert k in extra_args, "arg %s is not in extra_args" % k
                assert k in shared_data_array2,\
                    "arg %s of the shared executor group not in the shared_data_array of the created executor group" % k
                assert mx.test_utils.same_array(v, shared_data_array2[k])

            for data_name, array in exec_grp_shared.shared_data_arrays[i].items():
                assert data_name in exec_grp_created.shared_data_arrays[i], \
                    "Shared input data '%s' is not in " \
                    "shared_data_arrays of created executor group." % (data_name)
                assert mx.test_utils.same_array(array, exec_grp_created.shared_data_arrays[i][data_name]), \
                    "Shared input data '%s' does not share memory." % (data_name)

            # Test shared argument arrays and gradient arrays
            exec_shared = exec_grp_shared.execs[i]
            exec_created = exec_grp_created.execs[i]
            if shared_arg_names is not None:
                # test shared arguments
                for arg_name in shared_arg_names:
                    assert arg_name in exec_created.arg_dict, \
                        "Shared argument '%s' is not in arg_dict of created executor group." % (arg_name)
                    assert mx.test_utils.same_array(exec_shared.arg_dict[arg_name], exec_created.arg_dict[arg_name]), \
                        "Shared argument '%s' does not share memory." % (arg_name)

                # test shared argument gradients
                for arg_name in shared_arg_names:
                    assert arg_name in exec_created.grad_dict, \
                        "Shared argument gradient '%s' is not in " \
                        "grad_dict of created executor group." % (arg_name)
                    assert mx.test_utils.same_array(exec_shared.grad_dict[arg_name], exec_created.grad_dict[arg_name]), \
                        "Shared argument gradient '%s' does not share memory." % (arg_name)

            for arg_name, grad in exec_grp_shared.grad_req.items():
                assert grad == exec_grp_created.grad_req[arg_name], \
                    "Gradient requirements for shared argument '%s' are inconsistent. " \
                    "Shared executor group requires '%s' while created executor group requires '%s'" \
                    % (arg_name, grad, exec_grp_created.grad_req[arg_name])

    contexts = [mx.cpu(0), mx.cpu(1)]
    workload = [1] * len(contexts)
    batch_size = 32
    max_bucket_size = 80
    num_words = 1000
    num_hidden = 100
    num_embed = 200
    data_shapes = [('data', (batch_size, max_bucket_size))]
    label_shapes = [('softmax_label', (batch_size, max_bucket_size))]

    # generate an rnn sym with #layers=3
    sym = get_rnn_sym(num_layers=3, num_words=num_words,
                      num_hidden=num_hidden, num_embed=num_embed,
                      seq_len=max_bucket_size)
    arg_names1 = sym.list_arguments()
    input_names = [name[0] for name in data_shapes] + [name[0] for name in label_shapes]
    shared_arg_names = [name for name in arg_names1 if name not in input_names]
    exec_group1 = DataParallelExecutorGroup(symbol=sym, contexts=contexts,
                                            workload=workload,
                                            data_shapes=data_shapes,
                                            label_shapes=label_shapes,
                                            param_names=shared_arg_names,
                                            for_training=True,
                                            inputs_need_grad=False)

    # shared_data_arrays should only have input "data" and "softmax_label" arrays
    for i in range(len(contexts)):
        assert len(exec_group1.shared_data_arrays[i]) == len(input_names),\
            "exec_group1.shared_data_arrays[%d] should have the same number of names as in input_names" % i
        for name in input_names:
            assert name in exec_group1.shared_data_arrays[i],\
                "arg %s should be in exec_group1.shared_data_arrays[%d]" % (name, i)

    # generate an rnn sym with #layers=5
    sym = get_rnn_sym(num_layers=5, num_words=num_words,
                      num_hidden=num_hidden, num_embed=num_embed,
                      seq_len=max_bucket_size)
    arg_names2 = sym.list_arguments()
    exec_group2 = DataParallelExecutorGroup(symbol=sym, contexts=contexts,
                                            workload=workload,
                                            data_shapes=data_shapes,
                                            label_shapes=label_shapes,
                                            param_names=shared_arg_names,
                                            for_training=True,
                                            inputs_need_grad=False,
                                            shared_group=exec_group1)
    extra_args = [name for name in arg_names2 if name not in shared_arg_names]
    test_shared_exec_group(exec_grp_shared=exec_group1,
                           exec_grp_created=exec_group2,
                           shared_arg_names=shared_arg_names,
                           extra_args=extra_args)
def bind(self, data_shapes, label_shapes=None, for_training=True,
         inputs_need_grad=False, force_rebind=False, shared_module=None,
         grad_req='write'):
    """Binds the symbols to construct executors. This is necessary before one
    can perform computation with the module.

    Parameters
    ----------
    data_shapes : list of (str, tuple)
        Typically is ``data_iter.provide_data``.
    label_shapes : list of (str, tuple)
        Typically is ``data_iter.provide_label``.
    for_training : bool
        Default is ``True``. Whether the executors should be bound for training.
    inputs_need_grad : bool
        Default is ``False``. Whether the gradients to the input data need to be computed.
        Typically this is not needed. But this might be needed when implementing composition
        of modules.
    force_rebind : bool
        Default is ``False``. This function does nothing if the executors are already
        bound. But with this ``True``, the executors will be forced to rebind.
    shared_module : Module
        Default is ``None``. This is used in bucketing. When not ``None``, the shared module
        essentially corresponds to a different bucket -- a module with different symbol
        but with the same sets of parameters (e.g. unrolled RNNs with different lengths).
    """
    # force rebinding is typically used when one wants to switch from
    # training to prediction phase.
    if force_rebind:
        self._reset_bind()

    if self.binded:
        self.logger.warning('Already bound, ignoring bind()')
        return

    self.for_training = for_training
    self.inputs_need_grad = inputs_need_grad
    self.binded = True
    self._grad_req = grad_req

    if not for_training:
        assert not inputs_need_grad
    else:
        pass
        # this is not True, as some module might not contain a loss function
        # that consumes the labels
        # assert label_shapes is not None

    self._data_shapes, self._label_shapes = _parse_data_desc(
        self.data_names, self.label_names, data_shapes, label_shapes)

    if shared_module is not None:
        assert isinstance(shared_module, Module) and \
            shared_module.binded and shared_module.params_initialized
        shared_group = shared_module._exec_group
        assert len(shared_group.execs) >= len(self._context)
    else:
        shared_group = None

    self._exec_group = DataParallelExecutorGroup(
        self._symbol, self._context, self._work_load_list,
        self._data_shapes, self._label_shapes, self._param_names,
        for_training, inputs_need_grad, shared_group, logger=self.logger,
        fixed_param_names=self._fixed_param_names, grad_req=grad_req,
        group2ctxs=self._group2ctxs, state_names=self._state_names)
    self._total_exec_bytes = self._exec_group._total_exec_bytes

    if shared_module is not None:
        self.params_initialized = True
        self._arg_params = shared_module._arg_params
        self._aux_params = shared_module._aux_params
    elif self.params_initialized:
        # if the parameters are already initialized, we are re-binding
        # so automatically copy the already initialized params
        self._exec_group.set_params(self._arg_params, self._aux_params)
    else:
        assert self._arg_params is None and self._aux_params is None
        param_arrays = [
            zeros(shape=x[0].shape, dtype=x[0].dtype, stype=x[0].stype)
            for x in self._exec_group.param_arrays
        ]
        self._arg_params = {name: arr for name, arr in zip(self._param_names, param_arrays)}

        aux_arrays = [
            zeros(x[0].shape, dtype=x[0].dtype)
            for x in self._exec_group.aux_arrays
        ]
        self._aux_params = {name: arr for name, arr in zip(self._aux_names, aux_arrays)}

    if shared_module is not None and shared_module.optimizer_initialized:
        self.borrow_optimizer(shared_module)
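# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module code): illustrates how
# bind() with shared_module is typically used for bucketing -- a "master"
# module is bound for the largest bucket first, and per-bucket modules built
# from differently unrolled symbols then reuse its parameter arrays. The
# helper name, network sizes, and shapes below are illustrative assumptions.
# ---------------------------------------------------------------------------
def _example_bind_with_shared_module():
    import mxnet as mx

    def make_sym(seq_len):
        # small RNN classifier, mirroring get_rnn_sym from the test above
        stack = mx.rnn.SequentialRNNCell()
        stack.add(mx.rnn.LSTMCell(num_hidden=16, prefix='lstm_l0_'))
        data = mx.sym.Variable('data')
        label = mx.sym.Variable('softmax_label')
        embed = mx.sym.Embedding(data=data, input_dim=50, output_dim=8, name='embed')
        stack.reset()
        outputs, _ = stack.unroll(seq_len, inputs=embed, merge_outputs=True)
        pred = mx.sym.Reshape(outputs, shape=(-1, 16))
        pred = mx.sym.FullyConnected(data=pred, num_hidden=50, name='pred')
        label = mx.sym.Reshape(label, shape=(-1,))
        return mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')

    batch_size = 4

    # master module: bound for the largest bucket, parameters initialized
    master = mx.mod.Module(make_sym(20), context=mx.cpu())
    master.bind(data_shapes=[('data', (batch_size, 20))],
                label_shapes=[('softmax_label', (batch_size, 20))],
                for_training=True)
    master.init_params()

    # a shorter bucket: shared_module makes its executors reuse the master's
    # parameter (and gradient) arrays instead of allocating new ones
    bucket = mx.mod.Module(make_sym(10), context=mx.cpu())
    bucket.bind(data_shapes=[('data', (batch_size, 10))],
                label_shapes=[('softmax_label', (batch_size, 10))],
                for_training=True, shared_module=master)
    return master, bucket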