def _wrapper(*inputs, **kwargs):
    # model_func, device_type, is_training and init_grad come from the
    # enclosing decorator's closure.
    add_to_collection(BACKEND_DEVICE_TYPE, device_type.lower())

    # Replace every input with an MXNet placeholder symbol; sym_input_dict
    # maps symbol names back to the XDL tensors that feed them.
    sym_input_dict = {}
    placeholders = []
    for x in inputs:
        placeholder = recursive_make_placeholder(x, sym_input_dict)
        placeholders.append(placeholder)

    # Gear inputs (outputs of a preceding sub-model) get their own placeholders.
    gear_input_num = 0
    if 'gear_inputs' in kwargs:
        gear_inputs = kwargs['gear_inputs']
        gear_placeholder = recursive_make_placeholder(
            gear_inputs, sym_input_dict, True)
        kwargs['gear_inputs'] = gear_placeholder
        gear_input_num = len(flatten(gear_inputs))

    # Trace the user model to obtain its output symbols; the first output
    # must be the loss.
    model_outputs = model_func(*placeholders, **kwargs)
    if len(model_outputs) == 0:
        raise Exception('model_func must return loss')
    symbol_list = list(model_outputs)

    # Batch-norm statistics are appended as extra, gradient-blocked outputs
    # so their moving averages can be updated outside the graph.
    bn_statistic = get_collection(MXNET_BN_STATISTIC)
    bn_var_names = []
    bn_syms = []
    moments = []
    if bn_statistic is not None and len(bn_statistic) > 0:
        bn_var_names.extend([x[0] for x in bn_statistic])
        bn_syms.extend([x[1] for x in bn_statistic])
        moments.extend([x[2] for x in bn_statistic])
        symbol_list.extend([mx.sym.BlockGrad(x) for x in bn_syms])

    symbol = mx.sym.Group(symbol_list)
    executor = symbol.simple_bind(ctx=mx.cpu())
    add_variable_inputs(symbol, sym_input_dict, is_training=is_training)

    # Collect the XDL tensors feeding each graph argument (and any bound
    # auxiliary states), in graph-argument order.
    sym_names = symbol.list_arguments()
    xdl_inputs = []
    for sym in sym_names:
        xdl_inputs.append(sym_input_dict[sym])
    for aux in symbol.list_auxiliary_states():
        if aux in sym_input_dict:
            xdl_inputs.append(sym_input_dict[aux])
            sym_names.append(aux)

    target_size = len(executor.outputs)
    gradient_size = len(executor.grad_arrays)

    # Run the serialized MXNet graph as a single XDL op; the GPU path only
    # differs by the device placement context.
    backend_op_kwargs = dict(
        inputs=xdl_inputs,
        var_name_str=','.join(sym_names),
        device_type=device_type.lower(),
        graph_def=serialize_graph(symbol),
        target_size=target_size,
        gradient_size=gradient_size if is_training else 0,
        is_training=is_training,
        init_grad=init_grad if init_grad is not None
        else np.array([], dtype=np.float32),
        has_init_grad=init_grad is not None)
    if device_type.lower() == 'cpu':
        outputs, gradients = xdl.mxnet_backend_op(**backend_op_kwargs)
    else:
        with xdl.device('GPU'):
            outputs, gradients = xdl.mxnet_backend_op(**backend_op_kwargs)

    # Split the batch-norm statistics off the real model outputs and emit
    # moving-average update ops for them.
    bn_var_num = len(bn_var_names)
    if bn_var_num > 0:
        bn_outputs = outputs[len(outputs) - bn_var_num:]
        outputs = outputs[0:len(outputs) - bn_var_num]
        # list(...) so the collection does not end up holding an exhausted
        # iterator under Python 3.
        bn_update_infos = list(zip(bn_var_names, bn_outputs, moments))
        add_to_collection(BN_STATISTIC, bn_update_infos)
        update_ops = []
        for n, v, m in bn_update_infos:
            update_op = xdl.ps_apply_moving_average_op(
                var_name=n, value=v, moment=m)
            update_ops.append(update_op)
        add_to_collection(UPDATE_OPS, update_ops)

    if is_training:
        if gear_input_num > 0:
            # Separate the gradients that flow back into gear inputs from
            # the ordinary variable gradients.
            global _GEAR_INPUTS
            sym_names_ = []
            gradients_ = []
            gear_grads = [None] * gear_input_num
            for i in range(len(sym_names)):
                if sym_names[i] not in _GEAR_INPUTS:
                    gradients_.append(gradients[i])
                    sym_names_.append(sym_names[i])
                else:
                    index = _GEAR_INPUTS.index(sym_names[i])
                    gear_grads[index] = gradients[i]
            for i in range(len(gear_inputs)):
                set_gear_gradient(gear_inputs[i], gear_grads[i])
            add_to_collection(GEAR_GRAD, gear_grads, cur_model_scope())
            set_gradients(sym_names_, gradients_, cur_model_scope())
        else:
            set_gradients(sym_names, gradients, cur_model_scope())
    return outputs
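
# Usage sketch (hypothetical; decorator name and signature are assumed from
# the closure variables read above): in XDL this _wrapper is returned by a
# decorator factory such as xdl.mxnet_wrapper, which binds model_func,
# device_type, is_training and init_grad.

import mxnet as mx
import xdl

@xdl.mxnet_wrapper(is_training=True, device_type='cpu')
def model(embedding, label):
    # Dense part of the model as MXNet symbols; `embedding` and `label`
    # are replaced by placeholder symbols before model_func runs.
    fc = mx.sym.FullyConnected(data=embedding, num_hidden=1)
    loss = mx.sym.LogisticRegressionOutput(data=fc, label=label)
    # Returned as a tuple: the wrapper expects a sequence whose first
    # element is the loss.
    return (loss,)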
def _wrapper(*inputs, **kwargs):
    # model_func, device_type, is_training, init_grad and
    # gpu_memory_fraction come from the enclosing decorator's closure.
    add_to_collection(BACKEND_DEVICE_TYPE, device_type.lower())

    # Replace every input with a tf.placeholder; xdl_inputs keeps the XDL
    # tensors that will feed those placeholders at run time.
    model_fn_inputs = []
    xdl_inputs = []
    placeholders = []
    for x in inputs:
        model_input = recursive_make_placeholder(x, xdl_inputs, placeholders)
        model_fn_inputs.append(model_input)

    gear_placeholders = []
    if 'gear_inputs' in kwargs:
        gear_inputs = kwargs['gear_inputs']
        gear_input = recursive_make_placeholder(
            gear_inputs, xdl_inputs, placeholders)
        gear_placeholders = flatten(placeholders[-len(gear_inputs):])
        kwargs['gear_inputs'] = gear_input

    init_grad_placeholder = None
    if init_grad is not None:
        init_grad_placeholder = recursive_make_placeholder(
            init_grad, xdl_inputs, placeholders)

    # Trace the user model; its first output must be the loss.
    targets = model_func(*model_fn_inputs, **kwargs)
    local_init_op_names = [
        x.initializer.name for x in tf.local_variables()]
    if isinstance(targets, tuple):
        targets = list(targets)
    else:
        targets = [targets]

    var_names = []
    gradient_op_names = []
    if is_training:
        loss = targets[0]
        if isinstance(loss, (list, tuple, dict)):
            raise Exception(
                'model function must return loss as first output')
        for gear_placeholder in gear_placeholders:
            add_to_collection(BACKPROP_VARS,
                              ('gear_grad', gear_placeholder))
        var_names, gradient_op_names = add_backprop_ops(
            loss,
            get_collection(BACKPROP_VARS, ['', cur_model_scope()]),
            init_grad_placeholder)

    input_op_names = get_op_names(placeholders)
    target_op_names = get_op_names(targets)
    op_inputs = xdl_inputs
    add_variable_inputs(op_inputs, input_op_names)

    # Run the serialized TF graph as a single XDL op.
    outputs, gradients = xdl.tfbackend_op(
        inputs=list(op_inputs),
        input_op_names=','.join(input_op_names),
        target_op_names=','.join(target_op_names),
        gradient_op_names=','.join(gradient_op_names),
        local_init_op_names=','.join(local_init_op_names),
        graph_def=serialize_graph(),
        target_size=len(target_op_names),
        gradient_size=len(gradient_op_names),
        gpu_memory_fraction=gpu_memory_fraction)

    # The tail of the gradient list belongs to the gear inputs.
    gradients_size = len(gradients)
    gear_size = len(gear_placeholders)
    gear_grads = gradients[gradients_size - gear_size:]
    gradients = gradients[0:gradients_size - gear_size]
    var_names = var_names[0:gradients_size - gear_size]
    if len(gear_grads) > 0:
        add_to_collection(GEAR_GRAD, gear_grads, cur_model_scope())
        for i in range(len(gear_inputs)):
            set_gear_gradient(gear_inputs[i], gear_grads[i])
    if is_training:
        set_gradients(var_names, gradients, cur_model_scope())
    return outputs
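
# A minimal sketch (names hypothetical except where they appear in the
# wrapper itself) of the enclosing decorator factory that supplies the
# closure variables read above: model_func, device_type, is_training,
# init_grad and gpu_memory_fraction.

def tf_wrapper(is_training=True, device_type='cpu', init_grad=None,
               gpu_memory_fraction=0.5):
    def decorator(model_func):
        def _wrapper(*inputs, **kwargs):
            ...  # body as above, reading the factory arguments via its closure
        return _wrapper
    return decorator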
def _wrapper(*inputs, **kwargs):
    # One tf.Graph per model scope, cached across calls.
    global _TF_GRAPH_DICT
    model_scope = cur_model_scope()
    if model_scope not in _TF_GRAPH_DICT:
        _TF_GRAPH_DICT[model_scope] = tf.Graph()
    cur_graph = _TF_GRAPH_DICT[model_scope]
    with cur_graph.as_default():
        set_backend_type('tensorflow')
        add_to_collection(BACKEND_DEVICE_TYPE, device_type.lower())

        # Replace every input with a tf.placeholder; xdl_inputs keeps the
        # XDL tensors that will feed those placeholders at run time.
        model_fn_inputs = []
        xdl_inputs = []
        placeholders = []
        for x in inputs:
            model_input = recursive_make_placeholder(
                x, xdl_inputs, placeholders)
            model_fn_inputs.append(model_input)

        gear_placeholders = []
        if 'gear_inputs' in kwargs:
            gear_inputs = kwargs['gear_inputs']
            gear_input = recursive_make_placeholder(
                gear_inputs, xdl_inputs, placeholders)
            gear_placeholders = flatten(
                placeholders[-len(gear_inputs):])
            kwargs['gear_inputs'] = gear_input

        init_grad_placeholder = None
        if init_grad is not None:
            init_grad_placeholder = recursive_make_placeholder(
                init_grad, xdl_inputs, placeholders)

        # Trace the user model; its first output must be the loss.
        targets = model_func(*model_fn_inputs, **kwargs)
        local_init_op_names = [
            x.initializer.name for x in tf.local_variables()]
        if isinstance(targets, tuple):
            targets = list(targets)
        else:
            targets = [targets]

        # Append batch-normalization statistics as extra targets.
        batchnorm_tensors = tf_batchnorm_hook.get_batchnorm_tensors()
        batchnorm_size = len(batchnorm_tensors)
        targets.extend(batchnorm_tensors)

        # Append trace tensors as extra targets.
        trace_tensors = trace.get_tensors(['tf', 'tf_sparse_assign'])
        trace_size = len(trace_tensors)
        targets.extend(trace_tensors)

        var_names = []
        gradient_op_names = []
        if is_training:
            loss = targets[0]
            if isinstance(loss, (list, tuple, dict)):
                raise Exception(
                    'model function must return loss as first output')
            for gear_placeholder in gear_placeholders:
                add_to_collection(BACKPROP_VARS,
                                  ('gear_grad', gear_placeholder))
            var_names, gradient_op_names = add_backprop_ops(
                loss,
                get_collection(BACKPROP_VARS, ['', model_scope]),
                init_grad_placeholder)

        input_op_names = get_op_names(placeholders)
        target_op_names = get_op_names(targets)
        target_op_types = get_op_types(targets)
        op_inputs = xdl_inputs
        add_variable_inputs(op_inputs, input_op_names)

        # Run the serialized TF graph as a single XDL op.
        outputs, gradients = xdl.tfbackend_op(
            inputs=list(op_inputs),
            output_type=target_op_types,
            input_op_names=','.join(input_op_names),
            target_op_names=','.join(target_op_names),
            gradient_op_names=','.join(gradient_op_names),
            local_init_op_names=','.join(local_init_op_names),
            graph_def=serialize_graph(model_scope=model_scope),
            gradient_size=len(gradient_op_names),
            gpu_memory_fraction=gpu_memory_fraction)

        # The tail of the gradient list belongs to the gear inputs.
        gradients_size = len(gradients)
        gear_size = len(gear_placeholders)
        gear_grads = gradients[gradients_size - gear_size:]
        gradients = gradients[0:gradients_size - gear_size]
        var_names = var_names[0:gradients_size - gear_size]
        if len(gear_grads) > 0:
            add_to_collection(GEAR_GRAD, gear_grads, cur_model_scope())
            for i in range(len(gear_inputs)):
                set_gear_gradient(gear_inputs[i], gear_grads[i])
        if is_training:
            set_gradients(var_names, gradients, cur_model_scope())

        # Hand the trace and batch-norm outputs back to their hooks, then
        # strip them from the user-visible outputs.
        trace_output = [] if trace_size == 0 else outputs[-trace_size:]
        trace.set_values(['tf', 'tf_sparse_assign'], trace_output)
        if batchnorm_size == 0:
            batchnorm_output = []
        elif trace_size != 0:
            batchnorm_output = outputs[
                -(trace_size + batchnorm_size):-trace_size]
        else:
            batchnorm_output = outputs[-batchnorm_size:]
        tf_batchnorm_hook.set_tf_output(batchnorm_output)
        if trace_size == 0 and batchnorm_size == 0:
            return outputs
        return outputs[:-(trace_size + batchnorm_size)]
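
# Usage sketch, assuming the factory above is exposed as xdl.tf_wrapper (the
# pattern used in XDL's public examples): `embedding` and `label` arrive as
# tf.placeholders, and the returned loss drives the backprop ops added above.
# TF1-era API (tf.layers / tf.losses) is assumed.

import tensorflow as tf
import xdl

@xdl.tf_wrapper(is_training=True)
def model(embedding, label):
    # Dense part of the model only; sparse embeddings are computed by XDL
    # on the parameter servers and fed in as a dense tensor.
    logits = tf.layers.dense(embedding, 1)
    loss = tf.losses.sigmoid_cross_entropy(label, logits)
    return loss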