def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
    # we only handle the zero_debias=False case
    assert(not zero_debias)
    var_mapping = get_collection(VAR_MAPPING, cur_model_scope())
    for x in var_mapping:
        if x[1] == variable:
            add_to_collection(BATCHNORM_TENSORS, (x[0], value, decay),
                              cur_model_scope())
    return real_assgin_moving_average(variable, value, decay, zero_debias, name)
def get_gradient(var_name, scope=None):
    scope = scope if scope is not None else cur_model_scope()
    global _VAR_AND_GRADS
    if scope not in _VAR_AND_GRADS or \
       var_name not in _VAR_AND_GRADS[scope]:
        return None
    return _VAR_AND_GRADS[scope][var_name]
def add_variable_inputs(inputs, input_op_names):
    """Append inputs and op names for variables defined in the TF graph."""
    var_mapping = get_collection(VAR_MAPPING, cur_model_scope())
    if var_mapping is None:
        return
    inputs.extend([x[0].value for x in var_mapping])
    input_op_names.extend([x[1].name for x in var_mapping])
def recursive_make_placeholder(x, xdl_inputs, tf_inputs):
    global _PLACEHOLDER_INFOS
    if isinstance(x, dict):
        ret = {}
        for key in x.keys():
            ret[key] = recursive_make_placeholder(x[key], xdl_inputs, tf_inputs)
        return ret
    if isinstance(x, (tuple, list)):
        return [
            recursive_make_placeholder(item, xdl_inputs, tf_inputs)
            for item in x
        ]
    elif isinstance(x, Tensor):
        placeholder = make_placeholder(x)
        model_scope = cur_model_scope()
        if model_scope not in _PLACEHOLDER_INFOS:
            _PLACEHOLDER_INFOS[model_scope] = []
        _PLACEHOLDER_INFOS[model_scope].append(
            PlaceHolderInfo(placeholder, x.shape, x.dtype, x))
        xdl_inputs.append(x)
        tf_inputs.append(placeholder)
        return placeholder
    else:
        return x
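
# A minimal usage sketch (hypothetical tensor names; assumes the inputs are
# XDL Tensor objects): nested dicts/lists are walked recursively, each leaf
# tensor is replaced by a backend placeholder, and the pairing is recorded in
# `xdl_inputs` / `tf_inputs` in matching order, e.g.
#
#   xdl_inputs, tf_inputs = [], []
#   feed = {'user': user_emb, 'ads': [ad_emb1, ad_emb2]}
#   placeholders = recursive_make_placeholder(feed, xdl_inputs, tf_inputs)
#   # `placeholders` mirrors the structure of `feed`;
#   # xdl_inputs[i] corresponds to tf_inputs[i] for every leaf tensor.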
def get_variable(self, name, shape=None, dtype=DataType.float, initializer=None,
                 regularizer=None, reuse=None, trainable=True, collections=None,
                 caching_device=None, partitioner=None, validate_shape=True,
                 use_resource=None, custom_getter=None, constraint=None,
                 **kwargs):
    global _TF_VAR_DICT
    scope = cur_model_scope()
    if scope not in _TF_VAR_DICT:
        _TF_VAR_DICT[scope] = {}
    tf_var_dict = _TF_VAR_DICT[scope]
    if name in tf_var_dict:
        if tf.get_variable_scope().reuse in [True, tf.AUTO_REUSE]:
            return tf_var_dict[name]
        else:
            raise Exception("must set reuse flag to enable reuse")

    def _custom_getter(getter, *args, **kwargs):
        tf_var = getter(*args, **kwargs)
        xdl_var = xdl_variable(
            name=name,
            shape=TF2XDL.convert_shape(shape),
            dtype=TF2XDL.convert_type(dtype),
            scope=scope,
            trainable=True,
            initializer=TF2XDL.convert_initializer(initializer))
        add_to_collection(VAR_MAPPING, (xdl_var, tf_var), scope)
        add_to_collection(BACKPROP_VARS, (name, tf_var), scope)
        tf_var_dict[name] = tf_var
        return tf_var

    return real_get_variable(self, name, shape, dtype, initializer, regularizer,
                             reuse, trainable, collections, caching_device,
                             partitioner, validate_shape, use_resource,
                             _custom_getter, constraint, **kwargs)
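
# Hypothetical sketch of the intended patch point (it is assumed that
# `real_get_variable` holds the original TF implementation saved before
# monkey-patching; the exact patch site may differ in the real module):
#
#   real_get_variable = tf.VariableScope.get_variable
#   tf.VariableScope.get_variable = get_variable
#
# After patching, any tf.get_variable(...) call inside a wrapped model
# function also creates a matching XDL variable and records the
# (xdl_var, tf_var) pair in VAR_MAPPING for gradient routing.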
def add_var_mapping(name, var, scope=None):
    global _VAR_MAPPING
    scope = scope if scope is not None else cur_model_scope()
    if scope not in _VAR_MAPPING:
        _VAR_MAPPING[scope] = {}
    if name in _VAR_MAPPING[scope]:
        raise Exception('duplicate key: %s' % name)
    _VAR_MAPPING[scope][name] = var
def get_var_mapping(key, scope=None):
    global _VAR_MAPPING
    scope = scope if scope is not None else cur_model_scope()
    if scope not in _VAR_MAPPING:
        return None
    var_mapping = _VAR_MAPPING[scope]
    if key in var_mapping:
        return var_mapping[key]
    return None
def __init__(self, hooks=None):
    current_env().sess_start()
    self._hooks = [] if hooks is None else hooks
    reader_hooks = get_collection(READER_HOOKS)
    if reader_hooks is not None:
        self._hooks.extend(reader_hooks)
    self._cur_scope = cur_model_scope()
    self._session = SimpleSession(hooks)
    self._finish = False
def get_scopes(scope):
    scopes = []
    if scope is None:
        scopes.append(cur_model_scope())
    elif isinstance(scope, (list, tuple)):
        scopes = list(scope)
    else:
        scopes = [scope]
    return list(set(scopes))
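
# For example (hypothetical scope names): get_scopes(None) falls back to the
# current model scope, get_scopes('ctr') returns ['ctr'], and
# get_scopes(['ctr', 'ctr', 'cvr']) de-duplicates to ['ctr', 'cvr'];
# ordering is not preserved because a set is used for de-duplication.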
def optimize(self, var_list=None, update_global_step=True):
    if var_list is None:
        var_list = trainable_variables()
    sparse_var_grad = []
    update_ops = []
    shared_vars = set([])
    for var in var_list:
        grad_name = get_var_mapping(var)
        grad_name = grad_name if grad_name is not None else var.name
        grad = get_gradient(grad_name, cur_model_scope())
        if grad is None:
            print("[WARNING]: no gradient found for var:%s under scope:%s" %
                  (var.name, cur_model_scope()), ", maybe not used?")
            continue
        if isinstance(grad, list):
            raise Exception('duplicate grad for var: %s' % var)
        if not is_embedding_var(var):
            update_ops.append(self.dense_update(var, grad))
        else:
            sparse_var_grad.append([var, grad])
    sparse_grads = self.compute_sparse_grad(sparse_var_grad)
    if len(sparse_grads) != len(sparse_var_grad):
        raise Exception("calc grad failed!")
    merged_sparse_grads = self.merge_sparse_grad(
        zip([x[0] for x in sparse_var_grad], sparse_grads))
    if get_collection("sparse_grad") is None:
        add_to_collection("sparse_grad", {})
    sparse_grad_dict = get_collection("sparse_grad")[0]
    for i in range(len(merged_sparse_grads)):
        if not isinstance(merged_sparse_grads[i][1], SparseGrad):
            raise Exception("embedding var must have sparse grads")
        sparse_grad_dict[merged_sparse_grads[i][0].name] = merged_sparse_grads[i][1]
        update_ops.append(
            self.sparse_update(merged_sparse_grads[i][0],
                               merged_sparse_grads[i][1].grad,
                               merged_sparse_grads[i][1].indices))
    if update_global_step:
        update_ops.append(self.update_global_step_op())
    return update_ops
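
# A minimal invocation sketch (assuming the stock XDL optimizer wrappers such
# as xdl.SGD and xdl.TrainSession; the learning rate 0.5 is purely
# illustrative). The wrapper's optimize() call ends up in this method, which
# splits dense and embedding variables and emits one update op per variable:
#
#   train_op = xdl.SGD(0.5).optimize()
#   sess = xdl.TrainSession()
#   sess.run(train_op)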
def set_tf_output(output_tensors):
    res = get_collection(BATCHNORM_TENSORS, cur_model_scope())
    if res is None:
        return
    assert(len(res) == len(output_tensors))
    update_ops = []
    for i in range(len(res)):
        update_op = xdl.ps_apply_moving_average_op(
            var_name=res[i][0].name,
            value=output_tensors[i],
            moment=res[i][2])
        update_ops.append(update_op)
    add_to_collection(UPDATE_OPS, update_ops)
def _wrapper(*inputs, **kwargs):
    add_to_collection(BACKEND_DEVICE_TYPE, device_type.lower())
    sym_input_dict = {}
    placeholders = []
    for x in inputs:
        placeholder = recursive_make_placeholder(x, sym_input_dict)
        placeholders.append(placeholder)
    gear_input_num = 0
    if 'gear_inputs' in kwargs:
        gear_inputs = kwargs['gear_inputs']
        gear_placeholder = recursive_make_placeholder(
            gear_inputs, sym_input_dict, True)
        kwargs['gear_inputs'] = gear_placeholder
        gear_input_num = len(flatten(gear_inputs))
    model_outputs = model_func(*placeholders, **kwargs)
    if len(model_outputs) == 0:
        raise Exception('model_func must return loss')
    symbol_list = list(model_outputs)
    bn_statistic = get_collection(MXNET_BN_STATISTIC)
    bn_var_names = []
    bn_syms = []
    moments = []
    if bn_statistic is not None and len(bn_statistic) > 0:
        bn_var_names.extend([x[0] for x in bn_statistic])
        bn_syms.extend([x[1] for x in bn_statistic])
        moments.extend([x[2] for x in bn_statistic])
        symbol_list.extend([mx.sym.BlockGrad(x) for x in bn_syms])
    symbol = mx.sym.Group(symbol_list)
    executor = symbol.simple_bind(ctx=mx.cpu())
    add_variable_inputs(symbol, sym_input_dict, is_training=is_training)
    sym_names = symbol.list_arguments()
    xdl_inputs = []
    for sym in sym_names:
        xdl_inputs.append(sym_input_dict[sym])
    for aux in symbol.list_auxiliary_states():
        if aux in sym_input_dict:
            xdl_inputs.append(sym_input_dict[aux])
            sym_names.append(aux)
    target_size = len(executor.outputs)
    gradient_size = len(executor.grad_arrays)
    if device_type.lower() == 'cpu':
        outputs, gradients = xdl.mxnet_backend_op(
            inputs=xdl_inputs,
            var_name_str=','.join(sym_names),
            device_type=device_type.lower(),
            graph_def=serialize_graph(symbol),
            target_size=target_size,
            gradient_size=gradient_size if is_training else 0,
            is_training=is_training,
            init_grad=init_grad if init_grad is not None else np.array(
                [], dtype=np.float32),
            has_init_grad=True if init_grad is not None else False)
    else:
        with xdl.device('GPU'):
            outputs, gradients = xdl.mxnet_backend_op(
                inputs=xdl_inputs,
                var_name_str=','.join(sym_names),
                device_type=device_type.lower(),
                graph_def=serialize_graph(symbol),
                target_size=target_size,
                gradient_size=gradient_size if is_training else 0,
                is_training=is_training,
                init_grad=init_grad if init_grad is not None else np.array(
                    [], dtype=np.float32),
                has_init_grad=True if init_grad is not None else False)
    bn_var_num = len(bn_var_names)
    if bn_var_num > 0:
        bn_outputs = outputs[len(outputs) - bn_var_num:]
        outputs = outputs[0:len(outputs) - bn_var_num]
        bn_update_infos = zip(bn_var_names, bn_outputs, moments)
        add_to_collection(BN_STATISTIC, bn_update_infos)
        update_ops = []
        for n, v, m in bn_update_infos:
            update_op = xdl.ps_apply_moving_average_op(var_name=n, value=v,
                                                       moment=m)
            update_ops.append(update_op)
        add_to_collection(UPDATE_OPS, update_ops)
    if is_training:
        sym_names_ = []
        gradients_ = []
        if gear_input_num > 0:
            global _GEAR_INPUTS
            gear_grads = [None] * gear_input_num
            for i in range(len(sym_names)):
                if sym_names[i] not in _GEAR_INPUTS:
                    gradients_.append(gradients[i])
                    sym_names_.append(sym_names[i])
                else:
                    index = _GEAR_INPUTS.index(sym_names[i])
                    gear_grads[index] = gradients[i]
            for i in range(len(gear_inputs)):
                set_gear_gradient(gear_inputs[i], gear_grads[i])
            add_to_collection(GEAR_GRAD, gear_grads, cur_model_scope())
            set_gradients(sym_names_, gradients_, cur_model_scope())
        else:
            set_gradients(sym_names, gradients, cur_model_scope())
    return outputs
def get_batchnorm_tensors():
    res = get_collection(BATCHNORM_TENSORS, cur_model_scope())
    if res is None:
        return []
    return [v[1] for v in res]
def assign_moving_average(self, variable, value, momentum):
    var_mapping = get_collection(VAR_MAPPING, cur_model_scope())
    for x in var_mapping:
        if x[1] == variable:
            add_to_collection(BATCHNORM_TENSORS, (x[0], value, momentum),
                              cur_model_scope())
    return keras_layers_assgin_moving_average(self, variable, value, momentum)
def trainable_variables(scopes=None):
    if scopes is None:
        return trainable_variables_with_scope(['', cur_model_scope()])
    else:
        return trainable_variables_with_scope(scopes)

def global_variables(scopes=None):
    if scopes is None:
        return global_variables_with_scope(['', cur_model_scope()])
    else:
        return global_variables_with_scope(scopes)

def global_initializers(scopes=None):
    if scopes is None:
        return global_initializers_with_scope(['', cur_model_scope()])
    else:
        return global_initializers_with_scope(scopes)

def variable_registers(scopes=None):
    if scopes is None:
        return variable_registers_with_scope(['', cur_model_scope()])
    else:
        return variable_registers_with_scope(scopes)
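
# These four helpers share the same scope-defaulting convention: with no
# argument they look up the global scope '' plus the current model scope.
# A hypothetical illustration (scope names are made up; assumes the
# xdl.model_scope context manager used elsewhere in XDL):
#
#   with xdl.model_scope('ctr'):
#       vars_default = trainable_variables()           # scopes ['', 'ctr']
#   vars_explicit = trainable_variables(['ctr', 'cvr'])  # only the listed scopes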
def _wrapper(*inputs, **kwargs):
    add_to_collection(BACKEND_DEVICE_TYPE, device_type.lower())
    model_fn_inputs = []
    xdl_inputs = []
    placeholders = []
    for x in inputs:
        input = recursive_make_placeholder(x, xdl_inputs, placeholders)
        model_fn_inputs.append(input)
    gear_placeholders = []
    if 'gear_inputs' in kwargs:
        gear_inputs = kwargs['gear_inputs']
        input = recursive_make_placeholder(gear_inputs, xdl_inputs, placeholders)
        gear_placeholders = flatten(placeholders[-len(gear_inputs):])
        kwargs['gear_inputs'] = input
    init_grad_placeholder = None
    if init_grad is not None:
        init_grad_placeholder = recursive_make_placeholder(
            init_grad, xdl_inputs, placeholders)
    targets = model_func(*model_fn_inputs, **kwargs)
    local_init_op_names = [
        x.initializer.name for x in tf.local_variables()
    ]
    if isinstance(targets, tuple):
        targets = list(targets)
    else:
        targets = [targets]
    var_names = []
    gradient_op_names = []
    if is_training:
        loss = targets[0]
        if isinstance(loss, (list, tuple, dict)):
            raise Exception('model function must return loss as first output')
        for gear_placeholder in gear_placeholders:
            add_to_collection(BACKPROP_VARS, ("gear_grad", gear_placeholder))
        var_names, gradient_op_names = add_backprop_ops(
            loss, get_collection(BACKPROP_VARS, ['', cur_model_scope()]),
            init_grad_placeholder)
    input_op_names = get_op_names(placeholders)
    target_op_names = get_op_names(targets)
    op_inputs = xdl_inputs
    add_variable_inputs(op_inputs, input_op_names)
    outputs, gradients = xdl.tfbackend_op(
        inputs=list(op_inputs),
        input_op_names=','.join(input_op_names),
        target_op_names=','.join(target_op_names),
        gradient_op_names=','.join(gradient_op_names),
        local_init_op_names=','.join(local_init_op_names),
        graph_def=serialize_graph(),
        target_size=len(target_op_names),
        gradient_size=len(gradient_op_names),
        gpu_memory_fraction=gpu_memory_fraction)
    gradients_size = len(gradients)
    gear_size = len(gear_placeholders)
    gear_grads = gradients[gradients_size - gear_size:]
    gradients = gradients[0:gradients_size - gear_size]
    var_names = var_names[0:gradients_size - gear_size]
    if len(gear_grads) > 0:
        add_to_collection(GEAR_GRAD, gear_grads, cur_model_scope())
        for i in range(len(gear_inputs)):
            set_gear_gradient(gear_inputs[i], gear_grads[i])
    if is_training:
        set_gradients(var_names, gradients, cur_model_scope())
    return outputs
def _wrapper(*inputs, **kwargs):
    global _TF_GRAPH_DICT
    model_scope = cur_model_scope()
    if model_scope not in _TF_GRAPH_DICT:
        _TF_GRAPH_DICT[model_scope] = tf.Graph()
    cur_graph = _TF_GRAPH_DICT[model_scope]
    with cur_graph.as_default():
        set_backend_type('tensorflow')
        add_to_collection(BACKEND_DEVICE_TYPE, device_type.lower())
        model_fn_inputs = []
        xdl_inputs = []
        placeholders = []
        for x in inputs:
            input = recursive_make_placeholder(x, xdl_inputs, placeholders)
            model_fn_inputs.append(input)
        gear_placeholders = []
        if 'gear_inputs' in kwargs:
            gear_inputs = kwargs['gear_inputs']
            input = recursive_make_placeholder(gear_inputs, xdl_inputs,
                                               placeholders)
            gear_placeholders = flatten(placeholders[-len(gear_inputs):])
            #gear_placeholders = flatten(placeholders[-1:])
            kwargs['gear_inputs'] = input
        init_grad_placeholder = None
        if init_grad is not None:
            init_grad_placeholder = recursive_make_placeholder(
                init_grad, xdl_inputs, placeholders)
        targets = model_func(*model_fn_inputs, **kwargs)
        local_init_op_names = [
            x.initializer.name for x in tf.local_variables()
        ]
        if isinstance(targets, tuple):
            targets = list(targets)
        else:
            targets = [targets]
        # add batch_normalization
        batchnorm_begin = len(targets)
        batchnorm_tensors = tf_batchnorm_hook.get_batchnorm_tensors()
        batchnorm_size = len(batchnorm_tensors)
        targets.extend(batchnorm_tensors)
        # add trace tensors
        trace_tensors = trace.get_tensors(['tf', 'tf_sparse_assign'])
        trace_size = len(trace_tensors)
        targets.extend(trace_tensors)
        var_names = []
        gradient_op_names = []
        if is_training:
            loss = targets[0]
            if isinstance(loss, (list, tuple, dict)):
                raise Exception('model function must return loss as first output')
            for gear_placeholder in gear_placeholders:
                add_to_collection(BACKPROP_VARS, ("gear_grad", gear_placeholder))
            var_names, gradient_op_names = add_backprop_ops(
                loss, get_collection(BACKPROP_VARS, ['', model_scope]),
                init_grad_placeholder)
        input_op_names = get_op_names(placeholders)
        target_op_names = get_op_names(targets)
        target_op_types = get_op_types(targets)
        op_inputs = xdl_inputs
        add_variable_inputs(op_inputs, input_op_names)
        outputs, gradients = xdl.tfbackend_op(
            inputs=list(op_inputs),
            output_type=target_op_types,
            input_op_names=','.join(input_op_names),
            target_op_names=','.join(target_op_names),
            gradient_op_names=','.join(gradient_op_names),
            local_init_op_names=','.join(local_init_op_names),
            graph_def=serialize_graph(model_scope=model_scope),
            gradient_size=len(gradient_op_names),
            gpu_memory_fraction=gpu_memory_fraction)
        gradients_size = len(gradients)
        gear_size = len(gear_placeholders)
        gear_grads = gradients[gradients_size - gear_size:]
        gradients = gradients[0:gradients_size - gear_size]
        var_names = var_names[0:gradients_size - gear_size]
        if len(gear_grads) > 0:
            add_to_collection(GEAR_GRAD, gear_grads, cur_model_scope())
            for i in range(len(gear_inputs)):
                set_gear_gradient(gear_inputs[i], gear_grads[i])
        if is_training:
            set_gradients(var_names, gradients, cur_model_scope())
        # set trace output
        trace_output = [] if trace_size == 0 else outputs[-trace_size:]
        trace.set_values(['tf', 'tf_sparse_assign'], trace_output)
        if batchnorm_size == 0:
            batchnorm_output = []
        elif trace_size != 0:
            batchnorm_output = outputs[-(trace_size + batchnorm_size):-trace_size]
        else:
            batchnorm_output = outputs[-(trace_size + batchnorm_size):]
        tf_batchnorm_hook.set_tf_output(batchnorm_output)
        return outputs if (trace_size == 0 and batchnorm_size == 0) else \
            outputs[:-(trace_size + batchnorm_size)]
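
# A minimal end-to-end sketch of how a model function is typically wrapped
# (assuming the public decorator is xdl.tf_wrapper as in the XDL examples;
# the `deep` and `label` arguments and layer sizes are illustrative only):
#
#   @xdl.tf_wrapper(is_training=True)
#   def model(deep, label):
#       fc = tf.layers.dense(deep, 1)
#       return tf.losses.sigmoid_cross_entropy(label, fc)
#
#   loss = model(dense_input, labels)   # executes this _wrapper under the hood
#   train_op = xdl.SGD(0.5).optimize()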