def calc_dygraph_grad(self, place):
    self.program_desc, self.fwd_op_num = self.get_program_desc()
    self.attrs = self.prepare_attrs()
    self.attrs['program_id'] = _hash_with_id(self.program_desc)

    with fluid.dygraph.guard(place):
        # Step 1. run forward
        inputs, input_param_list = self.prepare_dygraph_input(place, True)
        outputs = self.prepare_dygraph_output()

        framework._dygraph_tracer().trace_op(type=self.op_type,
                                             inputs=inputs,
                                             outputs=outputs,
                                             attrs=self.attrs)

        for param in input_param_list:
            var_type = self._get_grad_vartype(param.name)
            if var_type is None:
                continue
            param._set_grad_type(var_type)

        # Step 2. run backward
        # NOTE: in unittest, only support single output now
        actual_outs = outputs['Out']
        assert len(actual_outs) == 1
        actual_outs[0].backward()

        # Step 3. prepare grads
        grads = []
        for param in input_param_list:
            grad = param.gradient()
            grads.append(grad)
        return grads

def _load_state_dict_from_save_params(model_path):
    # Try to load all the files in the directory in VarBase format,
    # the file name is used as the name of VarBase
    load_var_list = []

    # 1. load file names
    var_name_list = []
    for root, _, files in os.walk(model_path):
        for filename in files:
            file_path = os.path.join(root, filename)
            tmp_var_name = os.path.relpath(file_path, model_path)
            var_name = tmp_var_name.replace("\\", "/")
            var_name_list.append(var_name)

    # 2. create and load VarBase
    with fluid.dygraph.guard():
        for name in var_name_list:
            new_var = _varbase_creator(name=name, persistable=True)
            _dygraph_tracer().trace_op(
                type='load',
                inputs={},
                outputs={'Out': new_var},
                attrs={'file_path': os.path.join(model_path, name)})
            load_var_list.append(new_var)

    # 3. construct state_dict
    load_param_dict = dict()
    for var in load_var_list:
        load_param_dict[var.name] = var.numpy()

    return load_param_dict

def _split_tensors(coalesced_grads_and_grad_vars):
    if _in_legacy_dygraph():
        for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
            grad_var_len = [np.prod(g_shape) for g_shape in grad_shapes]
            framework._dygraph_tracer().trace_op(
                type='split',
                inputs={'X': coalesced_grad},
                outputs={'Out': origin_grad_vars},
                attrs={
                    'sections': grad_var_len,
                    'axis': 0
                })
            for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
                _reshape_inplace(x=g_var, shape=g_shape)
                assert g_var.shape == g_shape
    elif in_dygraph_mode():
        for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
            grad_var_len = [np.prod(g_shape) for g_shape in grad_shapes]
            attrs = ()
            attrs += ('sections', grad_var_len)
            attrs += ('axis', 0)
            _C_ops.split(coalesced_grad, origin_grad_vars, *attrs)
            for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
                g_var.reshape_(shape=g_shape)
                assert g_var.shape == g_shape

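# Editor's note: a minimal numpy-only sketch (an assumption for illustration,
# not part of the original code) of what _split_tensors undoes: gradients that
# were coalesced into one flat buffer are sliced back out by element count and
# reshaped to their original shapes.
import numpy as np

grad_shapes = [(2, 3), (4,), (2, 2)]
flat = np.arange(sum(int(np.prod(s)) for s in grad_shapes), dtype=np.float32)

sections = [int(np.prod(s)) for s in grad_shapes]
pieces = np.split(flat, np.cumsum(sections)[:-1])
restored = [p.reshape(s) for p, s in zip(pieces, grad_shapes)]
for r, s in zip(restored, grad_shapes):
    assert r.shape == s
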
def _load_persistable_vars_by_program(model_path,
                                      program_holder,
                                      params_filename=None):
    # make sure the path has been checked
    persistable_vars = _get_persistable_vars(program_holder.infer_program)
    load_var_dict = {}
    for each_var in persistable_vars:
        orig_each_name = program_holder._suffix_varname_dict[each_var.name()]
        if _is_parameter(each_var, program_holder.infer_program):
            # create output varbase
            new_var = framework.ParamBase(shape=each_var.shape(),
                                          dtype=each_var.dtype(),
                                          name=each_var.name(),
                                          type=each_var.type(),
                                          persistable=True)
        else:
            new_var = framework._varbase_creator(type=each_var.type(),
                                                 name=each_var.name(),
                                                 shape=each_var.shape(),
                                                 dtype=each_var.dtype(),
                                                 persistable=True)
        if params_filename is None:
            framework._dygraph_tracer().trace_op(
                type='load',
                inputs={},
                outputs={'Out': new_var},
                attrs={'file_path': os.path.join(model_path, orig_each_name)})
        new_var.stop_gradient = False
        load_var_dict[each_var.name()] = new_var

    if params_filename is not None:
        load_var_list = []
        for name in sorted(load_var_dict.keys()):
            load_var_list.append(load_var_dict[name])
        framework._dygraph_tracer().trace_op(
            type='load_combine',
            inputs={},
            outputs={'Out': load_var_list},
            attrs={'file_path': os.path.join(model_path, params_filename)})

        for each_var in persistable_vars:
            if not _is_parameter(each_var, program_holder.infer_program):
                continue
            param = load_var_dict[each_var.name()]
            param.stop_gradient = False

    # NOTE: [Recovery stop gradient information based on the program]
    # After loading the model, the stop_gradient information
    # of the original variable is lost, but if a parameter does not
    # have a corresponding @GRAD variable in the backward program,
    # it can be said that it is also stop_gradient
    all_var_names = _get_all_var_names(program_holder.train_program)
    for var_name in load_var_dict:
        grad_var_name = var_name + core.grad_var_suffix()
        if grad_var_name not in all_var_names:
            load_var_dict[var_name].stop_gradient = True

    return load_var_dict

def _reshape_inplace(x, shape):
    x_shape = framework._varbase_creator(dtype=x.dtype)
    framework._dygraph_tracer().trace_op(type="reshape2",
                                         inputs={'X': x},
                                         outputs={
                                             'Out': x,
                                             'XShape': x_shape
                                         },
                                         attrs={'shape': shape})

def _inplace_reshape_dygraph(x, shape):
    x_shape = _varbase_creator(dtype=x.dtype)
    _dygraph_tracer().trace_op(type="reshape2",
                               inputs={'X': x},
                               outputs={
                                   'Out': x,
                                   'XShape': x_shape
                               },
                               attrs={'shape': shape},
                               stop_gradient=True)

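# Editor's note: a hedged sketch of the in-place reshape that the helper above
# performs through the reshape2 op, expressed with the public Tensor.reshape_
# API; the tensor and shapes below are illustrative assumptions.
import paddle

x = paddle.rand([2, 3])
x.reshape_([6])     # reshapes x in place; no new output tensor is created
print(x.shape)      # [6]
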
def calc_dygraph_output(self, place):
    with fluid.dygraph.guard(place):
        inputs = self.prepare_dygraph_input(place)
        outputs = self.prepare_dygraph_output()

        framework._dygraph_tracer().trace_op(type=self.op_type,
                                             inputs=inputs,
                                             outputs=outputs,
                                             attrs=self.attrs)
        return outputs['Out']

def vector_to_parameters(vec, parameters, name=None):
    """
    Transform a 1-D Tensor to the input ``parameters`` .

    Args:
        vec (Tensor): A 1-D Tensor, which will be sliced and copied to the input ``parameters`` .
        parameters (Iterable[Tensor]): Iterable Tensors that are trainable parameters of a Layer.
        name(str, optional): The default value is None. Normally there is no need for user to set this
            property. For more information, please refer to :ref:`api_guide_Name`.

    Examples:
        .. code-block:: python

            import paddle
            weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(3.))
            linear1 = paddle.nn.Linear(10, 15, weight_attr)

            vec = paddle.nn.utils.parameters_to_vector(linear1.parameters())

            linear2 = paddle.nn.Linear(10, 15)
            # copy weight of linear1 to linear2
            paddle.nn.utils.vector_to_parameters(vec, linear2.parameters())
            # weight: Tensor(shape=[10, 15], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
            #                 [[3. , ..., 3. ],
            #                  [..., ..., ...],
            #                  [3. , ..., 3. ]])
    """
    origin_shapes = []
    sections = []
    for param in parameters:
        shape = param.shape
        origin_shapes.append(shape)
        numel = reduce(lambda x, y: x * y, shape)
        sections.append(numel)

    if in_dygraph_mode():
        with paddle.fluid.dygraph.no_grad():
            res = [_varbase_creator() for n in range(len(parameters))]
            _C_ops.split(vec, res, 'axis', 0, 'sections', sections)
            for i in range(0, len(res)):
                res[i]._share_underline_tensor_to(parameters[i])
    else:
        _dygraph_tracer().trace_op(type='split',
                                   inputs={'X': [vec]},
                                   outputs={'Out': parameters},
                                   attrs={
                                       'axis': 0,
                                       'sections': sections
                                   },
                                   stop_gradient=True)

    for i, param in enumerate(parameters):
        _inplace_reshape_dygraph(param, origin_shapes[i])
    return

def _split_tensors(coalesced_grads_and_grad_vars):
    for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
        grad_var_len = [np.prod(g_shape) for g_shape in grad_shapes]
        framework._dygraph_tracer().trace_op(type='split',
                                             inputs={'X': coalesced_grad},
                                             outputs={'Out': origin_grad_vars},
                                             attrs={
                                                 'sections': grad_var_len,
                                                 'axis': 0
                                             })
        for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
            _reshape_inplace(x=g_var, shape=g_shape)
            assert g_var.shape == g_shape

def calc_dygraph_output(self, place):
    self.program_desc, self.fwd_op_num = self.get_program_desc()
    self.attrs = self.prepare_attrs()

    with fluid.dygraph.guard(place):
        inputs = self.prepare_dygraph_input(place)
        outputs = self.prepare_dygraph_output()

        framework._dygraph_tracer().trace_op(type=self.op_type,
                                             inputs=inputs,
                                             outputs=outputs,
                                             attrs=self.attrs)
        return outputs['Out']

def forward(ctx, run_function, preserve_rng_state, *args):
    if framework._dygraph_tracer()._has_grad:
        check_recompute_necessary(args)

    # store for recomputing
    ctx.run_function = run_function
    ctx.preserve_rng_state = preserve_rng_state

    # NOTE: the number of outputs of backward() should equal the number of
    # tensors in forward()'s input, and the order of tensors in backward()'s
    # output should match the order of tensors in forward()'s input.
    # None tensor inputs will be filtered out of the backward inputs.

    # save input for backward
    ctx.inputs = []
    ctx.tensor_indices = []
    tensor_inputs = []
    for i, arg in enumerate(args):
        if paddle.is_tensor(arg):
            tensor_inputs.append(arg)
            ctx.tensor_indices.append(i)
            ctx.inputs.append(None)
        else:
            ctx.inputs.append(arg)
    ctx.save_for_backward(*tensor_inputs)

    # NOTE: recompute with RNG state restore only supports the scenario of one
    # process per CUDA GPU; one process with multiple GPUs and mixed GPU/CPU
    # scenarios are not supported.
    if ctx.preserve_rng_state:
        cur_device = paddle.get_device()
        if 'gpu:' not in cur_device:
            raise RuntimeError(
                "Recompute with RNG preserve does not support the current device: {}."
                .format(cur_device))
        ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state()

    # TODO: support AMP
    tracer = framework._dygraph_tracer()
    ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True
    if tracer._amp_level == core.AmpLevel.O2:
        ctx.amp_level = 'O2'
    elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0):
        ctx.amp_level = 'O1'
    else:
        raise ValueError("unsupported amp level: {}".format(
            tracer._amp_level))
    ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()

    with paddle.no_grad():
        outputs = run_function(*args)
    return outputs

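# Editor's note: a hedged usage sketch for the recompute forward above, using
# the public paddle.distributed.fleet.utils.recompute API. The layer, shapes,
# and names are illustrative assumptions; with the default RNG preservation it
# assumes a CUDA device, since the forward above captures the CUDA RNG state.
import paddle
from paddle.distributed.fleet.utils import recompute

class Block(paddle.nn.Layer):
    def __init__(self):
        super(Block, self).__init__()
        self.linear = paddle.nn.Linear(8, 8)

    def forward(self, x):
        # activations of this block are dropped and recomputed during backward
        return recompute(self.linear, x)

x = paddle.randn([4, 8])
x.stop_gradient = False
y = Block()(x)
y.sum().backward()
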
def eval(self):
    """
    Sets this Layer and all its sublayers to evaluation mode.
    This only affects certain modules like `Dropout` and `BatchNorm`.

    Returns:
        None
    """
    # global setting
    framework._dygraph_tracer().eval_mode()
    # Layer-level setting
    self.training = False
    for layer in self.sublayers():
        layer.eval()

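# Editor's note: a small hedged example of the eval()/train() switch documented
# above; the layer and input are illustrative assumptions. Dropout only zeroes
# activations in training mode and passes the input through in eval mode.
import paddle

layer = paddle.nn.Dropout(p=0.5)
x = paddle.ones([4, 4])

layer.eval()            # evaluation mode: dropout is a pass-through
out_eval = layer(x)

layer.train()           # training mode: roughly half the entries are zeroed
out_train = layer(x)
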
def _inplace_reshape_dygraph(x, shape):
    x_shape = _varbase_creator(dtype='int64')
    if in_dygraph_mode():
        with paddle.fluid.dygraph.no_grad():
            tmp_out, _ = _C_ops.reshape2(x, None, 'shape', shape)
            tmp_out._share_underline_tensor_to(x)
    else:
        _dygraph_tracer().trace_op(type="reshape2",
                                   inputs={'X': x},
                                   outputs={
                                       'Out': x,
                                       'XShape': x_shape
                                   },
                                   attrs={'shape': shape},
                                   stop_gradient=True)

def set_grad_enabled(mode):
    """
    Create a context which enables or disables dygraph gradient calculation.

    Args:
        mode(bool): whether to enable (`True`), or disable (`False`) grad.

    Examples:
        .. code-block:: python

            import paddle
            x = paddle.ones([3, 2])
            x.stop_gradient = False
            with paddle.set_grad_enabled(False):
                y = x * 2
                with paddle.set_grad_enabled(True):
                    z = x * 2
            print(y.stop_gradient)   # True
            print(z.stop_gradient)   # False
    """
    tracer = _dygraph_tracer()
    if tracer:
        prev_mode = tracer._has_grad
        tracer._has_grad = mode
        try:
            yield
        finally:
            tracer._has_grad = prev_mode
    else:
        yield

def __call__(cls, *inputs):
    tracer = framework._dygraph_tracer()
    block = framework.default_main_program().current_block()
    ivar_inputs = [x._ivar for x in inputs]

    if not hasattr(cls, 'forward_id'):
        cls.forward_id = core.PyLayer.num_funcs() + 1
        PyLayer.register_func(cls.forward_id, cls._do_forward)
        cls.backward_id = core.PyLayer.num_funcs() + 1
        PyLayer.register_func(cls.backward_id, cls._do_backward)

    iop = core.OpBase(cls.__class__.__name__ + str(cls.forward_id))
    iop.forward_id = cls.forward_id
    iop.backward_id = cls.backward_id
    block.ops.append(iop)
    ivars = tracer.py_trace(iop, ivar_inputs, False)
    ret = []
    for ivar in ivars:
        tensor = ivar.value().get_tensor()
        py_var = framework.Variable(block,
                                    type=core.VarDesc.VarType.LOD_TENSOR,
                                    name=None,
                                    shape=tensor.shape(),
                                    dtype=tensor._dtype(),
                                    ivar=ivar)
        ret.append(py_var)
    return ret

def _trace(layer,
           inputs,
           feed_prefix='feed_',
           fetch_prefix='fetch_',
           tmp_prefix='t_'):
    assert isinstance(layer, Layer)

    if not isinstance(inputs, (list, tuple)):
        inputs = [inputs]

    tracer = _dygraph_tracer()._get_program_desc_tracer()

    var_list = extract_vars(inputs)

    with program_desc_tracing_guard(True):
        original_outputs = layer(*inputs)
        if not isinstance(original_outputs, (list, tuple)):
            outputs = [original_outputs]
        else:
            outputs = original_outputs
        out_vars = [var for var in outputs]

        program_desc, feed_names, fetch_names, parameters = tracer.create_program_desc(
            var_list, feed_prefix, out_vars, fetch_prefix, tmp_prefix)
        tracer.reset()

    with _dygraph_guard(None):
        program = create_program_from_desc(program_desc)

    return original_outputs, program, feed_names, fetch_names, parameters

def forward(self, *inputs, **kwargs):
    outputs = self._layers(*inputs, **kwargs)
    if (self._strategy.nranks > 1
            and framework._dygraph_tracer()._has_grad
            and self.grad_need_sync):
        self._reducer.prepare_for_backward(list(self._find_varbase(outputs)))
    return outputs

def func_main(self):
    paddle.disable_static()

    self.tracer = framework._dygraph_tracer()
    self.tracer._train_mode = True

    self.assertEqual(self.no_grad_func(1), 1)
    self.assertEqual(self.no_grad_func.__name__, "no_grad_func")

    def need_no_grad_func(a, b=1):
        return a + b

    decorated_func = paddle.no_grad()(need_no_grad_func)
    self.assertEqual(str(inspect.getfullargspec(decorated_func)),
                     str(inspect.getfullargspec(need_no_grad_func)))

    def test_gen():
        for i in range(3):
            yield i

    a = 0
    for i in test_gen():
        a += i

    @paddle.no_grad()
    def test_wrapped_gen():
        for i in range(3):
            yield i

    b = 0
    for i in test_wrapped_gen():
        b += i

    self.assertEqual(a, b)

def __init__(self,
             main_program,
             inputs,
             outputs,
             parameters=None,
             **kwargs):
    super(PartialProgramLayer, self).__init__()
    self._inputs = NestSequence(inputs)
    self._outputs = NestSequence(outputs, need_check=True)
    self._params = parameters if parameters is not None else []

    self._build_strategy = kwargs.get('build_strategy', BuildStrategy())
    assert isinstance(self._build_strategy, BuildStrategy)

    self._origin_main_program = self._verify_program(main_program)
    self._tmp_scope_vec = self._create_scope_vec()
    self._cuda_graph_vec = self._create_cuda_graph_vec()
    self._cuda_graph_capture_mode = ""
    self._cuda_graph_pool_id = 0
    # Set default mode to train
    self.training = True

    custom_white_list, custom_black_list = None, None
    tracer = framework._dygraph_tracer()
    if tracer:
        custom_white_list, custom_black_list = tracer._get_amp_op_list()
    # For AMP training
    self._amp_list = AutoMixedPrecisionLists(
        custom_white_list=custom_white_list,
        custom_black_list=custom_black_list)

def test_main(self):
    with fluid.dygraph.guard():
        self.tracer = framework._dygraph_tracer()
        self.tracer._train_mode = self.init_mode
        self.assertEqual(self.no_grad_func(1), 1)
        self.assertEqual(self.tracer._train_mode, self.init_mode)

def program_desc_tracing_guard(enable):
    tracer = framework._dygraph_tracer()
    if tracer:
        original_val = tracer._enable_program_desc_tracing
        tracer._enable_program_desc_tracing = enable
    yield
    if tracer:
        tracer._enable_program_desc_tracing = original_val

def _switch_tracer_mode_guard_(is_train=True):
    tracer = framework._dygraph_tracer()
    if tracer:
        mode = tracer._train_mode
        tracer._train_mode = is_train
        yield
        tracer._train_mode = mode
    else:
        yield

def parameters_to_vector(parameters, name=None):
    """
    Flatten parameters to a 1-D Tensor.

    Args:
        parameters(Iterable[Tensor]): Iterable Tensors that are trainable parameters of a Layer.
        name(str, optional): The default value is None. Normally there is no need for user to set this
            property. For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        A 1-D Tensor, which represents the parameters of a Layer.

    Examples:
        .. code-block:: python

            import paddle
            linear = paddle.nn.Linear(10, 15)

            paddle.nn.utils.parameters_to_vector(linear.parameters())
            # 1-D Tensor: [165]
    """
    dtype = parameters[0].dtype
    origin_shapes = []
    for param in parameters:
        origin_shapes.append(param.shape)
        _inplace_reshape_dygraph(param, [-1])

    out = _varbase_creator(dtype=dtype)
    if in_dygraph_mode():
        with paddle.fluid.dygraph.no_grad():
            tmp = _varbase_creator()
            _C_ops.concat(parameters, tmp, 'axis', 0)
            tmp._share_underline_tensor_to(out)
    else:
        _dygraph_tracer().trace_op(type='concat',
                                   inputs={'X': parameters},
                                   outputs={'Out': [out]},
                                   attrs={'axis': 0},
                                   stop_gradient=True)

    for i, param in enumerate(parameters):
        _inplace_reshape_dygraph(param, origin_shapes[i])
    return out

def backward(ctx, *args):
    with paddle.fluid.dygraph.guard():
        # Restore inputs
        inputs = list(ctx.inputs)
        tensor_indices = ctx.tensor_indices
        tensor_shapes = ctx.tensor_shapes
        tensors = list(ctx.saved_tensor())

        device_id = paddle.distributed.ParallelEnv().device_id
        for i, idx in enumerate(tensor_indices):
            if _recompute_partition:
                state = tensors[i].stop_gradient
                tensors[i] = _merge_activation(
                    tensors[i]).detach().reshape_(tensor_shapes[i])
                tensors[i].stop_gradient = state
            inputs[idx] = tensors[i].cuda(
                device_id) if _recompute_offload else tensors[i]

        tracer = framework._dygraph_tracer()
        tracer._has_grad = True

        # need to restore the auto_cast state as well as the white/black op lists
        with swith_rng_state_tracker(ctx.fwd_cuda_rng_state,
                                     ctx.fwd_cuda_rng_state_tracker):
            with paddle.amp.auto_cast(enable=ctx.is_fw_autocast,
                                      custom_white_list=ctx.amp_white_list,
                                      custom_black_list=ctx.amp_black_list,
                                      level=ctx.amp_level):
                detached_inputs = detach_variable(tuple(inputs))
                outputs = ctx.run_function(*detached_inputs)

        if isinstance(outputs, core.eager.Tensor):
            outputs = (outputs, )
        assert len(outputs) == len(args)

        forward_outputs_with_grad = []
        backward_inputs = []
        for i in range(len(outputs)):
            if isinstance(
                    outputs[i],
                    core.eager.Tensor) and not outputs[i].stop_gradient:
                forward_outputs_with_grad.append(outputs[i])
                backward_inputs.append(args[i])

        if len(forward_outputs_with_grad) == 0:
            raise RuntimeError(
                "none of the outputs has stop_gradient=False, this recompute() is not necessary"
            )

        # actually run backward
        paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
        grads = tuple(inp._grad_ivar() for inp in detached_inputs
                      if isinstance(inp, core.eager.Tensor))
        return grads

def _switch_tracer_mode_guard_(is_train=True):
    tracer = framework._dygraph_tracer()
    if tracer:
        has_grad = tracer._has_grad
        tracer._has_grad = is_train
        try:
            yield
        finally:
            tracer._has_grad = has_grad
    else:
        yield

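# Editor's note: a hedged sketch of the public behaviour this guard backs:
# paddle.no_grad() flips the tracer's _has_grad flag, so ops traced inside it
# do not record a backward graph. The tensors below are illustrative only.
import paddle

x = paddle.ones([2, 2])
x.stop_gradient = False

with paddle.no_grad():
    y = x * 2           # no gradient is tracked for this op
z = x * 2               # gradient is tracked again outside the guard

print(y.stop_gradient)  # True
print(z.stop_gradient)  # False
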
def train(self):
    """
    Sets this Layer and all its sublayers to training mode.
    This only affects certain modules like `Dropout` and `BatchNorm`.

    Returns:
        None

    Example::
        .. code-block:: python

            import paddle

            class MyLayer(paddle.nn.Layer):
                def __init__(self):
                    super(MyLayer, self).__init__()
                    self._linear = paddle.nn.Linear(1, 1)
                    self._dropout = paddle.nn.Dropout(p=0.5)

                def forward(self, input):
                    temp = self._linear(input)
                    temp = self._dropout(temp)
                    return temp

            x = paddle.randn([10, 1], 'float32')
            mylayer = MyLayer()
            mylayer.eval()  # set mylayer._dropout to eval mode
            out = mylayer(x)
            mylayer.train()  # set mylayer._dropout to train mode
            out = mylayer(x)
    """
    # global setting in dygraph
    # NOTE(chenweihang): nn.Layer also can be used in static mode,
    # but _dygraph_tracer() can not be called in static mode
    if in_dygraph_mode():
        framework._dygraph_tracer().train_mode()
    # Layer-level setting
    self.training = True
    for layer in self.sublayers():
        layer.train()

def _in_amp_guard():
    """
    Judge whether the current code block is under an `amp_guard` context.
    """
    tracer = _dygraph_tracer()
    if tracer:
        if tracer._amp_level == core.AmpLevel.O1:
            return True
        else:
            return False
    else:
        return False

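# Editor's note: a hedged example of the auto-cast context that _in_amp_guard
# detects, using the public paddle.amp.auto_cast API at level O1; the model and
# input are illustrative assumptions, and on CPU the cast may simply be a no-op.
import paddle

model = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])

with paddle.amp.auto_cast(level='O1'):  # O1: listed ops run in float16
    out = model(x)
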
def forward(self, inputs):
    in_vars, out_vars, tmp_scope_vec = self._prepare(inputs)

    framework._dygraph_tracer().trace_op(
        type='run_program',
        inputs={
            'X': valid_vars(in_vars),
            'Params': valid_vars(self._params)
        },
        outputs={
            'Out': valid_vars(out_vars),
            'OutScope': tmp_scope_vec
        },
        attrs={
            'global_block': self.program.desc.block(0),
            'start_op_index': 0,
            'end_op_index': self._infer_program.desc.block(0).op_size(),
            'is_test': not self.training
        })

    restored_nest_out = self._restore_out(out_vars)
    return self._remove_no_value(restored_nest_out)

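# Editor's note: a hedged sketch of the kind of user code that ends up running
# a whole traced program through a single run_program op like the forward
# above: paddle.jit.to_static converts the Python function into a program that
# is then replayed in dygraph. The function and input are illustrative only.
import paddle

@paddle.jit.to_static
def double(x):
    return x * 2

out = double(paddle.ones([2, 2]))
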
def test_main(self):
    with fluid.dygraph.guard():
        self.tracer = framework._dygraph_tracer()
        self.tracer._train_mode = self.init_mode
        self.assertEqual(self.no_grad_func(1), 1)
        self.assertEqual(self.tracer._train_mode, self.init_mode)

    with fluid.dygraph.guard():
        self.check_not_support_rlt(False)

    with new_program_scope():
        self.check_not_support_rlt(True)

def forward(self, inputs):
    in_vars, out_vars, tmp_scope_vec = self._prepare(inputs)

    framework._dygraph_tracer().trace_op(
        type='run_program',
        inputs={
            'X': valid_vars(in_vars),
            'Params': valid_vars(self._params)
        },
        outputs={
            'Out': valid_vars(out_vars),
            'OutScope': tmp_scope_vec
        },
        attrs={
            'global_block': self._trace_program.desc.block(0),
            'start_op_index': 0,
            'end_op_index': self._infer_program.desc.block(0).op_size(),
            'is_test': not self.training
        })

    outs = out_vars
    if len(outs) == 1:
        outs = outs[0]
    return outs