def test_deep_copy(self):
    """Exercise copy.deepcopy on VarBase: empty vars, leaf tensors with a
    shared memo dict, non-leaf tensors (expected to raise), and
    SELECTED_ROWS variables."""
    with fluid.dygraph.guard():
        # Deep-copying an empty VarBase preserves its metadata.
        empty_var = core.VarBase()
        empty_var_copy = copy.deepcopy(empty_var)
        self.assertEqual(empty_var.stop_gradient,
                         empty_var_copy.stop_gradient)
        self.assertEqual(empty_var.persistable, empty_var_copy.persistable)
        self.assertEqual(empty_var.type, empty_var_copy.type)
        self.assertEqual(empty_var.dtype, empty_var_copy.dtype)

        x = paddle.to_tensor([2.], stop_gradient=False)
        y = paddle.to_tensor([3.], stop_gradient=False)
        z = x * y  # non-leaf tensor; deepcopy on it must fail (see below)
        memo = {}
        x_copy = copy.deepcopy(x, memo)
        y_copy = copy.deepcopy(y, memo)

        # Copies keep the originals' flags, metadata and data...
        self.assertEqual(x_copy.stop_gradient, y_copy.stop_gradient)
        self.assertEqual(x_copy.persistable, y_copy.persistable)
        self.assertEqual(x_copy.type, y_copy.type)
        self.assertEqual(x_copy.dtype, y_copy.dtype)
        self.assertTrue(np.array_equal(x.numpy(), x_copy.numpy()))
        self.assertTrue(np.array_equal(y.numpy(), y_copy.numpy()))

        # ...but are distinct objects: mutating the copy must leave the
        # original untouched.
        self.assertNotEqual(id(x), id(x_copy))
        x_copy[:] = 5.
        self.assertTrue(np.array_equal(x_copy.numpy(), [5.]))
        self.assertTrue(np.array_equal(x.numpy(), [2.]))

        # Deep-copying a tensor that participates in an autograd graph
        # (non-leaf) is unsupported and raises RuntimeError.
        with self.assertRaises(RuntimeError):
            copy.deepcopy(z)

        # Re-using the same memo dict must return the already-created
        # copies (standard copy.deepcopy memoization contract).
        x_copy2 = copy.deepcopy(x, memo)
        y_copy2 = copy.deepcopy(y, memo)
        self.assertEqual(id(x_copy), id(x_copy2))
        self.assertEqual(id(y_copy), id(y_copy2))

        # test copy selected rows
        x = core.VarBase(core.VarDesc.VarType.FP32, [3, 100],
                         "selected_rows",
                         core.VarDesc.VarType.SELECTED_ROWS, True)
        selected_rows = x.value().get_selected_rows()
        selected_rows.get_tensor().set(np.random.rand(3, 100),
                                       core.CPUPlace())
        selected_rows.set_height(10)
        selected_rows.set_rows([3, 5, 7])

        x_copy = copy.deepcopy(x)
        self.assertEqual(x_copy.stop_gradient, x.stop_gradient)
        self.assertEqual(x_copy.persistable, x.persistable)
        self.assertEqual(x_copy.type, x.type)
        self.assertEqual(x_copy.dtype, x.dtype)

        # The copied SELECTED_ROWS must replicate height, row indices and
        # the underlying tensor contents.
        copy_selected_rows = x_copy.value().get_selected_rows()
        self.assertEqual(copy_selected_rows.height(), selected_rows.height())
        self.assertEqual(copy_selected_rows.rows(), selected_rows.rows())
        self.assertTrue(
            np.array_equal(np.array(copy_selected_rows.get_tensor()),
                           np.array(selected_rows.get_tensor())))
def _add_grad_as_view(self, param, align):
    """Redirect *param*'s gradient to a slice of this InternalStorage buffer.

    Advances ``self._fill`` by ``numel(param) + align`` so consecutive calls
    pack gradients back-to-back (with alignment padding) into ``self.buffer``.

    Args:
        param: trainable parameter whose gradient should alias the buffer.
        align: padding (in elements) reserved after the gradient slice.
    """
    assert np.prod(
        self.buffer.shape
    ) > 0, "Cannot add a gradient to a released InternalStorage, please rebuild"
    assert param.dtype == self.buffer.dtype

    grad_end = self._fill + np.prod(param.shape)
    offset = grad_end + align
    assert offset <= np.prod(self.buffer.shape)

    # Copy the current grad value to InternalStorage
    dev_id = 0 if paddle.get_device() == "cpu" else int(
        paddle.get_device().split(":")[1])
    if self._device == "cpu":
        with device_guard(dev_id, self._device):
            # tmp_var wraps buffer[_fill:grad_end] (presumably a zero-copy
            # view -- TODO confirm _slice semantics); the param's gradient
            # is copied into it, then the temporary holder is cleared.
            tmp_var = core.VarBase(self.buffer._slice(
                self._fill, grad_end))
            param._copy_gradient_from(tmp_var)
            tmp_var.value().get_tensor()._clear()
    elif self._device == "gpu":
        # Same steps as the CPU branch, but without the device guard.
        tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
        param._copy_gradient_from(tmp_var)
        tmp_var.value().get_tensor()._clear()
    self._fill = offset
def _param_storage(self, param, buffer_size):
    """Move *param* into a flat CPU buffer and keep only this rank's slice.

    Allocates a zero buffer of ``buffer_size`` elements (fp16 or fp32 to
    match the param's dtype), copies the flattened param into its head,
    clears the original param storage, and attaches this rank's
    ``[start, end)`` slice as ``param.fw_storage``.
    """
    assert isinstance(buffer_size, int)
    value = np.zeros(
        buffer_size,
        dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros(
            buffer_size, dtype=np.float32)
    buffer = core.VarBase(value=value, place=core.CPUPlace())

    param_shape = param.shape
    # Temporarily disable gradient tracking so the in-place flatten does
    # not get recorded; restore the original flag afterwards.
    origin_state = param.stop_gradient
    param.stop_gradient = True
    param.flatten_()
    param.stop_gradient = origin_state
    start, end = self._param2buffer[param.name][self._rank]

    # Copy the current param value
    tmp_var = core.VarBase(tensor=buffer._slice(0, param._numel()),
                           place=core.CPUPlace())
    param_cpu = param.cpu()
    tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(),
                                     core.CPUPlace())
    # Restore the logical shape metadata, then release the old storage;
    # from here on the data lives only in `buffer`.
    param.value().get_tensor()._set_dims(param_shape)
    param._clear()

    # Current rank param_storage
    param.fw_storage = core.VarBase(buffer._slice(start, end),
                                    "slice@" + param.name)
    param.status = "part"

    # Updata optimizer master weights
    if param.dtype == Type.fp16.value:
        self._optim._master_weights[param.fw_storage.name] = paddle.cast(
            param.fw_storage, Type.fp32.value)
def allreduce_(*_):
    """Gradient hook: all-reduce the param's full gradient, accumulate this
    rank's slice into ``param.bw_storage`` (optionally offloaded to CPU),
    then shrink the param back to its sharded state.

    ``param`` and ``self`` are captured from the enclosing scope; the hook's
    own arguments are ignored.
    """
    if param.name in self._task_flow.full_grad.keys():
        full_grad = self._task_flow.full_grad[param.name]
        # Only support sync allreduce current rank's layer now
        dist.all_reduce(tensor=full_grad,
                        group=self._group,
                        use_calc_stream=True)
        dist.wait(tensor=full_grad,
                  group=self._group,
                  use_calc_stream=True)

        start, end = self._param2buffer[param.name][self._rank]
        if param.bw_storage is None:
            # First accumulation: take an owned copy of this rank's slice.
            param.bw_storage = core.VarBase(
                full_grad._slice(start, end)).detach().clone()
            if self._offload:
                param.bw_storage = _device2cpu(param.bw_storage, True)
        else:
            if self._offload:
                # Accumulate on CPU to keep GPU memory free.
                cpu_grad = _device2cpu(
                    core.VarBase(full_grad._slice(start,
                                                  end)).detach().clone(),
                    True)
                with device_guard(device="cpu"):
                    param.bw_storage = paddle.add(param.bw_storage,
                                                  cpu_grad)
            else:
                # param.bw_storage.add_(
                #     core.VarBase(full_grad._slice(start, end))
                #     .detach().clone())
                param.bw_storage = paddle.add(
                    param.bw_storage,
                    core.VarBase(full_grad._slice(start,
                                                  end)).detach().clone())
        # The full gradient buffer is no longer needed on this rank.
        param.clear_gradient(False)
        param._gradient_set_empty(False)
        tmp_var = self._task_flow.full_grad.pop(param.name)
        tmp_var._clear()

    if param.name in self._task_flow.full_param.keys():
        if param.status == "all":
            # Drop the gathered full param and keep only this rank's slice.
            param.use_count = 0
            param._clear()
            start, end = self._param2buffer[param.name][self._rank]
            param.fw_storage = core.VarBase(
                self._task_flow.full_param[param.name]._slice(start, end),
                param.name + "@slice").detach().clone()
            param.status = "part"
            tmp_var = self._task_flow.full_param.pop(param.name)
            tmp_var._clear()

            if self._offload:
                # When offloading, fw_storage aliases the CPU master weight.
                param.fw_storage._clear()
                param.master_weight._share_buffer_to(param.fw_storage)
def create_var_base(is_input, name, np_value, stop_gradient):
    """Wrap *np_value* in a zero-copy VarBase named *name* on the enclosing
    `place`, with the requested stop_gradient flag."""
    tensor = core.VarBase(value=np_value,
                          name=name,
                          place=place,
                          zero_copy=True)
    tensor.stop_gradient = stop_gradient
    return tensor
def _add_param_as_view(self, param, align, convert_gpu=True):
    """Copy *param*'s value into a slice of this InternalStorage buffer.

    Flattens the param in place, writes its data into
    ``buffer[_fill:_fill+numel]`` (on CPU), advances ``self._fill`` by
    ``numel + align`` and returns the param's original shape so the caller
    can restore it.

    Args:
        param: the parameter to absorb; must match the buffer dtype.
        align: padding (in elements) reserved after the param slice.
        convert_gpu: when True, stage the param through CPU and release its
            original (GPU) storage; when False, copy directly.

    Returns:
        The param's shape before flattening.
    """
    assert (
        param.dtype == self.buffer.dtype
    ), "Different types for the InternalStorage and the param, cannot proceed: {} - {}".format(
        param.dtype, self.buffer.dtype)

    var_end = self._fill + np.prod(param.shape)
    offset = var_end + align
    assert offset <= np.prod(self.buffer.shape)

    p_shape = param.shape

    # Temporarily disable gradient tracking around the in-place flatten.
    origin_state = param.stop_gradient
    param.stop_gradient = True
    param.flatten_()
    param.stop_gradient = origin_state

    # Copy the current param value
    dev_id = 0 if paddle.get_device() == "cpu" else int(
        paddle.get_device().split(":")[1])
    with device_guard(dev_id, "cpu"):
        tmp_var = core.VarBase(
            tensor=self.buffer._slice(self._fill, var_end))
        if convert_gpu:
            # Stage through CPU and free the original device storage.
            param_cpu = param.cpu()
            param.value().get_tensor()._clear()
            tmp_var.set_value(param_cpu)
        else:
            tmp_var.set_value(param)

    self._fill = offset
    return p_shape
def get_all_parameters(self):
    """Gather the full value of every sharded trainable parameter onto this
    rank and restore the optimizer's original parameter lists.

    For each trainable param still in sharded state (``use_count == 0``),
    all-gathers its ``fw_storage`` slice, rebinds the param to the gathered
    buffer, restores its shape metadata, and marks it as fully present.
    """
    assert len(self._trainable_params.keys()) > 0
    current_layer_params = self._layer.parameters(include_sublayers=True)
    trainable_params = list(
        filter(lambda x: x.trainable, current_layer_params))
    for param in trainable_params:
        if param.use_count > 0:
            # Already gathered (shared weight) -- skip.
            continue
        assert hasattr(
            param,
            "fw_storage"), "Find {} don't have fw_storage attribute".format(
                param.name)

        full_param = _all_gather(param.fw_storage,
                                 self._group,
                                 use_calc_stream=True)
        dist.wait(tensor=full_param,
                  group=self._group,
                  use_calc_stream=True)
        # Rebind the param to the gathered buffer (zero-copy share), then
        # restore the original dims and drop the per-rank slice.
        core.VarBase(full_param._slice(
            0, param._numel()))._share_buffer_to(param)
        param.value().get_tensor()._set_dims(param.shape)
        param.fw_storage._clear()
        param.fw_storage = None
        param.status = "all"
        param.use_count += 1

    # Hand the optimizer back its original (unsharded) parameter lists.
    self._optim._parameter_list = self._ori_parameter_list
    self._optim._param_groups = self._ori_param_groups
def _wait_layer(trainable_params, layer_id, task_flow, group,
                use_calc_stream):
    """Ensure every trainable param of *layer_id* is fully gathered.

    Params whose full buffer is already in ``task_flow.full_param`` are
    rebound to it after a device synchronize; otherwise a synchronous
    all-gather is issued for the whole layer and the loop stops (the
    gather covers the remaining params).

    Returns:
        The (mutated) task_flow.
    """
    for param in trainable_params[layer_id]:
        if param.status == "all":
            param.use_count += 1
            continue
        if param.name in task_flow.full_param.keys():
            full_param = task_flow.full_param[param.name]
            with paddle.amp.auto_cast(enable=False):
                # Wait for the async gather launched earlier to land.
                paddle.device.cuda.synchronize()
                core.VarBase(full_param._slice(
                    0, param._numel()))._share_buffer_to(param)
                param.value().get_tensor()._set_dims(param.shape)
                param.fw_storage._clear()
                param.fw_storage = None
                param.status = "all"
                param.use_count += 1
        else:
            # No pending gather for this param: gather the layer now,
            # blocking until complete.
            _allgather_buffer(layer_id,
                              trainable_params,
                              group,
                              use_calc_stream,
                              task_flow,
                              sync_wait=True)
            break
    return task_flow
def create_out(var_id):
    """Build an uninitialized VarBase mirroring the output variable's desc
    (dtype, shape, name, type)."""
    out_var = self._outputs[var_id]
    assert isinstance(out_var, framework.Variable)
    desc = out_var.desc
    return core.VarBase(desc.dtype(), desc.shape(), desc.name(),
                        desc.type(), False)
def _allgather_buffer(layer_id, trainable_params, group, use_calc_stream,
                      task_flow, sync_wait=False):
    """All-gather the sharded storage of every param in *layer_id*.

    When ``sync_wait`` is True, blocks on each gather and immediately
    rebinds the param to the gathered buffer; otherwise the gathers are
    left in flight and only recorded in ``task_flow.full_param``.

    Returns:
        The (mutated) task_flow with one full buffer per gathered param.
    """
    for param in trainable_params[layer_id]:
        if param.status == "all":
            param.use_count += 1
            continue
        with paddle.amp.auto_cast(enable=False):
            full_param = _all_gather(param.fw_storage,
                                     group,
                                     use_calc_stream=use_calc_stream)
        if sync_wait:
            with paddle.amp.auto_cast(enable=False):
                dist.wait(tensor=full_param,
                          group=group,
                          use_calc_stream=use_calc_stream)
                # Rebind the param to the gathered buffer and drop the
                # per-rank slice.
                core.VarBase(full_param._slice(
                    0, param._numel()))._share_buffer_to(param)
                param.value().get_tensor()._set_dims(param.shape)
                param.fw_storage._clear()
                param.fw_storage = None
                param.status = "all"
                param.use_count += 1
        # Record the gathered buffer so later stages can wait on / free it.
        task_flow.full_param[param.name] = full_param
    return task_flow
def _wait_layer(trainable_params, task_flow, group, use_calc_stream,
                offload=False):
    """Synchronize the device, then ensure all *trainable_params* are fully
    gathered (offload-aware variant).

    Params with a pending full buffer in ``task_flow.full_param`` are
    rebound to it; as soon as one param has no buffer, a blocking
    all-gather is issued for the whole list and the loop stops.

    Returns:
        The (mutated) task_flow.
    """
    paddle.device.cuda.synchronize()
    for param in trainable_params:
        if param.status == "all":
            param.use_count += 1
            continue
        if param.name in task_flow.full_param.keys():
            full_param = task_flow.full_param[param.name]
            core.VarBase(full_param._slice(
                0, param._numel()))._share_buffer_to(param)
            param.fw_storage._clear()
            param.fw_storage = None
            param.status = "all"
            param.use_count += 1
        else:
            # Gather everything that is still sharded, blocking.
            _allgather_buffer(trainable_params,
                              group,
                              use_calc_stream=True,
                              task_flow=task_flow,
                              sync_wait=True,
                              offload=offload)
            break
    return task_flow
def _prepare(self, inputs):
    """
    Prepare inputs, outputs, attrs.

    Converts each flattened input (ndarray or tensor) into a VarBase /
    eager Tensor named after the matching program input, moving
    stop-gradient tensors to the expected place up front, and creates
    empty output holders from the output var descs.

    Returns:
        (input_vars, out_vars)
    """
    assert isinstance(inputs, (tuple, list))
    # Flatten inputs with nested structure into single list.
    flatten_inputs = flatten(inputs)
    # Convert variable into VarBase and feed in training data.
    input_vars = []
    expected_place = framework._current_expected_place()
    for i, value in enumerate(flatten_inputs):
        if isinstance(value, np.ndarray):
            var = None
            if not framework._in_eager_mode_:
                # zero_copy=True: share the ndarray's memory when possible.
                var = core.VarBase(value=value,
                                   name=self._inputs[i].desc.name(),
                                   persistable=False,
                                   place=expected_place,
                                   zero_copy=True)
            else:
                var = core.eager.Tensor(value=value,
                                        name=self._inputs[i].desc.name(),
                                        persistable=False,
                                        place=expected_place,
                                        zero_copy=True)
        elif isinstance(value, (core.VarBase, core.eager.Tensor)):
            # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
            # into CUDAPlace when it's as input of multi Ops. so we move it in advance
            # to avoid this problem.
            if value.stop_gradient and not value.place._equals(
                    expected_place):
                var = value._copy_to(expected_place, False)
                var.stop_gradient = True
            else:
                var = value
            var.name = self._inputs[i].desc.name()
        else:
            # Non-tensor leaves are silently dropped.
            continue
        input_vars.append(var)

    def create_out(var_id):
        # Build an empty output holder matching the program var's desc.
        var = self._outputs[var_id]
        assert isinstance(var, framework.Variable)
        var_desc = var.desc
        varbase = None
        if not framework._in_eager_mode_:
            var_base = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                    var_desc.name(), var_desc.type(),
                                    False)
        else:
            var_base = core.eager.Tensor(var_desc.dtype(),
                                         var_desc.shape(),
                                         var_desc.name(), var_desc.type(),
                                         False)
        return var_base

    # Create VarBase to receive output data.
    out_vars = list(map(create_out, self._outputs.var_ids))
    return input_vars, out_vars
def _create_fake_var():
    """
    Create a fake_var (force on CPU) to handle empty input or output
    """
    fake = core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
                        core.VarDesc.VarType.RAW, False)
    return [fake]
def _create_scope_vec(self):
    """Create a STEP_SCOPES VarBase that holds a fresh inner Scope for
    the forward variables."""
    # Hold forward variables
    scope_holder = core.VarBase(core.VarDesc.VarType.FP32, [],
                                "program_out_scope",
                                core.VarDesc.VarType.STEP_SCOPES, True)
    fresh_scope = core.Scope()
    scope_holder.value().set_scope(fresh_scope)
    return scope_holder
def _param_storage(self, param, buffer_size):
    """
    This is a function to simplify the handling of parameter InternalStorages.

    Copies the flattened param into a fresh flat CPU buffer, keeps this
    rank's ``[start, end)`` slice as ``param.fw_storage`` (CPU-resident
    when offloading, with an fp32 master weight), and releases the
    original param storage.
    """
    assert isinstance(buffer_size, int)
    value = np.zeros(
        buffer_size,
        dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros(
            buffer_size, dtype=np.float32)
    buffer = core.VarBase(value=value, place=core.CPUPlace())

    param_shape = param.shape
    # Temporarily disable gradient tracking around the in-place flatten.
    origin_state = param.stop_gradient
    param.stop_gradient = True
    param.flatten_()
    param.stop_gradient = origin_state
    start, end = self._param2buffer[param.name][self._rank]

    # Copy the current param value
    tmp_var = core.VarBase(tensor=buffer._slice(0, param._numel()),
                           place=core.CPUPlace())
    param_cpu = param.cpu()
    tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(),
                                     core.CPUPlace())
    param.value().get_tensor()._set_dims(param_shape)

    # Current rank param_storage
    if self._offload:
        # Keep the slice on CPU and maintain an fp32 master weight there.
        param.fw_storage = core.VarBase(buffer._slice(start, end),
                                        core.CPUPlace(),
                                        "slice@" + param.name)
        with device_guard(device="cpu"):
            param.master_weight = paddle.cast(param.fw_storage,
                                              Type.fp32.value)
    else:
        param.fw_storage = core.VarBase(buffer._slice(start, end),
                                        "slice@" + param.name)
    param.status = "part"

    # Updata optimizer master weights
    if param.dtype == Type.fp16.value and not self._offload:
        self._optim._master_weights[param.fw_storage.name] = paddle.cast(
            param.fw_storage, Type.fp32.value)
    # Release the full-size original storage; data now lives in `buffer`.
    param._clear()
def test_input_cuda_pinned_var(self):
    """Slicing must work on a VarBase placed in CUDA pinned memory."""
    with fluid.dygraph.guard():
        payload = np.random.random((2, 80, 16128)).astype('float32')
        pinned_var = core.VarBase(value=payload,
                                  name='',
                                  persistable=False,
                                  place=fluid.CUDAPinnedPlace(),
                                  zero_copy=False)
        view = pinned_var[:, 10:, :pinned_var.shape[1]]
        self.assertEqual(view.shape, [2, 70, 80])
def _get_double_grads(self, program):
    """Collect an uninitialized VarBase for every gradient variable
    (name containing "@GRAD") across all blocks of *program*."""
    grad_bases = []
    for blk in program.blocks:
        for var_name in blk.vars:
            if "@GRAD" not in var_name:
                continue
            desc = blk.vars[var_name].desc
            grad_bases.append(
                core.VarBase(desc.dtype(), desc.shape(), desc.name(),
                             desc.type(), False))
    return self._valid_vars(grad_bases)
def _create_params_grad(layer, trainable_params, param2buffer_size,
                        task_flow):
    """Allocate a zero-filled flat gradient buffer for each trainable param
    of *layer* that does not have one yet, point the param's gradient at
    its head slice, and record the buffer in task_flow.full_grad."""
    for param in trainable_params[id(layer)]:
        if param.name in task_flow.full_grad.keys():
            continue
        buffer_len = param2buffer_size[param.name]
        assert isinstance(buffer_len, int)
        flat_grad = paddle.zeros([buffer_len], dtype=param.dtype)
        param._copy_gradient_from(
            core.VarBase(flat_grad._slice(0, param._numel())))
        task_flow.full_grad[param.name] = flat_grad
    return task_flow
def _prepare(self, inputs):
    """
    Prepare inputs, outputs, attrs.

    Legacy (non-eager) variant: converts flattened inputs to VarBase,
    builds empty output holders from the output var descs, and creates a
    STEP_SCOPES holder around the inner scope.

    Returns:
        (input_vars, out_vars, tmp_scope_vec)
    """
    assert isinstance(inputs, (tuple, list))
    # Flatten inputs with nested structure into single list.
    flatten_inputs = flatten(inputs)
    # Convert variable into VarBase and feed in training data.
    input_vars = []
    for i, value in enumerate(flatten_inputs):
        if isinstance(value, np.ndarray):
            # zero_copy=True: share the ndarray's memory when possible.
            var = core.VarBase(value=value,
                               name=self._inputs[i].desc.name(),
                               persistable=False,
                               place=framework._current_expected_place(),
                               zero_copy=True)
        elif isinstance(value, core.VarBase):
            var = value
            var.name = self._inputs[i].desc.name()
        else:
            # Non-tensor leaves are silently dropped.
            continue
        input_vars.append(var)

    # Create VarBase to receive output data.
    out_vars = []
    for idx in self._outputs.var_ids:
        var = self._outputs[idx]
        assert isinstance(var, framework.Variable)
        var_desc = var.desc
        var_base = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                var_desc.name(), var_desc.type(), False)
        out_vars.append(var_base)

    # Hold forward variables
    tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
                                 "program_out_scope",
                                 core.VarDesc.VarType.STEP_SCOPES, True)
    tmp_scope_vec.value().set_scope(self._inner_scope)

    return input_vars, out_vars, tmp_scope_vec
def test_type_core(self):
    """Both core.VarBase and paddle.framework.VarBase must report their
    type as paddle.Tensor."""
    paddle.disable_static()
    data = np.array([1, 2])
    expectx = "<class 'paddle.Tensor'>"
    for make_tensor in (core.VarBase, paddle.framework.VarBase):
        typex_str = str(type(make_tensor(data)))
        self.assertEqual((typex_str == expectx), True)
def reduce(*_):
    """Gradient hook: scale (unless accumulating) and all-reduce the param's
    full gradient, store/accumulate this rank's slice in
    ``param.bw_storage``, then shrink the param back to its sharded state.

    ``param`` and ``self`` are captured from the enclosing scope; the hook's
    own arguments are ignored.
    """
    if param.name in self._task_flow.full_grad.keys():
        full_grad = self._task_flow.full_grad[param.name]
        with paddle.amp.auto_cast(enable=False):
            if not self._accumulate_grads:
                # Average across ranks before the all-reduce sum.
                full_grad.scale_(scale=self._world_size_scaling)
            # Only support sync allreduce current rank's layer now
            dist.all_reduce(tensor=full_grad,
                            group=self._group,
                            use_calc_stream=True)
            dist.wait(tensor=full_grad,
                      group=self._group,
                      use_calc_stream=True)

            start, end = self._param2buffer[param.name][self._rank]
            if not self._accumulate_grads or param.bw_storage is None:
                # Fresh slice: take an owned copy.
                param.bw_storage = core.VarBase(
                    full_grad._slice(start, end)).detach().clone()
            else:
                # Accumulating: add this step's slice in place.
                param.bw_storage.add_(
                    core.VarBase(full_grad._slice(start,
                                                  end)).detach().clone())
        # The full gradient buffer is no longer needed on this rank.
        param.clear_gradient(False)
        param._gradient_set_empty(False)
        tmp_var = self._task_flow.full_grad.pop(param.name)
        tmp_var._clear()

    if param.name in self._task_flow.full_param.keys():
        if param.status == "all":
            # Drop the gathered full param; keep only this rank's slice.
            param.use_count = 0
            param._clear()
            start, end = self._param2buffer[param.name][self._rank]
            with paddle.amp.auto_cast(enable=False):
                param.fw_storage = core.VarBase(
                    self._task_flow.full_param[param.name]._slice(
                        start, end), param.name + "@slice").detach().clone()
            param.status = "part"
            tmp_var = self._task_flow.full_param.pop(param.name)
            tmp_var._clear()
def to_variable(value, name=None, zero_copy=None):
    """
    The API will create a ``Variable`` object from numpy\.ndarray or Variable object.

    Parameters:
        value(ndarray|Variable): The numpy\.ndarray or Variable object that needs to be converted, it can be multi-dimension, and the data type is one of numpy\.{float16, float32, float64, int16, int32, int64, uint8, uint16}.
        name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
        zero_copy(bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace and will be set to True when it is None. Default: None.

    Returns:
        Variable: If ``value`` is a numpy\.ndarray object, return ``Tensor`` created from the specified numpy\.ndarray object, which has same data type and shape with ``value``. If ``value`` is a Variable object, just return ``value``.

    Examples:

     .. code-block:: python

        import numpy as np
        import paddle.fluid as fluid

        with fluid.dygraph.guard(fluid.CPUPlace()):
            x = np.ones([2, 2], np.float32)
            y = fluid.dygraph.to_variable(x, zero_copy=False)
            x[0][0] = -1
            y[0][0].numpy()  # array([1.], dtype=float32)
            y = fluid.dygraph.to_variable(x)
            x[0][0] = 0
            y[0][0].numpy()  # array([0.], dtype=float32)
    """
    if isinstance(value, np.ndarray):
        assert framework.in_dygraph_mode(
        ), "to_variable could only be called in dygraph mode"
        # zero_copy (sharing the ndarray's memory) is only valid on CPU;
        # default it to True there and force it off elsewhere.
        if isinstance(framework._current_expected_place(),
                      framework.core.CPUPlace):
            if zero_copy is None:
                zero_copy = True
        else:
            assert not zero_copy, "zero_copy mode can only be used with CPUPlace"
            zero_copy = False
        py_var = core.VarBase(value=value,
                              place=framework._current_expected_place(),
                              persistable=False,
                              zero_copy=zero_copy,
                              name=name if name else '')
        return py_var
    elif isinstance(value, (core.VarBase, framework.Variable)):
        # Already a tensor/variable: pass through unchanged.
        return value
    else:
        raise TypeError(
            "to_variable only accepts 'ndarray' and 'Variable' as value's input"
        )
def _create_scope_vec(self):
    """Create the holder for forward variables: a STEP_SCOPES VarBase in
    legacy dygraph mode, or a plain one-element list in eager mode."""
    inner_scope = core.Scope()
    if framework._in_eager_mode_:
        return [inner_scope]
    holder = core.VarBase(core.VarDesc.VarType.FP32, [],
                          "program_out_scope",
                          core.VarDesc.VarType.STEP_SCOPES, True)
    holder.value().set_scope(inner_scope)
    return holder
def create_var_base(is_input, name, np_value, stop_gradient):
    """Wrap *np_value* in a zero-copy tensor named *name* on the enclosing
    `place`, choosing the eager or legacy constructor by mode."""
    make_tensor = core.eager.Tensor if _in_eager_mode_ else core.VarBase
    var = make_tensor(value=np_value,
                      name=name,
                      place=place,
                      zero_copy=True)
    var.stop_gradient = stop_gradient
    return var
def _create_out(var):
    """Create an uninitialized tensor mirroring *var*'s desc (dtype, shape,
    name, type), using VarBase in legacy dygraph mode and eager.Tensor
    otherwise.

    Fix: removed the dead local ``varbase = None`` that was assigned but
    never read (the result is built in ``var_base``).
    """
    assert isinstance(var, Variable)
    var_desc = var.desc
    if _in_legacy_dygraph():
        var_base = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                var_desc.name(), var_desc.type(), False)
    else:
        var_base = core.eager.Tensor(var_desc.dtype(), var_desc.shape(),
                                     var_desc.name(), var_desc.type(),
                                     False)
    return var_base
def valid_vars(vars):
    """
    Note: run_program_op.InferShape requires `X`/'Out' not be null.
    But it's common in dy2static, fake varBase is created to handle the
    problem.
    """
    if not vars:
        # Substitute a placeholder so the op's inputs/outputs are non-null.
        return [
            core.VarBase(value=[1],
                         name='Fake_var',
                         place=framework._current_expected_place())
        ]
    return vars
def _create_fake_var():
    """
    Create a fake_var (force on CPU) to handle empty input or output
    """
    make_tensor = (core.VarBase
                   if not framework._in_eager_mode_ else core.eager.Tensor)
    return [
        make_tensor(core.VarDesc.VarType.FP32, [], "Fake_var",
                    core.VarDesc.VarType.RAW, False)
    ]
def create_out(var_id):
    """Create an uninitialized output holder mirroring the program output
    variable's desc (dtype, shape, name, type), using VarBase in legacy
    mode and eager.Tensor otherwise.

    Fix: removed the dead local ``varbase = None`` that was assigned but
    never read (the result is built in ``var_base``).
    """
    var = self._outputs[var_id]
    assert isinstance(var, framework.Variable)
    var_desc = var.desc
    if not framework._in_eager_mode_:
        var_base = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                var_desc.name(), var_desc.type(), False)
    else:
        var_base = core.eager.Tensor(var_desc.dtype(), var_desc.shape(),
                                     var_desc.name(), var_desc.type(),
                                     False)
    return var_base
def _generate_master_params(self, trainable_params):
    """Create fp32 master weights for *trainable_params*.

    With offload, masters live in ``self._master_params`` as CPU VarBases
    (one per param, created once); otherwise fp16 params get a cast fp32
    copy registered directly in the optimizer's master-weight table.
    """
    if self.offload:
        for param in trainable_params:
            if param.name not in self._master_params.keys():
                self._master_params[param.name] = core.VarBase(
                    name=param.name,
                    value=param.cast(dtype=Type.fp32.value).numpy(),
                    place=core.CPUPlace(),
                    stop_gradient=param.stop_gradient)
    else:
        for param in trainable_params:
            if param.dtype == Type.fp16.value:
                self._optim._master_weights[param.name] = paddle.cast(
                    param, Type.fp32.value)
def _release_param(layer, trainable_params, param2buffer, rank, task_flow):
    """Release the full (gathered) storage of *layer*'s params once no user
    remains, keeping only this rank's ``[start, end)`` slice as
    ``param.fw_storage``.

    Decrements each param's ``use_count``; only when it drops to zero is
    the full buffer freed (shared weights keep it alive until the last
    user releases it).
    """
    for param in trainable_params[id(layer)]:
        # async communicate share weight not clear
        param.use_count -= 1
        if param.use_count == 0:
            param._clear()
            if param.name in task_flow.full_param.keys():
                start, end = param2buffer[param.name][rank]
                with paddle.amp.auto_cast(enable=False):
                    # Keep an owned copy of this rank's slice, then free
                    # the gathered full buffer.
                    param.fw_storage = core.VarBase(
                        task_flow.full_param[param.name]._slice(
                            start, end),
                        param.name + "@slice").detach().clone()
                    param.status = "part"
                    tmp_var = task_flow.full_param.pop(param.name)
                    tmp_var._clear()
    return