def func_train_eval(self):
    """Compare the custom relu op against the built-in relu.

    For every device in ``self.devices`` the model is trained and then
    evaluated with and without the custom op; the outputs must be exactly
    equal.  The dy2stat (to_static) variant is only run and compared in
    legacy dygraph mode.
    """
    for device in self.devices:
        paddle.set_device(device)

        # Training is checked first, evaluation second.
        for run_model in (self.train_model, self.eval_model):
            origin_out = run_model(use_custom_op=False)
            custom_out = run_model(use_custom_op=True)

            # Enable this for eager mode once dy2stat is ready for eager.
            if _in_legacy_dygraph():
                dy2stat_out = run_model(use_custom_op=True, dy2stat=True)
                self.assertTrue(np.array_equal(origin_out, dy2stat_out))

            self.assertTrue(np.array_equal(origin_out, custom_out))
def _split_tensors(coalesced_grads_and_grad_vars):
    """Split every coalesced gradient back into its original gradient vars.

    Args:
        coalesced_grads_and_grad_vars: iterable of
            ``(coalesced_grad, origin_grad_vars, grad_shapes)`` triples.

    Each coalesced gradient is split along axis 0 into sections whose sizes
    are the element counts of ``grad_shapes``; every resulting variable is
    then reshaped in place to its recorded shape.  Nothing is done when
    neither dygraph mode is active.
    """
    if _in_legacy_dygraph():
        for fused_grad, grad_vars, shapes in coalesced_grads_and_grad_vars:
            section_sizes = [np.prod(shape) for shape in shapes]
            framework._dygraph_tracer().trace_op(
                type='split',
                inputs={'X': fused_grad},
                outputs={'Out': grad_vars},
                attrs={
                    'sections': section_sizes,
                    'axis': 0
                })
            for grad_var, shape in zip(grad_vars, shapes):
                _reshape_inplace(x=grad_var, shape=shape)
                assert grad_var.shape == shape
    elif in_dygraph_mode():
        for fused_grad, grad_vars, shapes in coalesced_grads_and_grad_vars:
            section_sizes = [np.prod(shape) for shape in shapes]
            # Attributes are passed as flat (name, value) positional pairs.
            _C_ops.split(fused_grad, grad_vars, 'sections', section_sizes,
                         'axis', 0)
            for grad_var, shape in zip(grad_vars, shapes):
                grad_var.reshape_(shape=shape)
                assert grad_var.shape == shape
def forward(self, x, y):
    """Return the p-norm of ``x - y`` taken along axis 1.

    Uses ``self.p`` as the norm order together with ``self.epsilon`` and
    ``self.keepdim``.  Dygraph modes call the C ops directly; otherwise a
    ``p_norm`` op is appended to the static program.
    """
    if in_dygraph_mode():
        diff = _C_ops.elementwise_sub(x, y)
        return _C_ops.final_state_p_norm(diff, self.p, 1, self.epsilon,
                                         self.keepdim, False)

    if _in_legacy_dygraph():
        diff = _C_ops.elementwise_sub(x, y)
        return _C_ops.p_norm(diff, 'axis', 1, 'porder', self.p, 'keepdim',
                             self.keepdim, 'epsilon', self.epsilon)

    # Static graph: validate both inputs before building the op.
    for tensor, arg_name in ((x, 'x'), (y, 'y')):
        check_variable_and_dtype(tensor, arg_name, ['float32', 'float64'],
                                 'PairwiseDistance')

    diff = paddle.subtract(x, y)
    helper = LayerHelper("PairwiseDistance", name=self.name)
    norm_attrs = {
        'axis': 1,
        'porder': self.p,
        'keepdim': self.keepdim,
        'epsilon': self.epsilon,
    }
    result = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(type='p_norm',
                     inputs={'X': diff},
                     outputs={'Out': result},
                     attrs=norm_attrs)
    return result
def func_tensor_from_numpy(self):
    """Check ``to_variable`` copies numpy data and warns about ``zero_copy``."""
    source = np.array([[2, 3, 1]]).astype('float32')
    with fluid.dygraph.guard(fluid.CPUPlace()):
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            ignored = fluid.dygraph.to_variable(source, zero_copy=True)
            last_message = str(caught[-1].message)
            assert "Currently, zero_copy is not supported, and it will be discarded." in last_message

        # Temporarily disable zero_copy
        # var = fluid.dygraph.to_variable(data_np, zero_copy=True)
        # self.assertTrue(np.array_equal(var.numpy(), data_np))
        # data_np[0][0] = 4
        # self.assertEqual(data_np[0][0], 4)
        # self.assertEqual(var[0][0].numpy()[0], 4)
        # self.assertTrue(np.array_equal(var.numpy(), data_np))

        copied = fluid.dygraph.to_variable(source, zero_copy=False)
        self.assertTrue(np.array_equal(copied.numpy(), source))

        # Mutating the numpy array must not be visible through the tensor.
        source[0, 0] = -1
        self.assertEqual(source[0, 0], -1)
        if _in_legacy_dygraph():
            self.assertNotEqual(copied[0][0].numpy()[0], -1)
        else:
            # eager_mode, var2 is Tensor, is not subscriptable
            # TODO(wuweilong): to support slice in eager mode later
            self.assertNotEqual(copied.numpy()[0][0], -1)
        self.assertFalse(np.array_equal(copied.numpy(), source))
def func_test_async_read_success(self):
    """Run ``async_read`` with an index plus offset/count and verify ``dst``."""
    cpu = paddle.CPUPlace
    offset = paddle.to_tensor(np.array([10, 20], dtype="int64"), place=cpu())
    count = paddle.to_tensor(np.array([5, 10], dtype="int64"), place=cpu())

    with cuda.stream_guard(self.stream):
        if _in_legacy_dygraph():
            async_read = core.async_read
        else:
            async_read = core.eager.async_read
        async_read(self.src, self.dst, self.index, self.buffer, offset,
                   count)

    # Rows selected by index are compared with dst right after the
    # offset/count rows.
    gathered_by_index = paddle.gather(self.src, self.index)
    count_numel = paddle.sum(count).numpy()[0]
    index_end = count_numel + len(self.index)
    read_by_index = self.dst[count_numel:index_end]
    self.assertTrue(
        np.allclose(gathered_by_index.numpy(), read_by_index.numpy()))

    # Rows selected by offset/count are compared with the front of dst.
    first_range = paddle.gather(self.src, paddle.to_tensor(np.arange(10, 15)))
    second_range = paddle.gather(self.src,
                                 paddle.to_tensor(np.arange(20, 30)))
    gathered_by_offset = paddle.concat([first_range, second_range], axis=0)
    read_by_offset = self.dst[:count_numel]
    self.assertTrue(
        np.allclose(gathered_by_offset.numpy(), read_by_offset.numpy()))
def func_example_with_gradient_accumulation_and_not_create_graph(self):
    """Check ``self.grad(..., create_graph=False)`` for mean((relu(x)+1)^2).

    The returned gradient must have ``stop_gradient`` set and match the
    analytic value.  In legacy dygraph mode a second loss is additionally
    back-propagated and ``x.gradient()`` is checked.
    """
    x = random_var(self.shape)
    x_np = x.numpy()
    numel = x_np.size
    x.stop_gradient = False

    y = fluid.layers.relu(x)
    z = y + 1
    w = z * z
    w_mean = fluid.layers.reduce_mean(w)
    # Drop the intermediates before requesting the gradient.
    del y, z, w

    [dx_actual] = self.grad([w_mean], [x], create_graph=False)
    del w_mean

    self.assertTrue(dx_actual.stop_gradient)

    dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + 1) *
                   (x_np > 0) * 2).astype('float32')
    self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))

    # The accumulation check below only runs in legacy dygraph mode.
    if _in_legacy_dygraph():
        loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
        loss.backward()

        x_grad_actual = x.gradient()
        x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
        self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def func_create_varbase(self):
    """Build tensors through every supported constructor form and compare
    their contents with the numpy source arrays."""
    x = np.ones([2, 2], np.float32)
    y = np.zeros([3, 3], np.float32)
    t = fluid.Tensor()
    t.set(x, fluid.CPUPlace())

    # Legacy dygraph builds VarBase objects, eager mode builds eager Tensors;
    # the constructor call forms exercised are the same for both.
    if _in_legacy_dygraph():
        tensor_cls = fluid.core.VarBase
    else:
        tensor_cls = fluid.core.eager.Tensor

    # (expected numpy array, constructed tensor), built in the original order.
    cases = [
        (x, tensor_cls(value=x, place=fluid.core.CPUPlace())),
        (y, tensor_cls(y, fluid.core.CPUPlace())),
        (x, paddle.to_tensor(x)),
        (y, tensor_cls(y)),
        (x, tensor_cls(value=x)),
        (x, tensor_cls(t)),
    ]
    for expected, tensor in cases:
        self.assertTrue(np.array_equal(expected, tensor.numpy()))
def segment_sum(data, segment_ids, name=None):
    r"""
    Segment Sum Operator.

    Sums the elements of input `data` that share the same index in
    `segment_ids`.
    It computes a tensor such that $out_i = \\sum_{j} data_{j}$
    where the sum is over j such that `segment_ids[j] == i`.

    Args:
        data (Tensor): A tensor, available data type float32, float64, int32, int64.
        segment_ids (Tensor): A 1-D tensor, which has the same size
                            as the first dimension of input data.
                            Available data type is int32, int64.
        name (str, optional): Name for the operation (optional, default is None).
                            For more information, please refer to :ref:`api_guide_Name`.

    Returns:
       output (Tensor): the reduced result.

    Examples:

        .. code-block:: python

            import paddle
            data = paddle.to_tensor([[1, 2, 3], [3, 2, 1], [4, 5, 6]], dtype='float32')
            segment_ids = paddle.to_tensor([0, 0, 1], dtype='int32')
            out = paddle.incubate.segment_sum(data, segment_ids)
            #Outputs: [[4., 4., 4.], [4., 5., 6.]]

    """
    if in_dygraph_mode():
        return _C_ops.final_state_segment_pool(data, segment_ids, "SUM")[0]

    if _in_legacy_dygraph():
        # The op yields two outputs; only the first one is returned.
        pooled, _ = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM")
        return pooled

    # Static graph path.
    check_variable_and_dtype(data, "X",
                             ("float32", "float64", "int32", "int64"),
                             "segment_pool")
    check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                             "segment_pool")

    # NOTE: ``locals()`` is forwarded as-is, so no extra local may be bound
    # before this line on the static path.
    helper = LayerHelper("segment_sum", **locals())
    out = helper.create_variable_for_type_inference(dtype=data.dtype)
    summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype)
    op_inputs = {"X": data, "SegmentIds": segment_ids}
    op_outputs = {"Out": out, "SummedIds": summed_ids}
    helper.append_op(type="segment_pool",
                     inputs=op_inputs,
                     outputs=op_outputs,
                     attrs={"pooltype": "SUM"})
    return out
def func_metaclass(self):
    """Check the metaclass names of ``MyLayer`` and the active tensor type."""
    layer_meta_name = type(MyLayer).__name__
    self.assertEqual(layer_meta_name, 'type')
    self.assertNotEqual(type(MyLayer).__name__, 'pybind11_type')

    if _in_legacy_dygraph():
        self.assertEqual(
            type(paddle.fluid.core.VarBase).__name__, 'pybind11_type')
    else:
        self.assertEqual(
            type(paddle.fluid.core.eager.Tensor).__name__, 'type')
def init_reducer(self):
    """Collect the trainable parameters and build the gradient reducer.

    Walks all sublayers, de-duplicates their own parameters, validates their
    type against ``self.var_dtype`` and keeps the trainable ones.  The
    parameters are then grouped by communication buffer size and handed to
    the reducer matching the active dygraph mode (``core.EagerReducer`` or
    ``core.Reducer``).

    Raises:
        TypeError: if a parameter is not an instance of ``self.var_dtype``.
        AssertionError: if no trainable parameter is found.
    """
    layers_param = []
    seen_params = set()
    for sublayer in self.sublayers():
        for _, param in sublayer.named_parameters(include_sublayers=False):
            if param is None or param in seen_params:
                continue
            seen_params.add(param)
            if not isinstance(param, self.var_dtype):
                raise TypeError("The data type of '%s' must be '%s'" %
                                (param.name, self.var_dtype))
            if param.trainable:
                layers_param.append((sublayer, param))

    trainable_parameters = [pair[1] for pair in layers_param]

    assert len(trainable_parameters) > 0, \
        "This model does not have any parameters to train, and " \
        "does not need to use DataParallel"

    # NOTE(shenliang03): Here we can only use the attributes to judge whether
    # parameter is sparse(or SelectedRows). The reason is that the sparse message
    # can't be obtained when bp hasn't happened yet. So if layer supports sparse parameter,
    # we should add the layer here like "paddle.nn.layer.common.Embedding".
    def check_layer_sparse(sublayer):
        if isinstance(sublayer, paddle.nn.layer.common.Embedding):
            return sublayer._sparse
        # NOTE(shenliang03):This is for compatibility. If paddle.fluid.dygraph.Embedding
        # is removed in the future, the check will also be removed here.
        if isinstance(sublayer, paddle.fluid.dygraph.Embedding):
            return sublayer._is_sparse
        return False

    is_sparse_gradient = [check_layer_sparse(pair[0]) for pair in layers_param]

    if in_dygraph_mode():
        self.group_indices = core.eager_assign_group_by_size(
            trainable_parameters, is_sparse_gradient,
            [self.last_comm_buffer_size, self.comm_buffer_size])

        self._reducer = core.EagerReducer(
            trainable_parameters,
            list(reversed(self.group_indices)),
            is_sparse_gradient,
            self.group.process_group,
            [self.last_comm_buffer_size, self.comm_buffer_size],
            self.find_unused_parameters)
    elif _in_legacy_dygraph():
        self.group_indices = core.assign_group_by_size(
            trainable_parameters, is_sparse_gradient,
            [self.last_comm_buffer_size, self.comm_buffer_size])

        self._reducer = core.Reducer(
            trainable_parameters,
            list(reversed(self.group_indices)),
            is_sparse_gradient,
            parallel_helper.__parallel_ctx__clz__,
            [self.last_comm_buffer_size, self.comm_buffer_size],
            self.find_unused_parameters)
def bernoulli(x, name=None):
    r"""

    For each element :math:`x_i` in input ``x``, take a sample from the Bernoulli distribution, also called two-point distribution, with success probability :math:`x_i`. The Bernoulli distribution with success probability :math:`x_i` is a discrete probability distribution with probability mass function

    .. math::
        p(y)=\begin{cases}
            x_i,&y=1\\
            1-x_i,&y=0
        \end{cases}.

    Args:
        x (Tensor): The input Tensor, its data type should be float32, float64.
        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.

    Returns:
        Tensor: A Tensor filled with samples from the Bernoulli distribution, whose shape and dtype are the same as ``x``.

    Examples:
        .. code-block:: python
            :name: bernoulli-example

            import paddle

            paddle.set_device('cpu')  # on CPU device
            paddle.seed(100)

            x = paddle.rand([2,3])
            print(x)
            # [[0.55355281, 0.20714243, 0.01162981],
            #  [0.51577556, 0.36369765, 0.26091650]]

            out = paddle.bernoulli(x)
            print(out)
            # [[1., 0., 1.],
            #  [0., 1., 0.]]

    """
    # The docstring is a raw string: the LaTeX above contains backslash
    # sequences such as ``\end`` that are invalid escapes in a normal string.
    if in_dygraph_mode():
        return _C_ops.final_state_bernoulli(x)

    if _in_legacy_dygraph():
        return _C_ops.bernoulli(x)

    check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli")

    # NOTE(review): the helper is created with the layer type "randint"
    # although the appended op is 'bernoulli'.  Kept unchanged because the
    # layer type may feed generated variable names -- confirm before renaming.
    helper = LayerHelper("randint", **locals())
    out = helper.create_variable_for_type_inference(
        dtype=x.dtype)  # maybe set out to int32 ?
    helper.append_op(type='bernoulli',
                     inputs={"X": x},
                     outputs={'Out': out},
                     attrs={})
    # Sampled output is marked as not requiring gradients.
    out.stop_gradient = True
    return out
def _is_valid_send_recv_partial(tensor, mp_degree):
    """Tell whether ``tensor`` can use partial send/recv across ``mp_degree``.

    In legacy dygraph mode this is true when ``mp_degree > 1`` and the
    element count of ``tensor`` is divisible by ``mp_degree``; an empty
    tensor trips an assertion.  In eager dygraph mode it is always ``False``.
    Outside both modes ``None`` is returned.
    """
    if _in_legacy_dygraph():
        tensor_numel = np.prod(tensor.shape)
        assert tensor_numel != 0, "can't send/recv zero element"
        return mp_degree > 1 and tensor_numel % mp_degree == 0

    if in_dygraph_mode():
        # TODO(shenliang03) support mp+pp optimizer in future.
        # (partial_send/partial_recv/partial_allgather_)
        return False

    return None
def forward(self, inputs):
    """Apply tanh, multiply by ``self.w`` and run ``self.linear``.

    Model 0 uses ``cus_tanh_eager`` / ``cus_tanh`` depending on the dygraph
    mode (and applies no tanh outside both modes); every other model uses
    ``self.tanh``.
    """
    if self.model_id != 0:
        inputs = self.tanh(inputs)
    elif in_dygraph_mode():
        inputs = cus_tanh_eager.apply(inputs)
    elif _in_legacy_dygraph():
        inputs = cus_tanh.apply(inputs)

    projected = paddle.matmul(self.w, inputs)
    return self.linear(projected)
def func_isinstance(self):
    """Check ``isinstance`` results for a static variable and a dygraph tensor."""
    static_var = fluid.layers.data(shape=[1], name='x', dtype='float32')
    self.assertTrue(isinstance(static_var, fluid.Variable))

    with fluid.dygraph.guard():
        if _in_legacy_dygraph():
            tensor = paddle.to_tensor(np.array([3, 4, 5]))
            self.assertTrue(isinstance(tensor, core.VarBase))
            self.assertTrue(isinstance(tensor, fluid.Variable))
        else:
            tensor = paddle.to_tensor(np.array([3, 4, 5]))
            self.assertTrue(isinstance(tensor, core.eager.Tensor))
def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
    """
    prune gate by capacity(only support CUDA)

    Args:
        gate_idx (Tensor): Represents the gate_id sequence corresponding to the input data with type int32, int64.
        expert_count (Tensor): The quantity value counted on the gate_id sequence of the input data with type int32, int64.
        n_expert (int): Forwarded to the op as its ``n_expert`` attribute.
        n_worker (int): The number of workers on the trainer with type int64.

    Returns:
        new_gate_idx (Tensor): The gate_id sequence corresponding to the new input data after passing through prune.

    Examples:
        .. code-block:: python

            import paddle
            gate_idx = paddle.to_tensor([1, 3, 3, 3, 3, 2, 1, 1], dtype='int32')
            expert_count = paddle.to_tensor([0, 3, 1, 3, 0, 0, 0, 0], dtype='int32')
            n_worker = 1
            new_gate_id = paddle.distributed.utils.prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker)
            print(new_gate_id)
            # Tensor(shape=[8], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
            #        [1, 3, 3, 3, -1, 2, 1, 1])
    """
    if in_dygraph_mode():
        return _C_ops.prune_gate_by_capacity(gate_idx, expert_count,
                                             "n_expert", n_expert, "n_worker",
                                             n_worker)

    if _in_legacy_dygraph():
        return core.ops.prune_gate_by_capacity(gate_idx, expert_count,
                                               "n_expert", n_expert,
                                               "n_worker", n_worker)

    # Static graph path.
    check_variable_and_dtype(
        gate_idx, 'GateIdx', ['int32', 'int64'],
        'paddle.distributed.utils.prune_gate_by_capacity')
    check_variable_and_dtype(
        expert_count, 'ExpertCount', ['int32', 'int64'],
        'paddle.distributed.utils.prune_gate_by_capacity')

    # NOTE: ``locals()`` is forwarded as-is, so no extra local may be bound
    # before this line.
    helper = LayerHelper('prune_gate_by_capacity', **locals())
    new_gate_idx = helper.create_variable_for_type_inference(
        dtype=gate_idx.dtype)
    op_inputs = {'GateIdx': gate_idx, "ExpertCount": expert_count}
    op_attrs = {"n_expert": n_expert, "n_worker": n_worker}
    helper.append_op(type='prune_gate_by_capacity',
                     inputs=op_inputs,
                     outputs={'NewGateIdx': new_gate_idx},
                     attrs=op_attrs)
    return new_gate_idx
def func_test_async_read_empty_offset_and_count(self):
    """Run ``async_read`` with empty offset/count and verify the indexed rows."""
    with cuda.stream_guard(self.stream):
        if _in_legacy_dygraph():
            async_read = core.async_read
        else:
            async_read = core.eager.async_read
        async_read(self.src, self.dst, self.index, self.buffer, self.empty,
                   self.empty)

    gathered = paddle.gather(self.src, self.index)
    read_back = self.dst[:len(self.index)]
    self.assertTrue(np.allclose(gathered.numpy(), read_back.numpy()))
def randperm(n, dtype="int64", name=None):
    """
    Returns a 1-D Tensor filled with random permutation values from 0
    to n-1, with ``dtype``.

    Args:
        n (int): The upper bound (exclusive), and it should be greater than 0.
        dtype (str|np.dtype, optional): The data type of
            the output Tensor. Supported data types: int32, int64, float32,
            float64. Default is int64.
        name (str, optional): The default value is None. Normally there is no
            need for user to set this property. For more information, please
            refer to :ref:`api_guide_Name`.

    Returns:
        Tensor: A 1-D Tensor filled with random permutation values from 0
        to n-1, with ``dtype``.

    Raises:
        ValueError: in static graph mode, if ``n`` is smaller than 1.

    Examples:
        .. code-block:: python

            import paddle

            out1 = paddle.randperm(5)
            # [4, 1, 2, 3, 0]  # random

            out2 = paddle.randperm(7, 'int32')
            # [1, 6, 2, 0, 4, 3, 5]  # random

    """
    if not isinstance(dtype, core.VarDesc.VarType):
        dtype = convert_np_dtype_to_dtype_(dtype)

    if in_dygraph_mode():
        return _C_ops.final_state_randperm(n, dtype,
                                           _current_expected_place())

    if _in_legacy_dygraph():
        return _C_ops.randperm('n', n, 'seed', 0, 'dtype', dtype)

    # Static graph path: argument validation only happens here.
    if n < 1:
        raise ValueError(
            "The input n should be greater than 0 in randperm op.")
    check_dtype(dtype, 'dtype', ['int64', 'int32', 'float32', 'float64'],
                'randperm')

    # NOTE: ``locals()`` is forwarded as-is, so no extra local may be bound
    # before this line.
    helper = LayerHelper("randperm", **locals())
    out = helper.create_variable_for_type_inference(dtype)
    helper.append_op(type='randperm',
                     inputs={},
                     outputs={'Out': out},
                     attrs={
                         'n': n,
                         'dtype': dtype,
                         'seed': 0
                     })
    out.stop_gradient = True
    return out
def _create_out(var):
    """Create an output tensor object described by a static ``Variable``.

    Args:
        var (Variable): variable whose ``desc`` supplies the dtype, shape,
            name and type of the tensor to create.

    Returns:
        A ``core.VarBase`` in legacy dygraph mode, otherwise a
        ``core.eager.Tensor``, built from the description of ``var`` with
        ``False`` as the final constructor argument.
    """
    assert isinstance(var, Variable)
    var_desc = var.desc
    # The previous version bound an unused ``varbase = None`` here (a typo of
    # ``var_base``); it was never read and has been dropped.
    if _in_legacy_dygraph():
        tensor_cls = core.VarBase
    else:
        tensor_cls = core.eager.Tensor
    # Both tensor types take the same positional arguments.
    return tensor_cls(var_desc.dtype(), var_desc.shape(), var_desc.name(),
                      var_desc.type(), False)
def p_norm_python_api(x,
                      p=2.0,
                      axis=-1,
                      epsilon=1e-12,
                      keepdim=False,
                      as_vector=False):
    """Call the p-norm C op that matches the active dygraph mode.

    Returns the op result in eager or legacy dygraph mode and ``None`` when
    neither mode is active.  Only the legacy path converts ``p`` with
    ``float``.
    """
    if in_dygraph_mode():
        return _C_ops.final_state_p_norm(x, p, axis, epsilon, keepdim,
                                         as_vector)

    if not _in_legacy_dygraph():
        return None

    return _C_ops.p_norm(x, 'axis', axis, 'porder', float(p), 'keepdim',
                         keepdim, 'epsilon', epsilon, 'as_vector', as_vector)
def func_test_async_read_only_1dim(self):
    """Run ``async_read`` on 1-D tensors and verify the indexed elements."""
    src = paddle.rand([40], dtype="float32").pin_memory()
    dst = paddle.empty([40], dtype="float32")
    pinned_buffer = paddle.empty([20]).pin_memory()

    with cuda.stream_guard(self.stream):
        if _in_legacy_dygraph():
            async_read = core.async_read
        else:
            async_read = core.eager.async_read
        async_read(src, dst, self.index, pinned_buffer, self.empty,
                   self.empty)

    gathered = paddle.gather(src, self.index)
    read_back = dst[:len(self.index)]
    self.assertTrue(np.allclose(gathered.numpy(), read_back.numpy()))
def func_uva_tensor_creation(self):
    """Create UVA tensors for several dtypes and check place and contents.

    Does nothing when Paddle is not compiled with CUDA.
    """
    if not paddle.fluid.core.is_compiled_with_cuda():
        return

    dtype_list = [
        "int32", "int64", "float32", "float64", "float16", "int8", "int16",
        "bool"
    ]
    for dtype in dtype_list:
        data = np.random.randint(10, size=[4, 5]).astype(dtype)
        if _in_legacy_dygraph():
            to_uva_tensor = paddle.fluid.core.to_uva_tensor
        else:
            to_uva_tensor = core.eager.to_uva_tensor
        tensor = to_uva_tensor(data, 0)
        self.assertTrue(tensor.place.is_gpu_place())
        self.assertTrue(np.allclose(tensor.numpy(), data))
def _assign_pos(x, cum_count):
    """
    Assign pos decides, in order, which tokens should be fetched for each expert.

    Args:
        x (Tensor): Tensor. Every element in the list must be a Tensor whose data type
            should be float16, float32, float64, int32 or int64.
        cum_count (Tensor): The cumulative sum tokens of counters. Every element in the list must be a Tensor whose
            data type should be int64.

    Returns:
        out (Tensor): Assemble numbers in the order of counters.

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle
            number_count = [2, 0, 2, 0]
            numbers = [
                [0, 2],
                [0, 2]
            ]
            number_count = paddle.to_tensor(number_count)
            numbers = paddle.to_tensor(numbers, dtype="int32")
            num_cum = paddle.cumsum(number_count)
            pos = paddle.distributed.utils.assign_pos(x=numbers, cum_count=num_cum)
            print(pos) # the result: (2, 0, 3, 1)
    """
    if in_dygraph_mode():
        return _C_ops.assign_pos(x, cum_count, cum_count[-1])

    if _in_legacy_dygraph():
        return core.ops.assign_pos(x, cum_count, cum_count[-1])

    # Static graph path.  ``op_type`` is deliberately a local: it is part of
    # the ``locals()`` forwarded to LayerHelper.
    op_type = 'assign_pos'
    helper = LayerHelper(op_type, **locals())
    out = helper.create_variable_for_type_inference(dtype=cum_count.dtype)
    op_inputs = {
        'X': [x],
        'cum_count': [cum_count],
        "eff_num_len": [cum_count[-1]]
    }
    helper.append_op(type=op_type, inputs=op_inputs, outputs={'Out': [out]})
    return out
def func_test_async_write_success(self):
    """Run ``async_write`` with offset/count and verify the written ranges."""
    offset = paddle.to_tensor(np.array([0, 60], dtype="int64"),
                              place=paddle.CPUPlace())
    count = paddle.to_tensor(np.array([40, 60], dtype="int64"),
                             place=paddle.CPUPlace())

    with cuda.stream_guard(self.stream):
        if _in_legacy_dygraph():
            async_write = core.async_write
        else:
            async_write = core.eager.async_write
        async_write(self.src, self.dst, offset, count)

    # Rows [0, 40) and [60, 120) of dst together must equal src.
    first_range = paddle.gather(self.dst, paddle.to_tensor(np.arange(0, 40)))
    second_range = paddle.gather(self.dst,
                                 paddle.to_tensor(np.arange(60, 120)))
    written = paddle.concat([first_range, second_range], axis=0)
    self.assertTrue(np.allclose(self.src.numpy(), written.numpy()))
def _limit_by_capacity(expert_count, capacity, n_worker):
    """
    limit the expert count by capacity.

    Args:
        expert_count (Tensor): Tensor. The input expert count whose data type should be int32 or int64.
        capacity (Tensor): Tensor. The input capacity whose data type should be int32 or int64 and the elements of capacity should be the same with expert_count.numel()/n_worker.
        n_worker (int): The number of the workers.

    Returns:
        out (Tensor): The output expert count limit by capacity.

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle
            expert_count = [1, 2, 2, 8, 3, 6]
            capacity = [5, 5, 5]
            n_work = 2
            expert_count = paddle.to_tensor(expert_count, dtype="int32")
            capacity = paddle.to_tensor(capacity, dtype="int32")
            out = paddle.distributed.utils.limit_by_capacity(expert_count, capacity, n_work)
            print(out) # the result: [1, 2, 2, 4, 3, 3]
    """
    if in_dygraph_mode():
        return _C_ops.limit_by_capacity(expert_count, capacity, 'n_worker',
                                        n_worker)

    if _in_legacy_dygraph():
        return core.ops.limit_by_capacity(expert_count, capacity, 'n_worker',
                                          n_worker)

    # Static graph path.  ``op_type`` is deliberately a local: it is part of
    # the ``locals()`` forwarded to LayerHelper.
    op_type = 'limit_by_capacity'
    helper = LayerHelper(op_type, **locals())
    out = helper.create_variable_for_type_inference(
        dtype=expert_count.dtype)
    op_inputs = {'expert_count': expert_count, 'capacity': capacity}
    helper.append_op(type=op_type,
                     inputs=op_inputs,
                     outputs={'Out': out},
                     attrs={'n_worker': n_worker})
    return out
def sharding_reduce_gradients(parameter_list, hcg):
    """Average the gradients of trainable parameters over the sharding group.

    Args:
        parameter_list: parameters whose gradients are synchronised.
        hcg: object providing ``get_sharding_parallel_group()``.

    Parameters that are not trainable or have no gradient are skipped.  Eager
    mode scales the gradient by ``1 / nranks`` and then all-reduces it; legacy
    dygraph mode all-reduces first and divides by ``nranks`` afterwards.
    """
    # TODO allreduce --> reduce
    # TODO merge grad / nrank with dp
    logger.debug("sharding start gradients sync")
    with framework.no_grad():
        sharding_nrank = hcg.get_sharding_parallel_group().nranks
        for param in parameter_list:
            if not param.trainable or param._grad_ivar() is None:
                continue

            if in_dygraph_mode():
                param.grad.scale_(1.0 / sharding_nrank)
                paddle.distributed.all_reduce(
                    param.grad,
                    group=hcg.get_sharding_parallel_group(),
                    use_calc_stream=True)

            elif _in_legacy_dygraph():
                g_var = param._grad_ivar()
                # The all-reduce has to go through trace_op here instead of
                # paddle.distributed.all_reduce(
                #     g_var, group=hcg.get_sharding_parallel_group(),
                #     use_calc_stream=True)
                allreduce_attrs = {
                    'ring_id': hcg.get_sharding_parallel_group().id,
                    'use_calc_stream': True
                }
                paddle.fluid.framework._dygraph_tracer().trace_op(
                    type="c_allreduce_sum",
                    inputs={'X': g_var},
                    outputs={'Out': g_var},
                    attrs=allreduce_attrs)

                # grad / sharding_rank
                div_factor = paddle.to_tensor(sharding_nrank,
                                              dtype=g_var.dtype)
                paddle.fluid.framework._dygraph_tracer().trace_op(
                    type="elementwise_div",
                    inputs={
                        'X': g_var,
                        'Y': div_factor
                    },
                    outputs={'Out': g_var},
                    attrs={'axis': -1})
def _random_routing(topk_idx, topk_value, prob, topk=2): r""" random routing topk gate idx ``` out = topk_idx for i in len(topk_idx): if topk * value[i][topk-1] < prob[i]: out[i][topk-1] = -1 ``` Args: topk_idx: gate idx, shape=(N, topk) topk_value: values, shape = topk_idx.shape prob: random prob, shape=(topk_idx.shape[0],) """ if topk == 2: if in_dygraph_mode(): return _C_ops.random_routing(prob, topk_value, topk_idx) elif _in_legacy_dygraph(): return core.ops.random_routing(prob, topk_value, topk_idx) else: raise RuntimeError("Not supporting static mode now") else: raise RuntimeError("only topk=2 is supported now")
def _number_count(numbers, upper_range):
    """
    calculate the expert count according to the gate index.

    Args:
        numbers (Tensor): Tensor. The input gate index whose data type should be int32 or int64.
        upper_range (int): The number of the experts.

    Returns:
        out (Tensor): The output expert count.

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle

            numbers = [
                [0, 2],
                [0, 2]
            ]
            upper_range = 6
            numbers = paddle.to_tensor(numbers, dtype="int32")
            number_count = paddle.distributed.utils.number_count(numbers, upper_range)
            print(number_count) # the result: [2, 0, 2, 0, 0, 0]
    """
    if in_dygraph_mode():
        return _C_ops.number_count(numbers, 'upper_range', upper_range)

    if _in_legacy_dygraph():
        return core.ops.number_count(numbers, 'upper_range', upper_range)

    # Static graph path.  ``op_type`` is deliberately a local: it is part of
    # the ``locals()`` forwarded to LayerHelper.
    op_type = 'number_count'
    helper = LayerHelper(op_type, **locals())
    out = helper.create_variable_for_type_inference(dtype=numbers.dtype)
    helper.append_op(type=op_type,
                     inputs={'numbers': numbers},
                     outputs={'Out': out},
                     attrs={'upper_range': upper_range})
    return out
def func_empty_grad(self):
    """Check gradient access on a fresh tensor and on a block variable.

    An exception is not required at either ``try`` site; only the type of
    an exception that does occur is checked.
    """
    with fluid.dygraph.guard():
        ones = np.ones([2, 2], np.float32)
        fresh_tensor = paddle.to_tensor(ones)
        self.assertIsNone(fresh_tensor.gradient())
        try:
            fresh_tensor.clear_gradient()
        except Exception as e:
            assert type(e) == core.EnforceNotMet

    with fluid.dygraph.guard():
        program = fluid.Program()
        block = program.current_block()
        # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good.
        if _in_legacy_dygraph():
            var_shape = [-1, 23, 48]
        else:
            var_shape = [1, 23, 48]
        block_var = block.create_var(name="X",
                                     shape=var_shape,
                                     dtype='float32')
        try:
            block_var.gradient()
        except Exception as e:
            assert type(e) == ValueError
def func_empty_var(self):
    """Call ``numpy``, ``backward`` and ``clear_gradient`` on a block variable.

    An exception is not required from any call; only the type of an
    exception that does occur is checked.
    """
    with fluid.dygraph.guard():
        program = fluid.Program()
        block = program.current_block()
        # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good.
        if _in_legacy_dygraph():
            var_shape = [-1, 23, 48]
        else:
            var_shape = [1, 23, 48]
        block_var = block.create_var(name="X",
                                     shape=var_shape,
                                     dtype='float32')

        try:
            block_var.numpy()
        except Exception as e:
            assert type(e) == ValueError

        # The two gradient-related calls share the same expected error type.
        for method_name in ("backward", "clear_gradient"):
            try:
                getattr(block_var, method_name)()
            except Exception as e:
                assert type(e) == core.EnforceNotMet
def trace_op(self,
             type,
             inputs,
             outputs,
             attrs,
             stop_gradient=False,
             inplace_map=None):
    """Trace one op through the path matching the active dygraph mode.

    Legacy dygraph forwards to ``self.trace``.  Otherwise ops listed in
    ``final_state_name_mapping`` go through
    ``self.eager_final_state_trace_op`` and all others through
    ``self.eager_trace_op``.
    """
    if framework._in_legacy_dygraph():
        needs_grad = self._has_grad and not stop_gradient
        self.trace(type, inputs, outputs, attrs,
                   framework._current_expected_place(), needs_grad,
                   inplace_map if inplace_map else {})
        return

    # inputs : {"sum": [tensor], ...}
    # outputs : {"sum": [tensor], ...}
    if type not in final_state_name_mapping:
        self.eager_trace_op(type, inputs, outputs, attrs, stop_gradient,
                            inplace_map)
        return

    final_state_type = final_state_name_mapping[type]["final_op_name"]
    # The mapped final-state op must exist in _C_ops.
    assert final_state_type in _C_ops.__dict__
    self.eager_final_state_trace_op(type, inputs, outputs, attrs,
                                    stop_gradient, inplace_map)