def sum_grad(self):
    """Sum the gradients of all parameters.

    Call this method after each ``backward`` pass:

    ```python
    x = torch.ones(1, requires_grad=True)
    optimizer = torch.optim.SGD([x], lr=0.1)
    for epoch in range(2):
        for step in range(3):
            y = x + 1
            y.backward()
            optimizer.sum_grad()
        optimizer.step()
    print(x)  # 0.4
    ```

    """
    current_ws = workspace.get_workspace()
    for group in self.param_groups:
        grads, sum_grads = [], []
        for param in group['params']:
            grad = self._get_grad(current_ws, param)
            if grad is not None:
                grads.append(grad)
                sum_grads.append(grad.id + '_sum')
        Function.apply(
            'Axpby', grads[0].device, grads, outputs=sum_grads,
            alpha=1., beta=1. if self._sums_grad else 0.)
    self._sums_grad = True
def all_gather(tensor_list, tensor, group=None):
    """Gather the tensor across all nodes in a group.

    Parameters
    ----------
    tensor_list : Sequence[dragon.vm.torch.Tensor]
        The output tensor list.
    tensor : dragon.vm.torch.Tensor
        The tensor to be sent.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    group = group or distributed.get_group()
    if group is None:
        return tensor
    output_tensor = Function.apply(
        'Collective', tensor.device, [tensor],
        operation='ALLGATHER', **group.arguments)
    if len(tensor_list) > 0:
        return Function.apply(
            'Split', output_tensor.device, [output_tensor],
            outputs=[None] * len(tensor_list), axis=0,
            size_split=None, copy=True)
    return output_tensor
def transpose(input, dim0, dim1, out=None):
    """Return a new tensor with two dimensions swapped.

    Examples:

    ```python
    x = torch.ones(2, 3, 4)
    print(torch.transpose(x, 0, 2).shape)  # (4, 3, 2)
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    dim0 : int
        The first dimension to be transposed.
    dim1 : int
        The second dimension to be transposed.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    dims = list(range(input.ndimension()))
    dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
    return Function.apply(
        'Transpose', input.device, [input], outputs=[out],
        ndim=len(dims), perm=dims)
def narrow(input, dimension, start, length):
    """Return a narrowed tensor of input.

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    dimension : int
        The dimension to slice.
    start : int
        The starting position.
    length : int
        The number of elements from the starting position.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    sizes = list(input.shape[:])
    starts = [0] * len(sizes)
    starts[dimension], sizes[dimension] = start, length
    return Function.apply(
        'Slice', input.device, [input],
        ndim=len(starts), starts=starts, sizes=sizes)
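A short usage sketch (hypothetical values; it assumes ``torch`` refers to this module and that ``narrow`` is exported at the package level, as the docstring examples above suggest):

```python
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
# Keep length=2 elements starting at start=1 along dimension 1
print(torch.narrow(x, 1, 1, 2))  # [[2, 3], [5, 6]]
```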
def multinomial(input, num_samples, out=None):
    """Return an index tensor sampled from the multinomial distribution.

    Examples:

    ```python
    input = torch.tensor([0.5, 0.5]).log()
    index = torch.multinomial(input, 1)
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    num_samples : int
        The number of samples in each row.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Function.apply(
        'Multinomial', input.device, [input], outputs=[out],
        sample_size=num_samples)
def masked_fill(input, mask, value, out=None):
    """Fill tensor with the value where the mask is true.

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    mask : dragon.vm.torch.Tensor
        The boolean mask.
    value : Union[number, dragon.vm.torch.Tensor]
        The value to fill.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    if not isinstance(value, Tensor):
        value = constant_ops.scalar(value, input.dtype, input.device)
    return Function.apply(
        'Where', input.device, [mask, value, input], outputs=[out])
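A minimal sketch of the intended usage (assuming the function is reachable as ``torch.masked_fill`` and that tensor comparisons yield a boolean mask, which are assumptions not shown in this section):

```python
x = torch.tensor([1., 2., 3.])
mask = x > 2.  # assumed to produce a boolean tensor
print(torch.masked_fill(x, mask, 0.))  # [1., 2., 0.]
```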
def flip(input, dims):
    """Reverse elements along the given dimension.

    :attr:`dims` could be negative:

    ```python
    x = torch.tensor([[1, 2, 3], [4, 5, 6]])

    # A negative dimension is the last-k dimension
    print(torch.flip(x, dims=1))   # [[3, 2, 1], [6, 5, 4]]
    print(torch.flip(x, dims=-1))  # Equivalent

    # Also, dimension could be a sequence of integers
    print(torch.flip(x, dims=(0, 1)))  # [[6, 5, 4], [3, 2, 1]]
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    dims : Union[int, Sequence[int]]
        The dimension(s) to reverse.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Function.apply(
        'Reverse', input.device, [input],
        axes=nest.flatten(dims) if dims is not None else dims)
def broadcast(tensor, src=0, group=None):
    """Broadcast the tensor from the source node in a group.

    Parameters
    ----------
    tensor : dragon.vm.torch.Tensor
        The tensor to be sent.
    src : int, optional, default=0
        The rank of the source node.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    group = group or distributed.get_group()
    if group is None:
        return tensor
    return Function.apply(
        'Collective', tensor.device, [tensor], outputs=[tensor],
        operation='BROADCAST', root=src, **group.arguments)
def where(condition, x, y):
    r"""Select the elements from two branches under the condition.

    .. math::
        \text{out}_{i} =
            \begin{cases}
                \text{x}_{i}, & \text{ if } \text{condition}_{i} \\
                \text{y}_{i}, & \text{ otherwise }
            \end{cases}

    Parameters
    ----------
    condition : dragon.vm.torch.Tensor
        The condition tensor.
    x : dragon.vm.torch.Tensor
        The elements for ``True`` branch.
    y : dragon.vm.torch.Tensor
        The elements for ``False`` branch.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Function.apply('Where', condition.device, [condition, x, y])
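A brief usage sketch (hypothetical values; it assumes ``torch.where`` is the package-level export of this function and that comparisons yield boolean tensors):

```python
x = torch.tensor([1, 2, 3])
y = torch.tensor([40, 50, 60])
condition = x > 1  # assumed boolean mask: [False, True, True]
print(torch.where(condition, x, y))  # [40, 2, 3]
```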
def zeros(*size, out=None, dtype='float32', device=None, requires_grad=False):
    r"""Return a tensor filled with zeros.

    .. math:: \text{out} \leftarrow 0

    Parameters
    ----------
    size : int...
        The output tensor shape.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.
    dtype : str, optional, default='float32'
        The data type of output tensor.
    device : dragon.vm.torch.device, optional
        The device of output tensor.
    requires_grad : bool, optional, default=False
        Record gradient for output tensor or not.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    size = nest.flatten(size)
    device = out.device if out else (device or cpp.device())
    out = Function.apply(
        'Fill', device, [], outputs=[out], dtype=dtype, value=0.0,
        ndim=len(size), dims=size)
    out._requires_grad = requires_grad
    return out
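A short example of the calling convention (note that ``dtype`` is a string in this API, as the signature above shows):

```python
print(torch.zeros(2, 3))                  # [[0., 0., 0.], [0., 0., 0.]]
print(torch.zeros(2, 3, dtype='int64'))   # Same shape, integer zeros
```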
def zeros_like(input, dtype='float32', device=None, requires_grad=False):
    r"""Return a tensor of zeros with the same shape as the other.

    .. math:: \text{out} \leftarrow 0

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The tensor for indicating shape.
    dtype : str, optional, default='float32'
        The data type of output tensor.
    device : dragon.vm.torch.device, optional
        The device of output tensor.
    requires_grad : bool, optional, default=False
        Record gradient for output tensor or not.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    device = device or input.device
    out = Function.apply('Fill', device, [input], dtype=dtype, value=0.0)
    out._requires_grad = requires_grad
    return out
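A minimal sketch showing that only the shape of the input is used:

```python
x = torch.ones(2, 3)
print(torch.zeros_like(x))  # [[0., 0., 0.], [0., 0., 0.]]
```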
def clamp(input, min=None, max=None, out=None):
    r"""Compute the clipped input according to the given bounds.

    .. math:: \text{out} = \min(\max(\text{input}, low), high)

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    min : number, optional
        The min value.
    max : number, optional
        The max value.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    low = float(min) if min is not None else None
    high = float(max) if max is not None else None
    return Function.apply(
        'Clip', input.device, [input], outputs=[out], low=low, high=high)
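A short usage sketch; either bound may be omitted, in which case that side is left open:

```python
x = torch.tensor([-1., 0.5, 2.])
print(torch.clamp(x, min=0., max=1.))  # [0., 0.5, 1.]
print(torch.clamp(x, min=0.))          # [0., 0.5, 2.]
```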
def argmin(input, dim, keepdim=False, out=None):
    """Return the index of minimum elements along the given dimension.

    :attr:`dim` could be negative:

    ```python
    # A negative dimension is the last-k dimension
    x = torch.tensor([[1, 2, 3], [4, 5, 6]])
    print(torch.argmin(x, dim=1))
    print(torch.argmin(x, dim=-1))  # Equivalent
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    dim : int
        The dimension to reduce.
    keepdim : bool, optional, default=False
        Keep the reduced dimension or not.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The index of minimum elements.

    """
    return Function.apply(
        'ArgMin', input.device, [input], outputs=[out],
        axis=dim, keepdims=keepdim)
def normal(mean, std, size, out=None):
    r"""Return a tensor initialized from the normal distribution.

    .. math:: \text{out} \sim \mathcal{N}(\mu, \sigma^{2})

    Parameters
    ----------
    mean : number
        The value to :math:`\mu`.
    std : number
        The value to :math:`\sigma`.
    size : Sequence[int]
        The output tensor shape.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    dtype = out.dtype if out else 'float32'
    device = out.device if out else cpp.device()
    return Function.apply(
        'RandomNormal', device, [], outputs=[out], dtype=dtype,
        mean=float(mean), std=float(std), ndim=len(size), dims=size)
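A brief sketch of the calling convention (assuming the function is exported at the package level as ``torch.normal``, like the other ops documented here):

```python
# Sample a (2, 3) tensor from N(1, 0.5^2)
x = torch.normal(1., 0.5, (2, 3))
print(x.shape)  # (2, 3)
```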
def cumsum(input, dim, out=None):
    """Compute the cumulative sum of elements along the given dimension.

    :attr:`dim` could be negative:

    ```python
    # A negative dimension is the last-k dimension
    x = torch.tensor([[1, 2, 3], [4, 5, 6]])
    print(torch.cumsum(x, dim=1))   # [[1, 3, 6], [4, 9, 15]]
    print(torch.cumsum(x, dim=-1))  # Equivalent
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    dim : int
        The cumulative dimension.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Function.apply(
        'CumSum', input.device, [input], outputs=[out], axis=dim)
def randperm(n, out=None, dtype='int64', device=None, requires_grad=False):
    """Return a tensor with value in the permuted range.

    Specify ``n`` to determine an interval :math:`[0, n)`:

    ```python
    print(torch.randperm(4))
    ```

    Parameters
    ----------
    n : int
        The end of the interval.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.
    dtype : str, optional, default='int64'
        The data type of output tensor.
    device : dragon.vm.torch.device, optional
        The device of output tensor.
    requires_grad : bool, optional, default=False
        Record gradient for output tensor or not.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    device = out.device if out else (device or cpp.device())
    out = Function.apply(
        'Permutation', device, [], outputs=[out], dtype=dtype, limit=n)
    out._requires_grad = requires_grad
    return out
def uniform(low, high, size, out=None):
    r"""Return a tensor initialized from the uniform distribution.

    .. math:: \text{out} \sim \mathcal{U}(\alpha, \beta)

    Parameters
    ----------
    low : number
        The value to :math:`\alpha`.
    high : number
        The value to :math:`\beta`.
    size : Sequence[int]
        The output tensor shape.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    dtype = out.dtype if out else 'float32'
    device = out.device if out else cpp.device()
    return Function.apply(
        'RandomUniform', device, [], outputs=[out], dtype=dtype,
        low=float(low), high=float(high), ndim=len(size), dims=size)
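A brief sketch of the calling convention (it assumes this function is exported at the package level as ``torch.uniform``, which is an assumption about the module layout):

```python
# Sample a (2, 3) tensor from U(-1, 1)
x = torch.uniform(-1., 1., (2, 3))
print(x.shape)  # (2, 3)
```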
def all_reduce(tensor, op='sum', group=None):
    """Reduce the tensor across all nodes in a group.

    Parameters
    ----------
    tensor : dragon.vm.torch.Tensor
        The tensor to reduce.
    op : str, optional, default='sum'
        The reduction operation.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    group = group or distributed.get_group()
    if group is None:
        return tensor
    op = op.upper()
    if op not in ('MEAN', 'SUM'):
        raise ValueError('Unsupported reduction: ' + op)
    return Function.apply(
        'Collective', tensor.device, [tensor], outputs=[tensor],
        operation='ALLREDUCE', reduction=op, **group.arguments)
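A minimal sketch of the fall-through behavior shown in the code above: when no process group is active, the collective is a no-op and the input tensor is returned unchanged (the example assumes ``all_reduce`` is called directly from the module that defines it):

```python
x = torch.ones(2, 3)
y = all_reduce(x, op='sum')  # with no active group, the input is returned
print(y is x)  # True
```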
def flatten(input, start_dim=0, end_dim=-1, out=None):
    """Return a tensor with dimensions flattened.

    :attr:`start_dim` and :attr:`end_dim` could be negative:

    ```python
    # A negative dimension is the last-k dimension
    x = torch.tensor([[1, 2, 3], [4, 5, 6]])
    print(torch.flatten(x, start_dim=0, end_dim=1))
    print(torch.flatten(x, start_dim=0, end_dim=-1))  # Equivalent
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    start_dim : int, optional, default=0
        The start dimension to flatten.
    end_dim : int, optional, default=-1
        The end dimension to flatten.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Function.apply(
        'Flatten', input.device, [input], outputs=[out],
        axis=start_dim, end_axis=end_dim)
def addmm(input, mat1, mat2, beta=1, alpha=1, out=None):
    r"""Add input to the result of matrix-matrix multiplication.

    .. math:: \text{out} = \alpha (\text{mat1} \times \text{mat2}) + \beta \text{input}

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    mat1 : dragon.vm.torch.Tensor
        The first matrix.
    mat2 : dragon.vm.torch.Tensor
        The second matrix.
    beta : float, optional, default=1
        The value to :math:`\beta`.
    alpha : float, optional, default=1
        The value to :math:`\alpha`.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Function.apply(
        'Gemm', input.device, [mat1, mat2, input], outputs=[out],
        alpha=float(alpha), beta=float(beta))
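A short worked example of the formula above (hypothetical values): each output element is `alpha * (mat1 @ mat2) + beta * input`, so with all-ones inputs the product contributes `0.5 * 4 = 2` and the addend contributes `1`:

```python
a = torch.ones(2, 3)   # the tensor to add
m1 = torch.ones(2, 4)
m2 = torch.ones(4, 3)
print(torch.addmm(a, m1, m2, beta=1, alpha=0.5))  # [[3., 3., 3.], [3., 3., 3.]]
```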
def randn(*size, out=None, dtype='float32', device=None, requires_grad=False):
    """Return a tensor from the normal distribution N(0, 1).

    Parameters
    ----------
    size : int...
        The output tensor shape.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.
    dtype : str, optional, default='float32'
        The data type of output tensor.
    device : dragon.vm.torch.device, optional
        The device of output tensor.
    requires_grad : bool, optional, default=False
        Record gradient for output tensor or not.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    size = nest.flatten(size)
    device = out.device if out else (device or cpp.device())
    out = Function.apply(
        'RandomNormal', device, [], outputs=[out], dtype=dtype,
        mean=0.0, std=1.0, ndim=len(size), dims=size)
    out._requires_grad = requires_grad
    return out
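A minimal usage sketch, mirroring the signature above:

```python
x = torch.randn(2, 3)
print(x.shape)  # (2, 3)
y = torch.randn(2, 3, requires_grad=True)  # record gradients for y
```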
def split(tensor, split_size_or_sections, dim=0, copy=True):
    """Split input into chunks along the given dimension.

    Either the size of every chunk or the size of each chunk is accepted:

    ```python
    x = torch.tensor([1, 2, 3, 4, 5, 6])
    # Shape: (6,) -> (4,), (2,)
    print(torch.split(x, split_size_or_sections=4))
    # Shape: (6,) -> (5,), (1,)
    print(torch.split(x, split_size_or_sections=(5, 1)))
    ```

    :attr:`dim` can be negative:

    ```python
    x = torch.tensor([[1, 2, 3], [4, 5, 6]])
    print(torch.split(x, 2, dim=1))
    print(torch.split(x, 2, dim=-1))  # Equivalent
    ```

    Parameters
    ----------
    tensor : dragon.vm.torch.Tensor
        The input tensor.
    split_size_or_sections : Union[int, Sequence[int]]
        The size of every chunk or the size of each chunk.
    dim : int, optional, default=0
        The dimension to split.
    copy : bool, optional, default=True
        Copy or create the views of input.

    Returns
    -------
    Sequence[dragon.vm.torch.Tensor]
        The output tensors.

    """
    if nest.is_sequence(split_size_or_sections):
        size_splits = split_size_or_sections
        num_splits = len(split_size_or_sections)
    else:
        size = tensor.shape[dim]
        if size % split_size_or_sections == 0:
            num_splits = size // split_size_or_sections
            size_splits = [split_size_or_sections] * num_splits
        else:
            num_splits = size // split_size_or_sections + 1
            size_splits = [split_size_or_sections] * num_splits
            size_splits[-1] = size - (split_size_or_sections * (num_splits - 1))
    return Function.apply(
        'Split', tensor.device, [tensor], outputs=[None] * num_splits,
        axis=dim, num_splits=num_splits, split=size_splits, copy=copy)
def norm(input, p='fro', dim=None, keepdim=False, out=None, dtype=None):
    """Compute the norm value of elements along the given dimension.

    :attr:`dim` could be negative or ``None``:

    ```python
    x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])

    # A negative dimension is the last-k dimension
    print(torch.norm(x, dim=1))
    print(torch.norm(x, dim=-1))  # Equivalent

    # If ``dim`` is None, the vector-style reduction
    # will be applied to return a scalar result
    print(torch.norm(x))  # 9.539

    # Also, ``dim`` could be a sequence of integers
    print(torch.norm(x, dim=(0, 1)))  # 9.539
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    p : {'fro', 1, 2}, optional
        The norm order.
    dim : Union[int, Sequence[int]], optional
        The dimension to reduce.
    keepdim : bool, optional, default=False
        Keep the reduced dimension or not.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.
    dtype : str, optional
        The data type to cast to.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    if p is None or p == 2 or p == 'fro':
        op_type = 'ReduceL2'
    elif p == 1:
        op_type = 'ReduceL1'
    else:
        raise ValueError('Unsupported norm order: ' + str(p))
    input = input.to(dtype=dtype)
    keepdim = keepdim if dim is not None else False
    dim = nest.flatten(dim) if dim is not None else dim
    return Function.apply(
        op_type, input.device, [input], outputs=[out],
        axes=dim, keepdims=keepdim)
def unique(input, return_inverse=False, return_counts=False, **kwargs):
    """Return the unique elements of input.

    If ``return_inverse``, return the extra index where input mapping to:

    ```python
    x = torch.tensor([1, 2, 3, 2])
    y, index = torch.unique(x, return_inverse=True)
    print(y)      # [1, 2, 3]
    print(index)  # [0, 1, 2, 1]
    ```

    If ``return_counts``, return the extra counts of output:

    ```python
    x = torch.tensor([1, 2, 3, 2])
    y, counts = torch.unique(x, return_counts=True)
    print(y)       # [1, 2, 3]
    print(counts)  # [1, 2, 1]
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    return_inverse : bool, optional, default=False
        Return the inverse index or not.
    return_counts : bool, optional, default=False
        Return the counts or not.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.
    dragon.vm.torch.Tensor, optional
        The inverse index tensor.
    dragon.vm.torch.Tensor, optional
        The counts tensor.

    """
    if 'sorted' in kwargs:
        kwargs.pop('sorted')
    num_outputs = 1
    if return_inverse:
        num_outputs += 1
    if return_counts:
        num_outputs += 1
    return Function.apply(
        'Unique', input.device, [input], outputs=[None] * num_outputs,
        return_inverse=return_inverse, return_counts=return_counts)
def matmul(input, other, out=None):
    r"""Compute the matrix multiplication.

    .. math:: \text{out} = \text{input} \times \text{other}

    The behavior depends on the shape of input tensors:

    * If both tensors are 1d, computes the vector product.
    * If tensors are 1d and >=2d, computes the vector-matrix multiplication.
    * If tensors are >=2d and 1d, computes the matrix-vector multiplication.
    * If both tensors are >= 2d, computes the matrix-matrix multiplication.
    * If one tensor is >= 3d, applies batching and broadcasting to the computation.

    Examples:

    ```python
    # Vector x Vector
    a = torch.ones(2)
    b = torch.ones(2)
    print(torch.matmul(a, b))

    # Vector x Matrix
    a = torch.ones(2)
    b = torch.ones(2, 3)
    print(torch.matmul(a, b))

    # Matrix x Vector
    a = torch.ones(3, 2)
    b = torch.ones(2)
    print(torch.matmul(a, b))

    # Matrix x Matrix
    a = torch.ones(2, 3)
    b = torch.ones(3, 2)
    print(torch.matmul(a, b))
    ```

    Parameters
    ----------
    input : dragon.vm.torch.Tensor
        The input tensor.
    other : dragon.vm.torch.Tensor
        The tensor to multiply.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Function.apply('MatMul', input.device, [input, other], outputs=[out])
def eye(
    n,
    m=None,
    out=None,
    dtype='float32',
    device=None,
    requires_grad=False,
):
    r"""Return a tensor constructed as the identity matrix.

    .. math:: \text{out} \leftarrow \text{diag}(1, 1, ..., 1)

    The rows and cols of matrix are determined by ``n`` and ``m``:

    ```python
    print(torch.eye(2))     # [[1., 0.], [0., 1.]]
    print(torch.eye(2, 3))  # [[1., 0., 0.], [0., 1., 0.]]
    ```

    Parameters
    ----------
    n : int
        The number of output rows.
    m : int, optional
        The number of output cols.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.
    dtype : str, optional, default='float32'
        The data type of output tensor.
    device : dragon.vm.torch.device, optional
        The device of output tensor.
    requires_grad : bool, optional, default=False
        Record gradient for output tensor or not.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    m = n if m is None else m
    device = out.device if out else (device or cpp.device())
    out = Function.apply(
        'Eye', device, [], outputs=[out], dtype=dtype, ndim=2, dims=(n, m))
    out._requires_grad = requires_grad
    return out
def _update_group(self, group):
    """Update parameters for the group."""
    execute_ws = workspace.get_workspace()
    # Collect params and grads.
    params_with_grad, grads = [], []
    for p in group['params']:
        g = self._get_grad(execute_ws, p, self._sums_grad)
        if g is not None:
            params_with_grad.append(p)
            grads.append(g)
    # Skip if grads are all missing.
    if len(params_with_grad) == 0:
        return
    # Update hyper from group values.
    for name in self._hyper.keys():
        group_name = group['name']
        impl_name, group_dict = self._hyper[name]
        if group_name not in group_dict:
            impl_name = group_name + '/' + impl_name
            group_dict[group_name] = execute_ws.create_tensor(impl_name)
        impl = group_dict[group_name]
        impl.FromNumpy(numpy.array(group[name], 'float32'), False)
    # Reduce grads in the process group.
    process_group = distributed.get_group()
    if process_group is not None:
        Function.apply(
            'Collective', grads[0].device, grads, outputs=grads,
            operation='ALLREDUCE', reduction='MEAN',
            **process_group.arguments)
    # Apply updates.
    Function.apply(
        self._op_type, params_with_grad[0].device, grads,
        outputs=params_with_grad, name=group['name'], weight_decay=None)
def full(
    size,
    fill_value,
    out=None,
    dtype='int64',
    device=None,
    requires_grad=False,
):
    """Return a tensor filled with a scalar.

    Examples:

    ```python
    print(torch.full((1, 2), 1))  # [[1, 1]]
    ```

    Parameters
    ----------
    size : int...
        The output shape.
    fill_value : number
        The scalar to fill.
    out : dragon.vm.torch.Tensor, optional
        The output tensor.
    dtype : str, optional, default='int64'
        The data type of output tensor.
    device : dragon.vm.torch.device, optional
        The device of output tensor.
    requires_grad : bool, optional, default=False
        Record gradient for output tensor or not.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    size = nest.flatten(size)
    device = out.device if out else (device or cpp.device())
    out = Function.apply(
        'Fill', device, [], outputs=[out], dtype=dtype,
        value=float(fill_value), ndim=len(size), dims=size)
    out._requires_grad = requires_grad
    return out
def _set_parameter(self, data, layer_id=0, param_id=0, param_type='matrix'):
    """Set the data of a parameter."""
    return Function.apply(
        'RNNParamSet', data.device, [data], outputs=[self.weights],
        rnn_mode=self.mode, bidirectional=self.bidirectional,
        input_size=self.input_size, hidden_size=self.hidden_size,
        layer_id=layer_id, param_id=param_id, param_type=param_type)
def forward(self, input, hx=None):
    # Pack the input, the packed weights, and the optional initial states.
    inputs = [input, self.weights]
    if hx is not None:
        inputs += nest.flatten(hx)
    # LSTM produces (output, h, c); other modes produce (output, h).
    outputs = [None] * (3 if self.mode == 'lstm' else 2)
    outputs = Function.apply(
        'Recurrent', input.device, inputs, outputs=outputs,
        rnn_mode=self.mode, bidirectional=self.bidirectional,
        input_size=self.input_size, hidden_size=self.hidden_size,
        dropout=self.dropout, phase='TRAIN' if self.training else 'TEST')
    output, hidden = outputs[0], outputs[1:]
    return output, hidden[0] if len(hidden) == 1 else hidden