def __init__(
    self,
    num_features,
    eps=1e-5,
    momentum=0.1,
    affine=True,
    track_running_stats=True,
):
    """Create a ``_BatchNorm`` module."""
    super(_BatchNorm, self).__init__()
    self.num_features = num_features
    self.eps = eps
    self.momentum = momentum
    self.affine = affine
    self.track_running_stats = track_running_stats
    if self.affine:
        self.weight = Parameter(Tensor(num_features))
        self.bias = Parameter(Tensor(num_features))
    else:
        self.register_buffer('weight', constant_ops.ones(num_features))
        self.register_buffer('bias', constant_ops.zeros(num_features))
    if self.track_running_stats:
        self.num_batches_tracked = 0
    else:
        self.num_batches_tracked = None
    self.register_buffer('running_mean', constant_ops.zeros(num_features))
    self.register_buffer('running_var', constant_ops.ones(num_features))
    self.reset_parameters()

def __init__(
    self,
    num_groups,
    num_channels,
    eps=1e-5,
    affine=True,
):
    r"""Create a ``GroupNorm`` module.

    Parameters
    ----------
    num_groups : int
        The number of groups.
    num_channels : int
        The number of channels.
    eps : float, optional, default=1e-5
        The value of :math:`\epsilon`.
    affine : bool, optional, default=True
        ``True`` to apply an affine transformation.

    """
    super(GroupNorm, self).__init__()
    self.num_groups = num_groups
    self.num_channels = num_channels
    self.eps = eps
    self.affine = affine
    if self.affine:
        self.weight = Parameter(Tensor(num_channels))
        self.bias = Parameter(Tensor(num_channels))
    else:
        self.register_buffer('weight', constant_ops.ones(num_channels))
        self.register_buffer('bias', constant_ops.zeros(num_channels))
    self.reset_parameters()

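# Usage sketch (illustrative, not taken from the source). Assumes the module above is
# exposed as ``dragon.vm.torch.nn.GroupNorm``; the sizes below are made-up examples.
from dragon.vm import torch
from dragon.vm.torch import nn

m = nn.GroupNorm(num_groups=4, num_channels=32)
x = torch.empty(2, 32, 8, 8)  # NCHW input with 32 channels
y = m(x)                      # normalizes over 4 groups of 8 channels each
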
def __init__(
    self,
    embed_dim,
    num_heads,
    dropout=0.,
    bias=True,
    kdim=None,
    vdim=None,
):
    """Create a ``MultiheadAttention`` module.

    Parameters
    ----------
    embed_dim : int
        The dimension of input embeddings.
    num_heads : int
        The number of parallel heads.
    dropout : float, optional, default=0.
        The probability to zero an attention weight.
    bias : bool, optional, default=True
        Add a bias tensor to output or not.
    kdim : int, optional
        The dimension of key embedding.
    vdim : int, optional
        The dimension of value embedding.

    """
    super(MultiheadAttention, self).__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
    self.num_heads = num_heads
    self.dropout = dropout
    self.head_dim = embed_dim // num_heads
    if self.head_dim * num_heads != self.embed_dim:
        raise ValueError('<embed_dim> must be divisible by <num_heads>.')
    if not self._qkv_same_embed_dim:
        self.q_proj_weight = Parameter(Tensor(embed_dim, embed_dim))
        self.k_proj_weight = Parameter(Tensor(embed_dim, self.kdim))
        self.v_proj_weight = Parameter(Tensor(embed_dim, self.vdim))
        self.register_parameter('in_proj_weight', None)
    else:
        self.in_proj_weight = Parameter(Tensor(3 * embed_dim, embed_dim))
        self.register_parameter('q_proj_weight', None)
        self.register_parameter('k_proj_weight', None)
        self.register_parameter('v_proj_weight', None)
    if bias:
        self.in_proj_bias = Parameter(Tensor(3 * embed_dim))
    else:
        self.register_parameter('in_proj_bias', None)
    self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
    self.reset_parameters()

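# Usage sketch (illustrative, not taken from the source). Assumes the module above is
# exposed as ``dragon.vm.torch.nn.MultiheadAttention`` and follows the PyTorch-style
# (length, batch, embed_dim) input layout; shapes and the return convention are assumptions.
from dragon.vm import torch
from dragon.vm.torch import nn

attn = nn.MultiheadAttention(embed_dim=64, num_heads=8, dropout=0.1)
q = torch.empty(10, 2, 64)  # (target_len, batch, embed_dim)
k = torch.empty(12, 2, 64)  # (source_len, batch, embed_dim)
v = torch.empty(12, 2, 64)
result = attn(q, k, v)      # attention output; exact return convention is an assumption
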
def _register_parameters(self):
    """Register and flatten the parameters."""
    if self.mode == 'lstm':
        gate_size = 4 * self.hidden_size
    elif self.mode == 'gru':
        gate_size = 3 * self.hidden_size
    else:
        gate_size = self.hidden_size
    # Compute the shape of weight and bias.
    self._matrix_shape, self._bias_shape = [], []
    for layer in range(self.num_layers):
        for direction in range(self.num_directions):
            layer_input_size = self.input_size if layer == 0 \
                else self.hidden_size * self.num_directions
            w_ih_shape = [gate_size, layer_input_size]
            w_hh_shape = [gate_size, self.hidden_size]
            b_ih_shape, b_hh_shape = [gate_size], [gate_size]
            # W (0 ~ 3), R (4 ~ 7)
            self._matrix_shape.extend([w_ih_shape, w_hh_shape])
            # Bw (0 ~ 3), Br (4 ~ 7)
            self._bias_shape.extend([b_ih_shape, b_hh_shape])
    # Compute total number of parameters.
    self._weights_count = 0
    for shape in self._matrix_shape + self._bias_shape:
        self._weights_count += int(numpy.prod(shape))
    # Create the flat float32 weights.
    self.weights = Parameter(Tensor(self._weights_count))

def new_leaf(size, kwargs):
    """Return a leaf tensor from optional kwargs."""
    device = kwargs.get('device', cpp.device())
    return Tensor(*size,
                  dtype=kwargs.get('dtype', 'float32'),
                  device=cpp.device() if device is None else device,
                  requires_grad=kwargs.get('requires_grad', False))

def tensor(data, dtype=None, device=None, requires_grad=False):
    """Create a tensor initializing from the given data.

    Parameters
    ----------
    data : array_like
        The data to initialize from.
    dtype : str, optional
        The optional data type.
    device : dragon.vm.torch.device, optional
        The optional device of returned tensor.
    requires_grad : bool, optional, default=False
        ``True`` to record gradient for returned tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    array_data = numpy.array(data, copy=True)
    if dtype is None:
        dtype = str(array_data.dtype)
    else:
        array_data = array_data.astype(dtype)
    return Tensor(
        array_data,
        dtype=dtype,
        device=cpp.device() if device is None else device,
        requires_grad=requires_grad,
    )

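# Usage sketch (illustrative, not taken from the source). Assumes ``tensor`` is exposed
# as ``dragon.vm.torch.tensor``.
from dragon.vm import torch

a = torch.tensor([[1, 2], [3, 4]], dtype='float32')  # copies the python data
b = torch.tensor([1., 2., 3.], requires_grad=True)   # dtype inferred from the numpy array
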
def _get_grad(execute_ws, param, summed=False):
    """Return the grad of a parameter."""
    grad_impl = execute_ws.get_tensor(
        param.id + ('_grad_sum' if summed else '_grad'))
    if grad_impl:
        return Tensor(device=param.device, impl=grad_impl)
    return None

def scalar(input, dtype, device):
    """Return a cached scalar tensor.

    Parameters
    ----------
    input : number
        The scalar value.
    dtype : str
        The data type of output tensor.
    device : dragon.vm.torch.device
        The device of output tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    if isinstance(input, Tensor):
        return input
    try:
        input = float(input)
    except (TypeError, ValueError):
        raise ValueError('<input> should be a python number, got {}.'.format(
            type(input).__name__))
    cached_name = '%s(%s)' % (dtype, input)
    default_ws = workspace.get_workspace()
    impl = default_ws.get_tensor(cached_name)
    if impl is None:
        impl = default_ws.create_tensor(cached_name)
        impl.FromNumpy(numpy.array(input, dtype), True)
    return Tensor(device=device, impl=impl)

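# Illustrative note (not taken from the source): repeated calls with the same value and
# data type resolve to the same cached workspace tensor, keyed by a name such as
# ``float32(2.0)``. The ``cpp.device()`` default below reuses names from this module.
two = scalar(2.0, 'float32', cpp.device())
also_two = scalar(2.0, 'float32', cpp.device())  # reuses the cached impl 'float32(2.0)'
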
def empty(*size, dtype=None, device=None, requires_grad=False):
    """Return a tensor filled with uninitialized data.

    Parameters
    ----------
    size : int...
        The sizes of output tensor.
    dtype : str, optional
        The optional data type.
    device : dragon.vm.torch.device, optional
        The optional device option.
    requires_grad : bool, optional, default=False
        Whether to compute the gradient if necessary.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    return Tensor(
        *size,
        dtype=dtype if dtype else 'float32',
        device=cpp.device() if device is None else device,
        requires_grad=requires_grad,
    )

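# Usage sketch (illustrative, not taken from the source). Assumes ``empty`` is exposed
# as ``dragon.vm.torch.empty``; the returned contents are uninitialized.
from dragon.vm import torch

buf = torch.empty(2, 3)               # float32 on the default device
ibuf = torch.empty(4, dtype='int64')  # explicit data type
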
def __init__(self, input_size, hidden_size, bias, num_chunks):
    """Create an ``RNNCellBase`` module."""
    super(RNNCellBase, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bias = bias
    self.weight_ih = Parameter(Tensor(num_chunks * hidden_size, input_size))
    self.weight_hh = Parameter(Tensor(num_chunks * hidden_size, hidden_size))
    if bias:
        self.bias_ih = Parameter(Tensor(num_chunks * hidden_size))
        self.bias_hh = Parameter(Tensor(num_chunks * hidden_size))
    else:
        self.register_parameter('bias_ih', None)
        self.register_parameter('bias_hh', None)
    self.reset_parameters()

def __init__(self, num_embeddings, embedding_dim, padding_idx=None):
    """Create an ``Embedding`` module.

    Parameters
    ----------
    num_embeddings : int
        The dictionary size.
    embedding_dim : int
        The embedding dimension.
    padding_idx : int, optional
        The index that returns zero embeddings.

    """
    super(Embedding, self).__init__()
    self.num_embeddings = num_embeddings
    self.embedding_dim = embedding_dim
    if padding_idx is not None:
        if padding_idx > 0:
            if padding_idx >= self.num_embeddings:
                raise ValueError('<padding_idx> must be within <num_embeddings>.')
        elif padding_idx < 0:
            if padding_idx < -self.num_embeddings:
                raise ValueError('<padding_idx> must be within <num_embeddings>.')
            padding_idx = self.num_embeddings + padding_idx
    self.padding_idx = padding_idx
    self.weight = Parameter(Tensor(num_embeddings, embedding_dim))
    self.reset_parameters()

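# Usage sketch (illustrative, not taken from the source). Assumes the module above is
# exposed as ``dragon.vm.torch.nn.Embedding``; sizes are made-up examples.
from dragon.vm import torch
from dragon.vm.torch import nn

emb = nn.Embedding(num_embeddings=1000, embedding_dim=64, padding_idx=0)
ids = torch.tensor([[1, 2, 0], [4, 0, 0]], dtype='int64')
vecs = emb(ids)  # expected shape (2, 3, 64); index 0 yields zero embeddings
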
def _steal_grad(ws, param, grad_accum=False):
    """Steal the grad from backend."""
    impl = ws.GetTensor(param.id + ('_grad[accum]' if grad_accum else '_grad'))
    if impl is not None:
        return Tensor(device=param.device, impl=impl)
    return None

def from_dlpack(dlpack):
    """Create a tensor sharing the dlpack data.

    Parameters
    ----------
    dlpack : PyCapsule
        The capsule object of a dlpack tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The tensor with the dlpack data.

    """
    current_ws = workspace.get_workspace()
    tensor = Tensor(device=None)
    tensor._gc = current_ws.collectors.TENSOR
    tensor._impl = current_ws.create_tensor(
        tensor._gc.alloc('${DLPACK}')).FromDLPack(dlpack)
    tensor._device = cpp.device(*tensor._impl.device)
    return tensor

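# Usage sketch (illustrative, not taken from the source). Assumes a matching exporter
# ``to_dlpack`` lives alongside ``from_dlpack`` (the module path below is an assumption),
# so a tensor can round-trip through a DLPack capsule without copying.
from dragon.vm import torch
from dragon.vm.torch.utils import dlpack  # assumed module path

x = torch.tensor([1., 2., 3.])
capsule = dlpack.to_dlpack(x)    # assumed exporter producing a PyCapsule
y = dlpack.from_dlpack(capsule)  # shares the underlying memory with ``x``
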
def __init__(self, in_features, out_features, bias=True):
    """Create a ``Linear`` module.

    Parameters
    ----------
    in_features : int
        The number of input features.
    out_features : int
        The number of output features.
    bias : bool, optional, default=True
        Add a bias tensor to output or not.

    """
    super(Linear, self).__init__()
    self.in_features = in_features
    self.out_features = out_features
    self.weight = Parameter(Tensor(out_features, in_features))
    if bias:
        self.bias = Parameter(Tensor(out_features))
    else:
        self.bias = None
    self.reset_parameters()

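# Usage sketch (illustrative, not taken from the source). Assumes the module above is
# exposed as ``dragon.vm.torch.nn.Linear``; sizes are made-up examples.
from dragon.vm import torch
from dragon.vm.torch import nn

fc = nn.Linear(in_features=128, out_features=10)
x = torch.empty(32, 128)
logits = fc(x)  # expected shape (32, 10): x @ weight.T + bias
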
def __init__(self, num_parameters=1, init=0.25):
    """Create a ``PReLU`` module.

    Parameters
    ----------
    num_parameters : int, optional, default=1
        The number of parameters.
    init : float, optional, default=0.25
        The initial value of parameters.

    """
    super(PReLU, self).__init__()
    self.num_parameters = num_parameters
    self.weight = Parameter(Tensor(num_parameters).fill_(init))

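# Usage sketch (illustrative, not taken from the source). Assumes the module above is
# exposed as ``dragon.vm.torch.nn.PReLU``. With the default ``num_parameters=1`` a single
# slope is shared; passing the channel count gives one learnable slope per channel.
from dragon.vm import torch
from dragon.vm.torch import nn

act = nn.PReLU()                     # one shared slope, initialized to 0.25
act_c = nn.PReLU(num_parameters=64)  # per-channel slopes for 64-channel inputs
y = act(torch.tensor([-1., 2.], dtype='float32'))  # negatives scaled by the learned slope
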
def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
    r"""Create a ``LayerNorm`` module.

    Parameters
    ----------
    normalized_shape : Union[int, Sequence[int]]
        The size normalized over the last dimensions.
    eps : float, optional, default=1e-5
        The value of :math:`\epsilon`.
    elementwise_affine : bool, optional, default=True
        ``True`` to apply an affine transformation.

    """
    super(LayerNorm, self).__init__()
    self.normalized_shape = tuple(nest.flatten(normalized_shape))
    self.eps = eps
    self.elementwise_affine = elementwise_affine
    if self.elementwise_affine:
        self.weight = Parameter(Tensor(*self.normalized_shape))
        self.bias = Parameter(Tensor(*self.normalized_shape))
    else:
        self.register_buffer('weight', constant_ops.ones(*self.normalized_shape))
        self.register_buffer('bias', constant_ops.zeros(*self.normalized_shape))
    self.reset_parameters()

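# Usage sketch (illustrative, not taken from the source). Assumes the module above is
# exposed as ``dragon.vm.torch.nn.LayerNorm``; shapes are made-up examples.
from dragon.vm import torch
from dragon.vm.torch import nn

ln = nn.LayerNorm(64)       # normalize over the last dimension of size 64
x = torch.empty(8, 16, 64)
y = ln(x)                   # per-position normalization plus the elementwise affine
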
def scalar_to_tensor(input, dtype, device):
    """Return a cached scalar tensor."""
    if isinstance(input, Tensor):
        return input
    try:
        input = float(input)
    except (TypeError, ValueError):
        raise ValueError('<input> should be a python number, got {}.'.format(
            type(input).__name__))
    name = '/share/scalar/{}/{}'.format(dtype, str(input))
    current_ws = workspace.get_workspace()
    if not current_ws.has_tensor(name):
        current_ws.feed_tensor(name, numpy.array(input, dtype=dtype))
    return Tensor(device=device,
                  impl=current_ws.GetTensor(name),
                  requires_grad=False)

def from_numpy(ndarray):
    """Create a tensor converting from the given numpy array.

    Parameters
    ----------
    ndarray : numpy.ndarray
        The numpy array data.

    Returns
    -------
    dragon.vm.torch.Tensor
        The torch tensor.

    """
    if not isinstance(ndarray, numpy.ndarray):
        raise TypeError('<ndarray> should be a numpy array.')
    return Tensor(ndarray, copy=False)

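# Usage sketch (illustrative, not taken from the source). Assumes ``from_numpy`` is
# exposed as ``dragon.vm.torch.from_numpy``; with ``copy=False`` above, the tensor is
# intended to share memory with the array.
import numpy as np
from dragon.vm import torch

arr = np.arange(6, dtype='float32').reshape(2, 3)
t = torch.from_numpy(arr)  # no copy: changes to ``arr`` are visible through ``t``
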
def from_dlpack(dlpack):
    """Create a tensor sharing the dlpack data.

    Parameters
    ----------
    dlpack : PyCapsule
        The capsule object of a dlpack tensor.

    Returns
    -------
    dragon.vm.torch.Tensor
        The tensor with the dlpack data.

    """
    default_ws = workspace.get_workspace()
    impl = default_ws.create_tensor(scope='DLPack').FromDLPack(dlpack)
    return Tensor(device=cpp.device(*impl.device),
                  impl=impl,
                  deleter=default_ws._handle_pool)

def _set_parameter(self, layer_id, param_id, param_type, param):
    """Set parameter to the flattened weights."""
    if isinstance(param, numpy.ndarray):
        param = Tensor(
            param,
            copy=False,
            requires_grad=self.weights.requires_grad,
        )
    return nn_funcs.RNNParamSet \
        .instantiate(
            self.weights.device,
            layer_id=layer_id,
            param_id=param_id,
            param_type=param_type,
            mode=self.mode,
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            num_directions=self.num_directions,
        ).apply(param, self.weights)

def flatten_parameters(self):
    """Flatten parameters into a single contiguous weight tensor."""
    gate_size = self._num_gates * self.hidden_size
    # Compute the shape of weight and bias.
    matrix_shapes, bias_shapes = [], []
    for layer in range(self.num_layers):
        for direction in range(int(self.bidirectional) + 1):
            layer_input_size = self.input_size if layer == 0 \
                else self.hidden_size * self.num_directions
            w_ih_shape = [gate_size, layer_input_size]
            w_hh_shape = [gate_size, self.hidden_size]
            b_ih_shape, b_hh_shape = [gate_size], [gate_size]
            matrix_shapes.extend([w_ih_shape, w_hh_shape])
            bias_shapes.extend([b_ih_shape, b_hh_shape])
    # Compute total number of parameters.
    self._weights_count = 0
    self._weights_shapes = matrix_shapes + bias_shapes
    for shape in self._weights_shapes:
        self._weights_count += math_util.prod(shape)
    # Create the flat float32 weights.
    self.weights = Parameter(Tensor(self._weights_count))

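# Worked count (illustrative, not taken from the source): for an LSTM (4 gates) with
# input_size=8, hidden_size=16, num_layers=1 and bidirectional=True, each direction
# stores W_ih (64x8), W_hh (64x16), b_ih (64) and b_hh (64), so the flat weights hold
# 2 * (64*8 + 64*16 + 64 + 64) = 3328 float32 values.
gate_size = 4 * 16
per_direction = gate_size * 8 + gate_size * 16 + gate_size + gate_size
assert 2 * per_direction == 3328
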
def run_operator(
    op_def,
    inputs,
    outputs,
    no_grad=False,
    pre_callback=None,
):
    """Compute the outputs."""
    requires_grad = False
    input_names, output_names = [], []
    default_tape = backprop.get_default_tape()

    # Add inputs.
    for input in inputs:
        input_names.append(input.id)
        if input.requires_grad:
            requires_grad = True

    # Determine the gradient flags.
    requires_grad = requires_grad and not no_grad
    requires_grad = requires_grad and grad_mode.is_grad_enabled()
    if default_tape is not None:
        no_grad = no_grad and not default_tape.retain_graph
        requires_grad = requires_grad or default_tape.retain_graph

    # Allocate outputs.
    ws = workspace.get_workspace()
    output_scope = context.get_eager_scope(requires_grad)
    gc = ws.collectors  # Garbage collectors
    for i, spec in enumerate(outputs):
        if isinstance(spec, six.string_types):
            output_names.append(spec)
        else:
            if isinstance(spec, device_cls):
                impl = ws.create_tensor(gc.TENSOR.alloc(output_scope))
                outputs[i] = Tensor(device=spec, gc=gc.TENSOR, impl=impl)
            output_names.append(outputs[i].id)

    # Generate the OpDef.
    op_handle = None  # Optional resource handle
    op_def = op_def.DeriveTo(input_names, output_names)

    # Flag the outputs.
    if len(inputs) > 0 and not no_grad:
        if requires_grad:
            instance_tape = backprop.Tape()
            for input in inputs:
                instance_tape.merge_from(input._tape)
                if not input._requires_grad:
                    instance_tape.add_empty_grad(input.id + '_grad')
            op_def.name = op_handle = gc.OP.alloc(op_def.type)
            instance_tape.add_operation(op_def)
            for output in outputs:
                output._tape = instance_tape
                output._requires_grad = True
        else:
            for output in outputs:
                output._requires_grad = False

    # Record this operation for future use.
    if default_tape is not None:
        default_tape.add_def(op_def)
        if default_tape.retain_op_handles and op_handle is None:
            op_def.name = gc.OP.alloc(op_def.type)

    # Dispatch the computation.
    if pre_callback is not None:
        pre_callback(ws, op_def.name)
    ws.run_operator(op_def)

    # Return the outputs.
    return outputs[0] if len(outputs) == 1 else outputs

def forward(inputs, run_config, **kwargs):
    """Compute the function outputs."""
    graph_tape = tapes.get_tape()
    execute_ws = workspace.get_workspace()
    device = run_config['device']

    # Add inputs.
    inputs_id = []
    enable_grad = False
    for i, input in enumerate(inputs):
        inputs_id.append(input.id)
        if input.requires_grad:
            enable_grad = True
        if run_config['check_device'] and input._device != device:
            raise RuntimeError(
                'Mismatched device between function and '
                'element {} of input tensors. ({} vs. {})'
                .format(i, device, input._device))

    # Unify grad modes.
    no_grad = run_config['no_grad']
    no_grad = no_grad or not grad_mode.is_grad_enabled()
    enable_grad = enable_grad and not no_grad
    if hasattr(graph_tape, '_exporting'):
        # Ensure the intermediates are saved for the exporting graph.
        no_grad, enable_grad = False, True

    # Add outputs.
    outputs, outputs_id = [], []
    output_specs = kwargs.get('outputs', [None])
    for i, spec in enumerate(output_specs):
        if spec is None:
            outputs.append(Tensor(
                device=device.copy(),
                impl=execute_ws.create_tensor(
                    scope=context.get_variable_scope(enable_grad)),
                deleter=execute_ws._handle_pool))
            outputs_id.append(outputs[i].id)
        else:
            if isinstance(spec, Tensor):
                spec._device = device.copy()
                outputs.append(spec)
                outputs_id.append(spec.id)
            else:
                outputs_id.append(spec)
            if enable_grad and outputs_id[-1] not in inputs_id:
                raise RuntimeError('Output tensor should be in inputs if requires grad.')

    # Specialize def for given inputs and outputs.
    op_name = ''  # Optional operator name.
    op_def = run_config['def'].DeriveTo(inputs_id, outputs_id)

    # Record def if grad is enabled.
    if len(inputs) > 0 and not no_grad:
        if enable_grad:
            op_tape = tapes.OrderedTape()
            op_name = execute_ws.create_handle(op_def.type)
            op_def.name = op_name
            op_tape.add_element(op_def)
            op_tape.add_handle(op_name)
            for input in inputs:
                op_tape.add_source(input)
            for output in outputs:
                op_tape.merge_from(output._tape)
            for output in outputs:
                output._tape = op_tape
                output._requires_grad = True
        else:
            for output in outputs:
                output._requires_grad = False

    # Ensure the named operator for the tracing graph.
    if hasattr(graph_tape, '_tracing'):
        if not op_name:
            op_name = execute_ws.create_handle(op_def.type)
            op_def.name = op_name
        graph_tape.add_element(op_def)
        graph_tape.add_handle(op_name)

    # Save inputs for the checkpointing graph.
    if hasattr(graph_tape, '_checkpointing'):
        for input in inputs:
            if input._tape:
                if input._retains_grad:
                    graph_tape.add_source(input)
            elif input._requires_grad:
                graph_tape.add_source(input)

    # Emit to dispatch this execution.
    for feed_key, value_type in run_config['feed_dict'].items():
        dest = execute_ws.create_tensor(op_name + '/' + feed_key)
        dest.FromNumpy(numpy.array(kwargs[feed_key], value_type), True)
    execute_ws.run_operator(op_def)

    # Return the single output or the list of outputs.
    return outputs[0] if len(outputs) == 1 else outputs

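# Minimal sketch (illustrative, not taken from the source) of the tape idea used above:
# when gradients are enabled, the executed op def is recorded on a tape together with
# its handle and source tensors, so a later backward pass can walk the recorded defs in
# reverse. This toy class only stores (op_type, inputs, outputs) tuples.
class ToyTape(object):
    """Record executed ops for a later backward walk."""

    def __init__(self):
        self.elements = []

    def add_element(self, op_type, inputs, outputs):
        self.elements.append((op_type, inputs, outputs))


tape = ToyTape()
tape.add_element('Add', ['a', 'b'], ['c'])
tape.add_element('MatMul', ['c', 'w'], ['y'])
for op_type, ins, outs in reversed(tape.elements):
    pass  # a real backward pass would derive and run gradient defs here
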