def _create_variable(v, name, shape):
    # Create and initialize variables
    class Variable:
        pass

    parameter = v.type == "Parameter"
    variable_instance = None
    if parameter:
        if v.initializer.type == 'Normal':
            initializer = NormalInitializer(v.initializer.multiplier)
        elif v.initializer.type == 'NormalAffineHe' or v.initializer.type == 'NormalAffineHeForward':
            initializer = (lambda shape: NormalInitializer(calc_normal_std_he_forward(
                shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'NormalAffineHeBackward':
            initializer = (lambda shape: NormalInitializer(calc_normal_std_he_backward(
                shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'NormalAffineGlorot':
            initializer = (lambda shape: NormalInitializer(calc_normal_std_glorot(
                shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'NormalConvolutionHe' or v.initializer.type == 'NormalConvolutionHeForward':
            initializer = (lambda shape: NormalInitializer(calc_normal_std_he_forward(
                shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'NormalConvolutionHeBackward':
            initializer = (lambda shape: NormalInitializer(calc_normal_std_he_backward(
                shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'NormalConvolutionGlorot':
            initializer = (lambda shape: NormalInitializer(calc_normal_std_glorot(
                shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'Uniform':
            initializer = UniformInitializer(
                lim=[-v.initializer.multiplier, v.initializer.multiplier])
        elif v.initializer.type == 'UniformAffineGlorot':
            initializer = (lambda shape: UniformInitializer(calc_uniform_lim_glorot(
                shape[0], numpy.prod(shape[1:])))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'UniformConvolutionGlorot':
            initializer = (lambda shape: UniformInitializer(calc_uniform_lim_glorot(
                shape[1], shape[0], kernel=shape[2:]))(shape) * v.initializer.multiplier)
        elif v.initializer.type == 'Constant':
            initializer = ConstantInitializer(value=v.initializer.multiplier)
        else:
            initializer = None
        variable_instance = get_parameter_or_create(name, shape, initializer)
    else:
        # create empty variable, memory will be allocated in network.setup()
        # after network optimization
        variable_instance = nn.Variable()

    variable = Variable()
    variable.name = name
    variable.parameter = parameter
    variable.shape = shape
    variable.variable_instance = variable_instance
    return variable
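# The dispatch above maps initializer names from the network definition onto
# nnabla's helpers (calc_normal_std_he_forward/_backward, calc_normal_std_glorot,
# calc_uniform_lim_glorot). As a rough reference, the sketch below spells out the
# scales these names usually denote. It assumes the textbook Glorot/He
# definitions and uses hypothetical helper names; it is not a copy of nnabla's
# implementation.
import numpy as np


def glorot_uniform_limit(inmaps, outmaps, kernel=(1,)):
    # Uniform limit ~ calc_uniform_lim_glorot: sqrt(6 / (fan_in + fan_out)).
    k = np.prod(kernel)
    d = np.sqrt(6.0 / (inmaps * k + outmaps * k))
    return -d, d


def he_forward_std(inmaps, outmaps, kernel=(1,)):
    # Normal std ~ calc_normal_std_he_forward: sqrt(2 / fan_in).
    return np.sqrt(2.0 / (inmaps * np.prod(kernel)))


def he_backward_std(inmaps, outmaps, kernel=(1,)):
    # Normal std ~ calc_normal_std_he_backward: sqrt(2 / fan_out).
    return np.sqrt(2.0 / (outmaps * np.prod(kernel)))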
def res_unit(x, scope_name, rng, dn=False, test=False):
    C = x.shape[1]
    with nn.parameter_scope(scope_name):
        # Conv -> BN -> Relu
        with nn.parameter_scope("conv1"):
            w_init = UniformInitializer(calc_uniform_lim_glorot(
                C, C // 2, kernel=(1, 1)), rng=rng)
            h = PF.convolution(x, C // 2, kernel=(1, 1), pad=(0, 0),
                               w_init=w_init, with_bias=False)
            h = PF.batch_normalization(h, batch_stat=not test)
            h = F.relu(h)
        # Conv -> BN -> Relu
        with nn.parameter_scope("conv2"):
            w_init = UniformInitializer(calc_uniform_lim_glorot(
                C // 2, C // 2, kernel=(3, 3)), rng=rng)
            h = PF.convolution(h, C // 2, kernel=(3, 3), pad=(1, 1),
                               w_init=w_init, with_bias=False)
            h = PF.batch_normalization(h, batch_stat=not test)
            h = F.relu(h)
        # Conv -> BN
        with nn.parameter_scope("conv3"):
            w_init = UniformInitializer(calc_uniform_lim_glorot(
                C // 2, C, kernel=(1, 1)), rng=rng)
            h = PF.convolution(h, C, kernel=(1, 1), pad=(0, 0),
                               w_init=w_init, with_bias=False)
            h = PF.batch_normalization(h, batch_stat=not test)
        # Residual -> Relu
        h = F.relu(h + x)
        # Maxpooling
        if dn:
            h = F.max_pooling(h, kernel=(2, 2), stride=(2, 2))
        return h
def convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None,
                group=1, itr=1, w_init=None, b_init=None, base_axis=1,
                fix_parameters=False, rng=None, with_bias=True, sn=True,
                test=False, init_scale=1.0):
    """
    """
    if w_init is None:
        l, u = calc_uniform_lim_glorot(
            inp.shape[base_axis], outmaps, tuple(kernel))
        l, u = init_scale * l, init_scale * u
        w_init = UniformInitializer((l, u), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
        w_init, not fix_parameters)
    w_sn = spectral_normalization_for_conv(w, itr=itr, test=test) if sn else w
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)
    return F.convolution(inp, w_sn, b, base_axis, pad, stride, dilation, group)
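# Minimal usage sketch for the spectrally normalized convolution above. It
# assumes this helper (and its spectral_normalization_for_conv dependency) is
# importable in the current module; the shapes and scope name are illustrative.
import nnabla as nn

x = nn.Variable((8, 64, 32, 32))  # (batch, channels, H, W)
with nn.parameter_scope("d_block1/conv"):
    # Creates "W" (and "b") under the current scope; when sn=True the weight is
    # replaced by its power-iteration-normalized version before F.convolution.
    y = convolution(x, 128, kernel=(3, 3), pad=(1, 1), sn=True, test=False)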
def __init__(self, n_inmaps, n_outmaps, base_axis=1, w_init=None, b_init=None,
             fix_parameters=False, rng=None, with_bias=True):
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            n_inmaps, n_outmap), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w_shape = (n_inmaps, n_outmap)
    w = nn.Variable.from_numpy_array(
        w_init(w_shape)).apply(need_grad=not fix_parameters)
    b = None
    if with_bias:
        b_shape = (n_outmap, )
        b = nn.Variable.from_numpy_array(
            b_init(b_shape)).apply(need_grad=not fix_parameters)
    self.W = w
    self.b = b
    self.base_axis = base_axis
def __init__(self, in_features, out_features, base_axis=1, w_init=None,
             b_init=None, rng=None, bias=True, name=''):
    Module.__init__(self, name=name)
    self._scope_name = f'<linear at {hex(id(self))}>'
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            in_features, out_features), rng=rng)
    self._W = Parameter((in_features, out_features),
                        initializer=w_init, scope=self._scope_name)
    self._b = None
    if bias:
        if b_init is None:
            b_init = ConstantInitializer()
        self._b = Parameter((out_features, ),
                            initializer=b_init, scope=self._scope_name)
    self._base_axis = base_axis
    self._in_features = in_features
    self._out_features = out_features
def affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None, itr=1,
           fix_parameters=False, rng=None, with_bias=True, sn=True, test=False):
    """
    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        inmaps = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, not fix_parameters)
    w_sn = spectral_normalization_for_affine(
        w, itr=itr, test=test) if sn else w
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, not fix_parameters)
    return F.affine(inp, w_sn, b, base_axis)
def __init__(self, inmaps, outmaps, kernel, pad=None, stride=None,
             dilation=None, group=1, w_init=None, b_init=None, base_axis=1,
             fix_parameters=False, rng=None, with_bias=True):
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, outmaps, tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w_shape = (outmaps, inmaps // group) + tuple(kernel)
    w = nn.Variable.from_numpy_array(
        w_init(w_shape)).apply(need_grad=not fix_parameters)
    b = None
    if with_bias:
        b_shape = (outmaps, )
        b = nn.Variable.from_numpy_array(
            b_init(b_shape)).apply(need_grad=not fix_parameters)
    self.W = w
    self.b = b
    self.base_axis = base_axis
    self.pad = pad
    self.stride = stride
    self.dilation = dilation
    self.group = group
def inq_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, num_bits=4, inq_iterations=(), selection_algorithm='random', seed=-1, w_init=None, i_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True): """Incremental Network Quantization Convolution Layer During training, the weights are sequentially quantized to power-of-two values, which allows the training of a multiplierless network. Using `inq_iterations`, one can specify after how many forward passes half of the learnable weights are fixed and quantized to powers-of-two. After reaching the last value in `inq_iterations`, all weights are fixed. For more details, please refer to the reference. Reference: Zhou A, Yao A, Guo Y, Xu L, Chen Y. Incremental network quantization: Towards lossless CNNs with low-precision weights. <https://arxiv.org/abs/1702.03044> Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it was a matrix. n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. num_bits (int): Number of bits per weight. Value has to be larger than 1 as one bit is already used to code the value "0" inq_iterations (tuple of int): Tuple of iteration numbers at which we fix half of the weights. selection_algorithm (str): Chooses algorithm that is used to decide which weights are fixed. ("largest_abs" ... fix weights with largest absolute value, "random" ... fix weights randomly) seed (int): Random seed for INQ algorithm w_init (~nnabla.initializer.BaseInitializer): Initializer for the weight. i_init (~nnabla.initializer.BaseInitializer): Initializer for the indicators (0 ... learnable, 1 ... fixed). b_init (~nnabla.initializer.BaseInitializer): Initializer for the bias. fix_parameters (bool): When set to `True`, the weight and bias will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. Returns: :class:`~nnabla.Variable` """ if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng) if i_init is None: i_init = ConstantInitializer() if b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create( "W", (outmaps, inp.shape[base_axis]) + tuple(kernel), w_init, not fix_parameters) i = get_parameter_or_create( "I", (outmaps, inp.shape[base_axis]) + tuple(kernel), i_init, False) b = None if with_bias: b = get_parameter_or_create( "b", (outmaps,), b_init, not fix_parameters) return F.inq_convolution(inp, w, i, b, base_axis, pad, stride, dilation, group, num_bits, inq_iterations, selection_algorithm, seed)
def conv(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1,
         w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None,
         with_bias=True, use_wscale=True, use_he_backward=False):
    """
    """
    # Use He backward
    if use_he_backward:
        std = calc_normal_std_he_backward(
            inp.shape[base_axis], outmaps, kernel=kernel)
    else:
        std = calc_normal_std_he_forward(
            inp.shape[base_axis], outmaps, kernel=kernel)

    # W init
    if w_init is None and use_wscale:
        # Equalized Learning Rate
        w_init = NormalInitializer(1.)
        w = get_parameter_or_create(
            "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
            w_init, not fix_parameters)
        w *= std
    elif w_init is None and not use_wscale:
        w_init = NormalInitializer(std)
        w = get_parameter_or_create(
            "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
            w_init, not fix_parameters)
    else:
        if w_init is None:
            w_init = UniformInitializer(calc_uniform_lim_glorot(
                inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
        w = get_parameter_or_create(
            "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
            w_init, not fix_parameters)

    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)

    return F.convolution(inp, w, b, base_axis, pad, stride, dilation, group)
def __init__(self, in_channels, out_channels, kernel, pad=None, stride=None,
             dilation=None, group=1, w_init=None, b_init=None, base_axis=1,
             fix_parameters=False, rng=None, with_bias=True,
             channel_last=False, name=''):
    Module.__init__(self, name=name)
    self._scope_name = f'<convolution at {hex(id(self))}>'
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            in_channels, out_channels, tuple(kernel)), rng=rng)
    w_shape = (out_channels, in_channels // group) + tuple(kernel)
    b_shape = (out_channels, )
    self._b = None
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    if fix_parameters:
        self._W = nn.Variable.from_numpy_array(w_init(w_shape))
        if with_bias:
            self._b = nn.Variable.from_numpy_array(b_init(b_shape))
    else:
        self._W = Parameter(w_shape, initializer=w_init,
                            scope=self._scope_name)
        if with_bias:
            self._b = Parameter(b_shape, initializer=b_init,
                                scope=self._scope_name)
    self._base_axis = base_axis
    self._pad = pad
    self._stride = stride
    self._dilation = dilation
    self._group = group
    self._kernel = kernel
    self._in_channels = in_channels
    self._out_channels = out_channels
    self._channel_last = channel_last
    self._fix_parameters = fix_parameters
    self._rng = rng
def convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None,
                group=1, w_init=None, b_init=None, base_axis=1,
                fix_parameters=False, rng=None, with_bias=True):
    """
    N-D Convolution with a bias term.

    For Dilated Convolution (a.k.a. Atrous Convolution), refer to:

    - Chen et al., DeepLab: Semantic Image Segmentation with Deep Convolutional
      Nets, Atrous Convolution, and Fully Connected CRFs.
      https://arxiv.org/abs/1606.00915

    - Yu et al., Multi-Scale Context Aggregation by Dilated Convolutions.
      https://arxiv.org/abs/1511.07122

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the
            number of output channels). For example, to apply convolution on an
            input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For
            example, to apply convolution on an image with a 3 (height) by 5
            (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections
            across channels more sparse by grouping connections along map
            direction.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample
            dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will
            not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: N-D array.
    """
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
        w_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)
    return F.convolution(inp, w, b, base_axis, pad, stride, dilation, group)
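# Usage sketch for the Glorot-initialized convolution above, called through
# nnabla.parametric_functions; the shapes below are illustrative.
import nnabla as nn
import nnabla.parametric_functions as PF

x = nn.Variable((4, 3, 32, 32))
with nn.parameter_scope("conv1"):
    y = PF.convolution(x, outmaps=16, kernel=(3, 3), pad=(1, 1), stride=(1, 1))
print(y.shape)  # (4, 16, 32, 32)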
def affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None,
           fix_parameters=False, rng=None, with_bias=True, use_wscale=True,
           use_he_backward=False):
    """
    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))

    # Use He backward
    if use_he_backward:
        std = calc_normal_std_he_backward(inp.shape[base_axis], n_outmap)
    else:
        std = calc_normal_std_he_forward(inp.shape[base_axis], n_outmap)

    # W init
    if w_init is None and use_wscale:
        # Equalized Learning Rate
        w_init = NormalInitializer(1.)
        w = get_parameter_or_create(
            "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
            w_init, not fix_parameters)
        w *= std
    elif w_init is None and not use_wscale:
        w_init = NormalInitializer(std)
        w = get_parameter_or_create(
            "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
            w_init, not fix_parameters)
    else:
        if w_init is None:
            w_init = UniformInitializer(calc_uniform_lim_glorot(
                inp.shape[base_axis], n_outmap), rng=rng)
        w = get_parameter_or_create(
            "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
            w_init, not fix_parameters)

    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    b = None
    if with_bias:
        b = get_parameter_or_create("b", n_outmaps, b_init, not fix_parameters)

    return F.affine(inp, w, b, base_axis)
def dense(x, output_dim, base_axis=1, w_init=None,
          b_init=I.ConstantInitializer(0), activation=F.tanh):
    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(np.prod(x.shape[1:]), output_dim))
    return activation(
        PF.affine(x, output_dim, base_axis=base_axis,
                  w_init=w_init, b_init=b_init))
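# Usage sketch for dense(): a Glorot-uniform affine followed by an activation.
# It assumes dense() and its imports (nnabla as nn, nnabla.functions as F,
# nnabla.initializer as I, nnabla.parametric_functions as PF) are already in
# scope; the shapes are illustrative.
import nnabla as nn
import nnabla.functions as F

x = nn.Variable((16, 128))
with nn.parameter_scope("dense1"):
    h = dense(x, 64)                       # tanh activation by default
with nn.parameter_scope("dense2"):
    out = dense(h, 10, activation=F.relu)  # any nnabla activation function works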
def deconvolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None,
                  group=1, w_init=None, b_init=None, base_axis=1,
                  fix_parameters=False, rng=None, with_bias=True):
    """
    Deconvolution layer.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of deconvolution kernels (which is equal to the
            number of output channels). For example, to apply deconvolution on
            an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For
            example, to apply deconvolution on an image with a 3 (height) by 5
            (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections
            across channels sparser by grouping connections along map
            direction.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample
            dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will
            not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: N-D array.
    """
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            outmaps, inp.shape[base_axis], tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (inp.shape[base_axis], outmaps // group) + tuple(kernel),
        w_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)
    return F.deconvolution(inp, w, b, base_axis, pad, stride, dilation, group)
def affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None,
           fix_parameters=False, rng=None, with_bias=True):
    """
    The affine layer, also known as the fully connected layer. Computes

    .. math::
        {\\mathbf y} = {\\mathbf A} {\\mathbf x} + {\\mathbf b}.

    where :math:`{\\mathbf x}, {\\mathbf y}` are the inputs and outputs
    respectively, and :math:`{\\mathbf A}, {\\mathbf b}` are constants.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape
            (:math:`M_0 \\times \\ldots \\times M_{B-1} \\times D_B \\times \\ldots \\times D_N`).
            Dimensions before and after base_axis are flattened as if it is a
            matrix.
        n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output
            neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample
            dimensions.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        fix_parameters (bool): When set to `True`, the weights and biases will
            not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: :math:`(B + 1)`-D array.
        (:math:`M_0 \\times \\ldots \\times M_{B-1} \\times L`)
    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        inmaps = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, not fix_parameters)
    return F.affine(inp, w, b, base_axis)
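# Usage sketch for the affine layer above, called through
# nnabla.parametric_functions. Dimensions from base_axis onward are flattened,
# so a (32, 3, 8, 8) input is treated as 32 samples of 3*8*8 = 192 features.
import nnabla as nn
import nnabla.parametric_functions as PF

x = nn.Variable((32, 3, 8, 8))
with nn.parameter_scope("fc"):
    y = PF.affine(x, 10)   # W: (192, 10), b: (10,)
print(y.shape)  # (32, 10)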
def discriminator(x, y, scopename="discriminator", maps=64, n_classes=1000,
                  s=4, test=False, sn=True):
    with nn.parameter_scope(scopename):
        # Resblocks
        h = optblock_d(x, y, "block-1", n_classes, maps * 1, test=test, sn=sn)
        h = resblock_d(h, y, "block-2", n_classes, maps * 2, test=test, sn=sn)
        h = attnblock(h, sn=sn, test=test)
        h = resblock_d(h, y, "block-3", n_classes, maps * 4, test=test, sn=sn)
        h = resblock_d(h, y, "block-4", n_classes, maps * 8, test=test, sn=sn)
        h = resblock_d(h, y, "block-5", n_classes, maps * 16, test=test, sn=sn)
        h = resblock_d(h, y, "block-6", n_classes, maps * 16,
                       downsample=False, test=test, sn=sn)
        # Last affine
        #h = F.leaky_relu(h, 0.2)
        h = F.relu(h)
        h = F.sum(h, axis=(2, 3))
        o0 = affine(h, 1, sn=sn, test=test)
        # Projection discriminator
        l, u = calc_uniform_lim_glorot(n_classes, maps * 16)
        e = embed(y, n_classes, maps * 16,
                  initializer=UniformInitializer((l, u)),
                  name="projection", sn=sn, test=test)
        o1 = F.sum(h * e, axis=1, keepdims=True)
        return o0 + o1
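# The two outputs summed above form a projection discriminator
# (Miyato & Koyama, "cGANs with Projection Discriminator", 2018): a
# class-agnostic score o0 = psi(phi(x)) plus a class-conditional projection
# o1 = <embed(y), phi(x)>. A toy numeric illustration with made-up numbers:
import numpy as np

h = np.array([0.5, -1.0, 2.0])     # pooled features phi(x) (width maps*16 in practice)
e_y = np.array([0.1, 0.3, -0.2])   # learned class embedding embed(y)
o0 = 0.7                           # output of the final affine psi(phi(x))
o1 = float(np.sum(h * e_y))        # projection term, -0.65 here
print(o0 + o1)                     # discriminator logit D(x, y) ~= 0.05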
def masked_convolution(inp, outmaps, kernel, pad=None, stride=None,
                       dilation=None, group=1, w_init=None, b_init=None,
                       base_axis=1, fix_parameters=False, rng=None,
                       with_bias=True):
    """
    """
    if w_init is None:
        w_init = UniformInitializer(calc_uniform_lim_glorot(
            inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
        w_init, not fix_parameters)
    mask_w = get_parameter_or_create(
        "Mw", w.shape, ConstantInitializer(0.), False)
    w_masked = w * mask_w
    b = None
    b_masked = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps, ), b_init, not fix_parameters)
        mask_b = get_parameter_or_create(
            "Mb", b.shape, ConstantInitializer(0.), False)
        b_masked = b * mask_b
    return F.convolution(inp, w_masked, b_masked, base_axis, pad, stride,
                         dilation, group)
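# The "Mw"/"Mb" parameters above are non-learnable masks multiplied into the
# weights, the usual way to simulate pruning. A hedged sketch of editing a mask
# after the graph is built; the "masked_conv" scope name is illustrative and
# depends on the parameter_scope the layer was created under.
import nnabla as nn

params = nn.get_parameters(grad_only=False)
mask = params["masked_conv/Mw"]   # same shape as the weight "W"
mask.d[...] = 1.0                 # enable every connection
mask.d[0] = 0.0                   # prune all weights feeding output channel 0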
def binary_connect_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, w_init=None, wb_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True): """Binary Connect Convolution, multiplier-less inner-product. Binary Connect Convolution is the convolution function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} sign(w_{n, m, i, j}) x_{m, a + i, b + j}. Therefore :math:`sign(w_i)` is either :math:`1` or :math:`-1` and the inner product simplifies to addition. This function should be used together with BatchNormalization. References: M. Courbariaux, Y. Bengio, and J.-P. David. "BinaryConnect: Training Deep Neural Networks with binary weights during propagations." Advances in Neural Information Processing Systems. 2015. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`) 2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync. 3) CPU and GPU implementations now use float value for `binary_weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels sparser by grouping connections along map direction. w_init (~nnabla.initializer.BaseInitializer): Initializer for weight. wb_init (~nnabla.initializer.BaseInitializer): Initializer for binary weight. b_init (~nnabla.initializer.BaseInitializer): Initializer for bias. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. 
Returns: :class:`~nnabla.Variable` """ if w_init is None: w_init = UniformInitializer(calc_uniform_lim_glorot( inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng) if wb_init is None: wb_init = UniformInitializer(calc_uniform_lim_glorot( inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng) if b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create("W", (outmaps, inp.shape[base_axis]) + tuple(kernel), w_init, not fix_parameters) wb = get_parameter_or_create("Wb", (outmaps, inp.shape[base_axis]) + tuple(kernel), w_init, not fix_parameters) b = None if with_bias: b = get_parameter_or_create("b", (outmaps, ), b_init, not fix_parameters) return F.binary_connect_convolution(inp, w, wb, b, base_axis, pad, stride, dilation, group)
def binary_weight_affine(inp, n_outmaps, base_axis=1, w_init=None, wb_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True): """Binary Weight Affine, multiplier-less inner-product with a scale factor. Binary Weight Affine is the affine function, but the inner product in this function is the following, .. math:: y_j = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}} \sum_{i} sign(w_{ji}) x_i Therefore :math:`sign(w_{ji})` is either :math:`1` or :math:`-1` and the inner product simplifies to addition followed by scaling factor :math:`\\alpha = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}}`. The number of ::math:`\\alpha` is the outmaps of the affine function. References: Rastegari, Mohammad, et al. "XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks." arXiv preprint arXiv:1603.05279 (2016). .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`) 2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync. 3) CPU and GPU implementations now use float value for `binary_weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it was a matrix. n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. w_init (~nnabla.initializer.BaseInitializer): Initializer for the weight. wb_init (~nnabla.initializer.BaseInitializer): Initializer for the binary weight. b_init (~nnabla.initializer.BaseInitializer): Initializer for the bias. fix_parameters (bool): When set to `True`, the weight and bias will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. Returns: :class:`~nnabla.Variable` """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: fan_in = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer(calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng) if wb_init is None: fan_in = np.prod(inp.shape[base_axis:]) wb_init = UniformInitializer(calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng) if b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create("W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, not fix_parameters) wb = get_parameter_or_create("Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, wb_init, not fix_parameters) alpha = get_parameter_or_create("alpha", n_outmaps, ConstantInitializer(0), False) b = None if with_bias: b = get_parameter_or_create("b", n_outmaps, b_init, not fix_parameters) return F.binary_weight_affine(inp, w, wb, alpha, b, base_axis)
def binary_connect_affine(inp, n_outmaps, base_axis=1, w_init=None, wb_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True): """Binary Connect Affine, multiplier-less inner-product. Binary Connect Affine is an affine function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_i = \sum_{i} sign(w_i) x_i. Therefore :math:`sign(w_i)` is either :math:`1` or :math:`-1` and the inner product simplifies to addition. This function should be used together with Batch Normalization. References: M. Courbariaux, Y. Bengio, and J.-P. David. "BinaryConnect: Training Deep Neural Networks with binary weights during propagations." Advances in Neural Information Processing Systems. 2015. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`) 2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync. 3) CPU and GPU implementations now use float value for `binary_weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix. n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. w_init (~nnabla.initializer.BaseInitializer): Initializer for weight. wb_init (~nnabla.initializer.BaseInitializer): Initializer for binary weight. b_init (~nnabla.initializer.BaseInitializer): Initializer for bias. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. Returns: :class:`~nnabla.Variable` """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: fan_in = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer(calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng) if wb_init is None: fan_in = np.prod(inp.shape[base_axis:]) wb_init = UniformInitializer(calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng) if b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create("W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, not fix_parameters) wb = get_parameter_or_create("Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, wb_init, not fix_parameters) b = None if with_bias: b = get_parameter_or_create("b", n_outmaps, b_init, not fix_parameters) return F.binary_connect_affine(inp, w, wb, b, base_axis)
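# Toy illustration of the BinaryConnect inner product described in the
# docstrings above, y = sum_i sign(w_i) x_i (numbers made up): the stored float
# weights contribute only their signs at run time.
import numpy as np

w = np.array([0.7, -0.05, 0.3, -1.2])    # stored float weights
x = np.array([2.0, 1.0, -1.0, 0.5])
y_float = float(np.dot(w, x))            # ordinary affine term: 0.45
y_binary = float(np.dot(np.sign(w), x))  # additions/subtractions only: -0.5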
def lstm(x, mask, state_size, w_init=None, inner_w_init=None, forget_bias_init=I.ConstantInitializer(1), b_init=I.ConstantInitializer(0), initial_state=None, dropout=0, train=True, rng=np.random): """ x: (batch_size, length, input_size) mask: (batch_size, length) """ batch_size, length, input_size = x.shape if w_init is None: w_init = I.UniformInitializer( I.calc_uniform_lim_glorot(input_size, state_size)) if inner_w_init is None: inner_w_init = orthogonal retain_prob = 1.0 - dropout z_w = nn.Variable((batch_size, 4, input_size), need_grad=False) z_w.d = 1 z_u = nn.Variable((batch_size, 4, state_size), need_grad=False) z_u.d = 1 if dropout > 0: if train: z_w = F.dropout(z_w, p=retain_prob) z_u = F.dropout(z_u, p=retain_prob) z_w *= retain_prob z_u *= retain_prob z_w = F.reshape(z_w, (batch_size, 4, 1, input_size)) z_w = F.broadcast(z_w, (batch_size, 4, length, input_size)) z_w = F.split(z_w, axis=1) z_u = F.split(z_u, axis=1) xi = z_w[0] * x xf = z_w[1] * x xc = z_w[2] * x xo = z_w[3] * x with nn.parameter_scope("lstm"): # (batch_size, length, state_size) xi = PF.affine(xi, state_size, base_axis=2, w_init=w_init, b_init=b_init, name="Wi") xf = PF.affine(xf, state_size, base_axis=2, w_init=w_init, b_init=forget_bias_init, name="Wf") xc = PF.affine(xc, state_size, base_axis=2, w_init=w_init, b_init=b_init, name="Wc") xo = PF.affine(xo, state_size, base_axis=2, w_init=w_init, b_init=b_init, name="Wo") if initial_state is None: h = nn.Variable((batch_size, state_size), need_grad=False) h.data.zero() else: h = initial_state c = nn.Variable((batch_size, state_size), need_grad=False) c.data.zero() # (batch_size, state_size) xi = split(xi, axis=1) xf = split(xf, axis=1) xc = split(xc, axis=1) xo = split(xo, axis=1) mask = F.reshape(mask, [batch_size, length, 1]) # (batch_size, length, 1) mask = F.broadcast(mask, [batch_size, length, state_size]) # (batch_size, state_size) mask = split(mask, axis=1) hs = [] cs = [] with nn.parameter_scope("lstm"): for i, f, c2, o, m in zip(xi, xf, xc, xo, mask): i_t = PF.affine(z_u[0] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Ui") i_t = F.sigmoid(i + i_t) f_t = PF.affine(z_u[1] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Uf") f_t = F.sigmoid(f + f_t) c_t = PF.affine(z_u[2] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Uc") c_t = f_t * c + i_t * F.tanh(c2 + c_t) o_t = PF.affine(z_u[3] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Uo") o_t = F.sigmoid(o + o_t) h_t = o_t * F.tanh(c_t) h_t = (1 - m) * h + m * h_t c_t = (1 - m) * c + m * c_t h = h_t c = c_t h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False) c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False) hs.append(h_t) cs.append(c_t) return concatenate(*hs, axis=1), concatenate(*cs, axis=1)
def cond_att_lstm(x, parent_index, mask, context, context_mask, state_size, att_hidden_size, initial_state=None, initial_cell=None, hist=None, dropout=0, train=True, w_init=None, inner_w_init=None, b_init=I.ConstantInitializer(0), forget_bias_init=I.ConstantInitializer(1)): """ x: (batch_size, length, input_size) parent_index: (batch_size, length) mask: (batch_size, length) context: (batch_size, context_length, context_size) context_mask: (batch_size, context_length) hist: (batch_size, l, state_size) """ batch_size, length, input_size = x.shape _, context_length, context_size = context.shape if w_init is None: w_init = I.UniformInitializer( I.calc_uniform_lim_glorot(input_size, state_size)) if inner_w_init is None: inner_w_init = orthogonal retain_prob = 1.0 - dropout z_w = nn.Variable((batch_size, 4, input_size), need_grad=False) z_w.d = 1 z_u = nn.Variable((batch_size, 4, state_size), need_grad=False) z_u.d = 1 if dropout > 0: if train: z_w = F.dropout(z_w, p=retain_prob) z_u = F.dropout(z_u, p=retain_prob) z_w *= retain_prob z_u *= retain_prob z_w = F.reshape(z_w, (batch_size, 4, 1, input_size)) z_w = F.broadcast(z_w, (batch_size, 4, length, input_size)) z_w = F.split(z_w, axis=1) z_u = F.split(z_u, axis=1) xi = z_w[0] * x xf = z_w[1] * x xc = z_w[2] * x xo = z_w[3] * x with nn.parameter_scope("cond_att_lstm"): # (batch_size, length, state_size) with nn.parameter_scope("lstm"): xi = PF.affine( xi, state_size, base_axis=2, w_init=w_init, b_init=b_init, name="Wi") xf = PF.affine( xf, state_size, base_axis=2, w_init=w_init, b_init=forget_bias_init, name="Wf") xc = PF.affine( xc, state_size, base_axis=2, w_init=w_init, b_init=b_init, name="Wc") xo = PF.affine( xo, state_size, base_axis=2, w_init=w_init, b_init=b_init, name="Wo") with nn.parameter_scope("context"): # context_att_trans: (batch_size, context_size, att_hidden_size) context_att_trans = PF.affine( context, att_hidden_size, base_axis=2, w_init=w_init, b_init=b_init, name="layer1_c") if initial_state is None: h = nn.Variable((batch_size, state_size), need_grad=False) h.data.zero() else: h = initial_state if initial_cell is None: c = nn.Variable((batch_size, state_size), need_grad=False) c.data.zero() else: c = initial_cell if hist is None: hist = nn.Variable((batch_size, 1, state_size), need_grad=False) hist.data.zero() # (batch_size, state_size) xi = split(xi, axis=1) xf = split(xf, axis=1) xc = split(xc, axis=1) xo = split(xo, axis=1) mask = F.reshape(mask, [batch_size, length, 1]) # (batch_size, length, 1) mask = F.broadcast(mask, [batch_size, length, state_size]) # (batch_size, state_size) mask = split(mask, axis=1) # (batch_size, max_action_length) parent_index = parent_index + 1 # index == 0 means that parent is root # (batch_size) parent_index = split(parent_index, axis=1) hs = [] cs = [] ctx = [] for i, f, c2, o, m, p in zip(xi, xf, xc, xo, mask, parent_index): h_num = hist.shape[1] with nn.parameter_scope("context"): h_att_trans = PF.affine( h, att_hidden_size, with_bias=False, w_init=w_init, name="layer1_h") # (batch_size, att_hidden_size) h_att_trans = F.reshape(h_att_trans, (batch_size, 1, att_hidden_size)) h_att_trans = F.broadcast( h_att_trans, (batch_size, context_length, att_hidden_size)) att_hidden = F.tanh(context_att_trans + h_att_trans) att_raw = PF.affine( att_hidden, 1, base_axis=2, w_init=w_init, b_init=b_init) # (batch_size, context_length, 1) att_raw = F.reshape(att_raw, (batch_size, context_length)) ctx_att = F.exp(att_raw - F.max(att_raw, axis=1, keepdims=True)) ctx_att = ctx_att * context_mask ctx_att = 
ctx_att / F.sum(ctx_att, axis=1, keepdims=True) ctx_att = F.reshape(ctx_att, (batch_size, context_length, 1)) ctx_att = F.broadcast(ctx_att, (batch_size, context_length, context_size)) ctx_vec = F.sum( context * ctx_att, axis=1) # (batch_size, context_size) # parent_history p = F.reshape(p, (batch_size, 1)) p = F.one_hot(p, (h_num, )) p = F.reshape(p, (batch_size, 1, h_num)) par_h = F.batch_matmul(p, hist) # [batch_size, 1, state_size] par_h = F.reshape(par_h, (batch_size, state_size)) with nn.parameter_scope("lstm"): i_t = PF.affine( z_u[0] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Ui") i_t += PF.affine( ctx_vec, state_size, w_init=inner_w_init(context_size, state_size), with_bias=False, name="Ci") i_t += PF.affine( par_h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Pi") i_t = F.sigmoid(i + i_t) f_t = PF.affine( z_u[1] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Uf") f_t += PF.affine( ctx_vec, state_size, w_init=inner_w_init(context_size, state_size), with_bias=False, name="Cf") f_t += PF.affine( par_h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Pf") f_t = F.sigmoid(f + f_t) c_t = PF.affine( z_u[2] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Uc") c_t += PF.affine( ctx_vec, state_size, w_init=inner_w_init(context_size, state_size), with_bias=False, name="Cc") c_t += PF.affine( par_h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Pc") c_t = f_t * c + i_t * F.tanh(c2 + c_t) o_t = PF.affine( z_u[3] * h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Uo") o_t += PF.affine( ctx_vec, state_size, w_init=inner_w_init(context_size, state_size), with_bias=False, name="Co") o_t += PF.affine( par_h, state_size, w_init=inner_w_init(state_size, state_size), with_bias=False, name="Po") o_t = F.sigmoid(o + o_t) h_t = o_t * F.tanh(c_t) h_t = (1 - m) * h + m * h_t c_t = (1 - m) * c + m * c_t h = h_t c = c_t h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False) c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False) ctx_vec = F.reshape( ctx_vec, (batch_size, 1, context_size), inplace=False) hs.append(h_t) cs.append(c_t) ctx.append(ctx_vec) hist = F.concatenate( hist, h_t, axis=1) # (batch_size, h_num + 1, state_size) return concatenate( *hs, axis=1), concatenate( *cs, axis=1), concatenate( *ctx, axis=1), hist
def cifar100_resnet23_prediction(image, ctx, test=False): """ Construct ResNet 23 """ # Residual Unit def res_unit(x, scope_name, rng, dn=False, test=False): C = x.shape[1] with nn.parameter_scope(scope_name): # Conv -> BN -> Relu with nn.parameter_scope("conv1"): w_init = UniformInitializer( calc_uniform_lim_glorot(C, C / 2, kernel=(1, 1)), rng=rng) h = PF.convolution(x, C / 2, kernel=(1, 1), pad=(0, 0), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) h = F.relu(h) # Conv -> BN -> Relu with nn.parameter_scope("conv2"): w_init = UniformInitializer( calc_uniform_lim_glorot(C / 2, C / 2, kernel=(3, 3)), rng=rng) h = PF.convolution(h, C / 2, kernel=(3, 3), pad=(1, 1), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) h = F.relu(h) # Conv -> BN with nn.parameter_scope("conv3"): w_init = UniformInitializer( calc_uniform_lim_glorot(C / 2, C, kernel=(1, 1)), rng=rng) h = PF.convolution(h, C, kernel=(1, 1), pad=(0, 0), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) # Residual -> Relu h = F.relu(h + x) # Maxpooling if dn: h = F.max_pooling(h, kernel=(2, 2), stride=(2, 2)) return h # Random generator for using the same init parameters in all devices rng = np.random.RandomState(0) nmaps = 384 ncls = 100 # Conv -> BN -> Relu with nn.context_scope(ctx): with nn.parameter_scope("conv1"): # Preprocess if not test: image = F.image_augmentation(image, contrast=1.0, angle=0.25, flip_lr=True) image.need_grad = False w_init = UniformInitializer( calc_uniform_lim_glorot(3, nmaps, kernel=(3, 3)), rng=rng) h = PF.convolution(image, nmaps, kernel=(3, 3), pad=(1, 1), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) h = F.relu(h) h = res_unit(h, "conv2", rng, False) # -> 32x32 h = res_unit(h, "conv3", rng, True) # -> 16x16 h = res_unit(h, "conv4", rng, False) # -> 16x16 h = res_unit(h, "conv5", rng, True) # -> 8x8 h = res_unit(h, "conv6", rng, False) # -> 8x8 h = res_unit(h, "conv7", rng, True) # -> 4x4 h = res_unit(h, "conv8", rng, False) # -> 4x4 h = F.average_pooling(h, kernel=(4, 4)) # -> 1x1 w_init = UniformInitializer( calc_uniform_lim_glorot(int(np.prod(h.shape[1:])), ncls, kernel=(1, 1)), rng=rng) pred = PF.affine(h, ncls, w_init=w_init) return pred
def cifar10_resnet23_prediction(image, ctx, test=False): """ Construct ResNet 23 """ # Residual Unit def res_unit(x, scope_name, rng, dn=False, test=False): C = x.shape[1] with nn.parameter_scope(scope_name): # Conv -> BN -> Relu with nn.parameter_scope("conv1"): w_init = UniformInitializer(calc_uniform_lim_glorot( C, C / 2, kernel=(1, 1)), rng=rng) h = PF.convolution(x, C / 2, kernel=(1, 1), pad=(0, 0), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) h = F.relu(h) # Conv -> BN -> Relu with nn.parameter_scope("conv2"): w_init = UniformInitializer(calc_uniform_lim_glorot( C / 2, C / 2, kernel=(3, 3)), rng=rng) h = PF.convolution(h, C / 2, kernel=(3, 3), pad=(1, 1), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) h = F.relu(h) # Conv -> BN with nn.parameter_scope("conv3"): w_init = UniformInitializer(calc_uniform_lim_glorot( C / 2, C, kernel=(1, 1)), rng=rng) h = PF.convolution(h, C, kernel=(1, 1), pad=(0, 0), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) # Residual -> Relu h = F.relu(h + x) # Maxpooling if dn: h = F.max_pooling(h, kernel=(2, 2), stride=(2, 2)) return h # Random generator for using the same init parameters in all devices rng = np.random.RandomState(0) nmaps = 64 ncls = 10 # Conv -> BN -> Relu with nn.context_scope(ctx): with nn.parameter_scope("conv1"): # Preprocess if not test: image = F.image_augmentation(image, contrast=1.0, angle=0.25, flip_lr=True) image.need_grad = False w_init = UniformInitializer(calc_uniform_lim_glorot(3, nmaps, kernel=(3, 3)), rng=rng) h = PF.convolution(image, nmaps, kernel=(3, 3), pad=(1, 1), w_init=w_init, with_bias=False) h = PF.batch_normalization(h, batch_stat=not test) h = F.relu(h) h = res_unit(h, "conv2", rng, False) # -> 32x32 h = res_unit(h, "conv3", rng, True) # -> 16x16 h = res_unit(h, "conv4", rng, False) # -> 16x16 h = res_unit(h, "conv5", rng, True) # -> 8x8 h = res_unit(h, "conv6", rng, False) # -> 8x8 h = res_unit(h, "conv7", rng, True) # -> 4x4 h = res_unit(h, "conv8", rng, False) # -> 4x4 h = F.average_pooling(h, kernel=(4, 4)) # -> 1x1 w_init = UniformInitializer(calc_uniform_lim_glorot(int( np.prod(h.shape[1:])), ncls, kernel=(1, 1)), rng=rng) pred = PF.affine(h, ncls, w_init=w_init) return pred
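# Hedged sketch of how the ResNet-23 prediction graphs above are typically
# instantiated and attached to a loss; the context setup and batch size are
# illustrative, and cifar10_resnet23_prediction is the function defined above.
import nnabla as nn
import nnabla.functions as F
from nnabla.ext_utils import get_extension_context

ctx = get_extension_context("cpu")   # or "cudnn" on GPU machines
nn.set_default_context(ctx)

image = nn.Variable((64, 3, 32, 32))
label = nn.Variable((64, 1))
pred = cifar10_resnet23_prediction(image, ctx, test=False)
loss = F.mean(F.softmax_cross_entropy(pred, label))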
def binary_weight_convolution(inp, outmaps, kernel,
                              pad=None, stride=None, dilation=None, group=1,
                              w_init=None, wb_init=None, b_init=None,
                              base_axis=1, fix_parameters=False, rng=None,
                              with_bias=True):
    """Binary Weight Convolution, multiplier-less inner-product with a scale factor.

    Binary Weight Convolution is the convolution function, but the inner
    product in this function is the following,

    .. math::

        y_{n, a, b} = \\frac{1}{\\|\\mathbf{w}_n\\|_{\\ell_1}} \sum_{m} \sum_{i} \sum_{j} sign(w_{n, m, i, j}) x_{m, a + i, b + j}.

    Therefore :math:`sign(w_{n, m, i, j})` is either :math:`1` or :math:`-1`
    and the inner product simplifies to addition followed by the scaling
    factor :math:`\\alpha = \\frac{1}{\\|\\mathbf{w}_n\\|_{\\ell_1}}`.
    The number of :math:`\\alpha` values equals the number of outmaps of the
    convolution function.

    References:

        Rastegari, Mohammad, et al. "XNOR-Net: ImageNet Classification Using
        Binary Convolutional Neural Networks." arXiv preprint
        arXiv:1603.05279 (2016).

    .. note::

        1) If you would like to share weights between some layers, please
        make sure to share the standard, floating-point weights (`weight`)
        and not the binarized weights (`binary_weight`).

        2) The weights and the binary weights become synced only after
        :func:`~nnabla._variable.Variable.forward` is called, and not after
        a call to :func:`~nnabla._variable.Variable.backward`. To access the
        parameters of the network, remember to call
        :func:`~nnabla._variable.Variable.forward` once before doing so,
        otherwise the float weights and the binary weights will not be in
        sync.

        3) CPU and GPU implementations now use float values for
        `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the
            number of output channels). For example, to apply convolution on
            an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For
            example, to apply convolution on an image with a 3 (height) by
            5 (width) two-dimensional kernel, specify (3, 5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections
            across channels sparser by grouping connections along the map
            direction.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        wb_init (~nnabla.initializer.BaseInitializer): Initializer for binary weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        base_axis (int): Dimensions up to `base_axis` are treated as the
            sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases
            will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if wb_init is None:
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        w_init, not fix_parameters)
    # Use wb_init (not w_init) for the binary weight; the original passed
    # w_init here, leaving wb_init unused.
    wb = get_parameter_or_create(
        "Wb", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        wb_init, not fix_parameters)
    alpha = get_parameter_or_create(
        "alpha", (outmaps,), ConstantInitializer(0), False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, not fix_parameters)
    return F.binary_weight_convolution(inp, w, wb, alpha, b, base_axis,
                                       pad, stride, dilation, group)
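def _example_binary_weight_inner_product():
    # Hedged NumPy sketch (not part of the original source) of the arithmetic
    # described in the binary_weight_convolution docstring: the float weight
    # is replaced by its sign, and the result is rescaled per output map by
    # alpha = 1 / ||w_n||_1. Shapes and values are illustrative assumptions.
    import numpy as np
    rng = np.random.RandomState(0)
    w = rng.randn(8, 3, 3, 3)      # (outmaps, inmaps, kh, kw)
    x = rng.randn(3, 3, 3)         # a single receptive-field patch
    wb = np.sign(w)                # binarized weights in {-1, +1}
    alpha = 1.0 / np.abs(w).reshape(8, -1).sum(axis=1)   # per-outmap scale
    y = alpha * (wb.reshape(8, -1) @ x.reshape(-1))      # docstring equation
    return y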
def quantized_affine(inp, n_outmaps, base_axis=1,
                     w_init=None, b_init=None,
                     fix_parameters=False, rng=None, with_bias=True,
                     quantization_w=None, quantization_b=None):
    """Quantized Affine.

    Quantized affine with

    .. math::

        y_j = \sum_{i} Q_w(w_{ji}) x_i + Q_b(b_j),

    where :math:`Q_w(.)` is the weight quantization function and
    :math:`Q_b(.)` is the bias quantization function, respectively.

    .. note::

        1) If you would like to share weights between some layers, please
        make sure to share the standard, floating-point weights (`weight`)
        and not the quantized weights (`quantized weight`).

        2) The weights and the quantized weights become synced only after
        :func:`~nnabla._variable.Variable.forward` is called, and not after
        a call to :func:`~nnabla._variable.Variable.backward`. To access the
        parameters of the network, remember to call
        :func:`~nnabla._variable.Variable.forward` once before doing so,
        otherwise the float weights and the quantized weights will not be in
        sync.

        3) CPU and GPU implementations now use float values for
        `quantized weight`, since this function is only for simulation
        purposes.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape
            (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`).
            Dimensions before and after base_axis are flattened as if it is
            a matrix.
        n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of
            output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the
            sample dimensions.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias.
        fix_parameters (bool): When set to `True`, the weights and biases
            will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.
        quantization_w (function): Quantization function applied to the
            weights. Use `None` to leave the weights unquantized.
        quantization_b (function): Quantization function applied to the
            bias. Use `None` to leave the bias unquantized.

    Returns:
        :class:`~nnabla.Variable`: :math:`(B + 1)`-D array.
        (:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`)

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        inmaps = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()

    # Floating-point weight
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, True, not fix_parameters)

    # Quantized weight
    if quantization_w is not None:
        w_q = get_parameter_or_create(
            "W_q", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
            w_init, False)
        # Link computation graph
        real_w_q = quantization_w(w)
        real_w_q.persistent = True
        w_q.data = real_w_q.data
    else:
        real_w_q = w

    # Bias
    b = None
    b_q = None
    real_b_q = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, True, not fix_parameters)
        if quantization_b is not None:
            b_q = get_parameter_or_create(
                "b_q", n_outmaps, b_init, False)
            # Link computation graph
            real_b_q = quantization_b(b)
            real_b_q.persistent = True
            b_q.data = real_b_q.data
        else:
            real_b_q = b

    return F.affine(inp, real_w_q, real_b_q, base_axis)
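def _example_quantized_affine_usage():
    # Hedged usage sketch (not part of the original source): calling
    # quantized_affine with a simple sign-based weight quantizer. Using
    # F.sign as `quantization_w`, the layer shape, and the scope name are
    # illustrative assumptions; any function mapping a Variable to a Variable
    # of the same shape can be passed.
    import nnabla as nn
    import nnabla.functions as F

    x = nn.Variable((16, 128))     # (batch, inmaps)
    with nn.parameter_scope("qaffine"):
        y = quantized_affine(x, 10,
                             quantization_w=lambda w: F.sign(w),
                             quantization_b=None)
    return y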
def binary_connect_affine(inp, n_outmaps,
                          base_axis=1,
                          w_init=None, wb_init=None, b_init=None,
                          fix_parameters=False, rng=None, with_bias=True):
    """Binary Connect Affine, multiplier-less inner-product.

    Binary Connect Affine is an affine function, except the definition of
    the inner product is modified. The input-output relation of this
    function is as follows:

    .. math::

        y_j = \sum_{i} sign(w_{ji}) x_i.

    Therefore :math:`sign(w_{ji})` is either :math:`1` or :math:`-1` and the
    inner product simplifies to addition. This function should be used
    together with Batch Normalization.

    References:

        M. Courbariaux, Y. Bengio, and J.-P. David. "BinaryConnect: Training
        Deep Neural Networks with binary weights during propagations."
        Advances in Neural Information Processing Systems. 2015.

    .. note::

        1) If you would like to share weights between some layers, please
        make sure to share the standard, floating-point weights (`weight`)
        and not the binarized weights (`binary_weight`).

        2) The weights and the binary weights become synced only after
        :func:`~nnabla._variable.Variable.forward` is called, and not after
        a call to :func:`~nnabla._variable.Variable.backward`. To access the
        parameters of the network, remember to call
        :func:`~nnabla._variable.Variable.forward` once before doing so,
        otherwise the float weights and the binary weights will not be in
        sync.

        3) CPU and GPU implementations now use float values for
        `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape
            (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`).
            Dimensions before and after base_axis are flattened as if it is
            a matrix.
        n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output
            neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the
            sample dimensions.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for weight.
        wb_init (~nnabla.initializer.BaseInitializer): Initializer for binary weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for bias.
        fix_parameters (bool): When set to `True`, the weights and biases
            will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if wb_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, not fix_parameters)
    wb = get_parameter_or_create(
        "Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        wb_init, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, not fix_parameters)
    return F.binary_connect_affine(inp, w, wb, b, base_axis)
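def _example_binary_connect_block():
    # Hedged sketch (not part of the original source): binary_connect_affine
    # followed by batch normalization, as the docstring above recommends.
    # The layer size, batch size, and parameter scope name are illustrative
    # assumptions.
    import nnabla as nn
    import nnabla.functions as F
    import nnabla.parametric_functions as PF

    x = nn.Variable((64, 784))
    with nn.parameter_scope("bc_fc1"):
        h = binary_connect_affine(x, 512)
        h = PF.batch_normalization(h, batch_stat=True)
        h = F.relu(h)
    return h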
def last_affine(self, x, dims, name):
    c = x.shape[1]
    l, u = I.calc_uniform_lim_glorot(c, 1)
    w_init = I.UniformInitializer((l, u))
    return PF.affine(x, 1, w_init=w_init, name=name)
def capsule_layer(u, num_j=10, out_channels=16, num_routing_iter=3,
                  grad_dynamic_routing=False, fix_parameters=False):
    '''
    Takes the PrimaryCapsules output and produces the DigitCapsules.

    Args:
        u (nnabla.Variable): A shape of [B, in_capsules, in_channels].
        num_j (int): Number of output capsules.
        out_channels (int): Number of units in each capsule of the output.
        num_routing_iter (int): Number of dynamic routing iterations.
        grad_dynamic_routing (bool): If False, gradients are not computed
            through the dynamic routing coefficients, as if they were given
            as hyperparameters.
        fix_parameters (bool): Fix parameters (set need_grad=False).

    Returns:
        Tuple of nn.Variable: ``u_hat`` with shape
        [B, num_j, in_capsules, out_channels] and the output capsules ``v``
        with shape [B, num_j, out_channels].

    '''
    assert num_routing_iter > 0
    batch_size = u.shape[0]
    num_i = u.shape[1]  # 32 * 6 * 6
    in_channels = u.shape[2]

    # Routing u_hat = W u in eq 2.
    # Implemented with broadcast and batch_matmul. Maybe not efficient.

    # Create a parameter tensor.
    # Note: Consider num input channels multiplied by num input capsules.
    from nnabla.initializer import UniformInitializer, calc_uniform_lim_glorot
    from nnabla.parameter import get_parameter_or_create
    w_init = UniformInitializer(
        calc_uniform_lim_glorot(num_i * in_channels, out_channels))
    w_ij = get_parameter_or_create(
        "W", (1, num_j, num_i, in_channels, out_channels), w_init,
        not fix_parameters)
    # Tiling w_ij to [batch_size, num_j, num_i, in_channels, out_channels].
    w_ij_tiled = F.broadcast(w_ij, (batch_size,) + w_ij.shape[1:])
    # Tiling u to [batch_size, num_j, num_i, 1, in_channels].
    u = u.reshape((batch_size, 1, num_i, 1, in_channels))
    u_tiled = F.broadcast(u, (batch_size, num_j, num_i, 1, in_channels))
    # Apply batched matrix multiplication:
    # [1, in_channels] * [in_channels, out_channels] --> [1, out_channels]
    # u_hat shape: [batch_size, num_j, num_i, out_channels]
    u_hat = F.batch_matmul(u_tiled, w_ij_tiled).reshape(
        (batch_size, num_j, num_i, out_channels))

    # The dynamic routing iterations do not compute gradients;
    # u_hat is used with gradients only at the final computation of s.
    u_hat_no_grad = u_hat
    if not grad_dynamic_routing:
        u_hat_no_grad = F.identity(u_hat)
        u_hat_no_grad.need_grad = False

    # Dynamic routing described in Procedure 1.
    b = F.constant(0, (batch_size, num_j, num_i, 1))
    for r in range(num_routing_iter):
        # u_hat is only used in the last step.
        uh = u_hat_no_grad
        if r == num_routing_iter - 1:
            uh = u_hat
        # 4: Softmax in eq 3
        c = F.softmax(b, axis=1)
        # 5: Left of eq 2. s shape: [B, num_j, out_channels]
        s = F.sum(c * uh, axis=2)
        # 6: eq 1
        v = squash(s, axis=2)
        if r == num_routing_iter - 1:
            return u_hat, v
        # 7: Update by agreement
        b = b + F.sum(v.reshape((batch_size, num_j, 1, out_channels)) * uh,
                      axis=3, keepdims=True)
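def _example_squash(s, axis, eps=1e-5):
    # Hedged sketch (not part of the original source) of the `squash`
    # nonlinearity that capsule_layer above assumes, following eq. 1 of the
    # CapsNet paper: v = (|s|^2 / (1 + |s|^2)) * (s / |s|). The epsilon and
    # the exact nnabla ops used here are illustrative assumptions.
    import nnabla.functions as F
    sq_norm = F.sum(s ** 2, axis=axis, keepdims=True)
    scale = sq_norm / (1.0 + sq_norm)
    return scale * s / (sq_norm + eps) ** 0.5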
def quantized_convolution(inp, outmaps, kernel,
                          pad=None, stride=None, dilation=None, group=1,
                          w_init=None, b_init=None,
                          base_axis=1, fix_parameters=False, rng=None,
                          with_bias=True,
                          quantization_w=None, quantization_b=None):
    """Quantized Convolution.

    Quantized Convolution where the input/output relationship is

    .. math::

        y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} Q_w(w_{n, m, i, j}) x_{m, a + i, b + j} + Q_b(b_n),

    where :math:`Q_w(w_{n, m, i, j})` is the weight quantization function
    and :math:`Q_b(b_{n})` is the bias quantization function.

    .. note::

        1) If you would like to share weights between some layers, please
        make sure to share the standard, floating-point weights (`weight`)
        and not the quantized weights (`quantized weight`).

        2) The weights and the quantized weights become synced only after
        :func:`~nnabla._variable.Variable.forward` is called, and not after
        a call to :func:`~nnabla._variable.Variable.backward`. To access the
        parameters of the network, remember to call
        :func:`~nnabla._variable.Variable.forward` once before doing so,
        otherwise the float weights and the quantized weights will not be in
        sync.

        3) CPU and GPU implementations now use float values for
        `quantized weight`, since this function is only for simulation
        purposes.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the
            number of output channels). For example, to apply convolution on
            an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For
            example, to apply convolution on an image with a 3 (height) by
            5 (width) two-dimensional kernel, specify (3, 5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections
            across channels sparser by grouping connections along the map
            direction.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias.
        base_axis (int): Dimensions up to `base_axis` are treated as the
            sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases
            will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.
        quantization_w (function): Quantization function applied to the
            weights. Use `None` to leave the weights unquantized.
        quantization_b (function): Quantization function applied to the
            bias. Use `None` to leave the bias unquantized.

    Returns:
        :class:`~nnabla.Variable`: N-D array.

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()

    # Floating-point weight
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
        w_init, True, not fix_parameters)

    # Quantized weight
    if quantization_w is not None:
        w_q = get_parameter_or_create(
            "W_q", (outmaps, inp.shape[base_axis] // group) + tuple(kernel),
            w_init, False)
        # Link computation graph
        real_w_q = quantization_w(w)
        real_w_q.persistent = True
        w_q.data = real_w_q.data
    else:
        real_w_q = w

    # Bias
    b = None
    b_q = None
    real_b_q = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)
        if quantization_b is not None:
            b_q = get_parameter_or_create(
                "b_q", (outmaps,), b_init, False)
            # Link computation graph
            real_b_q = quantization_b(b)
            real_b_q.persistent = True
            b_q.data = real_b_q.data
        else:
            real_b_q = b

    return F.convolution(inp, real_w_q, real_b_q, base_axis,
                         pad, stride, dilation, group)
def binary_weight_affine(inp, n_outmaps,
                         base_axis=1,
                         w_init=None, wb_init=None, b_init=None,
                         fix_parameters=False, rng=None, with_bias=True):
    """Binary Weight Affine, multiplier-less inner-product with a scale factor.

    Binary Weight Affine is the affine function, but the inner product in
    this function is the following,

    .. math::

        y_j = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}} \sum_{i} sign(w_{ji}) x_i

    Therefore :math:`sign(w_{ji})` is either :math:`1` or :math:`-1` and the
    inner product simplifies to addition followed by the scaling factor
    :math:`\\alpha = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}}`. The number of
    :math:`\\alpha` values equals the number of outmaps of the affine
    function.

    References:

        Rastegari, Mohammad, et al. "XNOR-Net: ImageNet Classification Using
        Binary Convolutional Neural Networks." arXiv preprint
        arXiv:1603.05279 (2016).

    .. note::

        1) If you would like to share weights between some layers, please
        make sure to share the standard, floating-point weights (`weight`)
        and not the binarized weights (`binary_weight`).

        2) The weights and the binary weights become synced only after
        :func:`~nnabla._variable.Variable.forward` is called, and not after
        a call to :func:`~nnabla._variable.Variable.backward`. To access the
        parameters of the network, remember to call
        :func:`~nnabla._variable.Variable.forward` once before doing so,
        otherwise the float weights and the binary weights will not be in
        sync.

        3) CPU and GPU implementations now use float values for
        `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape
            (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`).
            Dimensions before and after base_axis are flattened as if it were
            a matrix.
        n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output
            neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the
            sample dimensions.
        w_init (~nnabla.initializer.BaseInitializer): Initializer for the weight.
        wb_init (~nnabla.initializer.BaseInitializer): Initializer for the binary weight.
        b_init (~nnabla.initializer.BaseInitializer): Initializer for the bias.
        fix_parameters (bool): When set to `True`, the weight and bias will
            not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if wb_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, not fix_parameters)
    wb = get_parameter_or_create(
        "Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        wb_init, not fix_parameters)
    alpha = get_parameter_or_create(
        "alpha", n_outmaps, ConstantInitializer(0), False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, not fix_parameters)
    return F.binary_weight_affine(inp, w, wb, alpha, b, base_axis)
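def _example_binary_weight_affine_alpha():
    # Hedged worked example (not part of the original source) of the scaling
    # factor described in the binary_weight_affine docstring. For a single
    # output neuron with weights w = [0.5, -0.25, 0.25]:
    #   ||w||_1 = 1.0, so alpha = 1 / ||w||_1 = 1.0
    #   sign(w) = [1, -1, 1]
    # With x = [1.0, 2.0, 3.0], the binary inner product is 1 - 2 + 3 = 2,
    # giving y = alpha * 2 = 2.0 (the float inner product is 0.5 - 0.5 + 0.75 = 0.75).
    import numpy as np
    w = np.array([0.5, -0.25, 0.25])
    x = np.array([1.0, 2.0, 3.0])
    alpha = 1.0 / np.abs(w).sum()
    y = alpha * (np.sign(w) @ x)
    return y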