def forward(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp with chainer.using_device(self.device): self.c = variable.Variable( xp.zeros((len(x), self.state_size), dtype=x.dtype)) lstm_in = reshape.reshape(lstm_in, (len(lstm_in), lstm_in.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, a.shape[:2]) i = reshape.reshape(i, i.shape[:2]) f = reshape.reshape(f, f.shape[:2]) o = reshape.reshape(o, o.shape[:2]) peep_in_i = self.peep_i(self.c) peep_in_f = self.peep_f(self.c) a = tanh.tanh(a) i = sigmoid.sigmoid(i + peep_in_i) f = sigmoid.sigmoid(f + peep_in_f) self.c = a * i + f * self.c peep_in_o = self.peep_o(self.c) o = sigmoid.sigmoid(o + peep_in_o) self.h = o * tanh.tanh(self.c) return self.h
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp self.c = variable.Variable(xp.zeros((x.shape[0], self.state_size), dtype=x.dtype), volatile="auto") lstm_in = reshape.reshape(lstm_in, (len(lstm_in.data), lstm_in.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, (len(a.data), a.shape[1])) i = reshape.reshape(i, (len(i.data), i.shape[1])) f = reshape.reshape(f, (len(f.data), f.shape[1])) o = reshape.reshape(o, (len(o.data), o.shape[1])) peep_in_i = self.peep_i(self.c) peep_in_f = self.peep_f(self.c) a = tanh.tanh(a) i = sigmoid.sigmoid(i + peep_in_i) f = sigmoid.sigmoid(f + peep_in_f) self.c = a * i + f * self.c peep_in_o = self.peep_o(self.c) o = sigmoid.sigmoid(o + peep_in_o) self.h = o * tanh.tanh(self.c) return self.h
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp self.c = variable.Variable(xp.zeros((len(x.data), self.state_size), dtype=x.data.dtype), volatile='auto') lstm_in = reshape.reshape( lstm_in, (len(lstm_in.data), lstm_in.data.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, (len(a.data), a.data.shape[1])) i = reshape.reshape(i, (len(i.data), i.data.shape[1])) f = reshape.reshape(f, (len(f.data), f.data.shape[1])) o = reshape.reshape(o, (len(o.data), o.data.shape[1])) peep_in_i = self.peep_i(self.c) peep_in_f = self.peep_f(self.c) a = tanh.tanh(a) i = sigmoid.sigmoid(i + peep_in_i) f = sigmoid.sigmoid(f + peep_in_f) self.c = a * i + f * self.c peep_in_o = self.peep_o(self.c) o = sigmoid.sigmoid(o + peep_in_o) self.h = o * tanh.tanh(self.c) return self.h
def forward(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) if self.c is None: xp = self.xp with chainer.using_device(self.device): self.c = variable.Variable( xp.zeros((len(x), self.state_size), dtype=x.dtype)) lstm_in = reshape.reshape( lstm_in, (len(lstm_in), lstm_in.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, a.shape[:2]) i = reshape.reshape(i, i.shape[:2]) f = reshape.reshape(f, f.shape[:2]) o = reshape.reshape(o, o.shape[:2]) peep_in_i = self.peep_i(self.c) peep_in_f = self.peep_f(self.c) a = tanh.tanh(a) i = sigmoid.sigmoid(i + peep_in_i) f = sigmoid.sigmoid(f + peep_in_f) self.c = a * i + f * self.c peep_in_o = self.peep_o(self.c) o = sigmoid.sigmoid(o + peep_in_o) self.h = o * tanh.tanh(self.c) return self.h
def crf1d(cost, xs, ys):
    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence of
    labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to
    a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at
    position :math:`i`, and :math:`y_i` be an expected label at position
    :math:`i`. The negative log-likelihood of linear-chain CRF is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\
             \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) ,

    where :math:`l` is the length of the input sequence and :math:`Z` is the
    normalizing constant called partition function.

    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of labels.
        xs (list of Variable): Input feature vector for each label. Each
            :class:`~chainer.Variable` holds a :math:`B \\times K` matrix,
            where :math:`B` is mini-batch size, :math:`K` is the number of
            labels.
        ys (list of Variable): Expected output labels. Each
            :class:`~chainer.Variable` holds a :math:`B` integer vector.

    Returns:
        ~chainer.Variable: A variable holding the average negative
        log-likelihood of the input sequences.

    .. note::
        See detail in the original paper: `Conditional Random Fields:
        Probabilistic Models for Segmenting and Labeling Sequence Data
        <http://repository.upenn.edu/cis_papers/159/>`_.

    """
    assert xs[0].data.shape[1] == cost.data.shape[0]

    n_label = cost.data.shape[0]
    n_batch = xs[0].data.shape[0]

    alpha = xs[0]
    for x in xs[1:]:
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x
    logz = logsumexp.logsumexp(alpha, axis=1)

    score = 0
    cost = reshape.reshape(cost, (cost.data.size, 1))
    for y1, y2 in zip(ys[:-1], ys[1:]):
        score += reshape.reshape(
            embed_id.embed_id(y1 * n_label + y2, cost), (n_batch,))
    for x, y in zip(xs, ys):
        score += select_item.select_item(x, y)
    return _sum.sum(logz - score) / n_batch
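A minimal usage sketch for the function above, assuming it is exported as ``chainer.functions.crf1d``; the shapes and values are illustrative:

import numpy as np
import chainer
from chainer import functions as F

B, K, length = 2, 3, 4  # mini-batch size, number of labels, sequence length
cost = chainer.Variable(np.random.uniform(-1, 1, (K, K)).astype(np.float32))
xs = [chainer.Variable(np.random.uniform(-1, 1, (B, K)).astype(np.float32))
      for _ in range(length)]
ys = [chainer.Variable(np.zeros(B, dtype=np.int32)) for _ in range(length)]
loss = F.crf1d(cost, xs, ys)  # mean negative log-likelihood over the batch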
def black_out(x, t, W, samples):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::
        -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is likelihood of a given label. And,
    :math:`p` is defined as

    .. math::
        p(y) = \\frac{\\exp(W_y^\\top x)}{
            \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        samples (~chainer.Variable): Negative samples.

    Returns:
        ~chainer.Variable: Loss value.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With
    Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """
    batch_size = x.shape[0]
    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.batch_matmul(neg_emb, x)
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])
    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.batch_matmul(pos_emb, x)
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = py - logz + _sum.sum(ny, axis=1)
    return -_sum.sum(loss) / batch_size
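A hedged usage sketch, assuming the public wrapper ``chainer.functions.black_out``; the vocabulary size, feature size, and sample count below are illustrative:

import numpy as np
from chainer import functions as F

B, D, V, S = 4, 10, 100, 5  # batch, feature size, vocabulary, negatives
x = np.random.rand(B, D).astype(np.float32)        # input vectors
t = np.random.randint(V, size=B).astype(np.int32)  # ground-truth labels
W = np.random.rand(V, D).astype(np.float32)        # weight matrix
samples = np.random.randint(V, size=(B, S)).astype(np.int32)
loss = F.black_out(x, t, W, samples)  # scalar loss Variable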
def predict(self, images, oversample=True):
    """Computes all the probabilities of given images.

    Args:
        images (iterable of PIL.Image or numpy.ndarray): Input images.
            When you specify a color image as a :class:`numpy.ndarray`,
            make sure that color order is RGB.
        oversample (bool): If ``True``, it averages results across
            center, corners, and mirrors. Otherwise, it uses only the
            center.

    Returns:
        ~chainer.Variable: Output that contains the class probabilities
        of given images.

    """
    x = concat_examples([prepare(img, size=(256, 256)) for img in images])
    if oversample:
        x = imgproc.oversample(x, crop_dims=(224, 224))
    else:
        x = x[:, :, 16:240, 16:240]
    # Use no_backprop_mode to reduce memory consumption
    with function.no_backprop_mode(), chainer.using_config('train', False):
        x = Variable(self.xp.asarray(x))
        y = self(x, layers=['prob'])['prob']
        if oversample:
            n = len(y) // 10
            y_shape = y.shape[1:]
            y = reshape(y, (n, 10) + y_shape)
            y = sum(y, axis=1) / 10
    return y
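The method above matches the ``predict`` method of Chainer's pretrained vision links; a sketch assuming ``chainer.links.VGG16Layers`` (the image file name is hypothetical):

import numpy as np
from PIL import Image
from chainer.links import VGG16Layers

model = VGG16Layers()        # downloads pretrained weights on first use
img = Image.open('cat.jpg')  # hypothetical input image
probs = model.predict([img], oversample=True)
print(probs.shape)                           # (1, 1000)
print(np.argsort(probs.data[0])[::-1][:5])   # top-5 class indices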
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to average.
        axis (None or int or tuple of int): Axis along which the average is
            performed. With the default (``axis = None``) it performs a mean
            over all the dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate weighted average. If it is ``None``, all weights are
            assumed to be one. When ``axis`` is ``None``, ``weights`` must
            have the same shape as ``x``. And when ``axis`` is ``int``, it
            must be a 1-D array satisfying
            ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes are retained as
            axes of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if axis is None:
        pass
    elif isinstance(axis, tuple):
        axis = [a + x.ndim if a < 0 else a for a in axis]
        axis.sort()
        for a, b in six.moves.zip(axis, axis[1:]):
            if a == b:
                raise ValueError('duplicate value in \'axis\'')
        axis = tuple(axis)
    else:
        if axis < 0:
            axis += x.ndim
        axis = (axis,)

    if weights is not None:
        if axis is not None and len(axis) > 1:
            raise ValueError(
                'tuple axis is not supported when weights is given')
        divider = sum_mod.sum(weights)
        if axis is not None:
            w_shape = [d if i in axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(
                reshape.reshape(weights, w_shape), x.shape)
            x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = 1
            for a in axis:
                divider *= x.shape[a]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
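A quick check of the weighted path, assuming the public ``chainer.functions.average``:

import numpy as np
from chainer import functions as F

x = np.arange(6, dtype=np.float32).reshape(2, 3)
w = np.array([1, 2, 3], dtype=np.float32)
y = F.average(x, axis=1, weights=w)  # shape (2,)
# equivalent to (x * w).sum(axis=1) / w.sum(), row by row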
def predict(self, images, oversample=True):
    """Computes all the probabilities of given images.

    Args:
        images (iterable of PIL.Image or numpy.ndarray): Input images.
        oversample (bool): If ``True``, it averages results across
            center, corners, and mirrors. Otherwise, it uses only the
            center.

    Returns:
        ~chainer.Variable: Output that contains the class probabilities
        of given images.

    """
    x = concat_examples([prepare(img, size=(256, 256)) for img in images])
    if oversample:
        x = imgproc.oversample(x, crop_dims=(224, 224))
    else:
        x = x[:, :, 16:240, 16:240]
    # Set volatile option to ON to reduce memory consumption
    x = Variable(self.xp.asarray(x), volatile=flag.ON)
    y = self(x, layers=['prob'])['prob']
    if oversample:
        n = y.data.shape[0] // 10
        y_shape = y.data.shape[1:]
        y = reshape(y, (n, 10) + y_shape)
        y = sum(y, axis=1) / 10
    return y
def predict(self, images, oversample=True):
    """Computes all the probabilities of given images.

    Args:
        images (iterable of PIL.Image or numpy.ndarray): Input images.
        oversample (bool): If ``True``, it averages results across
            center, corners, and mirrors. Otherwise, it uses only the
            center.

    Returns:
        ~chainer.Variable: Output that contains the class probabilities
        of given images.

    """
    x = concat_examples([prepare(img, size=(256, 256)) for img in images])
    if oversample:
        x = imgproc.oversample(x, crop_dims=(224, 224))
    else:
        x = x[:, :, 16:240, 16:240]
    # Use no_backprop_mode to reduce memory consumption
    with function.no_backprop_mode():
        x = Variable(self.xp.asarray(x))
        y = self(x, layers=['prob'])['prob']
        if oversample:
            n = y.data.shape[0] // 10
            y_shape = y.data.shape[1:]
            y = reshape(y, (n, 10) + y_shape)
            y = sum(y, axis=1) / 10
    return y
def __call__(self, x): """Updates the internal state and returns the LSTM outputs. Args: x (~chainer.Variable): A new batch from the input sequence. Returns: ~chainer.Variable: Outputs of updated LSTM units. """ lstm_in = self.upward(x) if self.h is not None: lstm_in += self.lateral(self.h) else: xp = self.xp with cuda.get_device(self._device_id): self.h = variable.Variable( xp.zeros((len(x.data), self.state_size), dtype=x.data.dtype), volatile='auto') if self.c is None: xp = self.xp with cuda.get_device(self._device_id): self.c = variable.Variable( xp.zeros((len(x.data), self.state_size), dtype=x.data.dtype), volatile='auto') lstm_in = reshape.reshape(lstm_in, (len(lstm_in.data), lstm_in.data.shape[1] // 4, 4)) a, i, f, o = split_axis.split_axis(lstm_in, 4, 2) a = reshape.reshape(a, (len(a.data), self.state_size)) i = reshape.reshape(i, (len(i.data), self.state_size)) f = reshape.reshape(f, (len(f.data), self.state_size)) o = reshape.reshape(o, (len(o.data), self.state_size)) c_tmp = tanh.tanh(a) * sigmoid.sigmoid(i) + sigmoid.sigmoid(f) * self.c self.c = zoneout.zoneout(self.c, c_tmp, self.c_ratio, self.train) self.h = zoneout.zoneout(self.h, sigmoid.sigmoid(o) * tanh.tanh(c_tmp), self.h_ratio, self.train) return self.h
def covariance(self): """ The covariance of the independent distribution. By definition, the covariance of the new distribution becomes block diagonal matrix. Let :math:`\\Sigma_{\\mathbf{x}}` be the covariance matrix of the original random variable :math:`\\mathbf{x} \\in \\mathbb{R}^d`, and :math:`\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots \\mathbf{x}^{(m)}` be the :math:`m` i.i.d. random variables, new covariance matrix :math:`\\Sigma_{\\mathbf{y}}` of :math:`\\mathbf{y} = [\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots, \\mathbf{x}^{(m)}] \\in \\mathbb{R}^{md}` can be written as .. math:: \\left[\\begin{array}{ccc} \\Sigma_{\\mathbf{x}^{1}} & & 0 \\\\ & \\ddots & \\\\ 0 & & \\Sigma_{\\mathbf{x}^{m}} \\end{array} \\right]. Note that this relationship holds only if the covariance matrix of the original distribution is given analytically. Returns: ~chainer.Variable: The covariance of the distribution. """ num_repeat = array.size_of_shape( self.distribution.batch_shape[-self.reinterpreted_batch_ndims:]) dim = array.size_of_shape(self.distribution.event_shape) cov = repeat.repeat( reshape.reshape( self.distribution.covariance, ((self.batch_shape) + (1, num_repeat, dim, dim))), num_repeat, axis=-4) cov = reshape.reshape( transpose.transpose( cov, axes=( tuple(range(len(self.batch_shape))) + (-4, -2, -3, -1))), self.batch_shape + (num_repeat * dim, num_repeat * dim)) block_indicator = self.xp.reshape( self._block_indicator, tuple([1] * len(self.batch_shape)) + self._block_indicator.shape) return cov * block_indicator
def covariance(self): """ The covariance of the independent distribution. By definition, the covariance of the new distribution becomes block diagonal matrix. Let :math:`\\Sigma_{\\mathbf{x}}` be the covariance matrix of the original random variable :math:`\\mathbf{x} \\in \\mathbb{R}^d`, and :math:`\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots \\mathbf{x}^{(m)}` be the :math:`m` i.i.d. random variables, new covariance matrix :math:`\\Sigma_{\\mathbf{y}}` of :math:`\\mathbf{y} = [\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, \\cdots, \\mathbf{x}^{(m)}] \\in \\mathbb{R}^{md}` can be written as .. math:: \\left[\\begin{array}{ccc} \\Sigma_{\\mathbf{x}^{1}} & & 0 \\\\ & \\ddots & \\\\ 0 & & \\Sigma_{\\mathbf{x}^{m}} \\end{array} \\right]. Note that this relationship holds only if the covariance matrix of the original distribution is given analytically. Returns: ~chainer.Variable: The covariance of the distribution. """ num_repeat = array.size_of_shape( self.distribution.batch_shape[-self.reinterpreted_batch_ndims:]) dim = array.size_of_shape(self.distribution.event_shape) cov = repeat.repeat(reshape.reshape(self.distribution.covariance, ((self.batch_shape) + (1, num_repeat, dim, dim))), num_repeat, axis=-4) cov = reshape.reshape( transpose.transpose(cov, axes=(tuple(range(len(self.batch_shape))) + (-4, -2, -3, -1))), self.batch_shape + (num_repeat * dim, num_repeat * dim)) block_indicator = self.xp.reshape( self._block_indicator, tuple([1] * len(self.batch_shape)) + self._block_indicator.shape) return cov * block_indicator
def forward_one_step(self, hps, x_data, sensor_data, y_data, train=True):
    x_r = Variable(x_data[:, 0:3, :, :], volatile=not train)
    x_s = Variable(sensor_data, volatile=not train)
    t = Variable(y_data, volatile=not train)
    cn1_r = F.max_pooling_2d(
        self.prelu1_r(self.bn1_r(self.conv1_r(x_r), test=not train)),
        ksize=2, stride=2, pad=0)
    cn2_r = F.max_pooling_2d(
        self.prelu2_r(self.bn2_r(self.conv2_r(cn1_r), test=not train)),
        ksize=2, stride=2, pad=0)
    cn3_r = F.max_pooling_2d(
        self.prelu3_r(self.bn3_r(self.conv3_r(cn2_r), test=not train)),
        ksize=2, stride=2, pad=0)
    cn5_rm = F.max_pooling_2d(
        self.prelu5(self.bn5(self.conv5(cn3_r), test=not train)),
        ksize=2, stride=2, pad=0)
    sen6 = F.dropout(
        self.prelu6(self.bn6(self.fc6(x_s), test=not train)),
        ratio=hps.dropout, train=train)
    cs7 = self.cccp7(concat.concat(
        (reshape.reshape(
            cn5_rm,
            (cn5_rm.shape[0], 1,
             cn5_rm.shape[1] * cn5_rm.shape[2] * cn5_rm.shape[3], 1)),
         reshape.reshape(sen6, (sen6.shape[0], 1, sen6.shape[1], 1))),
        axis=1))
    cs8 = F.dropout(
        self.prelu8(self.bn8(self.fc8(cs7), test=not train)),
        ratio=hps.dropout, train=train)
    y = self.fc9(cs8)
    return y, F.mean_squared_error(y, t)
def __call__(self, x, train=True):
    x = reshape.reshape(x, (len(x.data), 1) + x.data.shape[1:])
    x = self.convolution(x, train)
    xs = split_axis.split_axis(x, x.data.shape[2], 2)
    for x in xs:
        x.data = self.xp.ascontiguousarray(x.data)
    for r in self.recurrent:
        r.reset_state()
    xs = self.recurrent(xs, train)
    xs = self._linear(xs, train)
    return xs
def _sum_rightmost(value, dim):
    """Sum out `dim` many rightmost dimensions of a given tensor.

    Args:
        value (Tensor): A tensor of ``.dim()`` at least ``dim``.
        dim (int): The number of rightmost dims to sum out.

    """
    if dim == 0:
        return value
    required_shape = value.shape[:-dim] + (-1,)
    return sum_mod.sum(reshape.reshape(value, required_shape), axis=-1)
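The helper collapses the trailing ``dim`` axes into one and sums them; a small sketch of the same reshape-then-sum trick using the public Chainer functions:

import numpy as np
from chainer import functions as F

value = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
dim = 2  # sum out the two rightmost dimensions
y = F.sum(F.reshape(value, value.shape[:-dim] + (-1,)), axis=-1)
print(y.shape)  # (2,)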
def __call__(self, x):
    x = self.embed(x)
    xs = split_axis.split_axis(x, x.data.shape[1], 1)
    ret = []
    for x in xs:
        x = self.rnn1(x)
        x = self.rnn2(x)
        x = self.linear(x)
        x = reshape.reshape(x, x.data.shape + (-1,))
        ret.append(x)
    ret = concat.concat(ret, axis=2)
    return ret
def forward(self, x, y):
    """Updates the internal state and returns the LSTM outputs.

    Args:
        x (~chainer.Variable): A new batch from the input sequence.

    Returns:
        ~chainer.Variable: Outputs of updated LSTM units.

    """
    if self.upward.has_uninitialized_params:
        in_size = x.size // x.shape[0]
        self.upward._initialize_params(in_size)
        self._initialize_params()

    batch = x.shape[0]
    lstm_in = self.upward(x)
    if self.h is not None:
        h_size = self.h.shape[0]
        if batch == 0:
            h_rest = self.h
        elif h_size < batch:
            msg = ('The batch size of x must be equal to or less than the '
                   'size of the previous state h.')
            raise TypeError(msg)
        elif h_size > batch:
            h_update, h_rest = split_axis.split_axis(
                self.h, [batch], axis=0)
            lstm_in += self.lateral(h_update)
        else:
            lstm_in += self.lateral(self.h)
    if self.c is None:
        xp = self.xp
        self.c = variable.Variable(
            xp.zeros((batch, self.state_size), dtype=x.dtype),
            volatile='auto')

    r = reshape.reshape(
        lstm_in,
        (len(lstm_in.data), lstm_in.data.shape[1] // 4, 4)
        + lstm_in.data.shape[2:])
    a, i, f, o = [r[:, :, idx] for idx in range(4)]
    # self.c, y = lstm.lstm(self.c, lstm_in)
    a = tanh.tanh(a)
    i = sigmoid.sigmoid(i)
    f = sigmoid.sigmoid(f)
    o = sigmoid.sigmoid(o)
    self.c = a * i + f * self.c + tanh.tanh(self.w_y(y))
    self.h = o * tanh.tanh(self.c)
    return self.h
def get_weights(self, hps, x_data, sensor_data, train=False):
    x_r = Variable(x_data[:, 0:3, :, :], volatile=not train)
    x_s = Variable(sensor_data, volatile=not train)
    cn1_r = F.max_pooling_2d(
        self.prelu1_r(self.bn1_r(self.conv1_r(x_r), test=not train)),
        ksize=2, stride=2, pad=0)
    cn2_r = F.max_pooling_2d(
        self.prelu2_r(self.bn2_r(self.conv2_r(cn1_r), test=not train)),
        ksize=2, stride=2, pad=0)
    cn3_r = F.max_pooling_2d(
        self.prelu3_r(self.bn3_r(self.conv3_r(cn2_r), test=not train)),
        ksize=2, stride=2, pad=0)
    cn5_rm = F.max_pooling_2d(
        self.prelu5(self.bn5(self.conv5(cn3_r), test=not train)),
        ksize=2, stride=2, pad=0)
    sen6 = F.dropout(
        self.prelu6(self.bn6(self.fc6(x_s), test=not train)),
        ratio=hps.dropout, train=train)
    cs7 = self.cccp7(concat.concat(
        (reshape.reshape(
            cn5_rm,
            (cn5_rm.shape[0], 1,
             cn5_rm.shape[1] * cn5_rm.shape[2] * cn5_rm.shape[3], 1)),
         reshape.reshape(sen6, (sen6.shape[0], 1, sen6.shape[1], 1))),
        axis=1))
    cs8 = F.dropout(
        self.prelu8(self.bn8(self.fc8(cs7), test=not train)),
        ratio=hps.dropout, train=train)
    return cuda.to_cpu(cs8.data)
def __call__(self, batchsize):
    """Samples flattened weights via adaptive weight noise and returns
    them with the associated loss.

    Args:
        batchsize (int): (batch size) * (number of truncated backward
            gradient calculations for a training dataset).

    Returns:
        ~chainer.Variable: Output of the linear layer.

    """
    # self.m_hat = reshape.reshape(
    #     sum.sum(self.M) / self.M.data.shape[0], (1, 1))
    # M, m_hat = broadcast.broadcast(self.M, self.m_hat)
    # self.s2_hat = sum.sum(
    #     self.S2 + (M - m_hat) * (M - m_hat)) / self.M.data.shape[0]
    # print('m_hat.data {}'.format(self.m_hat.data))
    # print('self.s2_hat.data {}'.format(self.s2_hat.data))
    # print('self.S2.data {}'.format(self.S2.data))
    # print('self.M.data {}'.format(self.M.data))
    # print('------------------')

    self.fWb, loss = adaptive_weight_noise(
        batchsize, self.M, self.logS2, self.use_weight_noise)
    if self.nobias:
        return reshape.reshape(
            self.fWb, (self.out_size, self.in_size)), loss
    else:
        self.fW, self.fb = split_axis.split_axis(
            self.fWb,
            numpy.asarray([(self.in_size - 1) * self.out_size]), axis=0)
        return reshape.reshape(
            self.fW, (self.out_size, self.in_size - 1)), self.fb, loss
def separate(x, axis=0):
    """Separates an array along a given axis.

    This function separates an array along a given axis. For example, shape
    of an array is ``(2, 3, 4)``. When it separates the array with
    ``axis=1``, it returns three ``(2, 4)`` arrays.

    This function is an inverse of :func:`chainer.functions.stack`.

    Args:
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): Variable to be separated.
            A :math:`(s_1, s_2, ..., s_N)` -shaped float array.
        axis (int): Axis along which variables are separated.

    Returns:
        tuple of chainer.Variable: Output variables.

    .. seealso:: :func:`chainer.functions.stack`

    .. admonition:: Example

        >>> x = np.arange(6).reshape((2, 3)).astype('f')
        >>> x
        array([[ 0.,  1.,  2.],
               [ 3.,  4.,  5.]], dtype=float32)
        >>> x.shape
        (2, 3)
        >>> y = F.separate(x)  # split along axis=0
        >>> type(y)
        <class 'tuple'>
        >>> len(y)
        2
        >>> y[0].shape
        (3,)
        >>> y[0].data
        array([ 0.,  1.,  2.], dtype=float32)
        >>> y = F.separate(x, axis=1)
        >>> len(y)
        3
        >>> y[0].shape
        (2,)
        >>> y[0].data
        array([ 0.,  3.], dtype=float32)

    """
    shape = list(x.shape)
    del shape[axis]
    ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True)
    return tuple(reshape.reshape(y, shape) for y in ys)
def __call__(self, x):
    x = self.embed(x)
    xs = split_axis.split_axis(x, x.data.shape[1], 1)
    ret = []
    for x in xs:
        for l in self.rnns:
            x = l(x)
            x = dropout.dropout(x, 0.25, self.train)
        for l in self.linears:
            x = l(x)
        x = reshape.reshape(x, x.data.shape + (-1,))
        ret.append(x)
    ret = concat.concat(ret, axis=2)
    return ret
def maxout(x, pool_size, axis=1):
    """Maxout activation function.

    It accepts an input tensor ``x``, reshapes the ``axis`` dimension (say
    the size being ``M * pool_size``) into two dimensions
    ``(M, pool_size)``, and takes the maximum along the ``axis`` dimension.
    The output of this function is the same as ``x`` except that the
    ``axis`` dimension is transformed from ``M * pool_size`` to ``M``.

    Typically, ``x`` is the output of a linear layer or a convolution
    layer. The following is the example where we use :func:`maxout` in
    combination with a Linear link.

    >>> import numpy, chainer, chainer.links as L
    >>> in_size, out_size, pool_size = 100, 100, 100
    >>> l = L.Linear(in_size, out_size * pool_size)
    >>> x = chainer.Variable(numpy.zeros((1, in_size), 'f'))  # prepare data
    >>> x = l(x)
    >>> y = maxout(x, pool_size)

    Args:
        x (~chainer.Variable): Input variable. Its first dimension is
            assumed to be the *minibatch dimension*. The other dimensions
            are treated as one concatenated dimension.
        pool_size (int): The size of the pooling window; the ``axis``
            dimension of ``x`` must be divisible by it.
        axis (int): The axis of ``x`` along which pooling is applied.

    Returns:
        ~chainer.Variable: Output variable.

    .. seealso:: :class:`~chainer.links.Maxout`

    """
    if pool_size <= 0:
        raise ValueError('pool_size must be a positive integer.')

    x_shape = x.data.shape
    if x_shape[axis] % pool_size != 0:
        expect = 'x.data.shape[axis] % pool_size == 0'
        actual = 'x.data.shape[axis]={}, pool_size={}'.format(
            x_shape[axis], pool_size)
        msg = 'axis dimension must be divisible by pool_size'
        raise type_check.InvalidType(expect, actual, msg)

    shape = (x_shape[:axis] + (x_shape[axis] // pool_size, pool_size)
             + x_shape[axis + 1:])
    x = reshape.reshape(x, shape)
    return minmax.max(x, axis=axis + 1)
def functions(self):
    return collections.OrderedDict([
        ('conv1', [self.conv1, relu]),
        ('pool1', [lambda x: max_pooling_2d(x, 3, stride=2)]),
        ('fire2', [self.fire2]),
        ('fire3', [self.fire3]),
        ('pool2', [lambda x: max_pooling_2d(x, 3, stride=2)]),
        ('fire4', [self.fire4]),
        ('fire5', [self.fire5]),
        ('pool3', [lambda x: max_pooling_2d(x, 3, stride=2)]),
        ('fire6', [self.fire6]),
        ('fire7', [self.fire7]),
        ('fire8', [self.fire8]),
        ('fire9', [self.fire9, dropout]),
        ('conv10', [self.conv10, relu]),
        ('pool4', [lambda x: average_pooling_2d(x, 13)]),
        ('prob', [lambda x: reshape(x, (-1, 1000))]),
    ])
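Given such an ordered layer table, intermediate features can be computed by folding the input through the entries; a hypothetical helper in the same style (the method name ``extract_until`` is illustrative, not part of the link above):

def extract_until(self, x, layer='pool4'):
    # Fold x through the pipeline defined by self.functions and stop
    # once the requested layer has been applied.
    h = x
    for name, funcs in self.functions.items():
        for f in funcs:
            h = f(h)
        if name == layer:
            return h
    raise ValueError('unknown layer: {}'.format(layer))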
def bias(x, y, axis=1):
    """Elementwise summation with broadcasting.

    Computes an elementwise summation of two input variables, with the shape
    of the latter variable broadcasted to match the shape of the former.
    ``axis`` is the first axis of the first variable along which the second
    variable is applied.

    The term "broadcasting" here comes from Caffe's bias layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 5 x 6
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x 3 x 40 x 5 x 6
        y : (1 x) 3 x 40 x 1 x 1

    Note that the axis of ``x`` to which we apply ``y`` is specified by the
    argument ``axis``, whose meaning is different from numpy's ``axis``.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable to
            be summed.
        y (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable to
            sum, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) +
                     [1] * (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x + y2
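A shape-level sketch of the Caffe-style broadcasting described above, assuming the public ``chainer.functions.bias``:

import numpy as np
from chainer import functions as F

x = np.random.rand(5, 3, 4, 6).astype(np.float32)
y = np.random.rand(3, 4).astype(np.float32)
z = F.bias(x, y, axis=1)  # y is reshaped to (1, 3, 4, 1) and broadcast
print(z.shape)            # (5, 3, 4, 6)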
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to average.
        axis (None or int): Axis along which the average is performed. With
            the default (``axis = None``) it performs a mean over all the
            dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate weighted average. If it is ``None``, all weights are
            assumed to be one. When ``axis`` is ``None``, ``weights`` must
            have the same shape as ``x``. And when ``axis`` is ``int``, it
            must be a 1-D array satisfying
            ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes are retained as
            axes of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if weights is not None:
        divider = sum_mod.sum(weights)
        if axis is not None:
            if axis < 0:
                axis += x.ndim
            w_shape = [d if i == axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(
                reshape.reshape(weights, w_shape), x.shape)
            x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = x.shape[axis]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
def scale(x, y, axis=1):
    """Elementwise product with broadcasting.

    Computes an elementwise product of two input variables, with the shape
    of the latter variable broadcasted to match the shape of the former.
    ``axis`` is the first axis of the first variable along which the second
    variable is applied.

    The term "broadcasting" here comes from Caffe's scale layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 60
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x 3 x 40 x 60
        y : 1 x 3 x 40 x 1

    Note how the ``axis`` indicates to which axis of ``x`` we apply ``y``.

    Args:
        x (~chainer.Variable): Input variable to be scaled.
        y (~chainer.Variable): Input variable to scale, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) +
                     [1] * (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x * y2
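The multiplicative counterpart works the same way; a sketch assuming the public ``chainer.functions.scale``:

import numpy as np
from chainer import functions as F

x = np.random.rand(100, 3, 40, 60).astype(np.float32)
y = np.random.rand(3, 40).astype(np.float32)
z = F.scale(x, y, axis=1)  # y broadcast as (1, 3, 40, 1)
print(z.shape)             # (100, 3, 40, 60)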
def separate(x, axis=0):
    """Separates an array along a given axis.

    This function separates an array along a given axis. For example, shape
    of an array is ``(2, 3, 4)``. When it separates the array with
    ``axis=1``, it returns three ``(2, 4)`` arrays.

    This function is an inverse of :func:`chainer.functions.stack`.

    Args:
        x (chainer.Variable): Variable to be separated.
        axis (int): Axis along which variables are separated.

    Returns:
        tuple of chainer.Variable: Output variables.

    .. seealso:: :func:`chainer.functions.stack`

    """
    shape = list(x.shape)
    del shape[axis]
    ys = split_axis.split_axis(x, x.shape[axis], axis, force_tuple=True)
    return tuple(reshape.reshape(y, shape) for y in ys)
def _stack_weight(ws):
    # TODO(unno): Input of the current LSTM implementation is shuffled
    w = stack.stack(ws, axis=1)
    shape = w.shape
    return reshape.reshape(w, (shape[0] * shape[1],) + shape[2:])
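The helper interleaves per-gate matrices row-wise via ``stack`` + ``reshape``; a small sketch with the public functions showing the resulting layout:

import numpy as np
from chainer import functions as F

w0 = np.zeros((3, 5), dtype=np.float32)  # gate-0 weight, shape (N, I)
w1 = np.ones((3, 5), dtype=np.float32)   # gate-1 weight, shape (N, I)
w = F.stack([w0, w1], axis=1)            # (3, 2, 5)
w = F.reshape(w, (w.shape[0] * w.shape[1],) + w.shape[2:])  # (6, 5)
# rows are interleaved: w0[0], w1[0], w0[1], w1[1], ...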
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction,
        **kwargs):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on the argument ``use_bi_direction``.

    Args:
        n_layers(int): The number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (~chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers
            and is equal to ``n_layers``, ``B`` is the mini-batch size, and
            ``N`` is the dimension of the hidden units.
        cx (~chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of :class:`~chainer.Variable`): Weight matrices.
            ``ws[i]`` represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as
            they are multiplied with input variables, where ``I`` is the
            size of the input and ``N`` is the dimension of the hidden
            units. All other matrices are ``(N, N)``-shaped.
        bs (list of list of :class:`~chainer.Variable`): Bias vectors.
            ``bs[i]`` represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            The shape of each matrix is ``(N,)``.
        xs (list of :class:`~chainer.Variable`): A list of
            :class:`~chainer.Variable` holding input values. Each element
            ``xs[t]`` holds input value for time ``t``. Its shape is
            ``(B_t, I)``, where ``B_t`` is the mini-batch size for time
            ``t``. The sequences must be transposed.
            :func:`~chainer.functions.transpose_sequence` can be used to
            transpose a list of :class:`~chainer.Variable`\\ s each
            representing a sequence. When sequences have different lengths,
            they must be sorted in descending order of their lengths before
            transposing. So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional LSTM.

    Returns:
        tuple: This function returns a tuple containing three elements,
        ``hy``, ``cy`` and ``ys``.

        - ``hy`` is an updated hidden states whose shape is the same as
          ``hx``.
        - ``cy`` is an updated cell states whose shape is the same as
          ``cx``.
        - ``ys`` is a list of :class:`~chainer.Variable` . Each element
          ``ys[t]`` holds hidden states of the last layer corresponding
          to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t``
          is the mini-batch size for time ``t``. Note that ``B_t`` is the
          same value as ``xs[t].shape[0]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """
    argument.check_unexpected_kwargs(
        kwargs,
        train='train argument is not supported anymore. '
              'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
                  'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            (xs,)))
        if use_bi_direction:
            rnn = NStepBiLSTM
        else:
            rnn = NStepLSTM

        hy, cy, ys = rnn(n_layers, states, lengths)(*inputs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, cy, ys

    else:
        direction = 2 if use_bi_direction else 1
        split_size = n_layers * direction
        hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        xs_next = xs
        hy = []
        cy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward LSTM
                # di=1, backward LSTM
                h_list = []
                c_list = []
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                c = cx[layer_idx]
                if di == 0:
                    xs_list = xs_next
                else:
                    xs_list = reversed(xs_next)
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                        c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                    else:
                        h_rest = None
                        c_rest = None

                    if layer != 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)
                    lstm_in = linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) + \
                        linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    c_bar, h_bar = lstm.lstm(c, lstm_in)
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                        c = concat.concat([c_bar, c_rest], axis=0)
                    else:
                        h = h_bar
                        c = c_bar
                    h_list.append(h_bar)
                    c_list.append(c_bar)
                return h, c, h_list, c_list

            h, c, h_forward, c_forward = _one_directional_loop(di=0)
            hy.append(h)
            cy.append(c)

            if use_bi_direction:
                # BiLSTM
                h, c, h_backward, c_backward = _one_directional_loop(di=1)
                hy.append(h)
                cy.append(c)
                h_backward.reverse()
                # concat
                xs_next = [concat.concat([hfi, hbi], axis=1)
                           for (hfi, hbi) in zip(h_forward, h_backward)]
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        cy = stack.stack(cy)
        return hy, cy, tuple(ys)
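A minimal end-to-end sketch through the public wrapper ``chainer.functions.n_step_lstm``, which dispatches to the base function above. The per-gate weights are stored as ``(N, I)`` arrays on the input side of layer 0 and ``(N, N)`` elsewhere (assumed from ``linear.linear``'s ``(out, in)`` convention):

import numpy as np
from chainer import functions as F

n_layers, B, I, N = 1, 3, 4, 5
rng = np.random.RandomState(0)
hx = np.zeros((n_layers, B, N), dtype=np.float32)
cx = np.zeros((n_layers, B, N), dtype=np.float32)
# Eight per-gate weights and biases per layer.
ws = [[rng.rand(N, I).astype(np.float32) for _ in range(4)] +
      [rng.rand(N, N).astype(np.float32) for _ in range(4)]]
bs = [[np.zeros(N, dtype=np.float32) for _ in range(8)]]
xs = [rng.rand(B, I).astype(np.float32) for _ in range(6)]  # 6 time steps
hy, cy, ys = F.n_step_lstm(n_layers, 0.0, hx, cx, ws, bs, xs)
print(hy.shape, cy.shape, len(ys), ys[0].shape)
# (1, 3, 5) (1, 3, 5) 6 (3, 5)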
def _global_average_pooling_2d(x):
    n, channel, rows, cols = x.data.shape
    h = average_pooling_2d(x, (rows, cols), stride=1)
    h = reshape(h, (n, channel))
    return h
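A quick shape check of the global-average-pooling idiom above, using the public ``chainer.functions`` equivalents:

import numpy as np
from chainer import functions as F

x = np.random.rand(2, 8, 5, 7).astype(np.float32)
h = F.average_pooling_2d(x, (5, 7), stride=1)  # (2, 8, 1, 1)
h = F.reshape(h, (2, 8))                       # drop the unit spatial dims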
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    use_bi_direction, **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for Stack GRU/BiGRU functions.

    This function is used at :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2. Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units. When ``use_bi_direction`` is
            ``True``, the first dimension length is ``2S`` instead.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as
            they are multiplied with input variables. All other matrices
            have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents biases for i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input
            units. Note that this function supports variable length
            sequences. When sequences have different lengths, sort
            sequences in descending order by length, and transpose the
            sorted sequence. :func:`~chainer.functions.transpose_sequence`
            transposes a list of :class:`~chainer.Variable`\\ s holding
            sequences. So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional GRU.

    .. seealso::

       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs,
        train='train argument is not supported anymore. '
              'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
                  'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, ),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            rnn = NStepBiGRU(n_layers, states)
        else:
            rnn = NStepGRU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        direction = 2 if use_bi_direction else 1
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws]
        hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws]
        xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs]
        hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward GRU
                # di=1, backward GRU
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
                    gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    W_r_x, W_z_x, W_x = split_axis.split_axis(
                        gru_x, 3, axis=1)
                    U_r_h, U_z_h, U_x = split_axis.split_axis(
                        gru_h, 3, axis=1)

                    r = sigmoid.sigmoid(W_r_x + U_r_h)
                    z = sigmoid.sigmoid(W_z_x + U_z_h)
                    h_bar = tanh.tanh(W_x + r * U_x)
                    h_bar = (1 - z) * h_bar + z * h

                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward GRU
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward GRU
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [concat.concat([hfi, hbi], axis=1)
                           for (hfi, hbi) in six.moves.zip(h_forward,
                                                           h_backward)]
                hy.append(h)
            else:
                # Uni-directional GRU
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
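Variable-length input as the docstring requires (sorted by length, then transposed); a sketch through the public ``chainer.functions.n_step_gru`` with six per-layer weights, assumed ``(N, I)`` on the input side and ``(N, N)`` on the hidden side:

import numpy as np
from chainer import functions as F

n_layers, I, N = 1, 3, 4
rng = np.random.RandomState(0)
# Three sequences of lengths 3, 2, 1, already transposed:
# B_t is non-increasing across time steps.
xs = [rng.rand(3, I).astype(np.float32),
      rng.rand(2, I).astype(np.float32),
      rng.rand(1, I).astype(np.float32)]
hx = np.zeros((n_layers, 3, N), dtype=np.float32)
ws = [[rng.rand(N, I).astype(np.float32) for _ in range(3)] +
      [rng.rand(N, N).astype(np.float32) for _ in range(3)]]
bs = [[np.zeros(N, dtype=np.float32) for _ in range(6)]]
hy, ys = F.n_step_gru(n_layers, 0.0, hx, ws, bs, xs)
print(hy.shape, [y.shape for y in ys])
# (1, 3, 4) [(3, 4), (2, 4), (1, 4)]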
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation,
                    use_bi_direction, **kwargs):
    """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction)

    Base function for Stack RNN/BiRNN functions.

    This function is used at :func:`chainer.functions.n_step_birnn` and
    :func:`chainer.functions.n_step_rnn`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2. Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing two matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 1`` is ``(I, N)`` shape as
            they are multiplied with input variables. All other matrices
            have ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents biases for i-th layer.
            Each ``bs[i]`` is a list containing two vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input
            units. Note that this function supports variable length
            sequences. When sequences have different lengths, sort
            sequences in descending order by length, and transpose the
            sorted sequence. :func:`~chainer.functions.transpose_sequence`
            transposes a list of :class:`~chainer.Variable`\\ s holding
            sequences. So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional RNN.

    Returns:
        tuple: This function returns a tuple containing two elements,
        ``hy`` and ``ys``.

        - ``hy`` is an updated hidden states whose shape is same as ``hx``.
        - ``ys`` is a list of :class:`~chainer.Variable` . Each element
          ``ys[t]`` holds hidden states of the last layer corresponding
          to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t``
          is mini-batch size for time ``t``, and ``N`` is size of hidden
          units. Note that ``B_t`` is the same value as
          ``xs[t].shape[0]``.

    .. seealso::

       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs,
        train='train argument is not supported anymore. '
              'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
                  'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    activation_list = ['tanh', 'relu']
    if activation not in activation_list:
        candidate = ','.join(activation_list)
        raise ValueError(
            'Invalid activation: "%s". Please select from [%s]'
            % (activation, candidate))

    xp = cuda.get_array_module(hx)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, ),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            # Bi-directional RNN
            if activation == 'tanh':
                rnn = NStepBiRNNTanh(n_layers, states)
            elif activation == 'relu':
                rnn = NStepBiRNNReLU(n_layers, states)
        else:
            # Uni-directional RNN
            if activation == 'tanh':
                rnn = NStepRNNTanh(n_layers, states)
            elif activation == 'relu':
                rnn = NStepRNNReLU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        direction = 2 if use_bi_direction else 1
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        xws = [_stack_weight([w[0]]) for w in ws]
        hws = [_stack_weight([w[1]]) for w in ws]
        xbs = [_stack_weight([b[0]]) for b in bs]
        hbs = [_stack_weight([b[1]]) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward RNN
                # di=1, backward RNN
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    rnn_in = (linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) +
                              linear.linear(h, hws[layer_idx],
                                            hbs[layer_idx]))
                    if activation == 'tanh':
                        h_bar = tanh.tanh(rnn_in)
                    elif activation == 'relu':
                        h_bar = relu.relu(rnn_in)

                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward RNN
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward RNN
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [concat.concat([hfi, hbi], axis=1)
                           for (hfi, hbi) in six.moves.zip(h_forward,
                                                           h_backward)]
                hy.append(h)
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
def crf1d(cost, xs, ys, reduce='mean'):
    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence
    of labels. Let :math:`c_{st}` be a transition cost from a label
    :math:`s` to a label :math:`t`, :math:`x_{it}` be a cost of a label
    :math:`t` at position :math:`i`, and :math:`y_i` be an expected label
    at position :math:`i`. The negative log-likelihood of linear-chain CRF
    is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\
             \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) ,

    where :math:`l` is the length of the input sequence and :math:`Z` is
    the normalizing constant called partition function.

    .. note::

       When you want to calculate the negative log-likelihood of sequences
       which have different lengths, sort the sequences in descending order
       of lengths and transpose the sequences.

       For example, you have three input sequences:

       >>> a1 = a2 = a3 = a4 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> b1 = b2 = b3 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> c1 = c2 = np.random.uniform(-1, 1, 3).astype(np.float32)

       >>> a = [a1, a2, a3, a4]
       >>> b = [b1, b2, b3]
       >>> c = [c1, c2]

       where ``a1`` and all other variables are arrays with ``(K,)`` shape.
       Make a transpose of the sequences:

       >>> x1 = np.stack([a1, b1, c1])
       >>> x2 = np.stack([a2, b2, c2])
       >>> x3 = np.stack([a3, b3])
       >>> x4 = np.stack([a4])

       and make a list of the arrays:

       >>> xs = [x1, x2, x3, x4]

       You need to make label sequences in the same fashion. And then, call
       the function:

       >>> cost = chainer.Variable(
       ...     np.random.uniform(-1, 1, (3, 3)).astype(np.float32))
       >>> ys = [np.zeros(x.shape[0:1], dtype=np.int32) for x in xs]
       >>> loss = F.crf1d(cost, xs, ys)

       It calculates mean of the negative log-likelihood of the three
       sequences.

       The output is a variable whose value depends on the value of
       the option ``reduce``. If it is ``'no'``, it holds the elementwise
       loss values. If it is ``'mean'``, it holds mean of the loss values.

    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of
            labels.
        xs (list of Variable): Input vector for each label.
            ``len(xs)`` denotes the length of the sequence,
            and each :class:`~chainer.Variable` holds a :math:`B \\times K`
            matrix, where :math:`B` is mini-batch size, :math:`K` is the
            number of labels.
            Note that :math:`B`\\ s in all the variables are not necessarily
            the same, i.e., it accepts the input sequences with different
            lengths.
        ys (list of Variable): Expected output labels. It needs to have the
            same length as ``xs``. Each :class:`~chainer.Variable` holds a
            :math:`B` integer vector. When ``x`` in ``xs`` has a different
            :math:`B`, the corresponding ``y`` has the same :math:`B`. In
            other words, ``ys`` must satisfy
            ``ys[i].shape == xs[i].shape[0:1]`` for all ``i``.
        reduce (str): Reduction option. Its value must be either
            ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is
            raised.

    Returns:
        ~chainer.Variable: A variable holding the average negative
        log-likelihood of the input sequences.

    .. note::

       See detail in the original paper: `Conditional Random Fields:
       Probabilistic Models for Segmenting and Labeling Sequence Data
       <https://repository.upenn.edu/cis_papers/159/>`_.

    """
    if reduce not in ('mean', 'no'):
        raise ValueError(
            "only 'mean' and 'no' are valid for 'reduce', but '%s' is "
            'given' % reduce)

    assert xs[0].shape[1] == cost.shape[0]
    n_label = cost.shape[0]
    n_batch = xs[0].shape[0]

    alpha = xs[0]
    alphas = []
    for x in xs[1:]:
        batch = x.shape[0]
        if alpha.shape[0] > batch:
            alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0)
            alphas.append(alpha_rest)
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x

    if len(alphas) > 0:
        alphas.append(alpha)
        alpha = concat.concat(alphas[::-1], axis=0)

    logz = logsumexp.logsumexp(alpha, axis=1)

    cost = reshape.reshape(cost, (cost.size, 1))
    score = select_item.select_item(xs[0], ys[0])
    scores = []
    for x, y, y_prev in zip(xs[1:], ys[1:], ys[:-1]):
        batch = x.shape[0]
        if score.shape[0] > batch:
            y_prev, _ = split_axis.split_axis(y_prev, [batch], axis=0)
            score, score_rest = split_axis.split_axis(score, [batch], axis=0)
            scores.append(score_rest)
        score += (select_item.select_item(x, y) + reshape.reshape(
            embed_id.embed_id(y_prev * n_label + y, cost), (batch,)))

    if len(scores) > 0:
        scores.append(score)
        score = concat.concat(scores[::-1], axis=0)

    loss = logz - score
    if reduce == 'mean':
        return _sum.sum(loss) / n_batch
    else:
        return loss
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction, **kwargs): """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction) Base function for Stacked GRU/BiGRU functions. This function is used by :func:`chainer.functions.n_step_bigru` and :func:`chainer.functions.n_step_gru`. This function's behavior depends on the argument ``use_bi_direction``. .. warning:: ``train`` and ``use_cudnn`` arguments are not supported anymore since v2. Instead, use ``chainer.using_config('train', train)`` and ``chainer.using_config('use_cudnn', use_cudnn)`` respectively. See :func:`chainer.using_config`. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units. When ``use_bi_direction`` is ``True``, the first dimension length is ``2S`` instead. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing six matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only the matrices ``ws[0][j]`` where ``0 <= j < 3`` have shape ``(I, N)``, as they are multiplied with input variables. All other matrices have shape ``(N, N)``. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing six vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of the hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``, and ``I`` is the size of the input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort the sequences in descending order by length and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable`\\ s holding sequences. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. use_bi_direction (bool): If ``True``, this function uses a bi-directional GRU. .. seealso:: :func:`chainer.functions.n_step_gru` :func:`chainer.functions.n_step_bigru` """ # NOQA argument.check_unexpected_kwargs( kwargs, train='train argument is not supported anymore. ' 'Use chainer.using_config', use_cudnn='use_cudnn argument is not supported anymore. 
' 'Use chainer.using_config') argument.assert_kwargs_empty(kwargs) xp = cuda.get_array_module(hx, hx.data) if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000): states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, ), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiGRU(n_layers, states) else: rnn = NStepGRU(n_layers, states) ret = rnn(*inputs) hy, = ret[:1] ys = ret[1:] return hy, ys else: direction = 2 if use_bi_direction else 1 hx = split_axis.split_axis(hx, n_layers * direction, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws] hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws] xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs] hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs] xs_next = xs hy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward GRU # di=1, backward GRU xs_list = xs_next if di == 0 else reversed(xs_next) layer_idx = direction * layer + di h = hx[layer_idx] h_list = [] for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) else: h_rest = None if layer > 0: x = dropout.dropout(x, ratio=dropout_ratio) gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx]) gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx]) W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1) U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1) r = sigmoid.sigmoid(W_r_x + U_r_h) z = sigmoid.sigmoid(W_z_x + U_z_h) h_bar = tanh.tanh(W_x + r * U_x) h_bar = (1 - z) * h_bar + z * h if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) else: h = h_bar h_list.append(h_bar) return h, h_list # Forward GRU h, h_forward = _one_directional_loop(di=0) hy.append(h) if use_bi_direction: # Backward GRU h, h_backward = _one_directional_loop(di=1) h_backward.reverse() # Concat xs_next = [ concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in six.moves.zip(h_forward, h_backward) ] hy.append(h) else: # Uni-directional GRU xs_next = h_forward ys = xs_next hy = stack.stack(hy) return hy, tuple(ys)
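# --- Illustrative sketch (not library code) ---
# The per-step GRU update computed in _one_directional_loop above, with the
# biases omitted for brevity:
#   r = sigmoid(W_r x + U_r h), z = sigmoid(W_z x + U_z h),
#   h_bar = tanh(W x + r * (U h)), h_new = (1 - z) * h_bar + z * h.
# All names below are hypothetical.
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def toy_gru_step(x, h, w_r, w_z, w, u_r, u_z, u):
    r = sigmoid(x.dot(w_r.T) + h.dot(u_r.T))
    z = sigmoid(x.dot(w_z.T) + h.dot(u_z.T))
    h_bar = np.tanh(x.dot(w.T) + r * h.dot(u.T))
    return (1 - z) * h_bar + z * h

rng = np.random.RandomState(0)
B, I, N = 2, 4, 3
weights = [rng.randn(N, I) for _ in range(3)] + [rng.randn(N, N) for _ in range(3)]
h = toy_gru_step(rng.randn(B, I), np.zeros((B, N)), *weights)  # -> (B, N)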
def maxout(x, pool_size, axis=1): """Maxout activation function. It accepts an input tensor ``x``, reshapes the ``axis`` dimension (whose size must be ``M * pool_size``) into two dimensions ``(M, pool_size)``, and takes the maximum along the new ``pool_size`` dimension. Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. A :math:`n`-dimensional (:math:`n \\ge` ``axis``) float array. In general, its first dimension is assumed to be the *minibatch dimension*. The other dimensions are treated as one concatenated dimension. pool_size (int): The size used for downsampling in the pooling layer. axis (int): The ``axis`` dimension to be reshaped. The size of the ``axis`` dimension should be ``M * pool_size``. Returns: ~chainer.Variable: Output variable. The shape of the output is the same as ``x`` except that the ``axis`` dimension is transformed from ``M * pool_size`` to ``M``. .. seealso:: :class:`~chainer.links.Maxout` .. admonition:: Example Typically, ``x`` is the output of a linear layer or a convolution layer. The following is the example where we use :func:`maxout` in combination with a Linear link. >>> in_size, out_size, pool_size = 10, 10, 10 >>> bias = np.arange(out_size * pool_size).astype('f') >>> l = L.Linear(in_size, out_size * pool_size, initial_bias=bias) >>> x = np.zeros((1, in_size), 'f') # prepare data >>> x = l(x) >>> y = F.maxout(x, pool_size) >>> x.shape (1, 100) >>> y.shape (1, 10) >>> x.reshape((out_size, pool_size)).data array([[ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], [ 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.], [ 20., 21., 22., 23., 24., 25., 26., 27., 28., 29.], [ 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.], [ 40., 41., 42., 43., 44., 45., 46., 47., 48., 49.], [ 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.], [ 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.], [ 70., 71., 72., 73., 74., 75., 76., 77., 78., 79.], [ 80., 81., 82., 83., 84., 85., 86., 87., 88., 89.], [ 90., 91., 92., 93., 94., 95., 96., 97., 98., 99.]], \ dtype=float32) >>> y.data array([[ 9., 19., 29., 39., 49., 59., 69., 79., 89., 99.]], \ dtype=float32) """ if pool_size <= 0: raise ValueError('pool_size must be a positive integer.') x_shape = x.shape if x_shape[axis] % pool_size != 0: expect = 'x.shape[axis] % pool_size == 0' actual = 'x.shape[axis]={}, pool_size={}'.format( x_shape[axis], pool_size) msg = 'axis dimension must be divisible by pool_size' raise type_check.InvalidType(expect, actual, msg) shape = (x_shape[:axis] + (x_shape[axis] // pool_size, pool_size) + x_shape[axis + 1:]) x = reshape.reshape(x, shape) return minmax.max(x, axis=axis + 1)
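# --- Illustrative sketch (not library code) ---
# The reshape-and-max trick used by maxout above, written out in NumPy for
# the default axis=1.
import numpy as np

x = np.arange(12, dtype=np.float32).reshape(2, 6)  # (B, M * pool_size)
pool_size = 3
B, C = x.shape
y = x.reshape(B, C // pool_size, pool_size).max(axis=2)
print(y)  # [[ 2.  5.] [ 8. 11.]] -- shape (B, M) with M = 2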
def group_normalization(x, groups, gamma, beta, eps=1e-5): """Group normalization function. This function implements "group normalization", which divides the channels into groups, computes the mean and variance within each group, normalizes by these statistics, and then scales and shifts the result. Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Batch tensors. The first dimension of this value must be the size of the minibatch and the second dimension must be the number of channels. Moreover, this value must have one or more following dimensions, such as height and width. groups (int): The number of channel groups. This value must be a divisor of the number of channels. gamma (~chainer.Variable): Scaling parameter. beta (~chainer.Variable): Shifting parameter. eps (float): Epsilon value for numerical stability of normalization. Returns: ~chainer.Variable: The output variable which has the same shape as ``x``. See: `Group Normalization <https://arxiv.org/abs/1803.08494>`_ """ if x.ndim <= 2: raise ValueError('Input dimension must be greater than 2, ' 'including batch size dimension ' '(first dimension).') if not isinstance(groups, int): raise TypeError('Argument \'groups\' must be of type int.') xp = backend.get_array_module(x) batch_size, channels = x.shape[:2] original_shape = x.shape if channels % groups != 0: raise ValueError('Argument \'groups\' must be a divisor ' 'of the number of channels.') # By doing this reshaping, calling the batch_normalization function becomes # equivalent to group normalization. # And a redundant dimension is added in order to utilize ideep64/cuDNN. x = reshape.reshape(x, (1, batch_size * groups, -1, 1)) with cuda.get_device_from_array(x.array): dummy_gamma = xp.ones(batch_size * groups).astype(xp.float32) dummy_beta = xp.zeros(batch_size * groups).astype(xp.float32) with warnings.catch_warnings(): warnings.simplefilter("ignore") x = batch_normalization.batch_normalization( x, dummy_gamma, dummy_beta, eps=eps) x = reshape.reshape(x, original_shape) target_shape = [1, channels] + [1] * (x.ndim - 2) gamma_broadcast = broadcast.broadcast_to( reshape.reshape(gamma, target_shape), x.shape) beta_broadcast = broadcast.broadcast_to( reshape.reshape(beta, target_shape), x.shape) return x * gamma_broadcast + beta_broadcast
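# --- Illustrative sketch (not library code) ---
# What the reshape to (1, batch_size * groups, -1, 1) above buys: batch
# normalization with the dummy parameters then normalizes each
# (sample, group) slice by its own mean and variance. Equivalent NumPy
# statistics (toy_group_norm is a hypothetical name):
import numpy as np

def toy_group_norm(x, groups, eps=1e-5):
    n = x.shape[0]
    g = x.reshape(n * groups, -1)  # one row per (sample, group)
    mu = g.mean(axis=1, keepdims=True)
    var = g.var(axis=1, keepdims=True)
    return ((g - mu) / np.sqrt(var + eps)).reshape(x.shape)

x = np.random.randn(2, 6, 4, 4).astype(np.float32)
y = toy_group_norm(x, groups=3)  # same shape as x, normalized per group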
def _global_average_pooling_2d(x): """Averages over the spatial axes and collapses the result to ``(N, C)``.""" n, channel, rows, cols = x.data.shape h = average_pooling_2d(x, (rows, cols), stride=1) h = reshape(h, (n, channel)) return h
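# --- Illustrative sketch (not library code) ---
# The helper above is equivalent to a plain mean over the spatial axes:
import numpy as np
x = np.random.randn(2, 3, 5, 7).astype(np.float32)  # (N, C, H, W)
y = x.mean(axis=(2, 3))                             # -> (N, C)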
def n_step_lstm( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, use_cudnn=True): """Stacked Long Short-Term Memory function for sequence inputs. This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`. .. math:: i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\ f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\ o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\ a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\ c_t &= f_t \\cdot c_{t-1} + i_t \\cdot a_t \\\\ h_t &= o_t \\cdot \\tanh(c_t) As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layer. So, when :math:`S` layers exist, you need to prepare :math:`8S` weight matrices and :math:`8S` bias vectors. If the number of layers ``n_layers`` is greater than :math:`1`, the input of the ``k``-th layer is the hidden state ``h_t`` of the ``k-1``-th layer. Note that all input variables except those of the first layer may have a different shape from the first layer's inputs. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only the matrices ``ws[0][j]`` where ``0 <= j < 4`` have shape ``(I, N)``, as they are multiplied with input variables. All other matrices have shape ``(N, N)``. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of the hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``, and ``I`` is the size of the input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort the sequences in descending order by length and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable`\\ s holding sequences. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. Returns: tuple: This function returns a tuple containing three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is the same as ``hx``. - ``cy`` is an updated cell states whose shape is the same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. 
Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t``, and ``N`` is the size of the hidden units. Note that ``B_t`` is the mini-batch size of ``xs[t]``. .. seealso:: :func:`chainer.functions.lstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None x = dropout.dropout(x, ratio=dropout_ratio, train=train) h = dropout.dropout(h, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) return hy, cy, tuple(ys)
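# --- Illustrative sketch (not library code) ---
# The pointwise LSTM applied to lstm_in above. _stack_weight interleaves
# the four gates, so lstm_in reshapes to (B, N, 4); the (a, i, f, o) slot
# order here is an assumption based on the reshape/split convention used
# elsewhere in this file. All names are hypothetical.
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def toy_lstm(c_prev, lstm_in):
    B, four_n = lstm_in.shape
    gates = lstm_in.reshape(B, four_n // 4, 4)
    a, i, f, o = (gates[..., k] for k in range(4))
    c = np.tanh(a) * sigmoid(i) + sigmoid(f) * c_prev
    h = sigmoid(o) * np.tanh(c)
    return c, h

B, N = 2, 3
c, h = toy_lstm(np.zeros((B, N)), np.random.randn(B, 4 * N))  # -> (B, N) each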
def n_step_lstm_base( n_layers, dropout_ratio, hx, cx, ws, bs, xs, train, use_cudnn, use_bi_direction): """Base function for Stacked LSTM/BiLSTM functions. This function is used by :func:`chainer.functions.n_step_lstm` and :func:`chainer.functions.n_step_bilstm`. This function's behavior depends on the argument ``use_bi_direction``. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only the matrices ``ws[0][j]`` where ``0 <= j < 4`` have shape ``(I, N)``, as they are multiplied with input variables. All other matrices have shape ``(N, N)``. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of the hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``, and ``I`` is the size of the input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort the sequences in descending order by length and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable`\\ s holding sequences. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. use_bi_direction (bool): If ``True``, this function uses bi-directional LSTM. Returns: tuple: This function returns a tuple containing three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is the same as ``hx``. - ``cy`` is an updated cell states whose shape is the same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable` . Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t``, and ``N`` is the size of the hidden units. Note that ``B_t`` is the mini-batch size of ``xs[t]``. .. 
seealso:: :func:`chainer.functions.n_step_lstm` :func:`chainer.functions.n_step_bilstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple(itertools.chain( (hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) if use_bi_direction: rnn = NStepBiLSTM(n_layers, states, train=train) else: rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: direction = 2 if use_bi_direction else 1 split_size = n_layers * direction hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] xs_next = xs hy = [] cy = [] for layer in six.moves.range(n_layers): def _one_directional_loop(di): # di=0, forward LSTM # di=1, backward LSTM h_list = [] c_list = [] layer_idx = direction * layer + di h = hx[layer_idx] c = cx[layer_idx] if di == 0: xs_list = xs_next else: xs_list = reversed(xs_next) for x in xs_list: batch = x.shape[0] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None if layer != 0: x = dropout.dropout(x, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer_idx], xbs[layer_idx]) + \ linear.linear(h, hws[layer_idx], hbs[layer_idx]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_list.append(h_bar) c_list.append(c_bar) return h, c, h_list, c_list h, c, h_forward, c_forward = _one_directional_loop(di=0) hy.append(h) cy.append(c) if use_bi_direction: # BiLSTM h, c, h_backward, c_backward = _one_directional_loop(di=1) hy.append(h) cy.append(c) h_backward.reverse() # concat xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in zip(h_forward, h_backward)] else: # Uni-directional RNN xs_next = h_forward ys = xs_next hy = stack.stack(hy) cy = stack.stack(cy) return hy, cy, tuple(ys)
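# --- Illustrative sketch (not library code) ---
# How the bidirectional outputs are merged above: the backward pass runs
# over the reversed sequence, its outputs are reversed back, and each time
# step's forward and backward states are concatenated along the feature
# axis, so every ys[t] ends up with 2 * N units.
import numpy as np
T, B, N = 4, 2, 3
h_forward = [np.random.randn(B, N) for _ in range(T)]
h_backward = [np.random.randn(B, N) for _ in range(T)]  # already re-reversed
ys = [np.concatenate(pair, axis=1) for pair in zip(h_forward, h_backward)]
assert ys[0].shape == (B, 2 * N)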
def n_step_lstm(n_layers, dropout_ratio, hx, cx, ws, bs, xs, train=True, use_cudnn=True): """Stacked Long Short-Term Memory function for sequence inputs. This function calculates stacked LSTM with sequences. This function gets an initial hidden state :math:`h_0`, an initial cell state :math:`c_0`, an input sequence :math:`x`, weight matrices :math:`W`, and bias vectors :math:`b`. This function calculates hidden states :math:`h_t` and :math:`c_t` for each time :math:`t` from input :math:`x_t`. .. math:: i_t &= \\sigma(W_0 x_t + W_4 h_{t-1} + b_0 + b_4) \\\\ f_t &= \\sigma(W_1 x_t + W_5 h_{t-1} + b_1 + b_5) \\\\ o_t &= \\sigma(W_2 x_t + W_6 h_{t-1} + b_2 + b_6) \\\\ a_t &= \\tanh(W_3 x_t + W_7 h_{t-1} + b_3 + b_7) \\\\ c_t &= f_t \\cdot c_{t-1} + i_t \\cdot a_t \\\\ h_t &= o_t \\cdot \\tanh(c_t) As the function accepts a sequence, it calculates :math:`h_t` for all :math:`t` with one call. Eight weight matrices and eight bias vectors are required for each layer. So, when :math:`S` layers exist, you need to prepare :math:`8S` weight matrices and :math:`8S` bias vectors. If the number of layers ``n_layers`` is greater than :math:`1`, the input of the ``k``-th layer is the hidden state ``h_t`` of the ``k-1``-th layer. Note that all input variables except those of the first layer may have a different shape from the first layer's inputs. Args: n_layers(int): Number of layers. dropout_ratio(float): Dropout ratio. hx (chainer.Variable): Variable holding stacked hidden states. Its shape is ``(S, B, N)`` where ``S`` is the number of layers and is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N`` is the dimension of the hidden units. cx (chainer.Variable): Variable holding stacked cell states. It has the same shape as ``hx``. ws (list of list of chainer.Variable): Weight matrices. ``ws[i]`` represents the weights for the i-th layer. Each ``ws[i]`` is a list containing eight matrices. ``ws[i][j]`` corresponds to ``W_j`` in the equation. Only the matrices ``ws[0][j]`` where ``0 <= j < 4`` have shape ``(I, N)``, as they are multiplied with input variables. All other matrices have shape ``(N, N)``. bs (list of list of chainer.Variable): Bias vectors. ``bs[i]`` represents the biases for the i-th layer. Each ``bs[i]`` is a list containing eight vectors. ``bs[i][j]`` corresponds to ``b_j`` in the equation. The shape of each vector is ``(N,)`` where ``N`` is the dimension of the hidden units. xs (list of chainer.Variable): A list of :class:`~chainer.Variable` holding input values. Each element ``xs[t]`` holds the input value for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the mini-batch size for time ``t``, and ``I`` is the size of the input units. Note that this function supports variable-length sequences. When sequences have different lengths, sort the sequences in descending order by length and transpose the sorted sequence. :func:`~chainer.functions.transpose_sequence` transposes a list of :class:`~chainer.Variable`\\ s holding sequences. So ``xs`` needs to satisfy ``xs[t].shape[0] >= xs[t + 1].shape[0]``. train (bool): If ``True``, this function executes dropout. use_cudnn (bool): If ``True``, this function uses cuDNN if available. Returns: tuple: This function returns a tuple containing three elements, ``hy``, ``cy`` and ``ys``. - ``hy`` is an updated hidden states whose shape is the same as ``hx``. - ``cy`` is an updated cell states whose shape is the same as ``cx``. - ``ys`` is a list of :class:`~chainer.Variable`. Each element ``ys[t]`` holds hidden states of the last layer corresponding to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is the mini-batch size for time ``t``, and ``N`` is the size of the hidden units. 
Note that ``B_t`` is the mini-batch size of ``xs[t]``. .. seealso:: :func:`chainer.functions.lstm` """ xp = cuda.get_array_module(hx, hx.data) if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \ _cudnn_version >= 5000: states = get_random_state().create_dropout_states(dropout_ratio) # flatten all input variables inputs = tuple( itertools.chain((hx, cx), itertools.chain.from_iterable(ws), itertools.chain.from_iterable(bs), xs)) rnn = NStepLSTM(n_layers, states, train=train) ret = rnn(*inputs) hy, cy = ret[:2] ys = ret[2:] return hy, cy, ys else: hx = split_axis.split_axis(hx, n_layers, axis=0, force_tuple=True) hx = [reshape.reshape(h, h.shape[1:]) for h in hx] cx = split_axis.split_axis(cx, n_layers, axis=0, force_tuple=True) cx = [reshape.reshape(c, c.shape[1:]) for c in cx] xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws] hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws] xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs] hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs] ys = [] for x in xs: batch = x.shape[0] h_next = [] c_next = [] for layer in six.moves.range(n_layers): h = hx[layer] c = cx[layer] if h.shape[0] > batch: h, h_rest = split_axis.split_axis(h, [batch], axis=0) c, c_rest = split_axis.split_axis(c, [batch], axis=0) else: h_rest = None c_rest = None x = dropout.dropout(x, ratio=dropout_ratio, train=train) h = dropout.dropout(h, ratio=dropout_ratio, train=train) lstm_in = linear.linear(x, xws[layer], xbs[layer]) + \ linear.linear(h, hws[layer], hbs[layer]) c_bar, h_bar = lstm.lstm(c, lstm_in) if h_rest is not None: h = concat.concat([h_bar, h_rest], axis=0) c = concat.concat([c_bar, c_rest], axis=0) else: h = h_bar c = c_bar h_next.append(h) c_next.append(c) x = h_bar hx = h_next cx = c_next ys.append(x) hy = stack.stack(hx) cy = stack.stack(cx) return hy, cy, tuple(ys)
def black_out(x, t, W, samples, reduce='mean'): """BlackOut loss function. The BlackOut loss function is defined as .. math:: -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)), where :math:`t` is the correct label, :math:`S` is a set of negative examples and :math:`p(\\cdot)` is the likelihood of a given label, defined as .. math:: p(y) = \\frac{\\exp(W_y^\\top x)}{ \\sum_{s \\in samples} \\exp(W_s^\\top x)}. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'mean'``, this function takes the mean of the loss values. Args: x (~chainer.Variable): Batch of input vectors. Its shape should be :math:`(N, D)`. t (~chainer.Variable): Vector of ground truth labels. Its shape should be :math:`(N,)`. Each element :math:`v` should satisfy :math:`0 \\leq v < V` or be :math:`-1`, where :math:`V` is the number of label types. W (~chainer.Variable): Weight matrix. Its shape should be :math:`(V, D)`. samples (~chainer.Variable): Negative samples. Its shape should be :math:`(N, S)` where :math:`S` is the number of negative samples. reduce (str): Reduction option. Its value must be either ``'no'`` or ``'mean'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable object holding loss value(s). If ``reduce`` is ``'no'``, the output variable holds an array whose shape is :math:`(N,)` . If it is ``'mean'``, it holds a scalar. See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \ Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_ .. seealso:: :class:`~chainer.links.BlackOut`. """ batch_size = x.shape[0] neg_emb = embed_id.embed_id(samples, W) neg_y = matmul.matmul(neg_emb, x[:, :, None]) neg_y = reshape.reshape(neg_y, neg_y.shape[:-1]) pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1) pos_y = matmul.matmul(pos_emb, x[:, :, None]) pos_y = reshape.reshape(pos_y, pos_y.shape[:-1]) logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1) blogz, bneg_y = broadcast.broadcast( reshape.reshape(logz, (batch_size, 1)), neg_y) ny = exponential.log(1 - exponential.exp(bneg_y - blogz)) py = reshape.reshape(pos_y, (batch_size,)) loss = -(py - logz + _sum.sum(ny, axis=1)) if reduce == 'mean': loss = average.average(loss) return loss
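# --- Illustrative sketch (not library code) ---
# The BlackOut objective computed above, for a single example: p is a
# softmax over the true label t together with the negative samples, and
# the loss is -(log p(t) + sum_s log(1 - p(s))). All names below are
# hypothetical.
import numpy as np

def log_sum_exp(a):
    m = a.max()
    return m + np.log(np.exp(a - m).sum())

rng = np.random.RandomState(0)
D, V = 4, 10                   # feature size, number of labels
x = rng.randn(D)
W = rng.randn(V, D)
t = 2                          # ground-truth label
samples = np.array([5, 7, 9])  # negative samples
pos_y = W[t].dot(x)
neg_y = W[samples].dot(x)
logz = log_sum_exp(np.concatenate([[pos_y], neg_y]))
loss = -((pos_y - logz) + np.log(1 - np.exp(neg_y - logz)).sum())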
def group_normalization(x, groups, gamma, beta, eps=1e-5): """Group normalization function. This function implements "group normalization", which divides the channels into groups, computes the mean and variance within each group, normalizes by these statistics, and then scales and shifts the result. Args: x (:class:`~chainer.Variable` or :ref:`ndarray`): Batch tensors. The first dimension of this value must be the size of the minibatch and the second dimension must be the number of channels. Moreover, this value must have one or more following dimensions, such as height and width. groups (int): The number of channel groups. This value must be a divisor of the number of channels. gamma (:class:`~chainer.Variable` or :ref:`ndarray`): Scaling parameter. beta (:class:`~chainer.Variable` or :ref:`ndarray`): Shifting parameter. eps (float): Epsilon value for numerical stability of normalization. Returns: ~chainer.Variable: The output variable which has the same shape as ``x``. See: `Group Normalization <https://arxiv.org/abs/1803.08494>`_ """ if x.ndim <= 2: raise ValueError('Input dimension must be greater than 2, ' 'including batch size dimension ' '(first dimension).') if not isinstance(groups, int): raise TypeError('Argument \'groups\' must be of type int.') xp = backend.get_array_module(x) batch_size, channels = x.shape[:2] original_shape = x.shape if channels % groups != 0: raise ValueError('Argument \'groups\' must be a divisor ' 'of the number of channels.') # By doing this reshaping, calling the batch_normalization function becomes # equivalent to group normalization. # And a redundant dimension is added in order to utilize ideep64/cuDNN. x = reshape.reshape(x, (1, batch_size * groups, -1, 1)) with cuda.get_device_from_array(x.array): dummy_gamma = xp.ones(batch_size * groups).astype(xp.float32) dummy_beta = xp.zeros(batch_size * groups).astype(xp.float32) with warnings.catch_warnings(): warnings.simplefilter('ignore') x = batch_normalization.batch_normalization(x, dummy_gamma, dummy_beta, eps=eps) x = reshape.reshape(x, original_shape) target_shape = [1, channels] + [1] * (x.ndim - 2) gamma_broadcast = broadcast.broadcast_to( reshape.reshape(gamma, target_shape), x.shape) beta_broadcast = broadcast.broadcast_to( reshape.reshape(beta, target_shape), x.shape) return x * gamma_broadcast + beta_broadcast