Example #1
    def forward(self, x: NPArray) -> NPArray:
        """
        Computes the forward pass for spatial batch normalization.

        Inputs:
        - x: Input data of shape (N, C, H, W)
        - gamma: Scale parameter, of shape (C,)
        - beta: Shift parameter, of shape (C,)
        - bn_param: Dictionary with the following keys:
            - mode: 'train' or 'test'; required
            - eps: Constant for numeric stability
            - momentum: Constant for running mean / variance. momentum=0 means that
                old information is discarded completely at every time step, while
                momentum=1 means that new information is never incorporated. The
                default of momentum=0.9 should work well in most situations.
            - running_mean: Array of shape (D,) giving running mean of features
        - running_var: Array of shape (D,) giving running variance of features

        Returns:
        - out: Output data, of shape (N, C, H, W)
          (values needed for the backward pass are stored in self.cache)
        """
        N, C, H, W = x.shape
        x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
        out_flat = super().forward(x_flat)
        out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
        return out
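
The method above relies on a transpose-and-reshape round trip: moving the channel axis last and flattening (N, H, W) into one batch dimension turns each channel into a column, so the parent 1-D batchnorm normalizes per channel. A minimal standalone sketch of just that reshape trick (the helper names are illustrative, not part of the project):

import numpy as np

# Flatten NCHW -> (N*H*W, C) so each channel becomes one "feature" column,
# then undo the reshape.
def nchw_to_flat(x):
    N, C, H, W = x.shape
    return x.transpose(0, 2, 3, 1).reshape(-1, C)

def flat_to_nchw(x_flat, shape):
    N, C, H, W = shape
    return x_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)

rng = np.random.default_rng(0)
x = rng.normal(size=(2, 3, 4, 5))
x_flat = nchw_to_flat(x)

# Column c of the flattened array holds every (n, h, w) value of channel c,
# so per-column statistics are exactly per-channel statistics.
assert np.allclose(x_flat.mean(axis=0), x.mean(axis=(0, 2, 3)))
# The round trip is lossless.
assert np.allclose(flat_to_nchw(x_flat, x.shape), x)
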
Example #2
    def forward(self, x: NPArray) -> NPArray:
        """
        Computes the forward pass for spatial group normalization.
        In contrast to layer normalization, group normalization splits each entry
        in the data into G contiguous pieces, which it then normalizes independently.
        Per feature shifting and scaling are then applied to the data, in a manner
        identical to that of batch normalization and layer normalization.

        Inputs:
        - x: Input data of shape (N, C, H, W)
        - gamma: Scale parameter, of shape (C,)
        - beta: Shift parameter, of shape (C,)
        - G: Integer number of groups to split into; should be a divisor of C
        - gn_param: Dictionary with the following keys:
            - eps: Constant for numeric stability

        Returns:
        - out: Output data, of shape (N, C, H, W)
          (values needed for the backward pass are stored in self.cache)
        """
        N, C, H, W = x.shape
        self.gamma = self.gamma.reshape((1, C, 1, 1))
        self.beta = self.beta.reshape((1, C, 1, 1))

        x = x.reshape(N * self.G, -1).T
        sample_mean = np.mean(x, axis=0)
        sample_var = np.var(x, axis=0)
        v = np.sqrt(sample_var + self.eps)
        x_hat = (x - sample_mean) / v
        x_hat = x_hat.T.reshape(N, C, H, W)
        out = self.gamma * x_hat + self.beta

        self.cache = (x_hat, v)

        return out
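
The flatten in the method above (reshape to (N * G, -1), then transpose) collects each sample's channels into G contiguous blocks and normalizes every block over its own statistics. A minimal standalone sketch of the same idea using a 3-D reshape, with the gamma/beta scaling omitted (the function name is illustrative):

import numpy as np

# Each sample is split into G groups of C // G channels; every group is
# normalized with its own mean and variance.
def group_norm_sketch(x, G, eps=1e-5):
    N, C, H, W = x.shape
    g = x.reshape(N, G, -1)                  # (N, G, C//G * H * W)
    mean = g.mean(axis=2, keepdims=True)
    var = g.var(axis=2, keepdims=True)
    return ((g - mean) / np.sqrt(var + eps)).reshape(N, C, H, W)

rng = np.random.default_rng(0)
x = rng.normal(size=(2, 6, 4, 4))
out = group_norm_sketch(x, G=3)

# Every (sample, group) block now has zero mean and (nearly) unit variance.
blocks = out.reshape(2, 3, -1)
assert np.allclose(blocks.mean(axis=2), 0, atol=1e-6)
assert np.allclose(blocks.var(axis=2), 1, atol=1e-3)
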
Example #3
File: svm.py Project: TylerYep/edutorch
    def loss(self, X: NPArray, y: NPIntArray) -> tuple[float, NPArray]:
        """
        Structured SVM loss function, vectorized implementation.
        Inputs have dimension D, there are C classes, and we operate on minibatches
        of N examples.
        Inputs:
        - W: A numpy array of shape (D, C) containing weights.
        - X: A numpy array of shape (N, D) containing a minibatch of data.
        - y: A numpy array of shape (N,) containing training labels; y[i] = c means
            that X[i] has label c, where 0 <= c < C.
        - reg: (float) regularization strength
        Returns a tuple of:
        - loss as single float
        - gradient with respect to weights W; an array of same shape as W
        """
        num_train = X.shape[0]
        score_matrix = X.dot(self.W)
        correct_class_scores = score_matrix[np.arange(num_train),
                                            y].reshape(-1, 1)
        margin = score_matrix - correct_class_scores + 1  # note delta = 1
        margin[margin < 0] = 0
        margin[np.arange(num_train), y] = 0
        loss = np.sum(margin)

        margin[margin > 0] = 1
        num_y = np.sum(margin, axis=1)
        margin[np.arange(num_train), y] = -num_y
        dW = X.T.dot(margin)

        return loss, dW
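
The vectorized gradient can be sanity-checked against central differences. Below is a standalone sketch under the same conventions as the snippet above (no averaging over the batch, no regularization term); the free function svm_loss is illustrative and not part of the project:

import numpy as np

def svm_loss(W, X, y):
    n = X.shape[0]
    scores = X.dot(W)
    margin = scores - scores[np.arange(n), y].reshape(-1, 1) + 1.0
    margin[margin < 0] = 0
    margin[np.arange(n), y] = 0
    loss = np.sum(margin)

    # Each positive margin contributes +x to its column and -x to the
    # correct-class column.
    margin[margin > 0] = 1
    margin[np.arange(n), y] = -np.sum(margin, axis=1)
    return loss, X.T.dot(margin)

rng = np.random.default_rng(0)
W = rng.normal(scale=1e-2, size=(5, 3))
X, y = rng.normal(size=(10, 5)), rng.integers(0, 3, size=10)
loss, dW = svm_loss(W, X, y)

# Spot-check a few entries of dW with central differences.
h = 1e-6
for i, j in [(0, 0), (2, 1), (4, 2)]:
    Wp, Wm = W.copy(), W.copy()
    Wp[i, j] += h
    Wm[i, j] -= h
    num = (svm_loss(Wp, X, y)[0] - svm_loss(Wm, X, y)[0]) / (2 * h)
    assert abs(num - dW[i, j]) < 1e-4 * max(1.0, abs(num))
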
Example #4
    def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
        """
        Backward pass for temporal affine layer.

        Input:
        - dout: Upstream gradients of shape (N, T, M)
        - cache: Values from forward pass

        Returns a tuple of:
        - dx: Gradient of input, of shape (N, T, D)
        - dw: Gradient of weights, of shape (D, M)
        - db: Gradient of biases, of shape (M,)
        """
        (x, ) = self.cache
        N, T, D = x.shape
        M = self.b.shape[0]

        dx = dout.reshape(N * T, M).dot(self.w.T).reshape(N, T, D)
        dw = dout.reshape(N * T, M).T.dot(x.reshape(N * T, D)).T
        db = dout.sum(axis=(0, 1))

        return dx, dw, db
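
The backward pass above is just the ordinary affine backward applied to the (N * T, D) flattened view of the input. A small standalone check against an explicit loop over timesteps, with toy shapes:

import numpy as np

rng = np.random.default_rng(0)
N, T, D, M = 2, 3, 4, 5
x = rng.normal(size=(N, T, D))
w = rng.normal(size=(D, M))
dout = rng.normal(size=(N, T, M))

# Vectorized gradients, as in the method above.
dx = dout.reshape(N * T, M).dot(w.T).reshape(N, T, D)
dw = dout.reshape(N * T, M).T.dot(x.reshape(N * T, D)).T
db = dout.sum(axis=(0, 1))

# Reference: accumulate the plain affine backward one timestep at a time.
dw_ref = sum(x[:, t].T.dot(dout[:, t]) for t in range(T))
dx_ref = np.stack([dout[:, t].dot(w.T) for t in range(T)], axis=1)
assert np.allclose(dw, dw_ref) and np.allclose(dx, dx_ref)
assert np.allclose(db, dout.reshape(-1, M).sum(axis=0))
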
Example #5
    def backward_naive(self, dout: NPArray) -> tuple[NPArray, ...]:
        """
        Backward pass for batch normalization.

        For this implementation, you should write out a computation graph for
        batch normalization on paper and propagate gradients backward through
        intermediate nodes.

        Inputs:
        - dout: Upstream derivatives, of shape (N, D)
        - cache: Variable of intermediates from batchnorm_forward.

        Returns a tuple of:
        - dx: Gradient with respect to inputs x, of shape (N, D)
        - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
        - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
        """
        xn, std = self.cache

        if self.train_mode:
            N = dout.shape[0]
            dbeta = dout.sum(axis=0)
            dgamma = np.sum(xn * dout, axis=0)
            dxn = self.gamma * dout
            dxc = dxn / std
            dstd = -np.sum((dxn * xn) / std, axis=0)
            dvar = 0.5 * dstd / std
            dxc += (2 / N) * (xn * std) * dvar
            dmu = np.sum(dxc, axis=0)
            dx = dxc - dmu / N

        else:
            dbeta = dout.sum(axis=0)
            dgamma = np.sum(xn * dout, axis=0)
            dxn = self.gamma * dout
            dx = dxn / std

        return dx, dgamma, dbeta
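
The train-mode branch walks the batchnorm computation graph node by node, so it can be checked against finite differences of the forward pass. A standalone sketch with toy forward/backward functions that mirror the class above (the function names are illustrative):

import numpy as np

def bn_forward(x, gamma, beta, eps=1e-5):
    mu = x.mean(axis=0)
    xc = x - mu
    var = np.mean(xc**2, axis=0)
    std = np.sqrt(var + eps)
    xn = xc / std
    return gamma * xn + beta, (xn, std)

def bn_backward(dout, cache, gamma):
    xn, std = cache
    N = dout.shape[0]
    dbeta = dout.sum(axis=0)
    dgamma = np.sum(xn * dout, axis=0)
    dxn = gamma * dout
    dxc = dxn / std
    dstd = -np.sum(dxn * xn / std, axis=0)
    dvar = 0.5 * dstd / std
    dxc += (2 / N) * (xn * std) * dvar
    dx = dxc - np.sum(dxc, axis=0) / N
    return dx, dgamma, dbeta

rng = np.random.default_rng(0)
x, dout = rng.normal(size=(4, 3)), rng.normal(size=(4, 3))
gamma, beta = rng.normal(size=3), rng.normal(size=3)
_, cache = bn_forward(x, gamma, beta)
dx, dgamma, dbeta = bn_backward(dout, cache, gamma)

# Numerical gradient of f(x) = sum(forward(x) * dout) w.r.t. one entry of x.
h = 1e-5
xp, xm = x.copy(), x.copy()
xp[1, 2] += h
xm[1, 2] -= h
num = (np.sum(bn_forward(xp, gamma, beta)[0] * dout)
       - np.sum(bn_forward(xm, gamma, beta)[0] * dout)) / (2 * h)
assert abs(num - dx[1, 2]) < 1e-5 * max(1.0, abs(num))
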
Example #6
    def loss(self, X: NPArray, y: NPIntArray) -> tuple[float, NPArray]:
        """
        Softmax loss function, vectorized version.
        Inputs and outputs are the same as softmax_loss_naive.
        """
        num_classes = self.W.shape[1]
        num_train = X.shape[0]
        scores = X.dot(self.W)
        softmx = softmax(scores)

        # Cross-entropy loss is the negative log-probability of the correct class.
        loss = -np.sum(np.log(softmx[np.arange(num_train), y]))

        kronecker = np.zeros((num_train, num_classes))
        kronecker[np.arange(num_train), y] = 1
        dW = X.T.dot(softmx - kronecker)

        return loss, dW
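
The loss and gradient can be verified numerically. A standalone sketch with an explicit softmax helper standing in for the one the project imports (both free functions here are illustrative only):

import numpy as np

def softmax(scores):
    shifted = scores - scores.max(axis=1, keepdims=True)   # numerical stability
    e = np.exp(shifted)
    return e / e.sum(axis=1, keepdims=True)

def softmax_loss(W, X, y):
    n, c = X.shape[0], W.shape[1]
    p = softmax(X.dot(W))
    loss = -np.sum(np.log(p[np.arange(n), y]))
    onehot = np.zeros((n, c))
    onehot[np.arange(n), y] = 1
    return loss, X.T.dot(p - onehot)

rng = np.random.default_rng(0)
W = rng.normal(scale=1e-2, size=(5, 3))
X, y = rng.normal(size=(8, 5)), rng.integers(0, 3, size=8)
loss, dW = softmax_loss(W, X, y)

# Spot-check one gradient entry by central differences.
h = 1e-6
Wp, Wm = W.copy(), W.copy()
Wp[2, 1] += h
Wm[2, 1] -= h
num = (softmax_loss(Wp, X, y)[0] - softmax_loss(Wm, X, y)[0]) / (2 * h)
assert abs(num - dW[2, 1]) < 1e-5 * max(1.0, abs(num))
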
Example #7
    def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
        """
        Computes the backward pass for spatial batch normalization.

        Inputs:
        - dout: Upstream derivatives, of shape (N, C, H, W)
        - cache: Values from the forward pass

        Returns a tuple of:
        - dx: Gradient with respect to inputs, of shape (N, C, H, W)
        - dgamma: Gradient with respect to scale parameter, of shape (C,)
        - dbeta: Gradient with respect to shift parameter, of shape (C,)
        """
        N, C, H, W = dout.shape
        dout_flat = dout.transpose(0, 2, 3, 1).reshape(-1, C)
        dx_flat, dgamma, dbeta = super().backward(dout_flat)
        dx = dx_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)
        return dx, dgamma, dbeta
Example #8
def temporal_softmax_loss(x: NPArray, y: NPIntArray,
                          mask: NPBoolArray) -> tuple[float, NPArray]:
    """
    A temporal version of softmax loss for use in RNNs. We assume that we are
    making predictions over a vocabulary of size V for each timestep of a
    timeseries of length T, over a minibatch of size N. The input x gives scores
    for all vocabulary elements at all timesteps, and y gives the indices of the
    ground-truth element at each timestep. We use a cross-entropy loss at each
    timestep, summing the loss over all timesteps and averaging across the
    minibatch.

    As an additional complication, we may want to ignore the model output at some
    timesteps, since sequences of different length may have been combined into a
    minibatch and padded with NULL tokens. The optional mask argument tells us
    which elements should contribute to the loss.

    Inputs:
    - x: Input scores, of shape (N, T, V)
    - y: Ground-truth indices, of shape (N, T) where each element is in the range
         0 <= y[i, t] < V
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not
      the scores at x[i, t] should contribute to the loss.

    Returns a tuple of:
    - loss: Scalar giving loss
    - dx: Gradient of loss with respect to scores x.
    """
    N, T, V = x.shape

    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
    dx_flat = probs.copy()
    dx_flat[np.arange(N * T), y_flat] -= 1
    dx_flat /= N
    dx_flat *= mask_flat[:, None]

    dx = dx_flat.reshape(N, T, V)
    return loss, dx
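
A short usage sketch, assuming numpy and the function above are in scope: two sequences padded to T = 3, where the second has true length 1, so the mask removes its padded timesteps from both the loss and the gradient.

import numpy as np

N, T, V = 2, 3, 5
rng = np.random.default_rng(0)
x = rng.normal(size=(N, T, V))
y = rng.integers(0, V, size=(N, T))
mask = np.array([[True, True, True],
                 [True, False, False]])

loss, dx = temporal_softmax_loss(x, y, mask)
assert loss > 0 and dx.shape == (N, T, V)
# Masked timesteps contribute neither loss nor gradient.
assert np.all(dx[1, 1:] == 0)
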
Example #9
File: linear.py Project: TylerYep/edutorch
    def forward(self, x: NPArray) -> NPArray:
        """
        Computes the forward pass for an affine (fully-connected) layer.

        The input x has shape (N, d_1, ..., d_k) where x[i] is the ith input.
        We multiply this against a weight matrix of shape (D, M) where
        D = prod_i d_i

        Inputs:
        - x: Input data, of shape (N, d_1, ..., d_k)
        - w: Weights, of shape (D, M)
        - b: Biases, of shape (M,)

        Returns:
        - out: Output, of shape (N, M)
          (the input x is cached on the module for the backward pass)
        """
        self.cache = (x, )
        return x.reshape(x.shape[0], -1).dot(self.w) + self.b
Example #10
File: linear.py Project: TylerYep/edutorch
    def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
        """
        Computes the backward pass for an affine layer.

        Inputs:
        - dout: Upstream derivative, of shape (N, M)
        - cache: Tuple containing the input data x, of shape (N, d_1, ..., d_k)

        Returns a tuple of:
        - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
        - dw: Gradient with respect to w, of shape (D, M)
        - db: Gradient with respect to b, of shape (M,)
        """
        (x, ) = self.cache
        dx = dout.dot(self.w.T).reshape(x.shape)
        dw = x.reshape(x.shape[0], -1).T.dot(dout)
        db = np.sum(dout, axis=0)
        return dx, dw, db
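
The forward/backward pair can be verified numerically. A standalone sketch with free functions mirroring the two methods above (the names are illustrative), checking dx against central differences on a 4-D input so the reshape path is exercised:

import numpy as np

def affine_forward(x, w, b):
    return x.reshape(x.shape[0], -1).dot(w) + b

def affine_backward(dout, x, w):
    dx = dout.dot(w.T).reshape(x.shape)
    dw = x.reshape(x.shape[0], -1).T.dot(dout)
    db = dout.sum(axis=0)
    return dx, dw, db

rng = np.random.default_rng(0)
x = rng.normal(size=(2, 3, 2, 2))   # N = 2, (d_1, d_2, d_3) = (3, 2, 2), D = 12
w = rng.normal(size=(12, 4))
b = rng.normal(size=4)
dout = rng.normal(size=(2, 4))
dx, dw, db = affine_backward(dout, x, w)

# Numerical gradient of f(x) = sum(forward(x) * dout) w.r.t. one input entry.
h = 1e-6
xp, xm = x.copy(), x.copy()
xp[1, 2, 0, 1] += h
xm[1, 2, 0, 1] -= h
num = (np.sum(affine_forward(xp, w, b) * dout)
       - np.sum(affine_forward(xm, w, b) * dout)) / (2 * h)
assert abs(num - dx[1, 2, 0, 1]) < 1e-5 * max(1.0, abs(num))
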
Example #11
    def forward(self, x: NPArray) -> NPArray:
        """
        Forward pass for a temporal affine layer. The input is a set of D-dimensional
        vectors arranged into a minibatch of N timeseries, each of length T. We use
        an affine function to transform each of those vectors into a new vector of
        dimension M.

        Inputs:
        - x: Input data of shape (N, T, D)
        - w: Weights of shape (D, M)
        - b: Biases of shape (M,)

        Returns:
        - out: Output data of shape (N, T, M)
          (values needed for the backward pass are stored in self.cache)
        """
        N, T, D = x.shape
        M = self.b.shape[0]
        self.cache = (x, )
        return x.reshape(N * T, D).dot(self.w).reshape(N, T, M) + self.b
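
The single matrix multiply on the (N * T, D) view above is equivalent to applying the same affine map independently at every timestep. A small standalone check with toy shapes:

import numpy as np

rng = np.random.default_rng(0)
N, T, D, M = 2, 3, 4, 5
x = rng.normal(size=(N, T, D))
w = rng.normal(size=(D, M))
b = rng.normal(size=M)

out = x.reshape(N * T, D).dot(w).reshape(N, T, M) + b
out_loop = np.stack([x[:, t].dot(w) + b for t in range(T)], axis=1)
assert np.allclose(out, out_loop)
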
Example #12
    def forward(self, x: NPArray) -> NPArray:
        """
        Forward pass for batch normalization.
        Uses minibatch statistics to compute the mean and variance, uses
        these statistics to normalize the incoming data, and scales and
        shifts the normalized data using gamma and beta.

        During training the sample mean and (uncorrected) sample variance are
        computed from minibatch statistics and used to normalize the incoming data.
        During training we also keep an exponentially decaying running mean of the mean
        and variance of each feature, and these averages are used to normalize data
        at test-time.

        At each timestep we update the running averages for mean and variance using
        an exponential decay based on the momentum parameter:

        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var

        Note that although the running variance is what gets tracked, the data
        is normalized using the standard deviation (the square root of the
        variance).
        Referencing the original paper (https://arxiv.org/abs/1502.03167)
        might prove to be helpful.

        Note that the batch normalization paper suggests a different test-time
        behavior: they compute sample mean and variance for each feature using a
        large number of training images rather than using a running average. For
        this implementation we have chosen to use running averages instead since
        they do not require an additional estimation step; the torch7 implementation
        of batch normalization also uses running averages.

        Input:
        - x: Data of shape (N, D)
        - gamma: Scale parameter of shape (D,)
        - beta: Shift parameter of shape (D,)
        - bn_param: Dictionary with the following keys:
            - mode: 'train' or 'test'; required
            - eps: Constant for numeric stability
            - momentum: Constant for running mean / variance.
            - running_mean: Array of shape (D,) giving running mean of features
            - running_var: Array of shape (D,) giving running variance of features

        Returns:
        - out: Normalized output, of shape (N, D)
          (a tuple of values needed in the backward pass is stored in self.cache)
        """
        if self.train_mode:
            # Compute output
            mu = x.mean(axis=0)
            xc = x - mu
            var = np.mean(xc**2, axis=0)
            std = np.sqrt(var + self.eps)
            xn = xc / std
            out = self.gamma * xn + self.beta

            # Update running average of mean
            self.running_mean *= self.momentum
            self.running_mean += (1 - self.momentum) * mu

            # Update running average of variance
            self.running_var *= self.momentum
            self.running_var += (1 - self.momentum) * var

        else:
            # Using running mean and variance to normalize
            std = np.sqrt(self.running_var + self.eps)
            xn = (x - self.running_mean) / std
            out = self.gamma * xn + self.beta

        self.cache = (xn, std)

        return out
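
A standalone sketch of just the running-average bookkeeping in the train-mode branch: after enough batches the running statistics approach the per-feature data statistics, which is what the test-mode branch then uses for normalization (the data here is toy data, not from the project):

import numpy as np

rng = np.random.default_rng(0)
momentum, eps = 0.9, 1e-5
running_mean = np.zeros(3)
running_var = np.zeros(3)

for _ in range(200):
    batch = 2.0 + 0.5 * rng.normal(size=(32, 3))   # features with mean 2, std 0.5
    mu = batch.mean(axis=0)
    var = batch.var(axis=0)
    running_mean = momentum * running_mean + (1 - momentum) * mu
    running_var = momentum * running_var + (1 - momentum) * var

# The running averages converge toward the true feature statistics ...
assert np.allclose(running_mean, 2.0, atol=0.1)
assert np.allclose(running_var, 0.25, atol=0.05)

# ... and the test-mode path normalizes with them instead of batch statistics.
x_test = 2.0 + 0.5 * rng.normal(size=(8, 3))
xn = (x_test - running_mean) / np.sqrt(running_var + eps)
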