def forward(self, inputs):
    # inputs: (batch_size, n_ts, input_dim)
    batch_size, n_ts, input_dim = inputs.shape
    if not self.is_init:
        # lazily initialize parameters once the input dimension is known
        self.shapes = {
            "W_g": [3 * self.n_h, input_dim + self.n_h],
            "b_g": [3 * self.n_h],
            "W_c": [self.n_h, input_dim + self.n_h],
            "b_c": [self.n_h]
        }
        self._init_params()

    # hidden state and cell state; index -1 holds the zero initial state
    h = empty((batch_size, n_ts + 1, self.n_h))
    h[:, -1] = 0.0
    c = empty((batch_size, n_ts + 1, self.n_h))
    c[:, -1] = 0.0
    gates = empty((batch_size, n_ts, 3 * self.n_h))
    c_hat = empty((batch_size, n_ts + 1, self.n_h))

    for t in range(n_ts):
        # concatenate the previous hidden state with the current input
        z = np.hstack([h[:, t - 1], inputs[:, t]])
        # output, input and forget gates computed in a single matmul
        gates[:, t] = sigmoid(z @ self.params["W_g"].T + self.params["b_g"])
        o_gate, i_gate, f_gate = np.split(gates[:, t], 3, axis=1)
        # candidate cell state
        c_hat[:, t] = np.tanh(z @ self.params["W_c"].T + self.params["b_c"])
        # state updates
        c[:, t] = f_gate * c[:, t - 1] + i_gate * c_hat[:, t]
        h[:, t] = o_gate * np.tanh(c[:, t])

    # cache intermediate results for the backward pass
    self.ctx = {"h": h, "c": c, "X": inputs, "gates": gates, "c_hat": c_hat}
    # h[:, -2] is the hidden state of the last time step (index -1 is the initial state)
    return h[:, -2]
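
# A minimal standalone sketch of a single time step of the recurrence above, useful for
# checking shapes in isolation. The helper names (lstm_step, _sigmoid) and the toy
# dimensions are assumptions for illustration, not part of the layer's API.
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W_g, b_g, W_c, b_c):
    # mirrors the loop body of forward(): gates, candidate cell state, state updates
    z = np.hstack([h_prev, x_t])                      # (batch, n_h + input_dim)
    gates = _sigmoid(z @ W_g.T + b_g)                 # (batch, 3 * n_h)
    o_gate, i_gate, f_gate = np.split(gates, 3, axis=1)
    c_hat = np.tanh(z @ W_c.T + b_c)                  # candidate cell state
    c_t = f_gate * c_prev + i_gate * c_hat
    h_t = o_gate * np.tanh(c_t)
    return h_t, c_t

# toy shape check, following the self.shapes dict above
batch, n_h, input_dim = 2, 4, 3
rng = np.random.default_rng(0)
W_g, b_g = rng.normal(size=(3 * n_h, input_dim + n_h)), np.zeros(3 * n_h)
W_c, b_c = rng.normal(size=(n_h, input_dim + n_h)), np.zeros(n_h)
h_t, c_t = lstm_step(rng.normal(size=(batch, input_dim)),
                     np.zeros((batch, n_h)), np.zeros((batch, n_h)),
                     W_g, b_g, W_c, b_c)
assert h_t.shape == (batch, n_h) and c_t.shape == (batch, n_h)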
def grad(self, logits, labels):
    # gradient of the weighted sigmoid cross-entropy in loss() below w.r.t. the logits
    neg_weight, pos_weight = self._weights
    grads = neg_weight * sigmoid(logits) - pos_weight * labels + \
        (pos_weight - neg_weight) * labels * sigmoid(logits)
    return grads / labels.shape[0]
def loss(self, logits, labels):
    neg_weight, pos_weight = self._weights
    # weighted sigmoid cross-entropy:
    # w_n * (1 - y) * x - [w_p * y + w_n * (1 - y)] * log(sigmoid(x))
    cost = neg_weight * logits * (1 - labels) - \
        (pos_weight * labels + neg_weight * (1 - labels)) * \
        np.log(sigmoid(logits))
    return np.sum(cost) / labels.shape[0]
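
# Sanity check for the two methods above: the analytic grad() should match a
# finite-difference gradient of loss(). Standalone re-implementations below; the
# class weights, data and helper names are arbitrary assumptions made for the check.
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def _wce_loss(logits, labels, neg_w, pos_w):
    cost = neg_w * logits * (1 - labels) - \
        (pos_w * labels + neg_w * (1 - labels)) * np.log(_sigmoid(logits))
    return np.sum(cost) / labels.shape[0]

def _wce_grad(logits, labels, neg_w, pos_w):
    return (neg_w * _sigmoid(logits) - pos_w * labels +
            (pos_w - neg_w) * labels * _sigmoid(logits)) / labels.shape[0]

rng = np.random.default_rng(0)
logits = rng.normal(size=(5, 1))
labels = rng.integers(0, 2, size=(5, 1)).astype(float)
neg_w, pos_w = 1.0, 3.0

# central finite differences, one logit at a time
eps, num_grad = 1e-6, np.zeros_like(logits)
for i in range(logits.shape[0]):
    bump = np.zeros_like(logits)
    bump[i] = eps
    num_grad[i] = (_wce_loss(logits + bump, labels, neg_w, pos_w) -
                   _wce_loss(logits - bump, labels, neg_w, pos_w)) / (2 * eps)

assert np.allclose(num_grad, _wce_grad(logits, labels, neg_w, pos_w), atol=1e-6)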
def func(self, x):
    # swish-style activation: f(x) = x * sigmoid(alpha * x)
    # the sigmoid term is cached so the derivative can reuse it
    self._cache = sigmoid(self._alpha * x)
    return x * self._cache
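
# The cached sigmoid is presumably reused when differentiating. For f(x) = x * sigmoid(a*x)
# the derivative is sigmoid(a*x) + a*x*sigmoid(a*x)*(1 - sigmoid(a*x)), a standard identity;
# the sketch below checks it numerically (helper names are assumptions, not this class's methods).
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def _swish(x, alpha=1.0):
    return x * _sigmoid(alpha * x)

def _swish_derivative(x, alpha=1.0):
    s = _sigmoid(alpha * x)          # the quantity stored in self._cache above
    return s + alpha * x * s * (1 - s)

x = np.linspace(-3.0, 3.0, 7)
eps = 1e-6
numeric = (_swish(x + eps) - _swish(x - eps)) / (2 * eps)
assert np.allclose(numeric, _swish_derivative(x), atol=1e-6)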
def derivative(self, x):
    # sigmoid(x) is the derivative of softplus, log(1 + exp(x))
    return sigmoid(x)
def func(self, x):
    return sigmoid(x)
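
# All of the snippets above call a sigmoid helper that is not shown here. One common,
# numerically stable way to write it (an assumption about the helper, not necessarily
# how this codebase defines it):
import numpy as np

def sigmoid(x):
    # never exponentiate a large positive value, so exp() cannot overflow
    x = np.asarray(x, dtype=float)
    out = np.empty_like(x)
    pos = x >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    exp_x = np.exp(x[~pos])
    out[~pos] = exp_x / (1.0 + exp_x)
    return out

print(sigmoid(np.array([-1000.0, 0.0, 1000.0])))  # [0.  0.5  1.] with no overflow warning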