def _oper_cpu(cls, x, pz, ps, parameter):
    """Run one forward step of the recurrent cell on the CPU.

    Presumably an LSTM-style cell: one tanh block and three sigmoid gate
    blocks are cut out of a single stacked pre-activation.

    Args:
        x: Input batch for this step.
        pz: Output of the previous step, or None to start from zeros.
        ps: Cell state of the previous step, or None to start from zeros.
        parameter: Mapping holding the weights under "w", "wr" and "b".

    Returns:
        The node built by ``cls._create_node`` around the new output, with
        the intermediate values recorded on its ``attrs``.
    """
    weights = parameter

    # Missing history is replaced by zeros; the width is a quarter of the
    # stacked input weight matrix.
    if ps is None:
        prev_state = np.zeros((x.shape[0], weights["w"].shape[1] // 4), dtype=precision)
    else:
        prev_state = ps
    if pz is None:
        prev_out = np.zeros((x.shape[0], weights["w"].shape[1] // 4), dtype=precision)
    else:
        prev_out = pz

    pre_act = dot(x, weights["w"]) + dot(prev_out, weights["wr"]) + weights["b"]
    m = pre_act.shape[1] // 4

    # First m columns feed tanh, the remaining 3 * m columns feed sigmoid.
    cell_in, gate_logits = np.split(pre_act, [m], axis=1)
    cell_in = tanh(cell_in)
    gates = sigmoid(gate_logits)

    state = gates[:, m:m * 2] * cell_in + gates[:, :m] * prev_state
    out = tanh(state) * gates[:, m * 2:]

    node = cls._create_node(out)
    recorded = (
        ("_x", x),
        ("_p", parameter),
        ("_u", cell_in),
        ("_pstate", ps),
        ("_state", state),
        ("_gated", gates),
        ("_dt_d", [weights["wr"], weights["w"]]),
    )
    for attr_name, attr_value in recorded:
        setattr(node.attrs, attr_name, attr_value)
    node._state = state

    # Hand the first gate block of this step back to the previous node.
    if isinstance(pz, Node):
        pz.attrs._pfgate = gates[:, :m]
    return node
def _oper_cpu(cls, x, pz, ps, w, wr, b):
    """Compute one forward step of the recurrent cell on the CPU.

    Presumably an LSTM-style cell; the stacked pre-activation is cut into
    one tanh block followed by three sigmoid gate blocks.

    Args:
        x: Input batch for this step.
        pz: Previous output, or None (zeros are used instead).
        ps: Previous cell state, or None (zeros are used instead).
        w: Stacked input weights; a quarter of its column count is the
            width of the zero-filled history.
        wr: Stacked recurrent weights.
        b: Stacked bias.

    Returns:
        Node created by ``cls._create_node`` for the new output, with the
        inputs and intermediates recorded on ``attrs``.
    """
    if ps is None:
        prev_state = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
    else:
        prev_state = ps
    if pz is None:
        prev_out = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
    else:
        prev_out = pz

    stacked = dot(x, w) + dot(prev_out, wr) + b
    m = stacked.shape[1] // 4

    # Columns [0, m) go through tanh, columns [m, 4m) through sigmoid.
    candidate, gated = np.split(stacked, [m], axis=1)
    candidate = tanh(candidate)
    gated = sigmoid(gated)

    state = gated[:, m:m * 2] * candidate + gated[:, :m] * prev_state
    out = tanh(state) * gated[:, m * 2:]

    node = cls._create_node(out)
    for attr_name, attr_value in (
        ("_x", x),
        ("_w", w),
        ("_wr", wr),
        ("_b", b),
        ("_pz", pz),
        ("_u", candidate),
        ("_pstate", ps),
        ("_state", state),
        ("_gated", gated),
    ):
        setattr(node.attrs, attr_name, attr_value)
    node._state = state

    # The previous node receives the first gate block of this step.
    if isinstance(pz, Node):
        pz.attrs._pfgate = gated[:, :m]
    return node
def _oper_gpu(cls, x, pz, ps, parameter):
    """Run one forward step on the GPU with separate per-gate weights.

    Each gate has its own input weight, recurrent weight, bias and a
    state-coupled weight ("wfc", "wic", "woc") that multiplies the cell
    state element-wise.

    Args:
        x: Input batch for this step.
        pz: Previous output, or None to start from zeros.
        ps: Previous cell state, or None to start from zeros.
        parameter: Mapping holding all weights and biases by name.

    Returns:
        Node created by ``cls._create_node`` around the GPU output, with
        gates, state and parameters recorded on ``attrs``.
    """
    weights = parameter

    if ps is None:
        prev_state = get_gpu(np.zeros((x.shape[0], weights["w"].shape[1]), dtype=precision))
    else:
        prev_state = ps
    if pz is None:
        prev_out = get_gpu(prev_state).zeros_like_me()
    else:
        prev_out = pz

    cell_logit = dot(x, weights["w"]) + dot(prev_out, weights["wr"]) + weights["b"]

    forget_logit = (dot(x, weights["wf"]) + dot(prev_out, weights["wfr"])
                    + weights["wfc"] * prev_state + weights["bf"])
    forget_gate = sigmoid(forget_logit)

    input_logit = (dot(x, weights["wi"]) + dot(prev_out, weights["wir"])
                   + weights["wic"] * prev_state + weights["bi"])
    input_gate = sigmoid(input_logit)

    state = input_gate * tanh(cell_logit) + forget_gate * prev_state

    # The output gate is coupled to the freshly computed state.
    output_logit = (dot(x, weights["wo"]) + dot(prev_out, weights["wor"])
                    + weights["bo"] + weights["woc"] * state)
    output_gate = sigmoid(output_logit)

    out = tanh(state) * output_gate

    node = cls._create_node(get_gpu(out))
    recorded = (
        ("_x", x),
        ("_p", parameter),
        ("_u", cell_logit),
        ("_pgated_f", None),
        ("_pstate", ps),
        ("_state", state),
        ("_gated_o", output_gate),
        ("_gated_f", forget_gate),
        ("_gated_i", input_gate),
        ("_dt_d", [weights["wr"], weights["wi"], weights["wf"], weights["wo"], weights["w"]]),
    )
    for attr_name, attr_value in recorded:
        setattr(node.attrs, attr_name, attr_value)
    node._state = state
    return node
def _backward_gpu(self, context, dy):
    """Back-propagate ``dy`` through one per-gate recurrent step on the GPU.

    Gradients kept in ``context`` under the keys listed in
    ``self.attrs._dt_d`` are restored, combined with ``dy``, and the new
    per-gate gradients are stored back under the same keys. Parameter and
    input gradients are pushed with ``_update_diff`` to every operand that
    is a ``Node``.

    Args:
        context: Gradient context providing ``restore`` and ``store``.
        dy: Gradient flowing into this node's output.
    """
    p = self.attrs._p
    s = self.attrs._state
    ps = self.attrs._pstate
    u = self.attrs._u
    go = self.attrs._gated_o
    gf = self.attrs._gated_f
    gi = self.attrs._gated_i
    pgf = get_gpu(gf).zeros_like_me() if self.attrs._pgated_f is None else self.attrs._pgated_f

    # Restore order follows _dt_d; keys without a stored value yield zeros.
    drt, dit, dft, doot, dct = (context.restore(dt, get_gpu(dy).zeros_like_me())
                                for dt in self.attrs._dt_d)

    activated_s = tanh(s)
    activated_u = tanh(u)

    # The original doubled the operator across continuation lines
    # ("+ \ +"), which applied a stray unary plus to the last three terms.
    # A single binary plus per term is what is intended.
    e = (dy
         + get_gpu(dot(drt, p["wr"].T))
         + get_gpu(dot(dit, p["wir"].T))
         + get_gpu(dot(dft, p["wfr"].T))
         + get_gpu(dot(doot, p["wor"].T)))

    do = gate_diff(go) * activated_s * e
    ds = go * activation_diff(activated_s) * e
    dc = ds + pgf * dct + p["wfc"] * dft + p["wic"] * dit + p["woc"] * do

    # Without a previous state the forget-gate branch has no gradient.
    df = gate_diff(gf) * ps * dc if ps is not None else get_gpu(gf).zeros_like_me()
    di = gate_diff(gi) * activated_u * dc
    d = gi * activation_diff(activated_u) * dc

    dx = (dot(d, p["w"].T)
          + dot(di, p["wi"].T)
          + dot(do, p["wo"].T)
          + dot(df, p["wf"].T))

    for dt_d, dt in zip(self.attrs._dt_d, (d, di, df, do, dc)):
        context.store(dt_d, get_gpu(dt))

    if isinstance(self.attrs._x, Node):
        self.attrs._x._update_diff(context, get_gpu(dx))

    # Input weights use the gradients computed in this call.
    for k, diff in zip(("w", "wo", "wi", "wf"), (d, do, di, df)):
        if isinstance(p[k], Node):
            p[k]._update_diff(context, get_gpu(dot(self.attrs._x.T, diff)))

    # Recurrent weights use the gradients restored from the context.
    for k, diff in zip(("wr", "wor", "wir", "wfr"), (drt, doot, dit, dft)):
        if isinstance(p[k], Node):
            p[k]._update_diff(context, get_gpu(dot(self.T, diff)))

    # State-coupled weights.
    for k, diff in zip(("wfc", "wic", "woc"), (dft, dit, do)):
        if isinstance(p[k], Node):
            p[k]._update_diff(context, sum(diff * get_gpu(s), axis=0))

    # Biases.
    for k, diff in zip(("b", "bf", "bi", "bo"), (d, df, di, do)):
        if isinstance(p[k], Node):
            p[k]._update_diff(context, sum(diff, axis=0))
def _backward_cpu(self, context, dy):
    """Back-propagate ``dy`` through one per-gate recurrent step on the CPU.

    Restores the gradients kept in ``context`` under the keys in
    ``self.attrs._dt_d``, derives the new per-gate gradients, stores them
    back under the same keys and forwards input/parameter gradients to
    every operand that is a ``Node``.

    Args:
        context: Gradient context providing ``restore`` and ``store``.
        dy: Gradient flowing into this node's output.
    """
    weights = self.attrs._p
    state = self.attrs._state
    prev_state = self.attrs._pstate
    cell_logit = self.attrs._u
    out_gate = self.attrs._gated_o
    forget_gate = self.attrs._gated_f
    in_gate = self.attrs._gated_i

    if self.attrs._pgated_f is None:
        carried_forget_gate = np.zeros_like(forget_gate)
    else:
        carried_forget_gate = self.attrs._pgated_f

    # Gradients left in the context; absent entries default to zeros.
    kept_d, kept_di, kept_df, kept_do, kept_dc = [
        context.restore(key, np.zeros_like(dy)) for key in self.attrs._dt_d
    ]

    act_state = tanh(state)
    act_cell = tanh(cell_logit)

    err = (dy
           + np.dot(kept_d, weights["wr"].T)
           + np.dot(kept_di, weights["wir"].T)
           + np.dot(kept_df, weights["wfr"].T)
           + np.dot(kept_do, weights["wor"].T))

    grad_o = gate_diff(out_gate) * act_state * err
    grad_state = out_gate * activation_diff(act_state) * err
    grad_c = (grad_state
              + carried_forget_gate * kept_dc
              + weights["wfc"] * kept_df
              + weights["wic"] * kept_di
              + weights["woc"] * grad_o)

    # Without a previous state the forget-gate branch has no gradient.
    if prev_state is not None:
        grad_f = gate_diff(forget_gate) * prev_state * grad_c
    else:
        grad_f = np.zeros_like(forget_gate)
    grad_i = gate_diff(in_gate) * act_cell * grad_c
    grad_u = in_gate * activation_diff(act_cell) * grad_c

    grad_x = (np.dot(grad_u, weights["w"].T)
              + np.dot(grad_i, weights["wi"].T)
              + np.dot(grad_o, weights["wo"].T)
              + np.dot(grad_f, weights["wf"].T))

    fresh = (grad_u, grad_i, grad_f, grad_o, grad_c)
    for key, value in zip(self.attrs._dt_d, fresh):
        context.store(key, value)

    if isinstance(self.attrs._x, Node):
        self.attrs._x._update_diff(context, grad_x)

    # Input weights use the gradients computed in this call.
    for name, grad in (("w", grad_u), ("wo", grad_o), ("wi", grad_i), ("wf", grad_f)):
        if isinstance(weights[name], Node):
            weights[name]._update_diff(context, np.dot(to_value(self.attrs._x).T, grad))

    # Recurrent weights use the gradients restored from the context.
    for name, grad in (("wr", kept_d), ("wor", kept_do), ("wir", kept_di), ("wfr", kept_df)):
        if isinstance(weights[name], Node):
            weights[name]._update_diff(context, np.dot(to_value(self).T, grad))

    # State-coupled weights.
    for name, grad in (("wfc", kept_df), ("wic", kept_di), ("woc", grad_o)):
        if isinstance(weights[name], Node):
            weights[name]._update_diff(context, np.sum(grad * state, axis=0, keepdims=True))

    # Biases.
    for name, grad in (("b", grad_u), ("bf", grad_f), ("bi", grad_i), ("bo", grad_o)):
        if isinstance(weights[name], Node):
            weights[name]._update_diff(context, np.sum(grad, axis=0, keepdims=True))
def _backward_gpu(self, context, dy):
    """Back-propagate ``dy`` through one step using the fused GPU kernel.

    The gradients kept in ``context`` under the "wr" and "w" parameters
    are restored, ``cu.culstm_backward`` fills the new gradient buffers,
    and the results are stored back and forwarded to every operand that is
    a ``Node``.

    Args:
        context: Gradient context providing ``restore`` and ``store``.
        dy: Gradient flowing into this node's output.
    """
    weights = self.attrs._p
    cell_in = self.attrs._u
    act_state = tanh(self.attrs._state)
    prev_state = self.attrs._pstate

    kept_dr = context.restore(weights["wr"], get_gpu(cell_in).zeros_like_me())
    kept_dou = context.restore(weights["w"], get_gpu(dy).zeros_like_me())
    carried_gate = getattr(self.attrs, "_pfgate", get_gpu(cell_in).zeros_like_me())

    err = get_gpu(dy) + get_gpu(dot(kept_dr, weights["wr"].T))

    # Output buffers for the kernel, shaped like the restored gradients.
    new_dr = get_gpu(kept_dr).empty_like_me()
    new_dou = get_gpu(kept_dou).empty_like_me()

    kernel_args = [
        get_gpu(arg)
        for arg in (cell_in, new_dr, act_state, prev_state, err,
                    carried_gate, kept_dou, new_dou)
    ]
    cu.culstm_backward(*kernel_args)

    grad_x = dot(new_dr, weights["w"].T)

    context.store(weights["wr"], new_dr)
    context.store(weights["w"], new_dou)

    if isinstance(self.attrs._x, Node):
        self.attrs._x._update_diff(context, grad_x)

    if isinstance(weights["w"], Node):
        weights["w"]._update_diff(context, dot(self.attrs._x.T, new_dr))

    # The recurrent weight uses the gradient restored from the context.
    if isinstance(weights["wr"], Node):
        weights["wr"]._update_diff(context, dot(self.T, kept_dr))

    if isinstance(weights["b"], Node):
        weights["b"]._update_diff(context, sum(new_dr, axis=0))
def _oper_cpu(cls, x, pz, w, u, b):
    """Compute one forward step of the three-gate recurrent cell on the CPU.

    Presumably a GRU-style cell: the stacked weights are cut into "z", "r"
    and "h" thirds, and the result blends the previous output with a tanh
    candidate.

    Args:
        x: Input batch for this step.
        pz: Previous output, or None to start from a zero ``Variable``.
        w: Stacked input weights, split into three equal column blocks.
        u: Stacked recurrent weights, split the same way.
        b: Stacked bias, or None for no bias.

    Returns:
        Node created by ``cls._create_node`` around the new output, with
        inputs, weight blocks and pre-activations recorded on ``attrs``.
    """
    m = w.shape[1] // 3
    cuts = [m, m * 2]
    w_z, w_r, w_h = np.split(w, cuts, axis=1)
    u_z, u_r, u_h = np.split(u, cuts, axis=1)

    if pz is None:
        hminus = Variable(np.zeros((x.shape[0], w.shape[1] // 3), dtype=precision))
    else:
        hminus = pz

    # A missing bias contributes a plain zero to every pre-activation.
    if b is not None:
        b_z, b_r, b_h = np.split(b, [m, m * 2], axis=1)
    else:
        b_z, b_r, b_h = 0, 0, 0

    pre_z = dot(x, w_z) + dot(hminus, u_z) + b_z
    pre_r = dot(x, w_r) + dot(hminus, u_r) + b_r
    pre_h = dot(x, w_h) + sigmoid(pre_r) * dot(hminus, u_h) + b_h

    kept = sigmoid(pre_z) * hminus
    fresh = (1 - sigmoid(pre_z)) * tanh(pre_h)
    h = kept + fresh

    node = cls._create_node(h)
    for attr_name, attr_value in (
        ("_x", x),
        ("_w", w),
        ("_w_z", w_z),
        ("_w_r", w_r),
        ("_w_h", w_h),
        ("_u", u),
        ("_u_z", u_z),
        ("_u_h", u_h),
        ("_u_r", u_r),
        ("_pz", hminus),
        ("_A", pre_z),
        ("_B", pre_r),
        ("_C", pre_h),
    ):
        setattr(node.attrs, attr_name, attr_value)

    # The bias attribute exists only when a bias was supplied.
    if b is not None:
        node.attrs._b = b
    return node
def _backward_cpu(self, context, dy, **kwargs):
    """Back-propagate ``dy`` through one step of the ``wc`` variant on the CPU.

    ``wc`` holds state-coupled gate weights in three column blocks of
    width ``m = dy.shape[1]``. Gradients kept in ``context`` under ``w``
    and ``wr`` are restored, the new ones are stored back, and gradients
    are forwarded to every operand that is a ``Node``.

    Args:
        context: Gradient context providing ``restore`` and ``store``.
        dy: Gradient flowing into this node's output, shape ``(n, m)``.
        **kwargs: Accepted and ignored.
    """
    n, m = dy.shape
    w = self.attrs._w
    wr = self.attrs._wr
    wc = self.attrs._wc
    b = self.attrs._b
    cell_in = self.attrs._u
    act_state = tanh(self.attrs._state)
    gated = self.attrs._gated
    gate_grad = gate_diff(gated)
    prev_state = self.attrs._pstate
    carried_gate = self.attrs.get("_pfgate", np.zeros_like(self))

    # Gradients left in the context; absent entries default to zeros.
    kept_dou = context.restore(w, np.zeros((n, m), dtype=dy.dtype))
    kept_dr = context.restore(wr, np.zeros((n, m * 4), dtype=dy.dtype))

    grad_o = dy * act_state * gate_grad[:, 2 * m:]

    grad_state = dy * gated[:, 2 * m:] * activation_diff(act_state) + grad_o * wc[:, 2 * m:]
    grad_state += (carried_gate * kept_dou
                   + kept_dr[:, m:2 * m] * wc[:, :m]
                   + kept_dr[:, 2 * m:3 * m] * wc[:, m:2 * m])

    # Without a previous state the first gate block has no gradient.
    if prev_state is not None:
        grad_f = grad_state * gate_grad[:, :m] * prev_state
    else:
        grad_f = np.zeros_like(grad_state)
    grad_i = grad_state * gate_grad[:, m:2 * m] * cell_in
    grad_u = grad_state * activation_diff(cell_in) * gated[:, m:2 * m]

    stacked = np.hstack((grad_u, grad_f, grad_i, grad_o))

    context.store(wr, stacked)
    context.store(w, grad_state)

    if isinstance(self.attrs._x, Node):
        grad_x = np.dot(stacked, w.T)
        self.attrs._x._update_diff(context, grad_x)

    if isinstance(w, Node):
        w._update_diff(context, np.dot(self.attrs._x.T, stacked))

    # The recurrent weight uses the gradient restored from the context.
    if isinstance(wr, Node):
        wr._update_diff(context, np.dot(self.T, kept_dr))

    if isinstance(wc, Node):
        grad_wc = np.zeros(wc.shape, dtype=wc.dtype)
        grad_wc[:, 2 * m:] = np.sum(grad_o * self.attrs._state, axis=0)
        grad_wc[:, :m] = np.sum(kept_dr[:, m:2 * m] * self.attrs._state, axis=0)
        grad_wc[:, m:2 * m] = np.sum(kept_dr[:, 2 * m:3 * m] * self.attrs._state, axis=0)
        wc._update_diff(context, grad_wc)

    if isinstance(b, Node):
        b._update_diff(context, np.sum(stacked, axis=0))

    if isinstance(self.attrs._pz, Node):
        self.attrs._pz._update_diff(context, np.dot(stacked, wr.T))
def _backward_cpu(self, context, dy, **kwargs):
    """Back-propagate ``dy`` through one three-gate recurrent step on the CPU.

    Uses the pre-activations ``_A``, ``_B``, ``_C`` and the weight blocks
    recorded on ``attrs`` to build gradients for the stacked weights, the
    bias (when one was recorded), the input and the previous output.

    Args:
        context: Gradient context handed to every ``_update_diff`` call.
        dy: Gradient flowing into this node's output.
        **kwargs: Accepted and ignored.
    """
    attrs = self.attrs
    x, hminus = attrs._x, attrs._pz
    w_z, w_r, w_h = attrs._w_z, attrs._w_r, attrs._w_h
    u_z, u_r, u_h = attrs._u_z, attrs._u_r, attrs._u_h
    pre_z, pre_r, pre_h = attrs._A, attrs._B, attrs._C

    # Gradients with respect to the three pre-activations.
    grad_z = dy * (hminus - tanh(pre_h)) * sigmoid_diff(pre_z)
    grad_h = dy * (1 - sigmoid(pre_z)) * tanh_diff(pre_h)
    grad_r = grad_h * dot(hminus, u_h) * sigmoid_diff(pre_r)

    # Input gradient.
    dx = dot(grad_z, w_z.T) + dot(grad_r, w_r.T) + dot(grad_h, w_h.T)

    # Stacked input-weight and bias gradients, in z, r, h block order.
    dw = np.concatenate(
        [dot(x.T, grad_z), dot(x.T, grad_r), dot(x.T, grad_h)], axis=1)
    db = np.concatenate(
        [np.sum(grad_z, axis=0, keepdims=True),
         np.sum(grad_r, axis=0, keepdims=True),
         np.sum(grad_h, axis=0, keepdims=True)],
        axis=1)

    # Stacked recurrent-weight gradient; the h block is scaled by the
    # sigmoid of the r pre-activation.
    du = np.concatenate(
        [dot(hminus.T, grad_z),
         dot(hminus.T, grad_r),
         dot(hminus.T, grad_h * sigmoid(pre_r))],
        axis=1)

    # Gradient for the previous output.
    dpz = (dot(grad_z, u_z.T)
           + dot(grad_r, u_r.T)
           + dot(grad_h * sigmoid(pre_r), u_h.T)
           + dy * sigmoid(pre_z))

    attrs._w._update_diff(context, dw)
    attrs._u._update_diff(context, du)

    if hasattr(attrs, "_b"):
        attrs._b._update_diff(context, db)

    if isinstance(attrs._x, Node):
        attrs._x._update_diff(context, dx)

    if isinstance(attrs._pz, Node):
        attrs._pz._update_diff(context, dpz)
def _oper_cpu(cls, x, pz, ps, w, wr, wc, b):
    """Compute one forward step of the ``wc`` variant on the CPU.

    ``wc`` holds three column blocks of state-coupled weights: the first
    two multiply the previous state, the last multiplies the new state.

    Args:
        x: Input batch for this step.
        pz: Previous output, or None to start from zeros.
        ps: Previous cell state, or None to start from zeros.
        w: Stacked input weights; a quarter of its column count is the
            width of the zero-filled history.
        wr: Stacked recurrent weights.
        wc: State-coupled gate weights.
        b: Stacked bias, or None for no bias.

    Returns:
        Node created by ``cls._create_node`` around the new output, with
        the inputs and intermediates recorded on ``attrs``.
    """
    if ps is None:
        prev_state = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
    else:
        prev_state = ps
    if pz is None:
        prev_out = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
    else:
        prev_out = pz

    stacked = np.dot(x, w) + np.dot(prev_out, wr)
    if b is not None:
        stacked += b

    m = stacked.shape[1] // 4

    # Columns [0, m) go through tanh; the remaining 3 * m are gate logits.
    cell_in, gate_logits = np.split(stacked.as_ndarray(), [m], axis=1)
    cell_in = tanh(cell_in)

    gate_a = sigmoid(prev_state * wc[:, :m] + gate_logits[:, :m])
    gate_b = sigmoid(prev_state * wc[:, m:2 * m] + gate_logits[:, m:2 * m])
    state = gate_b * cell_in + gate_a * prev_state

    # The last gate is coupled to the freshly computed state.
    gate_c = sigmoid(state * wc[:, 2 * m:] + gate_logits[:, 2 * m:])
    out = tanh(state) * gate_c

    gated = np.hstack((gate_a, gate_b, gate_c))

    node = cls._create_node(out)
    for attr_name, attr_value in (
        ("_x", x),
        ("_w", w),
        ("_wr", wr),
        ("_wc", wc),
        ("_b", b),
        ("_u", cell_in),
        ("_pz", pz),
        ("_pstate", ps),
        ("_state", state),
        ("_gated", gated),
    ):
        setattr(node.attrs, attr_name, attr_value)
    node._state = state

    # The previous node receives the first gate block of this step.
    if isinstance(pz, Node):
        pz.attrs._pfgate = gated[:, :m]
    return node
def _backward_cpu(self, context, dy, **kwargs):
    """Back-propagate ``dy`` through one recurrent step on the CPU.

    Gradients kept in ``context`` under ``wr`` and ``w`` are restored, the
    new ones are stored back, and gradients are forwarded to the input,
    the weights, the bias and the previous output when they are ``Node``s.

    Args:
        context: Gradient context providing ``restore`` and ``store``.
        dy: Gradient flowing into this node's output, shape ``(n, m)``.
        **kwargs: Accepted and ignored.
    """
    n, m = dy.shape
    w = self.attrs._w
    wr = self.attrs._wr
    b = self.attrs._b
    cell_in = self.attrs._u
    act_state = tanh(self.attrs._state)
    gated = self.attrs._gated
    gate_grad = gate_diff(gated)
    prev_state = self.attrs._pstate

    # Gradients left in the context; absent entries default to zeros.
    kept_dr = context.restore(wr, np.zeros((n, m * 4), dtype=dy.dtype))
    kept_dou = context.restore(w, np.zeros((n, m), dtype=dy.dtype))
    carried_gate = self.attrs.get("_pfgate", np.zeros_like(self))

    err = dy

    grad_o = err * act_state * gate_grad[:, 2 * m:]
    grad_state = err * gated[:, 2 * m:] * activation_diff(act_state) + carried_gate * kept_dou

    # Without a previous state the first gate block has no gradient.
    if prev_state is not None:
        grad_f = grad_state * gate_grad[:, :m] * prev_state
    else:
        grad_f = np.zeros_like(grad_state)
    grad_i = grad_state * gate_grad[:, m:2 * m] * cell_in
    grad_u = grad_state * activation_diff(cell_in) * gated[:, m:2 * m]

    stacked = np.hstack((grad_u, grad_f, grad_i, grad_o))
    grad_x = np.dot(stacked, w.T)

    context.store(wr, stacked)
    context.store(w, grad_state)

    if isinstance(self.attrs._x, Node):
        self.attrs._x._update_diff(context, grad_x)

    if isinstance(w, Node):
        w._update_diff(context, np.dot(self.attrs._x.T, stacked))

    # The recurrent weight uses the gradient restored from the context.
    if isinstance(wr, Node):
        wr._update_diff(context, np.dot(self.T, kept_dr))

    if isinstance(b, Node):
        b._update_diff(context, np.sum(stacked, axis=0, keepdims=True))

    if isinstance(self.attrs._pz, Node):
        self.attrs._pz._update_diff(context, np.dot(stacked, wr.T))
def _oper_cpu(cls, x, pz, w, u, b):
    """Compute one forward step of the three-gate cell on the CPU.

    In this variant the recurrent weights ``u`` multiply the previous
    output element-wise instead of through a matrix product.

    Args:
        x: Input batch for this step.
        pz: Previous output, or None to start from a zero ``Variable``.
        w: Stacked input weights, split into three equal column blocks.
        u: Stacked recurrent weights, split the same way.
        b: Stacked bias, or None for no bias.

    Returns:
        Node created by ``cls._create_node`` around the new output, with
        inputs, weight blocks, bias blocks and pre-activations recorded on
        ``attrs``. Without a bias, ``_b``, ``_b_z``, ``_b_r`` and ``_b_h``
        are all None.
    """
    # Initialize Variables
    m = w.shape[1] // 3
    w_z, w_r, w_h = np.split(w, [m, m * 2], axis=1)
    u_z, u_r, u_h = np.split(u, [m, m * 2], axis=1)
    hminus = Variable(
        np.zeros((x.shape[0], w.shape[1] // 3), dtype=precision)) if pz is None else pz

    # Perform Forward Calculations
    if b is None:
        # Bind the bias blocks explicitly: they are stored on attrs below,
        # and leaving them unbound raised UnboundLocalError whenever the
        # cell was used without a bias.
        b_z = b_r = b_h = None
        A = dot(x, w_z) + hminus * u_z
        B = dot(x, w_r) + u_r * hminus
        C = dot(x, w_h) + sigmoid(B) * u_h * hminus
    else:
        b_z, b_r, b_h = np.split(b, [m, m * 2], axis=1)
        A = dot(x, w_z) + hminus * u_z + b_z
        B = dot(x, w_r) + u_r * hminus + b_r
        C = dot(x, w_h) + sigmoid(B) * u_h * hminus + b_h

    h = sigmoid(A) + tanh(C)

    # Store Variables for Graph
    ret = cls._create_node(h)
    ret.attrs._x = x
    ret.attrs._w = w
    ret.attrs._w_z = w_z
    ret.attrs._w_r = w_r
    ret.attrs._w_h = w_h
    ret.attrs._b = b
    ret.attrs._b_z = b_z
    ret.attrs._b_r = b_r
    ret.attrs._b_h = b_h
    ret.attrs._u = u
    ret.attrs._u_z = u_z
    ret.attrs._u_h = u_h
    ret.attrs._u_r = u_r
    ret.attrs._pz = hminus
    ret.attrs._A = A
    ret.attrs._B = B
    ret.attrs._C = C
    return ret
def _backward_cpu(self, context, dy):
    """Back-propagate ``dy`` through one parameter-dict step on the CPU.

    Gradients kept in ``context`` under the "wr" and "w" parameters are
    restored, the new ones are stored back, and gradients are forwarded to
    the input, the weights and the bias when they are ``Node``s.

    Args:
        context: Gradient context providing ``restore`` and ``store``.
        dy: Gradient flowing into this node's output, shape ``(n, m)``.
    """
    n, m = dy.shape
    weights = self.attrs._p
    cell_in = self.attrs._u
    act_state = tanh(self.attrs._state)
    gated = self.attrs._gated
    gate_grad = gate_diff(gated)
    prev_state = self.attrs._pstate

    # Gradients left in the context; absent entries default to zeros.
    kept_dr = context.restore(weights["wr"], np.zeros((n, m * 4), dtype=dy.dtype))
    kept_dou = context.restore(weights["w"], np.zeros((n, m), dtype=dy.dtype))
    carried_gate = getattr(self.attrs, "_pfgate", np.zeros_like(self))

    # Add the recurrent contribution of the restored gradient to dy.
    err = dy + np.dot(kept_dr, weights["wr"].T)

    grad_o = err * act_state * gate_grad[:, 2 * m:]
    grad_state = err * gated[:, 2 * m:] * activation_diff(act_state) + carried_gate * kept_dou

    # Without a previous state the first gate block has no gradient.
    if prev_state is not None:
        grad_f = grad_state * gate_grad[:, :m] * prev_state
    else:
        grad_f = np.zeros_like(grad_state)
    grad_i = grad_state * gate_grad[:, m:2 * m] * cell_in
    grad_u = grad_state * activation_diff(cell_in) * gated[:, m:2 * m]

    stacked = np.hstack((grad_u, grad_f, grad_i, grad_o))
    grad_x = np.dot(stacked, weights["w"].T)

    context.store(weights["wr"], stacked)
    context.store(weights["w"], grad_state)

    if isinstance(self.attrs._x, Node):
        self.attrs._x._update_diff(context, grad_x)

    if isinstance(weights["w"], Node):
        weights["w"]._update_diff(context, np.dot(self.attrs._x.T, stacked))

    # The recurrent weight uses the gradient restored from the context.
    if isinstance(weights["wr"], Node):
        weights["wr"]._update_diff(context, np.dot(self.T, kept_dr))

    if isinstance(weights["b"], Node):
        weights["b"]._update_diff(context, np.sum(stacked, axis=0, keepdims=True))
def _backward_gpu(self, context, dy, **kwargs):
    """Back-propagate ``dy`` through one step using the fused GPU kernel.

    Gradients kept in ``context`` under ``wr`` and ``w`` are restored,
    ``cu.culstm_backward`` fills the new gradient buffers, and the results
    are stored back and forwarded to the input, the weights, the bias and
    the previous output when they are ``Node``s.

    Args:
        context: Gradient context providing ``restore`` and ``store``.
        dy: Gradient flowing into this node's output.
        **kwargs: Accepted and ignored.
    """
    w = self.attrs._w
    wr = self.attrs._wr
    b = self.attrs._b
    cell_in = self.attrs._u
    act_state = tanh(self.attrs._state)
    prev_state = self.attrs._pstate

    kept_dr = context.restore(wr, get_gpu(cell_in).zeros_like_me())
    kept_dou = context.restore(w, get_gpu(dy).zeros_like_me())
    carried_gate = self.attrs.get("_pfgate", get_gpu(cell_in).zeros_like_me())

    err = get_gpu(dy)

    # Output buffers for the kernel, shaped like the restored gradients.
    new_dr = get_gpu(kept_dr).empty_like_me()
    new_dou = get_gpu(kept_dou).empty_like_me()

    kernel_args = [
        get_gpu(arg)
        for arg in (cell_in, new_dr, act_state, prev_state, err,
                    carried_gate, kept_dou, new_dou)
    ]
    cu.culstm_backward(*kernel_args)

    grad_x = dot(new_dr, w.T)

    context.store(wr, new_dr)
    context.store(w, new_dou)

    if isinstance(self.attrs._x, Node):
        self.attrs._x._update_diff(context, grad_x)

    if isinstance(w, Node):
        w._update_diff(context, dot(self.attrs._x.T, new_dr))

    # The recurrent weight uses the gradient restored from the context.
    if isinstance(wr, Node):
        wr._update_diff(context, dot(self.T, kept_dr))

    if isinstance(b, Node):
        b._update_diff(context, sum(new_dr, axis=0))

    if isinstance(self.attrs._pz, Node):
        self.attrs._pz._update_diff(context, dot(new_dr, wr.T))
def tanh_diff(x):
    """Return the derivative of tanh evaluated at ``x``, i.e. 1 - tanh(x)^2."""
    activated = tanh(x)
    return 1.0 - activated ** 2
def func(node):
    """Return the sum of ``tanh`` applied to ``node``."""
    activated = tanh(node)
    return sum(activated)