Example #1: vanilla RNN (truncated BPTT)
    def pass_backward(self, grad):
        _, time_steps, _ = grad.shape
        # gradient w.r.t. this layer's inputs; sized like self.inputs rather
        # than grad, since input and output dimensions may differ
        next_grad = np.zeros_like(self.inputs)

        if self.is_trainable:

            dW_input = np.zeros_like(self.W_input)
            dW_recur = np.zeros_like(self.W_recur)
            dW_output = np.zeros_like(self.W_output)

            db_input = np.zeros_like(self.b_input)
            db_output = np.zeros_like(self.b_output)

            for t in np.arange(time_steps)[::-1]:  # reversed
                dW_output += np.dot(grad[:, t].T, self.states[:, t])
                db_output += np.sum(grad[:, t], axis=0)
                dstate = np.dot(grad[:, t], self.W_output) * activate(
                    self.activation).backward(self.state_inputs[:, t])
                next_grad[:, t] = np.dot(dstate, self.W_input)

                for tt in np.arange(max(0, t - self.bptt_truncate),
                                    t + 1)[::-1]:  # reversed
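                    # at tt == 0, tt - 1 wraps to index -1, which is assumed
                    # to hold the initial (zero) state slot in self.states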
                    dW_input += np.dot(dstate.T, self.inputs[:, tt])
                    dW_recur += np.dot(dstate.T, self.states[:, tt - 1])
                    db_input += np.sum(dstate, axis=0)
                    dstate = np.dot(dstate, self.W_recur) * activate(
                        self.activation).backward(self.state_inputs[:, tt - 1])

            # optimize weights and bias
            self.W_input = optimizer(self.optimizer_kwargs).update(
                self.W_input, cg(dW_input))
            self.W_output = optimizer(self.optimizer_kwargs).update(
                self.W_output, cg(dW_output))
            self.W_recur = optimizer(self.optimizer_kwargs).update(
                self.W_recur, cg(dW_recur))

            self.b_input = optimizer(self.optimizer_kwargs).update(
                self.b_input, cg(db_input))
            self.b_output = optimizer(self.optimizer_kwargs).update(
                self.b_output, cg(db_output))

        # endif self.is_trainable

        return next_grad
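For context, here is a minimal sketch of the forward step this backward pass appears to assume. The weight orientations are inferred from the gradient shapes above (e.g. dW_output = grad.T @ states implies W_output has shape (n_out, h_units)); the helper name phi is hypothetical and stands in for the layer's activation forward function.

import numpy as np

def rnn_forward_step(x_t, s_prev, W_input, W_recur, W_output,
                     b_input, b_output, phi):
    # pre-activation, read back in pass_backward as self.state_inputs[:, t]
    a_t = np.dot(x_t, W_input.T) + np.dot(s_prev, W_recur.T) + b_input
    s_t = phi(a_t)                              # hidden state, self.states[:, t]
    o_t = np.dot(s_t, W_output.T) + b_output    # output fed to the next layer
    return a_t, s_t, o_t

With these shapes, np.dot(grad[:, t], self.W_output) in the backward pass maps the output gradient back into hidden-state space, matching the code above.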
Example #2: GRU
    def pass_backward(self, grad):
        _, time_steps, _ = grad.shape

        dW_update = np.zeros_like(self.W_update)
        dW_reset = np.zeros_like(self.W_reset)
        dW_cell = np.zeros_like(self.W_cell)
        dW_final = np.zeros_like(self.W_final)

        db_update = np.zeros_like(self.b_update)
        db_reset = np.zeros_like(self.b_reset)
        db_cell = np.zeros_like(self.b_cell)
        db_final = np.zeros_like(self.b_final)

        dstates = np.zeros_like(self.states)
        dstate_a = np.zeros_like(self.states)
        dstate_b = np.zeros_like(self.states)
        dstate_c = np.zeros_like(self.states)
        dstates_next = np.zeros_like(self.states)
        dstates_prime = np.zeros_like(self.states)

        dz_cell = np.zeros_like(self.cell)
        dcell = np.zeros_like(self.cell)

        dz_reset = np.zeros_like(self.reset)
        dreset = np.zeros_like(self.reset)

        dz_update = np.zeros_like(self.update)
        dupdate = np.zeros_like(self.update)

        next_grad = np.zeros_like(grad)

        for t in np.arange(time_steps)[::-1]:  # reversed

            dW_final += np.dot(self.states[:, t].T, grad[:, t])
            db_final += np.sum(grad[:, t], axis=0)

            dstates[:, t] = np.dot(grad[:, t], self.W_final.T)
            dstates[:, t] += dstates_next[:, t]
            next_grad = np.dot(dstates, self.W_final)

            dcell[:, t] = self.update[:, t] * dstates[:, t]
            dstate_a[:, t] = (1. - self.update[:, t]) * dstates[:, t]
            # dh/du = cell - h_prev, since h_t = (1 - u) * h_prev + u * cell
            dupdate[:, t] = (self.cell[:, t] - self.states[:, t - 1]) * dstates[:, t]

            dcell[:, t] = activate(self.activation)._backward(
                self.cell[:, t]) * dcell[:, t]
            # the candidate at step t is built from z_tilde at step t, not t - 1
            dW_cell += np.dot(self.z_tilde[:, t].T, dcell[:, t])
            db_cell += np.sum(dcell[:, t], axis=0)
            dz_cell = np.dot(dcell[:, t], self.W_cell.T)

            dstates_prime[:, t] = dz_cell[:, :self.h_units]
            dstate_b[:, t] = self.reset[:, t] * dstates_prime[:, t]

            dreset[:, t] = self.states[:, t - 1] * dstates_prime[:, t]
            dreset[:, t] = activate(self.gate_activation)._backward(
                self.reset[:, t]) * dreset[:, t]
            dW_reset += np.dot(self.z[:, t].T, dreset[:, t])
            db_reset += np.sum(dreset[:, t], axis=0)
            dz_reset = np.dot(dreset[:, t], self.W_reset.T)

            dupdate[:, t] = activate(self.gate_activation)._backward(
                self.update[:, t]) * dupdate[:, t]
            dW_update += np.dot(self.z[:, t].T, dupdate[:, t])
            db_update += np.sum(dupdate[:, t], axis=0)
            dz_update = np.dot(dupdate[:, t], self.W_update.T)

            dz = dz_reset + dz_update
            dstate_c[:, t] = dz[:, :self.h_units]

            # gradient w.r.t. h_{t-1}; stored one step back so the next
            # (earlier) iteration picks it up at dstates_next[:, t]
            dstates_next[:, t - 1] = dstate_a[:, t] + dstate_b[:, t] + dstate_c[:, t]

        # optimize weights and bias
        self.W_final = optimizer(self.optimizer_kwargs)._update(
            self.W_final, cg(dW_final))
        self.b_final = optimizer(self.optimizer_kwargs)._update(
            self.b_final, cg(db_final))

        self.W_cell = optimizer(self.optimizer_kwargs)._update(
            self.W_cell, cg(dW_cell))
        self.b_cell = optimizer(self.optimizer_kwargs)._update(
            self.b_cell, cg(db_cell))

        self.W_reset = optimizer(self.optimizer_kwargs)._update(
            self.W_reset, cg(dW_reset))
        self.b_reset = optimizer(self.optimizer_kwargs)._update(
            self.b_reset, cg(db_reset))

        self.W_update = optimizer(self.optimizer_kwargs)._update(
            self.W_update, cg(dW_update))
        self.b_update = optimizer(self.optimizer_kwargs)._update(
            self.b_update, cg(db_update))

        return next_grad
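As with the RNN example, here is a minimal sketch of the GRU forward step these gradients correspond to, with weight orientations inferred from the shapes above (this layer uses z @ W rather than z @ W.T). The names sigmoid and phi are stand-ins for self.gate_activation and self.activation, both assumptions.

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gru_forward_step(x_t, h_prev, W_update, W_reset, W_cell, W_final,
                     b_update, b_reset, b_cell, b_final, phi):
    z_t = np.concatenate([h_prev, x_t], axis=1)             # self.z[:, t]
    u_t = sigmoid(np.dot(z_t, W_update) + b_update)         # update gate, self.update[:, t]
    r_t = sigmoid(np.dot(z_t, W_reset) + b_reset)           # reset gate, self.reset[:, t]
    z_tilde = np.concatenate([r_t * h_prev, x_t], axis=1)   # self.z_tilde[:, t]
    c_t = phi(np.dot(z_tilde, W_cell) + b_cell)             # candidate, self.cell[:, t]
    h_t = (1.0 - u_t) * h_prev + u_t * c_t                  # new state, self.states[:, t]
    y_t = np.dot(h_t, W_final) + b_final                    # layer output
    return z_t, u_t, r_t, z_tilde, c_t, h_t, y_t

The interpolation h_t = (1 - u) * h_prev + u * c_t is exactly why the backward pass sets dcell = update * dstates and dstate_a = (1 - update) * dstates.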
Example #3: LSTM
    def pass_backward(self, grad):
        _, time_steps, _ = grad.shape

        dW_forget = np.zeros_like(self.W_forget)
        dW_input = np.zeros_like(self.W_input)
        dW_output = np.zeros_like(self.W_output)
        dW_cell = np.zeros_like(self.W_cell)
        dW_final = np.zeros_like(self.W_final)

        db_forget = np.zeros_like(self.b_forget)
        db_input = np.zeros_like(self.b_input)
        db_output = np.zeros_like(self.b_output)
        db_cell = np.zeros_like(self.b_cell)
        db_final = np.zeros_like(self.b_final)

        dstates = np.zeros_like(self.states)
        dcell = np.zeros_like(self.cell)
        dcell_tilde = np.zeros_like(self.cell_tilde)
        dforget = np.zeros_like(self.forget)
        dinput = np.zeros_like(self.input)
        doutput = np.zeros_like(self.output)

        dcell_next = np.zeros_like(self.cell)
        dstates_next = np.zeros_like(self.states)

        next_grad = np.zeros_like(grad)

        for t in np.arange(time_steps)[::-1]:  # reversed

            dW_final += np.dot(self.states[:, t].T, grad[:, t])
            db_final += np.sum(grad[:, t], axis=0)

            dstates[:, t] = np.dot(grad[:, t], self.W_final.T)
            dstates[:, t] += dstates_next[:, t]
            next_grad = np.dot(dstates, self.W_final)

            doutput[:, t] = activate(self.activation)._forward(
                self.cell[:, t]) * dstates[:, t]
            doutput[:, t] = activate(self.gate_activation)._backward(
                self.output[:, t]) * doutput[:, t]
            dW_output += np.dot(self.z[:, t].T, doutput[:, t])
            db_output += np.sum(doutput[:, t], axis=0)

            dcell[:, t] += self.output[:, t] * dstates[:, t] * activate(
                self.activation)._backward(self.cell[:, t])
            dcell[:, t] += dcell_next[:, t]
            dcell_tilde[:, t] = dcell[:, t] * self.input[:, t]
            # evaluate the activation derivative at the stored candidate,
            # not at the gradient itself
            dcell_tilde[:, t] = dcell_tilde[:, t] * activate(
                self.activation)._backward(self.cell_tilde[:, t])
            # the candidate branch is what W_cell actually produced from z
            dW_cell += np.dot(self.z[:, t].T, dcell_tilde[:, t])
            db_cell += np.sum(dcell_tilde[:, t], axis=0)

            dinput[:, t] = self.cell_tilde[:, t] * dcell[:, t]
            dinput[:, t] = activate(self.gate_activation)._backward(
                self.input[:, t]) * dinput[:, t]
            dW_input += np.dot(self.z[:, t].T, dinput[:, t])
            db_input += np.sum(dinput[:, t], axis=0)

            dforget[:, t] = self.cell[:, t - 1] * dcell[:, t]
            dforget[:, t] = activate(self.gate_activation)._backward(
                self.forget[:, t]) * dforget[:, t]
            dW_forget += np.dot(self.z[:, t].T, dforget[:, t])
            db_forget += np.sum(dforget[:, t], axis=0)

            dz_forget = np.dot(dforget[:, t], self.W_forget.T)
            dz_input = np.dot(dinput[:, t], self.W_input.T)
            dz_output = np.dot(doutput[:, t], self.W_output.T)
            dz_cell = np.dot(dcell_tilde[:, t], self.W_cell.T)

            dz = dz_forget + dz_input + dz_output + dz_cell
            # gradients w.r.t. h_{t-1} and c_{t-1}; stored one step back so the
            # next (earlier) iteration picks them up at index t
            dstates_next[:, t - 1] = dz[:, :self.h_units]
            dcell_next[:, t - 1] = self.forget[:, t] * dcell[:, t]

        # optimize weights and bias
        self.W_final = optimizer(self.optimizer_kwargs)._update(
            self.W_final, cg(dW_final))
        self.b_final = optimizer(self.optimizer_kwargs)._update(
            self.b_final, cg(db_final))

        self.W_forget = optimizer(self.optimizer_kwargs)._update(
            self.W_forget, cg(dW_forget))
        self.b_forget = optimizer(self.optimizer_kwargs)._update(
            self.b_forget, cg(db_forget))

        self.W_input = optimizer(self.optimizer_kwargs)._update(
            self.W_input, cg(dW_input))
        self.b_input = optimizer(self.optimizer_kwargs)._update(
            self.b_input, cg(db_input))

        self.W_output = optimizer(self.optimizer_kwargs)._update(
            self.W_output, cg(dW_output))
        self.b_output = optimizer(self.optimizer_kwargs)._update(
            self.b_output, cg(db_output))

        self.W_cell = optimizer(self.optimizer_kwargs)._update(
            self.W_cell, cg(dW_cell))
        self.b_cell = optimizer(self.optimizer_kwargs)._update(
            self.b_cell, cg(db_cell))

        return next_grad
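Finally, the LSTM forward step implied by the backward pass above, under the same shape conventions as the GRU example; sigmoid and phi are again hypothetical stand-ins for self.gate_activation and self.activation.

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def lstm_forward_step(x_t, h_prev, c_prev, W_forget, W_input, W_output, W_cell,
                      W_final, b_forget, b_input, b_output, b_cell, b_final, phi):
    z_t = np.concatenate([h_prev, x_t], axis=1)          # self.z[:, t]
    f_t = sigmoid(np.dot(z_t, W_forget) + b_forget)      # forget gate, self.forget[:, t]
    i_t = sigmoid(np.dot(z_t, W_input) + b_input)        # input gate, self.input[:, t]
    o_t = sigmoid(np.dot(z_t, W_output) + b_output)      # output gate, self.output[:, t]
    c_tilde = phi(np.dot(z_t, W_cell) + b_cell)          # candidate, self.cell_tilde[:, t]
    c_t = f_t * c_prev + i_t * c_tilde                   # cell state, self.cell[:, t]
    h_t = o_t * phi(c_t)                                 # hidden state, self.states[:, t]
    y_t = np.dot(h_t, W_final) + b_final                 # layer output
    return z_t, f_t, i_t, o_t, c_tilde, c_t, h_t, y_t

Note h_t = o_t * phi(c_t), which is why the backward pass multiplies dstates by activate(self.activation)._forward(self.cell[:, t]) when computing doutput.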