Пример #1
0
    def construct_updates(self, grads):
        """
        Register minibatch-averaging updates and return the averaged gradients.

        A shared counter named "mb_step" is created and scheduled in
        `self.updates` to grow by one. Each entry of `grads` is paired with
        the first buffer of `self.gs` whose name contains the parameter's name.

        Parameters
        ----------
        grads : dict
            Maps each parameter variable to its gradient expression.

        Returns
        -------
        OrderedDict
            Maps each parameter to `(gradient + buffer) * (1 / nbatches)` when
            `1.0 / self.nbatches` is below one, and to the unchanged gradient
            otherwise.

        Raises
        ------
        ValueError
            If no buffer in `self.gs` matches a parameter's name.
        """
        if not self.updates:
            self.updates = OrderedDict({})

        step_counter = sharedX(0, name="mb_step")
        self.updates[step_counter] = step_counter + 1
        # Symbolic flag: true when the counter is a multiple of nbatches.
        on_boundary = TT.eq((step_counter) % self.nbatches, 0)
        rate = 1.0 / self.nbatches
        averaging = rate < 1.0

        averaged = OrderedDict({})
        for param, grad in grads.iteritems():
            # Locate the accumulation buffer by substring match on the name.
            slot = None
            for pos, candidate in enumerate(self.gs):
                if param.name in candidate.name:
                    slot = pos
                    break
            if slot is None:
                raise ValueError("Gradient for %s was not found." % param.name)

            if not averaging:
                averaged[param] = grad
                continue

            buf = self.gs[slot]
            scaled_sum = (grad + buf) * as_floatX(rate)
            # The buffer receives the averaged value on boundary steps and the
            # scaled raw gradient on every other step.
            self.updates[buf] = (on_boundary * scaled_sum +
                                 (1 - on_boundary) * grad * as_floatX(rate))
            averaged[param] = scaled_sum

        return averaged
Пример #2
0
    def __init_vals(self):
        """
        Allocate two zero-valued shared buffers for every model parameter.

        For each `(name, variable)` entry of `self.params.__dict__['params']`
        a shared variable holding `value * 0.0` is created. `self.gs` receives
        the buffers named "grad_<name>" and `self.gs_mon` the ones named
        "grad_<name>_mon".
        """
        def zero_buffers(name_template):
            # One shared zero array per parameter, named from the template.
            buffers = []
            for pname, pvar in self.params.__dict__['params'].iteritems():
                zeros = as_floatX(pvar.get_value(borrow=True) * 0.0)
                buffers.append(theano.shared(zeros, name=name_template % pname))
            return buffers

        self.gs = zero_buffers("grad_%s")
        self.gs_mon = zero_buffers("grad_%s_mon")
Пример #3
0
    def __init__(self,
                 n_hids=None,
                 mem_size=None,
                 mem_nel=None,
                 address_size=None,
                 mem_gater_activ=None,
                 n_mid_key_size=None,
                 scale_size=None,
                 use_scale_layer=True,
                 smoothed_diff_weights=False,
                 use_local_att=False,
                 mem_weight_decay=0.96,
                 read_head=False,
                 use_loc_based_addressing=True,
                 shift_width=3,
                 scale_bias_coef=1.0,
                 use_adv_indexing=False,
                 use_multiscale_shifts=True,
                 use_geom_sig_dot=False,
                 use_reinforce=False,
                 weight_initializer=None,
                 bias_initializer=None,
                 name="nmt_addresser"):
        """
        Store the addresser configuration and build its parameters.

        Unless noted below, every argument is stored unchanged on the instance
        under the same name for use by the other methods of the class.

        Parameters
        ----------
        mem_nel : int
            Also used as the length of the `time_idxs` constant when
            `use_local_att` is set.
        mem_gater_activ : callable, optional
            Stored as `self.mem_gater_activ`; `Sigmoid` is used when the
            argument is falsy.
        use_local_att : bool
            When true, `self.time_idxs` is created as a constant holding
            `arange(mem_nel)`; otherwise `self.time_idxs` is None.
        use_adv_indexing : bool
            Stored, and reported on standard output.
        use_geom_sig_dot : bool
            Not stored; selects `GeomEuclideanSigmoidDot` instead of
            `MemorySimilarity` for `self.mem_similarity`.
        """
        super(Addresser, self).__init__()

        self.n_hids = n_hids
        self.n_mid_key_size = n_mid_key_size
        self.mem_size = mem_size
        self.mem_nel = mem_nel
        self.use_reinforce = use_reinforce
        self.read_head = read_head
        self.scale_size = scale_size
        self.scale_bias_coef = scale_bias_coef
        self.address_size = address_size
        self.use_scale_layer = use_scale_layer
        self.use_adv_indexing = use_adv_indexing
        self.weight_initializer = weight_initializer
        self.bias_initializer = bias_initializer
        self.name = name
        self.use_loc_based_addressing = use_loc_based_addressing
        self.use_multiscale_shifts = use_multiscale_shifts
        self.shift_width = shift_width
        self.smoothed_diff_weights = smoothed_diff_weights
        self.mem_weight_decay = mem_weight_decay
        self.use_local_att = use_local_att

        # Always define the attribute so that reading `self.time_idxs` never
        # raises AttributeError when local attention is disabled.
        self.time_idxs = None
        if self.use_local_att:
            self.time_idxs = const(as_floatX(np.arange(self.mem_nel)))
            self.time_idxs.name = "time_idxs"

        # Parenthesised single-argument form prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        if self.use_adv_indexing:
            print("Using the advanced indexing.")
        else:
            print("Not using the advanced indexing.")

        if mem_gater_activ:
            self.mem_gater_activ = mem_gater_activ
        else:
            self.mem_gater_activ = Sigmoid

        if use_geom_sig_dot:
            self.mem_similarity = GeomEuclideanSigmoidDot()
        else:
            self.mem_similarity = MemorySimilarity()

        self.init_params()
Пример #4
0
    def fprop(self,
              state_below,
              memory,
              w_t_before,
              w_t_pre_before=None,
              time_idxs=None):
        """
        Build the symbolic addressing weights.

        The fork layer is applied to `state_below` and its outputs are read
        from `self.names` in order: beta (only without local attention), key,
        difference gate (only with `smoothed_diff_weights`), gate, shifts and
        sharpening (only without `use_reinforce`).

        Parameters
        ----------
        state_below : symbolic tensor
            Input of the fork layer and, when enabled, of the scale and
            local-attention layers. Its `ndim` selects the reshaping branches.
        memory : symbolic tensor
            Second argument passed to `self.mem_similarity`.
        w_t_before : symbolic tensor
            Mixed with the content weights through the gate when
            `use_loc_based_addressing` is set.
        w_t_pre_before : symbolic tensor, optional
            Only read when `smoothed_diff_weights` is set.
        time_idxs : symbolic tensor, optional
            Only read when `use_local_att` is set; `self.time_idxs` is used
            when it is None.

        Returns
        -------
        tuple of two lists
            `[w_t]` and `[new_pre_weights]`. `new_pre_weights` is None unless
            `smoothed_diff_weights` is set.

        Raises
        ------
        ValueError
            When the beta or gate reshaping branches do not match the number
            of dimensions of `state_below`, or when `use_reinforce` is set and
            the convolved weights are neither 2-D nor 3-D.
        """
        # The indices are consumed only by the local-attention branch, so the
        # attribute is not touched when that branch is disabled.
        if time_idxs is None and self.use_local_att:
            logger.info("Time indices are empty!")
            time_idxs = self.time_idxs

        fork_outs = self.state_fork_layer.fprop(state_below)
        idx = 0
        # First things first, content based addressing:
        if not self.use_local_att:
            beta_pre = fork_outs[self.names[0]]
            beta = TT.nnet.softplus(beta_pre).reshape((beta_pre.shape[0],))

            # NOTE(review): beta is 1-D after the reshape above, so only the
            # second branch can match (2-D state_below) -- confirm whether 3-D
            # inputs are meant to be supported here.
            if (state_below.ndim != beta.ndim and beta.ndim == 2
                    and state_below.ndim == 3):
                beta = beta.reshape((state_below.shape[0], state_below.shape[1]))
            elif (state_below.ndim != beta.ndim and beta.ndim == 1
                    and state_below.ndim == 2):
                beta = beta.reshape((state_below.shape[0],))
            else:
                raise ValueError("Unknown shape for beta!")
            beta = TT.shape_padright(beta)
            idx = 1

        key_pre = fork_outs[self.names[idx]]
        idx += 1
        key_t = key_pre
        sim_vals = self.mem_similarity(key_t, memory)

        weights = sim_vals
        new_pre_weights = None

        if self.smoothed_diff_weights:
            dw_scaler = fork_outs[self.names[idx]]
            dw_scaler = TT.addbroadcast(dw_scaler, 1)
            weights = sim_vals - Sigmoid(dw_scaler) * w_t_pre_before
            new_pre_weights = self.mem_weight_decay * sim_vals + (1 - \
                    self.mem_weight_decay) * w_t_pre_before
            idx += 1
        std = 5

        if self.use_local_att:
            w_tc = softmax3(weights) if weights.ndim == 3 else TT.nnet.softmax(weights)
        else:
            if weights.ndim == 3 and beta.ndim == 2:
                beta = beta.dimshuffle('x', 0, 1)
                w_tc = softmax3(weights * beta)
            else:
                # Content based weights:
                w_tc = TT.nnet.softmax(weights * beta)

        if self.use_local_att:
            # Re-weight the content weights by a bump centred on a predicted
            # position, then renormalise along axis 1.
            first_loc_layer = Tanh(self.state_below_local.fprop(state_below) +\
                    self.weights_below_local.fprop(weights))
            mean = as_floatX(self.mem_nel) * Sigmoid(self.mean_pred.fprop(first_loc_layer))
            mean = TT.addbroadcast(mean, 1)
            exp_ws = TT.exp(-((time_idxs - mean)**2) / (2.0 * std))
            w_tc = exp_ws * w_tc
            w_tc = w_tc / w_tc.sum(axis=1, keepdims=True)

        if self.use_loc_based_addressing:
            # Location based addressing:
            g_t_pre = fork_outs[self.names[idx]]
            g_t = Sigmoid(g_t_pre).reshape((g_t_pre.shape[0],))

            if (state_below.ndim != g_t.ndim and g_t.ndim == 2
                    and state_below.ndim == 3):
                g_t = g_t.reshape((state_below.shape[0], state_below.shape[1]))
            elif (state_below.ndim != g_t.ndim and g_t.ndim == 1
                    and state_below.ndim == 2):
                g_t = g_t.reshape((state_below.shape[0],))
            else:
                raise ValueError("Unknown shape for g_t!")

            g_t = TT.shape_padright(g_t)
            w_tg = g_t * w_tc + (1 - g_t) * w_t_before
            shifts_pre = fork_outs[self.names[idx + 1]]

            if shifts_pre.ndim == 2:
                if self.use_multiscale_shifts:

                    if self.use_scale_layer:
                        scales = TT.exp(self.scale_layer.fprop(state_below))
                        scales = scales.dimshuffle(0, 'x', 1)
                    else:
                        scales = TT.exp(TT.arange(self.scale_size).dimshuffle('x', 'x', 0))

                    shifts_pre = shifts_pre.reshape((state_below.shape[0],
                                                     -1,
                                                     self.scale_size))

                    shifts_pre = (shifts_pre * scales).sum(-1)

                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))

                elif self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
                else:
                    shifts_pre = shifts_pre.reshape(
                        (state_below.shape[0], self.mem_nel))

                # Subtract the maximum before exponentiating.
                if state_below.ndim == 3:
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
                    shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True).dimshuffle(0, 'x', 'x')
                else:
                    shifts_pre = shifts_pre.dimshuffle(0, 1)
                    shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True)
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
            elif shifts_pre.ndim == 1:
                if self.use_multiscale_shifts:
                    if self.use_scale_layer:
                        scales = TT.exp(self.scale_layer.fprop(state_below))
                    else:
                        scales = TT.exp(TT.arange(self.scale_size))

                    shifts_pre = shifts_pre.reshape((-1, self.scale_size))
                    shifts_pre = (shifts_pre * scales).sum(-1)
                    # NOTE(review): the two reshapes below run back to back and
                    # the second one decides the final shape; kept as found.
                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, 1))
                elif self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, 1))
                else:
                    shifts_pre = shifts_pre.reshape((self.mem_nel,))

                if state_below.ndim == 2:
                    shifts_pre = TT.shape_padright(shifts_pre)
                    shifts_pre = shifts_pre - shifts_pre.max(0, keepdims=True)

            # Normalised exponentials of the shift scores.
            shifts = TT.exp(shifts_pre)
            if shifts.ndim == 2:
                shifts = shifts / shifts.sum(axis=0, keepdims=True)
            elif shifts.ndim == 3:
                shifts = shifts / shifts.sum(axis=1, keepdims=True)

            CC = CircularConvolveAdvIndexing if self.use_adv_indexing else\
                    CircularConvolve

            w_t_hat = CC()(weights=w_tg, shifts=shifts,
                           mem_size=self.mem_nel,
                           shift_width=self.shift_width)

            if self.use_reinforce:
                if w_t_hat.ndim == 2:
                    w_t = TT.nnet.softmax(w_t_hat)
                elif w_t_hat.ndim == 3:
                    w_t = softmax3(w_t_hat)
                else:
                    # Previously fell through and left w_t unbound.
                    raise ValueError("Unknown shape for w_t_hat!")
            else:
                # The sharpening output follows the gate (idx) and the shifts
                # (idx + 1) in self.names. A fixed index of 4 is only right
                # when beta is present and no difference gate was added.
                gamma_pre = fork_outs[self.names[idx + 2]]
                assert w_t_hat.ndim == gamma_pre.ndim, ("The number of dimensions for "
                                                        " w_t_hat and gamma_pre should "
                                                        " be the same")

                if gamma_pre.ndim != 1:
                    gamma_pre = gamma_pre.reshape((gamma_pre.shape[0],))

                gamma_pre = TT.shape_padright(gamma_pre)
                gamma = TT.nnet.softplus(gamma_pre) + const(1)

                # Sharpen: raise the weights to a power of at least one.
                w_t = (abs(w_t_hat + const(1e-16))**gamma) + const(1e-42)
                # NOTE(review): the first test compares against
                # shifts_pre.ndim while the second uses w_t.ndim -- confirm
                # which was intended; kept as found.
                if (state_below.ndim != shifts_pre.ndim and w_t.ndim == 2
                        and state_below.ndim == 3):
                    w_t = w_t.reshape((state_below.shape[0], state_below.shape[1]))
                    w_t = w_t.dimshuffle(0, 1, 'x')
                elif (state_below.ndim != w_t.ndim and w_t.ndim == 1
                        and state_below.ndim == 2):
                    w_t = w_t.reshape((state_below.shape[0],))
                    w_t = w_t.dimshuffle(0, 'x')

                if w_t.ndim in (2, 3):
                    w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
        else:
            w_t = w_tc

        return [w_t], [new_pre_weights]
Пример #5
0
    def init_params(self):
        """
        Create the fork layer and the optional auxiliary layers.

        Builds three parallel lists -- output names, output sizes
        (`self.n_outs`) and bias initial values -- describing the outputs of
        `self.state_fork_layer`, in this order: beta (only without local
        attention), key, difference gate (only with `smoothed_diff_weights`),
        gate and shifts (only with `use_loc_based_addressing`) and sharpening
        (only when additionally `use_reinforce` is off). The prefixed names
        are stored as the list `self.names`.

        Also creates `self.scale_layer` (with `use_loc_based_addressing` and
        `use_scale_layer`) and the three local-attention layers (with
        `use_local_att`), registers every created layer in `self.children`
        and finally calls `self.merge_params()`.
        """
        if not self.use_local_att:
            names = ["fork_state_beta_t",
                     "fork_state_key_t"]
            self.n_outs = [1, self.mem_size + self.address_size]

        else:
            names = ["fork_state_key_t"]
            self.n_outs = [self.mem_size + self.address_size]

        # One bias slot per output declared so far. A fixed two-element list
        # would be one entry too long with local attention and would shift
        # every later bias value onto the wrong output.
        binit_vals = [None] * len(names)

        self.shift_size = self.mem_nel

        if self.use_multiscale_shifts:
            logger.info("Using the multiscale shifts.")
            if self.scale_size is None or self.scale_size < -1:
                self.scale_size = int(np.floor(np.log(self.mem_nel)))
                logger.info("Size of the scales is %d" % self.scale_size)

        # NOTE(review): this overwrites the mem_nel default unconditionally,
        # including when multiscale shifts are disabled -- confirm intent.
        self.shift_size = self.shift_width * self.scale_size

        if self.smoothed_diff_weights:
            names.append("fork_state_diff_gate")
            self.n_outs += [1]
            binit_vals += [-0.16]

        if self.use_loc_based_addressing:
            names +=  [ "fork_state_gater_t",
                        "fork_state_shift_hat_t" ]

            self.n_outs += [1, self.shift_size]
            binit_vals += [None, None]

            if not self.use_reinforce:
                names += [ "fork_state_sharpen_hat_t" ]
                self.n_outs += [1]
                binit_vals += [0.001]

            if self.use_scale_layer:
                self.scale_layer = AffineLayer(n_in=self.n_hids,
                                               n_out=self.scale_size,
                                               weight_initializer=self.weight_initializer,
                                               bias_initializer=self.bias_initializer,
                                               use_bias=True,
                                               name=self.pname("scale_layer"))

                pname = self.scale_layer.params.getparamname("bias")
                arng = as_floatX(np.arange(self.scale_size))
                # arange(1) sums to zero; dividing by it would turn the bias
                # into NaN, so a single scale keeps its zero bias instead.
                if self.scale_size > 1:
                    arng = arng / arng.sum()
                self.scale_layer.params[pname] = self.scale_bias_coef * arng
                self.children.extend([self.scale_layer])

        if self.use_local_att:
            bott_size = self.n_hids
            logger.info("Using the local attention.")
            self.state_below_local = AffineLayer(n_in=self.n_hids,
                                                 n_out=bott_size,
                                                 weight_initializer=self.weight_initializer,
                                                 bias_initializer=self.bias_initializer,
                                                 use_bias=True,
                                                 name=self.pname("state_below_loc_layer"))

            self.weights_below_local = AffineLayer(n_in=self.mem_nel,
                                                   n_out=bott_size,
                                                   weight_initializer=self.weight_initializer,
                                                   bias_initializer=self.bias_initializer,
                                                   use_bias=False,
                                                   name=self.pname("weights_loc_layer"))

            self.mean_pred = AffineLayer(n_in=bott_size,
                                         n_out=1,
                                         weight_initializer=self.weight_initializer,
                                         bias_initializer=self.bias_initializer,
                                         use_bias=True,
                                         name=self.pname("mean_pred"))

            self.children.extend([self.state_below_local, self.weights_below_local, self.mean_pred])

        # A real list (not a lazy map object) so the names stay indexable.
        names = [self.pname(raw_name) for raw_name in names]
        self.names = names

        self.state_fork_layer = ForkLayer(n_in=self.n_hids,
                                          n_outs=self.n_outs,
                                          weight_initializer=self.weight_initializer,
                                          bias_initializer=self.bias_initializer,
                                          init_bias_vals = binit_vals,
                                          names=names)

        self.children.extend([self.state_fork_layer])
        self.powerup_layer = None
        self.merge_params()
Пример #6
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Build the symbolic parameter updates of this adaptive learning rule.

        Parameters
        ----------
        learning_rate : float
            Not used by this method; accepted because pylearn2 requires a
            learning rate to be defined.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients. The mapping itself is left untouched; the gradient
            variables are renamed to "grad_<param name>".
        lr_scalers : dict
            Not used by this method.

        Returns
        -------
        updates : OrderedDict
            Maps every accumulator (and every parameter when
            `self.perform_update` is set) to its new symbolic value.
        tot_norm_up : symbolic scalar
            Sum over parameters of the L2 norm of the update step.
        tot_param_norm : symbolic scalar
            Sum over parameters of the L2 norm of the parameter.
        """

        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            # Replace every inf/nan gradient entry by zero. The mapping is
            # built from pairs so the parameter order of `grads` is kept.
            grads = OrderedDict(
                (p, T.switch(T.or_(T.isinf(g), T.isnan(g)), 0, g))
                for p, g in grads.items())
        else:
            # Private copy: clipping below must not rewrite the caller's dict.
            grads = OrderedDict(grads)

        #Block-normalize gradients:
        nparams = len(grads.keys())

        #Apply the gradient clipping, this is only sometimes
        #necessary for RNNs and sometimes for very deep networks
        if self.grad_clip:
            assert self.grad_clip > 0.
            assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."

            gnorm = sum([g.norm(2) for g in grads.values()])
            notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

            for p, g in list(grads.items()):
                tmpg = T.switch(gnorm / nparams > self.grad_clip,
                                g * self.grad_clip * nparams / gnorm, g)
                grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

        tot_norm_up = 0
        tot_param_norm = 0

        fix_decay = self.slow_decay**(step + 1)
        for param in grads.keys():
            grads[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps,
                                name="mean_grad_%s" % param.name)
            gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

            prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                                name="prod_taus_x_t_" + param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                          name="sum_square_grad_%s" %
                                          param.name)

            # Initialization of accumulators
            taus_x_t = sharedX(
                (np.ones_like(param.get_value()) + eps) * slow_constant,
                name="taus_x_t_" + param.name)
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.,
                                     name="msd_" + param.name)

            #The uncorrected gradient of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)

            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)
            else:
                # old_grad is read below in every configuration; without this
                # branch the name was undefined and raised NameError.
                # NOTE(review): falling back to the previous plain gradient is
                # an assumption -- confirm the intended behaviour.
                old_grad = old_plain_grad

            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = grads[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            grad_sqnorm = T.sqr(norm_grad).sum()

            first_step = T.eq(step, 0)
            gnorm_sqr_o = first_step * grad_sqnorm + (1 - first_step) * gnorm_sqr
            gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

            norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
            msdx = first_step * norm_grad**2 + (1 - first_step) * mean_square_dx
            mdx = first_step * norm_grad + (1 - first_step) * mean_dx

            new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))

            # Compute the new updated values.
            # E[g_i^2]_t
            new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                     T.sqr(norm_grad) / (taus_x_t))
            new_mean_squared_grad.name = "msg_" + param.name

            # E[g_i]_t
            new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                             norm_grad / taus_x_t)

            new_mean_grad.name = "nmg_" + param.name
            mg = new_mean_grad / (1 - new_prod_taus)
            mgsq = new_mean_squared_grad / (1 - new_prod_taus)

            new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                             T.sqr(norm_grad).sum() * (1 - self.slow_decay))

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (mg - norm_grad) * (old_grad - mg)) / taus_x_t)

            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                    self.gamma_reg)

            gamma.name = "gamma_" + param.name

            if self.gamma_clip and self.gamma_clip > -1:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                var_reduction_on = T.le(self.start_var_reduction, step)
                corrected_grad = var_reduction_on * corrected_grad_cand + (
                    1 - var_reduction_on) * norm_grad
            else:
                corrected_grad = norm_grad

            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (sum_square_grad + T.sqr(g))
                rms_g_t = T.sqrt(new_sum_squared_grad)
                rms_g_t = T.maximum(rms_g_t, 1.0)

            #Use the gradients from the previous update
            #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                                 (cur_curvature / taus_x_t))
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Unbiased average curvature
            nc_ave = new_curvature_ave / (1 - new_prod_taus)

            new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                     (cur_curvature_sqr / taus_x_t))
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

            epsilon = 1e-7
            # The step scale is fixed at one; learning_rate and lr_scalers
            # are not consulted.
            scaled_lr = sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t /
                                      (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                logger.info(
                    "Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                logger.info(
                    "Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(
                1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                                  (T.sqr(delta_x_t) / taus_x_t))

            #To compute the E[\Delta]_t
            new_mean_dx = (mdx * (1 - 1 / taus_x_t) + (delta_x_t / (taus_x_t)))

            #Perform the outlier detection:
            #This outlier detection is slightly different:
            new_taus_t = T.switch(
                T.or_(
                    abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                    abs(cur_curvature - nc_ave) >
                    (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                T.switch(new_taus_t > 2.5, sharedX(2.5),
                         new_taus_t + sharedX(1.0) + eps), new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                             (delta_x_t * cur_curvature) * (1 / taus_x_t))

            update_step = delta_x_t

            tot_norm_up += update_step.norm(2)
            tot_param_norm += param.norm(2)

            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gnorm_sqr] = new_gnorm_sqr
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave

            if self.perform_update:
                updates[param] = param + update_step

            updates[step] = step + 1
            updates[prod_taus] = new_prod_taus

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        return updates, tot_norm_up, tot_param_norm
Пример #7
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Build the symbolic updates of a momentum rule with RMS normalisation.

        For every parameter a running average of the squared gradient and a
        velocity are kept. The velocity is divided by the root of that average
        (floored at `self.stabilizer`) with bias-correction factors applied.
        Once the step counter has reached 6000, the learning rate is scaled
        down whenever the ratio of total update norm to total parameter norm
        reaches `self.update_param_norm_ratio`.

        Parameters
        ----------
        learning_rate : float or symbolic scalar
            Base step size.
        grads : dict
            Maps each parameter to its gradient expression. The mapping is not
            modified; clipping is applied to a private copy.
        lr_scalers : dict, optional
            Not used by this method.

        Returns
        -------
        updates : OrderedDict
            New symbolic values for the accumulators, the parameters and the
            step counter.
        tot_norm_up : symbolic scalar
            Sum over parameters of `learning_rate` times the L2 norm of the
            normalised velocity.
        tot_param_norm : symbolic scalar
            Sum over parameters of the L2 norm of the parameter.
        """
        updates = OrderedDict()
        velocity = OrderedDict()
        normalized_velocities = OrderedDict()

        counter = sharedX(0, 'counter')
        tot_norm_up = 0
        tot_param_norm = 0

        # Private copy: the rescaling below must not rewrite the caller's dict.
        grads = OrderedDict(grads)

        if self.gradient_clipping is not None:
            # Rescale all gradients when their joint L2 norm exceeds the
            # threshold; leave them unchanged otherwise.
            grads_norm = T.sqrt(sum(T.sqr(g).sum() for g in grads.values()))
            scaling_den = T.maximum(self.gradient_clipping, grads_norm)
            scaling_num = self.gradient_clipping

            for param in list(grads.keys()):
                grads[param] = scaling_num * grads[param] / scaling_den

        # Bias-correction factors are the same for every parameter, so they
        # are built once instead of once per loop iteration.
        next_counter = counter + 1.
        fix_first_moment = 1. - self.momentum**next_counter
        fix_second_moment = 1. - self.averaging_coeff**next_counter

        for param, grad in grads.items():

            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff*avg_grad_sqr \
                + (1 - self.averaging_coeff)*T.sqr(grad)

            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            new_velocity = self.momentum * velocity[param] \
                - (1 - self.momentum) * grad
            normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
                / (rms_grad_t * fix_first_moment)

            tot_param_norm += param.norm(2)
            tot_norm_up += learning_rate * normalized_velocity.norm(2)

            normalized_velocities[param] = normalized_velocity
            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity

        update_param_norm_ratio = tot_norm_up / (tot_param_norm + 1e-7)

        # Shrink the step so the update/parameter norm ratio does not exceed
        # the configured bound.
        new_lr = ifelse.ifelse(
            T.ge(update_param_norm_ratio, self.update_param_norm_ratio),
            as_floatX(learning_rate * self.update_param_norm_ratio) /
            update_param_norm_ratio, as_floatX(learning_rate))

        # The bound is only enforced once the counter has reached 6000.
        new_lr = ifelse.ifelse(T.ge(counter, 6000), new_lr,
                               as_floatX(learning_rate))

        for param in grads.keys():
            updates[param] = param + new_lr * normalized_velocities[param]

        updates[counter] = counter + 1
        return updates, tot_norm_up, tot_param_norm
Пример #8
0
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Compile the gradient-accumulation and the parameter-update functions.

        Two Theano functions are built.  The first one evaluates the cost and
        stores every gradient in a per-parameter shared buffer; the second one
        reads those buffers, refreshes all running statistics kept by this
        rule and, when ``self.perform_update`` is set, moves the parameters.

        Parameters
        ----------
        learning_rate : Theano variable
            Sole input of the update function.  It does not appear anywhere in
            the update graph (the step scale is a shared constant ``1.0``),
            which is why the function is compiled with
            ``on_unused_input='ignore'``.
        grads : dict
            A dictionary mapping from the model's parameters (shared
            variables) to their gradient expressions.
        inp : list
            Inputs of the compiled gradient function.
        cost : Theano variable
            First output of the compiled gradient function.
        errors : Theano variable
            Second output of the compiled gradient function.
        lr_scalers : dict, optional
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.  Currently not used by this method.

        Returns
        -------
        f_grad_shared : compiled function
            ``f(*inp) -> [cost, errors, gnorm, pnorm]``; as a side effect it
            writes the (optionally sanitised and clipped) gradients into the
            shared gradient buffers.
        f_update : compiled function
            ``f(learning_rate) -> [tot_norm_up]``; applies every update that
            was collected and returns the summed L2 norm of the steps.
        """

        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            # Zero out the inf/nan entries of the gradients so that they do
            # not contribute to the update.  That might be useful for RNNs.
            grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
                T.isnan(grads[p])), 0, grads[p]) for
                p in grads.keys()})

        # Number of parameter blocks, used to average the gradient norm.
        nparams = len(grads)

        # Apply the gradient clipping, this is only sometimes
        # necessary for RNNs and sometimes for very deep networks
        if self.grad_clip:
            assert self.grad_clip > 0.
            assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."

            gnorm = sum([g.norm(2) for g in grads.values()])
            notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

            # Only the values of existing keys are replaced, so mutating the
            # dictionary while iterating over it is safe here.
            for p, g in grads.iteritems():
                tmpg = T.switch(gnorm / nparams > self.grad_clip,
                                 g * self.grad_clip * nparams / gnorm , g)
                # With a non-finite total norm, fall back to 0.1 * parameter.
                grads[p] = T.switch(notfinite, as_floatX(0.1)*p, tmpg)

        tot_norm_up = 0
        # One zero-initialised shared buffer per parameter for its gradient.
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        # Global L2 norm over a list of tensors.
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        # Bias-correction term of the slowly decaying gradient-norm average.
        fix_decay = self.slow_decay**(step + 1)

        for param in gshared.keys():
            gshared[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps, name="mean_grad_%s" % param.name)
            gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

            prod_taus = sharedX((np.ones_like(param.get_value()) - 2*eps),
                                 name="prod_taus_x_t_" + param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                          name="sum_square_grad_%s" % param.name)

            # ---- Initialization of accumulators ----
            taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                               name="taus_x_t_" + param.name)
            # Overwritten on every iteration: after the loop this attribute
            # refers to the accumulator of the last parameter only.
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

            #The uncorrected gradient of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)

            # Previous gradient used by the gamma statistics below.  It used
            # to be created only when use_corrected_grad was set although it
            # is read unconditionally, which raised a NameError otherwise.
            # Without the corrected gradient we fall back to the previous
            # plain gradient.
            # NOTE(review): fallback choice is inferred -- confirm intent.
            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)
            else:
                old_grad = old_plain_grad

            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = gshared[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            gnorm = T.sqr(norm_grad).sum()

            # cond is 1 on the very first step and 0 afterwards.
            cond = T.eq(step, 0)
            gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
            gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

            norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
            msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
            mdx = cond * norm_grad + (1 - cond) * mean_dx

            # Running product of (1 - 1/tau); used to de-bias the averages.
            new_prod_taus = (
                prod_taus * (1 - 1 / taus_x_t)
            )

            # ---- Compute the new updated values ----
            # E[g_i^2]_t
            new_mean_squared_grad = (
                mean_square_grad * (1 - 1 / taus_x_t)  +
                T.sqr(norm_grad) / (taus_x_t)
            )
            new_mean_squared_grad.name = "msg_" + param.name

            # E[g_i]_t
            new_mean_grad = (
                mean_grad * (1 - 1 / taus_x_t) +
                norm_grad / taus_x_t
            )

            new_mean_grad.name = "nmg_" + param.name
            mg = new_mean_grad / (1 - new_prod_taus)
            mgsq = new_mean_squared_grad / (1 - new_prod_taus)

            new_gnorm_sqr = (
                    gnorm_sqr_o * self.slow_decay +
                    T.sqr(norm_grad).sum() * (1 - self.slow_decay)
            )

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (
                gamma_nume_sqr * (1 - 1 / taus_x_t) +
                T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
            )
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (
                gamma_deno_sqr * (1 - 1 / taus_x_t) +
                T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
            )

            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            # gamma is built from the statistics of the previous step (the
            # stored accumulators), not from the freshly computed ones.
            gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                    self.gamma_reg)

            gamma.name = "gamma_" + param.name

            if self.gamma_clip and self.gamma_clip > -1:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                # cond is rebound here; the first-step flag is not needed
                # any more below this point.
                cond = T.le(self.start_var_reduction, step)
                corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
            else:
                corrected_grad = norm_grad

            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (
                    sum_square_grad + T.sqr(g)
                )
                rms_g_t = T.sqrt(new_sum_squared_grad)
                rms_g_t = T.maximum(rms_g_t, 1.0)

            #Use the gradients from the previous update
            #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (
                mean_curvature * (1 - 1 / taus_x_t) +
                (cur_curvature / taus_x_t)
            )
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Unbiased average curvature
            nc_ave = new_curvature_ave / (1 - new_prod_taus)

            new_curvature_sqr_ave = (
                mean_curvature_sqr * (1 - 1 / taus_x_t) +
                (cur_curvature_sqr / taus_x_t)
            )
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

            epsilon = 1e-7
            # The step scale is fixed to one; neither learning_rate nor
            # lr_scalers enters the update.
            scaled_lr = sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                logger.info("Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                logger.info("Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (
                 msdx * (1 - 1 / taus_x_t) +
                 (T.sqr(delta_x_t) / taus_x_t)
             )

            #To compute the E[\Delta]_t
            new_mean_dx = (
                mdx * (1 - 1 / taus_x_t) +
                (delta_x_t / (taus_x_t))
            )

            #Perform the outlier detection:
            # when either the gradient or the curvature deviates from its
            # running mean by more than two standard deviations, tau is reset
            # to 2.5 if it exceeds 2.5 and is incremented otherwise.
            new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq  - mg**2)),
                                        abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                                        T.switch(new_taus_t > 2.5, sharedX(2.5), new_taus_t + sharedX(1.0) + eps), new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (
                cov_num_t * (1 - 1 / taus_x_t) +
                (delta_x_t * cur_curvature) * (1 / taus_x_t)
            )

            update_step = delta_x_t

            tot_norm_up += update_step.norm(2)
            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gnorm_sqr] = new_gnorm_sqr
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave

            if self.perform_update:
                updates[param] = param + update_step

            # Re-assigned for every parameter; the entry is identical each
            # time, so the counter still advances by one per call.
            updates[step] = step + 1
            updates[prod_taus] = new_prod_taus

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
    def fprop(self,
              inps=None,
              use_mask=True,
              use_cmask=True,
              use_noise=False,
              mdl_name=None):
        """
        Build the forward graph of the two-level (low/upper) GRU model.

        Parameters
        ----------
        inps : list, optional
            Model inputs; falls back to ``self.inps`` when empty.  ``inps[0]``
            is the input tensor ``X``; ``inps[2]``/``inps[3]`` are read as the
            mask and the question mask when ``use_mask`` is set and
            ``inps[4]`` as the cost mask when ``use_cmask`` is set.
        use_mask : bool
            Whether ``inps`` carries the mask and the question mask.
        use_cmask : bool
            Whether ``inps`` carries the cost mask.
        use_noise : bool
            Forwarded to ``build_model``; the layers are run with
            ``deterministic=not use_noise``.
        mdl_name : str, optional
            Forwarded to ``build_model``.

        Returns
        -------
        tuple
            ``(self.probs, h1)``: the softmax output and the (possibly
            dropped-out) output of the upper GRU layer.

        Raises
        ------
        ValueError
            If the mask or the cost mask is missing; both are required to
            build the fact-level mask.
        """

        self.build_model(use_noise=use_noise, mdl_name=mdl_name)

        if not inps:
            inps = self.inps

        # Validate the number of inputs before indexing into them.
        assert (3 + sum([use_mask, use_cmask
                         ])) == len(inps), "inputs have illegal shape."

        X = inps[0]

        # Default to None so that a disabled mask is reported by the explicit
        # checks below instead of surfacing as a NameError.
        mask = None
        qmask = None
        cmask = None

        if use_mask:
            mask = inps[2]
            qmask = inps[3]

        if use_cmask:
            # NOTE(review): index 4 is out of range when use_mask is False
            # (the assert above then expects four inputs) -- confirm layout.
            cmask = inps[4]

        if cmask is None:
            raise ValueError("Mask for the answers should not be empty.")

        if mask is None:
            raise ValueError("Mask for the inputs should not be empty.")

        # m0 marks the strictly positive entries of X.
        m0 = as_floatX(TT.gt(X, 0))
        # m1 keeps the mask only where the cost mask is zero.
        m1 = mask * TT.eq(cmask, 0)

        low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
        Xr = X.reshape(low_inp_shp)

        grulow_inps = self.grulow_layer.fprop(Xr, deterministic=not use_noise)

        # The three projections were referenced without ever being defined;
        # unpack them the same way the upper layer does further down.
        # NOTE(review): reshaping to low_inp_shp mirrors the sibling model's
        # low-level GRU -- confirm against grulow_layer's output layout.
        low_reset_below, low_gater_below, low_state_below = [
            proj.reshape(low_inp_shp)
            for proj in list(grulow_inps.values())[:3]
        ]
        linps = [low_reset_below, low_gater_below, low_state_below]
        inp_shp = (X.shape[1], X.shape[2], -1)

        h0 = self.low_gru_layer.fprop(inps=linps,
                                      mask=m0,
                                      batch_size=self.batch_size)

        # Keep only the last step along the first axis of the low-level GRU
        # output and zero it out wherever m1 is zero.
        h0 = m1.dimshuffle(0, 1, 'x') * (h0.reshape(
            (X.shape[0], X.shape[1], X.shape[2], -1))[-1]).reshape(inp_shp)

        # A single Dropout op is shared by both dropout sites.
        dropOp = Dropout(dropout_prob=self.dropout) if self.dropout else None

        if dropOp is not None:
            h0 = dropOp(h0, deterministic=not use_noise)

        gruup_inps = self.gruup_layer.fprop(h0, deterministic=not use_noise)

        reset_below, gater_below, state_below = [
            proj.reshape(inp_shp)
            for proj in list(gruup_inps.values())[:3]
        ]
        uinps = [reset_below, gater_below, state_below]

        h1, _ = self.gru_layer.fprop(inps=uinps,
                                     maskf=m1,
                                     maskq=qmask,
                                     batch_size=self.batch_size)

        if dropOp is not None:
            h1 = dropOp(h1, deterministic=not use_noise)

        out_layer = self.out_layer.fprop(h1, deterministic=not use_noise)
        self.probs = Softmax(out_layer)
        return self.probs, h1
Пример #10
0
    def fprop(self, inps=None, leak_rate=0.05, use_noise=False, mdl_name=None):
        """
        Build the forward graph of the memory-augmented (``self.ntm``) model.

        Parameters
        ----------
        inps : list or dict, optional
            Model inputs; falls back to ``self.inps`` when empty.  A list is
            read positionally (``X``, ``y``, then the mask and the cost mask
            when the corresponding ``self.use_mask`` / ``self.use_cost_mask``
            flags are set); anything else is indexed with the keys ``'X'``,
            ``'y'``, ``'mask'`` and ``'cmask'``.
        leak_rate : float
            Leak of the ``Leaky_Rect`` applied when ``self.use_deepout`` is
            set.
        use_noise : bool
            Forwarded to ``build_model``; the layers are run with
            ``deterministic=not use_noise``.
        mdl_name : str, optional
            Forwarded to ``build_model``.

        Returns
        -------
        tuple
            ``(self.probs, self.ntm_outs)``.

        Raises
        ------
        ValueError
            If a cost mask is required but missing.
        """

        self.build_model(use_noise=use_noise, mdl_name=mdl_name)
        # NOTE(review): evaluation_mode is set to use_noise (not its
        # negation) and ntm.fprop below receives use_noise=not use_noise;
        # both look inverted -- confirm against the ntm layer.
        self.ntm.evaluation_mode = use_noise
        if not inps:
            inps = self.inps

        # First two are X and targets
        cmask = None
        mask = None
        if isinstance(inps, list):
            X = inps[0]
            y = inps[1]

            if self.use_mask:
                mask = inps[2]
                if self.use_cost_mask:
                    cmask = inps[3]
        else:
            X = inps['X']
            y = inps['y']
            if self.use_mask:
                mask = inps['mask']
                if self.use_cost_mask:
                    cmask = inps['cmask']

        # NOTE(review): m is only bound when use_cost_mask is set, but the
        # bag-of-words branch below reads it unconditionally.
        if self.use_cost_mask:
            if cmask is not None:
                if self.use_bow_cost_mask:
                    if mask.ndim == cmask.ndim:
                        m = (mask * TT.eq(cmask, 0)).reshape(
                            (cmask.shape[0] * cmask.shape[1], -1))
                    else:
                        m = (mask.dimshuffle(0, 1, 'x') *
                             TT.eq(cmask, 0))[:, :, 0].reshape(
                                 (cmask.shape[0] * cmask.shape[1], -1))
                else:
                    m = mask
            else:
                raise ValueError("Mask for the answers should not be empty.")

        if X.ndim == 2 and y.ndim == 1:
            # For sequential MNIST.
            if self.permute_order:
                X = X.dimshuffle(1, 0)
                idxs = self.rnd_indxs
                X = X[idxs]
            inp_shp = (X.shape[0], X.shape[1], -1)
        else:
            inp_shp = (X.shape[1], X.shape[2], -1)

        self.ntm_in = None
        if self.use_bow_input and not self.use_gru_inp_rep and not self.use_simple_rnn_inp_rep:

            bow_out = self.bow_layer.fprop(X,
                                           amask=m,
                                           deterministic=not use_noise)
            bow_out = bow_out.reshape((X.shape[1], X.shape[2], -1))
            self.ntm_in = bow_out

        elif self.use_gru_inp_rep:
            # m0 marks the strictly positive entries of X.
            m0 = as_floatX(TT.gt(X, 0))
            # NOTE(review): m1 stays unbound when either mask flag is off,
            # yet it is used to build ntm_in below.
            if self.use_mask and self.use_cost_mask:
                if cmask is not None:
                    m1 = mask * TT.eq(cmask, 0)
                else:
                    raise ValueError(
                        "Mask for the answers should not be empty.")

            low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
            Xr = X.reshape(low_inp_shp)
            grufact_inps = self.gru_fact_layer_inps.fprop(Xr)
            low_reset_below = grufact_inps.values()[0].reshape(low_inp_shp)
            low_gater_below = grufact_inps.values()[1].reshape(low_inp_shp)
            low_state_below = grufact_inps.values()[2].reshape(low_inp_shp)
            linps = [low_reset_below, low_gater_below, low_state_below]

            # Count of unmasked entries along the first axis; zeros are
            # replaced by one so the division below is always defined.
            m0_part = TT.cast(
                m0.sum(0).reshape(
                    (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32')
            m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)), as_floatX(1),
                                m0_part)

            h0 = self.gru_fact_layer.fprop(inps=linps,
                                           mask=m0,
                                           batch_size=self.batch_size)

            # Masked mean of the hidden states over the first axis.
            h0_4d = h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1))
            h0_mean = (m0.dimshuffle(0, 1, 2, 'x') * h0_4d).sum(0) / m0_part
            self.ntm_in = m1.dimshuffle(0, 1, 'x') * h0_mean.reshape(inp_shp)
        elif self.use_simple_rnn_inp_rep:
            m0 = as_floatX(TT.gt(X, 0))
            if cmask is not None:
                m1 = mask * TT.eq(cmask, 0)
            else:
                raise ValueError("Mask for the answers should not be empty.")

            low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
            Xr = X.reshape(low_inp_shp)
            rnnfact_inps = self.rnn_fact_layer_inps.fprop(Xr).reshape(
                low_inp_shp)
            m0 = m0.reshape(low_inp_shp)

            h0 = self.rnn_fact_layer.fprop(inps=rnnfact_inps,
                                           mask=m0,
                                           batch_size=self.batch_size)

            m0_part = TT.cast(
                m0.sum(0).reshape(
                    (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32')
            # Theano does not overload ``==`` symbolically: on a tensor
            # variable it yields a plain Python bool, which disabled this
            # divide-by-zero guard.  Use TT.eq like the GRU branch above.
            m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)), as_floatX(1),
                                m0_part)

            h0_4d = h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1))
            h0_mean = h0_4d.sum(0) / m0_part
            self.ntm_in = m1.dimshuffle(0, 1, 'x') * h0_mean.reshape(inp_shp)

        else:
            X_proj = self.inp_proj_layer.fprop(X)
            if not self.learn_embeds:
                X_proj = block_gradient(X_proj)

            if self.use_batch_norm:
                X_proj = self.batch_norm_layer.fprop(X_proj,
                                                     inference=not use_noise)
            self.ntm_in = X_proj

        context = None
        if self.use_context:
            if self.use_qmask:
                context = (self.qmask.dimshuffle(0, 1, 'x') *
                           self.ntm_in).sum(0)
            else:
                # NOTE(review): m1 is only bound in the GRU / simple-RNN
                # input branches above.
                m1_part = m1.sum(0).dimshuffle(0, 'x')
                context = self.ntm_in.sum(0) / m1_part

        self.ntm_outs = self.ntm.fprop(self.ntm_in,
                                       mask=mask,
                                       cmask=cmask,
                                       context=context,
                                       batch_size=self.batch_size,
                                       use_mask=self.use_mask,
                                       use_noise=not use_noise)

        h, m_read = self.ntm_outs[0], self.ntm_outs[2]

        if self.use_reinforce:
            self.w_samples, self.r_samples = self.ntm_outs[-2], self.ntm_outs[
                -1]

            # The write/read weights are addressed from the end of ntm_outs;
            # their offset depends on smoothed_diff_weights.
            if self.smoothed_diff_weights:
                idx = -6
            else:
                idx = -4

            self.write_weights, self.read_weights = self.ntm_outs[idx], \
                    self.ntm_outs[idx+1]
        else:
            self.write_weights, self.read_weights = self.ntm_outs[
                3], self.ntm_outs[4]

        if self.anticorrelation:
            acorr = AntiCorrelationConstraint(level=self.anticorrelation)
            rw1 = self.read_weights[:, 0]
            rw2 = self.read_weights[:, 1]
            self.reg += acorr(rw1, rw2, mask=mask)

        if self.correlation_ws:
            logger.info("Applying the correlation constraint.")
            corr_cons = CorrelationConstraint(level=self.correlation_ws)
            self.reg += corr_cons(self.read_weights, self.write_weights, mask,
                                  self.qmask)

        if self.use_last_hidden_state:
            h = h.reshape(inp_shp)
            h = h[-1]

        if self.use_deepout:
            merged_out = self.merge_layer.fprop([h, m_read])
            out_layer = Leaky_Rect(merged_out, leak_rate)

            if self.dropout:
                dropOp = Dropout(dropout_prob=self.dropout)
                out_layer = dropOp(out_layer, deterministic=not use_noise)

            out_layer = self.out_layer.fprop(out_layer,
                                             deterministic=not use_noise)
        else:
            if self.use_out_mem:
                if self.dropout:
                    dropOp = Dropout(dropout_prob=self.dropout)
                    m_read = dropOp(m_read, deterministic=not use_noise)

                mem_out = self.out_mem.fprop(m_read,
                                             deterministic=not use_noise)
                mem_scaler = self.out_scaler.fprop(
                    h, deterministic=not use_noise).reshape(
                        (mem_out.shape[0], )).dimshuffle(0, 'x')

                # The memory read-out is gated by a sigmoid of the scaler
                # before being added to the hidden-state projection.
                h_out = self.out_layer.fprop(h, deterministic=not use_noise)
                out_layer = h_out + mem_out * Sigmoid(mem_scaler)
            else:
                if self.dropout:
                    dropOp = Dropout(dropout_prob=self.dropout)
                    h = dropOp(h, deterministic=not use_noise)
                out_layer = self.out_layer.fprop(h,
                                                 deterministic=not use_noise)

        if self.predict_bow_out and self.bow_out_layer:
            logger.info("Using the bow output prediction.")
            self.bow_pred_out = Sigmoid(
                self.bow_out_layer.fprop(h, deterministic=not use_noise))

        if self.softmax:
            self.probs = Softmax(out_layer)
        else:
            self.probs = Sigmoid(out_layer)

        if self.ntm.updates:
            self.updates.update(self.ntm.updates)

        self.str_params(logger)

        self.h = h
        return self.probs, self.ntm_outs
Пример #11
0
    def __call__(self, probs,
                 samples,
                 baseline,
                 updates,
                 cost = None,
                 cost_mean=None,
                 mask=None,
                 seq_len=20,
                 batch_size=140,
                 deterministic=False,
                 dimshuffle_probs=True):
        """
        Build the known gradient of ``probs`` for the input-based baseline.

        The reward (``cost``) is centred with the supplied ``baseline`` and a
        running mean ("center"), scaled by a running standard deviation
        ("cost_var") and combined with a ``lambda2_reg``-weighted
        ``log(probs) + 1`` term.  The running statistics are shared
        variables looked up by name in ``updates`` and created (and
        registered there) when absent.

        Parameters
        ----------
        probs : Theano variable
            Probabilities the samples were drawn from.
        samples : Theano variable
            Samples; a 4-dimensional tensor selects a dedicated reward
            layout.
        baseline : Theano variable
            Baseline subtracted from the reward.
        updates : dict
            Update dictionary; extended in place with the running statistics.
        cost : Theano variable
            Reward signal.  Required despite the ``None`` default.
        cost_mean : Theano variable, optional
            Mean of the cost; defaults to ``cost.mean()``.
        mask : Theano variable, optional
            When given, multiplied into the gradient after a
            ``dimshuffle(0, 1, 'x')``.
        seq_len, batch_size, deterministic
            Accepted for interface compatibility; not used.
        dimshuffle_probs : bool
            Whether the last two axes of ``probs`` (and of the resulting
            gradient) are swapped.

        Returns
        -------
        tuple
            ``(updates, known_grads, new_center, cost_std, policy,
            lambda2_reg)`` where ``known_grads`` maps ``probs`` to its
            gradient.

        Raises
        ------
        ValueError
            If ``cost`` is missing or has an unsupported number of
            dimensions.
        """

        print("Using the input based baseline")
        # The previous guard tested the ``input`` builtin, which is never
        # None; ``cost`` is the argument that is dereferenced unconditionally.
        if cost is None:
            raise ValueError("cost for the %s should"
                             " not be empty." % self.__class__.__name__)

        if cost_mean is None:
            cost_mean = cost.mean()

        decay = as_floatX(self.decay)
        one_minus_decay = as_floatX(1 - self.decay)

        # Make sure a step counter is registered in the updates.
        key_step = get_key_byname_from_dict(updates, "step")
        if not key_step:
            step = sharedX(0., name="step")
            updates[step] = step + as_floatX(1)

        # Running mean of the cost.
        key_center = get_key_byname_from_dict(updates, "center")
        if key_center:
            new_center = updates[key_center]
        else:
            if self.generative_pred:
                center = sharedX(np.zeros((self.maxlen,)) + 0.15 + self.eps, name="center")
                new_center = decay * center + one_minus_decay * cost.mean(-1)
            else:
                center = sharedX(0.15 + self.eps, name="center")
                new_center = decay * center + one_minus_decay * cost_mean
            updates[center] = new_center

        # Running variance of the cost around the running mean.
        key_cvar = get_key_byname_from_dict(updates, "cost_var")
        if key_cvar:
            new_cost_var = updates[key_cvar]
        else:
            cost_var_tot = (cost_mean - new_center)**2
            if self.generative_pred:
                cost_var = sharedX(np.zeros((self.maxlen,)) + as_floatX(1.0), name="cost_var")
            else:
                cost_var = sharedX(1.0, name="cost_var")

            new_cost_var = decay * cost_var + one_minus_decay * cost_var_tot
            updates[cost_var] = new_cost_var

        # NOTE: a schedule for lambda2_reg driven by self.schedule_h_opts
        # used to be sketched here; it is disabled and the constant is used.
        lambda2_reg = self.lambda2_reg

        if dimshuffle_probs:
            probsd = probs.dimshuffle(0, 2, 1)
        else:
            probsd = probs

        if samples.ndim == 4:
            reward = cost.dimshuffle(0, 'x', 1, 'x')
            policy = -(TT.log(probsd + 1e-8) * samples).mean((2, 3)).sum()
        else:
            # The policy term does not depend on the rank of the cost, so it
            # is built once; previously it was missing for a 1-d cost, which
            # made the return statement fail with a NameError.
            # NOTE(review): confirm the same reduction is wanted for 1-d cost.
            policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()
            if cost.ndim == 2:
                if dimshuffle_probs:
                    reward = cost.dimshuffle(0, 'x', 1)
                    if self.generative_pred:
                        new_center = new_center.dimshuffle(0, 'x', 'x')
                        new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
                    baseline = baseline.dimshuffle(0, 2, 1)
                else:
                    reward = cost.dimshuffle(0, 1, 'x')
            elif cost.ndim == 1:
                reward = cost.dimshuffle('x', 0, 'x')
                if dimshuffle_probs:
                    baseline = baseline.dimshuffle(0, 2, 1)
                else:
                    baseline = baseline.dimshuffle(1, 0, 2)
            else:
                raise ValueError("cost should have 1 or 2 dimensions, "
                                 "got %d." % cost.ndim)

        # The standard deviation is floored at one, so the centred reward is
        # never scaled up.
        cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
        centered_reward = (reward - baseline - new_center) / cost_std

        if cost.ndim == 2:
            centered_reward = TT.addbroadcast(centered_reward, 1)

        N = probs.shape[-1]
        gradp = self.lambda1_reg * (centered_reward) * \
                (samples / (probsd + 1e-8)) + lambda2_reg * (TT.log(probsd + 1e-6) + as_floatX(1))

        if dimshuffle_probs:
            # Bring the gradient back to the axis order of probs.
            gradp = gradp.dimshuffle(0, 2, 1)

        if mask is not None:
            if self.generative_pred:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp / N
            else:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp

        known_grads = {probs: gradp}
        return updates, known_grads, new_center, cost_std, policy, lambda2_reg
Пример #12
0
    def __call__(self, probs,
                 samples,
                 updates,
                 cost=None,
                 mask=None,
                 deterministic=False,
                 child_probs=None,
                 dimshuffle_probs=False,
                 child_samples=None):

        if input is None:
            raise ValueError("input for the %s should "
                             " not be empty." % __class__.__name__)

        key_baseline = get_key_byname_from_dict(updates, "baseline")
        step = 0

        if key_baseline:
            rbaseline = updates[key_baseline]
            key_step = get_key_byname_from_dict(updates, "step")
            if key_step:
                step = updates[key_step]
            else:
                step = sharedX(0., name="step")
        else:
            if self.generative_pred:
                baseline = sharedX(np.zeros((self.maxlen,)) + 1.0 + self.eps, name="baseline")
            else:
                baseline = sharedX(0. + 1.0 + self.eps, name="new_baseline")

            key_step = get_key_byname_from_dict(updates, "step")
            fix_decay = self.decay**(step + as_floatX(1))

            if key_step:
                step = updates[key_step]
            else:
                step = sharedX(0., name="step")
                updates[step] = step + as_floatX(1)

            if self.use_rms_baseline:
                if self.generative_pred:
                    new_baseline = as_floatX(self.decay) * baseline + as_floatX(1 - self.decay) * cost.mean(-1)**2
                else:
                    new_baseline = as_floatX(self.decay) * baseline + as_floatX(1 - self.decay) * cost.mean()**2

                updates[baseline] = new_baseline
                rbaseline = new_baseline / (1 - fix_decay)
                rbaseline = TT.sqrt(rbaseline)
            else:
                if self.generative_pred:
                    new_baseline = as_floatX(self.decay) * baseline + as_floatX(1 - self.decay) * cost.mean(-1)
                else:
                    new_baseline = as_floatX(self.decay) * baseline + as_floatX(1 - self.decay) * cost.mean()

                updates[baseline] = new_baseline
                rbaseline = new_baseline

        key_cvar = get_key_byname_from_dict(updates, "cost_var")

        if key_cvar:
            cost_var = updates[key_cvar]
            new_cost_var = cost_var
        else:
            if self.generative_pred:
                cost_var = sharedX(np.zeros((self.maxlen,)) + as_floatX(1.2), name="cost_var")
                cost_var_ave = (cost.mean(-1) - new_baseline)**2
            else:
                cost_var = sharedX(as_floatX(1.2), name="cost_var")
                cost_var_ave = (cost.mean() - new_baseline)**2

            new_cost_var = as_floatX(self.decay) * cost_var + as_floatX(1 - self.decay) * cost_var_ave
            updates[cost_var] = new_cost_var

        lambda2_reg = self.lambda2_reg

        """
        if not self.schedule_h_opts:
            start = self.schedule_h_opts["lambda2_reg_start"]
            nbatches = self.schedule_h_opts["end_nbatches"]
            end = self.lambda2_reg
            assert start > end
            lambda2_reg = TT.minimum(((start - end) * step / nbatches) + start,
                                       end)
        """

        if dimshuffle_probs:
            probsd = probs.dimshuffle(0, 2, 1)
        else:
            probsd = probs

        if probs.ndim == 3 and cost.ndim == 1:
            if dimshuffle_probs:
                reward = cost.dimshuffle('x', 'x', 0)
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle('x', 'x', 0)
                    cost_std = new_cost_var.dimshuffle('x', 'x', 0)
            else:
                reward = cost.dimshuffle('x', 0, 'x')
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle('x', 0, 'x')
                    cost_std = new_cost_var.dimshuffle('x', 0, 'x')
        elif probs.ndim == 3 and cost.ndim == 2:
            if dimshuffle_probs:
                reward = cost.dimshuffle(0, 'x', 1)
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle(0, 'x', 'x')
                    new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
            else:
                reward = cost.dimshuffle('x', 0, 1)
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle('x', 0, 'x')
                    new_cost_var = new_cost_var.dimshuffle('x', 0, 'x')

        elif probs.ndim == 4 and self.cost.ndim == 1:
            reward = cost.dimshuffle('x', 'x', 0, 'x')
        elif probs.ndim == 4:
            reward = cost.dimshuffle(0, 'x', 1, 'x')

        centered_cost = reward - rbaseline
        N = probsd.shape[-1]
        if self.use_cost_std:
            cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
        else:
            cost_std = 1

        if child_probs is not None and child_samples is not None:
            cprobs1 = child_samples / (child_probs + 1e-8) + samples / (probsd + 1e-8)
        else:
            cprobs1 = samples / (probsd + 1e-8)

        gradp = self.lambda1_reg * (centered_cost / cost_std) * \
                (cprobs1) + (lambda2_reg) * (TT.log(probsd + 1e-8) + as_floatX(1))

        if dimshuffle_probs:
            gradp = gradp.dimshuffle(0, 2, 1)

        if mask is not None:
            if dimshuffle_probs:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp
            else:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

        known_grads = {probs: gradp}
        policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()
        return updates, known_grads, rbaseline, cost_std, policy, lambda2_reg
Пример #13
0
    def fprop(self,
              inps=None,
              use_mask=True,
              use_cmask=True,
              use_noise=False,
              mdl_name=None):
        """Build the symbolic forward pass and return the output distribution.

        The model is (re)built through ``self.build_model`` and the inputs are
        pushed through ``bow_layer`` -> ``bowup_layer`` -> ``lstm_layer`` ->
        (optional deep-output layers and dropout) -> ``out_layer`` -> Softmax.

        Parameters
        ----------
        inps : sequence, optional
            Model inputs; ``self.inps`` is used when this is empty or None.
            Position 0 is read as ``X``, position 2 as ``mask`` (when
            ``use_mask``) and positions 3 and 4 as ``cmask`` and ``qmask``
            (when ``use_cmask``). Position 1 is not read by this method.
        use_mask : bool
            Whether ``inps[2]`` is read as the mask.
        use_cmask : bool
            Whether ``inps[3]`` / ``inps[4]`` are read as ``cmask`` / ``qmask``.
        use_noise : bool
            Forwarded to ``build_model``; the layers are called with
            ``deterministic=not use_noise``.
        mdl_name : optional
            Forwarded unchanged to ``build_model``.

        Returns
        -------
        tuple
            ``(probs, h)``: the Softmax of the output layer (also stored on
            ``self.probs``) and the first value returned by
            ``self.lstm_layer.fprop``.

        Raises
        ------
        AssertionError
            If ``len(inps)`` differs from ``3 + use_mask + use_cmask``.
        ValueError
            If no ``cmask`` is available (``use_cmask`` is False).
        """
        self.build_model(use_noise=use_noise, mdl_name=mdl_name)

        if not inps:
            inps = self.inps

        X = inps[0]

        # Bind the optional inputs unconditionally. They used to be assigned
        # only inside the ``if`` branches, so disabling ``use_cmask`` hit an
        # UnboundLocalError instead of the ValueError raised below.
        mask = inps[2] if use_mask else None
        cmask = None
        qmask = None
        if use_cmask:
            cmask = inps[3]
            qmask = inps[4]

        # NOTE(review): input positions are fixed while this check counts one
        # input per enabled flag, so ``use_cmask=True`` with ``use_mask=False``
        # cannot satisfy both the indexing above and this length check.
        # Confirm the intended input layout with the callers.
        assert (3 + sum([use_mask, use_cmask
                         ])) == len(inps), "inputs have illegal shape."

        if cmask is None:
            raise ValueError("Mask for the answers should not be empty.")

        # Keep only the masked-in positions where ``cmask`` is zero.
        cmask_2d = cmask.reshape((cmask.shape[0], cmask.shape[1]))
        m = mask * TT.eq(cmask_2d, 0)

        deterministic = not use_noise
        bow_out = self.bow_layer.fprop(X,
                                       amask=m,
                                       qmask=qmask,
                                       deterministic=deterministic)
        bow_outs = self.bowup_layer.fprop(bow_out, deterministic=deterministic)

        # One projection per entry of ``self.cnames[0..3]``; the original
        # local names were forget, input, output and cell, in that order.
        below_shape = (X.shape[1], X.shape[2], -1)
        inps = [bow_outs[self.cnames[k]].reshape(below_shape)
                for k in range(4)]

        h, _ = self.lstm_layer.fprop(inps=inps,
                                     mask=mask,
                                     batch_size=self.batch_size)

        if self.deepout:
            # ``bow_out`` shifted by one step along axis 0, with the first
            # entry zeroed out.
            new_bow = TT.roll(bow_out, 1, axis=0)
            new_bow = TT.set_subtensor(new_bow[0], as_floatX(0))
            h_deepout = self.deepout_layer_ht.fprop(h)
            emb_deepout = self.deepout_layer_qbow.fprop(new_bow)
            z = Leaky_Rect(h_deepout + emb_deepout, 0.01)
        else:
            z = h

        # Dropout is applied identically in both the deep-output and the
        # plain case.
        if self.dropout:
            dropOp = Dropout(dropout_prob=self.dropout)
            z = dropOp(z, deterministic=deterministic)

        out_layer = self.out_layer.fprop(z, deterministic=deterministic)
        self.probs = Softmax(out_layer)
        return self.probs, h
Пример #14
0
    def __call__(self, probs,
                 samples,
                 baseline,
                 updates,
                 cost=None,
                 mask=None,
                 seq_len=20,
                 batch_size=140,
                 deterministic=False):
        """Build the gradient of the cost with respect to ``probs``.

        Running estimates of the cost centre and cost variance are kept in
        shared variables that are registered in ``updates`` (named ``"step"``,
        ``"center"`` and ``"cost_var"``); entries already found in ``updates``
        through ``get_key_byname_from_dict`` are reused instead.

        Parameters
        ----------
        probs : symbolic tensor
            Probabilities the gradient is computed for (3-D or 4-D).
        samples : symbolic tensor
            Samples drawn from ``probs``; its ``ndim`` selects how ``cost`` is
            broadcast.
        baseline : symbolic tensor
            Baseline subtracted from the reward. It is dimshuffled with
            ``(1, 0, 2)`` when ``cost`` is 1-D.
        updates : dict
            Update dictionary, modified in place and returned.
        cost : symbolic tensor
            Required despite the ``None`` default. Must be 1-D or 2-D unless
            ``samples`` is 4-D.
        mask : symbolic tensor, optional
            When given, the gradient is multiplied by
            ``mask.dimshuffle(0, 1, 'x')`` and divided by ``probs.shape[-1]``.
        seq_len, batch_size, deterministic :
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        tuple
            ``(updates, known_grads, new_center, cost_std, policy,
            lambda2_reg)`` where ``known_grads`` is ``{probs: gradp}``.

        Raises
        ------
        ValueError
            If ``probs`` or ``cost`` is None, or if ``cost`` has an
            unsupported number of dimensions.
        """
        # The original guard tested the builtin ``input`` (never None), so it
        # could not fire; validate the arguments that are really required.
        if probs is None:
            raise ValueError("input for the %s should"
                             " not be empty." % self.__class__.__name__)
        if cost is None:
            raise ValueError("cost for the %s should"
                             " not be empty." % self.__class__.__name__)

        # Step counter: reuse the one registered in ``updates`` if present.
        key_step = get_key_byname_from_dict(updates, "step")
        if key_step:
            step = updates[key_step]
        else:
            step = sharedX(0., name="step")
            updates[step] = step + as_floatX(1)

        # Exponential moving average of the mean summed cost.
        key_center = get_key_byname_from_dict(updates, "center")
        if key_center:
            center = updates[key_center]
            new_center = center
        else:
            center = sharedX(0.08 + self.eps, name="center")
            new_center = as_floatX(self.decay) * center + \
                    as_floatX(1 - self.decay) * cost.sum(0).mean()
            updates[center] = new_center

        # Exponential moving average of the squared deviation from the centre.
        key_cvar = get_key_byname_from_dict(updates, "cost_var")
        if key_cvar:
            cost_var = updates[key_cvar]
            new_cost_var = cost_var
        else:
            cost_var_tot = (cost.sum(0).mean() - new_center)**2
            cost_var = sharedX(as_floatX(0.5), name="cost_var")
            new_cost_var = as_floatX(self.decay) * cost_var + \
                    as_floatX(1 - self.decay) * cost_var_tot
            updates[cost_var] = new_cost_var

        lambda2_reg = self.lambda2_reg

        # The schedule options are indexed below, so they must be present;
        # the original condition was inverted and only entered this branch
        # when there was nothing to index.
        if self.schedule_h_opts:
            start = self.schedule_h_opts["lambda2_reg_start"]
            nbatches = self.schedule_h_opts["end_nbatches"]
            end = self.lambda2_reg
            assert start > end
            # NOTE(review): for non-negative ``step`` and positive
            # ``nbatches`` the first argument is never below ``start`` (> end),
            # so this minimum always selects ``end``. Confirm the intended
            # annealing formula before relying on the schedule.
            lambda2_reg = TT.minimum(((start - end) * step / nbatches) + start,
                                       end)

        if samples.ndim == 4:
            reward = cost.dimshuffle(0, 'x', 1, 'x')
            policy = (TT.log(probs + 1e-8) * samples).mean((2, 3)).sum()
        else:
            if cost.ndim == 2:
                reward = cost.dimshuffle(0, 1, 'x')
            elif cost.ndim == 1:
                reward = cost.dimshuffle('x', 0, 'x')
                baseline = baseline.dimshuffle(1, 0, 2)
            else:
                # Previously fell through and failed later on an unbound
                # ``reward``.
                raise ValueError("cost should have 1 or 2 dimensions, "
                                 "got %d." % cost.ndim)

            policy = (TT.log(probs + 1e-8) * samples).mean((1, 2)).sum()

        # Standard deviation of the cost, floored at 1e-6 before it is used
        # as a divisor.
        cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1e-6)
        centered_reward = (reward - baseline - new_center) / cost_std
        N = probs.shape[-1]

        # First term: centred reward times samples / probs, scaled by
        # lambda1_reg. Second term: lambda2_reg * (log(probs) + 1).
        gradp = self.lambda1_reg * (centered_reward) * \
                (samples / (probs + 1e-8)) + lambda2_reg * (TT.log(probs + 1e-6) + as_floatX(1))

        if mask is not None:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

        known_grads = {probs: gradp}
        return updates, known_grads, new_center, cost_std, policy, lambda2_reg