示例#1
0
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        Build the two compiled functions for gradient descent + momentum.

        Parameters
        ----------
        learning_rate : theano variable
            Learning rate coefficient. It is the only input of the returned
            update function, so it has to be a symbolic variable.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        inp : list
            Inputs of the gradient function.
        cost : theano expression
            Returned as the first output of the gradient function.
        errors : theano expression
            Returned as the second output of the gradient function.
        lr_scalers : dict, optional
            A dictionary mapping from the model's parameters to a learning
            rate multiplier. Parameters that are absent use 1.; ``None``
            (the default) is treated as an empty dictionary.

        Returns
        -------
        f_grad_shared : function
            Takes `inp`, returns ``[cost, errors, gnorm, pnorm]`` and stores
            every gradient in a per-parameter shared buffer.
        f_update : function
            Takes `learning_rate`, returns nothing, and applies the momentum
            update computed from the stored gradients.
        """
        if lr_scalers is None:
            lr_scalers = {}

        # Built from (key, value) pairs rather than from a plain dict so the
        # buffers keep the iteration order of `grads`.
        gshared = OrderedDict(
            (p, sharedX(p.get_value() * 0., name='%s_grad' % p.name))
            for p in grads.keys())

        # Pair each buffer with its gradient by key, not by position.
        gsup = [(gshared[p], g) for p, g in grads.items()]

        def get_norms(tensors):
            # Global L2 norm over a collection of tensors.
            return T.sqrt(sum((t ** 2).sum() for t in tensors))

        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=gsup)
        updates = OrderedDict()

        # `grad` is the shared buffer filled by f_grad_shared, so f_update
        # does not depend on the data inputs.
        for param, grad in gshared.items():
            vel = sharedX(param.get_value() * 0.)
            assert param.dtype == vel.dtype
            assert grad.dtype == param.dtype
            if param.name is not None:
                vel.name = 'vel_' + param.name

            scaled_lr = learning_rate * lr_scalers.get(param, 1.)
            updates[vel] = self.momentum * vel - scaled_lr * grad

            inc = updates[vel]
            if self.nesterov_momentum:
                # Nesterov variant: apply the momentum once more on top of
                # the freshly updated velocity.
                inc = self.momentum * inc - scaled_lr * grad

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        f_update = theano.function([learning_rate], [],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#2
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Compute the AdaDelta updates.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient. After scaling by `lr_scalers` it is
            used as the epsilon term added inside both square roots.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict, optional
            A dictionary mapping from the model's parameters to a learning
            rate multiplier. Parameters that are absent use 1.; ``None``
            (the default) is treated as an empty dictionary.

        Returns
        -------
        updates : OrderedDict
            Updates for the parameters and for the two running averages
            kept per parameter.
        tot_norm_up : theano expression
            Sum over parameters of the L2 norm of each step.
        tot_param_norm : theano expression
            Sum over parameters of the L2 norm of each parameter.
        """
        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()

        tot_norm_up = 0
        tot_param_norm = 0

        for param, grad in grads.items():

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.)

            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grad))

            # Compute update
            epsilon = lr_scalers.get(param, 1.) * learning_rate
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = -rms_dx_tm1 / rms_grad_t * grad

            # Accumulate updates
            new_mean_square_dx = (self.decay * mean_square_dx +
                                  (1 - self.decay) * T.sqr(delta_x_t))

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

            tot_norm_up += delta_x_t.norm(2)
            tot_param_norm += param.norm(2)

        return updates, tot_norm_up, tot_param_norm
示例#3
0
def adam(lr, tparams, grads, inp, cost, errors):
    """
    Compile the gradient and update functions for an Adam-style optimizer.

    `grads` maps each parameter to its gradient and `tparams` maps names to
    the same parameters. Returns ``(f_grad_shared, f_update)``: the first
    takes `inp`, returns ``[cost, errors, gnorm, pnorm]`` and stores the
    gradients in shared buffers; the second takes `lr`, applies the
    parameter step and returns the norm of that step.
    """
    # One zero-filled shared buffer per parameter to hold its last gradient.
    gshared = OrderedDict()
    for p, _ in grads.iteritems():
        gshared[p] = sharedX(p.get_value() * 0., name='%s_grad' % p.name)

    store_grads = [(gshared[p], g) for p, g in grads.iteritems()]
    grad_norm = get_norms(grads.values())
    param_norm = get_norms(tparams.values())
    f_grad_shared = theano.function(
        inp,
        [cost, errors, grad_norm, param_norm],
        updates=store_grads,
        profile=profile)

    # b1 / b2 weight the *new* gradient in the running averages below.
    b1 = 0.1
    b2 = 0.001
    eps = 1e-8

    # Step counter, incremented once per call of f_update.
    step = sharedX(numpy.float32(0.))
    next_step = step + 1.

    # Correction factors for the zero-initialised running averages.
    corr1 = 1.0 - (1 - b1) ** next_step
    corr2 = 1.0 - (1 - b2) ** next_step
    step_size = lr * (tensor.sqrt(corr2) / corr1)

    steps = []
    updates = []
    for p in tparams.values():
        g = gshared[p]
        mean = sharedX(p.get_value() * 0.)
        var = sharedX(p.get_value() * 0.)
        new_mean = (b1 * g) + ((1. - b1) * mean)
        new_var = (b2 * tensor.sqr(g)) + ((1. - b2) * var)
        direction = new_mean / (tensor.sqrt(new_var) + eps)
        steps.append(step_size * direction)
        updates.extend([
            (mean, new_mean),
            (var, new_var),
            (p, p - (step_size * direction)),
        ])

    updates.append((step, next_step))
    f_update = theano.function([lr],
                               [get_norms(steps)],
                               updates=updates,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Build the two compiled functions for gradient descent + momentum.

        Parameters
        ----------
        learning_rate : theano variable
            Learning rate coefficient. It is the only input of the returned
            update function, so it has to be a symbolic variable.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        inp : list
            Inputs of the gradient function.
        cost : theano expression
            First output of the gradient function.
        errors : theano expression
            Second output of the gradient function.
        lr_scalers : dict, optional
            A dictionary mapping from the model's parameters to a learning
            rate multiplier. Missing parameters use 1.; ``None`` (the
            default) behaves like an empty dictionary.

        Returns
        -------
        f_grad_shared : function
            Takes `inp`, returns ``[cost, errors, gnorm, pnorm]`` and writes
            each gradient into a per-parameter shared buffer.
        f_update : function
            Takes `learning_rate`, returns nothing, and applies the momentum
            step computed from the stored gradients.
        """
        lr_scalers = {} if lr_scalers is None else lr_scalers

        # Insert one pair at a time so the buffer order follows `grads`
        # instead of the arbitrary order of an intermediate plain dict.
        gshared = OrderedDict()
        for p in grads.keys():
            gshared[p] = sharedX(p.get_value() * 0., name='%s_grad' % p.name)

        # Match buffers and gradients by key, never by position.
        gsup = [(gshared[p], g) for p, g in grads.items()]

        def get_norms(tensors):
            # Global L2 norm over a collection of tensors.
            return T.sqrt(sum((t ** 2).sum() for t in tensors))

        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)
        updates = OrderedDict()

        # Iterate (parameter, stored gradient) pairs; the stored gradient is
        # a shared variable, so f_update needs no data inputs.
        for param, grad in gshared.items():
            vel = sharedX(param.get_value() * 0.)
            assert param.dtype == vel.dtype
            assert grad.dtype == param.dtype
            if param.name is not None:
                vel.name = 'vel_' + param.name

            scaled_lr = learning_rate * lr_scalers.get(param, 1.)
            updates[vel] = self.momentum * vel - scaled_lr * grad

            inc = updates[vel]
            if self.nesterov_momentum:
                # Nesterov variant: look ahead by applying the momentum to
                # the already-updated velocity.
                inc = self.momentum * inc - scaled_lr * grad

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        f_update = theano.function([learning_rate],
                                   [],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#5
0
def adam(lr, tparams, grads, inp, cost, errors):
    """
    Build the two compiled functions of an Adam-style optimizer.

    Returns ``(f_grad_shared, f_update)``. ``f_grad_shared`` takes `inp`,
    outputs ``[cost, errors, gnorm, pnorm]`` and copies every gradient of
    `grads` into a shared buffer. ``f_update`` takes `lr`, updates the
    running averages and the parameters in `tparams`, and outputs the norm
    of the applied step.
    """
    # Shared buffers, keyed by parameter, that hold the latest gradients.
    gshared = OrderedDict(dict(
        (p, sharedX(p.get_value() * 0., name='%s_grad' % p.name))
        for p, g in grads.iteritems()))

    gsup = []
    for p, g in grads.iteritems():
        gsup.append((gshared[p], g))

    outputs = [cost, errors,
               get_norms(grads.values()),
               get_norms(tparams.values())]
    f_grad_shared = theano.function(inp, outputs,
                                    updates=gsup,
                                    profile=profile)

    # Weights given to the newest gradient in the two running averages.
    b1, b2 = 0.1, 0.001
    e = 1e-8

    counter = sharedX(numpy.float32(0.))
    counter_next = counter + 1.

    # Compensate for the running averages starting at zero.
    fix1 = 1.0 - (1 - b1)**(counter_next)
    fix2 = 1.0 - (1 - b2)**(counter_next)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    up_list = []
    updates = []
    for p in tparams.values():
        g = gshared[p]
        first = sharedX(p.get_value() * 0.)
        second = sharedX(p.get_value() * 0.)
        first_t = (b1 * g) + ((1. - b1) * first)
        second_t = (b2 * tensor.sqr(g)) + ((1. - b2) * second)
        scaled = first_t / (tensor.sqrt(second_t) + e)
        up_list.append(lr_t * scaled)
        updates += [(first, first_t),
                    (second, second_t),
                    (p, p - (lr_t * scaled))]

    updates.append((counter, counter_next))
    upnorm = get_norms(up_list)
    f_update = theano.function([lr], [upnorm],
                               updates=updates,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
示例#6
0
    def __init__(self,
                 decay=0.95,
                 gamma_clip=0.0,
                 grad_clip=None,
                 start_var_reduction=0,
                 delta_clip=None,
                 use_adagrad=False,
                 skip_nan_inf=False,
                 use_corrected_grad=True):
        """
        Store the hyper-parameters of the learning rule.

        `decay` must lie in [0, 1) and is wrapped in a shared variable named
        "decay"; every other argument is stored unchanged under its own
        name.
        """
        assert decay >= 0. and decay < 1.

        self.decay = sharedX(decay, "decay")

        # Clipping thresholds.
        self.gamma_clip = gamma_clip
        self.grad_clip = grad_clip
        self.delta_clip = delta_clip

        # Behaviour switches.
        self.use_adagrad = use_adagrad
        self.use_corrected_grad = use_corrected_grad
        self.skip_nan_inf = skip_nan_inf
        self.start_var_reduction = start_var_reduction

        self.damping = 1e-7

        # Tau is kept within fixed bounds: left unbounded it can grow
        # arbitrarily large, which causes numerical instabilities for very
        # deep networks, and once it is very large it keeps increasing
        # indefinitely.
        self.lower_bound_tau = 1.5
        self.upper_bound_tau = 1e7
        self.lower_bound_tau = 1.5
示例#7
0
    def construct_updates(self, grads):
        """
        Register accumulator updates in ``self.updates`` and return the
        gradients to use.

        For every ``(param, grad)`` pair of `grads`, the entry of
        ``self.gs`` whose name contains the parameter's name is looked up;
        a ``ValueError`` is raised when there is none. When
        ``self.nbatches`` makes the rate ``1 / nbatches`` smaller than one,
        the returned gradient is ``(grad + accumulator) * rate`` and an
        update for the accumulator is stored; otherwise the gradient is
        returned untouched.
        """
        if not self.updates:
            self.updates = OrderedDict()

        result = OrderedDict()

        # Counts calls; `on_boundary` is 1 when the counter is a multiple
        # of nbatches.
        counter = sharedX(0, name="mb_step")
        self.updates[counter] = counter + 1
        on_boundary = TT.eq(counter % self.nbatches, 0)
        rate = 1.0 / self.nbatches

        for op, og in grads.iteritems():
            # Index of the first accumulator whose name contains op.name.
            idx = next((pos for pos, acc in enumerate(self.gs)
                        if op.name in acc.name), None)
            if idx is None:
                raise ValueError("Gradient for %s was not found." % op.name)

            if not rate < 1.0:
                result[op] = og
                continue

            new_grad = (og + self.gs[idx]) * as_floatX(rate)
            kept = on_boundary * new_grad
            restarted = (1 - on_boundary) * og * as_floatX(rate)
            self.updates[self.gs[idx]] = kept + restarted
            result[op] = new_grad

        return result
示例#8
0
def adadelta(lr, tparams, grads, inp, cost, errors):
    """
    Compile the gradient and update functions for AdaDelta.

    `tparams` maps names to parameters and `grads` is the matching sequence
    of gradients. Returns ``(f_grad_shared, f_update)``: the first takes
    `inp`, returns ``[cost, errors, gnorm, pnorm]`` and refreshes the stored
    gradients and their running squared average; the second takes `lr`
    (not used by the update expressions), applies the step and returns its
    norm.
    """
    gnorm = get_norms(grads)
    pnorm = get_norms(tparams.values())

    def _zero_buffers(name_template):
        # One zero-filled shared variable per parameter, named after its key.
        return [sharedX(p.get_value() * numpy.float32(0.),
                        name=name_template % k)
                for k, p in tparams.iteritems()]

    zipped_grads = _zero_buffers('%s_grad')
    running_up2 = _zero_buffers('%s_rup2')
    running_grads2 = _zero_buffers('%s_rgrad2')

    # Store the raw gradients and update the running average of g^2.
    zgup = list(zip(zipped_grads, grads))
    rg2up = [(avg, 0.95 * avg + 0.05 * (g ** 2))
             for avg, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=zgup + rg2up)

    # Step = -RMS(previous steps) / RMS(gradients) * gradient.
    updir = []
    for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2):
        updir.append(-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg)

    ru2up = [(avg, 0.95 * avg + 0.05 * (step ** 2))
             for avg, step in zip(running_up2, updir)]
    param_up = [(p, p + step) for p, step in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [get_norms(updir)],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
示例#9
0
def adadelta(lr, tparams, grads, inp, cost, errors):
    """
    Build the AdaDelta training functions.

    Returns ``(f_grad_shared, f_update)``. ``f_grad_shared`` takes `inp`,
    outputs ``[cost, errors, gnorm, pnorm]`` and stores the gradients plus
    the running average of their squares. ``f_update`` takes `lr` (the
    update expressions do not use it), moves the parameters of `tparams`
    and outputs the norm of the step.
    """
    gnorm = get_norms(grads)
    pnorm = get_norms(tparams.values())

    # Three zero-filled shared buffers per parameter.
    zipped_grads = []
    running_up2 = []
    running_grads2 = []
    for k, p in tparams.iteritems():
        zipped_grads.append(
            sharedX(p.get_value() * numpy.float32(0.), name='%s_grad' % k))
        running_up2.append(
            sharedX(p.get_value() * numpy.float32(0.), name='%s_rup2' % k))
        running_grads2.append(
            sharedX(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k))

    # Gradient storage and running average of squared gradients.
    zgup = []
    rg2up = []
    for zg, rg2, g in zip(zipped_grads, running_grads2, grads):
        zgup.append((zg, g))
        rg2up.append((rg2, 0.95 * rg2 + 0.05 * (g ** 2)))

    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=zgup + rg2up)

    # Per-parameter step, scaled by the ratio of the two RMS estimates.
    updir = [
        -tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
        for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
    ]

    ru2up = []
    for ru2, ud in zip(running_up2, updir):
        ru2up.append((ru2, 0.95 * ru2 + 0.05 * (ud ** 2)))

    param_up = []
    for p, ud in zip(itemlist(tparams), updir):
        param_up.append((p, p + ud))

    upnorm = get_norms(updir)
    f_update = theano.function([lr], [upnorm],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
示例#10
0
def rmsprop(lr, tparams, grads, inp, cost, errors):
    """
    Compile the gradient and update functions for RMSProp with momentum.

    Parameters
    ----------
    lr : theano variable
        Learning rate; the only input of the update function.
    tparams : dict
        Maps names to the shared parameters.
    grads : list
        Gradients, in the same order as `tparams`.
    inp : list
        Inputs of the gradient function.
    cost, errors : theano expressions
        First two outputs of the gradient function.

    Returns
    -------
    f_grad_shared : function
        Takes `inp`, returns ``[cost, errors, gnorm, pnorm]`` and stores the
        gradients together with running averages of the gradient and of
        its square.
    f_update : function
        Takes `lr`, applies the step and returns the norm of the new update
        directions.
    """
    def _zero_buffers(name_template):
        # One zero-filled shared variable per parameter, named after its key.
        return [sharedX(p.get_value() * numpy.float32(0.),
                        name=name_template % k)
                for k, p in tparams.items()]

    zipped_grads = _zero_buffers('%s_grad')
    running_grads = _zero_buffers('%s_rgrad')
    running_grads2 = _zero_buffers('%s_rgrad2')

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = _zero_buffers('%s_updir')

    # The gradient is divided by sqrt(E[g^2] - E[g]^2 + 1e-8), floored at
    # 1e-8. The closing parenthesis of tensor.maximum must come after the
    # floor value: each update is a (variable, expression) pair and
    # maximum takes two arguments.
    updir_new = [
        (ud,
         0.9 * ud - lr * zg /
         tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8), 1e-8))
        for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                   running_grads, running_grads2)
    ]

    param_up = [(p, p + new_dir)
                for p, (_, new_dir) in zip(itemlist(tparams), updir_new)]

    # Norm of the new directions only, not of the (variable, expr) pairs.
    upnorm = get_norms([new_dir for _, new_dir in updir_new])
    f_update = theano.function([lr],
                               [upnorm],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
示例#11
0
def rmsprop(lr, tparams, grads, inp, cost, errors):
    """
    Build the training functions for RMSProp with momentum.

    Parameters
    ----------
    lr : theano variable
        Learning rate; the only input of the update function.
    tparams : dict
        Maps names to the shared parameters.
    grads : list
        Gradients, in the same order as `tparams`.
    inp : list
        Inputs of the gradient function.
    cost, errors : theano expressions
        First two outputs of the gradient function.

    Returns
    -------
    f_grad_shared : function
        Takes `inp`, returns ``[cost, errors, gnorm, pnorm]`` and stores the
        gradients plus running averages of the gradient and its square.
    f_update : function
        Takes `lr`, applies the step and returns the norm of the new update
        directions.
    """
    zipped_grads = []
    running_grads = []
    running_grads2 = []
    updir = []
    for k, p in tparams.items():
        zeros = p.get_value() * numpy.float32(0.)
        zipped_grads.append(sharedX(zeros, name='%s_grad' % k))
        running_grads.append(sharedX(zeros, name='%s_rgrad' % k))
        running_grads2.append(sharedX(zeros, name='%s_rgrad2' % k))
        updir.append(sharedX(zeros, name='%s_updir' % k))

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir_new = []
    new_dirs = []
    for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                               running_grads, running_grads2):
        # Running standard deviation of the gradient, floored at 1e-8.
        # tensor.maximum needs both arguments inside its parentheses.
        scale = tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8), 1e-8)
        new_dir = 0.9 * ud - lr * zg / scale
        new_dirs.append(new_dir)
        updir_new.append((ud, new_dir))

    param_up = [(p, p + new_dir)
                for p, new_dir in zip(itemlist(tparams), new_dirs)]

    # Norm of the direction expressions, not of the update pairs.
    upnorm = get_norms(new_dirs)
    f_update = theano.function([lr], [upnorm],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
示例#12
0
def sgd(lr, tparams, grads, x, mask, y, cost, errors):
    """
    Compile the gradient and update functions for plain SGD.

    Parameters
    ----------
    lr : theano variable
        Learning rate; the only input of the update function.
    tparams : dict
        Maps names to the shared parameters.
    grads : list
        Gradients, in the same order as `tparams`.
    x, mask, y : theano variables
        Inputs of the gradient function.
    cost, errors : theano expressions
        First two outputs of the gradient function.

    Returns
    -------
    f_grad_shared : function
        Takes ``(x, mask, y)``, returns ``[cost, errors, gnorm, pnorm]`` and
        stores the gradients in shared buffers.
    f_update : function
        Takes `lr`, subtracts ``lr * stored gradient`` from every parameter
        and returns the norm of that step.
    """
    gshared = [sharedX(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function([x, mask, y],
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup, profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    # The step norm is computed from the stored gradients: `gnorm` is a
    # function of x, mask and y, which are not inputs of f_update.
    upnorm = lr * get_norms(gshared)
    f_update = theano.function([lr], [upnorm], updates=pup, profile=profile)

    return f_grad_shared, f_update
示例#13
0
def sgd(lr, tparams, grads, x, mask, y, cost, errors):
    """
    Build the training functions for plain stochastic gradient descent.

    Parameters
    ----------
    lr : theano variable
        Learning rate; the only input of the update function.
    tparams : dict
        Maps names to the shared parameters.
    grads : list
        Gradients, in the same order as `tparams`.
    x, mask, y : theano variables
        Inputs of the gradient function.
    cost, errors : theano expressions
        First two outputs of the gradient function.

    Returns
    -------
    f_grad_shared : function
        Takes ``(x, mask, y)``, returns ``[cost, errors, gnorm, pnorm]`` and
        writes the gradients into shared buffers.
    f_update : function
        Takes `lr`, moves every parameter by ``-lr * stored gradient`` and
        returns the norm of that step.
    """
    gshared = []
    for k, p in tparams.items():
        gshared.append(sharedX(p.get_value() * 0., name='%s_grad' % k))
    gsup = list(zip(gshared, grads))

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function([x, mask, y], [cost, errors, gnorm, pnorm],
                                    updates=gsup,
                                    profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    # Use the stored gradients here: the symbolic `gnorm` depends on
    # x, mask and y, none of which f_update receives.
    upnorm = lr * get_norms(gshared)
    f_update = theano.function([lr], [upnorm], updates=pup, profile=profile)

    return f_grad_shared, f_update
示例#14
0
    def __init__(self,
                 init_momentum=0.9,
                 averaging_coeff=0.99,
                 stabilizer=1e-4,
                 update_param_norm_ratio=0.003,
                 gradient_clipping=None):
        """
        Validate and store the hyper-parameters of the learning rule.

        `init_momentum` and `averaging_coeff` are converted to float and
        must lie in [0, 1]; `stabilizer` is converted to float and must be
        non-negative. The momentum is additionally kept in a shared
        variable, and `gradient_clipping`, when given, is cast to floatX.
        """
        init_momentum = float(init_momentum)
        assert init_momentum >= 0. and init_momentum <= 1.
        averaging_coeff = float(averaging_coeff)
        assert averaging_coeff >= 0. and averaging_coeff <= 1.
        stabilizer = float(stabilizer)
        assert stabilizer >= 0.

        # Every constructor argument is kept as an attribute of the same
        # name.
        self.init_momentum = init_momentum
        self.averaging_coeff = averaging_coeff
        self.stabilizer = stabilizer
        self.update_param_norm_ratio = update_param_norm_ratio

        self.momentum = sharedX(self.init_momentum)

        if gradient_clipping is None:
            self.gradient_clipping = None
        else:
            self.gradient_clipping = np.cast[config.floatX](gradient_clipping)
示例#15
0
    def __init__(self,
                 init_momentum,
                 averaging_coeff=0.95,
                 stabilizer=1e-2,
                 use_first_order=False,
                 bound_inc=False,
                 momentum_clipping=None):
        """
        Validate and store the hyper-parameters of the learning rule.

        `init_momentum` and `averaging_coeff` are converted to float and
        must lie in [0, 1]; `stabilizer` is converted to float and must be
        non-negative. The momentum is also wrapped in a shared variable,
        and `momentum_clipping`, when given, is cast to floatX.
        """
        init_momentum = float(init_momentum)
        assert init_momentum >= 0. and init_momentum <= 1.
        averaging_coeff = float(averaging_coeff)
        assert averaging_coeff >= 0. and averaging_coeff <= 1.
        stabilizer = float(stabilizer)
        assert stabilizer >= 0.

        # Each constructor argument becomes an attribute of the same name.
        self.init_momentum = init_momentum
        self.averaging_coeff = averaging_coeff
        self.stabilizer = stabilizer
        self.use_first_order = use_first_order
        self.bound_inc = bound_inc

        self.momentum = sharedX(self.init_momentum)

        if momentum_clipping is None:
            self.momentum_clipping = None
        else:
            self.momentum_clipping = np.cast[config.floatX](momentum_clipping)
示例#16
0
    def __init__(self,
                 init_momentum,
                 averaging_coeff=0.95,
                 stabilizer=1e-2,
                 use_first_order=False,
                 bound_inc=False,
                 momentum_clipping=None):
        """
        Check the hyper-parameters and keep them on the instance.

        The momentum and the averaging coefficient are coerced to float and
        asserted to be within [0, 1]; the stabilizer is coerced to float and
        asserted to be non-negative. A shared variable holding the momentum
        is created, and a clipping threshold, if provided, is cast to
        floatX.
        """
        init_momentum = float(init_momentum)
        assert init_momentum >= 0.
        assert init_momentum <= 1.
        self.init_momentum = init_momentum

        averaging_coeff = float(averaging_coeff)
        assert averaging_coeff >= 0.
        assert averaging_coeff <= 1.
        self.averaging_coeff = averaging_coeff

        stabilizer = float(stabilizer)
        assert stabilizer >= 0.
        self.stabilizer = stabilizer

        # Flags are stored as given.
        self.use_first_order = use_first_order
        self.bound_inc = bound_inc

        self.momentum = sharedX(self.init_momentum)

        clip = momentum_clipping
        if clip is not None:
            clip = np.cast[config.floatX](clip)
        self.momentum_clipping = clip
示例#17
0
    def __init__(self,
                 init_momentum=0.9,
                 averaging_coeff=0.99,
                 stabilizer=1e-4,
                 update_param_norm_ratio=0.003,
                 gradient_clipping=None):
        """
        Check the hyper-parameters and keep them on the instance.

        The momentum and the averaging coefficient are coerced to float and
        asserted to be within [0, 1]; the stabilizer is coerced to float and
        asserted to be non-negative. A shared variable holding the momentum
        is created, and a gradient clipping threshold, if provided, is cast
        to floatX.
        """
        init_momentum = float(init_momentum)
        assert init_momentum >= 0.
        assert init_momentum <= 1.
        self.init_momentum = init_momentum

        averaging_coeff = float(averaging_coeff)
        assert averaging_coeff >= 0.
        assert averaging_coeff <= 1.
        self.averaging_coeff = averaging_coeff

        stabilizer = float(stabilizer)
        assert stabilizer >= 0.
        self.stabilizer = stabilizer

        self.update_param_norm_ratio = update_param_norm_ratio
        self.momentum = sharedX(self.init_momentum)

        clip = gradient_clipping
        if clip is not None:
            clip = np.cast[config.floatX](clip)
        self.gradient_clipping = clip
示例#18
0
    def __init__(self, decay=0.95,
                 gamma_clip=0.0,
                 grad_clip=None,
                 start_var_reduction=0,
                 delta_clip=None,
                 gamma_reg=1e-6,
                 slow_decay=0.995,
                 learning_rate=1.0,
                 use_adagrad=False,
                 perform_update=True,
                 skip_nan_inf=False,
                 use_corrected_grad=True):
        """
        Store the hyper-parameters of the learning rule.

        `decay` must lie in [0, 1) and is wrapped in a shared variable named
        "decay"; every other argument is stored unchanged under its own
        name.
        """
        assert decay >= 0. and decay < 1.

        # Decay rates.
        self.decay = sharedX(decay, "decay")
        self.slow_decay = slow_decay

        # Clipping thresholds and regularisation.
        self.gamma_clip = gamma_clip
        self.grad_clip = grad_clip
        self.delta_clip = delta_clip
        self.gamma_reg = gamma_reg

        # Behaviour switches.
        self.use_adagrad = use_adagrad
        self.use_corrected_grad = use_corrected_grad
        self.perform_update = perform_update
        self.skip_nan_inf = skip_nan_inf
        self.start_var_reduction = start_var_reduction

        self.learning_rate = learning_rate
        self.damping = 1e-7

        # Tau is kept within fixed bounds: left unbounded it can grow
        # arbitrarily large, which causes numerical instabilities for very
        # deep networks, and once it is very large it keeps increasing
        # indefinitely.
        self.lower_bound_tau = 1.5
        self.upper_bound_tau = 1e7
示例#19
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Provides the updates for learning with gradient descent + momentum.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict, optional
            A dictionary mapping from the model's parameters to a learning
            rate multiplier. Parameters that are absent use 1.; ``None``
            (the default) is treated as an empty dictionary.

        Returns
        -------
        updates : OrderedDict
            Updates for every parameter and for the velocity variable
            created for it.
        """
        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()

        for (param, grad) in six.iteritems(grads):
            vel = sharedX(param.get_value() * 0.)
            assert param.dtype == vel.dtype
            assert grad.dtype == param.dtype
            if param.name is not None:
                vel.name = 'vel_' + param.name

            scaled_lr = learning_rate * lr_scalers.get(param, 1.)
            updates[vel] = self.momentum * vel - scaled_lr * grad

            inc = updates[vel]
            if self.nesterov_momentum:
                # Nesterov variant: apply the momentum once more on top of
                # the freshly updated velocity.
                inc = self.momentum * inc - scaled_lr * grad

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        return updates
示例#20
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Compute RMSProp-with-momentum updates.

        Each gradient is divided by a running RMS estimate (floored at
        ``self.stabilizer``) before it enters the momentum update. When
        ``self.use_first_order`` is set, the squared running mean of the
        gradient is subtracted from the running mean of its square first.
        When ``self.momentum_clipping`` is set, the steps are rescaled so
        that their global norm does not exceed that threshold;
        ``self.bound_inc`` additionally stores the rescaled step as the new
        velocity.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict, optional
            Accepted for interface compatibility; not used by this rule.

        Returns
        -------
        updates : OrderedDict
            Updates for the parameters, velocities and running averages.
        tot_norm_up : theano expression
            Sum over parameters of the L2 norm of the applied step.
        tot_param_norm : theano expression
            Sum over parameters of the L2 norm of each parameter.
        """
        updates = OrderedDict()
        velocity = OrderedDict()
        tot_norm_up = 0
        tot_param_norm = 0

        for param in grads.keys():

            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
                (1 - self.averaging_coeff) * T.sqr(grads[param])
            if self.use_first_order:
                avg_grad = sharedX(np.zeros_like(param.get_value()))
                if param.name is not None:
                    avg_grad.name = 'avg_grad_' + param.name
                new_avg_grad = self.averaging_coeff * avg_grad +\
                    (1 - self.averaging_coeff) * grads[param]
                rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
                updates[avg_grad] = new_avg_grad
            else:
                rms_grad_t = T.sqrt(new_avg_grad_sqr)
            # Floor the denominator to avoid dividing by tiny values.
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            normalized_grad = grads[param] / (rms_grad_t)
            new_velocity = self.momentum * velocity[param] -\
                learning_rate * normalized_grad
            tot_norm_up += new_velocity.norm(2)
            tot_param_norm += param.norm(2)

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + new_velocity

        if self.momentum_clipping is not None:
            # The steps change below, so their total norm is recomputed.
            tot_norm_up = 0

            new_mom_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [updates[velocity[param]] for param in grads.keys()]))
            new_mom_norm = T.sqrt(new_mom_norm)
            scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
            scaling_num = self.momentum_clipping

            for param in grads.keys():
                update_step = updates[velocity[param]] * (scaling_num /
                                                          scaling_den)
                if self.bound_inc:
                    # Also keep the rescaled step as the stored velocity.
                    updates[velocity[param]] = update_step
                # Accumulate in both branches so the reported norm is not
                # left at zero when bound_inc is set.
                tot_norm_up += update_step.norm(2)
                updates[param] = param + update_step

        return updates, tot_norm_up, tot_param_norm
def train(dim_word_desc=400,# word vector dimensionality
          dim_word_q=400,
          dim_word_ans=600,
          dim_proj=300,
          dim=400,# the number of LSTM units
          encoder_desc='lstm',
          encoder_desc_word='lstm',
          encoder_desc_sent='lstm',
          use_dq_sims=False,
          eyem=None,
          learn_h0=False,
          use_desc_skip_c_g=False,
          debug=False,
          encoder_q='lstm',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_q=49145,
          n_words_desc=115425,
          n_words_ans=409,
          pkl_train_files=None,
          pkl_valid_files=None,
          maxlen=2000, # maximum length of the description
          optimizer='rmsprop',
          batch_size=2,
          vocab=None,
          valid_batch_size=16,
          use_elu_g=False,
          saveto='model.npz',
          model_dir=None,
          ms_nlayers=3,
          validFreq=1000,
          saveFreq=1000, # save the parameters after every saveFreq updates
          datasets=[None],
          truncate=400,
          momentum=0.9,
          use_bidir=False,
          cost_mask=None,
          valid_datasets=['/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
                          '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'],
          dropout_rate=0.5,
          use_dropout=True,
          reload_=True,
          **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open(os.path.join(mpath_best, '%s.pkl' % mpath_best), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv ** 2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c,
                                     g)
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    learning_rule = None

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr,
                                                  tparams,
                                                  grads,
                                                  inps,
                                                  cost,
                                                  errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(d_,
                                                                            q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d,
                                                           d_mask,
                                                           q,
                                                           q_mask,
                                                           a,
                                                           dlen,
                                                           slen,
                                                           qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask,
                                                           q, q_mask,
                                                           a,
                                                           dlen,
                                                           qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave,
                                         cost)
            train_err_ave = running_ave(train_err_ave,
                                        errors)
            train_gnorm_ave = running_ave(train_gnorm_ave,
                                          gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb; ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best, history_errs=history_errs, **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate([va.argmax(0) for va  in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    bst_found = False

                if numpy.isnan(valid_err):
                    import ipdb; ipdb.set_trace()


                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()


        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_rep'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    params = copy.copy(best_p)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
示例#22
0
    def __call__(self, probs,
                 samples,
                 updates,
                 cost=None,
                 mask=None,
                 deterministic=False,
                 child_probs=None,
                 dimshuffle_probs=False,
                 child_samples=None):
        """
        Build a hand-made gradient for ``probs`` from a cost centred by a
        running-average baseline, plus a ``log(p) + 1`` term.

        Running statistics are looked up in ``updates`` by name with
        ``get_key_byname_from_dict`` ("baseline", "step", "cost_var"); any
        that are missing are created as shared variables and their update
        expressions are added to ``updates`` in place.

        Parameters
        ----------
        probs : symbolic tensor
            3-D or 4-D probabilities; used as the key of ``known_grads``.
        samples : symbolic tensor
            Multiplied element-wise by ``1 / (probs + 1e-8)``.
        updates : dict
            Mapping from shared variables to update expressions; read and
            extended in place.
        cost : symbolic tensor
            Required. 1-D or 2-D for 3-D ``probs``; for 4-D ``probs`` any
            rank other than 1 is treated as 2-D.
        mask : symbolic tensor, optional
            If given, broadcast over the last axis and multiplied into the
            gradient (additionally divided by the size of the last axis of
            ``probs`` when ``dimshuffle_probs`` is False).
        deterministic : bool
            Not used by this method.
        child_probs, child_samples : symbolic tensors, optional
            When both are given, ``child_samples / child_probs`` is added
            to ``samples / probs``.
        dimshuffle_probs : bool
            If True, axes 1 and 2 of ``probs`` are swapped while the
            gradient is built and swapped back afterwards.

        Returns
        -------
        tuple
            ``(updates, known_grads, rbaseline, cost_std, policy,
            lambda2_reg)``, where ``known_grads`` is ``{probs: gradp}``.

        Raises
        ------
        ValueError
            If ``cost`` is None, or if the ranks of ``probs`` and ``cost``
            are not one of the supported combinations.
        """
        if cost is None:
            raise ValueError("input for the %s should "
                             " not be empty." % self.__class__.__name__)

        key_baseline = get_key_byname_from_dict(updates, "baseline")
        key_step = get_key_byname_from_dict(updates, "step")

        if key_baseline:
            # Reuse the baseline update another caller already registered.
            rbaseline = updates[key_baseline]
            # Needed below if the cost variance has to be created here.
            new_baseline = rbaseline
        else:
            if self.generative_pred:
                baseline = sharedX(np.zeros((self.maxlen,)) + 1.0 + self.eps, name="baseline")
            else:
                baseline = sharedX(0. + 1.0 + self.eps, name="new_baseline")

            if key_step:
                step = updates[key_step]
            else:
                step = sharedX(0., name="step")
                updates[step] = step + as_floatX(1)

            if self.generative_pred:
                cost_stat = cost.mean(-1)
            else:
                cost_stat = cost.mean()
            if self.use_rms_baseline:
                cost_stat = cost_stat**2

            new_baseline = as_floatX(self.decay) * baseline + \
                as_floatX(1 - self.decay) * cost_stat
            updates[baseline] = new_baseline

            if self.use_rms_baseline:
                # The correction must be built from the resolved step
                # variable so that it changes as the step counter advances.
                fix_decay = self.decay**(step + as_floatX(1))
                rbaseline = TT.sqrt(new_baseline / (1 - fix_decay))
            else:
                rbaseline = new_baseline

        key_cvar = get_key_byname_from_dict(updates, "cost_var")

        if key_cvar:
            new_cost_var = updates[key_cvar]
        else:
            if self.generative_pred:
                cost_var = sharedX(np.zeros((self.maxlen,)) + as_floatX(1.2), name="cost_var")
                cost_var_ave = (cost.mean(-1) - new_baseline)**2
            else:
                cost_var = sharedX(as_floatX(1.2), name="cost_var")
                cost_var_ave = (cost.mean() - new_baseline)**2

            new_cost_var = as_floatX(self.decay) * cost_var + as_floatX(1 - self.decay) * cost_var_ave
            updates[cost_var] = new_cost_var

        # No schedule is applied: the coefficient is passed through as is.
        lambda2_reg = self.lambda2_reg

        if dimshuffle_probs:
            probsd = probs.dimshuffle(0, 2, 1)
        else:
            probsd = probs

        # Add broadcastable axes to the cost (and, for generative_pred, to
        # the running statistics) so they line up with probsd.
        if probs.ndim == 3 and cost.ndim == 1:
            if dimshuffle_probs:
                reward = cost.dimshuffle('x', 'x', 0)
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle('x', 'x', 0)
                    # NOTE(review): this value is overwritten below before
                    # it is used; the 2-D cost branches reassign
                    # new_cost_var instead -- confirm which is intended.
                    cost_std = new_cost_var.dimshuffle('x', 'x', 0)
            else:
                reward = cost.dimshuffle('x', 0, 'x')
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle('x', 0, 'x')
                    # NOTE(review): overwritten below, see the note above.
                    cost_std = new_cost_var.dimshuffle('x', 0, 'x')
        elif probs.ndim == 3 and cost.ndim == 2:
            if dimshuffle_probs:
                reward = cost.dimshuffle(0, 'x', 1)
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle(0, 'x', 'x')
                    new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
            else:
                reward = cost.dimshuffle('x', 0, 1)
                if self.generative_pred:
                    rbaseline = rbaseline.dimshuffle('x', 0, 'x')
                    new_cost_var = new_cost_var.dimshuffle('x', 0, 'x')
        elif probs.ndim == 4 and cost.ndim == 1:
            reward = cost.dimshuffle('x', 'x', 0, 'x')
        elif probs.ndim == 4:
            reward = cost.dimshuffle(0, 'x', 1, 'x')
        else:
            raise ValueError("Unsupported ranks: probs.ndim=%d, cost.ndim=%d"
                             % (probs.ndim, cost.ndim))

        centered_cost = reward - rbaseline
        N = probsd.shape[-1]
        if self.use_cost_std:
            # Never scale the centred cost up: the divisor is at least 1.
            cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
        else:
            cost_std = 1

        if child_probs is not None and child_samples is not None:
            cprobs1 = child_samples / (child_probs + 1e-8) + samples / (probsd + 1e-8)
        else:
            cprobs1 = samples / (probsd + 1e-8)

        # log(p) + 1 is the derivative of p * log(p) with respect to p.
        gradp = self.lambda1_reg * (centered_cost / cost_std) * \
                (cprobs1) + (lambda2_reg) * (TT.log(probsd + 1e-8) + as_floatX(1))

        if dimshuffle_probs:
            # Back to the axis order of `probs`.
            gradp = gradp.dimshuffle(0, 2, 1)

        if mask is not None:
            if dimshuffle_probs:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp
            else:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

        known_grads = {probs: gradp}
        policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()
        return updates, known_grads, rbaseline, cost_std, policy, lambda2_reg
示例#23
0
    def __call__(self, probs,
                 samples,
                 baseline,
                 updates,
                 cost = None,
                 cost_mean=None,
                 mask=None,
                 seq_len=20,
                 batch_size=140,
                 deterministic=False,
                 dimshuffle_probs=True):
        """
        Build a hand-made gradient for ``probs`` from a cost centred by an
        externally supplied ``baseline`` plus a running-average centre.

        Running statistics are looked up in ``updates`` by name with
        ``get_key_byname_from_dict`` ("step", "center", "cost_var"); any
        that are missing are created as shared variables and their update
        expressions are added to ``updates`` in place.

        Parameters
        ----------
        probs : symbolic tensor
            Probabilities; used as the key of ``known_grads``.
        samples : symbolic tensor
            3-D or 4-D; multiplied element-wise by ``1 / (probs + 1e-8)``.
        baseline : symbolic tensor
            Subtracted from the reward together with the running centre.
        updates : dict
            Mapping from shared variables to update expressions; read and
            extended in place.
        cost : symbolic tensor
            Required. Must be 1-D or 2-D when ``samples`` is not 4-D.
        cost_mean : symbolic scalar, optional
            Defaults to ``cost.mean()``.
        mask : symbolic tensor, optional
            If given, broadcast over the last axis and multiplied into the
            gradient (additionally divided by the size of the last axis of
            ``probs`` when ``self.generative_pred`` is set).
        seq_len, batch_size, deterministic
            Not used by this method.
        dimshuffle_probs : bool
            If True, axes 1 and 2 of ``probs`` are swapped while the
            gradient is built and swapped back afterwards.

        Returns
        -------
        tuple
            ``(updates, known_grads, new_center, cost_std, policy,
            lambda2_reg)``, where ``known_grads`` is ``{probs: gradp}``.

        Raises
        ------
        ValueError
            If ``cost`` is None, or if ``samples`` is not 4-D and ``cost``
            is neither 1-D nor 2-D.
        """
        print("Using the input based baseline")
        if cost is None:
            raise ValueError("input for the %s should"
                             " not be empty." % self.__class__.__name__)

        if cost_mean is None:
            cost_mean = cost.mean()

        # Make sure a step counter is registered; its value is not read
        # again in this method.
        key_step = get_key_byname_from_dict(updates, "step")
        if key_step:
            step = updates[key_step]
        else:
            step = sharedX(0., name="step")
            updates[step] = step + as_floatX(1)

        key_center = get_key_byname_from_dict(updates, "center")
        if key_center:
            new_center = updates[key_center]
        else:
            if self.generative_pred:
                center = sharedX(np.zeros((self.maxlen,)) + 0.15 + self.eps, name="center")
                center_target = cost.mean(-1)
            else:
                center = sharedX(0.15 + self.eps, name="center")
                center_target = cost_mean
            new_center = as_floatX(self.decay) * center + \
                as_floatX(1 - self.decay) * center_target
            updates[center] = new_center

        key_cvar = get_key_byname_from_dict(updates, "cost_var")
        if key_cvar:
            new_cost_var = updates[key_cvar]
        else:
            if self.generative_pred:
                cost_var = sharedX(np.zeros((self.maxlen,)) + as_floatX(1.0), name="cost_var")
            else:
                cost_var = sharedX(1.0, name="cost_var")

            cost_var_tot = (cost_mean - new_center)**2
            new_cost_var = as_floatX(self.decay) * cost_var + as_floatX(1 - self.decay) * \
                    cost_var_tot
            updates[cost_var] = new_cost_var

        # No schedule is applied: the coefficient is passed through as is.
        lambda2_reg = self.lambda2_reg

        if dimshuffle_probs:
            probsd = probs.dimshuffle(0, 2, 1)
        else:
            probsd = probs

        if samples.ndim == 4:
            reward = cost.dimshuffle(0, 'x', 1, 'x')
            policy = -(TT.log(probsd + 1e-8) * samples).mean((2, 3)).sum()
        else:
            if cost.ndim == 2:
                if dimshuffle_probs:
                    reward = cost.dimshuffle(0, 'x', 1)
                    if self.generative_pred:
                        new_center = new_center.dimshuffle(0, 'x', 'x')
                        new_cost_var = new_cost_var.dimshuffle(0, 'x', 'x')
                    baseline = baseline.dimshuffle(0, 2, 1)
                else:
                    reward = cost.dimshuffle(0, 1, 'x')
            elif cost.ndim == 1:
                reward = cost.dimshuffle('x', 0, 'x')
                if dimshuffle_probs:
                    baseline = baseline.dimshuffle(0, 2, 1)
                else:
                    baseline = baseline.dimshuffle(1, 0, 2)
            else:
                raise ValueError("Unsupported cost rank: cost.ndim=%d"
                                 % cost.ndim)
            # Same expression for every non-4-D case, so it is always
            # defined by the time it is returned.
            policy = -(TT.log(probsd + 1e-8) * samples).mean((1, 2)).sum()

        # Never scale the centred reward up: the divisor is at least 1.
        cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1.0)
        centered_reward = (reward - baseline - new_center) / cost_std

        if cost.ndim == 2:
            centered_reward = TT.addbroadcast(centered_reward, 1)

        N = probs.shape[-1]
        # log(p) + 1 is the derivative of p * log(p) with respect to p.
        gradp = self.lambda1_reg * (centered_reward) * \
                (samples / (probsd + 1e-8)) + lambda2_reg * (TT.log(probsd + 1e-6) + as_floatX(1))

        if dimshuffle_probs:
            # Back to the axis order of `probs`.
            gradp = gradp.dimshuffle(0, 2, 1)

        if mask is not None:
            if self.generative_pred:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp / N
            else:
                gradp = mask.dimshuffle(0, 1, 'x') * gradp

        known_grads = {probs: gradp}
        return updates, known_grads, new_center, cost_std, policy, lambda2_reg
示例#24
0
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        Compile the two Theano functions of an Adam-style update (momentum
        and squared-gradient averages with bias correction).

        Parameters
        ----------
        learning_rate : symbolic scalar
            Step size; it is the only input of the returned ``f_update``.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients. It is not modified.
        inp : list
            Inputs of the returned ``f_grad_shared``.
        cost, errors : symbolic expressions
            Returned by ``f_grad_shared`` together with the gradient and
            parameter norms.
        lr_scalers : dict
            Not used by this method.

        Returns
        -------
        f_grad_shared : theano function
            Maps ``inp`` to ``[cost, errors, gnorm, pnorm]`` and stores the
            (optionally clipped) gradients in per-parameter shared buffers.
        f_update : theano function
            Takes the learning rate, applies one update from the stored
            gradients and returns ``[tot_norm_up]``, the sum of the L2
            norms of the per-parameter steps scaled by the learning rate.
        """
        if self.gradient_clipping is not None:
            grads_norm = T.sqrt(sum(T.sqr(g).sum() for g in grads.values()))
            scaling_den = T.maximum(self.gradient_clipping, grads_norm)
            scaling_num = self.gradient_clipping
            # Rescale into a new mapping so the caller's dict is untouched.
            grads = OrderedDict((p, scaling_num * g / scaling_den)
                                for p, g in grads.iteritems())

        updates = OrderedDict()
        velocity = OrderedDict()

        counter = sharedX(0, 'counter')
        tot_norm_up = 0
        # One shared buffer per parameter, in the iteration order of `grads`.
        gshared = OrderedDict(
            (p, sharedX(p.get_value() * 0., name='%s_grad' % p.name))
            for p in grads)

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        # Bias-correction terms are the same for every parameter.
        next_counter = counter + 1.
        fix_first_moment = 1. - self.momentum**next_counter
        fix_second_moment = 1. - self.averaging_coeff**next_counter

        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff*avg_grad_sqr \
                + (1 - self.averaging_coeff)*T.sqr(gshared[param])

            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            # Lower-bound the denominator to avoid dividing by ~0.
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            # The minus sign makes the velocity a descent direction.
            new_velocity = self.momentum * velocity[param] \
                - (1 - self.momentum) * gshared[param]
            normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
                / (rms_grad_t * fix_first_moment)

            tot_norm_up += learning_rate * normalized_velocity.norm(2)

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            # Scale the step by the learning rate, matching the norm
            # reported in tot_norm_up.
            updates[param] = param + learning_rate * normalized_velocity

        updates[counter] = counter + 1
        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#25
0
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        Compile the two Theano functions of the AdaDelta update.

        Parameters
        ----------
        learning_rate : symbolic scalar
            Only input of the returned ``f_update``. It is added inside the
            square roots as the smoothing constant rather than multiplying
            the step.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        inp : list
            Inputs of the returned ``f_grad_shared``.
        cost, errors : symbolic expressions
            Returned by ``f_grad_shared`` together with the gradient and
            parameter norms.
        lr_scalers : dict
            Not used by this method.

        Returns
        -------
        f_grad_shared : theano function
            Maps ``inp`` to ``[cost, errors, gnorm, pnorm]`` and stores the
            gradients in per-parameter shared buffers.
        f_update : theano function
            Applies one update from the stored gradients and returns
            ``[tot_norm_up]``, the sum of the L2 norms of the steps.
        """

        def l2_norm(tensors):
            # Square root of the summed squares over all given tensors.
            return T.sqrt(sum([(t**2).sum() for t in tensors]))

        # Shared buffers that hold the latest gradient of each parameter.
        grad_buffers = OrderedDict({
            p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
            for p in grads
        })
        store_grads = [(grad_buffers[p], g) for p, g in grads.iteritems()]

        gnorm = l2_norm(grads.values())
        pnorm = l2_norm(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=store_grads)

        updates = OrderedDict()
        step_norms = []
        keep = 1 - self.decay

        for param in grad_buffers.keys():
            stored_grad = grad_buffers[param]

            # Running averages E[g^2] and E[(\Delta x)^2] from step t-1.
            acc_grad_sq = sharedX(param.get_value() * 0.)
            acc_dx_sq = sharedX(param.get_value() * 0.)
            if param.name is not None:
                acc_grad_sq.name = 'mean_square_grad_' + param.name
                acc_dx_sq.name = 'mean_square_dx_' + param.name

            # Fold the current gradient into E[g^2].
            next_grad_sq = (self.decay * acc_grad_sq +
                            keep * T.sqr(stored_grad))

            # Step: ratio of the two RMS values times the negative gradient.
            smoothing = learning_rate
            rms_prev_dx = T.sqrt(acc_dx_sq + smoothing)
            rms_grad = T.sqrt(next_grad_sq + smoothing)
            delta = -rms_prev_dx / rms_grad * stored_grad

            # Fold the step into E[(\Delta x)^2].
            next_dx_sq = (self.decay * acc_dx_sq +
                          keep * T.sqr(delta))

            updates[acc_grad_sq] = next_grad_sq
            updates[acc_dx_sq] = next_dx_sq
            updates[param] = param + delta

            step_norms.append(delta.norm(2))

        tot_norm_up = sum(step_norms)
        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#26
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Build the symbolic per-parameter updates of this learning rule.

        For every parameter a set of shared accumulators is allocated
        (running means of the gradient, of the squared gradient, of the
        difference between consecutive gradients -- called "curvature"
        below --, of the past steps, and a per-element memory size ``tau``).
        The expressions that refresh those accumulators and move the
        parameter are collected in an ordered dictionary of updates.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient. It is not used by this rule (the step
            size is adapted from the accumulated statistics) and is only
            accepted because the calling framework requires it.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients. When ``self.grad_clip`` is set and
            ``self.skip_nan_inf`` is not, the entries of this very dictionary
            are replaced by their clipped versions. The gradient variables
            are also renamed to ``grad_<param name>``.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier. Currently ignored.

        Returns
        -------
        updates : OrderedDict
            Maps every shared accumulator (and, when ``self.perform_update``
            is set, every parameter) to its new symbolic value.
        tot_norm_up : symbolic scalar
            Sum over parameters of the L2 norm of the applied step.
        tot_param_norm : symbolic scalar
            Sum over parameters of the L2 norm of the parameter.
        """

        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            # Zero out, element-wise, every gradient entry that is inf or
            # nan so that those entries do not contribute to the update.
            # That might be useful for RNNs.
            # Built from a sequence of pairs (not from a dict literal) so
            # that the iteration order of ``grads`` is preserved.
            grads = OrderedDict(
                (p, T.switch(T.or_(T.isinf(g), T.isnan(g)), 0, g))
                for p, g in grads.items())

        #Block-normalize gradients:
        nparams = len(grads.keys())

        #Apply the gradient clipping, this is only sometimes
        #necessary for RNNs and sometimes for very deep networks
        if self.grad_clip:
            assert self.grad_clip > 0.
            assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."

            gnorm = sum([g.norm(2) for g in grads.values()])
            notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

            # Iterate over a snapshot because the entries are rewritten
            # inside the loop.
            for p, g in list(grads.items()):
                # Rescale when the average per-parameter norm exceeds the
                # threshold.
                tmpg = T.switch(gnorm / nparams > self.grad_clip,
                                g * self.grad_clip * nparams / gnorm, g)
                # If the total norm is not finite, use a scaled copy of the
                # parameter itself instead of the gradient.
                grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

        tot_norm_up = 0
        tot_param_norm = 0

        # slow_decay^(t+1); used to de-bias the running squared-norm.
        fix_decay = self.slow_decay**(step + 1)
        for param in grads.keys():
            grads[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps,
                                name="mean_grad_%s" % param.name)
            mean_corrected_grad = sharedX(param.get_value() * 0 + eps,
                                          name="mean_corrected_grad_%s" %
                                          param.name)
            gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

            prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                                name="prod_taus_x_t_" + param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                          name="sum_square_grad_%s" %
                                          param.name)

            # ---- Initialization of accumulators ----
            taus_x_t = sharedX(
                (np.ones_like(param.get_value()) + eps) * slow_constant,
                name="taus_x_t_" + param.name)
            # NOTE: overwritten on every iteration, so after the loop this
            # attribute refers to the memory size of the last parameter only.
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.,
                                     name="msd_" + param.name)

            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)

            #The uncorrected gradient of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)
            if not self.use_corrected_grad:
                # The gamma statistics below always read ``old_grad``.
                # Without this fallback the method raised a NameError when
                # ``use_corrected_grad`` was disabled; the previous plain
                # gradient is used in its place.
                # NOTE(review): confirm this is the intended substitute.
                old_grad = old_plain_grad
            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = grads[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            gnorm = T.sqr(norm_grad).sum()

            # ``cond`` is 1 on the very first step and 0 afterwards.
            cond = T.eq(step, 0)
            gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
            gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

            norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
            msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
            mdx = cond * norm_grad + (1 - cond) * mean_dx

            new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))

            # ---- Compute the new updated values ----
            # E[g_i^2]_t
            new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                     T.sqr(norm_grad) / (taus_x_t))
            new_mean_squared_grad.name = "msg_" + param.name

            # E[g_i]_t
            new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                             norm_grad / taus_x_t)

            new_mean_grad.name = "nmg_" + param.name
            # Running means divided by (1 - prod of decay factors).
            mg = new_mean_grad / (1 - new_prod_taus)
            mgsq = new_mean_squared_grad / (1 - new_prod_taus)

            new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                             T.sqr(norm_grad).sum() * (1 - self.slow_decay))

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (mg - norm_grad) * (old_grad - mg)) / taus_x_t)

            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            # gamma is computed from the accumulators of the previous step
            # (not from the freshly updated ones).
            gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                    self.gamma_reg)

            gamma.name = "gamma_" + param.name

            if self.gamma_clip and self.gamma_clip > -1:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                cond = T.le(self.start_var_reduction, step)
                corrected_grad = cond * corrected_grad_cand + (
                    1 - cond) * norm_grad
            else:
                corrected_grad = norm_grad

            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (sum_square_grad + T.sqr(g))
                rms_g_t = T.sqrt(new_sum_squared_grad)
                # Never divide the step by less than one.
                rms_g_t = T.maximum(rms_g_t, 1.0)

            #Use the gradients from the previous update
            #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                                 (cur_curvature / taus_x_t))
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Average average curvature
            nc_ave = new_curvature_ave / (1 - new_prod_taus)

            new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                     (cur_curvature_sqr / taus_x_t))
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

            epsilon = 1e-7
            # The step is not scaled by the learning rate / lr_scalers:
            #lr_scalers.get(param, 1.) * learning_rate
            scaled_lr = sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t /
                                      (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                logger.info(
                    "Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                logger.info(
                    "Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(
                1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                                  (T.sqr(delta_x_t) / taus_x_t))

            #To compute the E[\Delta]_t
            new_mean_dx = (mdx * (1 - 1 / taus_x_t) + (delta_x_t / (taus_x_t)))

            #Perform the outlier detection:
            #This outlier detection is slightly different:
            # An element counts as an outlier when its gradient or its
            # curvature lies more than two standard deviations away from
            # the corresponding running mean; its memory size is then reset
            # to 2.5 (if larger) or increased by 1 + eps.
            new_taus_t = T.switch(
                T.or_(
                    abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                    abs(cur_curvature - nc_ave) >
                    (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                T.switch(new_taus_t > 2.5, sharedX(2.5),
                         new_taus_t + sharedX(1.0) + eps), new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                             (delta_x_t * cur_curvature) * (1 / taus_x_t))

            update_step = delta_x_t

            tot_norm_up += update_step.norm(2)
            tot_param_norm += param.norm(2)

            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gnorm_sqr] = new_gnorm_sqr
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave

            if self.perform_update:
                updates[param] = param + update_step

            # The step counter is shared by all parameters; re-registering
            # the same expression on every iteration is harmless.
            updates[step] = step + 1
            updates[prod_taus] = new_prod_taus

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        return updates, tot_norm_up, tot_param_norm
def train(
        dim_word_desc=400,  # word vector dimensionality
        dim_word_q=400,
        dim_word_ans=600,
        dim_proj=300,
        dim=400,  # the number of LSTM units
        encoder_desc='lstm',
        encoder_desc_word='lstm',
        encoder_desc_sent='lstm',
        use_dq_sims=False,
        eyem=None,
        learn_h0=False,
        use_desc_skip_c_g=False,
        debug=False,
        encoder_q='lstm',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        clip_c=-1.,
        lrate=0.01,
        n_words_q=49145,
        n_words_desc=115425,
        n_words_ans=409,
        pkl_train_files=None,
        pkl_valid_files=None,
        maxlen=2000,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=2,
        vocab=None,
        valid_batch_size=16,
        use_elu_g=False,
        saveto='model.npz',
        model_dir=None,
        ms_nlayers=3,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        datasets=[None],
        truncate=400,
        momentum=0.9,
        use_bidir=False,
        cost_mask=None,
        valid_datasets=[
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'
        ],
        dropout_rate=0.5,
        use_dropout=True,
        reload_=True,
        **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open(os.path.join(mpath_best, '%s.pkl' % mpath_best),
                      'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv**2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g)
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    learning_rule = None

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps,
                                                  cost, errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(
                    d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best,
                                history_errs=history_errs,
                                **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(
                        history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    bst_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_rep'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    params = copy.copy(best_p)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
示例#28
0
    def __init__(self,
                 n_in,
                 n_hids,
                 n_out,
                 mem_size,
                 mem_nel,
                 deep_out_size,
                 bow_size=40,
                 inps=None,
                 dropout=None,
                 predict_bow_out=False,
                 seq_len=None,
                 n_read_heads=1,
                 n_layers=1,
                 n_write_heads=1,
                 train_profile=False,
                 erase_activ=None,
                 content_activ=None,
                 l1_pen=None,
                 l2_pen=None,
                 use_reinforce=False,
                 use_reinforce_baseline=False,
                 n_reading_steps=2,
                 use_gru_inp_rep=False,
                 use_simple_rnn_inp_rep=False,
                 use_nogru_mem2q=False,
                 sub_mb_size=40,
                 lambda1_rein=2e-4,
                 lambda2_rein=2e-5,
                 baseline_reg=1e-2,
                 anticorrelation=None,
                 use_layer_norm=False,
                 recurrent_dropout_prob=-1,
                 correlation_ws=None,
                 hybrid_att=True,
                 max_fact_len=7,
                 use_dice_val=False,
                 use_qmask=False,
                 renormalization_scale=4.8,
                 w2v_embed_scale=0.42,
                 emb_scale=0.32,
                 use_soft_att=False,
                 use_hard_att_eval=False,
                 use_batch_norm=False,
                 learning_rule=None,
                 use_loc_based_addressing=True,
                 smoothed_diff_weights=False,
                 use_multiscale_shifts=True,
                 use_ff_controller=False,
                 use_gate_quad_interactions=False,
                 permute_order=False,
                 wpenalty=None,
                 noise=None,
                 w2v_embed_path=None,
                 glove_embed_path=None,
                 learn_embeds=True,
                 use_last_hidden_state=False,
                 use_adv_indexing=False,
                 use_bow_input=True,
                 use_out_mem=True,
                 use_deepout=True,
                 use_q_mask=False,
                 use_inp_content=True,
                 rnd_indxs=None,
                 address_size=0,
                 learn_h0=False,
                 use_context=False,
                 debug=False,
                 controller_activ=None,
                 mem_gater_activ=None,
                 weight_initializer=None,
                 bias_initializer=None,
                 use_cost_mask=True,
                 use_bow_cost_mask=True,
                 theano_function_mode=None,
                 batch_size=32,
                 use_noise=False,
                 reinforce_decay=0.9,
                 softmax=False,
                 use_mask=False,
                 name="ntm_model",
                 **kwargs):

        assert deep_out_size is not None, ("Size of the deep output "
                                           " should not be None.")

        if sub_mb_size is None:
            sub_mb_size = batch_size

        assert sub_mb_size <= batch_size, "batch_size should be greater than sub_mb_size"
        self.hybrid_att = hybrid_att

        self.state = locals()
        self.use_context = use_context
        self.eps = 1e-8
        self.use_mask = use_mask
        self.l1_pen = l1_pen
        self.l2_pen = l2_pen
        self.l2_penalizer = None
        self.emb_scale = emb_scale
        self.w2v_embed_path = w2v_embed_path
        self.glove_embed_path = glove_embed_path
        self.learn_embeds = learn_embeds
        self.exclude_params = {}

        self.use_gate_quad_interactions = use_gate_quad_interactions
        self.reinforce_decay = reinforce_decay
        self.max_fact_len = max_fact_len
        self.lambda1_reinf = lambda1_rein
        self.lambda2_reinf = lambda2_rein
        self.use_reinforce_baseline = use_reinforce_baseline
        self.use_reinforce = use_reinforce
        self.use_gru_inp_rep = use_gru_inp_rep
        self.use_simple_rnn_inp_rep = use_simple_rnn_inp_rep
        self.use_q_mask = use_q_mask
        self.use_inp_content = use_inp_content
        self.rnd_indxs = rnd_indxs

        self.use_layer_norm = use_layer_norm
        self.recurrent_dropout_prob = recurrent_dropout_prob

        self.n_reading_steps = n_reading_steps
        self.sub_mb_size = sub_mb_size
        self.predict_bow_out = predict_bow_out
        self.correlation_ws = correlation_ws
        self.smoothed_diff_weights = smoothed_diff_weights
        self.use_soft_att = use_soft_att
        self.use_hard_att_eval = use_hard_att_eval

        if anticorrelation and n_read_heads < 2:
            raise ValueError("Anti-correlation of the attention weight"
                             " do not support the multiple read heads.")

        self.anticorrelation = anticorrelation

        if self.predict_bow_out:
            if len(inps) <= 4:
                raise ValueError(
                    "The number of inputs should be greater than 4.")

        if l2_pen:
            self.l2_penalizer = L2Penalty(self.l2_pen)

        #assert use_bow_input ^ use_gru_inp_rep ^ self.use_simple_rnn_inp_rep, \
        #        "You should either use GRU or BOW input."

        self.renormalization_scale = renormalization_scale
        self.w2v_embed_scale = w2v_embed_scale

        self.baseline_reg = baseline_reg
        self.inps = inps
        self.erase_activ = erase_activ
        self.use_ff_controller = use_ff_controller
        self.content_activ = content_activ
        self.use_bow_cost_mask = use_bow_cost_mask
        self.ntm_outs = None
        self.theano_function_mode = theano_function_mode
        self.n_in = n_in
        self.dropout = dropout
        self.wpenalty = wpenalty
        self.noise = noise
        self.bow_size = bow_size
        self.use_last_hidden_state = use_last_hidden_state
        self.use_loc_based_addressing = use_loc_based_addressing
        self.train_profile = train_profile
        self.use_nogru_mem2q = use_nogru_mem2q
        self.use_qmask = use_qmask
        self.permute_order = permute_order
        self.use_batch_norm = use_batch_norm

        # Use this if you have a ff-controller because otherwise this is not effective:
        self.n_layers = n_layers
        if self.use_reinforce:
            reinforceCls = REINFORCE
            if not self.use_reinforce_baseline:
                reinforceCls = REINFORCEBaselineExt

            self.Reinforce = reinforceCls(lambda1_reg=self.lambda1_reinf,
                                          lambda2_reg=self.lambda2_reinf,
                                          decay=self.reinforce_decay)

            self.ReaderReinforce = \
                    ReinforcePenalty(reinf_level=self.lambda1_reinf,
                                     maxent_level=self.lambda2_reinf,
                                     use_reinforce_baseline=self.use_reinforce_baseline)
        self.dice_val = None

        if use_dice_val:
            self.dice_val = sharedX(1.)

        self.use_dice_val = use_dice_val
        if bow_size is None:
            raise ValueError("bow_size should be specified.")

        if name is None:
            raise ValueError("name should not be empty.")

        self.n_hids = n_hids
        self.mem_size = mem_size
        self.use_deepout = use_deepout
        self.mem_nel = mem_nel
        self.n_out = n_out
        self.use_out_mem = use_out_mem
        self.use_multiscale_shifts = use_multiscale_shifts
        self.address_size = address_size
        self.n_read_heads = n_read_heads
        self.n_write_heads = n_write_heads
        self.learn_h0 = learn_h0
        self.use_adv_indexing = use_adv_indexing
        self.softmax = softmax
        self.use_bow_input = use_bow_input
        self.use_cost_mask = use_cost_mask
        self.deep_out_size = deep_out_size
        self.controller_activ = controller_activ
        self.mem_gater_activ = mem_gater_activ
        self.weight_initializer = weight_initializer
        self.bias_initializer = bias_initializer

        if batch_size:
            self.batch_size = batch_size
        else:
            self.batch_size = inps[0].shape[1]

        #assert self.batch_size >= self.sub_mb_size, ("Minibatch size should be "
        #                                             " greater than the sub minibatch size")
        self.comp_grad_fn = None
        self.name = name
        self.use_noise = use_noise
        self.train_timer = Timer("Training function")
        self.gradfn_timer = Timer("Gradient function")
        self.grads_timer = Timer("Computing the grads")
        self.reset()

        self.seq_len = TT.iscalar('seq_len')
        self.__convert_inps_to_list()

        if debug:
            if self.use_gru_inp_rep or self.use_bow_input:
                self.seq_len.tag.test_value = self.inps[
                    0].tag.test_value.shape[1]
            else:
                self.seq_len.tag.test_value = self.inps[
                    0].tag.test_value.shape[0]

        self.learning_rule = learning_rule
        if self.predict_bow_out:
            self.bow_out_w = TT.fscalar("bow_out_w")
            if debug:
                self.bow_out_w.tag.test_value = np.float32(1.0)
        else:
            self.bow_out_w = 0
示例#29
0
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Build the two compiled functions used for AdaDelta training.

        The first function evaluates the gradients and stores them in
        per-parameter shared buffers; the second one reads those buffers
        and applies the AdaDelta step.

        Parameters
        ----------
        learning_rate : theano variable
            Sole input of the returned update function.  It is used here
            as the epsilon term inside both square roots, not as a step
            size.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        inp : list
            Inputs of the compiled gradient function.
        cost, errors : theano variables
            Returned by the gradient function next to the norms.
        lr_scalers : dict
            Accepted for interface compatibility; not used by this rule.

        Returns
        -------
        f_grad_shared : compiled function
            Maps `inp` to [cost, errors, gradient norm, parameter norm]
            and writes each gradient into its shared buffer.
        f_update : compiled function
            Takes `learning_rate`, applies the parameter updates and
            returns the summed L2 norms of the individual steps.
        """
        def l2_norm(tensors):
            # Joint L2 norm over a collection of tensors.
            return T.sqrt(sum((t**2).sum() for t in tensors))

        # One zero-initialised buffer per parameter to hold its gradient.
        grad_buffers = OrderedDict({
            p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
            for p, g in grads.iteritems()
        })
        store_grads = [(grad_buffers[p], g) for p, g in grads.iteritems()]

        f_grad_shared = theano.function(
            inp,
            [cost, errors, l2_norm(grads.values()), l2_norm(grads.keys())],
            updates=store_grads)

        updates = OrderedDict()
        tot_norm_up = 0

        for param, grad in grad_buffers.items():
            # Running averages E[g^2] and E[(delta x)^2] from the
            # previous step.
            acc_grad_sq = sharedX(param.get_value() * 0.)
            acc_step_sq = sharedX(param.get_value() * 0.)

            if param.name is not None:
                acc_grad_sq.name = 'mean_square_grad_' + param.name
                acc_step_sq.name = 'mean_square_dx_' + param.name

            # Fold the current squared gradient into its running average.
            next_grad_sq = (self.decay * acc_grad_sq +
                            (1 - self.decay) * T.sqr(grad))

            # Step = -(RMS of past steps / RMS of gradients) * gradient.
            epsilon = learning_rate
            step = (-T.sqrt(acc_step_sq + epsilon) /
                    T.sqrt(next_grad_sq + epsilon) * grad)

            # Fold the squared step into its running average.
            next_step_sq = (self.decay * acc_step_sq +
                            (1 - self.decay) * T.sqr(step))

            updates[acc_grad_sq] = next_grad_sq
            updates[acc_step_sq] = next_step_sq
            updates[param] = param + step

            tot_norm_up += step.norm(2)

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#30
0
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Build the gradient and update functions for an RMSProp-style rule
        with momentum.

        Every gradient is divided by a running root-mean-square of its own
        history (centred with a running mean when `self.use_first_order`
        is set), floored at `self.stabilizer`, and accumulated into a
        momentum-smoothed velocity.  When `self.momentum_clipping` is not
        None, the joint L2 norm of the new velocities is capped at that
        value.

        Parameters
        ----------
        learning_rate : theano variable
            Step size; the only input of the returned update function.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        inp : list
            Inputs of the compiled gradient function.
        cost, errors : theano variables
            Returned by the gradient function next to the norms.
        lr_scalers : dict
            Accepted for interface compatibility; not used by this rule.

        Returns
        -------
        f_grad_shared : compiled function
            Maps `inp` to [cost, errors, gradient norm, parameter norm]
            and writes each gradient into its shared buffer.
        f_update : compiled function
            Takes `learning_rate`, applies the updates and returns the
            summed L2 norms of the per-parameter steps.
        """
        updates = OrderedDict()
        velocity = OrderedDict()
        tot_norm_up = 0

        # One zero-initialised buffer per parameter to hold its gradient.
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            # Running average of the squared gradient.
            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
                (1 - self.averaging_coeff) * T.sqr(gshared[param])
            if self.use_first_order:
                # Also track the running mean so that the denominator is a
                # standard deviation rather than a raw RMS.
                avg_grad = sharedX(np.zeros_like(param.get_value()))
                if param.name is not None:
                    avg_grad.name = 'avg_grad_' + param.name
                new_avg_grad = self.averaging_coeff * avg_grad +\
                    (1 - self.averaging_coeff) * gshared[param]
                rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
                updates[avg_grad] = new_avg_grad
            else:
                rms_grad_t = T.sqrt(new_avg_grad_sqr)

            # Floor the denominator to avoid dividing by ~0.
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            normalized_grad = gshared[param] / (rms_grad_t)
            new_velocity = self.momentum * velocity[param] -\
                learning_rate * normalized_grad
            tot_norm_up += new_velocity.norm(2)

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + new_velocity

        if self.momentum_clipping is not None:
            # The steps are rescaled below, so the reported norm has to be
            # recomputed from the clipped steps.
            tot_norm_up = 0

            new_mom_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [updates[velocity[param]] for param in grads.keys()])
            )
            new_mom_norm = T.sqrt(new_mom_norm)
            scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
            scaling_num = self.momentum_clipping

            for param in grads.keys():
                if self.bound_inc:
                    # Clip the stored velocity itself, so the bound also
                    # carries over to the next step.
                    updates[velocity[param]] *= (scaling_num / scaling_den)
                    update_step = updates[velocity[param]]
                else:
                    # Clip only the step applied to the parameter.
                    update_step = updates[velocity[param]] * (scaling_num / scaling_den)
                # Accumulate in both branches: otherwise `tot_norm_up`
                # stays the Python int 0 when `bound_inc` is set, which
                # is not a valid output for `theano.function`.
                tot_norm_up += update_step.norm(2)
                updates[param] = param + update_step

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#31
0
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Build the gradient and update functions for an Adam-style rule.

        A momentum-smoothed velocity (first moment) is divided by the
        running RMS of the gradient (second moment), both bias-corrected
        with a shared step counter, and the result is scaled by
        `learning_rate` before being added to the parameter.

        Parameters
        ----------
        learning_rate : theano variable
            Step size; the only input of the returned update function.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.  When `self.gradient_clipping` is not None the
            entries are rescaled IN PLACE so that their joint L2 norm
            does not exceed that value.
        inp : list
            Inputs of the compiled gradient function.
        cost, errors : theano variables
            Returned by the gradient function next to the norms.
        lr_scalers : dict
            Accepted for interface compatibility; not used by this rule.

        Returns
        -------
        f_grad_shared : compiled function
            Maps `inp` to [cost, errors, gradient norm, parameter norm]
            and writes each gradient into its shared buffer.
        f_update : compiled function
            Takes `learning_rate`, applies the updates and returns the
            summed L2 norms of the per-parameter steps.
        """
        if self.gradient_clipping is not None:
            grads_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [grads[param] for param in grads.keys()])
            )
            grads_norm = T.sqrt(grads_norm)
            scaling_den = T.maximum(self.gradient_clipping, grads_norm)
            scaling_num = self.gradient_clipping
            # NOTE: this overwrites the values of the caller's dict.
            for param in grads.keys():
                grads[param] = scaling_num * grads[param] / scaling_den

        updates = OrderedDict()
        velocity = OrderedDict()

        counter = sharedX(0, 'counter')
        tot_norm_up = 0

        # One zero-initialised buffer per parameter to hold its gradient.
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        # The bias corrections depend only on the step counter, so they
        # are built once instead of once per parameter.
        next_counter = counter + 1.
        fix_first_moment = 1. - self.momentum**next_counter
        fix_second_moment = 1. - self.averaging_coeff**next_counter

        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            # Running average of the squared gradient (second moment).
            new_avg_grad_sqr = self.averaging_coeff*avg_grad_sqr \
                + (1 - self.averaging_coeff)*T.sqr(gshared[param])

            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            # Floor the denominator to avoid dividing by ~0.
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)

            # Running average of the negative gradient (first moment).
            new_velocity = self.momentum * velocity[param] \
                - (1 - self.momentum) * gshared[param]
            normalized_velocity = (new_velocity * T.sqrt(fix_second_moment)) \
                / (rms_grad_t * fix_first_moment)

            tot_norm_up += learning_rate*normalized_velocity.norm(2)

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            # Scale the step by the learning rate, matching the norm
            # reported in `tot_norm_up`; without it the step size is
            # always 1 regardless of the requested rate.
            updates[param] = param + learning_rate * normalized_velocity

        updates[counter] = counter + 1
        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#32
0
    def __call__(self, probs,
                 samples,
                 baseline,
                 updates,
                 cost = None,
                 mask=None,
                 seq_len=20,
                 batch_size=140,
                 deterministic=False):
        """
        Build a hand-written gradient with respect to `probs` (presumably
        the REINFORCE estimator plus an entropy-style regulariser).

        The cost is centred with a running mean ("center") and scaled by
        a running standard deviation ("cost_var").  Both statistics, and
        a step counter, are looked up by name in `updates`; when one is
        missing a new shared variable is created and its update rule is
        added to `updates` (the dict is modified in place).

        Parameters
        ----------
        probs : theano variable
            Probabilities the gradient is computed for.
        samples : theano variable
            Sampled actions; multiplied element-wise with `probs` terms.
        baseline : theano variable
            Subtracted from the reward before normalisation.
        updates : dict
            Existing updates; extended in place and returned.
        cost : theano variable
            Cost used as the reward signal.  Required despite the None
            default; 1-D or 2-D unless `samples` is 4-D.
        mask : theano variable, optional
            When given, the gradient is multiplied by the mask and
            divided by the size of the last axis of `probs`.
        seq_len, batch_size, deterministic
            Accepted for interface compatibility; not used here.

        Returns
        -------
        tuple
            (updates, known_grads, new_center, cost_std, policy,
            lambda2_reg), where `known_grads` maps `probs` to the
            constructed gradient.

        Raises
        ------
        ValueError
            If `cost` is None, or if its number of dimensions is not
            supported.
        """
        # `cost` is dereferenced unconditionally below, so fail early
        # with a clear message instead of an AttributeError on None.
        if cost is None:
            raise ValueError("cost for the %s should"
                             " not be empty." % self.__class__.__name__)

        key_step = get_key_byname_from_dict(updates, "step")
        if key_step:
            step = updates[key_step]
        else:
            step = sharedX(0., name="step")
            updates[step] = step + as_floatX(1)

        # Running mean of the cost.
        key_center = get_key_byname_from_dict(updates, "center")
        if key_center:
            center = updates[key_center]
            new_center = center
        else:
            center = sharedX(0.08 + self.eps, name="center")
            new_center = as_floatX(self.decay) * center + as_floatX(1 - self.decay) * cost.sum(0).mean()
            updates[center] = new_center

        # Running variance of the cost around that mean.
        key_cvar = get_key_byname_from_dict(updates, "cost_var")
        if key_cvar:
            cost_var = updates[key_cvar]
            new_cost_var = cost_var
        else:
            cost_var_tot = (cost.sum(0).mean() - new_center)**2
            cost_var = sharedX(as_floatX(0.5), name="cost_var")
            new_cost_var = as_floatX(self.decay) * cost_var + as_floatX(1 - self.decay) * \
                    cost_var_tot
            updates[cost_var] = new_cost_var

        lambda2_reg = self.lambda2_reg

        # Only index into the schedule options when they are provided.
        if self.schedule_h_opts:
            start = self.schedule_h_opts["lambda2_reg_start"]
            nbatches = self.schedule_h_opts["end_nbatches"]
            end = self.lambda2_reg
            assert start > end
            # Anneal linearly from `start` down to `end` over `nbatches`
            # steps, then hold at `end`.
            lambda2_reg = TT.maximum(start - (start - end) * step / nbatches,
                                     end)

        if samples.ndim == 4:
            reward = cost.dimshuffle(0, 'x', 1, 'x')
            policy = (TT.log(probs + 1e-8) * samples).mean((2, 3)).sum()
        else:
            if cost.ndim == 2:
                reward = cost.dimshuffle(0, 1, 'x')
            elif cost.ndim == 1:
                reward = cost.dimshuffle('x', 0, 'x')
                baseline = baseline.dimshuffle(1, 0, 2)
            else:
                raise ValueError("cost should have 1 or 2 dimensions, "
                                 "but it has %d." % cost.ndim)

            policy = (TT.log(probs + 1e-8) * samples).mean((1, 2)).sum()

        cost_std = TT.maximum(TT.sqrt(new_cost_var + 1e-8), 1e-6)
        centered_reward = (reward - baseline - new_center) / cost_std
        N = probs.shape[-1]

        # Reward-weighted score term plus the lambda2-weighted
        # (log p + 1) regulariser.
        gradp = self.lambda1_reg * (centered_reward) * \
                (samples / (probs + 1e-8)) + lambda2_reg * (TT.log(probs + 1e-6) + as_floatX(1))

        if mask is not None:
            gradp = mask.dimshuffle(0, 1, 'x') * gradp / N

        known_grads = {probs: gradp}
        return updates, known_grads, new_center, cost_std, policy, lambda2_reg
示例#33
0
    def get_funcs(self,
                  learning_rate,
                  grads,
                  inp,
                  cost,
                  errors,
                  lr_scalers=None):
        """
        Build the gradient and update functions for an RMSProp-style rule
        with momentum.

        Every gradient is divided by a running root-mean-square of its own
        history (centred with a running mean when `self.use_first_order`
        is set), floored at `self.stabilizer`, and accumulated into a
        momentum-smoothed velocity.  When `self.momentum_clipping` is not
        None, the joint L2 norm of the new velocities is capped at that
        value.

        Parameters
        ----------
        learning_rate : theano variable
            Step size; the only input of the returned update function.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        inp : list
            Inputs of the compiled gradient function.
        cost, errors : theano variables
            Returned by the gradient function next to the norms.
        lr_scalers : dict
            Accepted for interface compatibility; not used by this rule.

        Returns
        -------
        f_grad_shared : compiled function
            Maps `inp` to [cost, errors, gradient norm, parameter norm]
            and writes each gradient into its shared buffer.
        f_update : compiled function
            Takes `learning_rate`, applies the updates and returns the
            summed L2 norms of the per-parameter steps.
        """
        updates = OrderedDict()
        velocity = OrderedDict()
        tot_norm_up = 0

        # One zero-initialised buffer per parameter to hold its gradient.
        gshared = OrderedDict({
            p: sharedX(p.get_value() * 0., name='%s_grad' % p.name)
            for p, g in grads.iteritems()
        })

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        for param in gshared.keys():
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            velocity[param] = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            # Running average of the squared gradient.
            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr +\
                (1 - self.averaging_coeff) * T.sqr(gshared[param])
            if self.use_first_order:
                # Also track the running mean so that the denominator is a
                # standard deviation rather than a raw RMS.
                avg_grad = sharedX(np.zeros_like(param.get_value()))
                if param.name is not None:
                    avg_grad.name = 'avg_grad_' + param.name
                new_avg_grad = self.averaging_coeff * avg_grad +\
                    (1 - self.averaging_coeff) * gshared[param]
                rms_grad_t = T.sqrt(new_avg_grad_sqr - new_avg_grad**2)
                updates[avg_grad] = new_avg_grad
            else:
                rms_grad_t = T.sqrt(new_avg_grad_sqr)

            # Floor the denominator to avoid dividing by ~0.
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            normalized_grad = gshared[param] / (rms_grad_t)
            new_velocity = self.momentum * velocity[param] -\
                learning_rate * normalized_grad
            tot_norm_up += new_velocity.norm(2)

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[velocity[param]] = new_velocity
            updates[param] = param + new_velocity

        if self.momentum_clipping is not None:
            # The steps are rescaled below, so the reported norm has to be
            # recomputed from the clipped steps.
            tot_norm_up = 0

            new_mom_norm = sum(
                map(lambda X: T.sqr(X).sum(),
                    [updates[velocity[param]] for param in grads.keys()]))
            new_mom_norm = T.sqrt(new_mom_norm)
            scaling_den = T.maximum(self.momentum_clipping, new_mom_norm)
            scaling_num = self.momentum_clipping

            for param in grads.keys():
                if self.bound_inc:
                    # Clip the stored velocity itself, so the bound also
                    # carries over to the next step.
                    updates[velocity[param]] *= (scaling_num / scaling_den)
                    update_step = updates[velocity[param]]
                else:
                    # Clip only the step applied to the parameter.
                    update_step = updates[velocity[param]] * (scaling_num /
                                                              scaling_den)
                # Accumulate in both branches: otherwise `tot_norm_up`
                # stays the Python int 0 when `bound_inc` is set, which
                # is not a valid output for `theano.function`.
                tot_norm_up += update_step.norm(2)
                updates[param] = param + update_step

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#34
0
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Build Adam-style updates whose learning rate is capped by the
        ratio between the update norm and the parameter norm.

        Parameters
        ----------
        learning_rate : float or theano variable
            Base step size.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.  When `self.gradient_clipping` is not None the
            entries are rescaled in place.
        lr_scalers : dict
            Accepted for interface compatibility; not used by this rule.

        Returns
        -------
        updates : OrderedDict
            Updates for the accumulators, the parameters and the counter.
        tot_norm_up : theano variable
            Sum over parameters of `learning_rate` times the L2 norm of
            the normalised velocity.
        tot_param_norm : theano variable
            Sum of the parameters' L2 norms.
        """
        updates = OrderedDict()
        velocity = OrderedDict()
        steps = OrderedDict()

        counter = sharedX(0, 'counter')
        tot_norm_up = 0
        tot_param_norm = 0

        if self.gradient_clipping is not None:
            # Rescale all gradients so that their joint L2 norm does not
            # exceed the clipping threshold.
            squared_total = sum(T.sqr(grads[p]).sum() for p in grads.keys())
            grads_norm = T.sqrt(squared_total)
            scaling_den = T.maximum(self.gradient_clipping, grads_norm)
            scaling_num = self.gradient_clipping

            for p in grads.keys():
                grads[p] = scaling_num * grads[p] / scaling_den

        # Bias-correction factors; they depend on the counter only.
        next_counter = counter + 1.
        fix_first_moment = 1. - self.momentum**next_counter
        fix_second_moment = 1. - self.averaging_coeff**next_counter

        for p in grads.keys():
            second_moment = sharedX(np.zeros_like(p.get_value()))
            velocity[p] = sharedX(np.zeros_like(p.get_value()))

            if p.name is not None:
                second_moment.name = 'avg_grad_sqr_' + p.name

            grad = grads[p]
            next_second_moment = self.averaging_coeff*second_moment \
                + (1 - self.averaging_coeff)*T.sqr(grad)

            # RMS of the gradient, floored to keep the division stable.
            rms = T.maximum(T.sqrt(next_second_moment), self.stabilizer)

            next_velocity = self.momentum * velocity[p] \
                - (1 - self.momentum) * grad
            step = (next_velocity * T.sqrt(fix_second_moment)) \
                / (rms * fix_first_moment)

            tot_param_norm += p.norm(2)
            tot_norm_up += learning_rate * step.norm(2)

            steps[p] = step
            updates[second_moment] = next_second_moment
            updates[velocity[p]] = next_velocity

        update_param_norm_ratio = tot_norm_up / (tot_param_norm + 1e-7)

        # Shrink the learning rate when the relative update size exceeds
        # the configured ratio ...
        ratio_cap = self.update_param_norm_ratio
        capped_lr = as_floatX(learning_rate * ratio_cap) / \
            update_param_norm_ratio
        new_lr = ifelse.ifelse(T.ge(update_param_norm_ratio, ratio_cap),
                               capped_lr,
                               as_floatX(learning_rate))

        # ... but only once the counter has reached 6000.
        new_lr = ifelse.ifelse(T.ge(counter, 6000), new_lr,
                               as_floatX(learning_rate))

        for p in grads.keys():
            updates[p] = p + new_lr * steps[p]

        updates[counter] = counter + 1
        return updates, tot_norm_up, tot_param_norm
示例#35
0
def lstm_tied_layer(tparams,
                    state_below,
                    options,
                    prefix='lstm_tied',
                    mask=None,
                    one_step=False,
                    init_state=None,
                    init_memory=None,
                    nsteps=None,
                    **kwargs):
    """
    LSTM layer whose input gate is tied to the forget gate (the new cell
    is `f * previous + (1 - f) * candidate`).

    Parameters
    ----------
    tparams : dict
        Parameters, looked up as `prfx(prefix, name)` for 'W', 'U' and
        'b'.  When `options['learn_h0']` is set and no `init_state` is
        given, a new shared 'h0' entry is added to it.
    state_below : theano variable
        Layer input; it is projected with 'W' and 'b' before the
        recurrence.
    options : dict
        Reads 'learn_h0', and with it 'dim' and 'batch_size'.
    prefix : str
        Prefix for the parameter names and the scan name.
    mask : theano variable, optional
        Where the mask is 0 the previous state and cell are carried
        over.  Defaults to all ones.
    one_step : bool
        Apply a single step instead of scanning over the sequence.
    init_state, init_memory : theano variables, optional
        Initial hidden state and cell; zeros (or the learned 'h0') when
        omitted.
    nsteps : int or theano scalar, optional
        Number of scan steps; defaults to the first axis of
        `state_below`.
    **kwargs
        Ignored.

    Returns
    -------
    list
        `[h, c]` for `one_step`, otherwise the outputs of `theano.scan`
        (the scan updates are discarded).

    NOTE(review): 'b' is added both in the input projection and inside
    every step -- confirm that this is intended.
    """
    def lookup(name):
        return tparams[prfx(prefix, name)]

    def _gate(pre, idx, width):
        # Select the idx-th block of size `width` along the last axis.
        lo, hi = idx * width, (idx + 1) * width
        if pre.ndim == 3:
            return pre[:, :, lo:hi]
        return pre[:, lo:hi]

    if nsteps is None:
        nsteps = state_below.shape[0]

    n_samples = state_below.shape[1] if state_below.ndim == 3 else 1
    dim = lookup('U').shape[0]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # Initial hidden state: given, learned, or zeros.
    if init_state is None:
        if options['learn_h0']:
            h0 = sharedX(numpy.zeros((options['dim'])),
                         name=prfx(prefix, "h0"))
            init_state = tensor.concatenate(
                [[h0] for _ in xrange(options['batch_size'])],
                axis=0)
            tparams[prfx(prefix, 'h0')] = h0
        else:
            init_state = tensor.alloc(0., n_samples, dim)

    # Initial cell: given or zeros.
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    def _step(m_, x_, h_prev, c_prev):
        preact = dot(h_prev, lookup('U')) + x_ + tparams[prfx(prefix, 'b')]

        forget = Sigmoid(_gate(preact, 0, dim))
        out = Sigmoid(_gate(preact, 1, dim))
        cand = Tanh(_gate(preact, 2, dim))

        # Tied gates: whatever is not kept from the old cell is taken
        # from the candidate.
        cell = forget * c_prev + (1 - forget) * cand
        cell = m_ * cell + (1. - m_) * c_prev
        hid = out * tensor.tanh(cell)
        hid = m_ * hid + (1. - m_) * h_prev

        return hid, cell

    state_below = dot(state_below, lookup('W')) + lookup('b')

    if one_step:
        mask = mask.dimshuffle(0, 'x')
        h, c = _step(mask, state_below, init_state, init_memory)
        return [h, c]

    # Give the mask a trailing broadcastable axis for the recurrence.
    if mask.ndim == 3 and mask.ndim == state_below.ndim:
        flat_shape = (mask.shape[0], mask.shape[1]*mask.shape[2])
        mask = mask.reshape(flat_shape).dimshuffle(0, 1, 'x')
    elif mask.ndim == 2:
        mask = mask.dimshuffle(0, 1, 'x')

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[init_state,
                                              init_memory],
                                name=prfx(prefix, '_layers'),
                                n_steps=nsteps)
    return rval
示例#36
0
 def __init__(self, init_momentum, nesterov_momentum=False):
     """
     Store the momentum settings.

     Parameters
     ----------
     init_momentum : float
         Initial momentum coefficient; must lie in [0, 1).  It is
         wrapped with `sharedX` under the label 'momentum'.
     nesterov_momentum : bool
         Flag stored as given on the instance.
     """
     # Reject coefficients outside the half-open interval [0, 1).
     assert 0. <= init_momentum < 1.
     self.momentum = sharedX(init_momentum, 'momentum')
     self.nesterov_momentum = nesterov_momentum
示例#37
0
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        .. todo::
            WRITEME
        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient. Learning rate is not being used but, pylearn2 requires a
            learning rate to be defined.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """

        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            #If norm of the gradients of a parameter is inf or nan don't update that parameter
            #That might be useful for RNNs.
            grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
                T.isnan(grads[p])), 0, grads[p]) for
                p in grads.keys()})

        # Block-normalize gradients:
        nparams = len(grads.keys())

        # Apply the gradient clipping, this is only sometimes
        # necessary for RNNs and sometimes for very deep networks
        if self.grad_clip:
            assert self.grad_clip > 0.
            assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."

            gnorm = sum([g.norm(2) for g in grads.values()])
            notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

            for p, g in grads.iteritems():
                tmpg = T.switch(gnorm / nparams > self.grad_clip,
                                 g * self.grad_clip * nparams / gnorm , g)
                grads[p] = T.switch(notfinite, as_floatX(0.1)*p, tmpg)

        tot_norm_up = 0
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gshared[p], g) for p, g in grads.iteritems()]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)

        fix_decay = self.slow_decay**(step + 1)

        for param in gshared.keys():
            gshared[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps, name="mean_grad_%s" % param.name)
            gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

            prod_taus = sharedX((np.ones_like(param.get_value()) - 2*eps),
                                 name="prod_taus_x_t_" + param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                          name="sum_square_grad_%s" % param.name)

            """
               Initialization of accumulators
            """
            taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                               name="taus_x_t_" + param.name)
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)

            #The uncorrected gradient of previous of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)
            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = gshared[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            gnorm = T.sqr(norm_grad).sum()

            cond = T.eq(step, 0)
            gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
            gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

            norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
            msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
            mdx = cond * norm_grad + (1 - cond) * mean_dx

            new_prod_taus = (
                prod_taus * (1 - 1 / taus_x_t)
            )

            """
                Compute the new updated values.
            """
            # E[g_i^2]_t
            new_mean_squared_grad = (
                mean_square_grad * (1 - 1 / taus_x_t)  +
                T.sqr(norm_grad) / (taus_x_t)
            )
            new_mean_squared_grad.name = "msg_" + param.name

            # E[g_i]_t
            new_mean_grad = (
                mean_grad * (1 - 1 / taus_x_t) +
                norm_grad / taus_x_t
            )

            new_mean_grad.name = "nmg_" + param.name
            mg = new_mean_grad / (1 - new_prod_taus)
            mgsq = new_mean_squared_grad / (1 - new_prod_taus)

            new_gnorm_sqr = (
                    gnorm_sqr_o * self.slow_decay +
                    T.sqr(norm_grad).sum() * (1 - self.slow_decay)
            )

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (
                gamma_nume_sqr * (1 - 1 / taus_x_t) +
                T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
            )
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (
                gamma_deno_sqr * (1 - 1 / taus_x_t) +
                T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
            )

            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                    self.gamma_reg)

            gamma.name = "gamma_" + param.name

            if self.gamma_clip and self.gamma_clip > -1:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                cond = T.le(self.start_var_reduction, step)
                corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
            else:
                corrected_grad = norm_grad

            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (
                    sum_square_grad + T.sqr(g)
                )
                rms_g_t = T.sqrt(new_sum_squared_grad)
                rms_g_t = T.maximum(rms_g_t, 1.0)

            #Use the gradients from the previous update
            #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (
                mean_curvature * (1 - 1 / taus_x_t) +
                (cur_curvature / taus_x_t)
            )
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Average average curvature
            nc_ave = new_curvature_ave / (1 - new_prod_taus)

            new_curvature_sqr_ave = (
                mean_curvature_sqr * (1 - 1 / taus_x_t) +
                (cur_curvature_sqr / taus_x_t)
            )
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

            epsilon = 1e-7
            #lr_scalers.get(param, 1.) * learning_rate
            scaled_lr = sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                logger.info("Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                logger.info("Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    logger.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (
                 msdx * (1 - 1 / taus_x_t) +
                 (T.sqr(delta_x_t) / taus_x_t)
             )

            #To compute the E[\Delta]_t
            new_mean_dx = (
                mdx * (1 - 1 / taus_x_t) +
                (delta_x_t / (taus_x_t))
            )

            #Perform the outlier detection:
            #This outlier detection is slightly different:
            new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq  - mg**2)),
                                        abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                                        T.switch(new_taus_t > 2.5, sharedX(2.5), new_taus_t + sharedX(1.0) + eps), new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (
                cov_num_t * (1 - 1 / taus_x_t) +
                (delta_x_t * cur_curvature) * (1 / taus_x_t)
            )

            update_step = delta_x_t

            tot_norm_up += update_step.norm(2)
            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gnorm_sqr] = new_gnorm_sqr
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave

            if self.perform_update:
                updates[param] = param + update_step

            updates[step] = step + 1
            updates[prod_taus] = new_prod_taus

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        f_update = theano.function([learning_rate], [tot_norm_up],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
示例#38
0
def gru_layer(tparams,
              state_below,
              options,
              prefix='gru',
              mask=None,
              nsteps=None,
              truncate=None,
              init_state=None,
              **kwargs):
    """
    Build the symbolic recurrence of a GRU layer with ``theano.scan``.

    Parameters
    ----------
    tparams : dict
        Parameter store, indexed with ``prfx(prefix, name)``. The entries
        ``W``, ``b``, ``Wx``, ``bx``, ``U`` and ``Ux`` are read. When a
        learned initial state is created (see `options`), it is also
        written back into this dictionary under the name ``h0``.
    state_below : theano variable
        Layer input. Its first axis is the one scanned over; when it has
        three dimensions its second axis is used as the number of samples,
        otherwise a single sample is assumed.
    options : dict
        ``options['learn_h0']`` is consulted when `init_state` is None; if
        it is truthy, ``options['dim']`` and ``options['batch_size']`` are
        used to build the initial state.
    prefix : str
        Name prefix of this layer's parameters.
    mask : theano variable, optional
        Per-step mask. Defaults to an all-ones tensor of shape
        ``(state_below.shape[0], 1)``. A 2-d mask gets a trailing
        broadcastable axis; a 3-d mask (when `state_below` is 3-d too) has
        its last two axes merged first.
    nsteps : optional
        Number of scan steps; defaults to ``state_below.shape[0]``.
    truncate : optional
        Forwarded to scan as ``truncate_gradient``.
    init_state : theano variable, optional
        Initial hidden state. When omitted it is either all zeros or a
        newly created shared vector repeated ``options['batch_size']``
        times.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    list
        Single-element list holding the hidden states produced by scan.
        The update dictionary returned by scan is discarded.
    """
    n_steps = state_below.shape[0] if nsteps is None else nsteps
    n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

    def lookup(name):
        # All parameters of this layer live under a common prefix.
        return tparams[prfx(prefix, name)]

    hid_dim = lookup('Ux').shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # Bring the mask to three dimensions with a broadcastable last axis.
    if mask.ndim == 3 and mask.ndim == state_below.ndim:
        merged = mask.reshape((mask.shape[0],
                               mask.shape[1] * mask.shape[2]))
        mask = merged.dimshuffle(0, 1, 'x')
    elif mask.ndim == 2:
        mask = mask.dimshuffle(0, 1, 'x')

    def chunk(pre, idx):
        # Select the idx-th block of width `hid_dim` along the last axis.
        lo, hi = idx * hid_dim, (idx + 1) * hid_dim
        if pre.ndim == 3:
            return pre[:, :, lo:hi]
        return pre[:, lo:hi]

    # Input projections are computed once for the whole sequence, outside
    # of the scan loop.
    gates_below = dot(state_below, lookup('W')) + lookup('b')
    cand_below = dot(state_below, lookup('Wx')) + lookup('bx')

    # initial/previous state
    if init_state is None:
        if options['learn_h0']:
            h0 = sharedX(numpy.zeros((options['dim'])),
                         name=prfx(prefix, "h0"))
            init_state = tensor.concatenate(
                [[h0] for _ in xrange(options['batch_size'])],
                axis=0)
            tparams[prfx(prefix, 'h0')] = h0
        else:
            init_state = tensor.alloc(0., n_samples, hid_dim)

    U = lookup('U')
    Ux = lookup('Ux')

    def step(m_t, below_t, belowx_t, h_prev, U, Ux):
        gates_pre = dot(h_prev, U) + below_t

        reset = Sigmoid(chunk(gates_pre, 0))
        update = Sigmoid(chunk(gates_pre, 1))

        candidate = Tanh(dot(reset * h_prev, Ux) + belowx_t)
        h_new = update * h_prev + (1. - update) * candidate

        # Where the mask is zero the previous state is carried over.
        # NOTE(review): the mask may already carry a trailing broadcastable
        # axis from the dimshuffle before scan, and `[:, None]` adds one
        # more -- confirm the resulting rank matches `h_new`.
        return m_t[:, None] * h_new + (1. - m_t)[:, None] * h_prev

    # NOTE(review): `profile` is not defined in this function; presumably
    # it is a module-level flag -- verify.
    hidden, _ = theano.scan(step,
                            sequences=[mask, gates_below, cand_below],
                            outputs_info=[init_state],
                            non_sequences=[U, Ux],
                            name=prfx(prefix, '_layers'),
                            n_steps=n_steps,
                            truncate_gradient=truncate,
                            profile=profile,
                            strict=True)
    return [hidden]
示例#39
0
 def __init__(self, init_momentum, nesterov_momentum=False):
     """
     Store the momentum settings on the instance.

     Parameters
     ----------
     init_momentum : float
         Initial momentum coefficient; must satisfy
         ``0 <= init_momentum < 1``. It is wrapped with ``sharedX`` under
         the name ``'momentum'`` and kept as ``self.momentum``.
     nesterov_momentum : bool
         Stored unchanged as ``self.nesterov_momentum``.
     """
     # Both bounds are checked in the original order; the second comparison
     # only runs when the first one holds.
     assert init_momentum >= 0. and init_momentum < 1.
     self.momentum = sharedX(init_momentum, 'momentum')
     self.nesterov_momentum = nesterov_momentum