Example #1
    def updates(self, cost):
        grad = T.grad(cost, self.param)
        grad2 = hessian_diagonal(cost, self.param, grad=grad)
        # calculate memory constants
        tau_rec = 1.0 / self.tau
        tau_inv_rec = 1.0 - tau_rec

        # new moving average of gradient
        g_avg_new = tau_inv_rec * self.g_avg + tau_rec * grad
        # new moving average of squared gradient
        v_avg_new = tau_inv_rec * self.v_avg + tau_rec * grad**2
        # new moving average of hessian diagonal
        h_avg_new = tau_inv_rec * self.h_avg + tau_rec * T.abs_(grad2)

        rate_unsafe = (g_avg_new ** 2) / (v_avg_new * h_avg_new)
        rate = T.switch(T.isinf(rate_unsafe) | T.isnan(rate_unsafe), self.learning_rate, rate_unsafe)

        tau_unsafe = (1 - (g_avg_new ** 2) / v_avg_new) * self.tau + 1
        tau_new = T.switch(T.isnan(tau_unsafe) | T.isinf(tau_unsafe), self.tau, tau_unsafe)

        return [(self.g_avg, g_avg_new),
                (self.v_avg, v_avg_new),
                (self.h_avg, h_avg_new),
                (self.tau, tau_new),
                (self.last_grad, grad),
                (self.last_grad2, grad2),
                (self.last_rate, rate),
                (self.param, self.param - rate * grad)]
Example #2
    def updates(self, cost):
        grad = T.grad(cost, self.param)
        grad2 = hessian_diagonal(cost, self.param, grad=grad)
        # calculate memory constants
        tau_rec = 1.0 / self.tau
        tau_inv_rec = 1.0 - tau_rec

        # new moving average of gradient
        g_avg_new = tau_inv_rec * self.g_avg + tau_rec * grad
        # new moving average of squared gradient
        v_avg_new = tau_inv_rec * self.v_avg + tau_rec * grad**2
        # new moving average of hessian diagonal
        h_avg_new = tau_inv_rec * self.h_avg + tau_rec * T.abs_(grad2)

        rate_unsafe = (g_avg_new**2) / (v_avg_new * h_avg_new)
        rate = T.switch(
            T.isinf(rate_unsafe) | T.isnan(rate_unsafe), self.learning_rate,
            rate_unsafe)

        tau_unsafe = (1 - (g_avg_new**2) / v_avg_new) * self.tau + 1
        tau_new = T.switch(
            T.isnan(tau_unsafe) | T.isinf(tau_unsafe), self.tau, tau_unsafe)

        return [(self.g_avg, g_avg_new), (self.v_avg, v_avg_new),
                (self.h_avg, h_avg_new), (self.tau, tau_new),
                (self.last_grad, grad), (self.last_grad2, grad2),
                (self.last_rate, rate), (self.param, self.param - rate * grad)]
Example #3
    def from_partial(self, X, dX):
        eps = 1e-10
        U, S, V = X
        dU, dS, dV = dX

        umask = 1 - (1 - tensor.isnan(dU)) * (1 - tensor.isinf(dU)) # indicators of nan/inf values
        vmask = 1 - (1 - tensor.isnan(dV)) * (1 - tensor.isinf(dV)) # indicators of nan/inf values

        # U S V => U mask product by columns, V by rows
        smask = 1 - tensor.prod(1 - umask, axis=0) * tensor.prod(1 - vmask, axis=1)
        S = tensor.diag(S)

        dU = tensor.set_subtensor(dU[umask.nonzero()], 0.0)
        S_pinv = tensor.switch(tensor.gt(abs(S), eps), 1.0 / S, 0.0)
        S_pinv = tensor.set_subtensor(S_pinv[smask.nonzero()], 0.0)
        S_pinv = tensor.diag(S_pinv)
        dV = tensor.set_subtensor(dV[vmask.nonzero()], 0.0)
        ZV = dU.dot(S_pinv)
        UtZV = dS
        ZtU = S_pinv.dot(dV)

        Zproj = (ZV - U.dot(UtZV), UtZV, ZtU - (UtZV.dot(V)))
        return Zproj
Example #4
def get_nesterov_sgd_updates(param_list, gradients, velocities, lr, mu):
    """Do SGD updates with Nesterov momentum."""
    updates = []
    for p, g, v in zip(param_list, gradients, velocities):
        new_v = mu * v - lr * g
        new_p = p - mu * v + (1 + mu) * new_v
        has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                          T.any(T.isnan(new_v) + T.isinf(new_v)))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
        updates.append((v, ifelse(has_non_finite, v, new_v)))
    return updates
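A minimal usage sketch for the helper above (hypothetical toy parameters and cost, not taken from the original project; assumes Theano and NumPy are available, that get_nesterov_sgd_updates is in scope, and that ifelse comes from theano.ifelse in the module defining it):

import numpy as np
import theano
import theano.tensor as T

# One zero-initialised "velocity" shared variable per parameter, matching its shape.
params = [theano.shared(np.random.randn(4, 4).astype(theano.config.floatX)),
          theano.shared(np.zeros(4, dtype=theano.config.floatX))]
velocities = [theano.shared(np.zeros_like(p.get_value())) for p in params]

cost = sum((p ** 2).sum() for p in params)  # toy quadratic cost
grads = T.grad(cost, params)

updates = get_nesterov_sgd_updates(params, grads, velocities, lr=0.01, mu=0.9)
train = theano.function([], cost, updates=updates)  # each call applies one guarded step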
Example #5
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.0)

    i = shared(floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - (1.0 - b1) ** i_t
    fix2 = 1.0 - (1.0 - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        # e_t = shared(p.get_value() * 0.)
        # de_t = (srnd.normal(p.shape, std = 0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05  #*p_t
        # p_t = p_t + de_t
        # updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example #6
def clip_grad_remove_nan(grads,
                         clip_c_shared,
                         mt_tparams,
                         freeze_word_emb=False,
                         only_word_att=False,
                         gated_att=False):
    g2 = 0.
    for g in grads:
        g2 += (g * g).sum()
    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))
    if clip_c_shared.get_value() > 0.:
        new_grads = []
        for g, p in zip(
                grads,
                itemlist(mt_tparams, freeze_word_emb, only_word_att,
                         gated_att)):
            tmpg = tensor.switch(g2 > (clip_c_shared * clip_c_shared),
                                 g / tensor.sqrt(g2) * clip_c_shared, g)
            new_grads.append(
                tensor.switch(not_finite,
                              np.float32(.1) * p, tmpg))

        return new_grads, tensor.sqrt(g2)
    else:
        return grads, tensor.sqrt(g2)
Example #7
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        """
        for p, g in grads.items():
            grads[p] = g / self.batch_size
        g_norm = 0.
        for g in grads.values():
            g_norm += (g**2).sum()
        """
        g_norm = 0.
        for p, g in grads.items():
            g /= self.batch_size
            grads[p] = g
            g_norm += (g**2).sum()
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)
        for p, g in grads.items():
            grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        mainloop.grads = grads
Example #8
def pseudograd(loss, params, srng=None, temperature = 1.0e-1,
               learning_rate=1.0e-2, rho2=0.95):


  one = T.constant(1.0)
  zero = T.constant(0.0)

  deltas = [ make_normal(param, srng=srng) for param in params ]
  momentum = [ make_copy(param) for param in params ]

  new_params = [
    param + learning_rate * delta
    for param, delta, m in zip(params, deltas, momentum)
  ]

  new_loss = theano.clone(
    loss, replace=dict(zip(params, new_params))
  )

  accepting_p = T.exp((loss - new_loss) / temperature)
  u = srng.uniform(size=(), dtype=loss.dtype)

  cond = T.or_(T.or_(u > accepting_p, T.isnan(new_loss)), T.isinf(new_loss))
  step = T.switch(cond, zero, one)

  updates = OrderedDict()

  for m, delta in zip(momentum, deltas):
    updates[m] = m * rho2 + (one - rho2) * delta * step

  for param, m in zip(params, momentum):
    updates[param] = param + learning_rate * m

  return updates
Example #9
    def get_clip_sgd_updates(self,
                             params,
                             cost,
                             learning_rate,
                             momentum,
                             rescale=5.):
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        grad_norm = T.sqrt(grad_norm)
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        for n, (param, gparam) in enumerate(zip(params, gparams)):
            # clip gradient directly, not momentum etc.
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * gparam
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step
        return updates
Example #10
    def get_gradients(self, model, data, **kwargs):

        cost = self.expr(model=model, data=data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            norm_gs = 0.
            for grad in gradients.values():
                norm_gs += (grad**2).sum()
            not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
            norm_gs = T.sqrt(norm_gs)
            norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                               self.max_magnitude / norm_gs, 1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite, .1 * param,
                                            grad * norm_gs)

        updates = OrderedDict()

        return gradients, updates
Example #11
 def updates(self, params, grads, learning_rate, momentum, rescale=5.):
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1E-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (1. -
                                                  combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg**2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
Example #12
    def shade(self, shape, lights, camera):
        # See: http://en.wikipedia.org/wiki/Phong_reflection_model#Description

        # Since our material params are 1d we calculate bw shadings first and
        # convert to color after
        light = lights[0]
        material = shape.material
        normals = shape.normals(camera.rays)

        ambient_light = material.ka

        # diffuse (lambertian)
        diffuse_shadings = material.kd*T.tensordot(normals, -light.normed_dir(), 1)

        # specular
        rm = 2.0*(T.tensordot(normals, -light.normed_dir(), 1).dimshuffle(
            0, 1, 'x'))*normals + light.normed_dir()
        specular_shadings = material.ks*(T.tensordot(rm, camera.look_at, 1) ** material.shininess)

        # phong
        phong_shadings = ambient_light + diffuse_shadings + specular_shadings

        colorized = phong_shadings.dimshuffle(0, 1, 'x') * material.color.dimshuffle('x', 'x', 0) * light.intensity.dimshuffle('x', 'x', 0)
        clipped = T.clip(colorized, 0, 1)
        distances = shape.distance(camera.rays)
        return broadcasted_switch(T.isinf(distances), [0., 0., 0.], clipped)
Example #13
    def compute_step(self, param, previous_step):
        not_finite = tensor.any(
            tensor.or_(tensor.isnan(previous_step),
                       tensor.isinf(previous_step)))
        step = tensor.switch(not_finite, self.scaler * param, previous_step)

        return step, []
Example #14
def recurrence(log_p_curr, log_p_prev, skip_mask=None):
    if skip_mask is None:
        skip_mask = T.ones_like(log_p_curr[:, 1:-2:2])


    # normalise and bring back to p space
    k = T.max(log_p_prev, axis=1, keepdims=True)
    norm_p_prev = T.switch(
        T.isinf(log_p_prev), 0, T.exp(log_p_prev - k))  # set -inf to 0

    # previous
    _result = norm_p_prev
    # add shift of previous
    _result = T.inc_subtensor(_result[:, 1:], norm_p_prev[:, :-1])
    # add skips of previous
    _result = T.inc_subtensor(_result[:, 3::2],
                              T.switch(skip_mask, norm_p_prev[:, 1:-2:2], 0))
    # current
    # log(p) should be 0 for first 2 terms
    result = T.switch(
        T.eq(_result, 0),
        -np.inf,
        log_p_curr + T.log(_result) + k
    )
    return result
Example #15
File: sgd.py Project: frsong/pyrl
    def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999,
                    epsilon=1e-8, grads=None):
        # Gradients
        if grads is None:
            grads = tensor.grad(loss, self.trainables)

        # Clipping
        norm  = tensor.sqrt(sum([tensor.sqr(g).sum() for g in grads]))
        m     = theanotools.clipping_multiplier(norm, max_norm)
        grads = [m*g for g in grads]

        # Safeguard against numerical instability
        new_cond = tensor.or_(tensor.or_(tensor.isnan(norm), tensor.isinf(norm)),
                              tensor.or_(norm < 0, norm > 1e10))
        grads = [tensor.switch(new_cond, np.float32(0), g) for g in grads]

        # Safeguard against numerical instability
        #cond  = tensor.or_(norm < 0, tensor.or_(tensor.isnan(norm), tensor.isinf(norm)))
        #grads = [tensor.switch(cond, np.float32(0), g) for g in grads]

        # New values
        t       = self.time + 1
        lr_t    = lr*tensor.sqrt(1. - beta2**t)/(1. - beta1**t)
        means_t = [beta1*m + (1. - beta1)*g for g, m in zip(grads, self.means)]
        vars_t  = [beta2*v + (1. - beta2)*tensor.sqr(g) for g, v in zip(grads, self.vars)]
        steps   = [lr_t*m_t/(tensor.sqrt(v_t) + epsilon)
                   for m_t, v_t in zip(means_t, vars_t)]

        # Updates
        updates  = [(x, x - step) for x, step in zip(self.trainables, steps)]
        updates += [(m, m_t) for m, m_t in zip(self.means, means_t)]
        updates += [(v, v_t) for v, v_t in zip(self.vars, vars_t)]
        updates += [(self.time, t)]

        return norm, grads, updates
Example #16
 def graves_rmsprop_updates(self, params, grads, learning_rate=1e-4, alpha=0.9, epsilon=1e-4, chi=0.95):
     """
     Alex Graves' RMSProp [1]_.
     .. math ::
         n_{i} &= \chi * n_i-1 + (1 - \chi) * grad^{2}\\
         g_{i} &= \chi * g_i-1 + (1 - \chi) * grad\\
         \Delta_{i} &= \alpha * Delta_{i-1} - learning_rate * grad /
                 sqrt(n_{i} - g_{i}^{2} + \epsilon)\\
         w_{i} &= w_{i-1} + \Delta_{i}
     References
     ----------
     .. [1] Graves, Alex.
         "Generating Sequences With Recurrent Neural Networks", p.23
         arXiv:1308.0850
     """
     updates = []
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param, grad)
         old_square = self.running_square_[n]
         old_avg = self.running_avg_[n]
         old_memory = self.memory_[n]
         new_square = chi * old_square + (1. - chi) * grad ** 2
         new_avg = chi * old_avg + (1. - chi) * grad
         new_memory = alpha * old_memory - learning_rate * grad / T.sqrt(new_square - \
                     new_avg ** 2 + epsilon)
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((old_memory, new_memory))
         updates.append((param, param + new_memory))
     return updates
Example #17
    def surface_pts(self, rayField):

        rf = self.w2o(rayField)

        distance = self.distance(rayField)
        stabilized = T.switch(T.isinf(distance), 1000, distance)
        return rf.origin + (stabilized.dimshuffle(0, 1, 'x') * rf.rays)
Example #18
    def compute_updates(self, training_cost, params):
        updates = []

        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))

        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []

        norm_gs = T.sqrt(sum(T.sum(g**2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

        for p, g in grads.items():
            clip_grads.append((p,
                               T.switch(notfinite,
                                        numpy.float32(.1) * p,
                                        g * normalization)))

        grads = OrderedDict(clip_grads)

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!")
        return updates
Example #19
    def get_gradients(self, model, data, **kwargs):

        cost = self.expr(model=model, data=data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            norm_gs = 0.
            for grad in gradients.values():
                norm_gs += (grad ** 2).sum()
            not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
            norm_gs = T.sqrt(norm_gs)
            norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                               self.max_magnitude / norm_gs,
                               1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite,
                                            .1 * param,
                                            grad * norm_gs)

        updates = OrderedDict()

        return gradients, updates
Example #20
 def minimize(self, loss, momentum, rescale):
     super(RMSPropOptimizer, self).minimize(loss)
     grads = self.gradparams
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1E-4
     updates = []
     params = self.params
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - self.lr * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * self.lr * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     
     return updates
Example #21
    def surface_pts(self, rayField):

        rf = self.w2o(rayField)

        distance = self.distance(rayField)
        stabilized = T.switch(T.isinf(distance), 1000, distance)
        return rf.origin + (stabilized.dimshuffle(0, 1, 'x') * rf.rays)
Example #22
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        g_norm = 0.

        for p, g in grads.items():
            g /= T.cast(self.batch_size, dtype=theano.config.floatX)
            grads[p] = g
            g_norm += (g**2).sum()

        if self.check_nan:
            not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))

        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)

        if self.check_nan:
            for p, g in grads.items():
                grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        else:
            for p, g in grads.items():
                grads[p] = g * scaler

        mainloop.grads = grads
Example #23
 def compute_step(self, parameter, previous_step):
     step_sum = tensor.sum(previous_step)
     not_finite = (tensor.isnan(step_sum) +
                   tensor.isinf(step_sum))
     step = tensor.switch(
         not_finite > 0, (1 - self.scaler) * parameter, previous_step)
     return step, []
Example #24
    def compute_step(self, param, previous_step):
        grad_norm = l2_norm([previous_step])
        not_finite = tensor.or_(tensor.isnan(grad_norm),
                                tensor.isinf(grad_norm))
        step = tensor.switch(not_finite, self.scaler * param, previous_step)

        return step, []
Example #25
def nan_shield(parameters, deltas, other_updates):
    delta_sum = sum(T.sum(d) for d in deltas)
    not_finite = T.isnan(delta_sum) | T.isinf(delta_sum)
    parameter_updates = [(p, T.switch(not_finite, 0.9 * p, p - d))
                         for p, d in izip(parameters, deltas)]
    other_updates = [(p, T.switch(not_finite, p, u)) for p, u in other_updates]
    return parameter_updates, other_updates
Example #26
 def updates(self, cost, params, learning_rate=0.1, momentum=0.95, rescale=5.):
     grads = T.grad(cost, params)
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1e-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
Example #27
def adamgc(cost,
           params,
           lr=0.0002,
           b1=0.1,
           b2=0.001,
           e=1e-8,
           max_magnitude=5.0,
           infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude),
                           max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example #28
    def __init__(self, n_visible, n_hidden=150, n_hidden_recurrent=100, lr=0.001, l2_norm=None, l1_norm=None):
        (v, v_sample, cost, monitor, params, updates_train,
         v_t, updates_generate, n_steps) = build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent, lr, l2_norm=l2_norm,
                                                        l1_norm=l1_norm)

        for param in params:
            gradient = T.grad(cost, param, consider_constant=[v_sample])

            # remove nan and inf values
            not_finite = T.or_(T.isnan(gradient), T.isinf(gradient))
            gradient = T.switch(not_finite, 0.1 * param, gradient)
            # max_grad = param * 1e-3
            # gradient = T.switch(T.gt(gradient, max_grad), max_grad, gradient)

            # momentum
            # velocity = shared_zeros('velocity_' + str(param.name), param.get_value(borrow=True).shape)
            # update = param - T.cast(lr, dtype=dtype) * gradient
            # x = momentum * velocity + update - param
            # updates_train[velocity] = x
            # updates_train[param] = momentum * x + update

            # rmsprop
            accu = shared_zeros('accu_' + str(param.name), param.get_value(borrow=True).shape)
            accu_new = 0.9 * accu + 0.1 * gradient ** 2
            updates_train[accu] = accu_new
            updates_train[param] = param - (lr * gradient / T.sqrt(accu_new + 1e-6))
        self.params = params
        self.train_function = theano.function([v], monitor, updates=updates_train)
        self.generate_function = theano.function([n_steps], v_t, updates=updates_generate)
Example #29
        def lda_logp(rt, gaze, values, error_lls, s_condition_index,
                     s_subject_index, v_condition_index, v_subject_index,
                     tau_condition_index, tau_subject_index,
                     gamma_condition_index, gamma_subject_index,
                     t0_condition_index, t0_subject_index, zerotol):

            # compute drifts
            drift = glam.components.expdrift(
                v[tt.cast(v_subject_index, dtype='int32'),
                  tt.cast(v_condition_index, dtype='int32')][:, None],
                tau[tt.cast(tau_subject_index, dtype='int32'),
                    tt.cast(tau_condition_index, dtype='int32')][:, None],
                gamma[tt.cast(gamma_subject_index, dtype='int32'),
                      tt.cast(gamma_condition_index, dtype='int32')][:, None],
                values, gaze, zerotol)
            glam_ll = glam.components.tt_wienerrace_pdf(
                rt[:, None], drift,
                s[tt.cast(s_subject_index, dtype='int32'),
                  tt.cast(s_condition_index, dtype='int32')][:, None], b,
                t0[tt.cast(t0_subject_index, dtype='int32'),
                   tt.cast(t0_condition_index, dtype='int32')][:,
                                                               None], zerotol)

            # mix likelihoods
            mixed_ll = ((1 - p_error) * glam_ll +
                        p_error * error_lls[subject_idx])

            mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
            mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)
            return tt.sum(tt.log(mixed_ll + zerotol))
Example #30
 def compute_step(self, parameter, previous_step):
     step_sum = tensor.sum(previous_step)
     not_finite = (tensor.isnan(step_sum) +
                   tensor.isinf(step_sum))
     step = tensor.switch(
         not_finite > 0, (1 - self.scaler) * parameter, previous_step)
     return step, []
Example #31
File: ext.py Project: Beronx86/cle
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        g_norm = 0.

        for p, g in grads.items():
            g /= T.cast(self.batch_size, dtype=theano.config.floatX)
            grads[p] = g
            g_norm += (g**2).sum()

        if self.check_nan:
            not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))

        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)

        if self.check_nan:
            for p, g in grads.items():
                grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        else:
            for p, g in grads.items():
                grads[p] = g * scaler

        mainloop.grads = grads
Example #32
    def compute_updates(self, training_cost, params):
        updates = []
         
        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))
        
        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []
        
        norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
         
        for p, g in grads.items():
            clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))
        
        grads = OrderedDict(clip_grads)

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)  
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!") 
        return updates
Example #33
        def lda_logp(rt, gaze, values, error_ll, v_index, tau_index,
                     gamma_index, s_index, t0_index, is_multiplicative,
                     zerotol):

            # compute drifts
            ## Select the right drift function
            drift = ifelse(
                is_multiplicative,
                glam.components.tt_drift_multiplicative(
                    v[0, tt.cast(v_index, dtype='int32')][:, None],
                    tau[0, tt.cast(tau_index, dtype='int32')][:, None],
                    gamma[0, tt.cast(gamma_index, dtype='int32')][:, None],
                    values, gaze, zerotol),
                glam.components.tt_drift_additive(
                    v[0, tt.cast(v_index, dtype='int32')][:, None],
                    tau[0, tt.cast(tau_index, dtype='int32')][:, None],
                    gamma[0, tt.cast(gamma_index, dtype='int32')][:, None],
                    values, gaze, zerotol))
            # drift = driftfun(v[0, tt.cast(v_index, dtype='int32')][:, None],
            #                  tau[0, tt.cast(tau_index, dtype='int32')][:, None],
            #                  gamma[0, tt.cast(gamma_index, dtype='int32')][:, None],
            #                  values,
            #                  gaze,
            #                  zerotol)
            glam_ll = glam.components.tt_wienerrace_pdf(
                rt[:, None], drift,
                s[0, tt.cast(s_index, dtype='int32')][:, None], b,
                t0[0, tt.cast(t0_index, dtype='int32')][:, None], zerotol)

            # mix likelihoods
            mixed_ll = ((1 - p_error) * glam_ll + p_error * error_ll)

            mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
            mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll)
            return tt.log(mixed_ll + zerotol)
Example #34
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)
    
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m) 
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example #35
def find_sigma(X_shared, sigma_shared, N, perplexity, sigma_iters, verbose=0):
    X = T.fmatrix('X')
    sigma = T.fvector('sigma')

    target = np.log(perplexity)

    P = T.maximum(p_ij_conditional_var(X, sigma), epsilon)

    entropy = -T.sum(P * T.log(P), axis=1)

    # Setting update for binary search interval
    sigmin_shared = theano.shared(np.full(N, np.sqrt(epsilon), dtype=floath))
    sigmax_shared = theano.shared(np.full(N, np.inf, dtype=floath))

    sigmin = T.fvector('sigmin')
    sigmax = T.fvector('sigmax')

    upmin = T.switch(T.lt(entropy, target), sigma, sigmin)
    upmax = T.switch(T.gt(entropy, target), sigma, sigmax)

    givens = {
        X: X_shared,
        sigma: sigma_shared,
        sigmin: sigmin_shared,
        sigmax: sigmax_shared
    }
    updates = [(sigmin_shared, upmin), (sigmax_shared, upmax)]

    update_intervals = theano.function([],
                                       entropy,
                                       givens=givens,
                                       updates=updates)

    # Setting update for sigma according to search interval
    upsigma = T.switch(T.isinf(sigmax), sigma * 2, (sigmin + sigmax) / 2.)

    givens = {
        sigma: sigma_shared,
        sigmin: sigmin_shared,
        sigmax: sigmax_shared
    }
    updates = [(sigma_shared, upsigma)]

    update_sigma = theano.function([], sigma, givens=givens, updates=updates)

    for i in range(sigma_iters):
        e = update_intervals()
        update_sigma()
        if verbose:
            print(
                'Finding sigmas... Iteration {0}/{1}: Perplexities in [{2:.4f}, {3:.4f}].'
                .format(i + 1, sigma_iters, np.exp(e.min()), np.exp(e.max())),
                end='\r')
        if np.any(np.isnan(np.exp(e))):
            raise SigmaTooLowException(
                'Invalid sigmas. The perplexity is probably too low.')
    if verbose:
        print('\nDone. Perplexities in [{0:.4f}, {1:.4f}].'.format(
            np.exp(e.min()), np.exp(e.max())))
Example #36
def nan_shield(parameters, deltas, other_updates):
    delta_sum = sum(T.sum(d) for d in deltas)
    not_finite = T.isnan(delta_sum) | T.isinf(delta_sum)
    parameter_updates = [(p, T.switch(not_finite, 0.9 * p, p - d))
                         for p, d in izip(parameters, deltas)]
    other_updates = [(p, T.switch(not_finite, p, u))
                     for p, u in other_updates]
    return parameter_updates, other_updates
Example #37
def acc_cost(log_probs, label_mask, frame_mask, skip_mask=None):
    seq_acc_logp = forward_backward_pass(log_probs, label_mask, frame_mask,
                                         skip_mask)
    k = T.max(seq_acc_logp, axis=2, keepdims=True)
    log_sum_p = T.log(
        T.sum(T.switch(T.isinf(seq_acc_logp), 0, T.exp(seq_acc_logp - k)),
              axis=2)) + k.dimshuffle(0, 1)
    return T.sum(log_sum_p, axis=0)
Example #38
def get_vanilla_sgd_updates(param_list, gradients, lr):
    """Do SGD updates with vanilla step rule."""
    updates = []
    for p, g in zip(param_list, gradients):
        new_p = p - lr * g
        has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
    return updates
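A quick illustrative check of the guard above (a sketch with a hypothetical shared parameter; assumes the function and Theano are available): a step whose result would contain NaN or Inf is rejected and the parameter keeps its previous value.

import numpy as np
import theano
import theano.tensor as T

p = theano.shared(np.ones(3, dtype=theano.config.floatX), name='p')
g = T.vector('g')

step = theano.function([g], updates=get_vanilla_sgd_updates([p], [g], lr=0.1))

step(np.array([0.1, 0.2, 0.3], dtype=theano.config.floatX))     # normal update
step(np.array([np.nan, 0.0, 0.0], dtype=theano.config.floatX))  # rejected by the ifelse guard
print(p.get_value())  # still [0.99, 0.98, 0.97]: the NaN step was skipped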
Example #39
def clip(clip_size,parameters,gradients):
    grad_mag = T.sqrt(sum(T.sum(T.sqr(w)) for w in parameters))
    exploded = T.isnan(grad_mag) | T.isinf(grad_mag)
    scale = clip_size / T.maximum(clip_size,grad_mag)

    return [ T.switch(exploded,
                    0.1 * p,
                    scale * g
                ) for p,g in zip(parameters,gradients) ]
Example #40
def bfgs(inverse_hessian, weight_delta, gradient_delta, maxrho=1e4):
    ident_matrix = cast_float(T.eye(inverse_hessian.shape[0]))
    maxrho = cast_float(maxrho)
    rho = cast_float(1.) / gradient_delta.dot(weight_delta)
    rho = ifelse(T.isinf(rho), maxrho * T.sgn(rho), rho)
    param1 = ident_matrix - T.outer(weight_delta, gradient_delta) * rho
    param2 = ident_matrix - T.outer(gradient_delta, weight_delta) * rho
    param3 = rho * T.outer(weight_delta, weight_delta)
    return param1.dot(inverse_hessian).dot(param2) + param3
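For reference, the expression built here is the standard BFGS inverse-Hessian recursion, with s the weight delta, y the gradient delta, H_k the current inverse Hessian, and \rho = 1 / (y^T s), capped at maxrho (with the appropriate sign) when y^T s is zero and \rho overflows to infinity:

    H_{k+1} = (I - \rho s y^T) H_k (I - \rho y s^T) + \rho s s^T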
Example #41
def clip(clip_size, parameters, gradients):
    grad_mag = T.sqrt(sum(T.sum(T.sqr(w)) for w in parameters))
    exploded = T.isnan(grad_mag) | T.isinf(grad_mag)
    scale = clip_size / T.maximum(clip_size, grad_mag)

    return [
        T.switch(exploded, 0.1 * p, scale * g)
        for p, g in zip(parameters, gradients)
    ]
Example #42
def replace_nans(tensor):
    """
    convert nans and infs to float_max.
    convert -infs to float_min.
    """
    tensor = T.switch(T.isnan(tensor), sys.float_info.max, tensor)
    return T.switch(
        T.isinf(tensor),
        T.switch(T.lt(tensor, 0), sys.float_info.min, sys.float_info.max),
        tensor)
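A small sanity check of the sanitiser above (a sketch; assumes the function is in scope, and uses a float64 input so the results line up with sys.float_info):

import numpy as np
import theano
import theano.tensor as T

v = T.dvector('v')  # float64 vector
sanitize = theano.function([v], replace_nans(v))

print(sanitize(np.array([1.0, np.nan, np.inf, -np.inf])))
# -> [1.0, sys.float_info.max, sys.float_info.max, sys.float_info.min]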
Example #43
def gradient_clipping(grads, tparams, clip_c=1.0):
    g2 = 0.
    for g in grads:
        g2 += (g**2).sum()
    g2 = tensor.sqrt(g2)
    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))
    new_grads = []
    for p, g in zip(tparams.values(), grads):
        new_grads.append(tensor.switch(g2 > clip_c, g * (clip_c / g2), g))
    return new_grads, not_finite, tensor.lt(clip_c, g2)
Example #44
    def normals(self, rayField):
        """Returns the sphere normals at each hit point."""

        rf = self.w2o(rayField)

        distance = self.distance(rayField)
        distance = T.switch(T.isinf(distance), 0, distance)
        projections = (rf.origin) + (distance.dimshuffle(0, 1, 'x') * rf.rays)
        normals = projections / T.sqrt(T.sum(projections**2, 2)).dimshuffle(
            0, 1, 'x')
        return normals  # need to fix
Example #45
def replace_nans(tensor):
    """
    convert nans and infs to float_max.
    convert -infs to float_min.
    """
    tensor = T.switch(T.isnan(tensor), sys.float_info.max, tensor)
    return T.switch(T.isinf(tensor),
                    T.switch(T.lt(tensor, 0),
                             sys.float_info.min,
                             sys.float_info.max),
                    tensor)
Example #46
    def normals(self, rayField):
        """Returns the sphere normals at each hit point."""

        rf = self.w2o(rayField)

        distance = self.distance(rayField)
        distance = T.switch(T.isinf(distance), 0, distance)
        projections = (rf.origin) + (distance.dimshuffle(0, 1, 'x') * rf.rays)
        normals = projections / T.sqrt(
            T.sum(projections ** 2, 2)).dimshuffle(0, 1, 'x')
        return normals # need to fix
Example #47
def step_clipping(params, gparams, scale=1.):
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    multiplier = T.switch(grad_norm < scale, 1., scale / grad_norm)
    _g = []
    for param, gparam in izip(params, gparams):
        tmp_g = gparam * multiplier
        _g.append(T.switch(notfinite, param * 0.1, tmp_g))

    params_clipping = _g

    return params_clipping
Example #48
 def get_output_for(self, input, **kwargs):
     # batch_size, n_channels, n_rows, n_cols = self.input_shape
     input = input - input.min(axis=1, keepdims=True)
     output = input / input.sum(axis=1, keepdims=True)
     # deal with NaN produced because of dividing by 0
     nan_mask = T.isnan(output)
     nan_idx = nan_mask.nonzero()
     output_without_nan = T.set_subtensor(output[nan_idx], 0)
     inf_mask = T.isinf(output_without_nan)
     inf_idx = inf_mask.nonzero()
     output_without_inf = T.set_subtensor(output_without_nan[inf_idx], 0)
     return output_without_inf
Example #49
def step_clipping(params, gparams, scale=1.0):
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
    notfinite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    multiplier = T.switch(grad_norm < scale, 1.0, scale / grad_norm)
    _g = []
    for param, gparam in izip(params, gparams):
        tmp_g = gparam * multiplier
        _g.append(T.switch(notfinite, param * 0.1, tmp_g))

    params_clipping = _g

    return params_clipping
Example #50
def acc_cost(log_probs, label_mask, frame_mask, skip_mask=None):
    seq_acc_logp = forward_backward_pass(
        log_probs,
        label_mask,
        frame_mask,
        skip_mask
    )
    k = T.max(seq_acc_logp, axis=2, keepdims=True)
    log_sum_p = T.log(T.sum(
        T.switch(T.isinf(seq_acc_logp), 0, T.exp(seq_acc_logp - k)),
        axis=2
    )) + k.dimshuffle(0, 1)
    return T.sum(log_sum_p, axis=0)
Example #51
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        for p, g in grads.items():
            g /= self.batch_size
            g_norm = T.sqrt((g**2).sum())
            not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
            scaler = self.scaler / T.maximum(self.scaler, g_norm)
            grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        mainloop.grads = grads
Example #52
    def from_partial(self, X, dX):
        eps=1e-10
        U, S, V = X
        dU, dS, dV = dX

        umask = 1 - (1 - tensor.isnan(dU)) * (1 - tensor.isinf(dU)) # indicators of nan/inf values
        vmask = 1 - (1 - tensor.isnan(dV)) * (1 - tensor.isinf(dV)) # indicators of nan/inf values

        # U S V => U mask product by columns, V by rows
        smask = 1 - tensor.prod(1 - umask, axis=0) * tensor.prod(1 - vmask, axis=1)
        S = tensor.diag(S)

        dU = tensor.set_subtensor(dU[umask.nonzero()], 0.0)
        S_pinv = tensor.switch(tensor.gt(abs(S), eps), 1.0 / S, 0.0)
        S_pinv = tensor.set_subtensor(S_pinv[smask.nonzero()], 0.0)
        S_pinv = tensor.diag(S_pinv)
        dV = tensor.set_subtensor(dV[vmask.nonzero()], 0.0)
        ZV = dU.dot(S_pinv)
        UtZV = dS
        ZtU = S_pinv.dot(dV)

        Zproj = (ZV - U.dot(UtZV), UtZV, ZtU - (UtZV.dot(V)))
        return Zproj
Example #53
File: ext.py Project: lipengyu/cle
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        grads = mainloop.grads
        g_norm = sum([T.sqr(x/self.batch_size).sum()
                      for x in grads.values()])
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)
        for p, g in grads.items():
            grads[p] = T.switch(not_finite, 0.1 * p, g * scaler)
        mainloop.grads = grads
Example #54
 def gradient_descent(self, loss):
     """Momentum GD with gradient clipping."""
     grad = T.grad(loss, self.params)
     self.momentum_velocity_ = [0.0] * len(grad)
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
     updates = OrderedDict()
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     scaling_den = T.maximum(5.0, grad_norm)
     for n, (param, grad) in enumerate(zip(self.params, grad)):
         grad = T.switch(not_finite, 0.1 * param, grad * (5.0 / scaling_den))
         velocity = self.momentum_velocity_[n]
         update_step = self.momentum * velocity - self.learning_rate * grad
         self.momentum_velocity_[n] = update_step
         updates[param] = param + update_step
     return updates
Example #55
def find_sigma(X_shared, sigma_shared, N, perplexity, sigma_iters,
               metric, verbose=0):
    """Binary search on sigma for a given perplexity."""
    X = T.fmatrix('X')
    sigma = T.fvector('sigma')

    target = np.log(perplexity)

    P = T.maximum(p_Xp_given_X_var(X, sigma, metric), epsilon)

    entropy = -T.sum(P*T.log(P), axis=1)

    # Setting update for binary search interval
    sigmin_shared = theano.shared(np.full(N, np.sqrt(epsilon), dtype=floath))
    sigmax_shared = theano.shared(np.full(N, np.inf, dtype=floath))

    sigmin = T.fvector('sigmin')
    sigmax = T.fvector('sigmax')

    upmin = T.switch(T.lt(entropy, target), sigma, sigmin)
    upmax = T.switch(T.gt(entropy, target), sigma, sigmax)

    givens = {X: X_shared, sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigmin_shared, upmin), (sigmax_shared, upmax)]

    update_intervals = theano.function([], entropy, givens=givens,
                                       updates=updates)

    # Setting update for sigma according to search interval
    upsigma = T.switch(T.isinf(sigmax), sigma*2, (sigmin + sigmax)/2.)

    givens = {sigma: sigma_shared, sigmin: sigmin_shared,
              sigmax: sigmax_shared}
    updates = [(sigma_shared, upsigma)]

    update_sigma = theano.function([], sigma, givens=givens, updates=updates)

    for i in range(sigma_iters):
        e = update_intervals()
        update_sigma()
        if verbose:
            print('Iteration: {0}.'.format(i+1))
            print('Perplexities in [{0:.4f}, {1:.4f}].'.format(np.exp(e.min()),
                  np.exp(e.max())))

    if np.any(np.isnan(np.exp(e))):
        raise Exception('Invalid sigmas. The perplexity is probably too low.')
Example #56
def bfgs(inverse_hessian, weight_delta, gradient_delta, maxrho=1e4):
    ident_matrix = T.eye(inverse_hessian.shape[0])

    maxrho = asfloat(maxrho)
    rho = asfloat(1.) / gradient_delta.dot(weight_delta)

    rho = ifelse(
        T.isinf(rho),
        maxrho * T.sgn(rho),
        rho,
    )

    param1 = ident_matrix - T.outer(weight_delta, gradient_delta) * rho
    param2 = ident_matrix - T.outer(gradient_delta, weight_delta) * rho
    param3 = rho * T.outer(weight_delta, weight_delta)

    return param1.dot(inverse_hessian).dot(param2) + param3
Example #57
    def getUpdates(self):
        params = self.params
        lr = self.lr
        momentum = self.momentum
        rescale = self.rescale
        gparams = self.gparams
        updates = OrderedDict()

        if not hasattr(self, "running_average_"):
            self.running_square_ = [0.] * len(gparams)
            self.running_avg_ = [0.] * len(gparams)
            self.updates_storage_ = [0.] * len(gparams)

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        grad_norm = T.sqrt(grad_norm)
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        for n, (param, gparam) in enumerate(zip(params, gparams)):
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            combination_coeff = 0.9
            minimum_grad = 1e-4
            old_square = self.running_square_[n]
            new_square = combination_coeff * old_square + (
                1. - combination_coeff) * T.sqr(gparam)
            old_avg = self.running_avg_[n]
            new_avg = combination_coeff * old_avg + (
                1. - combination_coeff) * gparam
            rms_grad = T.sqrt(new_square - new_avg ** 2)
            rms_grad = T.maximum(rms_grad, minimum_grad)
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - lr * (
                gparam / rms_grad)
            self.running_square_[n] = new_square
            self.running_avg_[n] = new_avg
            self.updates_storage_[n] = update_step
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step

        return updates
Example #58
    def ada_delta(self, loss, rho=0.95, eps=1e-8):
        '''AdaDelta with Gradient Clipping'''
        grad = T.grad(loss, self.params)
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_den = T.maximum(5.0, grad_norm)
        updates = OrderedDict()

        for n, (param, grad, grad_sq_old, delta_sq_old) in enumerate(zip(self.params, grad, self.gradients_sq, self.deltas_sq)):
            grad = T.switch(not_finite, 0.01 * param,
                            grad * (5.0 / scaling_den))
            grad_sq_new = rho*grad_sq_old + (1-rho)*(grad**2)
            delta = (T.sqrt(delta_sq_old+eps)/T.sqrt(grad_sq_new+eps))*grad
            delta_sq_new = rho*delta_sq_old + (1-rho)*delta**2
            updates[param] = param - delta
            updates[grad_sq_old] = grad_sq_new
            updates[delta_sq_old] = delta_sq_new
        return updates
Example #59
def clip_gradients_norm(gradients, threshold, parameters, fix_nan=False):
	gradient_sqr_vec = T.concatenate([T.sqr(g.flatten()) for g in gradients])
	gradient_norm = T.sqrt(gradient_sqr_vec.sum())
	rescale = T.maximum(gradient_norm, threshold)
	if fix_nan:
		isnan = T.or_(T.isnan(gradient_norm), T.isinf(gradient_norm))
	else:
		isnan = None
	rv = []
	for i, g in enumerate(gradients):
		if fix_nan:
			alt_g = 0.1 * parameters[i]
			print_alt_g = Print("NaN detected! Fixing with pseudogradient with mean:", ["mean"])(alt_g)
			new_g = T.switch(isnan, print_alt_g, g / rescale)
		else:
			new_g = g / rescale
		rv.append(new_g)
	return rv