Example #1
    def wake(self, wake_data, it):

        ddc = self.reg
        self.wake_data = wake_data.copy()

        gs = []

        nl_obs = self.nlayer - 1

        if self.layer_plastic[-1]:

            grad_name = "x%d->dlogp%d" % (nl_obs, nl_obs)
            g = self.reg.predict(self.wake_data, grad_name)
            self.wake_data["dlogp%d" % (nl_obs)] = g
            gs.insert(0, g.mean(0))

            # NOT NECESSARY
            grad_name = "x%d->dnorm%d" % (nl_obs, nl_obs)
            g = self.reg.predict(self.wake_data, grad_name)
            self.wake_data["dnorm%d" % (nl_obs)] = g

            grad_name = "x%d->dnatsuff%d" % (nl_obs, nl_obs)
            g = self.reg.predict(self.wake_data, grad_name)
            self.wake_data["dnatsuff%d" % (nl_obs)] = g

        else:
            gs.insert(0, np.zeros_like(self.model.dists[-1].ps))

        if self.nlayer > 1:

            for i in range(self.nlayer - 2, -1, -1):
                mean_name = "mx%d_x%d" % (i, nl_obs)
                if i == self.nlayer - 2:
                    fun_name = "x%d->x%d" % (nl_obs, self.nlayer - 2)
                else:
                    fun_name = "mx%d_x%d->x%d" % (i + 1, nl_obs, i)
                self.wake_data[mean_name] = ddc.predict(
                    self.wake_data, fun_name)

                if self.layer_plastic[i]:

                    grad_name = "x%d->dlogp%d" % (i, i)
                    g = self.approx_E(mean_name, grad_name)
                    self.wake_data["dlogp%d" % i] = g
                    gs.insert(0, g.mean(0))

                    # NOT NECESSARY
                    grad_name = "x%d->dnorm%d" % (i, i)
                    g = self.approx_E(mean_name, grad_name)
                    self.wake_data["dnorm%d" % (i)] = g

                    grad_name = "x%d->dnatsuff%d" % (i, i)
                    g = self.approx_E(mean_name, grad_name)
                    self.wake_data["dnatsuff%d" % i] = g

                else:
                    gs.insert(0, np.zeros_like(self.model.dists[i].ps))
        gs = np.concatenate(gs)

        self.gradient_step(gs, it)
Example #2
 def _setup(self, network):
     self.accu = defaultdict(dict)
     self.d_accu = defaultdict(dict)
     for i, layer in enumerate(network.parametric_layers):
         for n in layer.parameters.keys():
             self.accu[i][n] = np.zeros_like(layer.parameters[n])
             self.d_accu[i][n] = np.zeros_like(layer.parameters[n])
Example #3
def adam(data,
         paramvec,
         loss,
         batch_size,
         rate,
         epochs=1,
         b1=0.9,
         b2=0.999,
         epsilon=1e-8,
         callback=None):
    m = np.zeros_like(paramvec)
    v = np.zeros_like(paramvec)
    vals = []
    i = 0

    for epoch in range(epochs):
        for minibatch in make_batches(batch_size, data):
            val, g = vgrad(loss)(paramvec, *minibatch)
            m = (1. - b1) * g + b1 * m
            v = (1. - b2) * g**2 + b2 * v
            mhat = m / (1 - b1**(i + 1))
            vhat = v / (1 - b2**(i + 1))
            paramvec -= rate * mhat / (np.sqrt(vhat) + epsilon)
            vals.append(val)
            i += 1
        if callback: callback(epoch, paramvec, vals)
    return paramvec
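
A minimal usage sketch for Example #3. The original relies on two helpers from its repository, vgrad and make_batches, which are not shown here, so this sketch substitutes illustrative stand-ins (autograd's value_and_grad and a simple slicer) and a toy least-squares loss; none of these names come from the original code.

import autograd.numpy as np
from autograd import value_and_grad

vgrad = value_and_grad                     # stand-in for the repository's vgrad

def make_batches(batch_size, data):
    # stand-in batcher: slice (X, y) into consecutive minibatches
    X, y = data
    for start in range(0, X.shape[0], batch_size):
        yield X[start:start + batch_size], y[start:start + batch_size]

def loss(w, Xb, yb):
    # toy least-squares loss; any autograd-differentiable loss works here
    return np.mean((np.dot(Xb, w) - yb) ** 2)

X = np.random.randn(256, 5)
y = np.dot(X, np.arange(5.0))
w_fit = adam((X, y), np.zeros(5), loss, batch_size=32, rate=0.05, epochs=50)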
Example #4
def viterbi(log_pi0, log_Ps, ll):
    """
    Find the most likely state sequence

    This is modified from pyhsmm.internals.hmm_states
    by Matthew Johnson.
    """
    T, K = ll.shape

    # Check if the transition matrices are stationary or
    # time-varying (hetero)
    hetero = (log_Ps.shape[0] == T - 1)
    if not hetero:
        assert log_Ps.shape[0] == 1

    # Pass max-sum messages backward
    scores = np.zeros_like(ll)
    args = np.zeros_like(ll, dtype=int)
    for t in range(T - 2, -1, -1):
        vals = log_Ps[t * hetero] + scores[t + 1] + ll[t + 1]
        args[t + 1] = vals.argmax(axis=1)
        scores[t] = vals.max(axis=1)

    # Now maximize forwards
    z = np.zeros(T, dtype=int)
    z[0] = (scores[0] + log_pi0 + ll[0]).argmax()
    for t in range(1, T):
        z[t] = args[t, z[t - 1]]

    return z
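
A small synthetic check of Example #4, assuming only numpy. The sticky transition matrix and the likelihoods peaked on a known state path are illustrative; with emissions this strong the decoded path should match true_z.

import numpy as np

K, T = 3, 6
log_pi0 = np.log(np.ones(K) / K)
P = np.full((K, K), 0.1) + 0.7 * np.eye(K)   # rows sum to 1, sticky transitions
log_Ps = np.log(P)[None, :, :]               # stationary case: shape (1, K, K)
true_z = np.array([0, 0, 1, 1, 2, 2])
ll = np.log(np.full((T, K), 0.05))
ll[np.arange(T), true_z] = np.log(0.9)       # observations strongly favor true_z

z_hat = viterbi(log_pi0, log_Ps, ll)
print(z_hat)                                 # expected: [0 0 1 1 2 2]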
Example #5
def _make_grad_hmm_normalizer(argnum, ans, log_pi0, log_Ps, ll):
    # Unbox the inputs if necessary
    unbox = lambda x: x if isinstance(x, np.ndarray) else x._value
    log_pi0 = unbox(log_pi0)
    log_Ps = unbox(log_Ps)
    ll = unbox(ll)

    # Make sure everything is C contiguous
    to_c = lambda arr: np.copy(arr, 'C') if not arr.flags['C_CONTIGUOUS'
                                                          ] else arr
    log_pi0 = to_c(log_pi0)
    log_Ps = to_c(log_Ps)
    ll = to_c(ll)

    dlog_pi0 = np.zeros_like(log_pi0)
    dlog_Ps = np.zeros_like(log_Ps)
    dll = np.zeros_like(ll)
    T, K = ll.shape

    # Forward pass to get alphas
    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    if argnum == 0:
        return lambda g: g * dlog_pi0
    if argnum == 1:
        return lambda g: g * dlog_Ps
    if argnum == 2:
        return lambda g: g * dll
Example #6
    def fit_adam(self, X, Y, disp=False, n_epochs=10, batch_size=8, drop_first=None,
                 beta_1=0.9, beta_2=0.999, learning_rate=1e-3):

        m = int(X.shape[0] / batch_size)
        ps = self.get_params()
        mt = np.zeros_like(ps)
        vt = np.zeros_like(ps)
        args = [X, Y, drop_first]

        loss = 1e3

        t = 0
        for i in range(0, n_epochs):
            for j in range(0, m):

                bx = X[j * batch_size:(j + 1) * batch_size]
                by = Y[j * batch_size:(j + 1) * batch_size]
                args = [bx, by, drop_first]
                t = t + 1
                ps = self.get_params()
                g = self.errf_grad(ps, args)
                mt = beta_1 * mt + (1.0 - beta_1) * g
                vt = beta_2 * vt + (1.0 - beta_2) * g ** 2

                amt = mt / (1.0 - beta_1 ** t)
                avt = vt / (1.0 - beta_2 ** t)
                ps_new = ps - learning_rate * amt / (np.sqrt(avt) + 1e-8)
                self.set_params(ps_new)
            args = [X, Y, drop_first]
            loss = self.errf(ps_new, args)
            print("epoch", i, "loss", loss)
Example #7
        def compute_f_fprime_t_avg_(W, perturbation, burn_in=0.5, max_dist=1):
            W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(
                W)
            fval = compute_f_(Eta, Xi, s02)
            fprimeval = compute_fprime_(Eta, Xi, s02)
            u0, u1, u2, u3 = compute_us(W, fval, fprimeval)
            resEta = Eta - u0 - u2
            resXi = Xi - u1 - u3
            YY = fval + perturbation
            YYp = fprimeval + 0

            YYmean = np.zeros_like(Eta)
            YYprimemean = np.zeros_like(Eta)

            def dYYdt(YY, Eta1, Xi1):
                return -YY + compute_f_(Eta1, Xi1, s02)

            def dYYpdt(YYp, Eta1, Xi1):
                return -YYp + compute_fprime_(Eta1, Xi1, s02)

            for t in range(niter):
                if np.mean(np.abs(YY - fval)) < max_dist:
                    u0, u1, u2, u3 = compute_us(W, YY, YYp)
                    Eta1 = resEta + u0 + u2
                    Xi1 = resXi + u1 + u3
                    YY = YY + dt * dYYdt(YY, Eta1, Xi1)
                    YYp = YYp + dt * dYYpdt(YYp, Eta1, Xi1)
                elif np.remainder(t, 500) == 0:
                    print('unstable fixed point?')
                if t > niter * burn_in:
                    YYmean = YYmean + 1 / niter / burn_in * YY
                    YYprimemean = YYprimemean + 1 / niter / burn_in * YYp

            return YYmean, YYprimemean
Example #8
  def __init__(self, W, learning_rate=10e-5, decay=0.95, blend=0.95):
    """
    This is the Alex Graves RMSProp variant from
    Generating Sequences with Recurrent Neural Networks.

    It scales parameter updates by a running estimate of the variance
    of the parameter rather than just a running estimate of the magnitude.

    decay governs how fast the momentum falls off.

    blend governs the extent to which we take the current parameter value
      into account when updating our estimate of variance.
    """
    self.lr = learning_rate
    self.d = decay
    self.b = blend

    self.ns  = {} # store the mean of the square
    self.gs  = {} # store the mean, which will later be squared
    self.ms  = {} # momentum
    self.qs  = {} # update norm over param norm - ideally this stays around 10e-3
    for k, v in W.iteritems():
      self.ns[k]  = np.zeros_like(v)
      self.gs[k]  = np.zeros_like(v)
      self.ms[k]  = np.zeros_like(v)
      self.qs[k]  = self.lr
Example #9
def apply_gradient_adam(x,
                        g,
                        i_batch,
                        m=None,
                        v=None,
                        step_size=0.001,
                        b1=0.9,
                        b2=0.999,
                        eps=1e-7,
                        verbose=True):

    g = np.array(g)
    if m is None or v is None:
        m = np.zeros_like(x)
        v = np.zeros_like(x)
    m = (1 - b1) * g + b1 * m  # First moment estimate.
    v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
    mhat = m / (1 - b1**(i_batch + 1))  # Bias correction.
    vhat = v / (1 - b2**(i_batch + 1))
    d = step_size * mhat / (np.sqrt(vhat) + eps)
    x = x - d
    if verbose:
        try:
            print_flush(
                '  Step size modifier is {}.'.format(
                    np.mean(mhat / (np.sqrt(vhat) + eps))), 0, comm.Get_rank())
        except:
            print('  Step size modifier is {}.'.format(
                np.mean(mhat / (np.sqrt(vhat) + eps))))
    return x, m, v
Example #10
 def compute_f_fprime_t_avg_(W,perturbation,burn_in=0.5,max_dist=1):
     Wmx,Wmy,Wsx,Wsy,s02,K,kappa,T,XX,XXp,Eta,Xi,h = parse_W(W)
     fval = compute_f_(Eta,Xi,s02)
     fprimeval = compute_fprime_(Eta,Xi,s02)
     resEta = Eta - u_fn(XX,fval,Wmx,Wmy,K,kappa,T)
     resXi  = Xi - u_fn(XX,fval,Wsx,Wsy,K,kappa,T)
     YY = fval + perturbation
     YYp = fprimeval
     YYmean = np.zeros_like(Eta)
     YYprimemean = np.zeros_like(Eta)
     def dYYdt(YY,Eta1,Xi1):
         return -YY + compute_f_(Eta1,Xi1,s02)
     def dYYpdt(YYp,Eta1,Xi1):
         return -YYp + compute_fprime_(Eta1,Xi1,s02)
     for t in range(niter):
         if np.mean(np.abs(YY-fval)) < max_dist:
             Eta1 = resEta + u_fn(XX,YY,Wmx,Wmy,K,kappa,T)
             Xi1 = resXi + u_fn(XX,YY,Wsx,Wsy,K,kappa,T)
             YY = YY + dt*dYYdt(YY,Eta1,Xi1)
             YYp = YYp + dt*dYYpdt(YYp,Eta1,Xi1)
         else:
             print('unstable fixed point?')
         #Eta1 = resEta + u_fn(XX,YY,Wmx,Wmy,K,kappa,T)
         #Xi1 = resXi + u_fn(XX,YY,Wsx,Wsy,K,kappa,T)
         #YY = YY + dt*dYYdt(YY,Eta1,Xi1)
         if t>niter*burn_in:
             #YYp = compute_fprime_(Eta1,Xi1,s02)
             YYmean = YYmean + 1/niter/burn_in*YY
             YYprimemean = YYprimemean + 1/niter/burn_in*YYp
         
     return YYmean,YYprimemean
Example #11
def _make_grad_hmm_normalizer(argnum, ans, pi0, Ps, ll):
    # Make sure everything is C contiguous and unboxed
    pi0 = to_c(pi0)
    Ps = to_c(Ps)
    ll = to_c(ll)

    dlog_pi0 = np.zeros_like(pi0)
    dlog_Ps = np.zeros_like(Ps)
    dll = np.zeros_like(ll)
    T, K = ll.shape

    # Forward pass to get alphas
    alphas = np.zeros((T, K))
    forward_pass(pi0, Ps, ll, alphas)
    log_Ps = np.log(Ps + LOG_EPS) - logsumexp(Ps, axis=1, keepdims=True)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    # Compute necessary gradient
    # Account for the log transformation
    # df/dP = df/dlogP * dlogP/dP = df/dlogP * 1 / P
    if argnum == 0:
        return lambda g: g * dlog_pi0 / (pi0 + DIV_EPS)
    if argnum == 1:
        return lambda g: g * dlog_Ps / (Ps + DIV_EPS)
    if argnum == 2:
        return lambda g: g * dll
Example #12
    def compute_eig_penalty(W):
        # still need to finish! Hopefully won't need
        W0mx,W0my,W0sx,W0sy,s020,K0,kappa0,T0,XX0,XXp0,Eta0,Xi0,h10,h20,Eta10,Eta20 = parse_W(W)
        eig_penalty_dir_w,eig_penalty_dir_k,eig_penalty_dir_kappa = compute_eig_penalty_(W0my,k0,kappa0)
        eig_penalty_W = unparse_W(np.zeros_like(W0mx),eig_penalty_dir_w,np.zeros_like(W0sx),np.zeros_like(W0sy),np.zeros_like(s020),eig_penalty_dir_k,eig_penalty_dir_kappa,np.zeros_like(XX0),np.zeros_like(XXp0),np.zeros_like(Eta0),np.zeros_like(Xi0))
#         assert(True==False)
        return eig_penalty_W
Example #13
def stf_4dim_time_day(tensor, r, random_seed=0, num_iter=100, eps=1e-8, lr=1):
    np.random.seed(random_seed)
    args_num = [1, 2, 3, 4]

    def cost(tensor, home, appliance, day, hour):
        pred = np.einsum('Hr, Ar, ADr, ATr ->HADT', home, appliance, day, hour)
        mask = ~np.isnan(tensor)
        error = (pred - tensor)[mask].flatten()

        return np.sqrt((error**2).mean())

    mg = multigrad(cost, argnums=args_num)
    sizes = [(x, r) for x in tensor.shape]
    # ADr
    sizes[-2] = (tensor.shape[1], tensor.shape[-2], r)
    # ATr
    sizes[-1] = (tensor.shape[1], tensor.shape[-1], r)
    home = np.random.rand(*sizes[0])
    appliance = np.random.rand(*sizes[1])
    day = np.random.rand(*sizes[2])
    hour = np.random.rand(*sizes[3])

    sum_home = np.zeros_like(home)
    sum_appliance = np.zeros_like(appliance)
    sum_day = np.zeros_like(day)
    sum_hour = np.zeros_like(hour)

    # GD procedure
    for i in range(num_iter):
        del_home, del_appliance, del_day, del_hour = mg(
            tensor, home, appliance, day, hour)

        sum_home += eps + np.square(del_home)
        lr_home = np.divide(lr, np.sqrt(sum_home))
        home -= lr_home * del_home

        sum_appliance += eps + np.square(del_appliance)
        lr_appliance = np.divide(lr, np.sqrt(sum_appliance))
        appliance -= lr_appliance * del_appliance

        sum_day += eps + np.square(del_day)
        lr_day = np.divide(lr, np.sqrt(sum_day))
        day -= lr_day * del_day

        sum_hour += eps + np.square(del_hour)
        lr_hour = np.divide(lr, np.sqrt(sum_hour))
        hour -= lr_hour * del_hour

        # Projection to non-negative space
        home[home < 0] = 1e-8
        appliance[appliance < 0] = 1e-8
        day[day < 0] = 1e-8
        hour[hour < 0] = 1e-8

        if i % 50 == 0:
            print(cost(tensor, home, appliance, day, hour), i)
            sys.stdout.flush()

    return home, appliance, day, hour
Example #14
 def _control(self, xs, us, k, K, alpha=1):
     xs_new = np.zeros_like(xs)
     us_new = np.zeros_like(us)
     xs_new[0] = xs[0].copy()
     for i in range(self.horizon):
         us_new[i] = us[i] + alpha * k[i] + K[i].dot(xs_new[i] - xs[i])
         xs_new[i+1] = self.dynamics(xs_new[i], us_new[i])
     return xs_new, us_new
Example #15
def job_per_round(f, x0, obsv, avg=True, decay=True, callback=None, **kwargs):
    '''ASGD, requiring index of observation be passed to loss func'''
    x_avg = x0.copy()  # running avg optimal
    x_hat = x0.copy()  # per-round optimal

    reg = kwargs.get('reg', 1e-2)
    n_rep = kwargs.get('n_rep', 10)  # gradient steps per observation
    γ0 = kwargs.get('learning_rate', 0.1)

    epochs = kwargs.get('epochs', 2)

    # for ADAM
    b1 = kwargs.get('b1', 0.9)
    b2 = kwargs.get('b2', 0.999)
    eps = kwargs.get('eps', 10**-8)

    η = γ0  # init
    m = np.zeros_like(x0)
    v = np.zeros_like(x0)
    μ = 1

    if callback is None:
        callback = lambda *args, **kws: None

    for epoch in range(epochs):
        samp = random.sample(obsv, k=len(obsv))
        for n, a in tqdm(enumerate(samp, 1), total=len(samp) - 1):
            n_iter = (epoch * len(obsv) + n) * n_rep
            for i in range(n_iter, n_iter + n_rep):
                f_inst = partial(f, obsv, n - 1)
                g = grad(f_inst)(x_hat, reg=reg)

                m = (1 - b1) * g + b1 * m  # First  moment estimate.
                v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
                mhat = m / (1 - b1**(i + 1))  # Bias correction.
                vhat = v / (1 - b2**(i + 1))

                # exp. learning rate decay
                if decay:
                    η = γ0 * (1 + γ0 * reg * i / len(samp))**(-.75)

                # step w/ momentum
                x_hat = x_hat - η * mhat / (np.sqrt(vhat) + eps)
                # x_hat = x_hat - η * g  # no momentum

                # Averaging
                if avg:
                    μ = 1. / np.max([1., i - x_hat.size, i - len(obsv)])
                    x_avg = x_avg + μ * (x_hat - x_avg)
                else:
                    x_avg = x_hat

            P = loss._softmax(loss._symmetrize(x_avg), axis=1)
            callback(n_iter, P, text=f'η={η:.2e}\nμ={μ:.2e}')

    return x_avg
Example #16
def factorization(tensor,
                  num_latent,
                  num_iter=2000,
                  lr=1,
                  dis=False,
                  random_seed=0,
                  eps=1e-8,
                  T_known=None):
    np.random.seed(random_seed)
    cost = cost_abs

    args_num = [0, 1, 2]
    mg = autograd.multigrad(cost, argnums=args_num)
    M, N, K = tensor.shape

    H = np.random.rand(M, num_latent)
    A = np.random.rand(N, num_latent)
    T = np.random.rand(K, num_latent)

    sum_square_gradients_A = np.zeros_like(A)
    sum_square_gradients_H = np.zeros_like(H)
    sum_square_gradients_T = np.zeros_like(T)
    if T_known is not None:
        T = set_known(T, T_known)

    # GD procedure
    for i in range(num_iter):
        del_h, del_a, del_t = mg(H, A, T, tensor)

        sum_square_gradients_A += eps + np.square(del_a)
        lr_a = np.divide(lr, np.sqrt(sum_square_gradients_A))
        A -= lr_a * del_a

        sum_square_gradients_H += eps + np.square(del_h)
        sum_square_gradients_T += eps + np.square(del_t)

        lr_h = np.divide(lr, np.sqrt(sum_square_gradients_H))
        lr_t = np.divide(lr, np.sqrt(sum_square_gradients_T))

        H -= lr_h * del_h
        T -= lr_t * del_t

        if T_known is not None:
            T = set_known(T, T_known)

        # Projection to non-negative space
        H[H < 0] = 1e-8
        A[A < 0] = 1e-8
        T[T < 0] = 1e-8

        if i % 500 == 0:
            if dis:
                print(cost(H, A, T, tensor))

    return H, A, T
Example #17
def _get_grad_log_post(W1D, Wprior, H, y, X, testing=False):
    """Returns multinomial gradient of the negative log posterior probability with C classes.

   Parameters
   ----------
   W1D : array-like, shape (C*p, )
       Flattened vector of parameters at which the negative log posterior is to be evaluated
   Wprior : array-like, shape (C, p)
       vector of prior means on the parameters to be fit
   H : array-like, shape (C*p, C*p) or independent between classes (C, p, p)
       Array of prior Hessian (inverse covariance of prior distribution of parameters)
   y : array-like, shape (N, ) starting at 0
       vector of binary ({0, 1, ... C} possible responses)
   X : array-like, shape (N, p)
       array of features

   Returns
   -------
    grad_log_post1D : array-like, shape (C*p, )
            Flattened gradient of negative log posterior

   References
   ----------
   Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
   Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)
    """

    # calculate gradient log posterior
    C, p = Wprior.shape
    W = W1D.reshape(C, p)

    mu = _get_softmax_probs(X, W)  # shape (N, C)
    grad_log_likelihood = np.zeros_like(W)
    grad_log_prior = np.zeros_like(W)

    for c in range(C):
        if H.shape == (C, p, p):
            grad_log_likelihood[c] = X.T @ (mu[:, c] - np.int32(y == c))
            K = (W[c] - Wprior[c]).reshape(-1)
            grad_log_prior[c] = H[c] @ K
        elif H.shape == (C * p, C * p):
            grad_log_likelihood[c] = X.T @ (mu[:, c] - np.int32(y == c))

    if H.shape == (C * p, C * p):
        K = (W - Wprior).reshape(-1)  # change to shape (C*p, )
        grad_log_prior = H @ K
        grad_log_prior = grad_log_prior.reshape(C, p)  # change to shape (C, p)

    grad_log_posterior = grad_log_likelihood + grad_log_prior
    grad_log_post1D = grad_log_posterior.reshape(-1)

    if testing:
        return [grad_log_post1D, grad_log_likelihood.reshape(-1), grad_log_prior.reshape(-1)]
    else:
        return grad_log_post1D
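
A minimal sketch of calling the gradient above (Example #17). It needs the repository's _get_softmax_probs, which is not shown, so a plain row-wise softmax of X @ W.T stands in for it; the random data and the independent per-class prior Hessian of shape (C, p, p) are illustrative.

import numpy as np

def _get_softmax_probs(X, W):
    # stand-in for the repository helper: class probabilities, shape (N, C)
    logits = X @ W.T
    logits -= logits.max(axis=1, keepdims=True)
    expl = np.exp(logits)
    return expl / expl.sum(axis=1, keepdims=True)

rng = np.random.default_rng(0)
C, p, N = 3, 4, 50
X = rng.normal(size=(N, p))
y = rng.integers(0, C, size=N)
Wprior = np.zeros((C, p))
H = np.tile(np.eye(p), (C, 1, 1))            # independent unit-precision priors
g = _get_grad_log_post(rng.normal(size=C * p), Wprior, H, y, X)
print(g.shape)                               # (C*p,) == (12,)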
Example #18
def test_grad_hmm_normalizer(T=10, K=3):
    pi0, Ps, ll = make_parameters(T, K)
    dlogpi0, dlogPs, dll = np.zeros_like(pi0), np.zeros_like(Ps), np.zeros_like(ll)

    alphas = np.zeros((T, K))
    forward_pass(pi0, Ps, ll, alphas)
    grad_hmm_normalizer(np.log(Ps), alphas, dlogpi0, dlogPs, dll)

    assert np.allclose(dlogpi0 / pi0, grad(hmm_normalizer_np, argnum=0)(pi0, Ps, ll))
    assert np.allclose(dlogPs / Ps, grad(hmm_normalizer_np, argnum=1)(pi0, Ps, ll))
    assert np.allclose(dll, grad(hmm_normalizer_np, argnum=2)(pi0, Ps, ll))
Example #19
def test_grad_hmm_normalizer(T=1000, K=3):
    log_pi0, log_Ps, ll = make_parameters(T, K)
    dlog_pi0, dlog_Ps, dll = np.zeros_like(log_pi0), np.zeros_like(log_Ps), np.zeros_like(ll)

    alphas = np.zeros((T, K))
    forward_pass(-np.log(K) * np.ones(K), log_Ps, ll, alphas)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    assert np.allclose(dlog_pi0, grad(hmm_normalizer_np, argnum=0)(log_pi0, log_Ps, ll))
    assert np.allclose(dlog_Ps, grad(hmm_normalizer_np, argnum=1)(log_pi0, log_Ps, ll))
    assert np.allclose(dll, grad(hmm_normalizer_np, argnum=2)(log_pi0, log_Ps, ll))
Example #20
    def compute_f_fprime_t_avg_12_(W1,
                                   W2,
                                   perturbation,
                                   max_dist=1,
                                   burn_in=0.5):  # max dist added 10/14/20
        #Wmx,Wmy,Wsx,Wsy,s02,Kin,Kout,kappa,Tin,Tout,XX,XXp,Eta,Xi,h1,h2 = parse_W(W)
        W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, W0xrun, W0yrun, s02, Kin0, Kin1, Kxout0, Kyout0, Kxout1, Kyout1, kappa, Tin0, Tin1, Txout0, Tyout0, Txout1, Tyout1, h1, h2, bl, amp = parse_W1(
            W1)
        XX, XXp, Eta, Xi = parse_W2(W2)
        fval = compute_f_(Eta, Xi, s02)
        fprimeval = compute_fprime_(Eta, Xi, s02)
        u0, u1, u2, u3 = compute_us(W1, W2, fval, fprimeval)
        if share_residuals:
            resEta = Eta - u0 - u2
            resXi = Xi - u1 - u3
            resEta12 = np.concatenate((resEta, resEta), axis=0)
            resXi12 = np.concatenate((resXi, resXi), axis=0)
        else:
            resEta12 = 0
            resXi12 = 0
        dHH = np.zeros((nN, nQ * nS * nT))
        dHH[:, np.arange(2, nQ * nS * nT, nQ)] = 1
        dHH = np.concatenate((dHH * h1, dHH * h2), axis=0)
        YY = fval + perturbation
        YYp = fprimeval
        XX12 = np.concatenate((XX, XX), axis=0)
        YY12 = np.concatenate((YY, YY), axis=0)
        YYp12 = np.concatenate((YYp, YYp), axis=0)
        YYmean = np.zeros_like(YY12)
        YYprimemean = np.zeros_like(YY12)

        def dYYdt(YY, Eta1, Xi1):
            return -YY + compute_f_(Eta1, Xi1, s02)

        def dYYpdt(YYp, Eta1, Xi1):
            return -YYp + compute_fprime_(Eta1, Xi1, s02)

        for t in range(niter):
            if np.mean(np.abs(YY - fval)) < max_dist:
                u0, u1, u2, u3 = compute_us(W1, W2, YY12, YYp12)
                Eta121 = resEta12 + u0 + u2 + dHH
                Xi121 = resXi12 + u1 + u3
                YY12 = YY12 + dt * dYYdt(YY12, Eta121, Xi121)
                YYp12 = YYp12 + dt * dYYpdt(YYp12, Eta121, Xi121)
            elif np.remainder(t, 500) == 0:
                print('unstable fixed point?')
            if t > niter * burn_in:
                YYmean = YYmean + 1 / niter / burn_in * YY12
                YYprimemean = YYprimemean + 1 / niter / burn_in * YYp12

        #YYmean = YYmean + np.tile(bl,nS*nT)[np.newaxis,:]

        return YYmean, YYprimemean
Example #21
def apply_gradient_adam(x, g, i_batch, m=None, v=None, step_size=0.001, b1=0.9, b2=0.999, eps=1e-8):

    g = np.array(g)
    if m is None or v is None:
        m = np.zeros_like(x)
        v = np.zeros_like(x)
    m = (1 - b1) * g + b1 * m  # First  moment estimate.
    v = (1 - b2) * (g ** 2) + b2 * v  # Second moment estimate.
    mhat = m / (1 - b1 ** (i_batch + 1))  # Bias correction.
    vhat = v / (1 - b2 ** (i_batch + 1))
    x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return x, m, v
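
A minimal sketch of threading the (x, m, v) state through repeated calls to Example #21, with a hand-written gradient of a toy quadratic; the target values are illustrative.

import numpy as np

target = np.array([1.0, -2.0, 3.0])
x, m, v = np.zeros(3), None, None
for i_batch in range(200):
    g = 2.0 * (x - target)                   # gradient of ||x - target||^2
    x, m, v = apply_gradient_adam(x, g, i_batch, m=m, v=v, step_size=0.1)
print(x)                                     # approaches [1, -2, 3]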
Example #22
    def StepDescent(self, parameters):
        self.para_log = np.log(parameters)
        gradient = self.grad_exp(self.para_log)
        if self.m_para is None:
            self.m_para = np.zeros_like(self.para_log)
            self.s_para = np.zeros_like(self.para_log)

        self.m_para = self.beta1 * self.m_para - (1 - self.beta1) * gradient
        self.s_para = self.beta2 * self.s_para + \
            (1 - self.beta2) * gradient * gradient

        para_temp = self.para_log - self.step_size * self.m_para / np.sqrt(
            self.s_para + 1e-10)
        return np.exp(para_temp), gradient
Example #23
 def neg_ll(self, x, c, n, *params):
     f = np.zeros_like(self.p)
     params = np.reshape(params, (self.m, self.dist.k + 1))
     f = np.zeros_like(x)
     for i in range(self.m):
         like = self.dist.like(x, c, n, *params[i, 1::])
         like = np.multiply(params[i, 0], like)
         f = f + like
     f = np.where(f <= 0, surpyval.TINIEST, f)
     f = np.where(f < 1, f, 1)
     f = np.log(f)
     f = np.multiply(n, f)
     f = -np.sum(f)
     return f
Example #24
    def wake(self, wake_data, it):

        reg = self.reg
        self.wake_data = wake_data.copy()

        gs = []

        nl_obs = (self.nlayer - 1)
        mean_name = "mx%dx%d_x%d" % (nl_obs - 1, nl_obs, nl_obs)
        fun_name = "x%d->x%dx%d" % (nl_obs, nl_obs - 1, nl_obs)
        self.wake_data[mean_name] = reg.predict(self.wake_data, fun_name)

        if self.layer_plastic[-1]:
            grad_name = "x%d->dlogp%d" % (nl_obs, nl_obs)
            g = reg.predict(self.wake_data, grad_name)
            #g = self.approx_E(mean_name, grad_name)
            self.wake_data["dlogp%d" % (nl_obs)] = g
            gs.insert(0, g.mean(0))
        else:
            gs.insert(0, np.zeros_like(self.model.dists[-1].ps))

        for i in range(nl_obs - 1, 0, -1):

            mean_name = "mx%dx%d_x%d" % (i - 1, i, nl_obs)
            fun_name = "mx%dx%d_x%d->x%dx%d" % (i, i + 1, nl_obs, i - 1, i)
            self.wake_data[mean_name] = reg.predict(self.wake_data, fun_name)
            if self.layer_plastic[i]:

                grad_name = "x%dx%d->dlogp%d" % (i - 1, i, i)
                g = self.approx_E(mean_name, grad_name)
                self.wake_data["dlogp%d" % i] = g
                gs.insert(0, g.mean(0))

            else:
                gs.insert(0, np.zeros_like(self.model.dists[i].ps))

        if self.layer_plastic[0]:
            mean_name = "mx0_x%d" % (nl_obs)
            fun_name = "mx0x1_x%d->x0" % (nl_obs)
            self.wake_data[mean_name] = reg.predict(self.wake_data, fun_name)
            g = self.approx_E(mean_name, "x0->dlogp0")
            self.wake_data["dlogp0"] = g
            gs.insert(0, g.mean(0))
        else:
            gs.insert(0, np.zeros_like(self.model.dists[0].ps))

        gs = np.concatenate(gs)

        self.gradient_step(gs, it)
Example #25
def gen_traces(datafiles,
               blcutoff=blcutoff,
               blspan=blspan):  #nbefore=nbefore,nafter=nafter
    trialwise = np.array(())
    ctrialwise = np.array(())
    strialwise = np.array(())
    dfofall = np.array(())
    baselineall = np.array(())
    for datafile in datafiles:
        frm = sio.loadmat(datafile.replace('.rois', '.mat'),
                          squeeze_me=True)['info']['frame'][()][1:]
        with h5py.File(datafile, mode='r') as f:
            to_add = f['corrected'][:].T
            to_add[np.isnan(to_add)] = np.nanmin(to_add)
            #             baseline = np.percentile(to_add,blcutoff,axis=1)
            baseline = sfi.percentile_filter(to_add[:, ::ds], blcutoff,
                                             (1, int(blspan / ds)))
            baseline = np.repeat(baseline, ds, axis=1)
            for i in range(baseline.shape[0]):
                baseline[i] = sfi.gaussian_filter1d(baseline[i], blspan / 2)
#             if baseline.shape[1]<to_add.shape[1]:
#                 baseline = np.hstack((baseline,np.repeat(baseline[:,-1],to_add.shape[1]-baseline.shape[1])))
            if baseline.shape[1] > to_add.shape[1]:
                baseline = baseline[:, :to_add.shape[1]]
            c = np.zeros_like(to_add)
            s = np.zeros_like(to_add)
            dfof = np.zeros_like(to_add)
            for i in range(c.shape[0]):
                #                 dfof = (to_add[i]-baseline[i,np.newaxis])/baseline[i,np.newaxis]
                dfof[i] = (to_add[i] - baseline[i, :]) / (baseline[i, :])
                #try:
                c[i], s[i], _, _, _ = deconvolve(dfof[i].astype(np.float64),
                                                 penalty=1,
                                                 sn=5e-3)
                #except:
                #    print("in "+datafile+" couldn't do "+str(i))
            try:
                trialwise = np.concatenate((trialwise, to_add), axis=0)
                ctrialwise = np.concatenate((ctrialwise, c), axis=0)
                strialwise = np.concatenate((strialwise, s), axis=0)
                dfofall = np.concatenate((dfofall, dfof), axis=0)
                baselineall = np.concatenate((baselineall, baseline), axis=0)
            except:
                trialwise = to_add.copy()
                ctrialwise = c.copy()
                strialwise = s.copy()
                dfofall = dfof.copy()
                baselineall = baseline.copy()
    return trialwise, ctrialwise, strialwise, dfofall, baselineall
Example #26
    def compute_f_fprime_t_avg_12_(W1,
                                   W2,
                                   perturbation,
                                   max_dist=1,
                                   burn_in=0.5):  # max dist added 10/14/20
        #Wmx,Wmy,Wsx,Wsy,s02,K,kappa,T,XX,XXp,Eta,Xi,h1,h2 = parse_W(W)
        Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, h1, h2 = parse_W1(W1)
        XX, XXp, Eta, Xi = parse_W2(W2)
        fval = compute_f_(Eta, Xi, s02)
        fprimeval = compute_fprime_(Eta, Xi, s02)
        if share_residuals:
            resEta = Eta - u_fn(XX, fval, Wmx, Wmy, K, kappa, T)
            resXi = Xi - u_fn(XX, fval, Wsx, Wsy, K, kappa, T)
            resEta12 = np.concatenate((resEta, resEta), axis=0)
            resXi12 = np.concatenate((resXi, resXi), axis=0)
        else:
            resEta12 = 0
            resXi12 = 0
        dHH = np.zeros((nN, nQ * nS * nT))
        dHH[:, np.arange(2, nQ * nS * nT, nQ)] = 1
        dHH = np.concatenate((dHH * h1, dHH * h2), axis=0)
        YY = fval + perturbation
        YYp = fprimeval
        XX12 = np.concatenate((XX, XX), axis=0)
        YY12 = np.concatenate((YY, YY), axis=0)
        YYp12 = np.concatenate((YYp, YYp), axis=0)
        YYmean = np.zeros_like(YY12)
        YYprimemean = np.zeros_like(YY12)

        def dYYdt(YY, Eta1, Xi1):
            return -YY + compute_f_(Eta1, Xi1, s02)

        def dYYpdt(YYp, Eta1, Xi1):
            return -YYp + compute_fprime_(Eta1, Xi1, s02)

        for t in range(niter):
            if np.mean(np.abs(YY - fval)) < max_dist:
                Eta121 = resEta12 + u_fn(XX12, YY12, Wmx, Wmy, K, kappa,
                                         T) + dHH
                Xi121 = resXi12 + u_fn(XX12, YY12, Wmx, Wmy, K, kappa, T)
                YY12 = YY12 + dt * dYYdt(YY12, Eta121, Xi121)
                YYp12 = YYp12 + dt * dYYpdt(YYp12, Eta121, Xi121)
            elif np.remainder(t, 500) == 0:
                print('unstable fixed point?')
            if t > niter * burn_in:
                YYmean = YYmean + 1 / niter / burn_in * YY12
                YYprimemean = YYprimemean + 1 / niter / burn_in * YYp12

        return YYmean, YYprimemean
Example #27
    def updateEmissionDist( self, msg, node_smoothed ):

        emission_dist_numerator = np.zeros_like( msg.emission_dist )
        emission_dist_denominator = np.zeros_like( msg.pi0 )

        # Update the emission distribution
        for node, ys in zip( msg.nodes, msg.ys ):
            measurements = ys.shape[ 0 ]

            for y in ys:
                emission_dist_numerator[ :, y ] += node_smoothed[ node ]
            emission_dist_denominator += node_smoothed[ node ] * measurements

        self.emission_dist.params = ( emission_dist_numerator / emission_dist_denominator[ :, None ], )
        assert np.allclose( self.emission_dist.params[ 0 ].sum( axis=-1 ), 1.0 )
Example #28
    def marginal(self, kernel):
        """
        Calculates the marginal likelihood.
        Args:
            kernel: covariance kernel; if kernel.params is not None, the Ks
                covariances are reconstructed before evaluating
        Returns: np.array for marginal likelihood

        """

        if kernel.params is not None:
            self.Ks = self.construct_Ks()
            self.alpha = np.zeros([self.X.shape[0]])
            self.W = np.zeros([self.X.shape[0]])
            self.grads = np.zeros([self.X.shape[0]])
            self.f = self.mu
            self.f_pred = self.f
            self.run(10)

        Ks = self.Ks
        eigs = [np.expand_dims(np.linalg.eig(K)[0], 1) for K in Ks]
        eig_K = np.squeeze(kron_list(eigs))
        self.eig_K = eig_K

        if self.obs_idx is not None:
            f_lim = self.f[self.obs_idx]
            alpha_lim = self.alpha[self.obs_idx]
            mu_lim = self.mu[self.obs_idx]
            W_lim = self.W[self.obs_idx]
            eig_k_lim = eig_K[self.obs_idx]

            pen = -0.5 * np.sum(np.multiply(alpha_lim,
                                       f_lim - mu_lim))
            pen = np.where(np.isnan(pen), np.zeros_like(pen), pen)
            eigs = 0.5 * np.sum(np.log(1 + np.multiply(eig_k_lim,
                                       W_lim)))
            eigs = np.where(np.isnan(eigs), np.zeros_like(eigs), eigs)
            like = np.sum(self.likelihood.log_like(f_lim, self.y))
            like = np.where(np.isnan(like), np.zeros_like(like), like)

            return -(pen+eigs+like)

        pen = -0.5 * np.sum(np.multiply(self.alpha,
                                   self.f - self.mu))
        eigs = - 0.5*np.sum(np.log(1 +
                                   np.multiply(eig_K, self.W)))
        like = np.sum(self.likelihood.log_like(self.f, self.y))

        return -(pen+eigs+like)
Example #29
def adadelta(paramvec, loss, batches, epochs=1, rho=0.95, epsilon=1e-6, callback=None):
    sum_gsq = np.zeros_like(paramvec)
    sum_usq = np.zeros_like(paramvec)
    vals = []

    for epoch in range(epochs):
        permuted_batches = [batches[i] for i in npr.permutation(len(batches))]
        for im, angle in permuted_batches:
            val, g = vgrad(loss)(paramvec, im, angle)
            sum_gsq = rho*sum_gsq + (1.-rho)*g**2
            ud = -np.sqrt(sum_usq + epsilon) / np.sqrt(sum_gsq + epsilon) * g
            sum_usq = rho*sum_usq + (1.-rho)*ud**2
            paramvec = paramvec + ud
            vals.append(val)
        if callback: callback(epoch, paramvec, vals, permuted_batches)
    return paramvec
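
Example #29 looks up vgrad and npr in its defining module. A minimal sketch, assuming the function is pasted into one file together with these aliases, a toy least-squares loss, and randomly generated (im, angle) batches:

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import value_and_grad as vgrad

def loss(w, im, angle):
    # toy loss; the (im, angle) names mirror the batch tuples adadelta unpacks
    return np.sum((np.dot(im, w) - angle) ** 2)

batches = [(npr.randn(8, 4), npr.randn(8)) for _ in range(10)]
w_fit = adadelta(np.zeros(4), loss, batches, epochs=20)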
Example #30
    def backward_pass(self, delta):
        if len(delta.shape) == 2:
            delta = delta[:, np.newaxis, :]
        n_samples, n_timesteps, input_shape = delta.shape
        p = self._params

        # Temporal gradient arrays
        grad = {k: np.zeros_like(p[k]) for k in p.keys()}

        dh_next = np.zeros((n_samples, input_shape))
        output = np.zeros((n_samples, n_timesteps, self.input_dim))

        # Backpropagation through time
        for i in reversed(range(n_timesteps)):
            dhi = self.activation_d(self.states[:, i, :]) * (delta[:, i, :] + dh_next)

            grad['W'] += np.dot(self.last_input[:, i, :].T, dhi)
            grad['b'] += delta[:, i, :].sum(axis=0)
            grad['U'] += np.dot(self.states[:, i - 1, :].T, dhi)

            dh_next = np.dot(dhi, p['U'].T)

            d = np.dot(delta[:, i, :], p['U'].T)
            output[:, i, :] = np.dot(d, p['W'].T)

        # Change actual gradient arrays
        for k in grad.keys():
            self._params.update_grad(k, grad[k])
        return output
Example #31
def to_unconstrained_arr(p):
    """ Numerically stable transform from positive reals to real line

    Implements ag_np.log(ag_np.exp(x) - 1.0)

    Autograd friendly and fully vectorized

    Args
    ----
    p : array of values in (0, +\infty)

    Returns
    -------
    ans : array of values in (-\infty, +\infty), same size as p
    """
    ## Handle numpy array case
    if not isinstance(p, float):
        mask1 = p > 10.0
        mask0 = ag_np.logical_not(mask1)
        out = ag_np.zeros_like(p)
        out[mask0] =  ag_np.log(ag_np.expm1(p[mask0]))
        out[mask1] = p[mask1] + ag_np.log1p(-ag_np.exp(-p[mask1]))
        return out
    ## Handle scalar float case
    else:
        if p > 10:
            return p + ag_np.log1p(-ag_np.exp(-p))
        else:
            return ag_np.log(ag_np.expm1(p))
Example #32
def to_common_arr(x):
    """ Numerically stable transform from real line to positive reals

    Returns ag_np.log(1.0 + ag_np.exp(x))

    Autograd friendly and fully vectorized

    Args
    ----
    x : array of values in (-\infty, +\infty)

    Returns
    -------
    ans : array of values in (0, +\infty), same size as x
    """
    if not isinstance(x, float):
        mask1 = x > 0
        mask0 = ag_np.logical_not(mask1)
        out = ag_np.zeros_like(x)
        out[mask0] = ag_np.log1p(ag_np.exp(x[mask0]))
        out[mask1] = x[mask1] + ag_np.log1p(ag_np.exp(-x[mask1]))
        return out
    if x > 0:
        return x + ag_np.log1p(ag_np.exp(-x))
    else:
        return ag_np.log1p(ag_np.exp(x))
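
Examples #31 and #32 are inverse transforms of each other (inverse softplus and softplus). A quick round-trip check on plain numpy inputs, with illustrative values; note the masked in-place writes above are fine for ordinary arrays but would not work on autograd-traced boxes.

import autograd.numpy as ag_np

p = ag_np.array([1e-4, 0.5, 3.0, 25.0, 700.0])
x = to_unconstrained_arr(p)                  # (0, inf) -> (-inf, inf)
p_back = to_common_arr(x)                    # map back
print(ag_np.allclose(p, p_back))             # True up to floating-point error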
Example #33
def lorenz96_ode(t, x, params):
    '''
    This is the ODE function in eq.(19) of the paper.
    This function will be sent as a parameter to the scipy.integrate.ode().
    This function is called in the simulate.py file.

    Input:
        t: time. This should always be the first parameter.
            This is required by the scipy.integrate.ode().
        x: d-dimensional state at time t.
        params: 1-dimensional parameter.

    Output:
        d-dimensional derivative dx/dt=[x0_dot, x1_dot,...].
    '''

    F = params[0]
    T = x.shape[0]
    xdot = np.zeros_like(x)
    xdot[0] = ((x[1] - x[T - 2]) * x[T - 1]) - x[0]
    xdot[1] = ((x[2] - x[T - 1]) * x[0]) - x[1]
    xdot[T - 1] = ((x[0] - x[T - 3]) * x[T - 2]) - x[T - 1]
    for i in range(2, T - 1):
        xdot[i] = ((x[i + 1] - x[i - 2]) * x[i - 1]) - x[i]
    xdot = xdot + F
    return xdot
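
A minimal integration sketch for Example #33 using scipy.integrate.solve_ivp (its args keyword needs scipy >= 1.4); the dimension, forcing value, and time span are illustrative.

import numpy as np
from scipy.integrate import solve_ivp

d = 40                                       # number of Lorenz-96 state variables
F = [8.0]                                    # forcing, read as params[0]
x0 = F[0] * np.ones(d)
x0[0] += 0.01                                # nudge off the uniform fixed point

sol = solve_ivp(lorenz96_ode, (0.0, 5.0), x0, args=(F,), max_step=0.01)
print(sol.y.shape)                           # (d, number of time points)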
Example #34
def adam(data, paramvec, loss, batch_size, rate,
         epochs=1, b1=0.9, b2=0.999, epsilon=1e-8, callback=None):
    m = np.zeros_like(paramvec)
    v = np.zeros_like(paramvec)
    vals = []
    i = 0

    for epoch in range(epochs):
        for minibatch in make_batches(batch_size, data):
            val, g = vgrad(loss)(paramvec, *minibatch)
            m = (1. - b1)*g    + b1*m
            v = (1. - b2)*g**2 + b2*v
            mhat = m / (1 - b1**(i+1))
            vhat = v / (1 - b2**(i+1))
            paramvec -= rate * mhat / (np.sqrt(vhat) + epsilon)
            vals.append(val)
            i += 1
        if callback: callback(epoch, paramvec, vals)
    return paramvec
Example #35
def monomial(x, y, x_test):
    n = len(x)

    A = np.vander(x, increasing=True)
    c = np.linalg.solve(A, y)

    y_test = np.zeros_like(x_test)
    for j in range(n-1, -1, -1):
        y_test = np.multiply(y_test, x_test) + c[j]

    return y_test
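
A minimal sketch of Example #35: fit the monomial-basis interpolant through five nodes of sin(pi x) and check the error on a finer grid; the node count and test function are illustrative.

import numpy as np

x = np.linspace(-1.0, 1.0, 5)
y = np.sin(np.pi * x)
x_test = np.linspace(-1.0, 1.0, 50)
y_test = monomial(x, y, x_test)
print(np.max(np.abs(y_test - np.sin(np.pi * x_test))))   # interpolation error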
Example #36
    def manual_grads(params):
      """
      Compute the gradient of the loss WRT the parameters
      Ordering of the operations is reverse of that in fprop()
      """
      deltas = {}
      for key, val in params.iteritems():
        deltas[key] = np.zeros_like(val)

      loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs, w_ws, adds, erases = self.stats
      dd = [{} for _ in range(self.heads)]
      for t in reversed(xrange(len(targets))):
        if t < len(inputs) - 1:
          for idx in range(self.heads):
            # grab gradient from the future
            dnext = dd[idx][t+1]
            # propagate the gradients to the first input of read().
            dread1 = np.dot(mems[t-1], dnext)
            # propagate the gradients to the second input of read().
            dread2 = np.dot(w_rs[idx][t], dnext.T)
            # TODO: propagate the gradients through write()
            pass

        ts = np.reshape(np.array(targets[t]),(self.out_size,1))
        # gradient of cross entropy loss function.
        dt = (ps[t] - ts) / (math.log(2) * ps[t] * (1 - ps[t]))

        # propagate the gradient backwards through the flow graph,
        # updating parameters as we go
        dt *= sigmoid_prime(ys[t])
        deltas['oy'] = np.dot(dt, os[t].T)
        deltas['by'] = dt

        if t < len(inputs) - 1:
          for idx in range(self.heads):
            # TODO: Update parameters oadds, oerases, ok_r, bbeta_r, og_r, os_r, ok_w ...
            # use dread1 and dread2 computed above as the starting gradients
            pass

        dt = np.dot(params['oy'].T, dt)
        dt *= tanh_prime(zos[t])
        deltas['ho'] = np.dot(dt, hs[t].T)
        deltas['bo'] = dt

        dt = np.dot(params['ho'].T, dt)
        dt *= tanh_prime(zhs[t])
        deltas['xh'] = np.dot(dt, xs[t].T)
        deltas['bh'] = dt

        for idx in range(self.heads):
            deltas['rh' + str(idx)] += np.dot(dt, rs[idx][t-1].reshape((self.M, 1)).T)
            # save the gradient for propagating backwards through time
            dd[idx][t] = np.dot(params['rh' + str(idx)].T, dt)
      return deltas
Example #37
def rmsprop(data, paramvec, loss, batch_size, rate,
            epochs=1, rho=0.9, epsilon=1e-6, callback=None):
    sumsq = np.zeros_like(paramvec)
    vals = []

    for epoch in range(epochs):
        for minibatch in make_batches(batch_size, data):
            val, g = vgrad(loss)(paramvec, *minibatch)
            sumsq = rho*sumsq + (1.-rho)*g**2
            paramvec = paramvec - rate * g / np.sqrt(sumsq + epsilon)
            vals.append(val)
        if callback: callback(epoch, paramvec, vals)
    return paramvec
Example #38
    def compute_rotated_map(self, rotation):
        """
        Compute stellar maps projected on the plane of the sky for a given rotation of the star
        Args:
            rotation (float) : rotation around the star in degrees given as [longitude, latitude] in degrees
        
        Returns:
            pixel_unique (int) : vector with the "active" healpix pixels
            pixel_map (int) : map showing the healpix pixel projected on the plane of the sky
            mu_pixel (float): map of the astrocentric angle for each pixel on the plane of the sky (zero for pixels not in the star)
            T_pixel (float): map of temperatures for each pixel on the plane of the sky
        """
        mu_pixel = np.zeros_like(self.mu_angle)
        T_pixel = np.zeros_like(self.mu_angle)

        # Get the projection of the healpix pixel indices on the plane of the sky
        pixel_map = self.projector.projmap(self.indices, self.f_vec2pix, rot=rotation)[:,0:int(self.npix/2)]

        # Get the unique elements in the vector
        pixel_unique = np.unique(pixel_map)

        # Now loop over all unique pixels, filling up the array of the projected map with the mu and temperature values
        for j in range(len(pixel_unique)):
            ind = np.where(pixel_map == pixel_unique[j])            

            if (np.all(np.isfinite(self.mu_angle[ind[0],ind[1]]))):
                if (self.mu_angle[ind[0],ind[1]].size == 0):
                    value = 0.0
                else:                    
                    value = np.nanmean(self.mu_angle[ind[0],ind[1]])
                    mu_pixel[ind[0],ind[1]] = value

                    T_pixel[ind[0],ind[1]] = self.temperature_map[int(pixel_unique[j])]
            else:
                mu_pixel[ind[0],ind[1]] = 0.0
                T_pixel[ind[0],ind[1]] = 0.0

        return pixel_unique, pixel_map, mu_pixel, T_pixel
Example #39
    def backward_pass(self, delta):
        if len(delta.shape) == 2:
            delta = delta[:, np.newaxis, :]

        n_samples, n_timesteps, input_shape = delta.shape

        # Temporal gradient arrays
        grad = {k: np.zeros_like(self._params[k]) for k in self._params.keys()}

        dh_next = np.zeros((n_samples, input_shape))
        output = np.zeros((n_samples, n_timesteps, self.input_dim))

        # Backpropagation through time
        for i in reversed(range(n_timesteps)):
            dhi = delta[:, i, :] * self.gates['o'][:, i, :] * self.activation_d(self.states[:, i, :]) + dh_next

            og = delta[:, i, :] * self.activation(self.states[:, i, :])
            de_o = og * self.sigmoid_d(self.gates['o'][:, i, :])

            grad['W_o'] += np.dot(self.last_input[:, i, :].T, de_o)
            grad['U_o'] += np.dot(self.outputs[:, i - 1, :].T, de_o)
            grad['b_o'] += de_o.sum(axis=0)

            de_f = (dhi * self.states[:, i - 1, :]) * self.sigmoid_d(self.gates['f'][:, i, :])
            grad['W_f'] += np.dot(self.last_input[:, i, :].T, de_f)
            grad['U_f'] += np.dot(self.outputs[:, i - 1, :].T, de_f)
            grad['b_f'] += de_f.sum(axis=0)

            de_i = (dhi * self.gates['c'][:, i, :]) * self.sigmoid_d(self.gates['i'][:, i, :])
            grad['W_i'] += np.dot(self.last_input[:, i, :].T, de_i)
            grad['U_i'] += np.dot(self.outputs[:, i - 1, :].T, de_i)
            grad['b_i'] += de_i.sum(axis=0)

            de_c = (dhi * self.gates['i'][:, i, :]) * self.activation_d(self.gates['c'][:, i, :])
            grad['W_c'] += np.dot(self.last_input[:, i, :].T, de_c)
            grad['U_c'] += np.dot(self.outputs[:, i - 1, :].T, de_c)
            grad['b_c'] += de_c.sum(axis=0)

            dh_next = dhi * self.gates['f'][:, i, :]

        # TODO: propagate error to the next layer

        # Change actual gradient arrays
        for k in grad.keys():
            self._params.update_grad(k, grad[k])
        return output
Example #40
def getDiffs(model, deltas, inputs, targets, epsilon):
  """
  For every (weight,delta) combo in zip(weights, deltas):
    Add epsilon to that weight and compute the loss (first_loss)
    Remove epsilon from that weight and compute the loss (second_loss)
    Check how close (first_loss - second_loss) / (2 * epsilon) is to the delta from bprop
  """

  diff_tensors = []
  for D in deltas:
    diff_tensors.append(np.zeros_like(D))

  for W,D,diffs in zip(model.weights, deltas, diff_tensors):
  # for each weight tensor in our model

    for i in range(W.shape[0]):
      for j in range(W.shape[1]):
        # for each weight in that tensor

        # compute f(x+h) for that weight
        W[i,j] += epsilon
        loss, _, _, _, _, _, _  = model.lossFun(inputs, targets, False)
        loss_plus = np.sum(loss)

        # compute f(x - h) for that weight
        W[i,j] -= epsilon*2
        loss, _, _, _, _, _, _ = model.lossFun(inputs, targets, False)
        loss_minus = np.sum(loss)

        # grad check must leave weights unchanged
        # so reset the weight that we changed
        W[i,j] += epsilon

        # compute the numerical grad w.r.t. this param
        grad = (loss_plus - loss_minus) / (2 * epsilon)
        diffs[i,j] = grad - D[i,j]

  return diff_tensors
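
The central-difference check in Example #40 can be exercised standalone. A sketch on a toy loss with a known analytic gradient; the toy loss and every name below are illustrative, not the model API used above.

import numpy as np

def toy_loss(W, x):
    return np.sum(np.tanh(W @ x) ** 2)

def analytic_grad(W, x):
    h = np.tanh(W @ x)
    return np.outer(2.0 * h * (1.0 - h ** 2), x)

W, x, eps = np.random.randn(3, 4), np.random.randn(4), 1e-5
num_grad = np.zeros_like(W)
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        W[i, j] += eps
        loss_plus = toy_loss(W, x)
        W[i, j] -= 2 * eps
        loss_minus = toy_loss(W, x)
        W[i, j] += eps                       # restore the weight
        num_grad[i, j] = (loss_plus - loss_minus) / (2 * eps)

print(np.max(np.abs(num_grad - analytic_grad(W, x))))    # should be ~1e-9 or smaller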
Example #41
    def manual_grads(params):
      """
      Compute the gradient of the loss WRT the parameters
      Ordering of the operations is reverse of that in fprop()
      """
      deltas = {}
      for key, val in params.iteritems():
        deltas[key] = np.zeros_like(val)

      [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs,
       w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws,
       zbeta_rs, zbeta_ws, zs_rs, zs_ws, wg_rs, wg_ws] = self.stats
      dd = {}
      drs = {}
      dzh = {}
      dmem = {} # might not need this, since we have dmemtilde
      dmemtilde = {}
      du_r = {}
      du_w = {}
      dwg_r = {}
      dwg_w = {}
      for t in reversed(xrange(len(targets))):

        dy = np.copy(ps[t])
        dy -= targets[t].T # backprop into y

        deltas['oy'] += np.dot(dy, os[t].T)
        deltas['by'] += dy

        if t < len(targets) - 1:
          # r[t] affects cost through zh[t+1] via Wrh
          drs[t] = np.dot(self.W['rh'].T, dzh[t + 1])

          # right now, mems[t] influences cost through rs[t+1], via w_rs[t+1]
          dmem[t] = np.dot( w_rs[t + 1], drs[t + 1].reshape((self.M,1)).T )
          # and also through mems at next step
          W = np.reshape(w_ws[t+1], (w_ws[t+1].shape[0], 1))
          E = np.reshape(erases[t+1], (erases[t+1].shape[0], 1))
          WTE = np.dot(W, E.T)
          KEEP = np.ones(mems[0].shape) - WTE
          dmem[t] += np.multiply(dmemtilde[t+1], KEEP)
          # and also through its influence on the content weighting next step
          dmem[t] += du_r[t+1] + du_w[t+1]

          dmemtilde[t] = dmem[t]

          # erases[t] affects cost through mems[t], via w_ws[t]
          derase = np.dot(np.multiply(dmemtilde[t], -mems[t-1]).T, w_ws[t])

          # zerase affects just erases through a sigmoid
          dzerase = derase * (erases[t] * (1 - erases[t]))

          # adds[t] affects costs through mems[t], via w_ws
          dadd = np.dot(dmem[t].T, w_ws[t])

          # zadds affects just adds through a tanh
          dzadd = dadd * (1 - adds[t] * adds[t])

          # dbadds is just dzadds
          deltas['badds'] += dzadd

          deltas['oadds'] += np.dot(dzadd, os[t].T)

          deltas['berases'] += dzerase

          deltas['oerases'] += np.dot(dzerase, os[t].T)

          # # read weights affect what is read, via what's in mems[t-1]
          # dwc_r = np.dot(mems[t-1], drs[t])

          # # write weights affect mem[t] through adding
          # dwc_w = np.dot(dmem[t], adds[t])
          # # they also affect memtilde[t] through erasing
          # dwc_w += np.dot(np.multiply(dmemtilde[t], -mems[t-1]), erases[t])

          dw_r = np.dot(mems[t-1], drs[t])
          dw_r += dwg_r[t+1] * (1 - g_rs[t+1])

          # write weights affect mem[t] through adding
          dw_w = np.dot(dmem[t], adds[t])
          # they also affect memtilde[t] through erasing
          dw_w += np.dot(np.multiply(dmemtilde[t], -mems[t-1]), erases[t])
          dw_w += dwg_w[t+1] * (1 - g_ws[t+1])

          sgwr = np.zeros((self.N, self.N))
          sgww = np.zeros((self.N, self.N))
          for i in range(self.N):
            sgwr[i,i] = softmax(zs_rs[t])[0]
            sgwr[i,(i+1) % self.N] = softmax(zs_rs[t])[2]
            sgwr[i,(i-1) % self.N] = softmax(zs_rs[t])[1]

            sgww[i,i] = softmax(zs_ws[t])[0]
            sgww[i,(i+1) % self.N] = softmax(zs_ws[t])[2]
            sgww[i,(i-1) % self.N] = softmax(zs_ws[t])[1]

          # right now, shifted weights are final weight
          dws_r = dw_r
          dws_w = dw_w

          dwg_r[t] = np.dot(sgwr.T, dws_r)
          dwg_w[t] = np.dot(sgww.T, dws_w)

          dwc_r = dwg_r[t] * g_rs[t]
          dwc_w = dwg_w[t] * g_ws[t]


          """
          We need dw/dK
          now w has N elts and K has N elts
          and we want, for every elt of W, the grad of that elt w.r.t. each
          of the N elts of K. that gives us N * N things
          """
          # first, we must build up the K values (should be taken from fprop)
          K_rs = []
          K_ws = []
          for i in range(self.N):
            K_rs.append(cosine_sim(mems[t-1][i, :], k_rs[t]))
            K_ws.append(cosine_sim(mems[t-1][i, :], k_ws[t]))

          # then, we populate the grads
          dwdK_r = np.zeros((self.N, self.N))
          dwdK_w = np.zeros((self.N, self.N))
          # for every row in the memory
          for i in range(self.N):
            # for every element in the weighting
            for j in range(self.N):
              dwdK_r[i,j] += softmax_grads(K_rs, softplus(zbeta_rs[t]), i, j)
              dwdK_w[i,j] += softmax_grads(K_ws, softplus(zbeta_ws[t]), i, j)

          # compute dK for all i in N
          # K is the evaluated cosine similarity for the i-th row of mem matrix
          dK_r = np.zeros_like(w_rs[0])
          dK_w = np.zeros_like(w_ws[0])

          # for all i in N (for every row that we've simmed)
          for i in range(self.N):
            # for every j in N (for every elt of the weighting)
            for j in range(self.N):
              # specifically, dwdK_r will change, and for write as well
              dK_r[i] += dwc_r[j] * dwdK_r[i,j] 
              dK_w[i] += dwc_w[j] * dwdK_w[i,j]

          """
          dK_r_dk_rs is a list of N things
          each elt of the list corresponds to grads of K_idx
          w.r.t. the key k_t
          so it should be a length N list of M by 1 vectors
          """

          dK_r_dk_rs = []
          dK_r_dmem = []
          for i in range(self.N):
            # let k_rs be u, Mem[i] be v
            u = np.reshape(k_rs[t], (self.M,))
            v = mems[t-1][i, :]
            dK_r_dk_rs.append( dKdu(u,v) )
            dK_r_dmem.append( dKdu(v,u))

          dK_w_dk_ws = []
          dK_w_dmem = []
          for i in range(self.N):
            # let k_ws be u, Mem[i] be v
            u = np.reshape(k_ws[t], (self.M,))
            v = mems[t-1][i, :]
            dK_w_dk_ws.append( dKdu(u,v) )
            dK_w_dmem.append( dKdu(v,u))

          # compute delta for keys
          dk_r = np.zeros_like(k_rs[0])
          dk_w = np.zeros_like(k_ws[0])
          # for every one of M elt of dk_r
          for i in range(self.M):
            # for every one of the N Ks
            for j in range(self.N):
              # add delta K_r[j] * dK_r[j] / dk_r[i]
              # add influence on through K_r[j]
              dk_r[i] += dK_r[j] * dK_r_dk_rs[j][i]
              dk_w[i] += dK_w[j] * dK_w_dk_ws[j][i]

          # these represent influence of mem on next K
          """
          Let's let du_r[t] represent the
          influence of mems[t-1] on the cost through the K values
          this is analogous to dk_w, but, k only every affects that
          whereas mems[t-1] will also affect what is read at time t+1
          and through memtilde at time t+1
          """
          du_r[t] = np.zeros_like(mems[0])
          du_w[t] = np.zeros_like(mems[0])
          # for every row in mems[t-1]
          for i in range(self.N):
            # for every elt of this row (one of M)
            for j in range(self.M):
              du_r[t][i,j] = dK_r[i] * dK_r_dmem[i][j]
              du_w[t][i,j] = dK_w[i] * dK_w_dmem[i][j]

          # key values are activated as tanh
          dzk_r = dk_r * (1 - k_rs[t] * k_rs[t])
          dzk_w = dk_w * (1 - k_ws[t] * k_ws[t])

          deltas['ok_r'] += np.dot(dzk_r, os[t].T)
          deltas['ok_w'] += np.dot(dzk_w, os[t].T)

          deltas['bk_r'] += dzk_r
          deltas['bk_w'] += dzk_w

          dg_r = np.dot(dwg_r[t].T, (wc_rs[t] - w_rs[t-1]) )
          dg_w = np.dot(dwg_w[t].T, (wc_ws[t] - w_ws[t-1]) )

          # compute dzg_r, dzg_w
          dzg_r = dg_r * (g_rs[t] * (1 - g_rs[t]))
          dzg_w = dg_w * (g_ws[t] * (1 - g_ws[t]))

          deltas['og_r'] += np.dot(dzg_r, os[t].T)
          deltas['og_w'] += np.dot(dzg_w, os[t].T)

          deltas['bg_r'] += dzg_r
          deltas['bg_w'] += dzg_w

          # compute dbeta, which affects w_content through interaction with Ks

          dwcdbeta_r = np.zeros_like(w_rs[0])
          dwcdbeta_w = np.zeros_like(w_ws[0])
          for i in range(self.N):
            dwcdbeta_r[i] = beta_grads(K_rs, softplus(zbeta_rs[t]), i)
            dwcdbeta_w[i] = beta_grads(K_ws, softplus(zbeta_ws[t]), i)

          dbeta_r = np.zeros_like(zbeta_rs[0])
          dbeta_w = np.zeros_like(zbeta_ws[0])
          for i in range(self.N):
            dbeta_r[0] += dwc_r[i] * dwcdbeta_r[i]
            dbeta_w[0] += dwc_w[i] * dwcdbeta_w[i]

          # beta is activated from zbeta by softplus, grad of which is sigmoid
          dzbeta_r = dbeta_r * sigmoid(zbeta_rs[t])
          dzbeta_w = dbeta_w * sigmoid(zbeta_ws[t])

          deltas['obeta_r'] += np.dot(dzbeta_r, os[t].T)
          deltas['obeta_w'] += np.dot(dzbeta_w, os[t].T)

          deltas['bbeta_r'] += dzbeta_r
          deltas['bbeta_w'] += dzbeta_w

          sgsr = np.zeros((self.N, 3))
          sgsw = np.zeros((self.N, 3))
          for i in range(self.N):
            sgsr[i,1] = wg_rs[t][(i - 1) % self.N]
            sgsr[i,0] = wg_rs[t][i]
            sgsr[i,2] = wg_rs[t][(i + 1) % self.N]

            sgsw[i,1] = wg_ws[t][(i - 1) % self.N]
            sgsw[i,0] = wg_ws[t][i]
            sgsw[i,2] = wg_ws[t][(i + 1) % self.N]

          ds_r = np.dot(sgsr.T, dws_r)
          ds_w = np.dot(sgsw.T, dws_w)

          shift_act_jac_r = np.zeros((3,3))
          shift_act_jac_w = np.zeros((3,3))
          bf = np.array([[1.0]])
          for i in range(3):
            for j in range(3):
              shift_act_jac_r[i,j] = softmax_grads(zs_rs[t], bf, i, j)
              shift_act_jac_w[i,j] = softmax_grads(zs_ws[t], bf, i, j)

          dzs_r = np.dot(shift_act_jac_r.T, ds_r)
          dzs_w = np.dot(shift_act_jac_w.T, ds_w)

          deltas['os_r'] += np.dot(dzs_r, os[t].T)
          deltas['os_w'] += np.dot(dzs_w, os[t].T)

          deltas['bs_r'] += dzs_r
          deltas['bs_w'] += dzs_w

        else:
          drs[t] = np.zeros_like(rs[0])
          dmemtilde[t] = np.zeros_like(mems[0])
          du_r[t] = np.zeros_like(mems[0])
          du_w[t] = np.zeros_like(mems[0])
          dwg_r[t] = np.zeros_like(w_rs[0])
          dwg_w[t] = np.zeros_like(w_ws[0])

        # o affects y through Woy
        do = np.dot(params['oy'].T, dy)
        if t < len(targets) - 1:
          # and also zadd through Woadds
          do += np.dot(params['oadds'].T, dzadd)
          do += np.dot(params['oerases'].T, dzerase)
          # and also through the keys
          do += np.dot(params['ok_r'].T, dzk_r)
          do += np.dot(params['ok_w'].T, dzk_w)
          # and also through the interpolators
          do += np.dot(params['og_r'].T, dzg_r)
          do += np.dot(params['og_w'].T, dzg_w)
          # and also through beta
          do += np.dot(params['obeta_r'].T, dzbeta_r)
          do += np.dot(params['obeta_w'].T, dzbeta_w)
          # and also through the shift values
          do += np.dot(params['os_r'].T, dzs_r)
          do += np.dot(params['os_w'].T, dzs_w)


        # compute deriv w.r.t. pre-activation of o
        dzo = do * (1 - os[t] * os[t])

        deltas['ho'] += np.dot(dzo, hs[t].T)
        deltas['bo'] += dzo

        # compute hidden dh
        dh = np.dot(params['ho'].T, dzo)

        # compute deriv w.r.t. pre-activation of h
        dzh[t] = dh * (1 - hs[t] * hs[t])

        deltas['xh'] += np.dot(dzh[t], xs[t].T)
        deltas['bh'] += dzh[t]

        # Wrh affects zh via rs[t-1]
        deltas['rh'] += np.dot(dzh[t], rs[t-1].reshape((self.M, 1)).T)

      return deltas
Example #42
    def precompute_rotation_maps(self, rotations=None):
        """
        Compute the averaged spectrum on the star for a given temperature map and for a given rotation
        Args:
            rotations (float) : [N_phases x 2] giving [longitude, latitude] in degrees for each phase
        
        Returns:
            None
        """
        if (rotations is None):
            print("Use some angles for the rotations")
            return

        self.n_phases = rotations.shape[0]

        self.avg_mu = [None] * self.n_phases
        self.avg_v = [None] * self.n_phases
        self.velocity = [None] * self.n_phases
        self.n_pixel_unique = [None] * self.n_phases
        self.n_pixels = [None] * self.n_phases
        self.pixel_unique = [None] * self.n_phases

        for loop in range(self.n_phases):
            mu_pixel = np.zeros_like(self.mu_angle)
            v_pixel = np.zeros_like(self.vel_projection)
        
            pixel_map = self.projector.projmap(self.indices, self.f_vec2pix, rot=rotations[loop,:])[:,0:int(self.npix/2)]
            pixel_unique = np.unique(pixel_map[np.isfinite(pixel_map)])

            for j in range(len(pixel_unique)):
                ind = np.where(pixel_map == pixel_unique[j])

                if (np.all(np.isfinite(self.mu_angle[ind[0],ind[1]]))):
                    if (self.mu_angle[ind[0],ind[1]].size == 0):
                        mu_pixel[ind[0],ind[1]] = 0.0
                        v_pixel[ind[0],ind[1]] = 0.0
                    else:                    
                        
                        if (self.clv):
                            value = np.nanmean(self.mu_angle[ind[0],ind[1]])
                        else:
                            value = 1.0

                        mu_pixel[ind[0],ind[1]] = value

                        value = np.nanmean(self.vel_projection[ind[0],ind[1]])
                        v_pixel[ind[0],ind[1]] = value
                else:
                    mu_pixel[ind[0],ind[1]] = 0.0
                    v_pixel[ind[0],ind[1]] = 0.0

            self.n_pixel_unique[loop] = len(pixel_unique)
            self.avg_mu[loop] = np.zeros(self.n_pixel_unique[loop])
            self.avg_v[loop] = np.zeros(self.n_pixel_unique[loop])
            self.velocity[loop] = np.zeros(self.n_pixel_unique[loop])
            self.n_pixels[loop] = np.zeros(self.n_pixel_unique[loop], dtype='int')
            self.pixel_unique[loop] = pixel_unique.astype('int')

            for i in range(len(pixel_unique)):
                ind = np.where(pixel_map == pixel_unique[i])
                self.n_pixels[loop][i] = len(ind[0])
                self.avg_mu[loop][i] = np.unique(mu_pixel[ind[0], ind[1]])
                self.avg_v[loop][i] = np.unique(v_pixel[ind[0], ind[1]])            
                self.velocity[loop][i] = self.avg_mu[loop][i] * self.avg_v[loop][i]
Example #43
def cosine_sim(u, v):
    # body reconstructed from the manual-gradient computation below:
    # u.v over the product of norms, with the same 1e-5 stabilizer
    num = np.sum(u * v)
    anorm = np.sqrt(np.sum(u * u))
    bnorm = np.sqrt(np.sum(v * v))
    den2 = (anorm * bnorm) + 1e-5
    return num / den2

if __name__ == "__main__":
    M = 5
    u = np.random.uniform(high=1, low=-1, size=(M,))
    v = np.random.uniform(high=1, low=-1, size=(M,))
    print(cosine_sim(u,v))

    # compute deltas automatically
    # just with respect to u
    cs_grad = grad(cosine_sim, argnum=0)
    auto_deltas = cs_grad(u,v)

    # compute deltas manually
    manual_deltas = np.zeros_like(auto_deltas)

    # compute the denominator
    anorm = np.sqrt(np.sum(u*u))
    bnorm = np.sqrt(np.sum(v*v))
    den2 = (anorm * bnorm) + 1e-5

    a = v / den2
    b = u / np.sum(np.square(u))
    c = cosine_sim(u,v)
    manual_deltas = a - b*c
    

    print "auto deltas"
    print auto_deltas
    print "manual deltas"
Example #44
 def init_grad(self):
     for key in self._params.keys():
         if key not in self._grads:
             self._grads[key] = np.zeros_like(self._params[key])