Example #1: step_vae, a stochastic gradient step for a variational auto-encoder. It samples noise epsilon with model.gen_eps, obtains lower-bound gradients from model.dL_dw, adds a minibatch-scaled prior gradient, and applies AdaGrad-style updates to the variational parameters v and the generative parameters w after a warm-up period.
def step_vae(model,
             x,
             v,
             w,
             n_batch=100,
             stepsize=1e-1,
             warmup=100,
             anneal=True,
             convertImgs=False,
             binarize=False):
    print 'Variational Auto-Encoder', n_batch, stepsize, warmup

    # We're using adagrad stepsizes
    gv_ss = ndict.cloneZeros(v)
    gw_ss = ndict.cloneZeros(w)

    nsteps = [0]

    def doStep(v, w):

        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
        if convertImgs: x_minibatch['x'] = x_minibatch['x'] / 256.
        if binarize:
            x_minibatch['x'] = np.random.binomial(n=1, p=x_minibatch['x'])

        # Sample epsilon from prior
        z = model.gen_eps(n_batch)
        #for i in z: z[i] *= 0

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(v, w, x_minibatch, z)
        _, _, gv_prior, gw_prior = model.dlogpw_dw(v, w)
        gv = {i: gv[i] + float(n_batch) / n_tot * gv_prior[i] for i in gv}
        gw = {i: gw[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw}

        # Update parameters
        adagrad_reg = 1e-8
        c = 1.0
        if not anneal: c /= nsteps[0] + 1
        for i in gv:
            gv_ss[i] += gv[i]**2
            if nsteps[0] > warmup:
                v[i] += stepsize / np.sqrt(gv_ss[i] * c + adagrad_reg) * gv[i]

        for i in gw:
            gw_ss[i] += gw[i]**2
            if nsteps[0] > warmup:
                w[i] += stepsize / np.sqrt(gw_ss[i] * c + adagrad_reg) * gw[i]

        nsteps[0] += 1

        return z.copy(), logpx + logpz - logqz

    return doStep
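
The gradient here comes from model.dL_dw with noise epsilon drawn by model.gen_eps, presumably a reparameterized estimate of the lower bound (the model internals are not shown). As a minimal, self-contained illustration of that idea, the sketch below (illustrative names and toy objective, not part of the original code) estimates the gradient of E_{z ~ N(mu, sd^2)}[f(z)] by writing z = mu + sd*eps and differentiating through the sample:

import numpy as np

def reparam_grad_sketch(mu, logsd, n_samples=100000, seed=0):
    # Toy integrand f(z) = -z**2, so E[f] = -(mu**2 + sd**2) and the exact
    # gradients are dE/dmu = -2*mu and dE/dlogsd = -2*sd**2.
    rng = np.random.default_rng(seed)
    sd = np.exp(logsd)
    eps = rng.standard_normal(n_samples)
    z = mu + sd * eps                    # reparameterized sample
    df_dz = -2.0 * z                     # gradient of f at the sample
    g_mu = df_dz.mean()                  # dz/dmu = 1
    g_logsd = (df_dz * eps * sd).mean()  # dz/dlogsd = sd * eps
    return g_mu, g_logsd

print(reparam_grad_sketch(mu=0.5, logsd=0.0))  # roughly (-1.0, -2.0)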
Example #2: step_wakesleep, one wake-sleep step. The wake phase samples z ~ q(z|x) from model_q and updates the generative model p; the sleep phase samples (x, z) from model_p and updates the recognition model q on them. Both phases use AdaGrad step sizes.
def step_wakesleep(model_q,
                   model_p,
                   x,
                   w_q,
                   n_batch=100,
                   ada_stepsize=1e-1,
                   warmup=100,
                   reg=1e-8,
                   convertImgs=False):
    print 'Wake-Sleep', ada_stepsize

    # We're using adagrad stepsizes
    gw_q_ss = ndict.cloneZeros(w_q)
    gw_p_ss = ndict.cloneZeros(model_p.init_w())

    nsteps = [0]

    do_adagrad = True

    def doStep(w_p):

        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}

        def optimize(w, gw, gw_ss, stepsize):
            if do_adagrad:
                for i in gw:
                    gw_ss[i] += gw[i]**2
                    if nsteps[0] > warmup:
                        w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * gw[i]
                    #print (stepsize / np.sqrt(gw_ss[i]+reg)).mean()
            else:
                for i in gw:
                    w[i] += 1e-4 * gw[i]

        # Wake phase: use z ~ q(z|x) to update model_p
        _, z, _ = model_q.gen_xz(w_q, x_minibatch, {}, n_batch)
        _, logpz_q = model_q.logpxz(w_q, x_minibatch, z)
        logpx_p, logpz_p, gw_p, gz_p = model_p.dlogpxz_dwz(w_p, x_minibatch, z)
        _, gw_prior = model_p.dlogpw_dw(w_p)
        gw = {i: gw_p[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_p}
        optimize(w_p, gw, gw_p_ss, ada_stepsize)

        # Sleep phase: use x ~ p(x|z) to update model_q
        x_p, z_p, _ = model_p.gen_xz(w_p, {}, {}, n_batch)
        _, _, gw_q, _ = model_q.dlogpxz_dwz(w_q, x_p, z_p)
        _, gw_prior = model_q.dlogpw_dw(w_q)
        gw = {i: gw_q[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_q}
        optimize(w_q, gw, gw_q_ss, ada_stepsize)

        nsteps[0] += 1

        return z.copy(), logpx_p + logpz_p - logpz_q

    return doStep
Example #3: the inner doStep of a score-function (likelihood-ratio) estimator with control variates; the enclosing step function is not shown. It accumulates generative-model gradients over n_batch sub-minibatches of size n_subbatch, corrects the variational gradient with running covariance and variance estimates (cv_cov, cv_var, cv_lr), and applies AdaGrad updates. It closes over model_q, model_p, x, v, the accumulators gw_ss and gv_ss, nsteps, warmup, ada_stepsize, and convertImgs from the outer function.
    def doStep(w):

        grad = ndict.cloneZeros(v)
        gw = ndict.cloneZeros(w)

        for l in range(n_batch):
            n_tot = x.itervalues().next().shape[1]
            idx_minibatch = np.random.randint(0, n_tot, n_subbatch)
            x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
            if convertImgs:
                x_minibatch = {i: x_minibatch[i] / 256. for i in x_minibatch}

            # Use z ~ q(z|x) to compute d[LB]/d[gw]
            _, z, _ = model_q.gen_xz(v, x_minibatch, {}, n_subbatch)
            _, logpz_q = model_q.logpxz(v, x_minibatch, z)
            logpx_p, logpz_p, _gw, gz_p = model_p.dlogpxz_dwz(
                w, x_minibatch, z)
            for i in _gw:
                gw[i] += _gw[i]

            # Compute d[LB]/d[gv]  where gv = v (variational params)
            _, _, gv, _ = model_q.dlogpxz_dwz(v, x_minibatch, z)
            weight = np.sum(logpx_p) + np.sum(logpz_p) - np.sum(logpz_q)

            for i in v:
                f = gv[i] * weight
                h = gv[i]
                cv_cov[i] = cv_cov[i] + cv_lr * (f * h - cv_cov[i])
                cv_var[i] = cv_var[i] + cv_lr * (h**2 - cv_var[i])
                grad[i] += f - (cv_cov[i] / (cv_var[i] + 1e-8)) * h

        _, gwprior = model_p.dlogpw_dw(w)
        for i in gw:
            gw[i] += float(n_subbatch * n_batch) / n_tot * gwprior[i]

        def optimize(_w, _gw, gw_ss, stepsize):
            reg = 1e-8
            for i in _gw:
                gw_ss[i] += _gw[i]**2
                if nsteps[0] > warmup:
                    _w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * _gw[i]

        optimize(w, gw, gw_ss, ada_stepsize)
        optimize(v, grad, gv_ss, ada_stepsize)

        nsteps[0] += 1

        if ndict.hasNaN(grad):
            raise Exception()
        if ndict.hasNaN(v):
            raise Exception()

        return z.copy(), logpx_p + logpz_p - logpz_q
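
The cv_cov/cv_var updates above implement a running control-variate correction: with f = gv*weight and h = gv (whose expectation under q is zero), the corrected estimate f - (Cov[f, h] / Var[h]) * h has the same mean but lower variance. A small self-contained sketch of the same running-estimate rule on a scalar example (illustrative, not part of the original code; here the correction uses the estimates from previous samples):

import numpy as np

rng = np.random.default_rng(0)
cv_lr = 0.1                       # learning rate for the running estimates
cov_est, var_est = 0.0, 0.0
plain, corrected = [], []

for _ in range(20000):
    h = rng.standard_normal()     # score-like quantity with E[h] = 0
    f = h * (3.0 + h)             # noisy estimate whose mean we want (E[f] = 1)
    corrected.append(f - cov_est / (var_est + 1e-8) * h)
    # Running estimates of E[f*h] and E[h*h], as in the fragment above
    cov_est += cv_lr * (f * h - cov_est)
    var_est += cv_lr * (h * h - var_est)
    plain.append(f)

print(np.mean(plain), np.var(plain))          # mean ~1, variance ~11
print(np.mean(corrected), np.var(corrected))  # mean ~1, variance several times smaller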
Example #4: step_naivesvb ('Naive SV Est'), a score-function estimator without control variates. The recognition-model gradient gv is weighted by the minibatch lower-bound estimate, with the - float(n_batch) term acting as a constant baseline; both parameter sets are updated with AdaGrad.
def step_naivesvb(model_q, model_p, x, v, n_batch=100, ada_stepsize=1e-1, warmup=100, reg=1e-8, convertImgs=False):
    print 'Naive SV Est', ada_stepsize
    
    # We're using adagrad stepsizes
    gv_ss = ndict.cloneZeros(v)
    gw_ss = ndict.cloneZeros(model_p.init_w())
    
    nsteps = [0]
    
    do_adagrad = True
    
    def doStep(w):
        
        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i:x[i][:,idx_minibatch] for i in x}
        if convertImgs: x_minibatch = {i:x_minibatch[i]/256. for i in x_minibatch}
        
        def optimize(w, gw, gw_ss, stepsize):
            if do_adagrad:
                for i in gw:
                    gw_ss[i] += gw[i]**2
                    if nsteps[0] > warmup:
                        w[i] += stepsize / np.sqrt(gw_ss[i]+reg) * gw[i]
                    #print (stepsize / np.sqrt(gw_ss[i]+reg)).mean()
            else:
                for i in gw:
                    w[i] += 1e-4 * gw[i]
        
        # Phase 1: use z ~ q(z|x) to update model_p
        _, z, _  = model_q.gen_xz(v, x_minibatch, {}, n_batch)
        _, logpz_q = model_q.logpxz(v, x_minibatch, z)
        logpx_p, logpz_p, gw, _ = model_p.dlogpxz_dwz(w, x_minibatch, z)
        _, gw_prior = model_p.dlogpw_dw(w)
        gw = {i: gw[i] + float(n_batch)/n_tot * gw_prior[i] for i in gw}
        
        # Phase 2: use x ~ p(x|z) to update model_q
        _, _, gv, _ = model_q.dlogpxz_dwz(v, x_minibatch, z)
        #_, gw_prior = model_q.dlogpw_dw(w_q)
        #gw_q = {i: gw_q[i] + float(n_batch)/n_tot * gw_prior[i] for i in gw_q}
        weight = np.sum(logpx_p) + np.sum(logpz_p) - np.sum(logpz_q) - float(n_batch)
        gv = {i: gv[i] * weight for i in gv}
        
        optimize(w, gw, gw_ss, ada_stepsize)
        optimize(v, gv, gv_ss, ada_stepsize)
        
        nsteps[0] += 1
        
        return z.copy(), logpx_p + logpz_p - logpz_q
        
    return doStep
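
The weight applied to gv here is a minibatch estimate of the lower bound, so the update is a score-function (likelihood-ratio) gradient estimator. A minimal standalone sketch of the underlying identity, d/dmu E_q[f(z)] = E_q[f(z) * d log q(z)/dmu], for a 1-D Gaussian q (illustrative, not part of the original code):

import numpy as np

def score_function_grad(mu, sd, n_samples=200000, seed=0):
    rng = np.random.default_rng(seed)
    z = rng.normal(mu, sd, size=n_samples)
    f = -(z - 2.0) ** 2              # toy objective; its expectation peaks at mu = 2
    score_mu = (z - mu) / sd ** 2    # d log q(z) / d mu for a Gaussian
    baseline = f.mean()              # simple baseline to reduce variance
    return ((f - baseline) * score_mu).mean()

# Exact gradient of E_q[f] w.r.t. mu is -2*(mu - 2), i.e. 2.0 at mu = 1
print(score_function_grad(mu=1.0, sd=1.0))  # roughly 2.0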
Example #5: the inner doStep of a multi-objective optimizer; the enclosing step function is not shown. Each sub-objective's gradient is normalized by a running second-moment estimate, weighted by its cosine similarity with the similarly normalized held-out gradient, and the weighted, normalized gradients are summed and applied with a fixed step size after warm-up. It closes over funcs, func_holdout, m1, m2, m1_holdout, m2_holdout, lambd, stepsize, warmup, and batchi from the outer function.
 def doStep(w):
     
     f_holdout, gw_holdout = func_holdout(w)
     gw_holdout_norm = 0
     gw_holdout_effective = ndict.clone(gw_holdout)
     for i in w:
         m1_holdout[i] += lambd * (gw_holdout[i] - m1_holdout[i])
         m2_holdout[i] += lambd * (gw_holdout[i]**2 - m2_holdout[i])
         gw_holdout_effective[i] /= np.sqrt(m2_holdout[i] + 1e-8)
         gw_holdout_norm += (gw_holdout_effective[i]**2).sum()
     gw_holdout_norm = np.sqrt(gw_holdout_norm)
     
     f_tot = 0
     gw_tot = ndict.cloneZeros(w)
     alphas = []
     for j in range(len(funcs)):
         f, gw = funcs[j](w)
         f_tot += f
         
         gw_norm = 0
         gw_effective = ndict.clone(gw)
         for i in w:
             # Update first and second moments
             m1[j][i] += lambd * (gw[i] - m1[j][i])
             m2[j][i] += lambd * (gw[i]**2 - m2[j][i])
             gw_effective[i] /= np.sqrt(m2[j][i] + 1e-8)
             gw_norm += (gw_effective[i]**2).sum()
         gw_norm = np.sqrt(gw_norm)
         
         # Compute dot product with holdout gradient
         alpha = 0
         for i in w:
             alpha += (gw_effective[i] * gw_holdout_effective[i]).sum()
         alpha /= gw_holdout_norm * gw_norm
         
         alphas.append(alpha)
         
         #alpha = (alpha > 0) * 1.0
         
         for i in w:
             # Accumulate gradient of subobjective
             gw_tot[i] += alpha * gw[i] / np.sqrt(m2[j][i] + 1e-8)
         
     #print 'alphas:', alphas
     
     if batchi[0] > warmup:
         for i in w:
             w[i] += stepsize * gw_tot[i]
     
     
     batchi[0] += 1
     
     return f_tot
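
The alpha weights above are cosine similarities between each sub-objective's second-moment-normalized gradient and the similarly normalized held-out gradient. A compact standalone sketch of that weighting on plain NumPy vectors (illustrative, ignoring the per-element second-moment normalization; not part of the original code):

import numpy as np

def alignment_weighted_sum(grads, grad_holdout, eps=1e-8):
    # Weight each gradient by its cosine similarity with the held-out gradient.
    g_h = grad_holdout / (np.linalg.norm(grad_holdout) + eps)
    total = np.zeros_like(grad_holdout)
    for g in grads:
        alpha = float(g @ g_h) / (np.linalg.norm(g) + eps)
        total += alpha * g
    return total

g_holdout = np.array([1.0, 0.0])
grads = [np.array([0.9, 0.1]),    # well aligned: weight close to +1
         np.array([-1.0, 0.0])]   # anti-aligned: weight -1, contribution flipped
print(alignment_weighted_sum(grads, g_holdout))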
Example #6: step_adadelta. Running mean squares of gradients (gw_ms) and of parameter deltas (dw_ms) set the per-parameter step size sqrt(dw_ms + eps) / sqrt(gw_ms + eps).
def step_adadelta(func, w, gamma=0.05, eps=1e-6):
    print 'Adadelta', gamma, eps
    
    # mean square of gradients and delta's of z's
    gw_ms = ndict.cloneZeros(w)
    dw_ms = ndict.cloneZeros(w)
    dw = ndict.cloneZeros(w)
    
    batchi = [0]
    
    def doStep(w, z=None):
        if z is None: z = {}
        v, gw = func.subgrad(batchi[0]%func.n_minibatches, w, z)
        
        for i in gw:
            gw_ms[i] += gamma*(gw[i]**2 - gw_ms[i])
            dw[i] = np.sqrt(dw_ms[i] + eps)/np.sqrt(gw_ms[i] + eps) * gw[i]
            w[i] += dw[i]
            dw_ms[i] += gamma*(dw[i]**2 - dw_ms[i])
        batchi[0] += 1
        return v
    return doStep
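
A self-contained toy run of the same AdaDelta rule (gradient ascent on a 1-D concave objective; illustrative, not part of the original code):

import numpy as np

def adadelta_ascent_demo(steps=2000, gamma=0.05, eps=1e-6):
    w = 5.0                      # maximize f(w) = -(w - 1)**2, optimum at w = 1
    gw_ms, dw_ms = 0.0, 0.0      # running mean squares of gradients and of deltas
    for _ in range(steps):
        g = -2.0 * (w - 1.0)               # gradient of f
        gw_ms += gamma * (g * g - gw_ms)
        dw = np.sqrt(dw_ms + eps) / np.sqrt(gw_ms + eps) * g
        w += dw                            # ascent step, as in step_adadelta
        dw_ms += gamma * (dw * dw - dw_ms)
    return w

# Moves from 5.0 toward the optimum at 1.0; progress is slow at first because
# the step size has to build up from eps.
print(adadelta_ascent_demo())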
Example #7: step_adagrad, AdaGrad updates with a warm-up period, an optional annealing switch, and optional decay of the accumulated squared gradients.
def step_adagrad(func, w, stepsize=0.1, warmup=10, anneal=True, decay=0):
    print 'Adagrad', stepsize
    # sum of squares of gradients and delta's of z's and w's
    gw_ss = ndict.cloneZeros(w)
    batchi = [0]
    def doStep(w, z=None):
        if z is None: z = {}
        logpwxz, gw = func.subgrad(batchi[0]%func.n_minibatches, w, z)
        c = 1
        if not anneal:
            c = 1./ (batchi[0]+1)    
        for i in gw:
            #print i, np.sqrt(gw_ss[i]).max(), np.sqrt(gw_ss[i]).min()
            gw_ss[i] = (1-decay)*gw_ss[i] + gw[i]**2
            if batchi[0] < warmup: continue
            w[i] += stepsize / np.sqrt(gw_ss[i] * c + 1e-8) * gw[i]
        batchi[0] += 1
        return logpwxz
    return doStep
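
The same update rule on a self-contained toy problem (gradient ascent with AdaGrad scaling and a warm-up period; illustrative, not part of the original code):

import numpy as np

def adagrad_ascent_demo(steps=1000, stepsize=0.1, warmup=10):
    w = 4.0                       # maximize f(w) = -(w - 1)**2, optimum at w = 1
    gw_ss = 0.0                   # accumulated squared gradients
    for t in range(steps):
        g = -2.0 * (w - 1.0)
        gw_ss += g * g
        if t < warmup:
            continue              # accumulate statistics before updating, as above
        w += stepsize / np.sqrt(gw_ss + 1e-8) * g
    return w

print(adagrad_ascent_demo())      # climbs from 4.0 toward the optimum at 1.0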
Example #8: step_rmsprop, RMSprop updates for MAP estimation. The minibatch likelihood gradient is rescaled to the full dataset, a Gaussian prior gradient is added, and steps are divided by the square root of a running mean of squared gradients.
def step_rmsprop(w, model, x, prior_sd=1, n_batch=100, stepsize=1e-2, lambd=1e-2, warmup=10):
    print 'RMSprop', stepsize
    # sum of squares of gradients and delta's of z's and w's
    gw_ss = ndict.cloneZeros(w)
    n_datapoints = x.itervalues().next().shape[1]
    
    batchi = [0]
    
    def doStep(w):
        
        # Pick random minibatch
        idx = np.random.randint(0, n_datapoints, size=(n_batch,))
        _x = ndict.getColsFromIndices(x, idx)
        
        # Evaluate likelihood and its gradient
        logpx, _, gw, _ = model.dlogpxz_dwz(w, _x, {})
        
        for i in w:
            gw[i] *= float(n_datapoints) / n_batch
        
        # Evalute prior and its gradient
        logpw = 0
        for i in w:
            logpw -= (.5 * (w[i]**2) / (prior_sd**2)).sum()
            gw[i] -= w[i] / (prior_sd**2)
        
        for i in gw:
            #print i, np.sqrt(gw_ss[i]).max(), np.sqrt(gw_ss[i]).min()
            gw_ss[i] += lambd * (gw[i]**2 - gw_ss[i])
            if batchi[0] < warmup: continue
            w[i] += stepsize * gw[i] / np.sqrt(gw_ss[i] + 1e-8)
        
        batchi[0] += 1
        
        return logpx + logpw

    return doStep
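
The same RMSprop rule on a self-contained toy problem (gradient ascent; illustrative, not part of the original code):

import numpy as np

def rmsprop_ascent_demo(steps=1000, stepsize=1e-2, lambd=1e-2, warmup=10):
    w = 4.0                       # maximize f(w) = -(w - 1)**2 / 2, optimum at w = 1
    gw_ss = 0.0                   # running mean of squared gradients
    for t in range(steps):
        g = -(w - 1.0)
        gw_ss += lambd * (g * g - gw_ss)
        if t < warmup:
            continue              # accumulate statistics before updating, as above
        w += stepsize * g / np.sqrt(gw_ss + 1e-8)
    return w

print(rmsprop_ascent_demo())      # moves from 4.0 toward the optimum at 1.0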
Example #9: step_pvem ('Predictive VEM'). It samples z from the recognition model (optionally refined with HMC, disabled here since hmc_steps = 0), updates the generative model p with AdaGrad, then updates the recognition model q through model_q.dfd_dw using the latent gradients gz_p from model_p.
def step_pvem(model_q,
              model_p,
              x,
              w_q,
              n_batch=100,
              ada_stepsize=1e-1,
              warmup=100,
              reg=1e-8,
              convertImgs=False):
    print 'Predictive VEM', ada_stepsize

    hmc_steps = 0
    hmc_dostep = hmc.hmc_step_autotune(n_steps=hmc_steps, init_stepsize=1e-1)

    # We're using adagrad stepsizes
    gw_q_ss = ndict.cloneZeros(w_q)
    gw_p_ss = ndict.cloneZeros(model_p.init_w())

    nsteps = [0]

    do_adagrad = True

    def doStep(w_p):

        #def fgrad(_z):
        #    logpx, logpz, gw, gz = model_p.dlogpxz_dwz(w, x, _z)
        #    return logpx + logpz, gz
        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
        if convertImgs:
            x_minibatch = {i: x_minibatch[i] / 256. for i in x_minibatch}

        # step 1A: sample z ~ p(z|x) from model_q
        _, z, _ = model_q.gen_xz(w_q, x_minibatch, {}, n_batch)

        # step 1B: update z using HMC
        def fgrad(_z):
            logpx, logpz, gw, gz = model_p.dlogpxz_dwz(w_p, _z, x_minibatch)
            return logpx + logpz, gz

        if (hmc_steps > 0):
            logpxz, _, _ = hmc_dostep(fgrad, z)

        def optimize(w, gw, gw_ss, stepsize):
            if do_adagrad:
                for i in gw:
                    gw_ss[i] += gw[i]**2
                    if nsteps[0] > warmup:
                        w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * gw[i]
                    #print (stepsize / np.sqrt(gw_ss[i]+reg)).mean()
            else:
                for i in gw:
                    w[i] += 1e-4 * gw[i]

        # step 2: use z to update model_p
        logpx_p, logpz_p, gw_p, gz_p = model_p.dlogpxz_dwz(w_p, x_minibatch, z)
        _, gw_prior = model_p.dlogpw_dw(w_p)
        gw = {i: gw_p[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_p}
        optimize(w_p, gw, gw_p_ss, ada_stepsize)

        # step 3: use gradients of model_p to update model_q
        _, logpz_q, fd, gw_q = model_q.dfd_dw(w_q, x_minibatch, z, gz_p)
        _, gw_prior = model_q.dlogpw_dw(w_q)
        gw = {i: -gw_q[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_q}
        optimize(w_q, gw, gw_q_ss, ada_stepsize)

        nsteps[0] += 1

        return z.copy(), logpx_p + logpz_p - logpz_q

    return doStep
Example #10: step_adasgvb2, SGVB with AdaGrad for a diagonal Gaussian variational posterior over the model weights. Here w and logsd are the variational mean and log standard deviation; gradients are estimated by perturbing the weights with noise exp(logsd) * eps, variance is reduced with control variates, and updates use AdaGrad with momentum, a warm-up period, and optional annealing.
def step_adasgvb2(w, logsd, x, model, var='diag', negNoise=False, init_logsd=0, prior_sd=1, n_batch=1, n_subbatch=100, stepsize=1e-2, warmup=10, momw=0.75, momsd=0.75, anneal=False, sgd=False):
    print "SGVB + Adagrad", var, negNoise, init_logsd, prior_sd, n_batch, n_subbatch, stepsize, warmup, momw, momsd, anneal, sgd
    
    
    # w and logsd are the variational mean and log standard deviation that are learned
    
    g_w_ss = ndict.cloneZeros(w) # sum-of-squares for adagrad
    mom_w = ndict.cloneZeros(w) # momentum
    
    cv_lr = 0.1 # learning rate for control variates
    cov_mean = ndict.cloneZeros(w)
    var_mean = ndict.cloneZeros(w)
    cov_logsd = ndict.cloneZeros(w)
    var_logsd = ndict.cloneZeros(w)
    
    if var != 'diag':
        raise Exception("Didn't write control variate code for non-diag variance yet")
    
    if var == 'diag' or var == 'row_isotropic':
        #logsd = ndict.cloneZeros(w)
        for i in w: logsd[i] += init_logsd
        g_logsd_ss = ndict.cloneZeros(w)
        mom_logsd = ndict.cloneZeros(w)
    elif var == 'isotropic':
        logsd = {i: init_logsd for i in w}
        g_logsd_ss = {i: 0 for i in w}
        mom_logsd = {i: 0 for i in w}
    else: raise Exception("Unknown variance type")
    
    n_datapoints = x.itervalues().next().shape[1]
    
    batchi = [0]
    def doStep(w, z=None):
        if z is not None: raise Exception()
        
        L = [0] # Lower bound
        g_mean = ndict.cloneZeros(w)
        if var == 'diag' or var == 'row_isotropic':
            g_logsd = ndict.cloneZeros(w)
        elif var == 'isotropic':
            g_logsd = {i:0 for i in w}
        
        # Loop over random datapoints
        for l in range(n_batch):
            
            # Pick random datapoint
            idx = np.random.randint(0, n_datapoints, size=(n_subbatch,))
            _x = ndict.getColsFromIndices(x, idx)
            
            # Function that adds gradients for given noise eps
            def add_grad(eps):
                # Compute noisy weights
                _w = {i: w[i] + np.exp(logsd[i]) * eps[i] for i in w}
                # Compute gradients of log p(x|theta) w.r.t. w
                logpx, logpz, g_w, g_z = model.dlogpxz_dwz(_w, _x, {})        
                for i in w:
                    cv = (_w[i] - w[i]) / np.exp(2*logsd[i])  #control variate
                    cov_mean[i] += cv_lr * (g_w[i]*cv - cov_mean[i])
                    var_mean[i] += cv_lr * (cv**2 - var_mean[i])
                    g_mean[i] += g_w[i] - cov_mean[i]/var_mean[i] * cv
                    
                    if var == 'diag' or var == 'row_isotropic':
                        grad = g_w[i] * eps[i] * np.exp(logsd[i])
                        cv = cv - 1 # this control variate (c.v.) is really similar to the c.v. for the mean!
                        cov_logsd[i] += cv_lr * (grad*cv - cov_logsd[i])
                        var_logsd[i] += cv_lr * (cv**2  - var_logsd[i])
                        g_logsd[i] += grad - cov_logsd[i]/var_logsd[i] * cv
                    elif var == 'isotropic':
                        g_logsd[i] += (g_w[i] * eps[i]).sum() * np.exp(logsd[i])
                    else: raise Exception()
                    
                L[0] += logpx.sum() + logpz.sum()
            
            # Gradients with generated noise
            eps = {i: np.random.standard_normal(size=w[i].shape) for i in w}
            if sgd: eps = {i: np.zeros(w[i].shape) for i in w}
            add_grad(eps)
            
            # Gradient with negative of noise
            if negNoise:
                for i in eps: eps[i] *= -1
                add_grad(eps)
        
        L = L[0]        
        L *= float(n_datapoints) / float(n_subbatch) / float(n_batch)
        if negNoise: L /= 2
        
        for i in w:
            c = float(n_datapoints) / (float(n_subbatch) * float(n_batch))
            if negNoise: c /= 2
            g_mean[i] *= c
            g_logsd[i] *= c
                        
            # Prior
            g_mean[i] += - w[i] / (prior_sd**2)
            g_logsd[i] += - np.exp(2 * logsd[i]) / (prior_sd**2)
            L += - (w[i]**2 + np.exp(2 * logsd[i])).sum() / (2 * prior_sd**2)
            L += - 0.5 * np.log(2 * np.pi * prior_sd**2) * float(w[i].size)
            
            # Entropy
            L += float(w[i].size) * 0.5 * math.log(2 * math.pi * math.e)  # Gaussian entropy constant 0.5*log(2*pi*e) per dimension
            if var == 'diag' or var == 'row_isotropic':
                g_logsd[i] += 1 # dH(q)/d[logsd] = 1 (nice!)
                L += logsd[i].sum()
            elif var == 'isotropic':
                g_logsd[i] += float(w[i].size) # dH(q)/d[logsd] = 1 (nice!)
                L += logsd[i] * float(w[i].size)
            else: raise Exception()
            
        # Update variational parameters
        c = 1
        if not anneal:
            c = 1./ (batchi[0] + 1)
        
        # For isotropic row variance, sum gradients per row
        if var == 'row_isotropic':
            for i in w:
                g_sum = g_logsd[i].sum(axis=1).reshape(w[i].shape[0], 1)
                g_logsd[i] = np.dot(g_sum, np.ones((1, w[i].shape[1])))
        
        for i in w:
            #print i, np.sqrt(gw_ss[i]).max(), np.sqrt(gw_ss[i]).min()
            g_w_ss[i] += g_mean[i]**2
            g_logsd_ss[i] += g_logsd[i]**2
            
            mom_w[i] += (1-momw) * (g_mean[i] - mom_w[i])
            mom_logsd[i] += (1-momsd) * (g_logsd[i] - mom_logsd[i])
            
            if batchi[0] < warmup: continue
            
            w[i] += stepsize / np.sqrt(g_w_ss[i] * c + 1e-8) * mom_w[i]
            logsd[i] += stepsize / np.sqrt(g_logsd_ss[i] * c + 1e-8) * mom_logsd[i]            
            
        batchi[0] += 1
        
        #print cov_mean['b0']/var_mean['b0']
        
        return L

    return doStep
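
step_adasgvb2 learns a diagonal Gaussian posterior over the weights by evaluating the model at perturbed weights w + exp(logsd) * eps. A minimal standalone sketch of the two gradient estimators used in add_grad for the mean and for logsd (one scalar weight, no control variates; illustrative, not part of the original code):

import numpy as np

def weight_noise_grads(w, logsd, grad_logp, n_samples=100000, seed=0):
    # Monte Carlo gradients of E_{eps ~ N(0,1)}[ log p(x | w + exp(logsd)*eps) ]
    # with respect to the variational mean w and log standard deviation logsd.
    rng = np.random.default_rng(seed)
    eps = rng.standard_normal(n_samples)
    g = grad_logp(w + np.exp(logsd) * eps)       # d log p / d w at the noisy weights
    g_mean = g.mean()                            # raw estimator behind g_mean above (before control variates)
    g_logsd = (g * eps * np.exp(logsd)).mean()   # the g_w * eps * exp(logsd) term above
    return g_mean, g_logsd

# Toy log-likelihood log p(x|w) = -(w - 2)**2: the exact gradients are
# -2*(w - 2) for the mean and -2*exp(2*logsd) for logsd.
print(weight_noise_grads(w=1.0, logsd=0.0, grad_logp=lambda u: -2.0 * (u - 2.0)))
# roughly (2.0, -2.0)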