import numpy as np

# Assumed to be this package's HMC sampler and named-dict utility modules
# (e.g. anglepy.hmc / anglepy.ndict); adjust the import paths to match.
import hmc
import ndict


def step_hmc_wz(model, x, z, hmc_stepsize=1e-2, hmc_steps=20):
    print 'step_hmc_wz', hmc_stepsize, hmc_steps
    hmc_dostep_z = hmc.hmc_step_autotune(n_steps=hmc_steps, init_stepsize=hmc_stepsize)
    hmc_dostep_w = hmc.hmc_step_autotune(n_steps=hmc_steps, init_stepsize=hmc_stepsize)

    def dostep(w):
        # HMC step for the latent variables z, with the weights w fixed
        def fgrad_z(_z):
            logpx, logpz, gw, gz = model.dlogpxz_dwz(w, x, _z)
            return logpx + logpz, gz

        logpxz, acceptRate, stepsize = hmc_dostep_z(fgrad_z, z)

        # HMC step for the weights w, with z fixed. The weights are
        # flattened to column vectors for the sampler; reshape returns
        # views for contiguous arrays, so the sampler's in-place updates
        # propagate back to w.
        shapes_w = ndict.getShapes(w)

        def vectorize(d):
            v = {}
            for i in d:
                v[i] = d[i].reshape((d[i].size, -1))
            return v

        def fgrad_w(_w):
            _w = ndict.setShapes(_w, shapes_w)
            logpx, logpz, gw, gz = model.dlogpxz_dwz(_w, x, z)
            gw = vectorize(gw)
            return logpx + logpz, gw

        _w = vectorize(w)
        hmc_dostep_w(fgrad_w, _w)

        return z.copy(), logpxz.copy()

    return dostep
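# Example usage (a hedged sketch, not part of the original source: `model`
# is assumed to expose init_w/gen_xz/dlogpxz_dwz as used above, and `x` is
# a dict of data matrices with shape (n_dims, n_batch)):
#
#   w = model.init_w()
#   _, z, _ = model.gen_xz(w, x, {}, n_batch)
#   dostep = step_hmc_wz(model, x, z)
#   for it in xrange(100):
#       z, logpxz = dostep(w)  # updates both w and z in place via HMC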
def step_batch_mcem(model_p, x, z_mcmc, dostep_m, hmc_stepsize=1e-2, hmc_steps=20, m_steps=5):
    print 'Batch MCEM', hmc_stepsize, hmc_steps, m_steps
    hmc_dostep = hmc.hmc_step_autotune(n_steps=hmc_steps, init_stepsize=hmc_stepsize)

    def doStep(w):
        def fgrad(_z):
            logpx, logpz, gw, gz = model_p.dlogpxz_dwz(w, x, _z)
            return logpx + logpz, gz

        # E-step: refresh the latent sample z ~ p(z|x,w) with HMC
        logpxz, acceptRate, stepsize = hmc_dostep(fgrad, z_mcmc)

        # M-step: update the weights w given the current latent sample
        for i in range(m_steps):
            dostep_m(w, z_mcmc)

        return z_mcmc.copy(), logpxz.copy()

    return doStep
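# Example usage (a hedged sketch, not part of the original source: any
# callable dostep_m(w, z) that performs one gradient-based M-step update
# of w, e.g. one built from model_p.dlogpxz_dwz, fits the interface):
#
#   w = model_p.init_w()
#   _, z_mcmc, _ = model_p.gen_xz(w, x, {}, n_batch)
#   dostep = step_batch_mcem(model_p, x, z_mcmc, dostep_m)
#   for it in xrange(100):
#       z_mcmc, logpxz = dostep(w)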
def step_pvem(model_q, model_p, x, w_q, n_batch=100, ada_stepsize=1e-1, warmup=100, reg=1e-8, convertImgs=False):
    print 'Predictive VEM', ada_stepsize

    # Optional HMC refinement of z; disabled by default
    hmc_steps = 0
    hmc_dostep = hmc.hmc_step_autotune(n_steps=hmc_steps, init_stepsize=1e-1)

    # AdaGrad accumulators: per-parameter sums of squared gradients
    gw_q_ss = ndict.cloneZeros(w_q)
    gw_p_ss = ndict.cloneZeros(model_p.init_w())
    nsteps = [0]
    do_adagrad = True

    def doStep(w_p):
        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
        if convertImgs:
            x_minibatch = {i: x_minibatch[i] / 256. for i in x_minibatch}

        # step 1A: sample z ~ q(z|x) from the recognition model model_q
        _, z, _ = model_q.gen_xz(w_q, x_minibatch, {}, n_batch)

        # step 1B: optionally refine z with HMC under the generative model
        def fgrad(_z):
            logpx, logpz, gw, gz = model_p.dlogpxz_dwz(w_p, x_minibatch, _z)
            return logpx + logpz, gz

        if hmc_steps > 0:
            logpxz, _, _ = hmc_dostep(fgrad, z)

        def optimize(w, gw, gw_ss, stepsize):
            if do_adagrad:
                # Accumulate squared gradients from the start, but only
                # take AdaGrad steps once the warmup period is over
                for i in gw:
                    gw_ss[i] += gw[i]**2
                    if nsteps[0] > warmup:
                        w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * gw[i]
            else:
                for i in gw:
                    w[i] += 1e-4 * gw[i]

        # step 2: use z to update the generative model model_p;
        # the prior gradient is rescaled by the minibatch fraction
        logpx_p, logpz_p, gw_p, gz_p = model_p.dlogpxz_dwz(w_p, x_minibatch, z)
        _, gw_prior = model_p.dlogpw_dw(w_p)
        gw = {i: gw_p[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_p}
        optimize(w_p, gw, gw_p_ss, ada_stepsize)

        # step 3: use the gradients of model_p w.r.t. z to update model_q
        _, logpz_q, fd, gw_q = model_q.dfd_dw(w_q, x_minibatch, z, gz_p)
        _, gw_prior = model_q.dlogpw_dw(w_q)
        gw = {i: -gw_q[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_q}
        optimize(w_q, gw, gw_q_ss, ada_stepsize)

        nsteps[0] += 1
        return z.copy(), logpx_p + logpz_p - logpz_q

    return doStep
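# Example usage (a hedged sketch, not part of the original source:
# model_q is the recognition model with gen_xz/dfd_dw, model_p the
# generative model with dlogpxz_dwz/dlogpw_dw; the second return value
# is a minibatch estimate of logp(x,z) - logq(z|x)):
#
#   w_p = model_p.init_w()
#   w_q = model_q.init_w()
#   dostep = step_pvem(model_q, model_p, x, w_q, n_batch=100)
#   for it in xrange(10000):
#       z, lb = dostep(w_p)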