# Assumed imports for the helpers used throughout this file (numpy, math, and
# the repo's own ndict / hmc utility modules); the exact import paths may
# differ in the original project.
import math

import numpy as np

import ndict
import hmc


def step_vae(model, x, v, w, n_batch=100, stepsize=1e-1, warmup=100, anneal=True, convertImgs=False, binarize=False):
    print 'Variational Auto-Encoder', n_batch, stepsize, warmup

    # We're using adagrad stepsizes
    gv_ss = ndict.cloneZeros(v)
    gw_ss = ndict.cloneZeros(w)

    nsteps = [0]

    def doStep(v, w):

        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
        if convertImgs: x_minibatch['x'] = x_minibatch['x'] / 256.
        if binarize: x_minibatch['x'] = np.random.binomial(n=1, p=x_minibatch['x'])

        # Sample epsilon from prior
        z = model.gen_eps(n_batch)
        #for i in z: z[i] *= 0

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(v, w, x_minibatch, z)
        _, _, gv_prior, gw_prior = model.dlogpw_dw(v, w)
        gv = {i: gv[i] + float(n_batch) / n_tot * gv_prior[i] for i in gv}
        gw = {i: gw[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw}

        # Update parameters
        adagrad_reg = 1e-8
        c = 1.0
        if not anneal:
            c /= nsteps[0] + 1
        for i in gv:
            gv_ss[i] += gv[i]**2
            if nsteps[0] > warmup:
                v[i] += stepsize / np.sqrt(gv_ss[i] * c + adagrad_reg) * gv[i]
        for i in gw:
            gw_ss[i] += gw[i]**2
            if nsteps[0] > warmup:
                w[i] += stepsize / np.sqrt(gw_ss[i] * c + adagrad_reg) * gw[i]

        nsteps[0] += 1

        return z.copy(), logpx + logpz - logqz

    return doStep

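# ---------------------------------------------------------------------------
# Illustration (not part of the original code): a minimal, self-contained
# sketch of the reparameterization-trick estimator that step_vae relies on,
# on a toy 1-D problem. It fits q(z) = N(mu, exp(logsd)^2) to maximize
# E_q[log N(z; 0, 1)] + H(q) by sampling eps ~ N(0, 1) and differentiating
# through z = mu + exp(logsd) * eps. All names below are hypothetical.
def _toy_reparam_sketch(n_steps=2000, stepsize=1e-1, seed=0):
    rng = np.random.RandomState(seed)
    mu, logsd = 2.0, 1.0              # variational parameters, started away from the optimum
    ss_mu, ss_logsd = 0., 0.          # adagrad sum-of-squares, as in step_vae
    for t in range(n_steps):
        eps = rng.standard_normal()
        sd = np.exp(logsd)
        z = mu + sd * eps             # reparameterized sample
        # Gradients of log p(z) = -0.5*z^2 + const, propagated through z(mu, logsd):
        dlogp_dz = -z
        g_mu = dlogp_dz               # dz/dmu = 1
        g_logsd = dlogp_dz * sd * eps + 1.0   # dz/dlogsd = sd*eps; +1 from d[entropy]/dlogsd
        ss_mu += g_mu**2
        ss_logsd += g_logsd**2
        mu += stepsize / np.sqrt(ss_mu + 1e-8) * g_mu
        logsd += stepsize / np.sqrt(ss_logsd + 1e-8) * g_logsd
    return mu, np.exp(logsd)          # drifts toward the optimum (0, 1)
# ---------------------------------------------------------------------------
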
def step_wakesleep(model_q, model_p, x, w_q, n_batch=100, ada_stepsize=1e-1, warmup=100, reg=1e-8, convertImgs=False):
    print 'Wake-Sleep', ada_stepsize

    # We're using adagrad stepsizes
    gw_q_ss = ndict.cloneZeros(w_q)
    gw_p_ss = ndict.cloneZeros(model_p.init_w())

    nsteps = [0]
    do_adagrad = True

    def doStep(w_p):

        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}

        def optimize(w, gw, gw_ss, stepsize):
            if do_adagrad:
                for i in gw:
                    gw_ss[i] += gw[i]**2
                    if nsteps[0] > warmup:
                        w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * gw[i]
                    #print (stepsize / np.sqrt(gw_ss[i]+reg)).mean()
            else:
                for i in gw:
                    w[i] += 1e-4 * gw[i]

        # Wake phase: use z ~ q(z|x) to update model_p
        _, z, _ = model_q.gen_xz(w_q, x_minibatch, {}, n_batch)
        _, logpz_q = model_q.logpxz(w_q, x_minibatch, z)
        logpx_p, logpz_p, gw_p, gz_p = model_p.dlogpxz_dwz(w_p, x_minibatch, z)
        _, gw_prior = model_p.dlogpw_dw(w_p)
        gw = {i: gw_p[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_p}
        optimize(w_p, gw, gw_p_ss, ada_stepsize)

        # Sleep phase: use x ~ p(x|z) to update model_q
        x_p, z_p, _ = model_p.gen_xz(w_p, {}, {}, n_batch)
        _, _, gw_q, _ = model_q.dlogpxz_dwz(w_q, x_p, z_p)
        _, gw_prior = model_q.dlogpw_dw(w_q)
        gw = {i: gw_q[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_q}
        optimize(w_q, gw, gw_q_ss, ada_stepsize)

        nsteps[0] += 1

        return z.copy(), logpx_p + logpz_p - logpz_q

    return doStep

# Inner step of a stochastic variational estimator with per-parameter control
# variates. The enclosing step_* definition is not shown in this section; the
# names v, model_q, model_p, x, n_batch, n_subbatch, convertImgs, cv_cov,
# cv_var, cv_lr, gw_ss, gv_ss, nsteps, warmup and ada_stepsize are closure
# variables from that missing definition.
def doStep(w):

    grad = ndict.cloneZeros(v)
    gw = ndict.cloneZeros(w)

    for l in range(n_batch):
        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_subbatch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
        if convertImgs: x_minibatch = {i: x_minibatch[i] / 256. for i in x_minibatch}

        # Use z ~ q(z|x) to compute d[LB]/d[gw]
        _, z, _ = model_q.gen_xz(v, x_minibatch, {}, n_subbatch)
        _, logpz_q = model_q.logpxz(v, x_minibatch, z)
        logpx_p, logpz_p, _gw, gz_p = model_p.dlogpxz_dwz(w, x_minibatch, z)
        for i in _gw:
            gw[i] += _gw[i]

        # Compute d[LB]/d[gv] where gv = v (variational params)
        _, _, gv, _ = model_q.dlogpxz_dwz(v, x_minibatch, z)
        weight = np.sum(logpx_p) + np.sum(logpz_p) - np.sum(logpz_q)
        for i in v:
            f = gv[i] * weight
            h = gv[i]
            cv_cov[i] = cv_cov[i] + cv_lr * (f * h - cv_cov[i])
            cv_var[i] = cv_var[i] + cv_lr * (h**2 - cv_var[i])
            grad[i] += f - (cv_cov[i] / (cv_var[i] + 1e-8)) * h

    _, gwprior = model_p.dlogpw_dw(w)
    for i in gw:
        gw[i] += float(n_subbatch * n_batch) / n_tot * gwprior[i]

    def optimize(_w, _gw, gw_ss, stepsize):
        reg = 1e-8
        for i in _gw:
            gw_ss[i] += _gw[i]**2
            if nsteps[0] > warmup:
                _w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * _gw[i]

    optimize(w, gw, gw_ss, ada_stepsize)
    optimize(v, grad, gv_ss, ada_stepsize)

    nsteps[0] += 1

    if ndict.hasNaN(grad): raise Exception()
    if ndict.hasNaN(v): raise Exception()

    return z.copy(), logpx_p + logpz_p - logpz_q

def step_naivesvb(model_q, model_p, x, v, n_batch=100, ada_stepsize=1e-1, warmup=100, reg=1e-8, convertImgs=False):
    print 'Naive SV Est', ada_stepsize

    # We're using adagrad stepsizes
    gv_ss = ndict.cloneZeros(v)
    gw_ss = ndict.cloneZeros(model_p.init_w())

    nsteps = [0]
    do_adagrad = True

    def doStep(w):

        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
        if convertImgs: x_minibatch = {i: x_minibatch[i] / 256. for i in x_minibatch}

        def optimize(w, gw, gw_ss, stepsize):
            if do_adagrad:
                for i in gw:
                    gw_ss[i] += gw[i]**2
                    if nsteps[0] > warmup:
                        w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * gw[i]
                    #print (stepsize / np.sqrt(gw_ss[i]+reg)).mean()
            else:
                for i in gw:
                    w[i] += 1e-4 * gw[i]

        # Phase 1: use z ~ q(z|x) to update model_p
        _, z, _ = model_q.gen_xz(v, x_minibatch, {}, n_batch)
        _, logpz_q = model_q.logpxz(v, x_minibatch, z)
        logpx_p, logpz_p, gw, _ = model_p.dlogpxz_dwz(w, x_minibatch, z)
        _, gw_prior = model_p.dlogpw_dw(w)
        gw = {i: gw[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw}

        # Phase 2: update the variational parameters v of model_q with the
        # naive score-function estimator: scale d[log q(z|x)]/dv by a
        # baselined sample estimate of the lower bound
        _, _, gv, _ = model_q.dlogpxz_dwz(v, x_minibatch, z)
        #_, gw_prior = model_q.dlogpw_dw(w_q)
        #gw_q = {i: gw_q[i] + float(n_batch)/n_tot * gw_prior[i] for i in gw_q}
        weight = np.sum(logpx_p) + np.sum(logpz_p) - np.sum(logpz_q) - float(n_batch)
        gv = {i: gv[i] * weight for i in gv}

        optimize(w, gw, gw_ss, ada_stepsize)
        optimize(v, gv, gv_ss, ada_stepsize)

        nsteps[0] += 1

        return z.copy(), logpx_p + logpz_p - logpz_q

    return doStep

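# ---------------------------------------------------------------------------
# Illustration (not part of the original code): a minimal sketch of the naive
# score-function ("likelihood-ratio") estimator used above for the variational
# parameters: scale d[log q(z)]/d[param] by a baselined sample estimate of the
# objective, analogous to the `weight` variable in step_naivesvb. Toy problem:
# move the mean of q(z) = N(mu, 1) to maximize E_q[log N(z; 0, 1)]. All names
# here are hypothetical.
def _toy_scorefunction_sketch(n_steps=5000, stepsize=1e-1, seed=0):
    rng = np.random.RandomState(seed)
    mu = 3.0
    ss = 0.                                  # adagrad accumulator
    for t in range(n_steps):
        z = mu + rng.standard_normal()       # z ~ q(z) = N(mu, 1)
        f = -0.5 * z**2                      # log p(z), up to a constant
        baseline = -0.5 * (mu**2 + 1.)       # analytic E_q[f]; a running average also works
        score = z - mu                       # d log q(z) / d mu (unit variance)
        g = (f - baseline) * score           # baselined score-function gradient estimate
        ss += g**2
        mu += stepsize / np.sqrt(ss + 1e-8) * g
    return mu                                # drifts toward 0, but with far higher variance
                                             # than the reparameterization sketch above
# ---------------------------------------------------------------------------
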
# Inner step of an optimizer that reweights the gradient of each sub-objective
# by its cosine similarity (after per-coordinate RMS rescaling) with the
# gradient of a holdout objective. The enclosing definition is not shown in
# this section; funcs, func_holdout, m1, m2, m1_holdout, m2_holdout, lambd,
# stepsize, warmup and batchi are closure variables from it.
def doStep(w):

    f_holdout, gw_holdout = func_holdout(w)
    gw_holdout_norm = 0
    gw_holdout_effective = ndict.clone(gw_holdout)
    for i in w:
        m1_holdout[i] += lambd * (gw_holdout[i] - m1_holdout[i])
        m2_holdout[i] += lambd * (gw_holdout[i]**2 - m2_holdout[i])
        gw_holdout_effective[i] /= np.sqrt(m2_holdout[i] + 1e-8)
        gw_holdout_norm += (gw_holdout_effective[i]**2).sum()
    gw_holdout_norm = np.sqrt(gw_holdout_norm)

    f_tot = 0
    gw_tot = ndict.cloneZeros(w)
    alphas = []
    for j in range(len(funcs)):
        f, gw = funcs[j](w)
        f_tot += f

        gw_norm = 0
        gw_effective = ndict.clone(gw)
        for i in w:
            # Update first and second moments
            m1[j][i] += lambd * (gw[i] - m1[j][i])
            m2[j][i] += lambd * (gw[i]**2 - m2[j][i])
            gw_effective[i] /= np.sqrt(m2[j][i] + 1e-8)
            gw_norm += (gw_effective[i]**2).sum()
        gw_norm = np.sqrt(gw_norm)

        # Compute dot product with holdout gradient
        alpha = 0
        for i in w:
            alpha += (gw_effective[i] * gw_holdout_effective[i]).sum()
        alpha /= gw_holdout_norm * gw_norm
        alphas.append(alpha)
        #alpha = (alpha > 0) * 1.0

        for i in w:
            # Accumulate gradient of subobjective
            gw_tot[i] += alpha * gw[i] / np.sqrt(m2[j][i] + 1e-8)

    #print 'alphas:', alphas

    if batchi[0] > warmup:
        for i in w:
            w[i] += stepsize * gw_tot[i]

    batchi[0] += 1
    return f_tot

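# ---------------------------------------------------------------------------
# Illustration (not part of the original code): the core idea of the doStep
# above is to weight each sub-objective's gradient by its cosine similarity
# with a holdout gradient. The hypothetical helper below shows just that
# weighting for plain 1-D numpy arrays, omitting the running RMS rescaling.
def _alignment_weighted_grad(grads, g_holdout, eps=1e-8):
    # grads: list of 1-D gradient arrays, one per sub-objective
    # g_holdout: 1-D gradient of the holdout objective
    g_h = g_holdout / (np.linalg.norm(g_holdout) + eps)
    total = np.zeros_like(g_holdout)
    for g in grads:
        alpha = np.dot(g / (np.linalg.norm(g) + eps), g_h)   # cosine similarity in [-1, 1]
        total += alpha * g                                   # down-weight misaligned objectives
    return total
# ---------------------------------------------------------------------------
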
def step_adadelta(func, w, gamma=0.05, eps=1e-6):
    print 'Adadelta', gamma, eps

    # running mean square of gradients and of parameter updates (Adadelta accumulators)
    gw_ms = ndict.cloneZeros(w)
    dw_ms = ndict.cloneZeros(w)
    dw = ndict.cloneZeros(w)
    batchi = [0]

    def doStep(w, z=None):
        if z is None: z = {}
        v, gw = func.subgrad(batchi[0] % func.n_minibatches, w, z)
        for i in gw:
            gw_ms[i] += gamma * (gw[i]**2 - gw_ms[i])
            dw[i] = np.sqrt(dw_ms[i] + eps) / np.sqrt(gw_ms[i] + eps) * gw[i]
            w[i] += dw[i]
            dw_ms[i] += gamma * (dw[i]**2 - dw_ms[i])
        batchi[0] += 1
        return v

    return doStep

def step_adagrad(func, w, stepsize=0.1, warmup=10, anneal=True, decay=0):
    print 'Adagrad', stepsize

    # sum of squares of gradients (Adagrad accumulator)
    gw_ss = ndict.cloneZeros(w)
    batchi = [0]

    def doStep(w, z=None):
        if z is None: z = {}
        logpwxz, gw = func.subgrad(batchi[0] % func.n_minibatches, w, z)
        c = 1
        if not anneal: c = 1. / (batchi[0] + 1)
        for i in gw:
            #print i, np.sqrt(gw_ss[i]).max(), np.sqrt(gw_ss[i]).min()
            gw_ss[i] = (1 - decay) * gw_ss[i] + gw[i]**2
            if batchi[0] < warmup: continue
            w[i] += stepsize / np.sqrt(gw_ss[i] * c + 1e-8) * gw[i]
        batchi[0] += 1
        return logpwxz

    return doStep

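# ---------------------------------------------------------------------------
# Illustration (not part of the original code): the Adagrad rule used by
# step_adagrad (and most step_* functions in this file) on a toy quadratic
# f(w) = -0.5 * ||w||^2, maximized by gradient ascent. All names here are
# hypothetical.
def _toy_adagrad_sketch(n_steps=2000, stepsize=0.1):
    w = np.array([5.0, -3.0])
    gw_ss = np.zeros_like(w)                         # sum of squared gradients
    for t in range(n_steps):
        gw = -w                                      # gradient of -0.5*||w||^2
        gw_ss += gw**2
        w += stepsize / np.sqrt(gw_ss + 1e-8) * gw   # per-coordinate adaptive step
    return w                                         # moves toward the optimum at 0
# ---------------------------------------------------------------------------
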
def step_rmsprop(w, model, x, prior_sd=1, n_batch=100, stepsize=1e-2, lambd=1e-2, warmup=10):
    print 'RMSprop', stepsize

    # running (exponential) mean square of gradients (RMSprop accumulator)
    gw_ss = ndict.cloneZeros(w)

    n_datapoints = x.itervalues().next().shape[1]

    batchi = [0]

    def doStep(w):

        # Pick random minibatch
        idx = np.random.randint(0, n_datapoints, size=(n_batch,))
        _x = ndict.getColsFromIndices(x, idx)

        # Evaluate likelihood and its gradient
        logpx, _, gw, _ = model.dlogpxz_dwz(w, _x, {})
        for i in w:
            # scale minibatch gradient up to the full dataset
            # (float division; plain integer division would truncate in Python 2)
            gw[i] *= float(n_datapoints) / n_batch

        # Evaluate prior and its gradient
        logpw = 0
        for i in w:
            logpw -= (.5 * (w[i]**2) / (prior_sd**2)).sum()
            gw[i] -= w[i] / (prior_sd**2)

        for i in gw:
            #print i, np.sqrt(gw_ss[i]).max(), np.sqrt(gw_ss[i]).min()
            gw_ss[i] += lambd * (gw[i]**2 - gw_ss[i])
            if batchi[0] < warmup: continue
            w[i] += stepsize * gw[i] / np.sqrt(gw_ss[i] + 1e-8)

        batchi[0] += 1

        return logpx + logpw

    return doStep

def step_pvem(model_q, model_p, x, w_q, n_batch=100, ada_stepsize=1e-1, warmup=100, reg=1e-8, convertImgs=False):
    print 'Predictive VEM', ada_stepsize

    hmc_steps = 0
    hmc_dostep = hmc.hmc_step_autotune(n_steps=hmc_steps, init_stepsize=1e-1)

    # We're using adagrad stepsizes
    gw_q_ss = ndict.cloneZeros(w_q)
    gw_p_ss = ndict.cloneZeros(model_p.init_w())

    nsteps = [0]
    do_adagrad = True

    def doStep(w_p):

        #def fgrad(_z):
        #    logpx, logpz, gw, gz = model_p.dlogpxz_dwz(w, x, _z)
        #    return logpx + logpz, gz

        n_tot = x.itervalues().next().shape[1]
        idx_minibatch = np.random.randint(0, n_tot, n_batch)
        x_minibatch = {i: x[i][:, idx_minibatch] for i in x}
        if convertImgs: x_minibatch = {i: x_minibatch[i] / 256. for i in x_minibatch}

        # step 1A: sample z ~ q(z|x) from model_q
        _, z, _ = model_q.gen_xz(w_q, x_minibatch, {}, n_batch)

        # step 1B: update z using HMC
        def fgrad(_z):
            # argument order follows the other dlogpxz_dwz(w, x, z) calls in this file
            logpx, logpz, gw, gz = model_p.dlogpxz_dwz(w_p, x_minibatch, _z)
            return logpx + logpz, gz
        if (hmc_steps > 0):
            logpxz, _, _ = hmc_dostep(fgrad, z)

        def optimize(w, gw, gw_ss, stepsize):
            if do_adagrad:
                for i in gw:
                    gw_ss[i] += gw[i]**2
                    if nsteps[0] > warmup:
                        w[i] += stepsize / np.sqrt(gw_ss[i] + reg) * gw[i]
                    #print (stepsize / np.sqrt(gw_ss[i]+reg)).mean()
            else:
                for i in gw:
                    w[i] += 1e-4 * gw[i]

        # step 2: use z to update model_p
        logpx_p, logpz_p, gw_p, gz_p = model_p.dlogpxz_dwz(w_p, x_minibatch, z)
        _, gw_prior = model_p.dlogpw_dw(w_p)
        gw = {i: gw_p[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_p}
        optimize(w_p, gw, gw_p_ss, ada_stepsize)

        # step 3: use gradients of model_p to update model_q
        _, logpz_q, fd, gw_q = model_q.dfd_dw(w_q, x_minibatch, z, gz_p)
        _, gw_prior = model_q.dlogpw_dw(w_q)
        gw = {i: -gw_q[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw_q}
        optimize(w_q, gw, gw_q_ss, ada_stepsize)

        nsteps[0] += 1

        return z.copy(), logpx_p + logpz_p - logpz_q

    return doStep

def step_adasgvb2(w, logsd, x, model, var='diag', negNoise=False, init_logsd=0, prior_sd=1, n_batch=1, n_subbatch=100, stepsize=1e-2, warmup=10, momw=0.75, momsd=0.75, anneal=False, sgd=False):
    print "SGVB + Adagrad", var, negNoise, init_logsd, prior_sd, n_batch, n_subbatch, stepsize, warmup, momw, momsd, anneal, sgd

    # w and logsd are the variational mean and log-standard-deviation that are learned

    g_w_ss = ndict.cloneZeros(w)  # sum-of-squares for adagrad
    mom_w = ndict.cloneZeros(w)  # momentum

    cv_lr = 0.1  # learning rate for control variates
    cov_mean = ndict.cloneZeros(w)
    var_mean = ndict.cloneZeros(w)
    cov_logsd = ndict.cloneZeros(w)
    var_logsd = ndict.cloneZeros(w)

    if var != 'diag':
        raise Exception('Didnt write control variate code for non-diag variance yet')

    if var == 'diag' or var == 'row_isotropic':
        #logsd = ndict.cloneZeros(w)
        for i in w: logsd[i] += init_logsd
        g_logsd_ss = ndict.cloneZeros(w)
        mom_logsd = ndict.cloneZeros(w)
    elif var == 'isotropic':
        logsd = {i: init_logsd for i in w}
        g_logsd_ss = {i: 0 for i in w}
        mom_logsd = {i: 0 for i in w}
    else:
        raise Exception("Unknown variance type")

    n_datapoints = x.itervalues().next().shape[1]

    batchi = [0]

    def doStep(w, z=None):
        if z is not None: raise Exception()

        L = [0]  # Lower bound

        g_mean = ndict.cloneZeros(w)
        if var == 'diag' or var == 'row_isotropic':
            g_logsd = ndict.cloneZeros(w)
        elif var == 'isotropic':
            g_logsd = {i: 0 for i in w}

        # Loop over random datapoints
        for l in range(n_batch):

            # Pick random datapoint
            idx = np.random.randint(0, n_datapoints, size=(n_subbatch,))
            _x = ndict.getColsFromIndices(x, idx)

            # Function that adds gradients for given noise eps
            def add_grad(eps):
                # Compute noisy weights
                _w = {i: w[i] + np.exp(logsd[i]) * eps[i] for i in w}
                # Compute gradients of log p(x|theta) w.r.t. w
                logpx, logpz, g_w, g_z = model.dlogpxz_dwz(_w, _x, {})

                for i in w:
                    cv = (_w[i] - w[i]) / np.exp(2 * logsd[i])  # control variate
                    cov_mean[i] += cv_lr * (g_w[i] * cv - cov_mean[i])
                    var_mean[i] += cv_lr * (cv**2 - var_mean[i])
                    g_mean[i] += g_w[i] - cov_mean[i] / var_mean[i] * cv
                    if var == 'diag' or var == 'row_isotropic':
                        grad = g_w[i] * eps[i] * np.exp(logsd[i])
                        cv = cv - 1  # this control variate (c.v.) is really similar to the c.v. for the mean!
                        cov_logsd[i] += cv_lr * (grad * cv - cov_logsd[i])
                        var_logsd[i] += cv_lr * (cv**2 - var_logsd[i])
                        g_logsd[i] += grad - cov_logsd[i] / var_logsd[i] * cv
                    elif var == 'isotropic':
                        g_logsd[i] += (g_w[i] * eps[i]).sum() * np.exp(logsd[i])
                    else:
                        raise Exception()

                L[0] += logpx.sum() + logpz.sum()

            # Gradients with generated noise
            eps = {i: np.random.standard_normal(size=w[i].shape) for i in w}
            if sgd: eps = {i: np.zeros(w[i].shape) for i in w}
            add_grad(eps)

            # Gradient with negative of noise
            if negNoise:
                for i in eps: eps[i] *= -1
                add_grad(eps)

        L = L[0]
        L *= float(n_datapoints) / float(n_subbatch) / float(n_batch)
        if negNoise: L /= 2

        for i in w:
            c = float(n_datapoints) / (float(n_subbatch) * float(n_batch))
            if negNoise: c /= 2
            g_mean[i] *= c
            g_logsd[i] *= c

            # Prior
            g_mean[i] += - w[i] / (prior_sd**2)
            g_logsd[i] += - np.exp(2 * logsd[i]) / (prior_sd**2)
            L += - (w[i]**2 + np.exp(2 * logsd[i])).sum() / (2 * prior_sd**2)
            L += - 0.5 * np.log(2 * np.pi * prior_sd**2) * float(w[i].size)

            # Entropy of the Gaussian posterior: 0.5*log(2*pi*e) + logsd per dimension
            L += float(w[i].size) * 0.5 * math.log(2 * math.pi * math.e)
            if var == 'diag' or var == 'row_isotropic':
                g_logsd[i] += 1  # dH(q)/d[logsd] = 1 (nice!)
                L += logsd[i].sum()
            elif var == 'isotropic':
                g_logsd[i] += float(w[i].size)  # dH(q)/d[logsd] = 1 (nice!)
                L += logsd[i] * float(w[i].size)
            else:
                raise Exception()

        # Update variational parameters
        c = 1
        if not anneal:
            c = 1. / (batchi[0] + 1)

        # For isotropic row variance, sum gradients per row
        if var == 'row_isotropic':
            for i in w:
                g_sum = g_logsd[i].sum(axis=1).reshape(w[i].shape[0], 1)
                g_logsd[i] = np.dot(g_sum, np.ones((1, w[i].shape[1])))

        for i in w:
            #print i, np.sqrt(gw_ss[i]).max(), np.sqrt(gw_ss[i]).min()
            g_w_ss[i] += g_mean[i]**2
            g_logsd_ss[i] += g_logsd[i]**2
            mom_w[i] += (1 - momw) * (g_mean[i] - mom_w[i])
            mom_logsd[i] += (1 - momsd) * (g_logsd[i] - mom_logsd[i])
            if batchi[0] < warmup: continue
            w[i] += stepsize / np.sqrt(g_w_ss[i] * c + 1e-8) * mom_w[i]
            logsd[i] += stepsize / np.sqrt(g_logsd_ss[i] * c + 1e-8) * mom_logsd[i]

        batchi[0] += 1

        #print cov_mean['b0']/var_mean['b0']

        return L

    return doStep
