def AdaMaxAvg2(ws, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, n_accum=1):
    if n_accum == 1:
        # NOTE: AdaMaxAvg (defined below) expects (ws, ws_avg, objective, ...);
        # this shortcut call passes `objective` in the ws_avg slot and looks stale.
        return AdaMaxAvg(ws, objective, alpha, beta1, beta2, beta3)
    print 'AdaMax_Avg2', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'beta3:', beta3, 'n_accum:', n_accum

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='raise')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(it, n_accum), 0)
    update = T.eq(T.mod(it, n_accum), n_accum - 1)

    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0.)
            w_avg[i] = G.sharedf(_w.get_value())
            g_sum = G.sharedf(_w.get_value() * 0.)

            new[g_sum] = ifelse(reset, _g, g_sum + _g)
            new[mom1] = ifelse(update, (1 - beta1) * mom1 + beta1 * new[g_sum], mom1)
            new[_max] = ifelse(update, T.maximum((1 - beta2) * _max, abs(new[g_sum]) + 1e-8), _max)
            new[_w] = ifelse(update, _w + alpha * new[mom1] / new[_max], _w)
            new[w_avg[i]] = ifelse(update, beta3 * new[_w] + (1. - beta3) * w_avg[i], w_avg[i])
        ws_avg += [w_avg]
    return new, ws_avg
def Eve(w, w_avg, f, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, disconnected_inputs='raise'):
    print 'Eve', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'beta3:', beta3

    mom = {}
    _max = {}
    delta = {}
    w_prime = {}
    for i in w:
        mom[i] = G.sharedf(w[i].get_value() * 0.)
        _max[i] = G.sharedf(w[i].get_value() * 0. + 1e-8)
        delta[i] = G.sharedf(w[i].get_value() * 0.)
        w_prime[i] = w[i] + (1 - beta1) / beta1 * delta[i]

    train_cost = f(w_prime).mean()
    g = G.ndict.T_grad(train_cost, w, disconnected_inputs=disconnected_inputs)  #warn/raise

    new = OrderedDict()
    for i in w:
        new[mom[i]] = (1 - beta1) * mom[i] + beta1 * g[i]
        new[_max[i]] = T.maximum((1 - beta2) * _max[i], abs(g[i]) + 1e-8)
        new[delta[i]] = alpha * new[mom[i]] / new[_max[i]]
        new[w[i]] = w[i] + new[delta[i]]
    for i in w:
        new[w_avg[i]] = beta3 * w[i] + (1. - beta3) * w_avg[i]
    return train_cost, new
def AdaMaxAvg(ws, ws_avg, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, update_keys=None, disconnected_inputs='raise'):
    print 'AdaMax_Avg', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'beta3:', beta3

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs=disconnected_inputs)  #warn/raise

    if update_keys is None:
        update_keys = [ws[j].keys() for j in range(len(ws))]

    new = OrderedDict()
    for j in range(len(ws)):
        if ws_avg is not None:
            w_avg = ws_avg[j]
        for i in update_keys[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0. + 1e-8)

            new[mom1] = (1 - beta1) * mom1 + beta1 * _g
            new[_max] = T.maximum((1 - beta2) * _max, abs(_g) + 1e-8)
            new[_w] = _w + alpha * new[mom1] / new[_max]
            if ws_avg is not None:
                new[w_avg[i]] = beta3 * _w + (1. - beta3) * w_avg[i]
    return new
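# Illustrative note -- not from the original module. Besides updating the raw parameters,
# AdaMaxAvg maintains `ws_avg`, an exponential moving average of the parameters themselves
# (mixed in with rate beta3 every step); the model code evaluates and samples with these
# averaged weights (see f_eval / f_decode in cvae1 further down). A minimal NumPy sketch of
# that Polyak-style averaging; `average_step` is a name local to this sketch.
import numpy as np

def average_step(w_new, w_avg, beta3=0.01):
    # mirrors new[w_avg[i]] = beta3 * _w + (1. - beta3) * w_avg[i] above
    return beta3 * w_new + (1. - beta3) * w_avg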
def gsm(name, k, w={}, logvar_minmax=16):
    w[name + '_weight'] = G.sharedf(np.zeros((k,)))
    w[name + '_logvar'] = G.sharedf(np.random.randn(k) * .1)

    def logp(v, w):
        mixtureweights = T.exp(w[name + '_weight'])
        mixtureweights /= mixtureweights.sum()
        logvar = logvar_minmax * w[name + '_logvar']
        var = T.exp(logvar)
        if k == 0:
            return 0.
        if k == 1:
            return -.5 * (v**2).sum() / var[0] - v.size.astype(G.floatX) * (.5 * T.log(2. * math.pi) + logvar[0])
        p = 0.
        for i in range(k):
            p += mixtureweights[i] * T.exp(-.5 * v**2 / var[i]) / T.sqrt(2. * math.pi * var[i])
        logp = T.log(p).sum()
        return logp

    def postup(updates, w):
        updates[w[name + '_logvar']] = T.clip(updates[w[name + '_logvar']], -1., 1.)
        return updates

    return G.Struct(logp=logp, postup=postup, w=w)
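# Illustrative sketch -- not from the original module. The gsm prior above is a zero-mean
# Gaussian scale mixture: mixture weights are a softmax over w[name+'_weight'] and the
# component variances are exp(logvar_minmax * logvar). The NumPy version below computes the
# same log-density for a flat parameter vector v; gsm_logp is a name local to this sketch.
import numpy as np

def gsm_logp(v, weight_logits, logvars, logvar_minmax=16):
    pi = np.exp(weight_logits)
    pi /= pi.sum()                             # mixture weights (softmax)
    var = np.exp(logvar_minmax * logvars)      # component variances
    # per-element density under each component, mixed, then log-summed over elements
    p = sum(pi[i] * np.exp(-.5 * v**2 / var[i]) / np.sqrt(2. * np.pi * var[i])
            for i in range(len(var)))
    return np.log(p).sum()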
def AdaMax2(w, objective, alpha=.01, beta1=.1, beta2=.001, n_accum=2):
    print 'AdaMax2', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'n_accum:', n_accum
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(new[it], n_accum), 0)
    update = T.eq(T.mod(new[it], n_accum), n_accum - 1)

    for i in range(len(w)):
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        g_sum = G.sharedf(w[i].get_value() * 0.)
        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        new[g_sum] = ifelse(reset, g[i], g_sum + g[i])
        new[mom1] = ifelse(update, (1 - beta1) * mom1 + beta1 * new[g_sum], mom1)
        new[_max] = ifelse(update, T.maximum((1 - beta2) * _max, abs(new[g_sum]) + 1e-8), _max)
        new[w[i]] = ifelse(update, w[i] + alpha * new[mom1] / new[_max], w[i])
    return new
def nonlinearity(name, which, shape=None, w={}):
    if which == 'prelu':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'pelu':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'softplus2':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'softplus_shiftscale':
        w[name + '_in_s'] = G.sharedf(np.zeros(shape))
        w[name + '_in_b'] = G.sharedf(np.zeros(shape))
    if which == 'linearsigmoid':
        w[name + '_a'] = G.sharedf(.5 * np.ones(shape))
        w[name + '_b'] = G.sharedf(.5 * np.ones(shape))
    if which == 'meanonlybatchnorm_softplus':
        assert type(shape) == int
        w[name + '_b'] = G.sharedf(np.zeros(shape))
    if which == 'meanonlybatchnorm_relu':
        assert type(shape) == int
        w[name + '_b'] = G.sharedf(np.zeros(shape))

    def f(h, w=None):
        if which == None or which == 'None':
            return h
        elif which == 'tanh':
            return T.tanh(h)
        elif which == 'softmax':
            return T.nnet.softmax(h)
        elif which == 'prelu':
            return w[name] * h * (h < 0.) + h * (h >= 0.)
        elif which == 'relu':
            return h * (h >= 0.)
        elif which == 'shiftedrelu':
            return T.switch(h < -1., -1., h)
        elif which == 'leakyrelu':
            return 0.01 * h * (h < 0.) + h * (h >= 0.)
        elif which == 'elu':
            return T.switch(h < 0., T.exp(h) - 1, h)
        elif which == 'softplus':
            return T.nnet.softplus(h)
        elif which == 'softplus_shiftscale':
            return T.nnet.softplus(T.exp(w[name + '_in_s']) * h + w[name + '_in_b'])
        elif which == 'softplus2':
            return T.nnet.softplus(h) - w[name] * T.nnet.softplus(-h)
        elif which == 'linearsigmoid':
            return w[name + '_a'] * h + w[name + '_b'] * T.nnet.sigmoid(h)
        elif which == 'meanonlybatchnorm_softplus':
            h -= h.mean(axis=(0, 2, 3), keepdims=True)
            h += w[name + '_b'].dimshuffle('x', 0, 'x', 'x')
            return T.nnet.softplus(h)
        elif which == 'meanonlybatchnorm_relu':
            h -= h.mean(axis=(0, 2, 3), keepdims=True)
            h += w[name + '_b'].dimshuffle('x', 0, 'x', 'x')
            return T.nnet.relu(h)
        else:
            raise Exception("Unrecognized nonlinearity: " + which)

    return G.Struct(__call__=f, w=w)
def AdaMax(w, objective, alpha=.01, beta1=.1, beta2=.001):
    print 'AdaMax', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()
    for i in range(len(w)):
        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        new[mom1] = (1 - beta1) * mom1 + beta1 * g[i]
        new[_max] = T.maximum((1 - beta2) * _max, abs(g[i]) + 1e-8)
        new[w[i]] = w[i] + alpha * new[mom1] / new[_max]
    return new
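# Illustrative, self-contained sketch -- not from the original module. It shows how an
# updates OrderedDict like the one AdaMax returns is consumed: theano.function applies each
# (shared variable -> new value) pair after every call. Note the convention used throughout
# this file: beta1=.1 weights the *new* gradient, i.e. it plays the role of (1 - beta1) in
# the Adam/AdaMax paper, and the update is written as ascent, so the model code passes a
# negative alpha for descent. All names here are local to this sketch.
import theano
import theano.tensor as T
from collections import OrderedDict

w0 = theano.shared(5., name='w0')
cost = (w0 - 2.) ** 2
g = T.grad(cost, w0)

alpha, beta1, beta2 = -.01, .1, .001   # negative alpha: descend on the cost
mom1 = theano.shared(0.)
_max = theano.shared(0.)

updates = OrderedDict()
updates[mom1] = (1 - beta1) * mom1 + beta1 * g
updates[_max] = T.maximum((1 - beta2) * _max, abs(g) + 1e-8)
updates[w0] = w0 + alpha * updates[mom1] / updates[_max]

f_train = theano.function([], cost, updates=updates)
for _ in range(2000):
    f_train()
print w0.get_value()   # ends up close to the optimum at 2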
def batchnorm_meanonly(name, n_h, w={}):
    w[name + '_b'] = G.sharedf(np.zeros((n_h,)))

    def f(h, w):
        h -= h.mean(axis=(0, 2, 3), keepdims=True)
        h += w[name + '_b'].dimshuffle('x', 0, 'x', 'x')
        return h

    return G.Struct(__call__=f, w=w)
def zero_centered_gaussian(name, w={}):
    w[name + '_logvar'] = G.sharedf(0.)

    def logp(v, w):
        logvar = w[name + '_logvar'] * 10
        return v.size.astype(G.floatX) * -.5 * (T.log(2. * math.pi) + logvar) - .5 * (v**2).sum() / T.exp(logvar)

    postup = lambda updates, w: updates

    return G.Struct(logp=logp, postup=postup, w=w)
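# Illustrative check -- not from the original module. zero_centered_gaussian.logp above is
# the log-density of N(0, exp(logvar)) summed over all elements of v, with logvar
# parameterized as 10 * w[name+'_logvar']. Quick NumPy/scipy verification of the closed
# form; all names are local to this sketch.
import numpy as np
from scipy.stats import norm

v = np.random.randn(100)
logvar = 0.3
closed_form = v.size * -.5 * (np.log(2. * np.pi) + logvar) - .5 * (v**2).sum() / np.exp(logvar)
reference = norm(0., np.sqrt(np.exp(logvar))).logpdf(v).sum()
assert np.allclose(closed_form, reference)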
def zero_centered_laplace(name, w={}):
    w[name + '_logscale'] = G.sharedf(0.)

    def logp(v, w):
        return -abs(v).sum() / T.exp(w[name + '_logscale']) - v.size.astype(G.floatX) * (T.log(2.) + w[name + '_logscale'])

    postup = lambda updates, w: updates

    return G.Struct(logp=logp, postup=postup, w=w)
def gaussian_spherical(shape=None, sample=None):
    if sample is None:
        sample = G.rng_curand.normal(shape)
    if shape is None:
        assert sample is not None
        shape = sample.shape
    logp = -.5 * (T.log(2 * math.pi) + sample**2).flatten(2).sum(axis=1)
    entr = (1. * T.prod(shape[1:]).astype(G.floatX)) * T.ones((shape[0],), dtype=G.floatX) * G.sharedf(.5 * (np.log(2. * math.pi) + 1.))
    return RandomVariable(sample, logp, entr, shape=shape)
def Adam(ws, objective, alpha=.0003, beta=.9, gamma=.999):
    print 'Adam', 'alpha:', alpha, 'beta1:', beta, 'gamma:', gamma

    new = OrderedDict()

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='warn')  #warn/raise

    it = G.sharedf(0.)
    new[it] = it + 1.
    fix1 = 1 - beta**(it + 1.)
    fix2 = 1 - gamma**(it + 1.)  # To make estimates unbiased
    lr_t = alpha * T.sqrt(fix2) / fix1

    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            w = ws[j][i]
            g = gs[j][i]

            # Initial values
            shape = w.get_value().shape
            m = G.sharedf(np.zeros(shape))
            v = G.sharedf(np.zeros(shape))
            w_avg[i] = G.sharedf(np.zeros(shape))

            # Updates
            new[m] = beta * m + (1 - beta) * g
            new[v] = gamma * v + (1 - gamma) * g**2
            new[w] = w + lr_t * new[m] / (T.sqrt(new[v]) + 1e-8)
            new[w_avg[i]] = gamma * new[w] + (1. - gamma) * w_avg[i]
        ws_avg += [w_avg]
    return new, ws_avg
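# Illustrative note -- not from the original module. fix1 = 1 - beta**(it+1) and
# fix2 = 1 - gamma**(it+1) undo the zero-initialization bias of the moving averages m and v;
# folding them into the step size gives lr_t = alpha * sqrt(fix2) / fix1. At the first step
# (it = 0) with the defaults beta=.9, gamma=.999:
#   lr_t = alpha * sqrt(1 - .999) / (1 - .9) ~= 0.316 * alpha
# which keeps the magnitude of the very first parameter step near alpha, since
# m / (sqrt(v) + eps) is itself inflated (~3.16) before the averages are warmed up.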
def linear_l2(name, n_in, n_out, w):
    # L2 normalization of weights
    def l2normalize(_w):
        targetnorm = 1.
        norm = T.sqrt((_w**2).sum(axis=0, keepdims=True))
        return _w * (targetnorm / norm)

    def maxconstraint(_w):
        return _w * (maxweight / T.maximum(maxweight, abs(_w).max(axis=0, keepdims=True)))

    w[name + '_w'] = G.sharedf(0.05 * np.random.randn(n_in, n_out))
    if maxweight > 0:
        w[name + '_w'].set_value(maxconstraint(w[name + '_w']).tag.test_value)
    w[name + '_b'] = G.sharedf(np.zeros((n_out,)))
    if l2norm:
        if logscale:
            w[name + '_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name + '_s'] = G.sharedf(np.ones((n_out,)))
    else:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.zeros((n_out,)))

    def f(h, w):
        _w = w[name + '_w']
        if l2norm:
            _w = l2normalize(_w)
        h = T.dot(h, _w)
        if l2norm:
            if logscale:
                h *= T.exp(logscale_scale * w[name + '_s'])
            else:
                h *= abs(w[name + '_s'])
        else:
            h *= T.exp(constant_rescale)

        h += w[name + '_b']

        if '__init' in w:
            # Std
            std = (1. / init_stdev) * h.std(axis=0) + 1e-8
            if name + '_s' in w:
                if logscale:
                    w[name + '_s'].set_value(-T.log(std).tag.test_value / logscale_scale)
                else:
                    w[name + '_s'].set_value((1. / std).tag.test_value)
            else:
                constant_rescale.set_value(-T.log(std).tag.test_value)
                #w[name+'_w'].set_value((_w / std.dimshuffle('x',0)).tag.test_value)
            h /= std.dimshuffle('x', 0)

            # Mean
            mean = h.mean(axis=0)
            w[name + '_b'].set_value(-mean.tag.test_value)
            h -= mean.dimshuffle('x', 0)

            #print name, abs(w[name+'_w']).get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        return h

    # Post updates: normalize weights to unit L2 norm
    def postup(updates, w):
        if l2norm and maxweight > 0:
            updates[w[name + '_w']] = maxconstraint(updates[w[name + '_w']])
        return updates

    return G.Struct(__call__=f, postup=postup, w=w)
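# Illustrative sketch -- not from the original module. linear_l2 combines weight
# normalization (columns of W rescaled to unit L2 norm, with a separate per-output scale)
# with data-dependent initialization: on the '__init' pass, the scale and bias are set so
# that the layer's outputs on the initial minibatch have roughly zero mean and standard
# deviation init_stdev. The NumPy version below performs the same init for a plain dense
# layer; init_dense is a name local to this sketch.
import numpy as np

def init_dense(x, n_out, init_stdev=1., rng=np.random):
    W = 0.05 * rng.randn(x.shape[1], n_out)
    W /= np.sqrt((W**2).sum(axis=0, keepdims=True))   # weight norm: unit-L2 columns
    h = x.dot(W)
    s = init_stdev / (h.std(axis=0) + 1e-8)           # per-output scale from the data
    b = -(h * s).mean(axis=0)                         # per-output bias, centres the outputs
    # forward pass is then x.dot(W) * s + b: ~zero mean, ~init_stdev stdev on this batch
    return W, s, b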
def conv2d(name, n_in, n_out, size_kernel=(3, 3), pad_channel=True, border_mode='valid', downsample=1, upsample=1, datainit=True, zeroinit=False, l2norm=True, w={}):
    # TODO FIX: blows up parameters if all inputs are 0

    if not pad_channel:
        border_mode = 'same'
        print 'No pad_channel, changing border_mode to same'

    if '[sharedw]' in name and '[/sharedw]' in name:
        name_w = name
        pre, b = name.split("[sharedw]")
        number, post = b.split("[/sharedw]")
        name_w = pre + "[s]" + post
        name = pre + number + post  # Don't share the bias and scales
        #name = name_w # Also share the bias and scales
    else:
        name_w = name

    if type(downsample) == int:
        downsample = (downsample, downsample)
    assert type(downsample) == tuple
    assert border_mode in ['valid', 'full', 'same']

    _n_in = n_in
    _n_out = n_out

    if upsample > 1:
        _n_out = n_out * upsample**2

    if pad_channel:
        if size_kernel[0] > 1 or size_kernel[1] > 1:
            assert size_kernel[0] == size_kernel[1]
            assert border_mode == 'valid'
            _n_in += 1
        else:
            pad_channel = False

    if border_mode == 'same':
        assert size_kernel[0] % 2 == 1
        border_mode = ((size_kernel[0] - 1) / 2, (size_kernel[1] - 1) / 2)

    def l2normalize(kerns):
        norm = T.sqrt((kerns**2).sum(axis=(1, 2, 3), keepdims=True))
        return kerns / norm

    def maxconstraint(kerns):
        return kerns * (maxweight / T.maximum(maxweight, abs(kerns).max(axis=(1, 2, 3), keepdims=True)))

    if zeroinit:
        w[name_w + '_w'] = G.sharedf(np.zeros((_n_out, _n_in, size_kernel[0], size_kernel[1])))
        datainit = False
    else:
        w[name_w + '_w'] = G.sharedf(0.05 * np.random.randn(_n_out, _n_in, size_kernel[0], size_kernel[1]))

    if maxweight > 0:
        w[name_w + '_w'].set_value(maxconstraint(w[name_w + '_w']).tag.test_value)

    w[name + '_b'] = G.sharedf(np.zeros((_n_out,)))

    if bias_logscale:
        w[name + '_bs'] = G.sharedf(0.)

    if l2norm:
        if logscale:
            w[name + '_s'] = G.sharedf(np.zeros((_n_out,)))
        else:
            w[name + '_s'] = G.sharedf(np.ones((_n_out,)))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.ones((_n_out,)))

    def f(h, w):
        input_shape = h.tag.test_value.shape[1:]

        _input = h

        if pad_channel:
            h = pad2dwithchannel(h, size_kernel)

        kerns = w[name_w + '_w']
        #if name == '1_down_conv1':
        #    kerns = T.printing.Print('kerns 1')(kerns)
        if l2norm:
            kerns = l2normalize(kerns)
            if logscale:
                kerns *= T.exp(logscale_scale * w[name + '_s']).dimshuffle(0, 'x', 'x', 'x')
            else:
                kerns *= w[name + '_s'].dimshuffle(0, 'x', 'x', 'x')
        elif do_constant_rescale:
            kerns *= constant_rescale.dimshuffle(0, 'x', 'x', 'x')
        #if name == '1_down_conv1':
        #    kerns = T.printing.Print('kerns 2')(kerns)

        h = dnn_conv(h, kerns, border_mode=border_mode, subsample=downsample)

        # Mean-only batch norm
        if bn:
            h -= h.mean(axis=(0, 2, 3), keepdims=True)

        _b = w[name + '_b'].dimshuffle('x', 0, 'x', 'x')
        if bias_logscale:
            _b *= T.exp(logscale_scale * w[name + '_bs'])
        h += _b

        if '__init' in w and datainit:
            # Std
            data_std = h.std(axis=(0, 2, 3))
            num_zeros = (data_std.tag.test_value == 0).sum()
            if num_zeros > 0:
                print "Warning: Stdev=0 for " + str(num_zeros) + " features in " + name + ". Skipping data-dependent init."
            else:
                std = (1. / init_stdev) * data_std
                std += 1e-7
                if name + '_s' in w:
                    if logscale:
                        w[name + '_s'].set_value(-T.log(std).tag.test_value / logscale_scale)
                    else:
                        w[name + '_s'].set_value((1. / std).tag.test_value)
                elif do_constant_rescale:
                    constant_rescale.set_value((1. / std).tag.test_value)

                h /= std.dimshuffle('x', 0, 'x', 'x')

                # Mean
                mean = h.mean(axis=(0, 2, 3))
                w[name + '_b'].set_value(-mean.tag.test_value)
                h -= mean.dimshuffle('x', 0, 'x', 'x')

            #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        if upsample > 1:
            h = depool2d_split(h, factor=upsample)

        if not '__init' in w:
            output_shape = h.tag.test_value.shape[1:]
            print 'conv2d', name, input_shape, output_shape, size_kernel, pad_channel, border_mode, downsample, upsample

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        return h

    # Normalize weights to _norm L2 norm
    # TODO: check whether only_upper_bounds here really helps
    # (the effect is a higher learning rate in the beginning of training)
    def postup(updates, w):
        if l2norm and maxweight > 0.:
            updates[w[name_w + '_w']] = maxconstraint(updates[w[name_w + '_w']])
        return updates

    return G.Struct(__call__=f, w=w, postup=postup)
def randorth(shape):
    from scipy.linalg import sqrtm, inv
    assert len(shape) == 2
    w = np.random.normal(0, size=shape)
    w = w.dot(inv(sqrtm(w.T.dot(w))))
    return G.sharedf(w)
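# Illustrative check -- not from the original module. randorth draws a Gaussian matrix and
# right-multiplies by inv(sqrtm(W^T W)), which orthonormalizes its columns. Quick NumPy
# check of that construction; names are local to this sketch.
import numpy as np
from scipy.linalg import sqrtm, inv

W = np.random.normal(0, size=(8, 8))
W = W.dot(inv(sqrtm(W.T.dot(W))))
assert np.allclose(W.T.dot(W), np.eye(8), atol=1e-6)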
def linear(name, n_in, n_out, diagonalzeros, l2norm=True, w={}):
    assert n_in % n_out == 0 or n_out % n_in == 0

    mask = np.ones((n_in, n_out), dtype=G.floatX)
    if n_out >= n_in:
        k = n_out / n_in
        for i in range(n_in):
            mask[i + 1:, i * k:(i + 1) * k] = 0
            if diagonalzeros:
                mask[i:i + 1, i * k:(i + 1) * k] = 0
    else:
        k = n_in / n_out
        for i in range(n_out):
            mask[(i + 1) * k:, i:i + 1] = 0
            if diagonalzeros:
                mask[i * k:(i + 1) * k:, i:i + 1] = 0

    # L2 normalization of weights
    def l2normalize(_w, axis=0):
        if diagonalzeros:
            # to prevent NaN gradients
            # TODO: smarter solution (also see below)
            if n_out >= n_in:
                _w = T.set_subtensor(_w[:, :n_out / n_in], 0.)
            else:
                _w = T.set_subtensor(_w[:, :1], 0.)
        targetnorm = 1.
        norm = T.sqrt((_w**2).sum(axis=axis, keepdims=True))
        norm += 1e-8
        new_w = _w * (targetnorm / norm)
        return new_w

    def maxconstraint(_w):
        return _w * (maxweight / T.maximum(maxweight, abs(_w).max(axis=0, keepdims=True)))

    w[name + '_w'] = G.sharedf(mask * 0.05 * np.random.randn(n_in, n_out))
    if maxweight > 0:
        w[name + '_w'].set_value(maxconstraint(w[name + '_w']).tag.test_value)

    w[name + '_b'] = G.sharedf(np.zeros((n_out,)))

    if l2norm:
        if logscale:
            w[name + '_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name + '_s'] = G.sharedf(np.ones((n_out,)))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.zeros((n_out,)))

    def f(h, w):
        _input = h

        _w = mask * w[name + '_w']
        if l2norm:
            _w = l2normalize(_w)
        h = T.dot(h, _w)
        if l2norm:
            if logscale:
                h *= T.exp(logscale_scale * w[name + '_s'])
            else:
                h *= abs(w[name + '_s'])
        elif do_constant_rescale:
            h *= T.exp(constant_rescale)

        h += w[name + '_b']

        if '__init' in w:
            # Std
            std = (1. / init_stdev) * h.std(axis=0)
            std += (std <= 0)
            std += 1e-8
            if name + '_s' in w:
                if logscale:
                    w[name + '_s'].set_value(-T.log(std).tag.test_value / logscale_scale)
                else:
                    w[name + '_s'].set_value((1. / std).tag.test_value)
            elif do_constant_rescale:
                constant_rescale.set_value(-T.log(std).tag.test_value)
                #w[name+'_w'].set_value((_w / std.dimshuffle('x',0)).tag.test_value)

            h /= std.dimshuffle('x', 0)

            # Mean
            mean = h.mean(axis=0)
            w[name + '_b'].set_value(-mean.tag.test_value)
            h -= mean.dimshuffle('x', 0)

            #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        return h

    # Post updates: normalize weights to unit L2 norm
    def postup(updates, w):
        updates[w[name + '_w']] = mask * updates[w[name + '_w']]
        if l2norm and maxweight > 0.:
            updates[w[name + '_w']] = maxconstraint(updates[w[name + '_w']])
        return updates

    return G.Struct(__call__=f, postup=postup, w=w)
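# Illustrative check -- not from the original module. The mask built in `linear` above
# enforces a MADE-style autoregressive structure: with diagonalzeros=True, output block i
# only depends on inputs strictly below i. The sketch rebuilds the n_out >= n_in mask in
# NumPy and verifies that dependency pattern by perturbing one input at a time; all names
# are local to this sketch.
import numpy as np

n_in, n_out = 4, 8
k = n_out // n_in
mask = np.ones((n_in, n_out))
for i in range(n_in):
    mask[i + 1:, i * k:(i + 1) * k] = 0     # inputs > i never reach output block i
    mask[i:i + 1, i * k:(i + 1) * k] = 0    # diagonalzeros: input i itself excluded too

W = mask * np.random.randn(n_in, n_out)
x = np.random.randn(n_in)
for i in range(n_in):
    x2 = x.copy()
    x2[i] += 1.                                       # perturb input i
    changed = np.abs(x2.dot(W) - x.dot(W)) > 1e-12
    assert not changed[:(i + 1) * k].any()            # output blocks 0..i are unaffected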
def cvae1(shape_x, depths, depth_ar, n_h1, n_h2, n_z, prior='diag', posterior='down_diag', px='logistic', nl='softplus', kernel_x=(5, 5), kernel_h=(3, 3), kl_min=0, optim='adamax', alpha=0.002, beta1=0.1, beta2=0.001, weightsharing=None, pad_x=0, data_init=None, downsample_type='nn'):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE1 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic', 'bernoulli']
    w = {}  # model params

    if pad_x > 0:
        shape_x[1] += 2 * pad_x
        shape_x[2] += 2 * pad_x

    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)

    # encoder
    x_enc = N.conv.conv2d('x_enc', shape_x[0], n_h1, kernel_x, downsample=2, w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, shape_x[0], kernel_x, upsample=2, w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)

    layers = []
    for i in range(len(depths)):
        layers.append([])
        for j in range(depths[i]):
            downsample = (i > 0 and j == 0)
            if weightsharing is None or not weightsharing:
                name = str(i) + '_' + str(j)
            elif weightsharing == 'all':
                name = '[sharedw]' + str(i) + '_' + str(j) + '[/sharedw]'
            elif weightsharing == 'acrosslevels':
                name = '[sharedw]' + str(i) + '[/sharedw]' + '_' + str(j)
            elif weightsharing == 'withinlevel':
                name = '[sharedw]' + str(i) + '[/sharedw]' + '_' + str(j)
            else:
                raise Exception()
            layers[i].append(cvae_layer(name, prior, posterior, n_h1, n_h2, n_z, depth_ar, downsample, nl, kernel_h, False, downsample_type, w))

    # top-level value
    w['h_top'] = G.sharedf(np.zeros((n_h1,)))

    # Initialize variables
    x = T.tensor4('x', dtype='uint8')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip((x + .5) / 256., 0, 1)
    #_x = T.clip(x / 255., 0, 1)

    if pad_x > 0:
        _x = N.conv.pad2d(_x, pad_x)

    # Objective function
    def f_encode_decode(w, train=True):

        results = {}

        h = x_enc(_x - .5, w)

        obj_kl = G.sharedf(0.)

        # bottom-up encoders
        for i in range(len(depths)):
            for j in range(depths[i]):
                h = layers[i][j].up(h, w)

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'), (_x.shape[0], 1, shape_x[1] / 2**len(depths), shape_x[2] / 2**len(depths)))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h, kl = layers[i][j].down_q(h, train, w)
                kl_sum = kl.sum(axis=(1, 2, 3))
                results['cost_z' + str(i).zfill(3) + '_' + str(j).zfill(3)] = kl_sum
                # Constraint: Minimum number of bits per featuremap, averaged across minibatch
                if kl_min > 0:
                    if True:
                        kl = kl.sum(axis=(2, 3)).mean(axis=0, dtype=G.floatX)
                        obj_kl += T.maximum(np.asarray(kl_min, G.floatX), kl).sum(dtype=G.floatX)
                    else:
                        kl = T.maximum(np.asarray(kl_min, G.floatX), kl.sum(axis=(2, 3))).sum(axis=1, dtype=G.floatX)
                        obj_kl += kl
                else:
                    obj_kl += kl_sum

        output = .1 * x_dec(x_dec_nl(h, w), w)

        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output + .5, 0 + 1 / 512., 1 - 1 / 512.)
            logsd_x = 0 * mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1 / 256., _x).logp
            #obj_z = T.printing.Print('obj_z')(obj_z)
            obj = obj_logpx - obj_kl
            # Compute the bits per pixel
            obj *= (1. / np.prod(shape_x) * 1. / np.log(2.)).astype('float32')

            #if not '__init' in w:
            #    raise Exception()

        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.maximum(T.minimum(prob_x, 1 - 1e-7), 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)
            obj_logpx = N.rand.bernoulli(prob_x, _x).logp
            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpx - obj_kl
            #obj = T.printing.Print('obj')(obj)

        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results

    # Turns Gaussian noise 'eps' into a sample
    def f_decoder(eps, w):
        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'), (eps['eps_0_0'].shape[0], 1, shape_x[1] / 2**len(depths), shape_x[2] / 2**len(depths)))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h = layers[i][j].down_p(h, eps['eps_' + str(i) + '_' + str(j)], w)

        output = .1 * x_dec(x_dec_nl(h, w), w)

        if px == 'logistic':
            mean_x = T.clip(output + .5, 0 + 1 / 512., 1 - 1 / 512.)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)

        image = (256. * mean_x).astype('uint8')

        if pad_x > 0:
            image = image[:, :, pad_x:-pad_x, pad_x:-pad_x]

        return image

    def f_eps(n_batch, w):
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_' + str(i) + '_' + str(j)] = G.rng_curand.normal((n_batch, n_z, shape_x[1] / 2**(i + 1), shape_x[2] / 2**(i + 1)), dtype=floatX)
        return eps

    def postup(updates, w):
        nodes = [x_enc, x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(len(depths)):
            for j in range(depths[i]):
                updates = layers[i][j].postup(updates, w)
        return updates

    # Compile init function
    if data_init != None:
        w['__init'] = OrderedDict()
        f_encode_decode(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()

    # Compile training function
    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')

    w_avg = {i: G.sharedf(w[i].get_value()) for i in w}

    def lazy(f):
        def newf(*args, **kws):
            if not hasattr(f, 'cache'):
                f.cache = f()
            return f.cache(*args, **kws)
        return newf

    @lazy
    def f_train():
        if optim == 'adamax':
            train_cost = f_encode_decode(w)['cost']
            updates = G.misc.optim.AdaMaxAvg([w], [w_avg], train_cost, alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
        elif optim == 'eve':
            f = lambda w: f_encode_decode(w)['cost']
            train_cost, updates = G.misc.optim.Eve(w, w_avg, f, alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x': x}, train_cost, updates=updates, lazy=lazy)

    @lazy
    def f_train_q():
        keys_q = []
        for i in w:
            if '_q_' in i:
                keys_q.append(i)
        train_cost = f_encode_decode(w)['cost']
        updates = G.misc.optim.AdaMaxAvg([w], None, train_cost, alpha=-alpha, beta1=beta1, beta2=beta2, update_keys=keys_q, disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x': x}, train_cost, updates=updates, lazy=lazy)

    # Compile evaluation function
    @lazy
    def f_eval():
        results = f_encode_decode(w_avg, False)
        return G.function({'x': x}, results)

    # Compile epsilon generating function
    @lazy
    def f_eps_():
        n_batch = T.lscalar()
        n_batch.tag.test_value = 16
        eps = f_eps(n_batch, w)
        return G.function({'n_batch': n_batch}, eps, lazy=lazy)

    # Compile sampling function
    @lazy
    def f_decode():
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_' + str(i) + '_' + str(j)] = T.tensor4('eps' + str(i))
                eps['eps_' + str(i) + '_' + str(j)].tag.test_value = np.random.randn(n_batch_test, n_z, shape_x[1] / 2**(i + 1), shape_x[2] / 2**(i + 1)).astype(floatX)
        image = f_decoder(eps, w_avg)
        return G.function(eps, image, lazy=lazy)

    return G.Struct(train=f_train, eval=f_eval, decode=f_decode, eps=f_eps_, w=w, w_avg=w_avg)
def conv2d(name, n_in, n_out, size_kernel=(3, 3), zerodiagonal=True, flipmask=False, pad_channel=True, border_mode='valid', zeroinit=False, l2norm=True, w={}):
    do_scale = False
    if zeroinit:
        l2norm = False
        do_scale = True

    if not pad_channel:
        border_mode = 'same'
        print 'No pad_channel, changing border_mode to same'

    #if 'whitener' not in name:
    #    pad_channel = False
    #    border_mode = 'same'

    if '[sharedw]' in name and '[/sharedw]' in name:
        name_w = name
        pre, b = name.split("[sharedw]")
        c, post = b.split("[/sharedw]")
        name_w = pre + "[s]" + post
        name = pre + c + post  # Don't share the bias and scales
        #name = name_w # Also share the bias and scales
    else:
        name_w = name

    assert border_mode in ['valid', 'full', 'same']

    _n_in = n_in

    if pad_channel:
        if size_kernel[0] > 1 or size_kernel[1] > 1:
            assert size_kernel[0] == size_kernel[1]
            assert border_mode == 'valid'
            _n_in += 1
        else:
            pad_channel = False

    if border_mode == 'same':
        assert size_kernel[0] % 2 == 1
        border_mode = ((size_kernel[0] - 1) / 2, (size_kernel[1] - 1) / 2)

    if True:
        # Build autoregressive mask
        l = (size_kernel[0] - 1) / 2
        m = (size_kernel[1] - 1) / 2
        mask = np.ones((n_out, _n_in, size_kernel[0], size_kernel[1]), dtype=G.floatX)
        mask[:, :, :l, :] = 0
        mask[:, :, l, :m] = 0

        if n_out >= n_in:
            assert n_out % n_in == 0
            k = n_out / n_in
            for i in range(n_in):
                mask[i * k:(i + 1) * k, i + 1:, l, m] = 0
                if zerodiagonal:
                    mask[i * k:(i + 1) * k, i:i + 1, l, m] = 0
        else:
            assert n_in % n_out == 0
            k = n_in / n_out
            for i in range(n_out):
                mask[i:i + 1, (i + 1) * k:, l, m] = 0
                if zerodiagonal:
                    mask[i:i + 1, i * k:(i + 1) * k:, l, m] = 0
        if flipmask:
            mask = mask[::-1, ::-1, ::-1, ::-1]

    def l2normalize(kerns):
        if zerodiagonal:
            # to prevent NaN gradients
            # TODO: smarter solution (also see below)
            l = (size_kernel[0] - 1) / 2
            m = (size_kernel[1] - 1) / 2
            if n_out >= n_in:
                kerns = T.set_subtensor(kerns[:n_out / n_in, :, l, m], 0.)
            else:
                kerns = T.set_subtensor(kerns[:1, :, l, m], 0.)
        targetnorm = 1.
        norm = T.sqrt((kerns**2).sum(axis=(1, 2, 3), keepdims=True))
        norm += 1e-8
        return kerns * (targetnorm / norm)

    def maxconstraint(kerns):
        return kerns * (maxweight / T.maximum(maxweight, abs(kerns).max(axis=(1, 2, 3), keepdims=True)))

    if zeroinit:
        w[name_w + '_w'] = G.sharedf(np.zeros((n_out, _n_in, size_kernel[0], size_kernel[1])))
    else:
        w[name_w + '_w'] = G.sharedf(mask * 0.05 * np.random.randn(n_out, _n_in, size_kernel[0], size_kernel[1]))

    if maxweight > 0:
        w[name_w + '_w'].set_value(maxconstraint(w[name_w + '_w']).tag.test_value)

    w[name + '_b'] = G.sharedf(np.zeros((n_out,)))

    if l2norm or do_scale:
        if logscale:
            w[name + '_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name + '_s'] = G.sharedf(np.ones((n_out,)))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.ones((n_out,)))

    def f(h, w):
        input_shape = h.tag.test_value.shape[1:]

        _input = h

        if pad_channel:
            h = N.conv.pad2dwithchannel(h, size_kernel)

        kerns = mask * w[name_w + '_w']
        if l2norm:
            kerns = l2normalize(kerns)
        if l2norm or do_scale:
            if logscale:
                kerns *= T.exp(logscale_scale * w[name + '_s']).dimshuffle(0, 'x', 'x', 'x')
            else:
                kerns *= w[name + '_s'].dimshuffle(0, 'x', 'x', 'x')
        elif do_constant_rescale:
            kerns *= constant_rescale.dimshuffle(0, 'x', 'x', 'x')

        h = N.conv.dnn_conv(h, kerns, border_mode=border_mode)

        # Center
        if bn:  # mean-only batch norm
            h -= h.mean(axis=(0, 2, 3), keepdims=True)

        h += w[name + '_b'].dimshuffle('x', 0, 'x', 'x')

        if '__init' in w and not zeroinit:
            # Std
            data_std = h.std(axis=(0, 2, 3))
            num_zeros = (data_std.tag.test_value == 0).sum()
            if num_zeros > 0:
                print "Warning: Stdev=0 for " + str(num_zeros) + " features in " + name + ". Skipping data-dependent init."
            else:
                if name + '_s' in w:
                    if logscale:
                        w[name + '_s'].set_value(-T.log(data_std).tag.test_value / logscale_scale)
                    else:
                        w[name + '_s'].set_value((1. / data_std).tag.test_value)
                elif do_constant_rescale:
                    constant_rescale.set_value((1. / data_std).tag.test_value)
                    #w[name+'_w'].set_value((kerns / std.dimshuffle(0,'x','x','x')).tag.test_value)

                h /= data_std.dimshuffle('x', 0, 'x', 'x')

                # Mean
                mean = h.mean(axis=(0, 2, 3))
                w[name + '_b'].set_value(-mean.tag.test_value)
                h -= mean.dimshuffle('x', 0, 'x', 'x')

            #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        if not '__init' in w:
            output_shape = h.tag.test_value.shape[1:]
            print 'ar.conv2d', name, input_shape, output_shape, size_kernel, zerodiagonal, flipmask, pad_channel, border_mode, zeroinit, l2norm

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        return h

    # Normalize weights to _norm L2 norm
    # TODO: check whether only_upper_bounds here really helps
    # (the effect is a higher learning rate in the beginning of training)
    def postup(updates, w):
        updates[w[name_w + '_w']] = mask * updates[w[name_w + '_w']]
        if l2norm and maxweight > 0.:
            updates[w[name_w + '_w']] = maxconstraint(updates[w[name_w + '_w']])
        return updates

    return G.Struct(__call__=f, w=w, postup=postup)
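# Illustrative sketch -- not from the original module. The autoregressive conv2d above
# builds a PixelCNN-style causal kernel mask: taps in the rows above the centre and taps
# left of the centre within the centre row are zeroed, so every output position only sees
# inputs on one side of the raster ordering (flipmask reverses which side), and zerodiagonal
# additionally removes the current position's own channel block at the centre tap. A NumPy
# rendering of the single-channel 3x3 case; names are local to this sketch.
import numpy as np

kh, kw = 3, 3
l, m = (kh - 1) // 2, (kw - 1) // 2
mask = np.ones((1, 1, kh, kw))
mask[:, :, :l, :] = 0    # rows above the centre row
mask[:, :, l, :m] = 0    # centre row, left of the centre tap
mask[:, :, l, m] = 0     # zerodiagonal: the centre tap itself
print mask[0, 0]
# [[ 0.  0.  0.]
#  [ 0.  0.  1.]
#  [ 1.  1.  1.]]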
def fcvae(shape_x, depth_model, depth_ar, n_h1, n_h2, n_z, posterior, px='logistic', nl='softplus', alpha=0.002, beta1=0.1, beta2=0.001, share_w=False, data_init=None):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE9 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic', 'bernoulli']
    w = {}  # model params

    kernel_h = (1, 1)
    n_x = shape_x[0] * shape_x[1] * shape_x[2]

    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)

    # encoder
    x_enc = N.conv.conv2d('x_enc', n_x, n_h1, (1, 1), w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, n_x, (1, 1), w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)

    layers = []
    for i in range(depth_model):
        name = str(i)
        if share_w:
            name = '[sharedw]' + str(i) + '[/sharedw]'
        layers.append(cvae_layer(name, posterior, n_h1, n_h2, n_z, depth_ar, False, nl, kernel_h, share_w, w))

    # top-level value
    #w['h_top'] = G.sharedf(np.zeros((n_h1,)))
    w['h_top'] = G.sharedf(np.random.normal(0, 0.01, size=(n_h1,)))

    # Initialize variables
    x = T.tensor4('x')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip(x / 255., 0, 1)

    # Objective function
    def f_cost(w, train=True):

        results = {}

        h = x_enc(_x.reshape((-1, n_x, 1, 1)) - .5, w)

        obj_logpz = 0
        obj_logqz = 0

        # bottom-up encoders
        for i in range(depth_model):
            h = layers[i].up(h, w)

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'), (_x.shape[0], 1, 1, 1))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h, _obj_logqz, _obj_logpz = layers[i].down_q(h, train, w)
            obj_logqz += _obj_logqz
            obj_logpz += _obj_logpz
            results['cost_z' + str(i).zfill(3)] = _obj_logqz - _obj_logpz

        output = .1 * x_dec(x_dec_nl(h, w), w).reshape((-1, shape_x[0], shape_x[1], shape_x[2]))

        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output, -.5, .5)
            logsd_x = 0 * mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1 / 255., _x - .5).logp
            obj = obj_logpz - obj_logqz + obj_logpx
            # Compute the bits per pixel
            obj *= (1. / np.prod(shape_x) * 1. / np.log(2.)).astype('float32')
        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.minimum(prob_x, 1 - 1e-7)
            prob_x = T.maximum(prob_x, 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)
            obj_logpx = N.rand.bernoulli(prob_x, _x).logp
            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpz - obj_logqz + obj_logpx
            #obj = T.printing.Print('obj')(obj)

        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results

        #print 'obj_logpz', obj_logpz.tag.test_value
        #print 'obj_logqz', obj_logqz.tag.test_value
        #print 'obj_logpx', obj_x.tag.test_value
        #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
        #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
        #obj_x = T.printing.Print('obj_logpx')(obj_x)

    # Turns Gaussian noise 'eps' into a sample
    def f_decoder(eps, w):
        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'), (eps['eps_0'].shape[0], 1, 1, 1))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h = layers[i].down_p(h, eps['eps_' + str(i)], w)

        output = .1 * x_dec(x_dec_nl(h, w), w).reshape((-1, shape_x[0], shape_x[1], shape_x[2]))

        if px == 'logistic':
            mean_x = T.clip(output[:, :, :, :] + .5, 0, 1)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)

        image = (255. * T.clip(mean_x, 0, 1)).astype('uint8')
        return image

    def f_eps(n_batch, w):
        eps = {}
        for i in range(depth_model):
            eps['eps_' + str(i)] = G.rng_curand.normal((n_batch, n_z, 1, 1), dtype=floatX)
        return eps

    def postup(updates, w):
        nodes = [x_enc, x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(depth_model):
            updates = layers[i].postup(updates, w)
        return updates

    # Compile init function
    if data_init != None:
        w['__init'] = OrderedDict()
        f_cost(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()

    # Compile training function
    results = f_cost(w)
    updates, (w_avg,) = G.misc.optim.AdaMaxAvg([w], results['cost'], alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')
    updates = postup(updates, w)
    f_train = G.function({'x': x}, results['cost'], updates=updates)

    # Compile evaluation function
    results = f_cost(w_avg, False)
    f_eval = G.function({'x': x}, results)

    # Compile epsilon generating function
    n_batch = T.lscalar()
    n_batch.tag.test_value = 16
    eps = f_eps(n_batch, w)
    f_eps = G.function({'n_batch': n_batch}, eps)

    # Compile sampling function
    eps = {}
    for i in range(depth_model):
        eps['eps_' + str(i)] = T.tensor4('eps' + str(i))
        eps['eps_' + str(i)].tag.test_value = np.random.randn(n_batch_test, n_z, 1, 1).astype(floatX)
    image = f_decoder(eps, w_avg)
    f_decode = G.function(eps, image)

    return G.Struct(train=f_train, eval=f_eval, decode=f_decode, eps=f_eps, w=w, w_avg=w_avg)
def cvae1(shape_x, depths, depth_ar, n_h1, n_h2, n_z, prior='diag', posterior='down_diag', px='logistic', nl='softplus', kernel_x=(5, 5), kernel_h=(3, 3), kl_min=0, optim='adamax', alpha=0.002, beta1=0.1, beta2=0.001, weightsharing=None, pad_x=0, data_init=None, downsample_type='nn'):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE1 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic', 'bernoulli']
    w = {}  # model params

    if pad_x > 0:
        shape_x[1] += 2 * pad_x
        shape_x[2] += 2 * pad_x

    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)

    # encoder
    x_enc = N.conv.conv2d('x_enc', shape_x[0], n_h1, kernel_x, downsample=2, w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, shape_x[0], kernel_x, upsample=2, w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)

    layers = []
    for i in range(len(depths)):
        layers.append([])
        for j in range(depths[i]):
            downsample = (i > 0 and j == 0)
            if weightsharing is None or not weightsharing:
                name = str(i) + '_' + str(j)
            elif weightsharing == 'all':
                name = '[sharedw]' + str(i) + '_' + str(j) + '[/sharedw]'
            elif weightsharing == 'acrosslevels':
                name = '[sharedw]' + str(i) + '[/sharedw]' + '_' + str(j)
            elif weightsharing == 'withinlevel':
                # NOTE: currently produces the same name pattern as 'acrosslevels'
                name = '[sharedw]' + str(i) + '[/sharedw]' + '_' + str(j)
            else:
                raise Exception()
            layers[i].append(cvae_layer(name, prior, posterior, n_h1, n_h2, n_z, depth_ar, downsample, nl, kernel_h, False, downsample_type, w))

    # top-level value
    w['h_top'] = G.sharedf(np.zeros((n_h1,)))

    # Initialize variables
    x = T.tensor4('x', dtype='uint8')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip((x + .5) / 256., 0, 1)
    #_x = T.clip(x / 255., 0, 1)

    if pad_x > 0:
        _x = N.conv.pad2d(_x, pad_x)

    # Objective function
    def f_encode_decode(w, train=True):

        results = {}

        h = x_enc(_x - .5, w)

        obj_kl = G.sharedf(0.)

        # bottom-up encoders
        for i in range(len(depths)):
            for j in range(depths[i]):
                h = layers[i][j].up(h, w)

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'), (_x.shape[0], 1, shape_x[1] / 2**len(depths), shape_x[2] / 2**len(depths)))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h, kl = layers[i][j].down_q(h, train, w)
                kl_sum = kl.sum(axis=(1, 2, 3))
                results['cost_z' + str(i).zfill(3) + '_' + str(j).zfill(3)] = kl_sum
                # Constraint: Minimum number of bits per featuremap, averaged across minibatch
                # (a standalone NumPy sketch of this constraint appears after this function)
                if kl_min > 0:
                    if True:
                        kl = kl.sum(axis=(2, 3)).mean(axis=0, dtype=G.floatX)
                        obj_kl += T.maximum(np.asarray(kl_min, G.floatX), kl).sum(dtype=G.floatX)
                    else:
                        # alternative formulation, currently disabled
                        kl = T.maximum(np.asarray(kl_min, G.floatX), kl.sum(axis=(2, 3))).sum(axis=1, dtype=G.floatX)
                        obj_kl += kl
                else:
                    obj_kl += kl_sum

        output = .1 * x_dec(x_dec_nl(h, w), w)

        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output + .5, 0 + 1 / 512., 1 - 1 / 512.)
            logsd_x = 0 * mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1 / 256., _x).logp
            #obj_z = T.printing.Print('obj_z')(obj_z)
            obj = obj_logpx - obj_kl
            # Compute the bits per pixel
            obj *= (1. / np.prod(shape_x) * 1. / np.log(2.)).astype('float32')
            #if not '__init' in w:
            #    raise Exception()
        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.maximum(T.minimum(prob_x, 1 - 1e-7), 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)

            obj_logpx = N.rand.bernoulli(prob_x, _x).logp

            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpx - obj_kl
            #obj = T.printing.Print('obj')(obj)

        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results

    # Turns Gaussian noise 'eps' into a sample
    def f_decoder(eps, w):
        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'), (eps['eps_0_0'].shape[0], 1, shape_x[1] / 2**len(depths), shape_x[2] / 2**len(depths)))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h = layers[i][j].down_p(h, eps['eps_' + str(i) + '_' + str(j)], w)

        output = .1 * x_dec(x_dec_nl(h, w), w)

        if px == 'logistic':
            mean_x = T.clip(output + .5, 0 + 1 / 512., 1 - 1 / 512.)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)

        image = (256. * mean_x).astype('uint8')
        if pad_x > 0:
            image = image[:, :, pad_x:-pad_x, pad_x:-pad_x]
        return image

    def f_eps(n_batch, w):
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_' + str(i) + '_' + str(j)] = G.rng_curand.normal((n_batch, n_z, shape_x[1] / 2**(i + 1), shape_x[2] / 2**(i + 1)), dtype=floatX)
        return eps

    def postup(updates, w):
        nodes = [x_enc, x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(len(depths)):
            for j in range(depths[i]):
                updates = layers[i][j].postup(updates, w)
        return updates

    # Compile init function
    if data_init is not None:
        w['__init'] = OrderedDict()
        f_encode_decode(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()

    # Compile training function
    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')

    w_avg = {i: G.sharedf(w[i].get_value()) for i in w}

    def lazy(f):
        def newf(*args, **kws):
            if not hasattr(f, 'cache'):
                f.cache = f()
            return f.cache(*args, **kws)
        return newf

    @lazy
    def f_train():
        if optim == 'adamax':
            train_cost = f_encode_decode(w)['cost']
            updates = G.misc.optim.AdaMaxAvg([w], [w_avg], train_cost, alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
        elif optim == 'eve':
            f = lambda w: f_encode_decode(w)['cost']
            train_cost, updates = G.misc.optim.Eve(w, w_avg, f, alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x': x}, train_cost, updates=updates, lazy=lazy)

    @lazy
    def f_train_q():
        keys_q = []
        for i in w:
            if '_q_' in i:
                keys_q.append(i)
        train_cost = f_encode_decode(w)['cost']
        updates = G.misc.optim.AdaMaxAvg([w], None, train_cost, alpha=-alpha, beta1=beta1, beta2=beta2, update_keys=keys_q, disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x': x}, train_cost, updates=updates, lazy=lazy)

    # Compile evaluation function
    @lazy
    def f_eval():
        results = f_encode_decode(w_avg, False)
        return G.function({'x': x}, results)

    # Compile epsilon generating function
    @lazy
    def f_eps_():
        n_batch = T.lscalar()
        n_batch.tag.test_value = 16
        eps = f_eps(n_batch, w)
        return G.function({'n_batch': n_batch}, eps, lazy=lazy)

    # Compile sampling function
    @lazy
    def f_decode():
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_' + str(i) + '_' + str(j)] = T.tensor4('eps' + str(i))
                eps['eps_' + str(i) + '_' + str(j)].tag.test_value = np.random.randn(n_batch_test, n_z, shape_x[1] / 2**(i + 1), shape_x[2] / 2**(i + 1)).astype(floatX)
        image = f_decoder(eps, w_avg)
        return G.function(eps, image, lazy=lazy)

    return G.Struct(train=f_train, eval=f_eval, decode=f_decode, eps=f_eps_, w=w, w_avg=w_avg)

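# A standalone NumPy sketch of the kl_min ("free bits") constraint applied in
# f_encode_decode above: the per-unit KL is summed over spatial positions,
# averaged over the minibatch per feature map, and each feature map then
# contributes at least kl_min nats to the KL term of the objective.
# Illustrative only; _demo_free_bits is not part of the model code.
import numpy as np

def _demo_free_bits(kl, kl_min):
    # kl: per-unit KL divergences, shape (batch, featuremaps, height, width)
    kl_per_map = kl.sum(axis=(2, 3)).mean(axis=0)   # shape: (featuremaps,)
    return np.maximum(kl_min, kl_per_map).sum()     # scalar added to obj_kl

# e.g. _demo_free_bits(np.abs(np.random.randn(16, 32, 8, 8)), kl_min=0.25)
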
def gaussian_spherical(shape=None, sample=None):
    if sample is None:
        sample = G.rng_curand.normal(shape)
    if shape is None:
        assert sample is not None
        shape = sample.shape
    logp = -.5 * (T.log(2*math.pi) + sample**2).flatten(2).sum(axis=1)
    entr = (1.*T.prod(shape[1:]).astype(G.floatX)) * T.ones((shape[0],), dtype=G.floatX) * G.sharedf(.5 * (np.log(2.*math.pi)+1.))
    return RandomVariable(sample, logp, entr, shape=shape)

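# NumPy check of the formulas used in gaussian_spherical above: for a standard
# spherical Gaussian, log p(x) = -0.5 * sum_d (log(2*pi) + x_d**2) per sample,
# and the entropy is D/2 * (log(2*pi) + 1) nats, with D the number of
# non-batch dimensions. Purely illustrative; does not touch the Theano graph.
import math
import numpy as np

def _demo_spherical_gaussian_logp(sample):
    # sample: array of shape (batch, ...); density evaluated per batch element
    flat = sample.reshape(sample.shape[0], -1)
    logp = -.5 * (np.log(2 * math.pi) + flat**2).sum(axis=1)
    entr = .5 * flat.shape[1] * (np.log(2 * math.pi) + 1.) * np.ones(flat.shape[0])
    return logp, entr
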
def fcvae(shape_x, depth_model, depth_ar, n_h1, n_h2, n_z, posterior, px='logistic', nl='softplus', alpha=0.002, beta1=0.1, beta2=0.001, share_w=False, data_init=None):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE9 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic','bernoulli']
    w = {}  # model params

    kernel_h = (1,1)
    n_x = shape_x[0]*shape_x[1]*shape_x[2]

    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)

    # encoder
    x_enc = N.conv.conv2d('x_enc', n_x, n_h1, (1,1), w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, n_x, (1,1), w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)

    layers = []
    for i in range(depth_model):
        name = str(i)
        if share_w:
            name = '[sharedw]'+str(i)+'[/sharedw]'
        layers.append(cvae_layer(name, posterior, n_h1, n_h2, n_z, depth_ar, False, nl, kernel_h, share_w, w))

    # top-level value
    #w['h_top'] = G.sharedf(np.zeros((n_h1,)))
    w['h_top'] = G.sharedf(np.random.normal(0, 0.01, size=(n_h1,)))

    # Initialize variables
    x = T.tensor4('x')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip(x / 255., 0, 1)

    # Objective function
    def f_cost(w, train=True):

        results = {}

        h = x_enc(_x.reshape((-1,n_x,1,1)) - .5, w)

        obj_logpz = 0
        obj_logqz = 0

        # bottom-up encoders
        for i in range(depth_model):
            h = layers[i].up(h, w)

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (_x.shape[0],1,1,1))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h, _obj_logqz, _obj_logpz = layers[i].down_q(h, train, w)
            obj_logqz += _obj_logqz
            obj_logpz += _obj_logpz
            results['cost_z'+str(i).zfill(3)] = _obj_logqz - _obj_logpz

        output = .1 * x_dec(x_dec_nl(h, w), w).reshape((-1,shape_x[0],shape_x[1],shape_x[2]))

        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output, -.5, .5)
            logsd_x = 0*mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1/255., _x - .5).logp

            obj = obj_logpz - obj_logqz + obj_logpx
            # Compute the bits per pixel
            obj *= (1./np.prod(shape_x) * 1./np.log(2.)).astype('float32')
        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.minimum(prob_x, 1-1e-7)
            prob_x = T.maximum(prob_x, 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)

            obj_logpx = N.rand.bernoulli(prob_x, _x).logp

            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpz - obj_logqz + obj_logpx
            #obj = T.printing.Print('obj')(obj)

        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results

    #print 'obj_logpz', obj_logpz.tag.test_value
    #print 'obj_logqz', obj_logqz.tag.test_value
    #print 'obj_logpx', obj_x.tag.test_value

    #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
    #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
    #obj_x = T.printing.Print('obj_logpx')(obj_x)

    # Turns Gaussian noise 'eps' into a sample
    def f_decoder(eps, w):
        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (eps['eps_0'].shape[0],1,1,1))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h = layers[i].down_p(h, eps['eps_'+str(i)], w)

        output = .1 * x_dec(x_dec_nl(h, w), w).reshape((-1,shape_x[0],shape_x[1],shape_x[2]))
        if px == 'logistic':
            mean_x = T.clip(output[:,:,:,:] + .5, 0, 1)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)

        image = (255.*T.clip(mean_x, 0, 1)).astype('uint8')
        return image

    def f_eps(n_batch, w):
        eps = {}
        for i in range(depth_model):
            eps['eps_'+str(i)] = G.rng_curand.normal((n_batch,n_z,1,1), dtype=floatX)
        return eps

    def postup(updates, w):
        nodes = [x_enc, x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(depth_model):
            updates = layers[i].postup(updates, w)
        return updates

    # Compile init function
    if data_init is not None:
        w['__init'] = OrderedDict()
        f_cost(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()

    # Compile training function
    results = f_cost(w)
    updates, (w_avg,) = G.misc.optim.AdaMaxAvg([w], results['cost'], alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')
    updates = postup(updates, w)
    f_train = G.function({'x':x}, results['cost'], updates=updates)

    # Compile evaluation function
    results = f_cost(w_avg, False)
    f_eval = G.function({'x':x}, results)

    # Compile epsilon generating function
    n_batch = T.lscalar()
    n_batch.tag.test_value = 16
    eps = f_eps(n_batch, w)
    f_eps = G.function({'n_batch':n_batch}, eps)

    # Compile sampling function
    eps = {}
    for i in range(depth_model):
        eps['eps_'+str(i)] = T.tensor4('eps'+str(i))
        eps['eps_'+str(i)].tag.test_value = np.random.randn(n_batch_test,n_z,1,1).astype(floatX)
    image = f_decoder(eps, w_avg)
    f_decode = G.function(eps, image)

    return G.Struct(train=f_train, eval=f_eval, decode=f_decode, eps=f_eps, w=w, w_avg=w_avg)

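# The `obj *= (1./np.prod(shape_x) * 1./np.log(2.))` steps in cvae1/fcvae above
# rescale the per-image objective from nats to bits per input dimension (divide
# by the number of dimensions and by ln 2). A tiny illustrative helper; the
# function name and the example numbers below are made up, not from the models.
import numpy as np

def _demo_nats_to_bits_per_dim(nats_per_image, shape_x):
    return nats_per_image / (np.prod(shape_x) * np.log(2.))

# e.g. a negative ELBO of 7450 nats on a (3, 32, 32) image corresponds to
# _demo_nats_to_bits_per_dim(7450., (3, 32, 32)) ~= 3.5 bits per dimension.
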
def linear(name, n_in, n_out, diagonalzeros, l2norm=True, w={}):
    assert n_in % n_out == 0 or n_out % n_in == 0

    mask = np.ones((n_in, n_out), dtype=G.floatX)
    if n_out >= n_in:
        k = n_out / n_in
        for i in range(n_in):
            mask[i+1:, i*k:(i+1)*k] = 0
            if diagonalzeros:
                mask[i:i+1, i*k:(i+1)*k] = 0
    else:
        k = n_in / n_out
        for i in range(n_out):
            mask[(i+1)*k:, i:i+1] = 0
            if diagonalzeros:
                mask[i*k:(i+1)*k, i:i+1] = 0

    # L2 normalization of weights
    def l2normalize(_w, axis=0):
        if diagonalzeros:
            # to prevent NaN gradients
            # TODO: smarter solution (also see below)
            if n_out >= n_in:
                _w = T.set_subtensor(_w[:, :n_out/n_in], 0.)
            else:
                _w = T.set_subtensor(_w[:, :1], 0.)
        targetnorm = 1.
        norm = T.sqrt((_w**2).sum(axis=axis, keepdims=True))
        norm += 1e-8
        new_w = _w * (targetnorm / norm)
        return new_w

    def maxconstraint(_w):
        return _w * (maxweight / T.maximum(maxweight, abs(_w).max(axis=0, keepdims=True)))

    w[name+'_w'] = G.sharedf(mask * 0.05 * np.random.randn(n_in, n_out))
    if maxweight > 0:
        w[name+'_w'].set_value(maxconstraint(w[name+'_w']).tag.test_value)
    w[name+'_b'] = G.sharedf(np.zeros((n_out,)))
    if l2norm:
        if logscale:
            w[name+'_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name+'_s'] = G.sharedf(np.ones((n_out,)))
    elif do_constant_rescale:
        print "WARNING: constant rescale, these weights aren't saved"
        constant_rescale = G.sharedf(np.zeros((n_out,)))

    def f(h, w):
        _input = h
        _w = mask * w[name+'_w']
        if l2norm:
            _w = l2normalize(_w)
        h = T.dot(h, _w)
        if l2norm:
            if logscale:
                h *= T.exp(logscale_scale*w[name+'_s'])
            else:
                h *= abs(w[name+'_s'])
        elif do_constant_rescale:
            h *= T.exp(constant_rescale)

        h += w[name+'_b']

        if '__init' in w:
            # Std
            std = (1./init_stdev) * h.std(axis=0)
            std += (std <= 0)
            std += 1e-8
            if name+'_s' in w:
                if logscale:
                    w[name+'_s'].set_value(-T.log(std).tag.test_value/logscale_scale)
                else:
                    w[name+'_s'].set_value((1./std).tag.test_value)
            elif do_constant_rescale:
                constant_rescale.set_value(-T.log(std).tag.test_value)
                #w[name+'_w'].set_value((_w / std.dimshuffle('x',0)).tag.test_value)

            h /= std.dimshuffle('x',0)

            # Mean
            mean = h.mean(axis=0)
            w[name+'_b'].set_value(-mean.tag.test_value)
            h -= mean.dimshuffle('x',0)

        #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)

        return h

    # Post updates: normalize weights to unit L2 norm
    def postup(updates, w):
        updates[w[name+'_w']] = mask * updates[w[name+'_w']]
        if l2norm and maxweight > 0.:
            updates[w[name+'_w']] = maxconstraint(updates[w[name+'_w']])
        return updates

    return G.Struct(__call__=f, postup=postup, w=w)

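# NumPy reproduction of the autoregressive mask constructed in linear() above,
# so the dependency pattern can be inspected directly: with diagonalzeros=True,
# output block i only receives input from blocks < i; with False, from blocks
# <= i. Illustrative helper only (the name _demo_ar_mask is not library code);
# `//` is used for explicit integer division.
import numpy as np

def _demo_ar_mask(n_in, n_out, diagonalzeros):
    assert n_in % n_out == 0 or n_out % n_in == 0
    mask = np.ones((n_in, n_out), dtype='float32')
    if n_out >= n_in:
        k = n_out // n_in
        for i in range(n_in):
            mask[i + 1:, i * k:(i + 1) * k] = 0
            if diagonalzeros:
                mask[i:i + 1, i * k:(i + 1) * k] = 0
    else:
        k = n_in // n_out
        for i in range(n_out):
            mask[(i + 1) * k:, i:i + 1] = 0
            if diagonalzeros:
                mask[i * k:(i + 1) * k, i:i + 1] = 0
    return mask

# e.g. _demo_ar_mask(4, 8, True)[:, 2:4] is nonzero only in row 0,
# i.e. output block 1 depends only on input 0.
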