Example #1
File: optim.py Project: gburt/iaf
def AdaMaxAvg2(ws, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, n_accum=1):
    if n_accum == 1:
        return AdaMaxAvg(ws, objective, alpha, beta1, beta2, beta3)
    print 'AdaMax_Avg2', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2,'beta3:',beta3,'n_accum:',n_accum
    
    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='raise')

    new = OrderedDict()
    
    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(it,n_accum), 0)
    update = T.eq(T.mod(it,n_accum), n_accum-1)
    
    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0.)
            w_avg[i] = G.sharedf(_w.get_value())
            g_sum = G.sharedf(_w.get_value() * 0.)
        
            new[g_sum] = ifelse(reset, _g, g_sum + _g)
            new[mom1] = ifelse(update, (1-beta1) * mom1 + beta1 * new[g_sum], mom1)
            new[_max] = ifelse(update, T.maximum((1-beta2)*_max, abs(new[g_sum]) + 1e-8), _max)
            new[_w] = ifelse(update, _w + alpha *  new[mom1] / new[_max], _w)
            new[w_avg[i]] = ifelse(update, beta3 * new[_w] + (1.-beta3) * w_avg[i], w_avg[i])
        ws_avg += [w_avg]   
    return new, ws_avg
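
With n_accum > 1, gradients are summed over several consecutive minibatches; the AdaMax step (and the beta3 moving average of the weights) is only applied on the last minibatch of each accumulation window. Below is a rough NumPy sketch of just that control flow, with illustrative names (apply_adamax_step stands in for the update applied in the snippet; it is not repo API):

import numpy as np

def accumulated_step(step, g, g_sum, n_accum, apply_adamax_step):
    # Reset the accumulator at the start of each window, otherwise keep summing.
    g_sum = g if step % n_accum == 0 else g_sum + g
    # Only the last minibatch of the window triggers the actual parameter update.
    if step % n_accum == n_accum - 1:
        apply_adamax_step(g_sum)
    return g_sum
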
Example #2
File: optim.py Project: gburt/iaf
def Eve(w, w_avg, f, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, disconnected_inputs='raise'):
    print 'Eve', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2,'beta3:',beta3

    mom = {}
    _max = {}
    delta = {}
    w_prime = {}
    for i in w:
        mom[i] = G.sharedf(w[i].get_value() * 0.)
        _max[i] = G.sharedf(w[i].get_value() * 0. + 1e-8)
        delta[i] = G.sharedf(w[i].get_value() * 0.)
        w_prime[i] = w[i] + (1-beta1)/beta1 * delta[i]
    
    train_cost = f(w_prime).mean()
    g = G.ndict.T_grad(train_cost, w, disconnected_inputs=disconnected_inputs) #warn/raise
    
    new = OrderedDict()
    for i in w:
        new[mom[i]] = (1-beta1) * mom[i] + beta1 * g[i]
        new[_max[i]] = T.maximum((1-beta2)*_max[i], abs(g[i]) + 1e-8)
        new[delta[i]] = alpha * new[mom[i]] / new[_max[i]]
        new[w[i]] = w[i] + new[delta[i]]
    
    for i in w:
        new[w_avg[i]] = beta3 * w[i] + (1.-beta3) * w_avg[i]
    return train_cost, new
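
Eve evaluates the training cost at a lookahead point w_prime = w + (1 - beta1)/beta1 * delta, where delta is the previous update, which plays a role similar to a Nesterov-style lookahead on top of the AdaMax update. A minimal NumPy sketch of one step, assuming a user-supplied grad_at(w) function (illustrative names, not repo API):

import numpy as np

def eve_step(w, delta, mom, _max, grad_at, alpha=.01, beta1=.1, beta2=.001):
    # Take the gradient at a lookahead point built from the previous update.
    g = grad_at(w + (1 - beta1) / beta1 * delta)
    mom = (1 - beta1) * mom + beta1 * g                      # moving average of the gradient
    _max = np.maximum((1 - beta2) * _max, np.abs(g) + 1e-8)  # decayed infinity norm
    delta = alpha * mom / _max                               # new update step
    return w + delta, delta, mom, _max
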
Example #3
File: optim.py Project: gburt/iaf
def AdaMaxAvg(ws, ws_avg, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, update_keys=None, disconnected_inputs='raise'):
    print 'AdaMax_Avg', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2,'beta3:',beta3
    
    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs=disconnected_inputs) #warn/raise
    
    if update_keys is None:
        update_keys = [ws[j].keys() for j in range(len(ws))]
    
    new = OrderedDict()
    for j in range(len(ws)):
        if ws_avg is not None:
            w_avg = ws_avg[j]
        for i in update_keys[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0. + 1e-8)
            
            new[mom1] = (1-beta1) * mom1 + beta1 * _g
            new[_max] = T.maximum((1-beta2)*_max, abs(_g) + 1e-8)
            new[_w] = _w + alpha *  new[mom1] / new[_max]
            if ws_avg is not None:
                new[w_avg[i]] = beta3 * _w + (1.-beta3) * w_avg[i]
    return new
Example #4
def AdaMaxAvg(ws,
              ws_avg,
              objective,
              alpha=.01,
              beta1=.1,
              beta2=.001,
              beta3=0.01,
              update_keys=None,
              disconnected_inputs='raise'):
    print 'AdaMax_Avg', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'beta3:', beta3

    gs = G.ndict.T_grad(objective.sum(),
                        ws,
                        disconnected_inputs=disconnected_inputs)  #warn/raise

    if update_keys is None:
        update_keys = [ws[j].keys() for j in range(len(ws))]

    new = OrderedDict()
    for j in range(len(ws)):
        if ws_avg is not None:
            w_avg = ws_avg[j]
        for i in update_keys[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0. + 1e-8)

            new[mom1] = (1 - beta1) * mom1 + beta1 * _g
            new[_max] = T.maximum((1 - beta2) * _max, abs(_g) + 1e-8)
            new[_w] = _w + alpha * new[mom1] / new[_max]
            if ws_avg is not None:
                new[w_avg[i]] = beta3 * _w + (1. - beta3) * w_avg[i]
    return new
Example #5
def Eve(w,
        w_avg,
        f,
        alpha=.01,
        beta1=.1,
        beta2=.001,
        beta3=0.01,
        disconnected_inputs='raise'):
    print 'Eve', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'beta3:', beta3

    mom = {}
    _max = {}
    delta = {}
    w_prime = {}
    for i in w:
        mom[i] = G.sharedf(w[i].get_value() * 0.)
        _max[i] = G.sharedf(w[i].get_value() * 0. + 1e-8)
        delta[i] = G.sharedf(w[i].get_value() * 0.)
        w_prime[i] = w[i] + (1 - beta1) / beta1 * delta[i]

    train_cost = f(w_prime).mean()
    g = G.ndict.T_grad(train_cost, w,
                       disconnected_inputs=disconnected_inputs)  #warn/raise

    new = OrderedDict()
    for i in w:
        new[mom[i]] = (1 - beta1) * mom[i] + beta1 * g[i]
        new[_max[i]] = T.maximum((1 - beta2) * _max[i], abs(g[i]) + 1e-8)
        new[delta[i]] = alpha * new[mom[i]] / new[_max[i]]
        new[w[i]] = w[i] + new[delta[i]]

    for i in w:
        new[w_avg[i]] = beta3 * w[i] + (1. - beta3) * w_avg[i]
    return train_cost, new
Example #6
def gsm(name, k, w={}, logvar_minmax=16):
    w[name + '_weight'] = G.sharedf(np.zeros((k, )))
    w[name + '_logvar'] = G.sharedf(np.random.randn(k) * .1)

    def logp(v, w):
        mixtureweights = T.exp(w[name + '_weight'])
        mixtureweights /= mixtureweights.sum()
        logvar = logvar_minmax * w[name + '_logvar']
        var = T.exp(logvar)
        if k == 0:
            return 0.
        if k == 1:
            return -.5 * (v**2).sum() / var[0] - v.size.astype(
                G.floatX) * (.5 * T.log(2. * math.pi) + logvar[0])
        p = 0.
        for i in range(k):
            p += mixtureweights[i] * T.exp(-.5 * v**2 / var[i]) / T.sqrt(
                2. * math.pi * var[i])
        logp = T.log(p).sum()
        return logp

    def postup(updates, w):
        updates[w[name + '_logvar']] = T.clip(updates[w[name + '_logvar']],
                                              -1., 1.)
        return updates

    return G.Struct(logp=logp, postup=postup, w=w)
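
For reference, the same Gaussian scale-mixture log-density written directly in NumPy for a flattened parameter vector v; a sketch with illustrative names, not repo API:

import numpy as np

def gsm_logp(v, weight_logits, logvars):
    pi = np.exp(weight_logits)
    pi /= pi.sum()                        # mixture weights via softmax
    var = np.exp(logvars)                 # per-component variances
    # Mixture density evaluated elementwise on v, then log and sum over elements.
    comp = (pi[None, :] * np.exp(-.5 * v[:, None]**2 / var[None, :])
            / np.sqrt(2. * np.pi * var[None, :]))
    return np.log(comp.sum(axis=1)).sum()
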
Example #7
def AdaMax2(w, objective, alpha=.01, beta1=.1, beta2=.001, n_accum=2):
    print 'AdaMax2', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'n_accum:', n_accum
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(new[it], n_accum), 0)
    update = T.eq(T.mod(new[it], n_accum), n_accum - 1)

    for i in range(len(w)):
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        g_sum = G.sharedf(w[i].get_value() * 0.)

        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        new[g_sum] = ifelse(reset, g[i], g_sum + g[i])
        new[mom1] = ifelse(update, (1 - beta1) * mom1 + beta1 * new[g_sum],
                           mom1)
        new[_max] = ifelse(
            update, T.maximum((1 - beta2) * _max,
                              abs(new[g_sum]) + 1e-8), _max)
        new[w[i]] = ifelse(update, w[i] + alpha * new[mom1] / new[_max], w[i])

    return new
Example #8
def nonlinearity(name, which, shape=None, w={}):
    
    if which == 'prelu':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'pelu':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'softplus2':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'softplus_shiftscale':
        w[name+'_in_s'] = G.sharedf(np.zeros(shape))
        w[name+'_in_b'] = G.sharedf(np.zeros(shape))
    if which == 'linearsigmoid':
        w[name+'_a'] = G.sharedf(.5*np.ones(shape))
        w[name+'_b'] = G.sharedf(.5*np.ones(shape))
    if which == 'meanonlybatchnorm_softplus':
        assert type(shape) == int
        w[name+'_b'] = G.sharedf(np.zeros(shape))
    if which == 'meanonlybatchnorm_relu':
        assert type(shape) == int
        w[name+'_b'] = G.sharedf(np.zeros(shape))
    
    def f(h, w=None):
        if which == None or which == 'None':
            return h
        elif which == 'tanh':
            return T.tanh(h)
        elif which == 'softmax':
            return T.nnet.softmax(h)
        elif which == 'prelu':
            return w[name]*h*(h<0.) + h*(h>=0.)
        elif which == 'relu':
            return h*(h>=0.)
        elif which == 'shiftedrelu':
            return T.switch(h < -1., -1., h)
        elif which == 'leakyrelu':
            return 0.01 * h*(h<0.) + h*(h>=0.)
        elif which == 'elu':
            return T.switch(h < 0., T.exp(h)-1, h)
        elif which == 'softplus':
            return T.nnet.softplus(h)
        elif which == 'softplus_shiftscale':
            return T.nnet.softplus(T.exp(w[name+'_in_s']) * h + w[name+'_in_b'])
        elif which == 'softplus2':
            return T.nnet.softplus(h) - w[name] * T.nnet.softplus(-h)
        elif which == 'linearsigmoid':
            return w[name+'_a'] * h + w[name+'_b'] * T.nnet.sigmoid(h)
        elif which == 'meanonlybatchnorm_softplus':
            h -= h.mean(axis=(0,2,3), keepdims=True)
            h += w[name+'_b'].dimshuffle('x',0,'x','x')
            return T.nnet.softplus(h)
        elif which == 'meanonlybatchnorm_relu':
            h -= h.mean(axis=(0,2,3), keepdims=True)
            h += w[name+'_b'].dimshuffle('x',0,'x','x')
            return T.nnet.relu(h)
        else:
            raise Exception("Unrecognized nonlinearity: "+which)
        
        
    return G.Struct(__call__=f, w=w)
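
Two of the less common activations above, restated in plain NumPy as a reference sketch; the learned parameter a corresponds to the w[name] entry created by the factory:

import numpy as np

def prelu(h, a):
    # Slope a on the negative part, identity on the positive part.
    return a * h * (h < 0.) + h * (h >= 0.)

def softplus2(h, a):
    # softplus(h) minus a learned fraction of the mirrored softplus.
    return np.logaddexp(0., h) - a * np.logaddexp(0., -h)
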
Example #9
def nonlinearity(name, which, shape=None, w={}):
    
    if which == 'prelu':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'pelu':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'softplus2':
        w[name] = G.sharedf(np.zeros(shape))
    if which == 'softplus_shiftscale':
        w[name+'_in_s'] = G.sharedf(np.zeros(shape))
        w[name+'_in_b'] = G.sharedf(np.zeros(shape))
    if which == 'linearsigmoid':
        w[name+'_a'] = G.sharedf(.5*np.ones(shape))
        w[name+'_b'] = G.sharedf(.5*np.ones(shape))
    if which == 'meanonlybatchnorm_softplus':
        assert type(shape) == int
        w[name+'_b'] = G.sharedf(np.zeros(shape))
    if which == 'meanonlybatchnorm_relu':
        assert type(shape) == int
        w[name+'_b'] = G.sharedf(np.zeros(shape))
    
    def f(h, w=None):
        if which == None or which == 'None':
            return h
        elif which == 'tanh':
            return T.tanh(h)
        elif which == 'softmax':
            return T.nnet.softmax(h)
        elif which == 'prelu':
            return w[name]*h*(h<0.) + h*(h>=0.)
        elif which == 'relu':
            return h*(h>=0.)
        elif which == 'shiftedrelu':
            return T.switch(h < -1., -1., h)
        elif which == 'leakyrelu':
            return 0.01 * h*(h<0.) + h*(h>=0.)
        elif which == 'elu':
            return T.switch(h < 0., T.exp(h)-1, h)
        elif which == 'softplus':
            return T.nnet.softplus(h)
        elif which == 'softplus_shiftscale':
            return T.nnet.softplus(T.exp(w[name+'_in_s']) * h + w[name+'_in_b'])
        elif which == 'softplus2':
            return T.nnet.softplus(h) - w[name] * T.nnet.softplus(-h)
        elif which == 'linearsigmoid':
            return w[name+'_a'] * h + w[name+'_b'] * T.nnet.sigmoid(h)
        elif which == 'meanonlybatchnorm_softplus':
            h -= h.mean(axis=(0,2,3), keepdims=True)
            h += w[name+'_b'].dimshuffle('x',0,'x','x')
            return T.nnet.softplus(h)
        elif which == 'meanonlybatchnorm_relu':
            h -= h.mean(axis=(0,2,3), keepdims=True)
            h += w[name+'_b'].dimshuffle('x',0,'x','x')
            return T.nnet.relu(h)
        else:
            raise Exception("Unrecognized nonlinearity: "+which)
        
        
    return G.Struct(__call__=f, w=w)
Example #10
File: optim.py Project: gburt/iaf
def AdaMax(w, objective, alpha=.01, beta1=.1, beta2=.001):
    print 'AdaMax', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')
    
    new = OrderedDict()
    
    for i in range(len(w)):
        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        new[mom1] = (1-beta1) * mom1 + beta1 * g[i]
        new[_max] = T.maximum((1-beta2)*_max, abs(g[i]) + 1e-8)
        new[w[i]] = w[i] + alpha *  new[mom1] / new[_max]
                
    return new
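
A minimal NumPy sketch of this AdaMax-style update. Note that in this codebase beta1 and beta2 weight the new gradient (so beta1 = .1 corresponds to a decay of .9 on the running average), and the step is added to w, so a negative alpha descends. Names are illustrative:

import numpy as np

def adamax_step(w, g, mom1, _max, alpha=.01, beta1=.1, beta2=.001):
    mom1 = (1 - beta1) * mom1 + beta1 * g                      # moving average of the gradient
    _max = np.maximum((1 - beta2) * _max, np.abs(g) + 1e-8)    # decayed infinity norm
    return w + alpha * mom1 / _max, mom1, _max
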
Example #11
def AdaMax(w, objective, alpha=.01, beta1=.1, beta2=.001):
    print 'AdaMax', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()

    for i in range(len(w)):
        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        new[mom1] = (1 - beta1) * mom1 + beta1 * g[i]
        new[_max] = T.maximum((1 - beta2) * _max, abs(g[i]) + 1e-8)
        new[w[i]] = w[i] + alpha * new[mom1] / new[_max]

    return new
Example #12
def f_encode_decode(w, train=True):

    results = {}

    h = x_enc(_x - .5, w)

    obj_kl = G.sharedf(0.)

    # bottom-up encoders
    for i in range(len(depths)):
        for j in range(depths[i]):
            h = layers[i][j].up(h, w)

    # top-level activations
    h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (_x.shape[0],1,shape_x[1]/2**len(depths), shape_x[2]/2**len(depths)))

    # top-down priors, posteriors and decoders
    for i in list(reversed(range(len(depths)))):
        for j in list(reversed(range(depths[i]))):
            h, kl = layers[i][j].down_q(h, train, w)
            kl_sum = kl.sum(axis=(1,2,3))
            results['cost_z'+str(i).zfill(3)+'_'+str(j).zfill(3)] = kl_sum
            # Constraint: Minimum number of bits per featuremap, averaged across minibatch
            if kl_min > 0:
                kl = kl.sum(axis=(2,3)).mean(axis=0,dtype=G.floatX)
                obj_kl += T.maximum(np.asarray(kl_min,G.floatX), kl).sum(dtype=G.floatX)
            else:
                obj_kl += kl_sum

    output = x_dec(x_dec_nl(h, w), w)

    # empirical distribution
    if px == 'logistic':
        mean_x = T.clip(output+.5, 0, 1)
        logsd_x = 0*mean_x + w['logsd_x']
        obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1/256., _x).logp
        #obj_z = T.printing.Print('obj_z')(obj_z)
        obj = obj_logpx - obj_kl
        # Compute the bits per pixel
        obj *= (1./np.prod(shape_x) * 1./np.log(2.)).astype('float32')

        #if not '__init' in w:
        #    raise Exception()

    elif px == 'bernoulli':
        prob_x = T.nnet.sigmoid(output)
        prob_x = T.maximum(T.minimum(prob_x, 1-1e-7), 1e-7)
        #prob_x = T.printing.Print('prob_x')(prob_x)
        obj_logpx = N.rand.bernoulli(prob_x, _x).logp

        #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
        #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
        #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
        obj = obj_logpx - obj_kl
        #obj = T.printing.Print('obj')(obj)

    results['cost_x'] = -obj_logpx
    results['cost'] = -obj
    return results
Example #13
def batchnorm_meanonly(name, n_h, w={}):
    w[name+'_b'] = G.sharedf(np.zeros((n_h,)))
    def f(h, w):
        h -= h.mean(axis=(0,2,3), keepdims=True)
        h += w[name+'_b'].dimshuffle('x',0,'x','x')
        return h
    return G.Struct(__call__=f, w=w)
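
The same mean-only batch normalization in NumPy, for a (batch, channels, height, width) activation tensor; a sketch, not repo API:

import numpy as np

def meanonly_batchnorm(h, b):
    h = h - h.mean(axis=(0, 2, 3), keepdims=True)  # subtract the per-channel batch mean
    return h + b.reshape(1, -1, 1, 1)              # add back a learned per-channel bias
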
Example #14
File: rand.py Project: gburt/iaf
def zero_centered_gaussian(name, w={}):
    w[name+'_logvar'] = G.sharedf(0.)
    def logp(v, w):
        logvar = w[name+'_logvar']*10
        return v.size.astype(G.floatX) * -.5 * (T.log(2.*math.pi) + logvar) - .5 * (v**2).sum() / T.exp(logvar)
    postup = lambda updates, w:updates
    return G.Struct(logp=logp, postup=postup, w=w)
Example #15
def batchnorm_meanonly(name, n_h, w={}):
    w[name+'_b'] = G.sharedf(np.zeros((n_h,)))
    def f(h, w):
        h -= h.mean(axis=(0,2,3), keepdims=True)
        h += w[name+'_b'].dimshuffle('x',0,'x','x')
        return h
    return G.Struct(__call__=f, w=w)
Example #16
def AdaMaxAvg2(ws,
               objective,
               alpha=.01,
               beta1=.1,
               beta2=.001,
               beta3=0.01,
               n_accum=1):
    if n_accum == 1:
        return AdaMaxAvg(ws, objective, alpha, beta1, beta2, beta3)
    print 'AdaMax_Avg2', 'alpha:', alpha, 'beta1:', beta1, 'beta2:', beta2, 'beta3:', beta3, 'n_accum:', n_accum

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='raise')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(it, n_accum), 0)
    update = T.eq(T.mod(it, n_accum), n_accum - 1)

    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0.)
            w_avg[i] = G.sharedf(_w.get_value())
            g_sum = G.sharedf(_w.get_value() * 0.)

            new[g_sum] = ifelse(reset, _g, g_sum + _g)
            new[mom1] = ifelse(update, (1 - beta1) * mom1 + beta1 * new[g_sum],
                               mom1)
            new[_max] = ifelse(
                update, T.maximum((1 - beta2) * _max,
                                  abs(new[g_sum]) + 1e-8), _max)
            new[_w] = ifelse(update, _w + alpha * new[mom1] / new[_max], _w)
            new[w_avg[i]] = ifelse(update,
                                   beta3 * new[_w] + (1. - beta3) * w_avg[i],
                                   w_avg[i])
        ws_avg += [w_avg]
    return new, ws_avg
Example #17
def zero_centered_laplace(name, w={}):
    w[name + '_logscale'] = G.sharedf(0.)

    def logp(v, w):
        return -abs(v).sum() / T.exp(w[name + '_logscale']) - v.size.astype(
            G.floatX) * (T.log(2.) + w[name + '_logscale'])

    postup = lambda updates, w: updates
    return G.Struct(logp=logp, postup=postup, w=w)
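
The corresponding log-density in plain NumPy, summed over all elements of a parameter array v (illustrative sketch):

import numpy as np

def laplace_logp(v, logscale):
    # Zero-centered Laplace: -|v|/b - log(2b) per element, with b = exp(logscale).
    return -np.abs(v).sum() / np.exp(logscale) - v.size * (np.log(2.) + logscale)
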
Example #18
def zero_centered_gaussian(name, w={}):
    w[name + '_logvar'] = G.sharedf(0.)

    def logp(v, w):
        logvar = w[name + '_logvar'] * 10
        return v.size.astype(G.floatX) * -.5 * (
            T.log(2. * math.pi) + logvar) - .5 * (v**2).sum() / T.exp(logvar)

    postup = lambda updates, w: updates
    return G.Struct(logp=logp, postup=postup, w=w)
Example #19
def gaussian_spherical(shape=None, sample=None):
    if sample is None:
        sample = G.rng_curand.normal(shape)
    if shape is None:
        assert sample != None
        shape = sample.shape
    logp = -.5 * (T.log(2 * math.pi) + sample**2).flatten(2).sum(axis=1)
    entr = (1. * T.prod(shape[1:]).astype(G.floatX)) * T.ones(
        (shape[0], ), dtype=G.floatX) * G.sharedf(.5 *
                                                  (np.log(2. * math.pi) + 1.))
    return RandomVariable(sample, logp, entr, shape=shape)
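
The log-density and entropy computed here, restated in NumPy for a batch of samples from a standard spherical Gaussian (sketch only):

import numpy as np

def spherical_gaussian_stats(sample):
    flat = sample.reshape(sample.shape[0], -1)
    logp = -.5 * (np.log(2 * np.pi) + flat**2).sum(axis=1)            # log N(sample; 0, I) per example
    entr = flat.shape[1] * .5 * (np.log(2 * np.pi) + 1.) * np.ones(flat.shape[0])
    return logp, entr
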
Example #20
File: rand.py Project: gburt/iaf
def gsm(name, k, w={}, logvar_minmax=16):
    w[name+'_weight'] = G.sharedf(np.zeros((k,)))
    w[name+'_logvar'] = G.sharedf(np.random.randn(k)*.1)
    def logp(v, w):
        mixtureweights = T.exp(w[name+'_weight'])
        mixtureweights /= mixtureweights.sum()
        logvar = logvar_minmax*w[name+'_logvar']
        var = T.exp(logvar)
        if k == 0:
            return 0.
        if k == 1:
            return -.5*(v**2).sum()/var[0] - v.size.astype(G.floatX) * (.5*T.log(2.*math.pi) + logvar[0])
        p = 0.
        for i in range(k):
            p += mixtureweights[i] * T.exp(-.5*v**2/var[i]) / T.sqrt(2.*math.pi*var[i])
        logp = T.log(p).sum()
        return logp
    
    def postup(updates, w):
        updates[w[name+'_logvar']] = T.clip(updates[w[name+'_logvar']], -1., 1.)
        return updates
     
    return G.Struct(logp=logp, postup=postup, w=w)
Example #21
def Adam(ws, objective, alpha=.0003, beta=.9, gamma=.999):
    print 'Adam', 'alpha:', alpha, 'beta1:', beta, 'gamma:', gamma

    new = OrderedDict()

    gs = G.ndict.T_grad(objective.sum(), ws,
                        disconnected_inputs='warn')  #warn/raise

    it = G.sharedf(0.)
    new[it] = it + 1.

    fix1 = 1 - beta**(it + 1.)
    fix2 = 1 - gamma**(it + 1.)  # To make estimates unbiased
    lr_t = alpha * T.sqrt(fix2) / fix1

    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            w = ws[j][i]
            g = gs[j][i]

            # Initial values
            shape = w.get_value().shape
            m = G.sharedf(np.zeros(shape))
            v = G.sharedf(np.zeros(shape))
            w_avg[i] = G.sharedf(np.zeros(shape))

            # Updates
            new[m] = beta * m + (1 - beta) * g
            new[v] = gamma * v + (1 - gamma) * g**2
            new[w] = w + lr_t * new[m] / (T.sqrt(new[v]) + 1e-8)
            new[w_avg[i]] = gamma * new[w] + (1. - gamma) * w_avg[i]

        ws_avg += [w_avg]

    return new, ws_avg
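
One Adam step as implemented above, in NumPy: the bias correction is folded into the step size, and a Polyak-style moving average of the weights is kept with the same gamma. A sketch with illustrative names; t is the iteration counter starting at 0:

import numpy as np

def adam_step(w, g, m, v, t, w_avg, alpha=.0003, beta=.9, gamma=.999):
    m = beta * m + (1 - beta) * g              # first moment
    v = gamma * v + (1 - gamma) * g**2         # second moment
    lr_t = alpha * np.sqrt(1 - gamma**(t + 1)) / (1 - beta**(t + 1))  # bias-corrected step size
    w = w + lr_t * m / (np.sqrt(v) + 1e-8)     # step is added; pass a negative alpha to minimize
    w_avg = gamma * w + (1 - gamma) * w_avg    # moving average of the weights
    return w, m, v, w_avg
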
Example #22
File: optim.py Project: gburt/iaf
def AdaMax2(w, objective, alpha=.01, beta1=.1, beta2=.001, n_accum=2):
    print 'AdaMax2', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2, 'n_accum:', n_accum
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')
    
    new = OrderedDict()
    
    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(new[it],n_accum), 0)
    update = T.eq(T.mod(new[it],n_accum), n_accum-1)

    for i in range(len(w)):
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        g_sum = G.sharedf(w[i].get_value() * 0.)
        
        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        new[g_sum] = ifelse(reset, g[i], g_sum + g[i])
        new[mom1] = ifelse(update, (1-beta1) * mom1 + beta1 * new[g_sum], mom1)
        new[_max] = ifelse(update, T.maximum((1-beta2)*_max, abs(new[g_sum]) + 1e-8), _max)
        new[w[i]] = ifelse(update, w[i] + alpha *  new[mom1] / new[_max], w[i])
                
    return new
Example #23
File: optim.py Project: gburt/iaf
def Adam(ws, objective, alpha=.0003, beta=.9, gamma=.999):
    print 'Adam', 'alpha:',alpha,'beta1:',beta,'gamma:',gamma
    
    new = OrderedDict()

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='warn') #warn/raise
    
    it = G.sharedf(0.)
    new[it] = it + 1.
    
    fix1 = 1-beta**(it+1.)
    fix2 = 1-gamma**(it+1.) # To make estimates unbiased
    lr_t = alpha * T.sqrt(fix2) / fix1
    
    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            w = ws[j][i]
            g = gs[j][i]
            
            # Initial values
            shape = w.get_value().shape
            m = G.sharedf(np.zeros(shape))
            v = G.sharedf(np.zeros(shape))
            w_avg[i] = G.sharedf(np.zeros(shape))
            
            # Updates
            new[m] = beta * m + (1-beta) * g
            new[v] = gamma * v + (1-gamma) * g**2
            new[w] = w + lr_t * new[m] / (T.sqrt(new[v]) + 1e-8)
            new[w_avg[i]] = gamma * new[w] + (1.-gamma) * w_avg[i]
            
        ws_avg += [w_avg]   
        
    return new, ws_avg
Example #24
def linear_l2(name, n_in, n_out, w):
    
    # L2 normalization of weights
    def l2normalize(_w):
        targetnorm=1.
        norm = T.sqrt((_w**2).sum(axis=0, keepdims=True))
        return _w * (targetnorm / norm)
    def maxconstraint(_w):
        return _w * (maxweight / T.maximum(maxweight, abs(_w).max(axis=0, keepdims=True)))
    
    w[name+'_w'] = G.sharedf(0.05*np.random.randn(n_in,n_out))
    
    if maxweight > 0:
        w[name+'_w'].set_value(maxconstraint(w[name+'_w']).tag.test_value)
    w[name+'_b'] = G.sharedf(np.zeros((n_out,)))
    if l2norm:
        if logscale:
            w[name+'_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name+'_s'] = G.sharedf(np.ones((n_out,)))
    else:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.zeros((n_out,)))
    
    
    def f(h, w):
        _w = w[name+'_w']
        if l2norm:
            _w = l2normalize(_w)
        h = T.dot(h, _w)
        if l2norm:
            if logscale:
                h *= T.exp(logscale_scale*w[name+'_s'])
            else:
                h *= abs(w[name+'_s'])
        else:
            h *= T.exp(constant_rescale)
        h += w[name+'_b']
        
        if '__init' in w:
            # Std
            std = (1./init_stdev) * h.std(axis=0) + 1e-8
            if name+'_s' in w:
                if logscale:
                    w[name+'_s'].set_value(-T.log(std).tag.test_value/logscale_scale)
                else:
                    w[name+'_s'].set_value((1./std).tag.test_value)
            else:
                constant_rescale.set_value(-T.log(std).tag.test_value)
                #w[name+'_w'].set_value((_w / std.dimshuffle('x',0)).tag.test_value)
            
            h /= std.dimshuffle('x',0)
            
            # Mean
            mean = h.mean(axis=0)
            w[name+'_b'].set_value(-mean.tag.test_value)
            h -= mean.dimshuffle('x',0)
            
            #print name, abs(w[name+'_w']).get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        
        return h
    
    # Post updates: normalize weights to unit L2 norm
    def postup(updates, w):
        if l2norm and maxweight>0:
            updates[w[name+'_w']] = maxconstraint(updates[w[name+'_w']])
        return updates
    
    return G.Struct(__call__=f, postup=postup, w=w)
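
The core of this layer is weight normalization with data-dependent initialization: the columns of the raw weight matrix are rescaled to unit L2 norm, a learned per-output log-scale multiplies the result, and at init the scale and bias are chosen so the pre-activations have the target std and zero mean. A NumPy sketch of those two pieces, assuming the repo's logscale option and ignoring its logscale_scale and maxweight globals:

import numpy as np

def weightnorm_linear(h, w_raw, log_s, b):
    w = w_raw / np.sqrt((w_raw**2).sum(axis=0, keepdims=True))  # unit-norm columns
    return np.dot(h, w) * np.exp(log_s) + b                     # learned scale and bias

def data_dependent_init(h_pre, init_stdev=1.):
    # h_pre: pre-activations computed with log_s = 0 and b = 0 on an init batch.
    std = (1. / init_stdev) * h_pre.std(axis=0) + 1e-8
    log_s = -np.log(std)                    # so that exp(log_s) = 1 / std
    b = -(h_pre / std).mean(axis=0)         # center the rescaled activations
    return log_s, b
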
Example #25
File: conv.py Project: gburt/iaf
def conv2d(name, n_in, n_out, size_kernel=(3,3), pad_channel=True, border_mode='valid', downsample=1, upsample=1, datainit=True, zeroinit=False, l2norm=True,  w={}):
    
    # TODO FIX: blows up parameters if all inputs are 0
    
    if not pad_channel:
        border_mode = 'same'
        print 'No pad_channel, changing border_mode to same'

    if '[sharedw]' in name and '[/sharedw]' in name:
        name_w = name
        pre, b = name.split("[sharedw]")
        number, post = b.split("[/sharedw]")
        name_w = pre+"[s]"+post
        name = pre+number+post # Don't share the bias and scales
        #name = name_w # Also share the bias and scales
    else:
        name_w = name
    
    if type(downsample) == int:
        downsample = (downsample,downsample)
    assert type(downsample) == tuple
    assert border_mode in ['valid','full','same']
    
    _n_in = n_in
    _n_out = n_out
    if upsample > 1:
        _n_out = n_out * upsample**2
    
    if pad_channel:
        if size_kernel[0] > 1 or size_kernel[1] > 1:
            assert size_kernel[0] == size_kernel[1]
            assert border_mode == 'valid'
            _n_in += 1
        else:
            pad_channel = False
    
    if border_mode == 'same':
        assert size_kernel[0]%2 == 1
        border_mode = ((size_kernel[0]-1)/2,(size_kernel[1]-1)/2)
    
    def l2normalize(kerns):
        norm = T.sqrt((kerns**2).sum(axis=(1,2,3), keepdims=True))
        return kerns / norm
    def maxconstraint(kerns):
        return kerns * (maxweight / T.maximum(maxweight, abs(kerns).max(axis=(1,2,3), keepdims=True)))

    if zeroinit:
        w[name_w+'_w'] = G.sharedf(np.zeros((_n_out, _n_in, size_kernel[0], size_kernel[1])))
        datainit = False
    else: 
        w[name_w+'_w'] = G.sharedf(0.05*np.random.randn(_n_out, _n_in, size_kernel[0], size_kernel[1]))
        if maxweight > 0:
            w[name_w+'_w'].set_value(maxconstraint(w[name_w+'_w']).tag.test_value)
    
    w[name+'_b'] = G.sharedf(np.zeros((_n_out,)))
    
    if l2norm:
        if logscale:
            w[name+'_s'] = G.sharedf(np.zeros((_n_out,)))
        else:
            w[name+'_s'] = G.sharedf(np.ones((_n_out,)))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.ones((_n_out,)))
    
    
    def f(h, w):
        
        input_shape = h.tag.test_value.shape[1:]

        _input = h
        
        if pad_channel:
            h = pad2dwithchannel(h, size_kernel)

        kerns = w[name_w+'_w']
        #if name == '1_down_conv1':
        #    kerns = T.printing.Print('kerns 1')(kerns)
        if l2norm:
            kerns = l2normalize(kerns)
            if logscale:
                kerns *= T.exp(logscale_scale*w[name+'_s']).dimshuffle(0,'x','x','x')
            else:
                kerns *= w[name+'_s'].dimshuffle(0,'x','x','x')
        elif do_constant_rescale:
            kerns *= constant_rescale.dimshuffle(0,'x','x','x')
        
        #if name == '1_down_conv1':
        #    kerns = T.printing.Print('kerns 2')(kerns)
        
        h = dnn_conv(h, kerns, border_mode=border_mode, subsample=downsample)

        # Mean-only batch norm
        if bn: 
            h -= h.mean(axis=(0,2,3), keepdims=True)
        
        h += w[name+'_b'].dimshuffle('x',0,'x','x')

        if '__init' in w and datainit:
            
            # Std
            data_std = h.std(axis=(0,2,3))
            num_zeros = (data_std.tag.test_value == 0).sum()
            if num_zeros > 0:
                print "Warning: Stdev=0 for "+str(num_zeros)+" features in "+name+". Skipping data-dependent init."
            else:
                
                std = (1./init_stdev) * data_std
                std += 1e-7
                
                if name+'_s' in w:
                    if logscale:
                        w[name+'_s'].set_value(-T.log(std).tag.test_value/logscale_scale)
                    else:
                        w[name+'_s'].set_value((1./std).tag.test_value)
                elif do_constant_rescale:
                    constant_rescale.set_value((1./std).tag.test_value)
                
                h /= std.dimshuffle('x',0,'x','x')
                
                # Mean
                mean = h.mean(axis=(0,2,3))
                w[name+'_b'].set_value(-mean.tag.test_value)
                h -= mean.dimshuffle('x',0,'x','x')
            
                #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()
        
        if upsample>1:
            h = depool2d_split(h, factor=upsample)
        
        if not '__init' in w:
            output_shape = h.tag.test_value.shape[1:]
            print 'conv2d', name, input_shape, output_shape, size_kernel, pad_channel, border_mode, downsample, upsample
        
        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        
        return h
    
    # Normalize weights to _norm L2 norm
    # TODO: check whether only_upper_bounds here really helps
    # (the effect is a higher learning rate in the beginning of training)
    def postup(updates, w):
        if l2norm and maxweight>0.:
            updates[w[name_w+'_w']] = maxconstraint(updates[w[name_w+'_w']])
        return updates
    
    return G.Struct(__call__=f, w=w, postup=postup)
Example #26
def conv2d(name, n_in, n_out, size_kernel=(3,3), pad_channel=True, border_mode='valid', downsample=1, upsample=1, datainit=True, zeroinit=False, l2norm=True,  w={}):
    
    # TODO FIX: blows up parameters if all inputs are 0
    
    if not pad_channel:
        border_mode = 'same'
        print 'No pad_channel, changing border_mode to same'

    if '[sharedw]' in name and '[/sharedw]' in name:
        name_w = name
        pre, b = name.split("[sharedw]")
        number, post = b.split("[/sharedw]")
        name_w = pre+"[s]"+post
        name = pre+number+post # Don't share the bias and scales
        #name = name_w # Also share the bias and scales
    else:
        name_w = name
    
    if type(downsample) == int:
        downsample = (downsample,downsample)
    assert type(downsample) == tuple
    assert border_mode in ['valid','full','same']
    
    _n_in = n_in
    _n_out = n_out
    if upsample > 1:
        _n_out = n_out * upsample**2
    
    if pad_channel:
        if size_kernel[0] > 1 or size_kernel[1] > 1:
            assert size_kernel[0] == size_kernel[1]
            assert border_mode == 'valid'
            _n_in += 1
        else:
            pad_channel = False
    
    if border_mode == 'same':
        assert size_kernel[0]%2 == 1
        border_mode = ((size_kernel[0]-1)/2,(size_kernel[1]-1)/2)
    
    def l2normalize(kerns):
        norm = T.sqrt((kerns**2).sum(axis=(1,2,3), keepdims=True))
        return kerns / norm
    def maxconstraint(kerns):
        return kerns * (maxweight / T.maximum(maxweight, abs(kerns).max(axis=(1,2,3), keepdims=True)))

    if zeroinit:
        w[name_w+'_w'] = G.sharedf(np.zeros((_n_out, _n_in, size_kernel[0], size_kernel[1])))
        datainit = False
    else: 
        w[name_w+'_w'] = G.sharedf(0.05*np.random.randn(_n_out, _n_in, size_kernel[0], size_kernel[1]))
        if maxweight > 0:
            w[name_w+'_w'].set_value(maxconstraint(w[name_w+'_w']).tag.test_value)
    
    w[name+'_b'] = G.sharedf(np.zeros((_n_out,)))
    if bias_logscale:
        w[name+'_bs'] = G.sharedf(0.)
    
    if l2norm:
        if logscale:
            w[name+'_s'] = G.sharedf(np.zeros((_n_out,)))
        else:
            w[name+'_s'] = G.sharedf(np.ones((_n_out,)))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.ones((_n_out,)))
    
    
    def f(h, w):
        
        input_shape = h.tag.test_value.shape[1:]

        _input = h
        
        if pad_channel:
            h = pad2dwithchannel(h, size_kernel)

        kerns = w[name_w+'_w']
        #if name == '1_down_conv1':
        #    kerns = T.printing.Print('kerns 1')(kerns)
        if l2norm:
            kerns = l2normalize(kerns)
            if logscale:
                kerns *= T.exp(logscale_scale*w[name+'_s']).dimshuffle(0,'x','x','x')
            else:
                kerns *= w[name+'_s'].dimshuffle(0,'x','x','x')
        elif do_constant_rescale:
            kerns *= constant_rescale.dimshuffle(0,'x','x','x')
        
        #if name == '1_down_conv1':
        #    kerns = T.printing.Print('kerns 2')(kerns)
        
        h = dnn_conv(h, kerns, border_mode=border_mode, subsample=downsample)

        # Mean-only batch norm
        if bn: 
            h -= h.mean(axis=(0,2,3), keepdims=True)
        
        _b = w[name+'_b'].dimshuffle('x',0,'x','x')
        if bias_logscale:
            _b *= T.exp(logscale_scale * w[name+'_bs'])
        h += _b
        
        if '__init' in w and datainit:
            
            # Std
            data_std = h.std(axis=(0,2,3))
            num_zeros = (data_std.tag.test_value == 0).sum()
            if num_zeros > 0:
                print "Warning: Stdev=0 for "+str(num_zeros)+" features in "+name+". Skipping data-dependent init."
            else:
                
                std = (1./init_stdev) * data_std
                std += 1e-7
                
                if name+'_s' in w:
                    if logscale:
                        w[name+'_s'].set_value(-T.log(std).tag.test_value/logscale_scale)
                    else:
                        w[name+'_s'].set_value((1./std).tag.test_value)
                elif do_constant_rescale:
                    constant_rescale.set_value((1./std).tag.test_value)
                
                h /= std.dimshuffle('x',0,'x','x')
                
                # Mean
                mean = h.mean(axis=(0,2,3))
                w[name+'_b'].set_value(-mean.tag.test_value)
                h -= mean.dimshuffle('x',0,'x','x')
            
                #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()
        
        if upsample>1:
            h = depool2d_split(h, factor=upsample)
        
        if not '__init' in w:
            output_shape = h.tag.test_value.shape[1:]
            print 'conv2d', name, input_shape, output_shape, size_kernel, pad_channel, border_mode, downsample, upsample
        
        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        
        return h
    
    # Normalize weights to _norm L2 norm
    # TODO: check whether only_upper_bounds here really helps
    # (the effect is a higher learning rate in the beginning of training)
    def postup(updates, w):
        if l2norm and maxweight>0.:
            updates[w[name_w+'_w']] = maxconstraint(updates[w[name_w+'_w']])
        return updates
    
    return G.Struct(__call__=f, w=w, postup=postup)
Example #27
def randorth(shape):
    from scipy.linalg import sqrtm, inv
    assert len(shape) == 2
    w = np.random.normal(0, size=shape)
    w = w.dot(inv(sqrtm(w.T.dot(w))))
    return G.sharedf(w)
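
A quick check of this orthogonal initializer: projecting a random Gaussian matrix with (W^T W)^(-1/2) yields approximately orthonormal columns. The snippet below mirrors the function without the G.sharedf wrapper; the shape is arbitrary:

import numpy as np
from scipy.linalg import sqrtm, inv

w = np.random.normal(0, size=(64, 64))
w = w.dot(inv(sqrtm(w.T.dot(w))))                         # same projection as randorth
print(np.allclose(w.T.dot(w), np.eye(64), atol=1e-6))     # True up to numerical error
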
Example #28
def linear_l2(name, n_in, n_out, w):
    
    # L2 normalization of weights
    def l2normalize(_w):
        targetnorm=1.
        norm = T.sqrt((_w**2).sum(axis=0, keepdims=True))
        return _w * (targetnorm / norm)
    def maxconstraint(_w):
        return _w * (maxweight / T.maximum(maxweight, abs(_w).max(axis=0, keepdims=True)))
    
    w[name+'_w'] = G.sharedf(0.05*np.random.randn(n_in,n_out))
    
    if maxweight > 0:
        w[name+'_w'].set_value(maxconstraint(w[name+'_w']).tag.test_value)
    w[name+'_b'] = G.sharedf(np.zeros((n_out,)))
    if l2norm:
        if logscale:
            w[name+'_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name+'_s'] = G.sharedf(np.ones((n_out,)))
    else:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.zeros((n_out,)))
    
    
    def f(h, w):
        _w = w[name+'_w']
        if l2norm:
            _w = l2normalize(_w)
        h = T.dot(h, _w)
        if l2norm:
            if logscale:
                h *= T.exp(logscale_scale*w[name+'_s'])
            else:
                h *= abs(w[name+'_s'])
        else:
            h *= T.exp(constant_rescale)
        h += w[name+'_b']
        
        if '__init' in w:
            # Std
            std = (1./init_stdev) * h.std(axis=0) + 1e-8
            if name+'_s' in w:
                if logscale:
                    w[name+'_s'].set_value(-T.log(std).tag.test_value/logscale_scale)
                else:
                    w[name+'_s'].set_value((1./std).tag.test_value)
            else:
                constant_rescale.set_value(-T.log(std).tag.test_value)
                #w[name+'_w'].set_value((_w / std.dimshuffle('x',0)).tag.test_value)
            
            h /= std.dimshuffle('x',0)
            
            # Mean
            mean = h.mean(axis=0)
            w[name+'_b'].set_value(-mean.tag.test_value)
            h -= mean.dimshuffle('x',0)
            
            #print name, abs(w[name+'_w']).get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        
        return h
    
    # Post updates: normalize weights to unit L2 norm
    def postup(updates, w):
        if l2norm and maxweight>0:
            updates[w[name+'_w']] = maxconstraint(updates[w[name+'_w']])
        return updates
    
    return G.Struct(__call__=f, postup=postup, w=w)
Example #29
def linear(name, n_in, n_out, diagonalzeros, l2norm=True, w={}):
    assert n_in % n_out == 0 or n_out % n_in == 0

    mask = np.ones((n_in, n_out), dtype=G.floatX)
    if n_out >= n_in:
        k = n_out / n_in
        for i in range(n_in):
            mask[i + 1:, i * k:(i + 1) * k] = 0
            if diagonalzeros:
                mask[i:i + 1, i * k:(i + 1) * k] = 0
    else:
        k = n_in / n_out
        for i in range(n_out):
            mask[(i + 1) * k:, i:i + 1] = 0
            if diagonalzeros:
                mask[i * k:(i + 1) * k:, i:i + 1] = 0

    # L2 normalization of weights
    def l2normalize(_w, axis=0):
        if diagonalzeros:
            # to prevent NaN gradients
            # TODO: smarter solution (also see below)
            if n_out >= n_in:
                _w = T.set_subtensor(_w[:, :n_out / n_in], 0.)
            else:
                _w = T.set_subtensor(_w[:, :1], 0.)
        targetnorm = 1.
        norm = T.sqrt((_w**2).sum(axis=axis, keepdims=True))
        norm += 1e-8
        new_w = _w * (targetnorm / norm)
        return new_w

    def maxconstraint(_w):
        return _w * (maxweight / T.maximum(maxweight,
                                           abs(_w).max(axis=0, keepdims=True)))

    w[name + '_w'] = G.sharedf(mask * 0.05 * np.random.randn(n_in, n_out))
    if maxweight > 0:
        w[name + '_w'].set_value(maxconstraint(w[name + '_w']).tag.test_value)

    w[name + '_b'] = G.sharedf(np.zeros((n_out, )))
    if l2norm:
        if logscale:
            w[name + '_s'] = G.sharedf(np.zeros((n_out, )))
        else:
            w[name + '_s'] = G.sharedf(np.ones((n_out, )))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights arent saved'
        constant_rescale = G.sharedf(np.zeros((n_out, )))

    def f(h, w):
        _input = h
        _w = mask * w[name + '_w']
        if l2norm:
            _w = l2normalize(_w)
        h = T.dot(h, _w)
        if l2norm:
            if logscale:
                h *= T.exp(logscale_scale * w[name + '_s'])
            else:
                h *= abs(w[name + '_s'])
        elif do_constant_rescale:
            h *= T.exp(constant_rescale)

        h += w[name + '_b']

        if '__init' in w:
            # Std
            std = (1. / init_stdev) * h.std(axis=0)
            std += (std <= 0)
            std += 1e-8
            if name + '_s' in w:
                if logscale:
                    w[name + '_s'].set_value(-T.log(std).tag.test_value /
                                             logscale_scale)
                else:
                    w[name + '_s'].set_value((1. / std).tag.test_value)
            elif do_constant_rescale:
                constant_rescale.set_value(-T.log(std).tag.test_value)
                #w[name+'_w'].set_value((_w / std.dimshuffle('x',0)).tag.test_value)

            h /= std.dimshuffle('x', 0)

            # Mean
            mean = h.mean(axis=0)
            w[name + '_b'].set_value(-mean.tag.test_value)
            h -= mean.dimshuffle('x', 0)

            #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)

        return h

    # Post updates: normalize weights to unit L2 norm
    def postup(updates, w):
        updates[w[name + '_w']] = mask * updates[w[name + '_w']]
        if l2norm and maxweight > 0.:
            updates[w[name + '_w']] = maxconstraint(updates[w[name + '_w']])
        return updates

    return G.Struct(__call__=f, postup=postup, w=w)
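
The mask built at the top of this layer gives it a MADE-style autoregressive connectivity pattern: output block i only sees inputs up to i (strictly below i when diagonalzeros is set). A standalone NumPy sketch of just the mask construction, mirroring the snippet:

import numpy as np

def ar_mask(n_in, n_out, diagonalzeros):
    mask = np.ones((n_in, n_out), dtype='float32')
    if n_out >= n_in:
        k = n_out // n_in
        for i in range(n_in):
            mask[i + 1:, i * k:(i + 1) * k] = 0          # outputs in block i ignore inputs > i
            if diagonalzeros:
                mask[i:i + 1, i * k:(i + 1) * k] = 0     # ... and input i itself
    else:
        k = n_in // n_out
        for i in range(n_out):
            mask[(i + 1) * k:, i:i + 1] = 0
            if diagonalzeros:
                mask[i * k:(i + 1) * k, i:i + 1] = 0
    return mask

print(ar_mask(4, 4, diagonalzeros=True))   # strictly upper-triangular (rows = inputs, cols = outputs)
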
Example #30
File: models.py Project: openai/iaf
def cvae1(shape_x, depths, depth_ar, n_h1, n_h2, n_z, prior='diag', posterior='down_diag', px='logistic', nl='softplus', kernel_x=(5,5), kernel_h=(3,3), kl_min=0, optim='adamax', alpha=0.002, beta1=0.1, beta2=0.001, weightsharing=None, pad_x = 0, data_init=None, downsample_type='nn'):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE1 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic','bernoulli']
    w = {} # model params
    if pad_x > 0:
        shape_x[1] += 2*pad_x
        shape_x[2] += 2*pad_x
    
    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)
    
    # encoder
    x_enc = N.conv.conv2d('x_enc', shape_x[0], n_h1, kernel_x, downsample=2, w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, shape_x[0], kernel_x, upsample=2, w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)
    
    layers = []
    for i in range(len(depths)):
        layers.append([])
        for j in range(depths[i]):
            downsample = (i > 0 and j == 0)
            if weightsharing is None or not weightsharing:
                name = str(i)+'_'+str(j)
            elif weightsharing == 'all':
                name = '[sharedw]'+str(i)+'_'+str(j)+'[/sharedw]'
            elif weightsharing == 'acrosslevels':
                name = '[sharedw]'+str(i)+'[/sharedw]'+'_'+str(j)
            elif weightsharing == 'withinlevel':
                name = '[sharedw]'+str(i)+'[/sharedw]'+'_'+str(j)
            else:
                raise Exception()
            layers[i].append(cvae_layer(name, prior, posterior, n_h1, n_h2, n_z, depth_ar, downsample, nl, kernel_h, False, downsample_type, w))
    
    # top-level value
    w['h_top'] = G.sharedf(np.zeros((n_h1,)))
    
    # Initialize variables
    x = T.tensor4('x', dtype='uint8')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip((x + .5) / 256., 0, 1)
    #_x = T.clip(x / 255., 0, 1)
    
    if pad_x > 0:
        _x = N.conv.pad2d(_x, pad_x)
    
    # Objective function
    def f_encode_decode(w, train=True):
        
        results = {}
        
        h = x_enc(_x - .5, w)
        
        obj_kl = G.sharedf(0.)
        
        # bottom-up encoders
        for i in range(len(depths)):
            for j in range(depths[i]):
                h = layers[i][j].up(h, w)
        
        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (_x.shape[0],1,shape_x[1]/2**len(depths), shape_x[2]/2**len(depths)))
        
        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h, kl = layers[i][j].down_q(h, train, w)
                kl_sum = kl.sum(axis=(1,2,3))
                results['cost_z'+str(i).zfill(3)+'_'+str(j).zfill(3)] = kl_sum
                # Constraint: Minimum number of bits per featuremap, averaged across minibatch
                if kl_min > 0:
                    if True:
                        kl = kl.sum(axis=(2,3)).mean(axis=0,dtype=G.floatX)
                        obj_kl += T.maximum(np.asarray(kl_min,G.floatX), kl).sum(dtype=G.floatX)
                    else:
                        kl = T.maximum(np.asarray(kl_min,G.floatX), kl.sum(axis=(2,3))).sum(axis=1,dtype=G.floatX)
                        obj_kl += kl
                else:
                    obj_kl += kl_sum
        
        output = .1 * x_dec(x_dec_nl(h, w), w)
        
        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output+.5, 0+1/512., 1-1/512.)
            logsd_x = 0*mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1/256., _x).logp
            #obj_z = T.printing.Print('obj_z')(obj_z)
            obj = obj_logpx - obj_kl
            # Compute the bits per pixel
            obj *= (1./np.prod(shape_x) * 1./np.log(2.)).astype('float32')
            
            #if not '__init' in w:
            #    raise Exception()
        
        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.maximum(T.minimum(prob_x, 1-1e-7), 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)
            obj_logpx = N.rand.bernoulli(prob_x, _x).logp
            
            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpx - obj_kl
            #obj = T.printing.Print('obj')(obj)
        
        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results

    # Turns Gaussian noise 'eps' into a sample 
    def f_decoder(eps, w):
        
        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (eps['eps_0_0'].shape[0],1,shape_x[1]/2**len(depths), shape_x[2]/2**len(depths)))
        
        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h = layers[i][j].down_p(h, eps['eps_'+str(i)+'_'+str(j)], w)
        
        output = .1 * x_dec(x_dec_nl(h, w), w)
        
        if px == 'logistic':
            mean_x = T.clip(output+.5, 0+1/512., 1-1/512.)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)
        
        image = (256.*mean_x).astype('uint8')
        if pad_x > 0:
            image = image[:,:,pad_x:-pad_x,pad_x:-pad_x]
        
        return image
    
    def f_eps(n_batch, w):
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_'+str(i)+'_'+str(j)] = G.rng_curand.normal((n_batch,n_z,shape_x[1]/2**(i+1),shape_x[2]/2**(i+1)),dtype=floatX)
        return eps
            
    def postup(updates, w):
        nodes = [x_enc,x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(len(depths)):
            for j in range(depths[i]):
                updates = layers[i][j].postup(updates, w)
        
        return updates
    
    # Compile init function
    if data_init != None:
        w['__init'] = OrderedDict()
        f_encode_decode(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()
    
    # Compile training function
        
    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')
    
    
    w_avg = {i: G.sharedf(w[i].get_value()) for i in w}
    
    def lazy(f):
        def newf(*args, **kws):
            if not hasattr(f, 'cache'):
                f.cache = f()
            return f.cache(*args, **kws)
        return newf
    
    @lazy
    def f_train():
        if optim == 'adamax':
            train_cost = f_encode_decode(w)['cost']
            updates = G.misc.optim.AdaMaxAvg([w],[w_avg], train_cost, alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
        elif optim == 'eve':
            f = lambda w: f_encode_decode(w)['cost']
            train_cost, updates = G.misc.optim.Eve(w, w_avg, f, alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x':x}, train_cost, updates=updates, lazy=lazy)    

    @lazy
    def f_train_q():
        keys_q = []
        for i in w:
            if '_q_' in i: keys_q.append(i)
        train_cost = f_encode_decode(w)['cost']
        updates = G.misc.optim.AdaMaxAvg([w],None, train_cost, alpha=-alpha, beta1=beta1, beta2=beta2, update_keys=keys_q, disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x':x}, train_cost, updates=updates, lazy=lazy)    
    
    # Compile evaluation function
    @lazy
    def f_eval():
        results = f_encode_decode(w_avg, False)
        return G.function({'x':x}, results)
    
    # Compile epsilon generating function
    @lazy
    def f_eps_():
        n_batch = T.lscalar()
        n_batch.tag.test_value = 16
        eps = f_eps(n_batch, w)
        return G.function({'n_batch':n_batch}, eps, lazy=lazy)
    
    # Compile sampling function
    @lazy
    def f_decode():
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_'+str(i)+'_'+str(j)] = T.tensor4('eps'+str(i))
                eps['eps_'+str(i)+'_'+str(j)].tag.test_value = np.random.randn(n_batch_test,n_z,shape_x[1]/2**(i+1),shape_x[2]/2**(i+1)).astype(floatX)
        image = f_decoder(eps, w_avg)
        return G.function(eps, image, lazy=lazy)
    
    return G.Struct(train=f_train, eval=f_eval, decode=f_decode, eps=f_eps_, w=w, w_avg=w_avg)
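
The kl_min branch inside f_encode_decode implements a "free bits"-style constraint: the KL contribution of each feature map, averaged over the minibatch, is clamped from below at kl_min before it enters the objective, so there is no incentive to push a feature map's KL below kl_min. A NumPy sketch of just that clamp (illustrative names):

import numpy as np

def free_bits_kl(kl, kl_min):
    # kl: (batch, featuremaps, height, width) KL terms of one stochastic layer.
    per_map = kl.sum(axis=(2, 3)).mean(axis=0)   # average KL per feature map
    return np.maximum(kl_min, per_map).sum()     # clamp each map at kl_min, then sum
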
Example #31
def conv2d(name,
           n_in,
           n_out,
           size_kernel=(3, 3),
           zerodiagonal=True,
           flipmask=False,
           pad_channel=True,
           border_mode='valid',
           zeroinit=False,
           l2norm=True,
           w={}):

    do_scale = False
    if zeroinit:
        l2norm = False
        do_scale = True

    if not pad_channel:
        border_mode = 'same'
        print 'No pad_channel, changing border_mode to same'

    #if 'whitener' not in name:
    #    pad_channel = False
    #    border_mode = 'same'

    if '[sharedw]' in name and '[/sharedw]' in name:
        name_w = name
        pre, b = name.split("[sharedw]")
        c, post = b.split("[/sharedw]")
        name_w = pre + "[s]" + post
        name = pre + c + post  # Don't share the bias and scales
        #name = name_w # Also share the bias and scales
    else:
        name_w = name

    assert border_mode in ['valid', 'full', 'same']

    _n_in = n_in

    if pad_channel:
        if size_kernel[0] > 1 or size_kernel[1] > 1:
            assert size_kernel[0] == size_kernel[1]
            assert border_mode == 'valid'
            _n_in += 1
        else:
            pad_channel = False

    if border_mode == 'same':
        assert size_kernel[0] % 2 == 1
        border_mode = ((size_kernel[0] - 1) / 2, (size_kernel[1] - 1) / 2)

    if True:
        # Build autoregressive mask
        l = (size_kernel[0] - 1) / 2
        m = (size_kernel[1] - 1) / 2
        mask = np.ones((n_out, _n_in, size_kernel[0], size_kernel[1]),
                       dtype=G.floatX)
        mask[:, :, :l, :] = 0
        mask[:, :, l, :m] = 0

        if n_out >= n_in:
            assert n_out % n_in == 0
            k = n_out / n_in
            for i in range(n_in):
                mask[i * k:(i + 1) * k, i + 1:, l, m] = 0
                if zerodiagonal:
                    mask[i * k:(i + 1) * k, i:i + 1, l, m] = 0
        else:
            assert n_in % n_out == 0
            k = n_in / n_out
            for i in range(n_out):
                mask[i:i + 1, (i + 1) * k:, l, m] = 0
                if zerodiagonal:
                    mask[i:i + 1, i * k:(i + 1) * k:, l, m] = 0
        if flipmask:
            mask = mask[::-1, ::-1, ::-1, ::-1]

    def l2normalize(kerns):
        if zerodiagonal:
            # to prevent NaN gradients
            # TODO: smarter solution (also see below)
            l = (size_kernel[0] - 1) / 2
            m = (size_kernel[1] - 1) / 2
            if n_out >= n_in:
                kerns = T.set_subtensor(kerns[:n_out / n_in, :, l, m], 0.)
            else:
                kerns = T.set_subtensor(kerns[:1, :, l, m], 0.)

        targetnorm = 1.
        norm = T.sqrt((kerns**2).sum(axis=(1, 2, 3), keepdims=True))
        norm += 1e-8
        return kerns * (targetnorm / norm)

    def maxconstraint(kerns):
        return kerns * (maxweight / T.maximum(
            maxweight,
            abs(kerns).max(axis=(1, 2, 3), keepdims=True)))

    if zeroinit:
        w[name_w + '_w'] = G.sharedf(
            np.zeros((n_out, _n_in, size_kernel[0], size_kernel[1])))
    else:
        w[name_w + '_w'] = G.sharedf(
            mask * 0.05 *
            np.random.randn(n_out, _n_in, size_kernel[0], size_kernel[1]))
        if maxweight > 0:
            w[name_w + '_w'].set_value(
                maxconstraint(w[name_w + '_w']).tag.test_value)

    w[name + '_b'] = G.sharedf(np.zeros((n_out, )))

    if l2norm or do_scale:
        if logscale:
            w[name + '_s'] = G.sharedf(np.zeros((n_out, )))
        else:
            w[name + '_s'] = G.sharedf(np.ones((n_out, )))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights are not saved'
        constant_rescale = G.sharedf(np.ones((n_out, )))

    def f(h, w):
        input_shape = h.tag.test_value.shape[1:]

        _input = h

        if pad_channel:
            h = N.conv.pad2dwithchannel(h, size_kernel)

        kerns = mask * w[name_w + '_w']
        if l2norm:
            kerns = l2normalize(kerns)
        if l2norm or do_scale:
            if logscale:
                kerns *= T.exp(logscale_scale * w[name + '_s']).dimshuffle(
                    0, 'x', 'x', 'x')
            else:
                kerns *= w[name + '_s'].dimshuffle(0, 'x', 'x', 'x')
        elif do_constant_rescale:
            kerns *= constant_rescale.dimshuffle(0, 'x', 'x', 'x')

        h = N.conv.dnn_conv(h, kerns, border_mode=border_mode)

        # Center
        if bn:  # mean-only batch norm
            h -= h.mean(axis=(0, 2, 3), keepdims=True)

        h += w[name + '_b'].dimshuffle('x', 0, 'x', 'x')

        if '__init' in w and not zeroinit:

            # Std
            data_std = h.std(axis=(0, 2, 3))
            num_zeros = (data_std.tag.test_value == 0).sum()
            if num_zeros > 0:
                print "Warning: Stdev=0 for " + str(
                    num_zeros
                ) + " features in " + name + ". Skipping data-dependent init."
            else:
                if name + '_s' in w:
                    if logscale:
                        w[name + '_s'].set_value(
                            -T.log(data_std).tag.test_value / logscale_scale)
                    else:
                        w[name + '_s'].set_value(
                            (1. / data_std).tag.test_value)
                elif do_constant_rescale:
                    constant_rescale.set_value((1. / data_std).tag.test_value)
                    #w[name+'_w'].set_value((kerns / std.dimshuffle(0,'x','x','x')).tag.test_value)

                h /= data_std.dimshuffle('x', 0, 'x', 'x')

                # Mean
                mean = h.mean(axis=(0, 2, 3))
                w[name + '_b'].set_value(-mean.tag.test_value)
                h -= mean.dimshuffle('x', 0, 'x', 'x')

            #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()

        if not '__init' in w:
            output_shape = h.tag.test_value.shape[1:]
            print 'ar.conv2d', name, input_shape, output_shape, size_kernel, zerodiagonal, flipmask, pad_channel, border_mode, zeroinit, l2norm

        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)

        return h

    # Normalize weights to _norm L2 norm
    # TODO: check whether only_upper_bounds here really helps
    # (the effect is a higher learning rate in the beginning of training)
    def postup(updates, w):
        updates[w[name_w + '_w']] = mask * updates[w[name_w + '_w']]
        if l2norm and maxweight > 0.:
            updates[w[name_w + '_w']] = maxconstraint(updates[w[name_w +
                                                                '_w']])
        return updates

    return G.Struct(__call__=f, w=w, postup=postup)
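
The heart of the conv2d helper above is the autoregressive kernel mask: spatial taps strictly before the centre (the rows above it, and the columns to its left within the centre row) are zeroed, and at the centre tap output channel block i is cut off from input channels after i (and from channel i itself when zerodiagonal is on). A standalone numpy sketch of the same construction, my own illustration rather than the repo's API, restricted to the case where n_out is a multiple of n_in:

import numpy as np

def ar_conv_mask(n_in, n_out, size_kernel=(3, 3), zerodiagonal=True):
    l = (size_kernel[0] - 1) // 2
    m = (size_kernel[1] - 1) // 2
    mask = np.ones((n_out, n_in, size_kernel[0], size_kernel[1]), dtype='float32')
    # Spatial part: zero the rows before the centre row and, within the
    # centre row, the taps before the centre column.
    mask[:, :, :l, :] = 0
    mask[:, :, l, :m] = 0
    # Channel part (centre tap only): output block i may not see input
    # channels after i, nor channel i itself when zerodiagonal is set.
    assert n_out % n_in == 0
    k = n_out // n_in
    for i in range(n_in):
        mask[i * k:(i + 1) * k, i + 1:, l, m] = 0
        if zerodiagonal:
            mask[i * k:(i + 1) * k, i:i + 1, l, m] = 0
    return mask

# Centre-tap view for 2 -> 4 channels: block 0 sees nothing,
# block 1 sees only input channel 0.
print(ar_conv_mask(2, 4)[:, :, 1, 1])
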
Exemplo n.º 32
0
def fcvae(shape_x,
          depth_model,
          depth_ar,
          n_h1,
          n_h2,
          n_z,
          posterior,
          px='logistic',
          nl='softplus',
          alpha=0.002,
          beta1=0.1,
          beta2=0.001,
          share_w=False,
          data_init=None):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE9 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic', 'bernoulli']
    w = {}  # model params

    kernel_h = (1, 1)
    n_x = shape_x[0] * shape_x[1] * shape_x[2]

    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)

    # encoder
    x_enc = N.conv.conv2d('x_enc', n_x, n_h1, (1, 1), w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, n_x, (1, 1), w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)

    layers = []
    for i in range(depth_model):
        name = str(i)
        if share_w:
            name = '[sharedw]' + str(i) + '[/sharedw]'
        layers.append(
            cvae_layer(name, posterior, n_h1, n_h2, n_z, depth_ar, False, nl,
                       kernel_h, share_w, w))

    # top-level value
    #w['h_top'] = G.sharedf(np.zeros((n_h1,)))
    w['h_top'] = G.sharedf(np.random.normal(0, 0.01, size=(n_h1, )))

    # Initialize variables
    x = T.tensor4('x')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip(x / 255., 0, 1)

    # Objective function
    def f_cost(w, train=True):

        results = {}

        h = x_enc(_x.reshape((-1, n_x, 1, 1)) - .5, w)

        obj_logpz = 0
        obj_logqz = 0

        # bottom-up encoders
        for i in range(depth_model):
            h = layers[i].up(h, w)

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'),
                   (_x.shape[0], 1, 1, 1))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h, _obj_logqz, _obj_logpz = layers[i].down_q(h, train, w)
            obj_logqz += _obj_logqz
            obj_logpz += _obj_logpz
            results['cost_z' + str(i).zfill(3)] = _obj_logqz - _obj_logpz

        output = .1 * x_dec(x_dec_nl(h, w), w).reshape(
            (-1, shape_x[0], shape_x[1], shape_x[2]))

        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output, -.5, .5)
            logsd_x = 0 * mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1 / 255.,
                                                    _x - .5).logp

            obj = obj_logpz - obj_logqz + obj_logpx
            # Compute the bits per pixel
            obj *= (1. / np.prod(shape_x) * 1. / np.log(2.)).astype('float32')

        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.minimum(prob_x, 1 - 1e-7)
            prob_x = T.maximum(prob_x, 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)
            obj_logpx = N.rand.bernoulli(prob_x, _x).logp

            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpz - obj_logqz + obj_logpx
            #obj = T.printing.Print('obj')(obj)

        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results

        #print 'obj_logpz', obj_logpz.tag.test_value
        #print 'obj_logqz', obj_logqz.tag.test_value
        #print 'obj_logpx', obj_x.tag.test_value
        #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
        #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
        #obj_x = T.printing.Print('obj_logpx')(obj_x)

    # Turns Gaussian noise 'eps' into a sample
    def f_decoder(eps, w):

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'),
                   (eps['eps_0'].shape[0], 1, 1, 1))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h = layers[i].down_p(h, eps['eps_' + str(i)], w)

        output = .1 * x_dec(x_dec_nl(h, w), w).reshape(
            (-1, shape_x[0], shape_x[1], shape_x[2]))
        if px == 'logistic':
            mean_x = T.clip(output[:, :, :, :] + .5, 0, 1)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)
        image = (255. * T.clip(mean_x, 0, 1)).astype('uint8')
        return image

    def f_eps(n_batch, w):
        eps = {}
        for i in range(depth_model):
            eps['eps_' + str(i)] = G.rng_curand.normal((n_batch, n_z, 1, 1),
                                                       dtype=floatX)
        return eps

    def postup(updates, w):
        nodes = [x_enc, x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(depth_model):
            updates = layers[i].postup(updates, w)

        return updates

    # Compile init function
    if data_init is not None:
        w['__init'] = OrderedDict()
        f_cost(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()

    # Compile training function
    results = f_cost(w)
    updates, (w_avg, ) = G.misc.optim.AdaMaxAvg([w],
                                                results['cost'],
                                                alpha=-alpha,
                                                beta1=beta1,
                                                beta2=beta2,
                                                disconnected_inputs='ignore')
    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')

    updates = postup(updates, w)
    f_train = G.function({'x': x}, results['cost'], updates=updates)

    # Compile evaluation function
    results = f_cost(w_avg, False)
    f_eval = G.function({'x': x}, results)

    # Compile epsilon generating function
    n_batch = T.lscalar()
    n_batch.tag.test_value = 16
    eps = f_eps(n_batch, w)
    f_eps = G.function({'n_batch': n_batch}, eps)

    # Compile sampling function
    eps = {}
    for i in range(depth_model):
        eps['eps_' + str(i)] = T.tensor4('eps' + str(i))
        eps['eps_' + str(i)].tag.test_value = np.random.randn(
            n_batch_test, n_z, 1, 1).astype(floatX)
    image = f_decoder(eps, w_avg)
    f_decode = G.function(eps, image)

    return G.Struct(train=f_train,
                    eval=f_eval,
                    decode=f_decode,
                    eps=f_eps,
                    w=w,
                    w_avg=w_avg)
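
With the logistic likelihood, the objective above is the ELBO obj_logpz - obj_logqz + obj_logpx in nats, rescaled so that the returned cost is measured in bits per input dimension. A minimal sketch of that conversion with made-up numbers (numpy only, not repo code):

import numpy as np

# Converting a per-example ELBO in nats into bits per input dimension,
# mirroring the factor 1. / np.prod(shape_x) / np.log(2.) used above.
shape_x = (3, 32, 32)                       # e.g. CIFAR-10-sized inputs (assumption)
elbo_nats = np.array([-8000., -7500.])      # hypothetical per-example ELBOs
bits_per_dim = -elbo_nats / (np.prod(shape_x) * np.log(2.))
print(bits_per_dim)                         # roughly [3.76, 3.52]
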
Exemplo n.º 33
0
    def f_encode_decode(w, train=True):

        results = {}

        h = x_enc(_x - .5, w)

        obj_kl = G.sharedf(0.)

        # bottom-up encoders
        for i in range(len(depths)):
            for j in range(depths[i]):
                h = layers[i][j].up(h, w)

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'),
                   (_x.shape[0], 1, shape_x[1] / 2**len(depths),
                    shape_x[2] / 2**len(depths)))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h, kl = layers[i][j].down_q(h, train, w)
                kl_sum = kl.sum(axis=(1, 2, 3))
                results['cost_z' + str(i).zfill(3) + '_' +
                        str(j).zfill(3)] = kl_sum
                # Constraint: Minimum number of bits per featuremap, averaged across minibatch
                if kl_min > 0:
                    if True:
                        kl = kl.sum(axis=(2, 3)).mean(axis=0, dtype=G.floatX)
                        obj_kl += T.maximum(np.asarray(kl_min, G.floatX),
                                            kl).sum(dtype=G.floatX)
                    else:
                        kl = T.maximum(np.asarray(kl_min, G.floatX),
                                       kl.sum(axis=(2, 3))).sum(axis=1,
                                                                dtype=G.floatX)
                        obj_kl += kl
                else:
                    obj_kl += kl_sum

        output = .1 * x_dec(x_dec_nl(h, w), w)

        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output + .5, 0 + 1 / 512., 1 - 1 / 512.)
            logsd_x = 0 * mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1 / 256.,
                                                    _x).logp
            #obj_z = T.printing.Print('obj_z')(obj_z)
            obj = obj_logpx - obj_kl
            # Compute the bits per pixel
            obj *= (1. / np.prod(shape_x) * 1. / np.log(2.)).astype('float32')

            #if not '__init' in w:
            #    raise Exception()

        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.maximum(T.minimum(prob_x, 1 - 1e-7), 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)
            obj_logpx = N.rand.bernoulli(prob_x, _x).logp

            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpx - obj_kl
            #obj = T.printing.Print('obj')(obj)

        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results
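
This f_encode_decode is the inner objective of cvae1 further below and relies on its enclosing scope for x_enc, x_dec, layers, depths, kl_min and the rest. The kl_min > 0 branch is a free-bits style constraint: the KL of each feature map, averaged over the minibatch, is floored at kl_min before it enters the objective. A standalone numpy sketch of just that clipping (my own, not repo code):

import numpy as np

# kl has shape (batch, featuremaps, height, width), like the per-unit KL above.
def free_bits_kl(kl, kl_min):
    # Sum KL over spatial positions, average over the minibatch, then keep
    # each feature map from contributing less than kl_min to the objective.
    per_featuremap = kl.sum(axis=(2, 3)).mean(axis=0)      # (featuremaps,)
    return np.maximum(kl_min, per_featuremap).sum()

kl = np.abs(np.random.randn(16, 8, 4, 4)) * 0.01           # mostly tiny KL terms
print(free_bits_kl(kl, kl_min=0.25))                        # floored at 8 * 0.25 = 2.0
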
Exemplo n.º 34
0
Arquivo: rand.py Projeto: gburt/iaf
def zero_centered_laplace(name, w={}):
    w[name+'_logscale'] = G.sharedf(0.)
    def logp(v, w):
        return -abs(v).sum()/T.exp(w[name+'_logscale']) - v.size.astype(G.floatX) * (T.log(2.) + w[name+'_logscale'])
    postup = lambda updates, w:updates
    return G.Struct(logp=logp, postup=postup, w=w)
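
The prior here is a zero-centred Laplace with scale b = exp(logscale): per element the log-density is -|v|/b - log(2b), and logp sums this over all elements of v. A small numpy check of that closed form (a sketch, not repo code):

import numpy as np

# Closed form behind logp above: for a zero-centred Laplace with
# scale b = exp(logscale), log p(v) = sum_i [ -|v_i| / b - log(2 * b) ].
def laplace_logp(v, logscale=0.0):
    b = np.exp(logscale)
    return -np.abs(v).sum() / b - v.size * (np.log(2.) + logscale)

v = np.random.randn(5, 3)
print(laplace_logp(v))      # with logscale=0 this equals -sum|v| - 15 * log(2)
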
Exemplo n.º 35
0
Arquivo: ar.py Projeto: gburt/iaf
def conv2d(name, n_in, n_out, size_kernel=(3,3), zerodiagonal=True, flipmask=False, pad_channel=True, border_mode='valid', zeroinit=False, l2norm=True, w={}):
    
    do_scale = False
    if zeroinit:
        l2norm = False
        do_scale = True
    
    if not pad_channel:
        border_mode = 'same'
        print 'No pad_channel, changing border_mode to same'
        
    #if 'whitener' not in name:
    #    pad_channel = False
    #    border_mode = 'same'
    
    if '[sharedw]' in name and '[/sharedw]' in name:
        name_w = name
        pre, b = name.split("[sharedw]")
        c, post = b.split("[/sharedw]")
        name_w = pre+"[s]"+post
        name = pre+c+post # Don't share the bias and scales
        #name = name_w # Also share the bias and scales
    else:
        name_w = name
    
    assert border_mode in ['valid','full','same']
    
    _n_in = n_in
    
    if pad_channel:
        if size_kernel[0] > 1 or size_kernel[1] > 1:
            assert size_kernel[0] == size_kernel[1]
            assert border_mode == 'valid'
            _n_in += 1
        else:
            pad_channel = False
    
    if border_mode == 'same':
        assert size_kernel[0]%2 == 1
        border_mode = ((size_kernel[0]-1)/2,(size_kernel[1]-1)/2)
    
    if True:
        # Build autoregressive mask
        l = (size_kernel[0]-1)/2
        m = (size_kernel[1]-1)/2
        mask = np.ones((n_out, _n_in, size_kernel[0], size_kernel[1]),dtype=G.floatX)
        mask[:,:,:l,:] = 0
        mask[:,:,l,:m] = 0
        
        if n_out >= n_in:
            assert n_out%n_in == 0
            k = n_out / n_in
            for i in range(n_in):
                mask[i*k:(i+1)*k,i+1:,l,m] = 0
                if zerodiagonal:
                    mask[i*k:(i+1)*k,i:i+1,l,m] = 0
        else:
            assert n_in%n_out == 0
            k = n_in / n_out
            for i in range(n_out):
                mask[i:i+1,(i+1)*k:,l,m] = 0
                if zerodiagonal:
                    mask[i:i+1,i*k:(i+1)*k,l,m] = 0
        if flipmask:
            mask = mask[::-1,::-1,::-1,::-1]
    
    
    def l2normalize(kerns):
        if zerodiagonal:
            # to prevent NaN gradients
            # TODO: smarter solution (also see below)
            l = (size_kernel[0]-1)/2
            m = (size_kernel[1]-1)/2
            if n_out >= n_in:
                kerns = T.set_subtensor(kerns[:n_out/n_in,:,l,m], 0.)
            else:
                kerns = T.set_subtensor(kerns[:1,:,l,m], 0.)
        
        targetnorm = 1.
        norm = T.sqrt((kerns**2).sum(axis=(1,2,3), keepdims=True))
        norm += 1e-8
        return kerns * (targetnorm / norm)
    def maxconstraint(kerns):
        return kerns * (maxweight / T.maximum(maxweight, abs(kerns).max(axis=(1,2,3), keepdims=True)))

    if zeroinit:
        w[name_w+'_w'] = G.sharedf(np.zeros((n_out, _n_in, size_kernel[0], size_kernel[1])))
    else:
        w[name_w+'_w'] = G.sharedf(mask * 0.05*np.random.randn(n_out, _n_in, size_kernel[0], size_kernel[1]))
        if maxweight > 0:
            w[name_w+'_w'].set_value(maxconstraint(w[name_w+'_w']).tag.test_value)
    
    w[name+'_b'] = G.sharedf(np.zeros((n_out,)))

    if l2norm or do_scale:
        if logscale:
            w[name+'_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name+'_s'] = G.sharedf(np.ones((n_out,)))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights are not saved'
        constant_rescale = G.sharedf(np.ones((n_out,)))
    
    
    def f(h, w):
        input_shape = h.tag.test_value.shape[1:]
        
        _input = h
        
        if pad_channel:
            h = N.conv.pad2dwithchannel(h, size_kernel)
        
        kerns = mask * w[name_w+'_w']
        if l2norm:
            kerns = l2normalize(kerns)
        if l2norm or do_scale:
            if logscale:
                kerns *= T.exp(logscale_scale*w[name+'_s']).dimshuffle(0,'x','x','x')
            else:
                kerns *= w[name+'_s'].dimshuffle(0,'x','x','x')
        elif do_constant_rescale:
            kerns *= constant_rescale.dimshuffle(0,'x','x','x')
        
        h = N.conv.dnn_conv(h, kerns, border_mode=border_mode)
        
        # Center
        if bn: # mean-only batch norm
            h -= h.mean(axis=(0,2,3), keepdims=True)
        
        h += w[name+'_b'].dimshuffle('x',0,'x','x')
        
        if '__init' in w and not zeroinit:
            
            # Std
            data_std = h.std(axis=(0,2,3))
            num_zeros = (data_std.tag.test_value == 0).sum()
            if num_zeros > 0:
                print "Warning: Stdev=0 for "+str(num_zeros)+" features in "+name+". Skipping data-dependent init."
            else:
                if name+'_s' in w:
                    if logscale:
                        w[name+'_s'].set_value(-T.log(data_std).tag.test_value/logscale_scale)
                    else:
                        w[name+'_s'].set_value((1./data_std).tag.test_value)
                elif do_constant_rescale:
                    constant_rescale.set_value((1./data_std).tag.test_value)
                    #w[name+'_w'].set_value((kerns / std.dimshuffle(0,'x','x','x')).tag.test_value)
                
                h /= data_std.dimshuffle('x',0,'x','x')
                
                # Mean
                mean = h.mean(axis=(0,2,3))
                w[name+'_b'].set_value(-mean.tag.test_value)
                h -= mean.dimshuffle('x',0,'x','x')
                
            #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()
        
        if not '__init' in w:
            output_shape = h.tag.test_value.shape[1:]
            print 'ar.conv2d', name, input_shape, output_shape, size_kernel, zerodiagonal, flipmask, pad_channel, border_mode, zeroinit, l2norm
        
        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        
        return h
    
    # Normalize weights to _norm L2 norm
    # TODO: check whether only_upper_bounds here really helps
    # (the effect is a higher learning rate in the beginning of training)
    def postup(updates, w):
        updates[w[name_w+'_w']] = mask * updates[w[name_w+'_w']]
        if l2norm and maxweight>0.:
            updates[w[name_w+'_w']] = maxconstraint(updates[w[name_w+'_w']])
        return updates
    
    return G.Struct(__call__=f, w=w, postup=postup)
Exemplo n.º 36
0
def randorth(shape):
    from scipy.linalg import sqrtm, inv
    assert len(shape) == 2
    w = np.random.normal(0, size=shape)
    w = w.dot(inv(sqrtm(w.T.dot(w))))
    return G.sharedf(w)
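
randorth performs symmetric orthogonalization: a Gaussian matrix w is mapped to w.dot(inv(sqrtm(w.T.dot(w)))), which has orthonormal columns. A quick numpy check (my own sketch):

import numpy as np
from scipy.linalg import sqrtm, inv

# w (w^T w)^(-1/2) should have orthonormal columns: w_orth.T @ w_orth == I.
w = np.random.normal(0, 1, size=(6, 4))
w_orth = w.dot(inv(sqrtm(w.T.dot(w))))
print(np.allclose(w_orth.T.dot(w_orth), np.eye(4), atol=1e-6))   # True
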
Exemplo n.º 37
0
def cvae1(shape_x,
          depths,
          depth_ar,
          n_h1,
          n_h2,
          n_z,
          prior='diag',
          posterior='down_diag',
          px='logistic',
          nl='softplus',
          kernel_x=(5, 5),
          kernel_h=(3, 3),
          kl_min=0,
          optim='adamax',
          alpha=0.002,
          beta1=0.1,
          beta2=0.001,
          weightsharing=None,
          pad_x=0,
          data_init=None,
          downsample_type='nn'):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE1 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic', 'bernoulli']
    w = {}  # model params
    if pad_x > 0:
        shape_x[1] += 2 * pad_x
        shape_x[2] += 2 * pad_x

    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)

    # encoder
    x_enc = N.conv.conv2d('x_enc',
                          shape_x[0],
                          n_h1,
                          kernel_x,
                          downsample=2,
                          w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, shape_x[0], kernel_x, upsample=2, w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)

    layers = []
    for i in range(len(depths)):
        layers.append([])
        for j in range(depths[i]):
            downsample = (i > 0 and j == 0)
            if weightsharing is None or not weightsharing:
                name = str(i) + '_' + str(j)
            elif weightsharing == 'all':
                name = '[sharedw]' + str(i) + '_' + str(j) + '[/sharedw]'
            elif weightsharing == 'acrosslevels':
                name = '[sharedw]' + str(i) + '[/sharedw]' + '_' + str(j)
            elif weightsharing == 'withinlevel':
                name = '[sharedw]' + str(i) + '[/sharedw]' + '_' + str(j)
            else:
                raise Exception()
            layers[i].append(
                cvae_layer(name, prior, posterior, n_h1, n_h2, n_z, depth_ar,
                           downsample, nl, kernel_h, False, downsample_type,
                           w))

    # top-level value
    w['h_top'] = G.sharedf(np.zeros((n_h1, )))

    # Initialize variables
    x = T.tensor4('x', dtype='uint8')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip((x + .5) / 256., 0, 1)
    #_x = T.clip(x / 255., 0, 1)

    if pad_x > 0:
        _x = N.conv.pad2d(_x, pad_x)

    # Objective function
    def f_encode_decode(w, train=True):

        results = {}

        h = x_enc(_x - .5, w)

        obj_kl = G.sharedf(0.)

        # bottom-up encoders
        for i in range(len(depths)):
            for j in range(depths[i]):
                h = layers[i][j].up(h, w)

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'),
                   (_x.shape[0], 1, shape_x[1] / 2**len(depths),
                    shape_x[2] / 2**len(depths)))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h, kl = layers[i][j].down_q(h, train, w)
                kl_sum = kl.sum(axis=(1, 2, 3))
                results['cost_z' + str(i).zfill(3) + '_' +
                        str(j).zfill(3)] = kl_sum
                # Constraint: Minimum number of bits per featuremap, averaged across minibatch
                if kl_min > 0:
                    if True:
                        kl = kl.sum(axis=(2, 3)).mean(axis=0, dtype=G.floatX)
                        obj_kl += T.maximum(np.asarray(kl_min, G.floatX),
                                            kl).sum(dtype=G.floatX)
                    else:
                        kl = T.maximum(np.asarray(kl_min, G.floatX),
                                       kl.sum(axis=(2, 3))).sum(axis=1,
                                                                dtype=G.floatX)
                        obj_kl += kl
                else:
                    obj_kl += kl_sum

        output = .1 * x_dec(x_dec_nl(h, w), w)

        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output + .5, 0 + 1 / 512., 1 - 1 / 512.)
            logsd_x = 0 * mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1 / 256.,
                                                    _x).logp
            #obj_z = T.printing.Print('obj_z')(obj_z)
            obj = obj_logpx - obj_kl
            # Compute the bits per pixel
            obj *= (1. / np.prod(shape_x) * 1. / np.log(2.)).astype('float32')

            #if not '__init' in w:
            #    raise Exception()

        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.maximum(T.minimum(prob_x, 1 - 1e-7), 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)
            obj_logpx = N.rand.bernoulli(prob_x, _x).logp

            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpx - obj_kl
            #obj = T.printing.Print('obj')(obj)

        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results

    # Turns Gaussian noise 'eps' into a sample
    def f_decoder(eps, w):

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x', 0, 'x', 'x'),
                   (eps['eps_0_0'].shape[0], 1, shape_x[1] / 2**len(depths),
                    shape_x[2] / 2**len(depths)))

        # top-down priors, posteriors and decoders
        for i in list(reversed(range(len(depths)))):
            for j in list(reversed(range(depths[i]))):
                h = layers[i][j].down_p(h, eps['eps_' + str(i) + '_' + str(j)],
                                        w)

        output = .1 * x_dec(x_dec_nl(h, w), w)

        if px == 'logistic':
            mean_x = T.clip(output + .5, 0 + 1 / 512., 1 - 1 / 512.)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)

        image = (256. * mean_x).astype('uint8')
        if pad_x > 0:
            image = image[:, :, pad_x:-pad_x, pad_x:-pad_x]

        return image

    def f_eps(n_batch, w):
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_' + str(i) + '_' + str(j)] = G.rng_curand.normal(
                    (n_batch, n_z, shape_x[1] / 2**(i + 1),
                     shape_x[2] / 2**(i + 1)),
                    dtype=floatX)
        return eps

    def postup(updates, w):
        nodes = [x_enc, x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(len(depths)):
            for j in range(depths[i]):
                updates = layers[i][j].postup(updates, w)

        return updates

    # Compile init function
    if data_init is not None:
        w['__init'] = OrderedDict()
        f_encode_decode(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()

    # Compile training function

    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')

    w_avg = {i: G.sharedf(w[i].get_value()) for i in w}

    def lazy(f):
        def newf(*args, **kws):
            if not hasattr(f, 'cache'):
                f.cache = f()
            return f.cache(*args, **kws)

        return newf

    @lazy
    def f_train():
        if optim == 'adamax':
            train_cost = f_encode_decode(w)['cost']
            updates = G.misc.optim.AdaMaxAvg([w], [w_avg],
                                             train_cost,
                                             alpha=-alpha,
                                             beta1=beta1,
                                             beta2=beta2,
                                             disconnected_inputs='ignore')
        elif optim == 'eve':
            f = lambda w: f_encode_decode(w)['cost']
            train_cost, updates = G.misc.optim.Eve(
                w,
                w_avg,
                f,
                alpha=-alpha,
                beta1=beta1,
                beta2=beta2,
                disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x': x}, train_cost, updates=updates, lazy=lazy)

    @lazy
    def f_train_q():
        keys_q = []
        for i in w:
            if '_q_' in i: keys_q.append(i)
        train_cost = f_encode_decode(w)['cost']
        updates = G.misc.optim.AdaMaxAvg([w],
                                         None,
                                         train_cost,
                                         alpha=-alpha,
                                         beta1=beta1,
                                         beta2=beta2,
                                         update_keys=keys_q,
                                         disconnected_inputs='ignore')
        updates = postup(updates, w)
        return G.function({'x': x}, train_cost, updates=updates, lazy=lazy)

    # Compile evaluation function
    @lazy
    def f_eval():
        results = f_encode_decode(w_avg, False)
        return G.function({'x': x}, results)

    # Compile epsilon generating function
    @lazy
    def f_eps_():
        n_batch = T.lscalar()
        n_batch.tag.test_value = 16
        eps = f_eps(n_batch, w)
        return G.function({'n_batch': n_batch}, eps, lazy=lazy)

    # Compile sampling function
    @lazy
    def f_decode():
        eps = {}
        for i in range(len(depths)):
            for j in range(depths[i]):
                eps['eps_' + str(i) + '_' + str(j)] = T.tensor4('eps' + str(i))
                eps['eps_' + str(i) + '_' +
                    str(j)].tag.test_value = np.random.randn(
                        n_batch_test, n_z, shape_x[1] / 2**(i + 1),
                        shape_x[2] / 2**(i + 1)).astype(floatX)
        image = f_decoder(eps, w_avg)
        return G.function(eps, image, lazy=lazy)

    return G.Struct(train=f_train,
                    eval=f_eval,
                    decode=f_decode,
                    eps=f_eps_,
                    w=w,
                    w_avg=w_avg)
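
The lazy decorator used in cvae1 above defers Theano compilation: each of f_train, f_train_q, f_eval, f_eps_ and f_decode is compiled only the first time it is called, and the compiled function is then cached on the factory. The same pattern in plain Python, without Theano (a sketch):

# f is a factory that builds an expensive callable (here, a Theano function
# in the original); lazy delays that build until the first call and caches it.
def lazy(f):
    def newf(*args, **kws):
        if not hasattr(f, 'cache'):
            f.cache = f()              # build (e.g. compile) once, on first use
        return f.cache(*args, **kws)
    return newf

@lazy
def f_square():
    print('building...')               # happens only once
    return lambda x: x * x

print(f_square(3))                     # building... then 9
print(f_square(4))                     # 16, no rebuild
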
Exemplo n.º 38
0
Arquivo: rand.py Projeto: gburt/iaf
def gaussian_spherical(shape=None, sample=None):
    if sample is None:
        sample = G.rng_curand.normal(shape)
    if shape is None:
        assert sample is not None
        shape = sample.shape
    logp = -.5 * (T.log(2*math.pi) + sample**2).flatten(2).sum(axis=1)
    entr = (1.*T.prod(shape[1:]).astype(G.floatX)) * T.ones((shape[0],), dtype=G.floatX) * G.sharedf(.5 * (np.log(2.*math.pi)+1.))
    return RandomVariable(sample, logp, entr, shape=shape)
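
For a standard spherical Gaussian, the per-sample log-density is -0.5 * sum(log(2*pi) + x**2) over the non-batch dimensions, and the entropy is d/2 * (log(2*pi) + 1) with d the number of dimensions per sample; logp and entr above hold exactly these quantities. A numpy sketch of both (not repo code):

import math
import numpy as np

def gaussian_logp_entropy(sample):
    d = np.prod(sample.shape[1:])      # dimensions per sample
    logp = -.5 * (np.log(2 * math.pi) + sample ** 2).reshape(len(sample), -1).sum(axis=1)
    entr = np.full(len(sample), .5 * d * (np.log(2 * math.pi) + 1.))
    return logp, entr

x = np.random.randn(4, 3, 2, 2)
logp, entr = gaussian_logp_entropy(x)
print(logp.shape, entr[0])             # (4,) and 12/2 * (log(2*pi) + 1) ~= 17.03
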
Exemplo n.º 39
0
Arquivo: models.py Projeto: openai/iaf
def fcvae(shape_x, depth_model, depth_ar, n_h1, n_h2, n_z, posterior, px='logistic', nl='softplus', alpha=0.002, beta1=0.1, beta2=0.001, share_w=False, data_init=None):
    _locals = locals()
    _locals.pop('data_init')
    print 'CVAE9 with ', _locals
    #assert posterior in ['diag1','diag2','iaf_linear','iaf_nonlinear']
    assert px in ['logistic','bernoulli']
    w = {} # model params
    
    kernel_h = (1,1)
    n_x = shape_x[0]*shape_x[1]*shape_x[2]
    
    # Input whitening
    if px == 'logistic':
        w['logsd_x'] = G.sharedf(0.)
    
    # encoder
    x_enc = N.conv.conv2d('x_enc', n_x, n_h1, (1,1), w=w)
    x_dec = N.conv.conv2d('x_dec', n_h1, n_x, (1,1), w=w)
    x_dec_nl = N.nonlinearity('x_dec_nl', nl, n_h1, w)
    
    layers = []
    for i in range(depth_model):
        name = str(i)
        if share_w:
            name = '[sharedw]'+str(i)+'[/sharedw]'
        layers.append(cvae_layer(name, posterior, n_h1, n_h2, n_z, depth_ar, False, nl, kernel_h, share_w, w))
    
    # top-level value
    #w['h_top'] = G.sharedf(np.zeros((n_h1,)))
    w['h_top'] = G.sharedf(np.random.normal(0,0.01,size=(n_h1,)))
    
    # Initialize variables
    x = T.tensor4('x')
    x.tag.test_value = data_init['x']
    n_batch_test = data_init['x'].shape[0]
    _x = T.clip(x / 255., 0, 1)
    
    # Objective function
    def f_cost(w, train=True):
        
        results = {}
        
        h = x_enc(_x.reshape((-1,n_x,1,1)) - .5, w)
        
        obj_logpz = 0
        obj_logqz = 0
        
        # bottom-up encoders
        for i in range(depth_model):
            h = layers[i].up(h, w)
        
        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (_x.shape[0],1,1,1))
        
        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h, _obj_logqz, _obj_logpz = layers[i].down_q(h, train, w)
            obj_logqz += _obj_logqz
            obj_logpz += _obj_logpz
            results['cost_z'+str(i).zfill(3)] = _obj_logqz - _obj_logpz
        
        output = .1 * x_dec(x_dec_nl(h, w), w).reshape((-1,shape_x[0],shape_x[1],shape_x[2]))
        
        # empirical distribution
        if px == 'logistic':
            mean_x = T.clip(output, -.5, .5)
            logsd_x = 0*mean_x + w['logsd_x']
            obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1/255., _x - .5).logp
            
            obj = obj_logpz - obj_logqz + obj_logpx
            # Compute the bits per pixel
            obj *= (1./np.prod(shape_x) * 1./np.log(2.)).astype('float32')
            
        elif px == 'bernoulli':
            prob_x = T.nnet.sigmoid(output)
            prob_x = T.minimum(prob_x, 1-1e-7)
            prob_x = T.maximum(prob_x, 1e-7)
            #prob_x = T.printing.Print('prob_x')(prob_x)
            obj_logpx = N.rand.bernoulli(prob_x, _x).logp
            
            #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
            #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
            #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
            obj = obj_logpz - obj_logqz + obj_logpx
            #obj = T.printing.Print('obj')(obj)
        
        results['cost_x'] = -obj_logpx
        results['cost'] = -obj
        return results
        
        #print 'obj_logpz', obj_logpz.tag.test_value
        #print 'obj_logqz', obj_logqz.tag.test_value
        #print 'obj_logpx', obj_x.tag.test_value
        #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
        #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
        #obj_x = T.printing.Print('obj_logpx')(obj_x)

        
        
    
    # Turns Gaussian noise 'eps' into a sample 
    def f_decoder(eps, w):

        # top-level activations
        h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (eps['eps_0'].shape[0],1,1,1))
        
        # top-down priors, posteriors and decoders
        for i in list(reversed(range(depth_model))):
            h = layers[i].down_p(h, eps['eps_'+str(i)], w)
        
        output = .1 * x_dec(x_dec_nl(h, w), w).reshape((-1,shape_x[0],shape_x[1],shape_x[2]))
        if px == 'logistic':
            mean_x = T.clip(output[:,:,:,:] + .5, 0, 1)
        elif px == 'bernoulli':
            mean_x = T.nnet.sigmoid(output)
        image = (255.*T.clip(mean_x, 0, 1)).astype('uint8')
        return image
    
    def f_eps(n_batch, w):
        eps = {}
        for i in range(depth_model):
            eps['eps_'+str(i)] = G.rng_curand.normal((n_batch,n_z,1,1),dtype=floatX)
        return eps
            
    def postup(updates, w):
        nodes = [x_enc,x_dec]
        for n in nodes:
            updates = n.postup(updates, w)
        for i in range(depth_model):
            updates = layers[i].postup(updates, w)
        
        return updates
    
    # Compile init function
    if data_init is not None:
        w['__init'] = OrderedDict()
        f_cost(w)
        w.pop('__init')
        #for i in w: print i, abs(w[i].get_value()).min(), abs(w[i].get_value()).max(), abs(w[i].get_value()).mean()
    
    # Compile training function
    results = f_cost(w)
    updates, (w_avg,) = G.misc.optim.AdaMaxAvg([w], results['cost'], alpha=-alpha, beta1=beta1, beta2=beta2, disconnected_inputs='ignore')
    #todo: replace postup with below
    #w['_updates'] = updates
    #f_cost(w)
    #updates = w.pop('_updates')
    
    updates = postup(updates, w)
    f_train = G.function({'x':x}, results['cost'], updates=updates)
    
    # Compile evaluation function
    results = f_cost(w_avg, False)
    f_eval = G.function({'x':x}, results)
    
    # Compile epsilon generating function
    n_batch = T.lscalar()
    n_batch.tag.test_value = 16
    eps = f_eps(n_batch, w)
    f_eps = G.function({'n_batch':n_batch}, eps)
    
    # Compile sampling function
    eps = {}
    for i in range(depth_model):
        eps['eps_'+str(i)] = T.tensor4('eps'+str(i))
        eps['eps_'+str(i)].tag.test_value = np.random.randn(n_batch_test,n_z,1,1).astype(floatX)
    image = f_decoder(eps, w_avg)
    f_decode = G.function(eps, image)
    
    return G.Struct(train=f_train, eval=f_eval, decode=f_decode, eps=f_eps, w=w, w_avg=w_avg)
Exemplo n.º 40
0
Arquivo: ar.py Projeto: gburt/iaf
def linear(name, n_in, n_out, diagonalzeros, l2norm=True, w={}):
    assert n_in % n_out == 0 or n_out % n_in == 0
        
    mask = np.ones((n_in, n_out),dtype=G.floatX)
    if n_out >= n_in:
        k = n_out / n_in
        for i in range(n_in):
            mask[i+1:,i*k:(i+1)*k] = 0
            if diagonalzeros:
                mask[i:i+1,i*k:(i+1)*k] = 0
    else:
        k = n_in / n_out
        for i in range(n_out):
            mask[(i+1)*k:,i:i+1] = 0
            if diagonalzeros:
                mask[i*k:(i+1)*k,i:i+1] = 0
    
    # L2 normalization of weights
    def l2normalize(_w, axis=0):
        if diagonalzeros:
            # to prevent NaN gradients
            # TODO: smarter solution (also see below)
            if n_out >= n_in:
                _w = T.set_subtensor(_w[:,:n_out/n_in], 0.)
            else:
                _w = T.set_subtensor(_w[:,:1], 0.)
        targetnorm = 1.
        norm = T.sqrt((_w**2).sum(axis=axis, keepdims=True))
        norm += 1e-8 
        new_w = _w * (targetnorm / norm)
        return new_w
    def maxconstraint(_w):
        return _w * (maxweight / T.maximum(maxweight, abs(_w).max(axis=0, keepdims=True)))
    
    w[name+'_w'] = G.sharedf(mask * 0.05 * np.random.randn(n_in, n_out))
    if maxweight > 0:
        w[name+'_w'].set_value(maxconstraint(w[name+'_w']).tag.test_value)
    
    w[name+'_b'] = G.sharedf(np.zeros((n_out,)))
    if l2norm:
        if logscale:
            w[name+'_s'] = G.sharedf(np.zeros((n_out,)))
        else:
            w[name+'_s'] = G.sharedf(np.ones((n_out,)))
    elif do_constant_rescale:
        print 'WARNING: constant rescale, these weights are not saved'
        constant_rescale = G.sharedf(np.zeros((n_out,)))
    
    
    def f(h, w):
        _input = h
        _w = mask * w[name+'_w']
        if l2norm:
            _w = l2normalize(_w)
        h = T.dot(h, _w)
        if l2norm:
            if logscale:
                h *= T.exp(logscale_scale*w[name+'_s'])
            else:
                h *= abs(w[name+'_s'])
        elif do_constant_rescale:
            h *= T.exp(constant_rescale)
        
        h += w[name+'_b']
        
        if '__init' in w:
            # Std
            std = (1./init_stdev) * h.std(axis=0)
            std += (std <= 0)
            std += 1e-8
            if name+'_s' in w:
                if logscale:
                    w[name+'_s'].set_value(-T.log(std).tag.test_value/logscale_scale)
                else:
                    w[name+'_s'].set_value((1./std).tag.test_value)
            elif do_constant_rescale:
                constant_rescale.set_value(-T.log(std).tag.test_value)
                #w[name+'_w'].set_value((_w / std.dimshuffle('x',0)).tag.test_value)
                
            h /= std.dimshuffle('x',0)
            
            # Mean
            mean = h.mean(axis=0)
            w[name+'_b'].set_value(-mean.tag.test_value)
            h -= mean.dimshuffle('x',0)
        
            #print name, w[name+'_w'].get_value().mean(), w[name+'_w'].get_value().std(), w[name+'_w'].get_value().max()
        
        #print name, abs(h).max().tag.test_value, abs(h).min().tag.test_value
        #h = T.printing.Print(name)(h)
        
        return h
    
    # Post updates: normalize weights to unit L2 norm
    def postup(updates, w):
        updates[w[name+'_w']] = mask * updates[w[name+'_w']]
        if l2norm and maxweight>0.:
            updates[w[name+'_w']] = maxconstraint(updates[w[name+'_w']])
        return updates
    
    return G.Struct(__call__=f, postup=postup, w=w)
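
This masked linear layer is the dense counterpart of ar.conv2d: with the mask above, output block j only depends on inputs with index strictly below j (or at most j when diagonalzeros is False). A standalone numpy check of that property (my own sketch, assuming n_out is a multiple of n_in):

import numpy as np

def ar_linear_mask(n_in, n_out, diagonalzeros=True):
    mask = np.ones((n_in, n_out), dtype='float32')
    assert n_out % n_in == 0
    k = n_out // n_in
    for i in range(n_in):
        mask[i + 1:, i * k:(i + 1) * k] = 0
        if diagonalzeros:
            mask[i:i + 1, i * k:(i + 1) * k] = 0
    return mask

n_in, n_out = 4, 8
w = ar_linear_mask(n_in, n_out) * np.random.randn(n_in, n_out)
x = np.random.randn(n_in)
for i in range(n_in):
    x2 = x.copy()
    x2[i] += 1.                                   # perturb input i
    changed = np.nonzero(x2.dot(w) != x.dot(w))[0]
    print(i, changed)                             # only outputs in blocks after i change
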