Example #1
def get_elbo(pred,
             targ,
             weights,
             logdets,
             weight,
             dataset_size,
             prior=log_normal,
             lbda=0,
             output_type='categorical'):
    """
    negative elbo, an upper bound on NLL
    """

    logqw = -logdets
    """
    originally...
    logqw = - (0.5*(ep**2).sum(1)+0.5*T.log(2*np.pi)*num_params+logdets)
        --> constants are neglected in this wrapper
    """
    logpw = prior(weights, 0., -T.log(lbda)).sum(1)
    """
    using normal prior centered at zero, with lbda being the inverse 
    of the variance
    """
    kl = (logqw - logpw).mean()
    if output_type == 'categorical':
        logpyx = -cc(pred, targ).mean()
    elif output_type == 'real':
        logpyx = -se(pred, targ).mean()  # assume output is a vector !
    else:
        assert False
    loss = -(logpyx - weight * kl / T.cast(dataset_size, floatX))

    return loss, [logpyx, logpw, logqw]
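
For reference, the quantity returned above is the (annealed) negative ELBO used throughout these examples. In LaTeX, with N the dataset size and a single Monte Carlo sample w ~ q(w) used for the KL estimate:

\mathcal{L} = -\,\mathbb{E}_{q(w)}\big[\log p(y \mid x, w)\big]
+ \frac{\text{weight}}{N}\,\mathrm{KL}\big(q(w)\,\|\,p(w)\big),
\qquad
\mathrm{KL}\big(q\,\|\,p\big) \approx \log q(w) - \log p(w).
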
Example #2
    def __init__(self, softmax=softmax):

        self.inpv = T.matrix('inpv')
        self.outv = T.imatrix('outv')  # indices
        self.ep = T.matrix('ep')
        self.w = T.scalar('w')

        self.n = self.inpv.shape[0]

        self.enc_m = get_encoder()
        self.enc_s = get_encoder()
        self.dec = get_decoder()

        self.mu = get_output(self.enc_m, self.inpv)
        self.log_s = get_output(self.enc_s, self.inpv)
        self.log_v = 2 * self.log_s
        self.sigma = T.exp(self.log_s)
        self.var = T.exp(self.log_s * 2)
        self.z = self.mu + self.sigma * self.ep
        self.rec_linear = get_output(self.dec, self.z)
        self.rec_reshaped_ln = self.rec_linear.reshape((self.n * d2, 256))
        self.rec_reshaped = softmax(self.rec_reshaped_ln)

        self.out_onehot = T.extra_ops.to_one_hot(
            self.outv.reshape((self.n * d2, )), 256)

        # reconstruction loss: categorical cross-entropy over 256 classes per output dimension
        self.rec_losses_reshaped = cc(self.rec_reshaped, self.out_onehot)
        self.rec_losses = self.rec_losses_reshaped.reshape((self.n, d2)).sum(1)
        self.klss = - 0.5 * (1+self.log_v) + \
                      0.5 * (self.mu**2 + self.var)
        self.kls = self.klss.sum(1)
        self.rec_loss = self.rec_losses.mean()
        self.kl = self.kls.mean()
        self.loss = self.rec_loss + self.kl * self.w

        self.params = get_all_params(self.enc_m) + \
                      get_all_params(self.enc_s) + \
                      get_all_params(self.dec)
        self.updates = lasagne.updates.adam(self.loss, self.params, lr)

        print '\tgetting train func'
        self.train_func = theano.function(
            [self.inpv, self.outv, self.ep, self.w],
            [self.loss.mean(),
             self.rec_loss.mean(),
             self.kl.mean()],
            updates=self.updates)

        print '\tgetting other useful funcs'
        self.recon = theano.function([self.inpv, self.ep],
                                     self.rec_reshaped.argmax(1).reshape(
                                         (self.n, d2)))
        self.recon_ = theano.function([self.inpv, self.ep],
                                      self.rec_reshaped.reshape(
                                          (self.n, d2, 256)))
        self.project = theano.function([self.inpv, self.ep], self.z)
        self.get_mu = theano.function([self.inpv], self.mu)
        self.get_var = theano.function([self.inpv], self.var)
        self.get_klss = theano.function([self.inpv], self.klss)
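
As a sanity check on the reparameterization trick and the KL term used above (diagonal Gaussian posterior against a standard normal prior), here is a small self-contained NumPy sketch; the variable names are illustrative and not part of the original example:

import numpy as np

rng = np.random.RandomState(0)
mu = rng.randn(4).astype('float32')            # encoder mean
log_s = 0.1 * rng.randn(4).astype('float32')   # encoder log std
sigma = np.exp(log_s)
log_v = 2.0 * log_s                            # log variance
var = np.exp(log_v)

# reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I)
eps = rng.randn(4).astype('float32')
z = mu + sigma * eps

# analytic KL( N(mu, var) || N(0, 1) ), elementwise, as in self.klss above
klss = -0.5 * (1.0 + log_v) + 0.5 * (mu ** 2 + var)
print(z, klss.sum())
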
Example #3
    def _get_elbo(self):
        """
        negative elbo, an upper bound on NLL
        """

        # TODO: kldiv_bias = tf.reduce_sum(.5 * self.pvar_bias - .5 * self.logvar_bias + ((tf.exp(self.logvar_bias) + tf.square(self.mu_bias)) / (2 * tf.exp(self.pvar_bias))) - .5)

        # eqn14
        kl_q_w_z_p = 0
        for mu, sig, z_T_f in zip(self.mus, self.sigs, self.z_T_fs):
            kl_q_w_z_p += (sig**2).sum() - T.log(
                sig**2).sum() + mu**2 * z_T_f**2  # leaving off the -1
        kl_q_w_z_p *= 0.5

        # eqn15
        self.log_r_z_T_f_W = 0
        print '\n \n eqn15'
        for mu, sig, z_T_b, c, b_mu, b_logsig in zip(
                self.mus, self.sigs, self.z_T_bs, self.cs, self.b_mus,
                self.b_logsigs
        ):  # we'll compute this separately for every layer's W
            print 'eqn15'
            print [tt.shape for tt in [mu, sig, z_T_b, c, b_mu, b_logsig]]
            # reparametrization trick for eqn 9/10
            cTW_mu = T.dot(c, mu)
            cTW_sig = T.dot(c, sig**2)**.5
            the_scalar = T.tanh(
                cTW_mu + cTW_sig * self.srng.normal(cTW_sig.shape)).sum(
                )  # TODO: double check (does the sum belong here??)
            # scaling b by the_scalar
            mu_tilde = (b_mu * the_scalar).squeeze()
            log_sig_tilde = (b_logsig * the_scalar).squeeze()
            self.log_r_z_T_f_W += (-.5 * T.exp(log_sig_tilde) *
                                   (z_T_b - mu_tilde)**2 -
                                   .5 * T.log(2 * np.pi) +
                                   .5 * log_sig_tilde).sum()
        self.log_r_z_T_f_W += self.logdets_z_T_b

        # -eqn13
        self.kl = (
            -self.logdets + kl_q_w_z_p -
            self.log_r_z_T_f_W).sum()  # TODO: why do I need the mean/sum??

        if self.output_type == 'categorical':
            self.logpyx = -cc(self.y, self.target_var).mean()
        elif self.output_type == 'real':
            self.logpyx = -se(self.y, self.target_var).mean()
        else:
            assert False
        # FIXME: not a scalar!?
        self.loss = - (self.logpyx - \
                       self.weight * self.kl/T.cast(self.dataset_size,floatX))

        # DK - extra monitoring
        params = self.params
        ds = self.dataset_size
        self.monitored = []
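
Each per-layer term of eqn14 above has the shape of a KL divergence between a diagonal Gaussian and a standard normal, whose per-dimension closed form is

\mathrm{KL}\big(\mathcal{N}(\mu,\sigma^2)\,\|\,\mathcal{N}(0,1)\big)
= \tfrac12\big(\sigma^2 - \log\sigma^2 + \mu^2 - 1\big),

i.e. the expression accumulated in kl_q_w_z_p, with the squared-mean term scaled by z_T_f**2 and the constant -1 dropped, as the inline comment notes.
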
Example #4
    def _get_elbo(self):
        """
        negative elbo, an upper bound on NLL
        """
        self.logpyx = - cc(self.y,self.target_var).mean()
        self.loss = - (self.logpyx - \
                       self.weight * self.kl/T.cast(self.dataset_size,floatX))

        # DK - extra monitoring
        params = self.params
        ds = self.dataset_size
        self.logpyx_grad = flatten_list(T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
        self.logpw_grad = flatten_list(T.grad(-self.logpw.mean() / ds, params, disconnected_inputs='warn')).norm(2)
        self.logqw_grad = flatten_list(T.grad(self.logqw.mean() / ds, params, disconnected_inputs='warn')).norm(2)
        self.monitored = [self.logpyx, self.logpw, self.logqw,
                          self.logpyx_grad, self.logpw_grad, self.logqw_grad]
        
        self.logpyx = - cc(self.y,self.target_var).mean()
        self.loss = - (self.logpyx - \
                       self.weight * self.kl/T.cast(self.dataset_size,floatX))
Example #5
    def _get_elbo(self):
        """
        negative elbo, an upper bound on NLL
        """

        logdets = self.logdets
        self.logqw = -logdets
        """
        originally...
        logqw = - (0.5*(ep**2).sum(1)+0.5*T.log(2*np.pi)*num_params+logdets)
            --> constants are neglected in this wrapper
        """
        self.logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
        """
        using normal prior centered at zero, with lbda being the inverse 
        of the variance
        """
        self.kl = (self.logqw - self.logpw).mean()
        if self.output_type == 'categorical':
            self.logpyx = -cc(self.y, self.target_var).mean()
        elif self.output_type == 'real':
            self.logpyx = -se(self.y, self.target_var).mean()
        else:
            assert False
        self.loss = - (self.logpyx - \
                       self.weight * self.kl/T.cast(self.dataset_size,floatX))

        # DK - extra monitoring
        params = self.params
        ds = self.dataset_size
        self.logpyx_grad = flatten_list(
            T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
        self.logpw_grad = flatten_list(
            T.grad(-self.logpw.mean() / ds, params,
                   disconnected_inputs='warn')).norm(2)
        self.logqw_grad = flatten_list(
            T.grad(self.logqw.mean() / ds, params,
                   disconnected_inputs='warn')).norm(2)
        self.monitored = [
            self.logpyx, self.logpw, self.logqw, self.logpyx_grad,
            self.logpw_grad, self.logqw_grad
        ]
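
The gradient-norm monitors above flatten and concatenate all parameter gradients before taking a single L2 norm. A minimal NumPy sketch of the same idea (flatten_list is assumed to concatenate flattened tensors; it is not shown in this example):

import numpy as np

def flat_grad_norm(grad_list):
    # concatenate flattened gradients and return the global L2 norm,
    # mirroring flatten_list(T.grad(...)).norm(2) above
    return np.linalg.norm(np.concatenate([g.ravel() for g in grad_list]))

grads = [np.ones((3, 2)), np.full((4,), 2.0)]
print(flat_grad_norm(grads))  # sqrt(6 * 1**2 + 4 * 2**2) = sqrt(22)
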
Example #6
    def _get_elbo(self):
        """
        negative elbo, an upper bound on NLL
        """

        logdets = self.logdets
        logqw = -logdets
        """
        originally...
        logqw = - (0.5*(ep**2).sum(1)+0.5*T.log(2*np.pi)*num_params+logdets)
            --> constants are neglected in this wrapper
        """
        logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
        """
        using normal prior centered at zero, with lbda being the inverse 
        of the variance
        """
        kl = (logqw - logpw).mean()
        logpyx = -cc(self.y, self.target_var).mean()
        self.loss = -(logpyx - kl / T.cast(self.dataset_size, floatX))
Example #7
    def _get_elbo(self):
        # NTS: is KL waaay too big??
        self.kl = KL(self.prior_mean, self.prior_log_var, self.mean,
                     self.log_var).sum(-1).mean()

        if self.output_type == 'categorical':
            self.logpyx = -cc(self.y, self.target_var).mean()
        elif self.output_type == 'real':
            self.logpyx = -se(self.y, self.target_var).mean()
        else:
            assert False
        self.loss = - (self.logpyx - \
                       self.weight * self.kl/T.cast(self.dataset_size,floatX))

        # DK - extra monitoring
        params = self.params
        ds = self.dataset_size
        self.logpyx_grad = flatten_list(
            T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
        self.monitored = [self.logpyx, self.logpyx_grad,
                          self.kl]  #, self.target_var]
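
The KL(...) helper is not shown in this example; assuming it evaluates the KL divergence between diagonal Gaussians, with the posterior q = N(mu, exp(log_var)) and the prior p = N(prior_mean, exp(prior_log_var)), the per-dimension closed form would be

\mathrm{KL}\big(\mathcal{N}(\mu,\sigma^2)\,\|\,\mathcal{N}(\mu_0,\sigma_0^2)\big)
= \tfrac12\left(\log\frac{\sigma_0^2}{\sigma^2}
+ \frac{\sigma^2 + (\mu-\mu_0)^2}{\sigma_0^2} - 1\right).
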
Example #8
        weight = weights[:,t:t+num_param].reshape((wd1,)+ws)
        inputs[w_layer] = weight
        layer = stochasticDenseLayer([layer,w_layer],ws[1])
        t += num_param
        
    layer.nonlinearity = nonlinearities.softmax
    y = get_output(layer,inputs)
    #y = T.clip(y, 0.00001, 0.99999) # stability 

    
    ###########################
    # loss and grad
    logdets = sum([get_output(logdet,ep) for logdet in logdets_layers])
    logqw = - (0.5*(ep**2).sum(1) + 0.5*T.log(2*np.pi)*num_params + logdets)
    logpw = log_stdnormal(weights).sum(1)
    logpyx = - cc(y,target_var).mean()
    kl = (logqw - logpw).mean()
    ds = T.cast(dataset_size,floatX)
    loss = - (logpyx - kl/ds)
    params = lasagne.layers.get_all_params([h_layer,layer])
    grads = T.grad(loss, params)

    ###########################
    # extra monitoring
    nll_grads = flatten_list(T.grad(-logpyx, params, disconnected_inputs='warn')).norm(2)
    prior_grads = flatten_list(T.grad(-logpw.mean() / ds, params, disconnected_inputs='warn')).norm(2)
    entropy_grads = flatten_list(T.grad(logqw.mean() / ds, params, disconnected_inputs='warn')).norm(2)
    outputs = [loss, -logpyx, -logpw / ds, logqw / ds, 
                     nll_grads, prior_grads, entropy_grads,
                     logdets] # logdets is "legacy"
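
The entropy term logqw above is the change-of-variables density of the flow: if w = f(ep) with ep ~ N(0, I), then log q(w) = log N(ep; 0, I) - log|det df/dep|. A minimal NumPy sketch for an elementwise affine flow (illustrative only; this is not the LinearFlowLayer implementation):

import numpy as np

rng = np.random.RandomState(0)
num_params = 5
ep = rng.randn(1, num_params)

# elementwise affine flow: w = exp(log_a) * ep + b
log_a = 0.1 * rng.randn(num_params)
b = rng.randn(num_params)
w = np.exp(log_a) * ep + b
logdets = log_a.sum()                  # log|det dw/dep| of a diagonal affine map

log_q_ep = -0.5 * (ep ** 2).sum(1) - 0.5 * np.log(2 * np.pi) * num_params
logqw = log_q_ep - logdets             # matches the logqw expression above
print(logqw)
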
Example #9
def main():
    """
    MNIST example
    weight norm reparameterized MLP with prior on rescaling parameters
    """

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--perdatapoint', action='store_true')
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=0.01, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    perdatapoint = args.perdatapoint
    coupling = 1  #args.coupling
    lr0 = args.lr0
    lrdecay = args.lrdecay
    lbda = np.cast[floatX](args.lbda)
    bs = args.bs
    size = max(10, min(50000, args.size))
    clip_grad = 100
    max_norm = 100

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

    input_var = T.matrix('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    # 784 -> 200 -> 10
    weight_shapes = [(784, 200), (200, 10)]

    num_params = sum(ws[1] for ws in weight_shapes)
    if perdatapoint:
        wd1 = input_var.shape[0]
    else:
        wd1 = 1

    # stochastic hypernet
    ep = srng.normal(std=0.01, size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 784])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = ws[1]
        w_layer = lasagne.layers.InputLayer((None, ws[1]))
        weight = weights[:, t:t + num_param].reshape((wd1, ws[1]))
        inputs[w_layer] = weight
        layer = stochasticDenseLayer2([layer, w_layer], ws[1])
        print layer.output_shape
        t += num_param

    layer.nonlinearity = nonlinearities.softmax
    y = T.clip(get_output(layer, inputs), 0.001, 0.999)  # stability

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 *
              (ep**2).sum(1) + 0.5 * T.log(2 * np.pi) * num_params + logdets)
    #logpw = log_normal(weights,0.,-T.log(lbda)).sum(1)
    logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([h_layer, layer])
    grads = T.grad(loss, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.adam(cgrads, params, learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss,
                            updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict, train_x[:size], train_y[:size],
                          valid_x, valid_y, lr0, lrdecay, bs)
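
train_model is defined elsewhere; assuming it simply iterates over minibatches with one-hot targets, a hypothetical minimal loop over the compiled functions above could look like this (the decay schedule and epoch count are illustrative):

    # hypothetical training loop; train_model itself is not shown in this example
    N = len(train_x[:size])
    for epoch in range(10):
        lr_e = lr0 / (1.0 + epoch) if lrdecay else lr0
        for i in range(0, N, bs):
            batch_loss = train(train_x[i:i + bs], train_y[i:i + bs], N, lr_e)
        valid_acc = (predict(valid_x) == valid_y.argmax(1)).mean()
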
Example #10
def main():
    """
    MNIST example
    """

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--perdatapoint', action='store_true')
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=10, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    perdatapoint = args.perdatapoint
    coupling = args.coupling
    size = max(10, min(50000, args.size))
    clip_grad = 10
    max_norm = 1000

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

    input_var = T.matrix('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    # 784 -> 20 -> 20 -> 10
    weight_shapes = [(784, 20), (20, 20), (20, 10)]

    num_params = sum(np.prod(ws) for ws in weight_shapes)
    if perdatapoint:
        wd1 = input_var.shape[0]
    else:
        wd1 = 1

    # stochastic hypernet
    ep = srng.normal(size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledConv1DLayer(h_layer, 16, 5)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledConv1DLayer(h_layer, 16, 5)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 784])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = np.prod(ws)
        print t, t + num_param
        w_layer = lasagne.layers.InputLayer((None, ) + ws)
        weight = weights[:, t:t + num_param].reshape((wd1, ) + ws)
        inputs[w_layer] = weight
        layer = stochasticDenseLayer([layer, w_layer], ws[1])
        t += num_param

    layer.nonlinearity = nonlinearities.softmax
    y = T.clip(get_output(layer, inputs), 0.001, 0.999)  # stability

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 *
              (ep**2).sum(1) + 0.5 * T.log(2 * np.pi) * num_params + logdets)
    logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([h_layer, layer])
    grads = T.grad(loss, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.nesterov_momentum(cgrads,
                                                params,
                                                learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss,
                            updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict, train_x[:size], train_y[:size],
                          valid_x, valid_y)

    output_probs = theano.function([input_var], y)
    MCt = np.zeros((100, 1000, 10))
    MCv = np.zeros((100, 1000, 10))
    for i in range(100):
        MCt[i] = output_probs(train_x[:1000])
        MCv[i] = output_probs(valid_x[:1000])

    tr = np.equal(MCt.mean(0).argmax(-1), train_y[:1000].argmax(-1)).mean()
    va = np.equal(MCv.mean(0).argmax(-1), valid_y[:1000].argmax(-1)).mean()
    print "train perf=", tr
    print "valid perf=", va

    for ii in range(15):
        print np.round(MCt[ii][0] * 1000)
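
The Monte Carlo loop above approximates the posterior predictive by averaging class probabilities over S = 100 sampled networks before taking the argmax:

p(y \mid x, \mathcal{D}) \approx \frac{1}{S} \sum_{s=1}^{S} p(y \mid x, w_s),
\qquad w_s \sim q(w).
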
Example #11
def main():
    """
    MNIST example
    weight norm reparameterized MLP with prior on rescaling parameters
    """

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=0.01, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    coupling = args.coupling
    lr0 = args.lr0
    lrdecay = args.lrdecay
    lbda = np.cast[floatX](args.lbda)
    bs = args.bs
    size = max(10, min(50000, args.size))
    clip_grad = 5
    max_norm = 10

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)
    train_x = train_x.reshape(50000, 1, 28, 28)
    valid_x = valid_x.reshape(10000, 1, 28, 28)
    test_x = test_x.reshape(10000, 1, 28, 28)

    input_var = T.tensor4('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    # conv stack: (1, 28, 28) -> (16, 14, 14) -> (16, 7, 7) -> (16, 4, 4), then a 10-way softmax readout
    weight_shapes = [
        (16, 1, 5, 5),  # -> (None, 16, 14, 14)
        (16, 16, 5, 5),  # -> (None, 16,  7,  7)
        (16, 16, 5, 5)
    ]  # -> (None, 16,  4,  4)

    num_params = sum(np.prod(ws) for ws in weight_shapes) + 10
    wd1 = 1

    # stochastic hypernet
    ep = srng.normal(std=0.01, size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 1, 28, 28])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = np.prod(ws)
        weight = weights[:, t:t + num_param].reshape(ws)
        num_filters = ws[0]
        filter_size = ws[2]
        stride = 2
        pad = 'same'
        layer = stochasticConv2DLayer([layer, weight], num_filters,
                                      filter_size, stride, pad)
        print layer.output_shape
        t += num_param

    w_layer = lasagne.layers.InputLayer((None, 10))
    weight = weights[:, t:t + 10].reshape((wd1, 10))
    inputs[w_layer] = weight
    layer = stochasticDenseLayer2([layer, w_layer],
                                  10,
                                  nonlinearity=nonlinearities.softmax)

    y = T.clip(get_output(layer, inputs), 0.001, 0.999)

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 *
              (ep**2).sum(1) + 0.5 * T.log(2 * np.pi) * num_params + logdets)
    logpw = log_normal(weights, 0., -T.log(lbda)).sum(1)
    #logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([layer])[1:]  # excluding rand state
    grads = T.grad(loss, params)

    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.adam(cgrads, params, learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss,
                            updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict, train_x[:size], train_y[:size],
                          valid_x, valid_y, lr0, lrdecay, bs)

    output_probs = theano.function([input_var], y)
    MCt = np.zeros((100, 1000, 10))
    MCv = np.zeros((100, 1000, 10))
    for i in range(100):
        MCt[i] = output_probs(train_x[:1000])
        MCv[i] = output_probs(valid_x[:1000])

    tr = np.equal(MCt.mean(0).argmax(-1), train_y[:1000].argmax(-1)).mean()
    va = np.equal(MCv.mean(0).argmax(-1), valid_y[:1000].argmax(-1)).mean()
    print "train perf=", tr
    print "valid perf=", va

    for ii in range(15):
        print np.round(MCt[ii][0] * 1000)
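
The prior term log_normal(weights, 0., -T.log(lbda)) passes -log(lbda) as the log variance, i.e. a zero-mean Gaussian prior with variance 1/lbda. Assuming log_normal(x, mu, log_var) returns the elementwise log density, the prior contribution per weight is

\log \mathcal{N}\big(w;\, 0,\, \lambda^{-1}\big)
= -\tfrac12 \log\frac{2\pi}{\lambda} - \frac{\lambda}{2}\, w^2 .
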
Example #12
    def __init__(
            self,
            srng=RandomStreams(seed=427),
            prior_mean=0,
            prior_log_var=0,
            n_hiddens=2,
            n_units=800,
            n_inputs=784,
            n_classes=10,
            output_type='categorical',
            random_biases=1,
            #dataset_size=None,
            opt='adam',
            #weight=1.,# the weight of the KL term
            **kargs):

        self.__dict__.update(locals())

        # TODO
        self.dataset_size = T.scalar('dataset_size')
        self.weight = T.scalar('weight')
        self.learning_rate = T.scalar('learning_rate')

        self.weight_shapes = []
        if n_hiddens > 0:
            self.weight_shapes.append((n_inputs, n_units))
            #self.params.append((theano.shared()))
            for i in range(1, n_hiddens):
                self.weight_shapes.append((n_units, n_units))
            self.weight_shapes.append((n_units, n_classes))
        else:
            self.weight_shapes = [(n_inputs, n_classes)]

        if self.random_biases:
            self.num_params = sum(
                (ws[0] + 1) * ws[1] for ws in self.weight_shapes)
        else:
            self.num_params = sum((ws[0]) * ws[1] for ws in self.weight_shapes)

        self.wd1 = 1
        self.X = T.matrix()
        self.y = T.matrix()
        self.mean = ts(self.num_params)
        self.log_var = ts(self.num_params, scale=1e-6, bias=-1e8)
        self.params = [self.mean, self.log_var]
        self.ep = self.srng.normal(size=(self.num_params, ), dtype=floatX)
        self.weights = self.mean + (T.exp(self.log_var) +
                                    np.float32(.000001)) * self.ep

        t = 0
        acts = self.X
        for nn, ws in enumerate(self.weight_shapes):
            if self.random_biases:
                num_param = (ws[0] + 1) * ws[1]
                weight_and_bias = self.weights[t:t + num_param]
                weight = weight_and_bias[:ws[0] * ws[1]].reshape(
                    (ws[0], ws[1]))
                bias = weight_and_bias[ws[0] * ws[1]:].reshape((ws[1], ))
                acts = T.dot(acts, weight) + bias
            else:
                assert False  # TODO
            if nn < len(self.weight_shapes) - 1:
                acts = (acts > 0.) * (acts)
            else:
                acts = T.nnet.softmax(acts)

            t += num_param

        y_hat = acts
        #y_hat = T.clip(y_hat, 0.001, 0.999) # stability
        self.y_hat = y_hat

        self.kl = KL(self.prior_mean, self.prior_log_var, self.mean,
                     self.log_var).sum(-1).mean()
        self.logpyx = -cc(self.y_hat, self.y).mean()
        # NOTE: the next line overrides the categorical logpyx with a squared-error version
        self.logpyx = -se(self.y_hat, self.y).mean()
        self.loss = -(self.logpyx - self.weight * self.kl /
                      T.cast(self.dataset_size, floatX))
        # NOTE: this overrides the ELBO above; the loss actually optimized is plain squared error
        self.loss = se(self.y_hat, self.y).mean()
        self.logpyx_grad = flatten_list(
            T.grad(-self.logpyx, self.params,
                   disconnected_inputs='warn')).norm(2)
        self.monitored = [self.logpyx, self.logpyx_grad, self.kl]

        #def _get_useful_funcs(self):
        self.predict_proba = theano.function([self.X], self.y_hat)
        self.predict = theano.function([self.X], self.y_hat.argmax(1))
        self.predict_fixed_mask = theano.function([self.X, self.weights],
                                                  self.y_hat)
        self.sample_weights = theano.function([], self.weights)
        self.monitor_fn = theano.function(
            [self.X, self.y], self.monitored)  #, (self.predict(x) == y).sum()

        #def _get_grads(self):
        grads = T.grad(self.loss, self.params)
        #mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=self.max_norm)
        #cgrads = [T.clip(g, -self.clip_grad, self.clip_grad) for g in mgrads]
        cgrads = grads
        if self.opt == 'adam':
            self.updates = lasagne.updates.adam(
                cgrads, self.params, learning_rate=self.learning_rate)
        elif self.opt == 'momentum':
            self.updates = lasagne.updates.nesterov_momentum(
                cgrads, self.params, learning_rate=self.learning_rate)
        elif self.opt == 'sgd':
            self.updates = lasagne.updates.sgd(
                cgrads, self.params, learning_rate=self.learning_rate)

        #def _get_train_func(self):
        inputs = [
            self.X, self.y, self.dataset_size, self.learning_rate, self.weight
        ]
        train = theano.function(inputs,
                                self.loss,
                                updates=self.updates,
                                on_unused_input='warn')
        self.train_func_ = train
        # DK - putting this here, because it doesn't get overwritten by subclasses
        self.monitor_func = theano.function(
            [self.X, self.y, self.dataset_size, self.learning_rate],
            self.monitored,
            on_unused_input='warn')
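
A minimal NumPy sketch of the sampled forward pass constructed above (one hidden layer, weights and biases drawn from the factorized Gaussian with the same mean + exp(log_var) * eps parameterization; all names and sizes here are illustrative):

import numpy as np

rng = np.random.RandomState(0)
n_inputs, n_units, n_classes = 4, 8, 3
num_params = (n_inputs + 1) * n_units + (n_units + 1) * n_classes

mean = np.zeros(num_params, dtype='float32')
log_var = np.full(num_params, -6.0, dtype='float32')
eps = rng.randn(num_params).astype('float32')
weights = mean + np.exp(log_var) * eps        # same parameterization as above

def softmax(a):
    e = np.exp(a - a.max(-1, keepdims=True))
    return e / e.sum(-1, keepdims=True)

X = rng.randn(2, n_inputs).astype('float32')
t, acts = 0, X
for d_in, d_out in [(n_inputs, n_units), (n_units, n_classes)]:
    num_param = (d_in + 1) * d_out
    wb = weights[t:t + num_param]
    W, b = wb[:d_in * d_out].reshape(d_in, d_out), wb[d_in * d_out:]
    acts = acts.dot(W) + b
    acts = np.maximum(acts, 0.0) if d_out != n_classes else softmax(acts)
    t += num_param
print(acts.shape)  # (2, 3)
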
Example #13
    def __init__(self,
                 arch=None,
                 lbda=1,
                 perdatapoint=False,
                 srng=RandomStreams(seed=427),
                 prior=log_normal,
                 opt='adam',
                 coupling=4,
                 coupling_dim=200,
                 pad='same',
                 stride=2,
                 pool=None,
                 uncoupled_init=0,
                 convex_combination=0):

        if arch == 'Riashat':
            kernel_width = 3
            self.kernel_width = kernel_width
            stride = 1
            self.stride = stride
            pad = 'valid'
            self.pad = pad
            self.weight_shapes = [
                (32, 1, kernel_width, kernel_width),  # -> (None, 16, 14, 14)
                (32, 32, kernel_width, kernel_width)
            ]  # -> (None, 16,  7,  7)
            self.args = [[32, kernel_width, stride, pad, rectify, 'none'],
                         [32, kernel_width, stride, pad, rectify, 'max']]
            self.pool_size = 5
        else:
            self.pool_size = 2

        self.n_kernels = np.array(self.weight_shapes)[:, 1].sum()
        self.kernel_shape = self.weight_shapes[0][:1] + self.weight_shapes[0][
            2:]
        print "kernel_shape", self.kernel_shape
        self.kernel_size = np.prod(self.weight_shapes[0])

        self.num_classes = 10
        if arch == 'Riashat':
            self.num_hids = 256
        else:
            self.num_hids = 128
        self.num_mlp_layers = 1
        self.num_mlp_params = self.num_classes + \
                              self.num_hids * self.num_mlp_layers
        self.num_cnn_params = np.sum(np.array(self.weight_shapes)[:, 0])
        self.num_params = self.num_mlp_params + self.num_cnn_params
        self.coupling = coupling
        self.extra_l2 = 0
        self.convex_combination = convex_combination

        #def __init__(self,

        self.lbda = lbda
        self.perdatapoint = perdatapoint
        self.srng = srng
        self.prior = prior
        self.__dict__.update(locals())

        if perdatapoint:
            self.wd1 = self.input_var.shape[0]
        else:
            self.wd1 = 1

    #def _get_theano_variables(self):
        self.input_var = T.matrix('input_var')
        self.input_var = T.tensor4('input_var')  # <-- for CNN
        self.target_var = T.matrix('target_var')
        self.dataset_size = T.scalar('dataset_size')
        self.learning_rate = T.scalar('learning_rate')

        #def _get_hyper_net(self):
        # initial random noise
        print self.num_params
        ep = self.srng.normal(size=(self.wd1, self.num_params), dtype=floatX)
        logdets_layers = []
        h_net = lasagne.layers.InputLayer([None, self.num_params])

        # mean and variation of the initial noise
        layer_temp = LinearFlowLayer(h_net)
        h_net = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        if self.coupling:
            layer_temp = CoupledWNDenseLayer(h_net,
                                             coupling_dim,
                                             uncoupled_init=uncoupled_init)
            h_net = IndexLayer(layer_temp, 0)
            logdets_layers.append(IndexLayer(layer_temp, 1))

            for c in range(self.coupling - 1):
                h_net = PermuteLayer(h_net, self.num_params)

                layer_temp = CoupledWNDenseLayer(h_net,
                                                 coupling_dim,
                                                 uncoupled_init=uncoupled_init)
                h_net = IndexLayer(layer_temp, 0)
                logdets_layers.append(IndexLayer(layer_temp, 1))

        if self.convex_combination:
            layer_temp = ConvexBiasLayer(
                h_net, upweight_primary=self.convex_combination)
            h_net = IndexLayer(layer_temp, 0)
            logdets_layers.append(IndexLayer(layer_temp, 1))

        self.h_net = h_net
        self.weights = lasagne.layers.get_output(h_net, ep)
        self.logdets = sum([get_output(ld, ep) for ld in logdets_layers])

        #def _get_primary_net(self):

        t = np.cast['int32'](0)
        if 1:  #self.dataset == 'mnist':
            p_net = lasagne.layers.InputLayer([None, 1, 28, 28])
        print p_net.output_shape
        inputs = {p_net: self.input_var}

        #logpw = np.float32(0.)

        for ws, args in zip(self.weight_shapes, self.args):

            num_filters = ws[0]

            # TO-DO: generalize to have multiple samples?
            weight = self.weights[0, t:t + num_filters].dimshuffle(
                0, 'x', 'x', 'x')

            num_filters = args[0]
            filter_size = args[1]
            stride = args[2]
            pad = args[3]
            nonl = args[4]
            p_net = lasagne.layers.Conv2DLayer(p_net,
                                               num_filters,
                                               filter_size,
                                               stride,
                                               pad,
                                               nonlinearity=nonl)
            p_net = stochastic_weight_norm(p_net, weight)

            if args[5] == 'max':
                p_net = lasagne.layers.MaxPool2DLayer(p_net, self.pool_size)
            #print p_net.output_shape
            t += num_filters

        for layer in range(self.num_mlp_layers):
            weight = self.weights[:, t:t + self.num_hids].reshape(
                (self.wd1, self.num_hids))
            p_net = lasagne.layers.DenseLayer(p_net,
                                              self.num_hids,
                                              nonlinearity=rectify)
            p_net = stochastic_weight_norm(p_net, weight)
            if self.extra_l2:
                self.l2_penalty = lasagne.regularization.regularize_layer_params_weighted(
                    {p_net: 3.5 / 128}, lasagne.regularization.l2)
            t += self.num_hids

        weight = self.weights[:, t:t + self.num_classes].reshape(
            (self.wd1, self.num_classes))

        p_net = lasagne.layers.DenseLayer(p_net,
                                          self.num_classes,
                                          nonlinearity=nonlinearities.softmax)
        p_net = stochastic_weight_norm(p_net, weight)

        y = T.clip(get_output(p_net, inputs), 0.001, 0.999)  # stability

        self.p_net = p_net
        self.y = y

        #def _get_params(self):

        params = lasagne.layers.get_all_params([self.h_net, self.p_net])
        self.params = list()
        for param in params:
            if type(param) is not RSSV:
                self.params.append(param)

        params0 = lasagne.layers.get_all_param_values([self.h_net, self.p_net])
        params = lasagne.layers.get_all_params([self.h_net, self.p_net])
        updates = {p: p0 for p, p0 in zip(params, params0)}
        self.reset = theano.function([], None, updates=updates)
        self.add_reset('init')

        #def _get_elbo(self):

        logdets = self.logdets
        self.logqw = -logdets
        self.logpw = self.prior(self.weights, 0., -T.log(self.lbda)).sum(1)
        self.kl = (self.logqw - self.logpw).mean()
        self.kl_term = self.kl / T.cast(self.dataset_size, floatX)
        self.logpyx = -cc(self.y, self.target_var).mean()
        self.loss = -self.logpyx + self.kl_term

        # DK - extra monitoring (TODO)
        params = self.params
        ds = self.dataset_size
        self.logpyx_grad = flatten_list(
            T.grad(-self.logpyx, params, disconnected_inputs='warn')).norm(2)
        self.logpw_grad = flatten_list(
            T.grad(-self.logpw.mean() / ds, params,
                   disconnected_inputs='warn')).norm(2)
        self.logqw_grad = flatten_list(
            T.grad(self.logqw.mean() / ds, params,
                   disconnected_inputs='warn')).norm(2)
        self.monitored = [
            self.logpyx, self.logpw, self.logqw, self.logpyx_grad,
            self.logpw_grad, self.logqw_grad
        ]

        #def _get_grads(self):
        grads = T.grad(self.loss, self.params)
        mgrads = lasagne.updates.total_norm_constraint(grads,
                                                       max_norm=self.max_norm)
        cgrads = [T.clip(g, -self.clip_grad, self.clip_grad) for g in mgrads]
        if self.opt == 'adam':
            self.updates = lasagne.updates.adam(
                cgrads, self.params, learning_rate=self.learning_rate)
        elif self.opt == 'momentum':
            self.updates = lasagne.updates.nesterov_momentum(
                cgrads, self.params, learning_rate=self.learning_rate)
        elif self.opt == 'sgd':
            self.updates = lasagne.updates.sgd(
                cgrads, self.params, learning_rate=self.learning_rate)

    #def _get_train_func(self):
        train = theano.function([
            self.input_var, self.target_var, self.dataset_size,
            self.learning_rate
        ],
                                self.loss,
                                updates=self.updates)
        self.train_func = train
        # DK - putting this here, because it doesn't get overwritten by subclasses
        self.monitor_func = theano.function([
            self.input_var, self.target_var, self.dataset_size,
            self.learning_rate
        ],
                                            self.monitored,
                                            on_unused_input='warn')

        #def _get_useful_funcs(self):
        self.predict_proba = theano.function([self.input_var], self.y)
        self.predict = theano.function([self.input_var], self.y.argmax(1))
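
stochastic_weight_norm is not shown in this example. The hypernet here outputs one scalar per convolutional filter / hidden unit / class, which matches the weight-norm reparameterization in which each output unit's weight vector is rescaled by a sampled gain,

W_{:,j} = g_j \, \frac{V_{:,j}}{\lVert V_{:,j} \rVert_2}, \qquad g \sim q(g),

with the direction V learned as an ordinary parameter and the gains g produced by the hypernetwork; this reading is an assumption based on the parameter counts above, not a restatement of stochastic_weight_norm itself.
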
Example #14
    def __init__(self,
                 n_hiddens,
                 n_units,
                 n_inputs=784,
                 dropout=False,
                 flow='IAF',
                 norm_type='WN',
                 coupling=0,
                 n_units_h=200,
                 static_bias=True,
                 prior=log_normal,
                 lbda=1,
                 srng=RandomStreams(seed=427),
                 max_norm=10,
                 clip_grad=5):
        """
        flow: 
            if None, then just regular MLE estimate of parameters
            flow can be `IAF` or `NVP` to approximate the rescaling 
            parameters (and shift) of Weightnorm or Batchnorm 
        
        coupling: 
            number of transformation layers using `IAF` or `RealNVP` if 
            flow is not None
            
        dropout:
            dropout layer after activation
        
        static_bias:
            if one wants the hyper net to output the shifting parameters
            of WN/BN of flow is not None
        
        """

        layer = lasagne.layers.InputLayer([None, n_inputs])

        self.n_hiddens = n_hiddens
        self.n_units = n_units
        self.weight_shapes = list()
        self.weight_shapes.append((n_inputs, n_units))
        for i in range(1, n_hiddens):
            self.weight_shapes.append((n_units, n_units))
        self.weight_shapes.append((n_units, 10))
        self.num_params = sum(ws[1] for ws in self.weight_shapes)
        self.flow = flow
        self.norm_type = norm_type
        self.coupling = coupling
        self.dropout = dropout
        self.static_bias = static_bias
        self.prior = prior
        self.lbda = lbda
        self.max_norm = max_norm
        self.clip_grad = clip_grad

        for j, ws in enumerate(self.weight_shapes):
            layer = lasagne.layers.DenseLayer(
                layer, ws[1], nonlinearity=lasagne.nonlinearities.rectify)
            if dropout:
                if j != len(self.weight_shapes) - 1:
                    layer = lasagne.layers.dropout(layer)

        layer.nonlinearity = lasagne.nonlinearities.softmax
        self.input_var = T.matrix('input_var')
        self.target_var = T.matrix('target_var')
        self.learning_rate = T.scalar('learning_rate')
        self.inputs = [self.input_var, self.target_var, self.learning_rate]

        self.layer = layer
        if flow is None:
            self.output_var = get_output(layer, self.input_var)
            self.output_var_det = get_output(layer,
                                             self.input_var,
                                             deterministic=True)

            losses = cc(self.output_var, self.target_var)
            self.loss = losses.mean()
            self.prints = []

        elif flow == 'IAF' or flow == 'RealNVP':

            self.dataset_size = T.scalar('dataset_size')
            self.beta = T.scalar('beta')  # anealing weight
            self.inputs = [
                self.input_var, self.target_var, self.dataset_size,
                self.learning_rate, self.beta
            ]

            copies = 1 if self.static_bias else 2
            hnet, ld, num_params = hypernet(layer,
                                            n_units_h,
                                            coupling,
                                            flow,
                                            copies=copies)
            static_bias = theano.shared(np.zeros(
                (num_params)).astype(floatX)) if self.static_bias else None

            ep = srng.normal(size=(1, num_params), dtype=floatX)

            output_var = N_get_output(layer,
                                      self.input_var,
                                      hnet,
                                      ep,
                                      norm_type=norm_type,
                                      static_bias=static_bias)
            weights = get_output(hnet, ep)
            logdets = get_output(ld, ep)

            self.num_params = num_params
            self.N_bias = static_bias
            self.hnet = hnet
            self.ep = ep
            self.output_var_ = output_var

            if norm_type == 'BN' and flow is not None:
                print 'BN test time uses running avg'
                #self.output_var = N_get_output(layer,
                #                               self.input_var,hnet,ep,
                #                               norm_type=norm_type,
                #                               static_bias=static_bias,
                #                               test_time=True)
                self.output_var = output_var
            else:
                self.output_var = output_var

            self.weights = weights
            self.logdets = logdets

            loss, prints = get_elbo(
                T.clip(output_var, 0.001, 0.999),  # stability
                self.target_var,
                self.weights,
                self.logdets,
                self.beta,
                self.dataset_size,
                prior=self.prior,
                lbda=self.lbda,
                output_type='categorical')
            self.loss = loss
            self.prints = prints



        self.params = lasagne.layers.get_all_params(self.layer) + \
                      lasagne.layers.get_all_params(self.hnet)
        if hasattr(self, 'N_bias'):
            if self.N_bias is not None:
                self.params.append(self.N_bias)

        self.grads = stable_grad(self.loss, self.params, self.clip_grad,
                                 self.max_norm)
        self.updates = lasagne.updates.adam(self.grads, self.params,
                                            self.learning_rate)

        print '\tgetting train_func'
        if len(self.inputs) == 3:
            self.train_func_ = theano.function(self.inputs, [
                self.loss,
            ] + self.prints,
                                               updates=self.updates)
            self.train_func = lambda x, y, n, lr, w: self.train_func_(x, y, lr)
        elif len(self.inputs) == 5:
            self.train_func = theano.function(self.inputs, [
                self.loss,
            ] + self.prints,
                                              updates=self.updates)

        print '\tgetting useful_funcs'
        self.predict_proba = theano.function([self.input_var], self.output_var)
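
When flow is not None, the compiled train function takes five inputs: input, target, dataset size, learning rate, and the annealing weight beta. A hypothetical call pattern with a linear KL warm-up, assuming an instance net of the class above and one-hot MNIST arrays train_x / train_y (the data loading and instance construction are not shown here):

    # hypothetical usage sketch
    n_epochs, bs, N = 10, 50, 50000
    for epoch in range(n_epochs):
        beta = min(1.0, (epoch + 1) / float(n_epochs))   # anneal the KL weight
        for i in range(0, N, bs):
            outs = net.train_func(train_x[i:i + bs], train_y[i:i + bs],
                                  N, 0.001, beta)
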