Example #1
import os
import argparse

# command-line options (the original example is truncated here; the parser
# setup and flag names below are reconstructed from how `args` is used further down)
parser = argparse.ArgumentParser()
parser.add_argument("--gpu",
                    type=int,
                    default=0,
                    help="ID of GPU device (if there are multiple)")
parser.add_argument("--group", help="suffix for the model output directories")
parser.add_argument("--model", help="file name under which to save the model")
args = parser.parse_args()

import tensorflow as tf
import data, graph, graphST, warp
from params import Params

print("=======================================================")
print("train.py (training on MNIST)")
print("=======================================================")

# load data
print("loading MNIST dataset...")
trainData, validData, testData = data.loadMNIST("data/MNIST.npz")

# set parameters
print("setting configurations...")
params = Params(args)

# create directories for model output
suffix = args.group
if not os.path.exists("models_{0}".format(suffix)):
    os.mkdir("models_{0}".format(suffix))
if not os.path.exists("models_{0}/interm".format(suffix)):
    os.mkdir("models_{0}/interm".format(suffix))
if not os.path.exists("models_{0}/final".format(suffix)):
    os.mkdir("models_{0}/final".format(suffix))
saveFname = args.model
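
The examples on this page all rely on a project-specific data.loadMNIST whose implementation is not shown (and whose signature differs between projects). As a rough, hypothetical sketch of what such a loader could look like for an .npz archive with one array per split, where the key names "train", "valid" and "test" are assumptions for illustration only:

import numpy as np

def loadMNIST_sketch(path):
    # hypothetical loader: assumes one array per split stored in the archive
    with np.load(path) as archive:
        return archive["train"], archive["valid"], archive["test"]

# trainData, validData, testData = loadMNIST_sketch("data/MNIST.npz")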
Example #2
import argparse

import options
print("setting configurations...")
opt = options.set()

import tensorflow as tf
import data, graph, warp, util

print("=======================================================")
print("train.py (training on MNIST)")
print("=======================================================")

# load data
print("loading MNIST dataset...")
trainData, validData, testData = data.loadMNIST("data/MNIST.npz")

# create directories for model output
util.mkdir("models_{0}".format(opt.group))
util.mkdir("models_{0}/interm".format(opt.group))
util.mkdir("models_{0}/final".format(opt.group))

print("training model {0}...".format(opt.model))
print("------------------------------------------")
print("warpScale: (pert) {0} (trans) {1}".format(opt.warpScale["pert"],opt.warpScale["trans"]))
print("warpType: {0}".format(opt.warpType))
print("batchSize: {0}".format(opt.batchSize))
print("GPU device: {0}".format(opt.gpu))
print("------------------------------------------")

tf.reset_default_graph()
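
Example #2 ends right after tf.reset_default_graph(). In the TF1-style workflow used here, what usually follows is building the graph and running it in a session; the skeleton below is an illustrative stand-in (a single dense layer on flattened 28x28 images with 10 classes), not the model that the project's graph and warp modules actually build:

# illustrative TF1 skeleton; the real network comes from graph/warp
image = tf.placeholder(tf.float32, shape=[None, 784], name="image")
label = tf.placeholder(tf.int64, shape=[None], name="label")
logits = tf.layers.dense(image, 10)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logits))
train_op = tf.train.GradientDescentOptimizer(1e-2).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # each step feeds a minibatch taken from trainData:
    # sess.run(train_op, feed_dict={image: batch_images, label: batch_labels})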
Example #3
def main():
    
    # Main setup
    
    latent_sizes = [2, 5, 10, 20, 30, 50, 100]
    downsampling_factors = [1, 2, 4]
    N_epochs = 50
    binarise_downsampling = False
    bernoulli_sampling = True
    
    # Setup
    
    C = 1 # number of channels in image
    H = 28 # height of image
    W = 28 # width of image
    # K = 10 # number of classes
    
    hidden_sizes = [200, 200]
    
    batch_size = 100
    
    analytic_kl_term = True
    learning_rate = 0.001 #0.0003
    
    shape = [H * W * C]
    
    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')
    
    # Fix random seed for reproducibility
    numpy.random.seed(1234)
    
    # Data
    
    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)
    
    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)
    
    X_train = numpy.concatenate([X_train, X_valid])
    
    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)

    N_train_batches = X_train.shape[0] // batch_size
    N_test_batches = X_test.shape[0] // batch_size
    
    if bernoulli_sampling:
        preprocess = bernoullisample
    else:
        preprocess = numpy.round

    # Setup shared variables
    X_train_shared = theano.shared(preprocess(X_train), borrow = True)
    X_test_shared = theano.shared(preprocess(X_test), borrow = True)
    X_test_shared_fixed = theano.shared(numpy.round(X_test), borrow = True)
    X_test_shared_normal = theano.shared(X_test, borrow = True)
    
    all_runs_duration = 0
    
    for latent_size, downsampling_factor in product(latent_sizes, downsampling_factors):
        
        run_start = time.time()
        
        print("Training model with a latent size of {} and images downsampled by {}:\n".format(latent_size, downsampling_factor))
        
        # Models
    
        h = H // downsampling_factor
        w = W // downsampling_factor
    
        ## Recognition model q(z|x)
    
        l_enc_HR_in = InputLayer((None, H * W * C), name = "ENC_HR_INPUT")
    
        l_enc_HR_downsample = l_enc_HR_in
    
        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
        if downsampling_factor != 1:
            l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample, pool_size = downsampling_factor, mode = "average_exc_pad")
            # TODO Should downsampled data be binarised? (worse performance)
            if binarise_downsampling:
                l_enc_HR_downsample = NonlinearityLayer(l_enc_HR_downsample, nonlinearity = T.round)
        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, h * w * C))
    
        l_enc_LR_in = InputLayer((None, h * w * C), name = "ENC_LR_INPUT")
    
        l_enc = l_enc_LR_in
    
        for i, hidden_size in enumerate(hidden_sizes, start = 1):
            l_enc = DenseLayer(l_enc, num_units = hidden_size, nonlinearity = softplus, name = 'ENC_DENSE{:d}'.format(i))
    
        l_z_mu = DenseLayer(l_enc, num_units = latent_size, nonlinearity = identity, name = 'ENC_Z_MU')
        l_z_log_var = DenseLayer(l_enc, num_units = latent_size, nonlinearity = identity, name = 'ENC_Z_LOG_VAR')
    
        # Sample the latent variables using mu(x) and log(sigma^2(x))
        l_z = SimpleSampleLayer(mean = l_z_mu, log_var = l_z_log_var) # as Kingma
        # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)
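        # SimpleSampleLayer applies the reparameterisation trick,
        # z = mu + exp(0.5 * log_var) * eps with eps ~ N(0, I), so the
        # sampling step stays differentiable with respect to the encoder.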

        ## Generative model p(x|z)
    
        l_dec_in = InputLayer((None, latent_size), name = "DEC_INPUT")
    
        l_dec = l_dec_in
    
        for i, hidden_size in enumerate_reversed(hidden_sizes, start = 0):
            l_dec = DenseLayer(l_dec, num_units = hidden_size, nonlinearity = softplus, name = 'DEC_DENSE{:d}'.format(i))
    
        l_dec_x_mu = DenseLayer(l_dec, num_units = H * W * C, nonlinearity = sigmoid, name = 'DEC_X_MU')
        l_dec_x_log_var = DenseLayer(l_dec, num_units = H * W * C, nonlinearity = sigmoid, name = 'DEC_X_LOG_VAR')
    
        # TRY relu instead of softplus (maybe with more hidden units)
        # TRY softmax instead of sigmoid
        # PROBLEM with this is that we have several pixels activated.

        ## Get outputs from models
    
        # With noise
        x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = False)
        z_train, z_mu_train, z_log_var_train = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train}, deterministic = False
        )
        x_mu_train, x_log_var_train = get_output([l_dec_x_mu, l_dec_x_log_var], {l_dec_in: z_train}, deterministic = False)
    
        # Without noise
        x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = True)
        z_eval, z_mu_eval, z_log_var_eval = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval}, deterministic = True
        )
        x_mu_eval, x_log_var_eval = get_output([l_dec_x_mu, l_dec_x_log_var], {l_dec_in: z_eval}, deterministic = True)
    
        # Sampling
        x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z},
            deterministic = True)
        
        # Likelihood
        
        # Calculate the loglikelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x)]
        def log_likelihood(z, z_mu, z_log_var, x_mu, x_log_var, x, analytic_kl_term):
            if analytic_kl_term:
                kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis = 1)
                log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(-kl_term + log_px_given_z)
            else:
                log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis = 1)
                log_pz = log_stdnormal(z).sum(axis = 1)
                log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
            return LL
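        # Both branches estimate the same evidence lower bound,
        #     ELBO = E_q[log p(x|z)] - KL(q(z|x) || p(z))
        #          = E_q[log p(x|z) + log p(z) - log q(z|x)],
        # either with the closed-form Gaussian KL term or with a
        # single-sample Monte Carlo estimate of all three terms.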

        # log-likelihood for training
        ll_train = log_likelihood(
            z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train, symbolic_x_HR, analytic_kl_term)

        # log-likelihood for evaluating
        ll_eval = log_likelihood(
            z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval, symbolic_x_HR, analytic_kl_term)
    
        # Parameters to train
        parameters = get_all_params([l_z, l_dec_x_mu], trainable = True)
        # parameters = get_all_params([l_z, l_dec_x_mu, l_dec_x_log_var], trainable = True)
        print("Parameters that will be trained:")
        for parameter in parameters:
            print("{}: {}".format(parameter, parameter.get_value().shape))

        ### Take gradient of negative log-likelihood
        gradients = T.grad(-ll_train, parameters)

        # Adding gradient clipping to reduce the effects of exploding gradients,
        # and hence speed up convergence
        gradient_clipping = 1
        gradient_norm_max = 5
        gradient_constrained = updates.total_norm_constraint(gradients,
            max_norm = gradient_norm_max)
        gradients_clipped = [T.clip(g,-gradient_clipping, gradient_clipping) for g in gradient_constrained]
    
        # Setting up functions for training
    
        symbolic_batch_index = T.iscalar('index')
        batch_slice = slice(symbolic_batch_index * batch_size, (symbolic_batch_index + 1) * batch_size)
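        # The symbolic batch index together with `givens` lets each compiled
        # function slice its minibatch directly out of the shared dataset,
        # so no per-batch transfer from host memory is needed.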

        update_expressions = updates.adam(gradients_clipped, parameters,
            learning_rate = symbolic_learning_rate)

        train_model = theano.function(
            [symbolic_batch_index, symbolic_learning_rate], ll_train,
            updates = update_expressions, givens = {symbolic_x_HR: X_train_shared[batch_slice]}
        )

        test_model = theano.function(
            [symbolic_batch_index], ll_eval,
            givens = {symbolic_x_HR: X_test_shared[batch_slice]}
        )
    
        test_model_fixed = theano.function(
            [symbolic_batch_index], ll_eval,
            givens = {symbolic_x_HR: X_test_shared_fixed[batch_slice]}
        )
    
        def train_epoch(learning_rate):
            costs = []
            for i in range(N_train_batches):
                cost_batch = train_model(i, learning_rate)
                costs += [cost_batch]
            return numpy.mean(costs)
    
        def test_epoch():
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model(i)
                costs += [cost_batch]
            return numpy.mean(costs)
    
        def test_epoch_fixed():
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model_fixed(i)
                costs += [cost_batch]
            return numpy.mean(costs)
    
        # Training
    
        epochs = []
        cost_train = []
        cost_test = []
    
        print("")

        for epoch in range(N_epochs):
        
            epoch_start = time.time()
        
            # Shuffle train data
            numpy.random.shuffle(X_train)
            X_train_shared.set_value(preprocess(X_train))
        
            # TODO: Using dynamically changed learning rate
            train_cost = train_epoch(learning_rate)
            test_cost = test_epoch()
            test_cost_fixed = test_epoch_fixed()
        
            epoch_duration = time.time() - epoch_start
        
            epochs.append(epoch + 1)
            cost_train.append(train_cost)
            cost_test.append(test_cost)
        
            # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % ( epoch, t, learning_rate, train_cost, test_cost)
            print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(epoch + 1, epoch_duration, learning_rate))
            print("    log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(train_cost, test_cost))
        
        print("")
        
        # Results
    
        ## Reconstruction
    
        N_reconstructions = 50
    
        X_test_eval = X_test_shared.eval()
        X_test_eval_fixed = X_test_shared_fixed.eval()
        X_test_eval_normal = X_test_shared_normal.eval()
        
        subset = numpy.random.randint(0, len(X_test_eval), size = N_reconstructions)
    
        x_original = X_test_eval[numpy.array(subset)]
        x_LR = get_output(l_enc_HR_downsample, x_original).eval()
        z = get_output(l_z, x_LR).eval()
        x_reconstructed = x_mu_sample.eval({symbolic_z: z})
    
        x_original_fixed = X_test_eval_fixed[numpy.array(subset)]
        x_LR_fixed = get_output(l_enc_HR_downsample, x_original_fixed).eval()
        z_fixed = get_output(l_z, x_LR_fixed).eval()
        x_reconstructed_fixed = x_mu_sample.eval({symbolic_z: z_fixed})
        
        originals = X_test_eval_normal[numpy.array(subset)]
        
        reconstructions = {
            "originals": x_original,
            "downsampled":  x_LR,
            "reconstructions": x_reconstructed
        }
        
        reconstructions_fixed = {
            "originals": x_original_fixed,
            "downsampled":  x_LR_fixed,
            "reconstructions": x_reconstructed_fixed
        }
        
        ## Manifold
        
        if latent_size == 2:
        
            x = numpy.linspace(0.1, 0.9, 20)
            # TODO: Ideally sample from the real p(z)
            v = gaussian.ppf(x)
            z = numpy.zeros((20**2, 2))
        
            i = 0
            for a in v:
                for b in v:
                    z[i,0] = a
                    z[i,1] = b
                    i += 1
            z = z.astype('float32')
        
            samples = x_mu_sample.eval({symbolic_z: z})
    
        else:
            samples = None
    
        ## Reconstructions of homemade numbers
    
        if downsampling_factor == 2:
        
            file_names = [
                "hm_7_Avenir.png",
                "hm_7_Noteworthy.png",
                "hm_7_Chalkboard.png",
                "hm_7_drawn.png",
                "hm_A_Noteworthy.png",
                "hm_A_drawn.png",
                "hm_7_0.txt",
                "hm_7_1.txt",
                "hm_7_2.txt",
                "hm_A.txt"
            ]
        
            x_LR_HM = data.loadHomemade(list(map(data_path, file_names)), [h * w])
        
            z = get_output(l_z, x_LR_HM).eval()
            x_HM_reconstructed = x_mu_sample.eval({symbolic_z: z})
    
            reconstructions_homemade = {
                "originals": x_LR_HM,
                "reconstructions": x_HM_reconstructed
            }
    
        else:
            reconstructions_homemade = None
    
        # Saving
    
        setup_and_results = {
            "setup": {
                "image size": (C, H, W),
                "downsampling factor": downsampling_factor,
                "learning rate": learning_rate,
                "analytic K-L term": analytic_kl_term,
                "batch size": batch_size,
                "hidden layer sizes": hidden_sizes,
                "latent size": latent_size,
                "number of epochs": N_epochs
            },
            "results": {
                "learning curve": {
                    "epochs": epochs,
                    "training cost function": cost_train,
                    "test cost function": cost_test
                },
                "originals": originals,
                "reconstructions": reconstructions,
                "reconstructions (fixed)": reconstructions_fixed,
                "manifold": {
                    "samples": samples
                },
                "reconstructed homemade numbers": reconstructions_homemade
            }
        }
        
        file_name = "results{}_ds{}{}_l{}_e{}.pkl".format("_bs" if bernoulli_sampling else "", downsampling_factor, "b" if binarise_downsampling else "", latent_size, N_epochs)
    
        # pickle files should be written in binary mode
        with open(data_path(file_name), "wb") as f:
            pickle.dump(setup_and_results, f)
        
        run_duration = time.time() - run_start
        
        all_runs_duration += run_duration
        
        print("Run took {:.2f} minutes.".format(run_duration / 60))
        
        print("\n")
    
    print("All runs took {:.2f} minutes in total.".format(all_runs_duration / 60))
Example #4
        classifier = graph.CNN(opt)
    # ------ define loss ------
    loss = torch.nn.CrossEntropyLoss()
    # ------ optimizer ------
    optimList = [{
        "params": geometric.parameters(),
        "lr": opt.lrGP
    }, {
        "params": classifier.parameters(),
        "lr": opt.lrC
    }]
    optim = torch.optim.SGD(optimList)
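    # two parameter groups so that the geometric predictor and the classifier
    # are trained by the same SGD optimizer with different learning rates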

# load data
print(util.toMagenta("loading MNIST dataset..."))
trainData, testData = data.loadMNIST(opt, "data")

# visdom visualizer
vis = util.Visdom(opt)

print(util.toYellow("======= TRAINING START ======="))
timeStart = time.time()
# start session
with torch.cuda.device(0):
    geometric.train()
    classifier.train()
    if opt.fromIt != 0:
        util.restoreModel(opt, geometric, classifier, opt.fromIt)
        print(
            util.toMagenta("resuming from iteration {0}...".format(
                opt.fromIt)))
def main():
    
    # TODO Make this work better.
    # See https://swarbrickjones.wordpress.com/2015/04/29/convolutional-autoencoders-in-pythontheanolasagne/.
    
    # Setup
    
    C = 1 # number of channels in image
    H = 28 # height of image
    W = 28 # width of image
    # K = 10 # number of classes
    
    shape = [C * H * W]
    
    padding_size = 2
    
    downsampling_factor = 2
    
    # Dense layers
    hidden_sizes = [200, 200]
    latent_size = 2
    
    # Convolutional layers
    filters = [{"number": 16, "size": 3, "stride": 1}]
    
    batch_size = 100
    
    analytic_kl_term = True
    learning_rate = 0.01
    
    N_epochs = 10 # 1000
    
    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')
    
    # Fix random seed for reproducibility
    numpy.random.seed(1234)
    
    # Data
    
    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)
    
    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)
    
    X_train = numpy.concatenate([X_train, X_valid])
    
    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)

    N_train_batches = X_train.shape[0] // batch_size
    N_test_batches = X_test.shape[0] // batch_size

    # Setup shared variables
    X_train_shared = theano.shared(X_train, borrow = True)
    X_test_shared = theano.shared(X_test, borrow = True)
    
    # Models
    
    ## Recognition model q(z|x)
    
    pool_size = 2
    
    l_enc_HR_in = InputLayer((None, C * H * W), name = "ENC_HR_INPUT")
    
    l_enc_HR_downsample = l_enc_HR_in
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
    l_enc_HR_downsample = PadLayer(l_enc_HR_downsample, width = padding_size)
    l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample, pool_size = downsampling_factor, mode = "average_exc_pad")
    _, _, h, w = l_enc_HR_downsample.output_shape
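    # with H = W = 28, padding of 2 on each side and 2x average pooling,
    # the low-resolution grid ends up at h = w = (28 + 2 * 2) / 2 = 16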
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C * h * w))
    
    l_enc_LR_in = InputLayer((None, C * h * w), name = "ENC_LR_INPUT")
    
    l_enc = l_enc_LR_in
    l_enc = ReshapeLayer(l_enc, (-1, C, h, w))
    for i, filter_ in enumerate(filters):
        l_enc = Conv2DLayer(l_enc, filter_["number"], filter_["size"], filter_["stride"], pad = "same", nonlinearity = rectify, name = 'ENC_CONV_{:d}'.format(i))
    # l_enc = Pool2DLayer(l_enc, pool_size)
    
    l_z_mu = DenseLayer(l_enc, num_units = latent_size, nonlinearity = None, name = 'ENC_Z_MU')
    l_z_log_var = DenseLayer(l_enc, num_units = latent_size, nonlinearity = None, name = 'ENC_Z_LOG_VAR')
    
    # Sample the latent variables using mu(x) and log(sigma^2(x))
    l_z = SimpleSampleLayer(mean = l_z_mu, log_var = l_z_log_var) # as Kingma
    # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)

    ## Generative model p(x|z)
    
    l_dec_in = InputLayer((None, latent_size), name = "DEC_INPUT")
    
    l_dec = DenseLayer(l_dec_in, num_units = C * H * W, nonlinearity = rectify, name = "DEC_DENSE")
    l_dec = ReshapeLayer(l_dec, (-1, C, H, W))
    for i, filter_ in enumerate_reversed(filters, start = 0):
        if filter_["stride"] == 1:
            l_dec = Conv2DLayer(l_dec, filter_["number"], filter_["size"], filter_["stride"], pad = "same", nonlinearity = rectify, name = 'DEC_CONV_{:d}'.format(i))
        else:
            l_dec = Deconv2DLayer(l_dec, filter_["number"], filter_["size"], filter_["stride"], nonlinearity = rectify, name = 'DEC_CONV_{:d}'.format(i))
    
    l_dec_x_mu = Conv2DLayer(l_dec, num_filters = C, filter_size = (3, 3), stride = 1, pad  = 'same', nonlinearity = None, name = 'DEC_X_MU')
    l_dec_x_mu = ReshapeLayer(l_dec_x_mu, (-1, C * H * W))
    
    ## Get outputs from models
    
    # With noise
    x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = False)
    z_train, z_mu_train, z_log_var_train = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train}, deterministic = False
    )
    x_mu_train = get_output(l_dec_x_mu, {l_dec_in: z_train}, deterministic = False)

    # Without noise
    x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic = True)
    z_eval, z_mu_eval, z_log_var_eval = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval}, deterministic = True
    )
    x_mu_eval = get_output(l_dec_x_mu, {l_dec_in: z_eval}, deterministic = True)
    
    # Sampling
    x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z}, deterministic = True)
    
    # Likelihood
    
    # Calculate the loglikelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x)]
    def log_likelihood(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
        if analytic_kl_term:
            kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis = 1)
            log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
            LL = T.mean(-kl_term + log_px_given_z)
        else:
            log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis = 1)
            log_pz = log_stdnormal(z).sum(axis = 1)
            log_px_given_z = log_bernoulli(x, x_mu,  eps = 1e-6).sum(axis = 1)
            LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
        return LL

    # log-likelihood for training
    ll_train = log_likelihood(
        z_train, z_mu_train, z_log_var_train, x_mu_train, symbolic_x_HR, analytic_kl_term)

    # log-likelihood for evaluating
    ll_eval = log_likelihood(
        z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, symbolic_x_HR, analytic_kl_term)
    
    # Parameters to train
    parameters = get_all_params([l_z_mu, l_dec_x_mu], trainable = True)
    print("Parameters that will be trained:")
    for parameter in parameters:
        print("{}: {}".format(parameter, parameter.get_value().shape))

    ### Take gradient of negative log-likelihood
    gradients = T.grad(-ll_train, parameters)

    # Adding gradient clipping to reduce the effects of exploding gradients,
    # and hence speed up convergence
    gradient_clipping = 1
    gradient_norm_max = 5
    gradient_constrained = updates.total_norm_constraint(gradients,
        max_norm = gradient_norm_max)
    gradients_clipped = [T.clip(g,-gradient_clipping, gradient_clipping) for g in gradient_constrained]
    
    # Setting up functions for training
    
    symbolic_batch_index = T.iscalar('index')
    batch_slice = slice(symbolic_batch_index * batch_size, (symbolic_batch_index + 1) * batch_size)

    update_expressions = updates.adam(gradients_clipped, parameters,
        learning_rate = symbolic_learning_rate)

    train_model = theano.function(
        [symbolic_batch_index, symbolic_learning_rate], ll_train,
        updates = update_expressions, givens = {symbolic_x_HR: X_train_shared[batch_slice]}
    )

    test_model = theano.function(
        [symbolic_batch_index], ll_eval,
        givens = {symbolic_x_HR: X_test_shared[batch_slice]}
    )
    
    def train_epoch(learning_rate):
        costs = []
        for i in range(N_train_batches):
            cost_batch = train_model(i, learning_rate)
            costs += [cost_batch]
        return numpy.mean(costs)
    
    def test_epoch():
        costs = []
        for i in range(N_test_batches):
            cost_batch = test_model(i)
            costs += [cost_batch]
        return numpy.mean(costs)
    
    # Training
    
    epochs = []
    cost_train = []
    cost_test = []

    for epoch in range(N_epochs):
        
        start = time.time()
        
        # Shuffle train data
        numpy.random.shuffle(X_train)
        X_train_shared.set_value(X_train)
        
        train_cost = train_epoch(learning_rate)
        test_cost = test_epoch()
        
        duration = time.time() - start
        
        epochs.append(epoch)
        cost_train.append(train_cost)
        cost_test.append(test_cost)
        
        # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % ( epoch, t, learning_rate, train_cost, test_cost)
        print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(epoch + 1, duration, learning_rate))
        print("    log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(train_cost, test_cost))