def test_mogsm(self):
    mcgsm = MCGSM(
        dim_in=0,
        dim_out=3,
        num_components=2,
        num_scales=2,
        num_features=0)

    p0 = 0.3
    p1 = 0.7
    N = 20000
    m0 = array([[2], [0], [0]])
    m1 = array([[0], [2], [1]])
    C0 = cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))
    C1 = cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))

    input = zeros([0, N])
    output = hstack([
        dot(cholesky(C0), randn(mcgsm.dim_out, int(round(p0 * N)))) + m0,
        dot(cholesky(C1), randn(mcgsm.dim_out, int(round(p1 * N)))) + m1,
    ]) * (rand(1, N) + 0.5)

    mcgsm.train(input, output, parameters={
        'verbosity': 0,
        'max_iter': 10,
        'train_means': True})

    mogsm = MoGSM(3, 2, 2)

    # translate parameters from MCGSM to MoGSM
    mogsm.priors = sum(exp(mcgsm.priors), 1) / sum(exp(mcgsm.priors))

    for k in range(mogsm.num_components):
        mogsm[k].mean = mcgsm.means[:, k]
        mogsm[k].covariance = inv(dot(mcgsm.cholesky_factors[k], mcgsm.cholesky_factors[k].T))
        mogsm[k].scales = exp(mcgsm.scales[k, :])
        mogsm[k].priors = exp(mcgsm.priors[k, :]) / sum(exp(mcgsm.priors[k, :]))

    self.assertAlmostEqual(mcgsm.evaluate(input, output), mogsm.evaluate(output), 5)

    mogsm_samples = mogsm.sample(N)
    mcgsm_samples = mcgsm.sample(input)

    # samples generated by the two models should have the same marginal
    # distribution in each dimension; compare the p-value of a two-sample
    # KS test (ks_2samp returns a (statistic, p-value) tuple)
    for i in range(mogsm.dim):
        self.assertGreater(ks_2samp(mogsm_samples[i], mcgsm_samples[i])[1], 0.0001)

    posterior = mcgsm.posterior(input, mcgsm_samples)

    # average posterior should correspond to prior
    for k in range(mogsm.num_components):
        self.assertLess(abs(1 - mean(posterior[k]) / mogsm.priors[k]), 0.1)
def train_model(img, input_mask, output_mask):
    # generate data
    inputs, outputs = generate_data_from_image(
        img, input_mask, output_mask, 120000)

    # split data into training and validation sets
    data_train = inputs[:, :100000], outputs[:, :100000]
    data_valid = inputs[:, 100000:], outputs[:, 100000:]

    # compute normalizing transformation
    pre = WhiteningPreconditioner(*data_train)

    # initialize model
    model = MCGSM(
        dim_in=data_train[0].shape[0],
        dim_out=data_train[1].shape[0],
        num_components=8,
        num_scales=4,
        num_features=30)

    # fit parameters
    model.initialize(*pre(*data_train))
    model.train(*chain(pre(*data_train), pre(*data_valid)),
        parameters={
            'verbosity': 1,
            'max_iter': 1000,
            'threshold': 1e-7,
            'val_iter': 5,
            'val_look_ahead': 10,
            'num_grad': 20,
        })

    return model, pre
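# Hypothetical usage sketch for train_model (not part of the original
# pipeline): the random image is a stand-in for real data; generate_masks
# and generate_data_from_image are the helpers used elsewhere in this code.
def example_train_model():
    img = randn(256, 256)  # placeholder for a grayscale image
    input_mask, output_mask = generate_masks([5])  # causal neighborhood masks

    model, pre = train_model(img, input_mask, output_mask)

    # negative average log-likelihood in bits per pixel (lower is better)
    inputs, outputs = generate_data_from_image(img, input_mask, output_mask, 10000)
    print 'Cross-entropy: {0:.4f} [bit/px]'.format(model.evaluate(inputs, outputs, pre))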
def test_pickle(self):
    mcgsm0 = MCGSM(11, 2, 4, 7, 21)
    mcgsm0.linear_features = randn(mcgsm0.num_components, mcgsm0.dim_in)
    mcgsm0.means = randn(mcgsm0.dim_out, mcgsm0.num_components)

    tmp_file = mkstemp()[1]

    # store model
    with open(tmp_file, 'w') as handle:
        dump({'mcgsm': mcgsm0}, handle)

    # load model
    with open(tmp_file) as handle:
        mcgsm1 = load(handle)['mcgsm']

    # make sure parameters haven't changed
    self.assertEqual(mcgsm0.dim_in, mcgsm1.dim_in)
    self.assertEqual(mcgsm0.dim_out, mcgsm1.dim_out)
    self.assertEqual(mcgsm0.num_components, mcgsm1.num_components)
    self.assertEqual(mcgsm0.num_scales, mcgsm1.num_scales)
    self.assertEqual(mcgsm0.num_features, mcgsm1.num_features)

    self.assertLess(max(abs(mcgsm0.scales - mcgsm1.scales)), 1e-20)
    self.assertLess(max(abs(mcgsm0.weights - mcgsm1.weights)), 1e-20)
    self.assertLess(max(abs(mcgsm0.features - mcgsm1.features)), 1e-20)
    self.assertLess(max(abs(mcgsm0.linear_features - mcgsm1.linear_features)), 1e-20)
    self.assertLess(max(abs(mcgsm0.means - mcgsm1.means)), 1e-20)

    for chol0, chol1 in zip(mcgsm0.cholesky_factors, mcgsm1.cholesky_factors):
        self.assertLess(max(abs(chol0 - chol1)), 1e-20)

    for pred0, pred1 in zip(mcgsm0.predictors, mcgsm1.predictors):
        self.assertLess(max(abs(pred0 - pred1)), 1e-20)
def test_basics(self):
    dim_in = 10
    dim_out = 3
    num_components = 7
    num_scales = 5
    num_features = 50
    num_samples = 100

    # create model
    mcgsm = MCGSM(dim_in, dim_out, num_components, num_scales, num_features)

    # generate output
    input = randn(dim_in, num_samples)
    output = mcgsm.sample(input)
    loglik = mcgsm.loglikelihood(input, output)
    post = mcgsm.posterior(input, output)
    samples = mcgsm.sample_posterior(input, output)

    # check hyperparameters
    self.assertEqual(mcgsm.dim_in, dim_in)
    self.assertEqual(mcgsm.dim_out, dim_out)
    self.assertEqual(mcgsm.num_components, num_components)
    self.assertEqual(mcgsm.num_scales, num_scales)
    self.assertEqual(mcgsm.num_features, num_features)

    # check parameters
    self.assertEqual(mcgsm.priors.shape[0], num_components)
    self.assertEqual(mcgsm.priors.shape[1], num_scales)
    self.assertEqual(mcgsm.scales.shape[0], num_components)
    self.assertEqual(mcgsm.scales.shape[1], num_scales)
    self.assertEqual(mcgsm.weights.shape[0], num_components)
    self.assertEqual(mcgsm.weights.shape[1], num_features)
    self.assertEqual(mcgsm.features.shape[0], dim_in)
    self.assertEqual(mcgsm.features.shape[1], num_features)
    self.assertEqual(len(mcgsm.cholesky_factors), num_components)
    self.assertEqual(len(mcgsm.predictors), num_components)
    self.assertEqual(mcgsm.cholesky_factors[0].shape[0], dim_out)
    self.assertEqual(mcgsm.cholesky_factors[0].shape[1], dim_out)
    self.assertEqual(mcgsm.predictors[0].shape[0], dim_out)
    self.assertEqual(mcgsm.predictors[0].shape[1], dim_in)
    self.assertEqual(mcgsm.linear_features.shape[0], num_components)
    self.assertEqual(mcgsm.linear_features.shape[1], dim_in)
    self.assertEqual(mcgsm.means.shape[0], dim_out)
    self.assertEqual(mcgsm.means.shape[1], num_components)

    # check dimensionality of output
    self.assertEqual(output.shape[0], dim_out)
    self.assertEqual(output.shape[1], num_samples)
    self.assertEqual(loglik.shape[0], 1)
    self.assertEqual(loglik.shape[1], num_samples)
    self.assertEqual(post.shape[0], num_components)
    self.assertEqual(post.shape[1], num_samples)

    # posterior samples are component labels
    self.assertLess(max(samples), mcgsm.num_components)
    self.assertGreaterEqual(min(samples), 0)
    self.assertEqual(samples.shape[0], 1)
    self.assertEqual(samples.shape[1], num_samples)
def test_train(self):
    mcgsm = MCGSM(8, 3, 4, 2, 20)

    priors = mcgsm.priors
    scales = mcgsm.scales
    weights = mcgsm.weights
    features = mcgsm.features
    predictor = mcgsm.predictors[0]

    mcgsm.train(
        randn(mcgsm.dim_in, 20000),
        randn(mcgsm.dim_out, 20000),
        parameters={
            'verbosity': 0,
            'max_iter': 0,
        })

    # this should raise errors
    self.assertRaises(RuntimeError, mcgsm.train,
        randn(mcgsm.dim_in - 1, 2000), randn(1, 2000))
    self.assertRaises(RuntimeError, mcgsm.train,
        randn(mcgsm.dim_in - 1, 2000), randn(2000))
    self.assertRaises(RuntimeError, mcgsm.train,
        randn(mcgsm.dim_in - 1, 2000), randn(mcgsm.dim_out, 2000),
        randn(mcgsm.dim_in - 1, 1000), randn(mcgsm.dim_out, 1000))

    # parameters should not have changed
    self.assertLess(max(abs(mcgsm.priors - priors)), 1e-20)
    self.assertLess(max(abs(mcgsm.scales - scales)), 1e-20)
    self.assertLess(max(abs(mcgsm.weights - weights)), 1e-20)
    self.assertLess(max(abs(mcgsm.features - features)), 1e-20)
    self.assertLess(max(abs(mcgsm.predictors[0] - predictor)), 1e-20)

    count = []

    def callback(i, mcgsm):
        count.append(i)

    max_iter = 10
    cb_iter = 2

    # make sure training doesn't throw any errors
    mcgsm.train(
        randn(mcgsm.dim_in, 10000),
        randn(mcgsm.dim_out, 10000),
        parameters={
            'verbosity': 0,
            'max_iter': max_iter,
            'threshold': 0.,
            'batch_size': 1999,
            'callback': callback,
            'cb_iter': cb_iter,
        })

    # test callback
    self.assertEqual(count, range(cb_iter, max_iter + 1, cb_iter))
def test_sample(self):
    mcgsm = MCGSM(1, 1, 1, 1, 1)
    mcgsm.scales = [[0.0]]
    mcgsm.predictors = [[0.0]]

    samples = mcgsm.sample(zeros([1, 10000])).flatten()

    p = kstest(samples, lambda x: norm.cdf(x, scale=1.0))[1]

    # make sure Gaussian random number generation works
    self.assertTrue(p > 0.0001)
def test_evaluate(self):
    mcgsm = MCGSM(5, 3, 4, 2, 10)

    inputs = randn(mcgsm.dim_in, 100)
    outputs = mcgsm.sample(inputs)

    pre = WhiteningPreconditioner(inputs, outputs)

    loglik1 = -mcgsm.evaluate(inputs, outputs, pre)
    loglik2 = (mcgsm.loglikelihood(*pre(inputs, outputs)).mean()
        + pre.logjacobian(inputs, outputs).mean()) / log(2.) / mcgsm.dim_out

    self.assertAlmostEqual(loglik1, loglik2, 8)
def test_conditional_loglikelihood(self):
    mcgsm = MCGSM(3, 1, 2, 1, 4)
    mcgsm.linear_features = randn(mcgsm.num_components, mcgsm.dim_in) / 5.0
    mcgsm.means = randn(mcgsm.dim_out, mcgsm.num_components) / 5.0

    M = 100
    inputs = randn(mcgsm.dim_in, M)
    outputs = mcgsm.sample(inputs)

    loglik0 = mcgsm.loglikelihood(inputs, outputs)
    loglik1 = []

    N = 1000

    # estimate log-likelihood via sampling
    for _ in range(N):
        labels = mcgsm.sample_prior(inputs)
        loglik1.append(mcgsm.loglikelihood(inputs, outputs, labels))

    loglik1 = vstack(loglik1)

    d = abs(logmeanexp(loglik1, 0) - loglik0).ravel()
    s = std(loglik1, 0, ddof=1).ravel()

    for i in range(M):
        self.assertLess(d[i], 6.0 * s[i] / sqrt(N))
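# The test above relies on a logmeanexp utility to average likelihoods in
# log-space. A minimal sketch of such a helper in case one wants the test
# self-contained (the package's own implementation may differ):
from numpy import asarray, squeeze

def logmeanexp(x, ax=None):
    # numerically stable log(mean(exp(x))) along axis `ax`
    x = asarray(x)
    x_max = x.max(axis=ax, keepdims=True)
    return squeeze(x_max + log(mean(exp(x - x_max), axis=ax, keepdims=True)), axis=ax)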
def test_gradient(self):
    mcgsm = MCGSM(5, 2, 2, 4, 10)

    cholesky_factors = []
    for k in range(mcgsm.num_components):
        cholesky_factors.append(cholesky(cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))))
    mcgsm.cholesky_factors = cholesky_factors

    mcgsm.linear_features = randn(mcgsm.num_components, mcgsm.dim_in) / 5.
    mcgsm.means = randn(mcgsm.dim_out, mcgsm.num_components) / 5.

    err = mcgsm._check_gradient(
        randn(mcgsm.dim_in, 1000),
        randn(mcgsm.dim_out, 1000), 1e-5)
    self.assertLess(err, 1e-8)

    # without regularization
    for param in ['priors', 'scales', 'weights', 'features', 'chol', 'pred', 'linear_features', 'means']:
        err = mcgsm._check_gradient(
            randn(mcgsm.dim_in, 1000),
            randn(mcgsm.dim_out, 1000),
            1e-5,
            parameters={
                'train_prior': param == 'priors',
                'train_scales': param == 'scales',
                'train_weights': param == 'weights',
                'train_features': param == 'features',
                'train_cholesky_factors': param == 'chol',
                'train_predictors': param == 'pred',
                'train_linear_features': param == 'linear_features',
                'train_means': param == 'means',
            })
        self.assertLess(err, 1e-8)

    # with regularization
    for norm in ['L1', 'L2']:
        for param in ['priors', 'scales', 'weights', 'features', 'chol', 'pred', 'linear_features', 'means']:
            err = mcgsm._check_gradient(
                randn(mcgsm.dim_in, 1000),
                randn(mcgsm.dim_out, 1000),
                1e-7,
                parameters={
                    'train_prior': param == 'priors',
                    'train_scales': param == 'scales',
                    'train_weights': param == 'weights',
                    'train_features': param == 'features',
                    'train_cholesky_factors': param == 'chol',
                    'train_predictors': param == 'pred',
                    'train_linear_features': param == 'linear_features',
                    'train_means': param == 'means',
                    'regularize_features': {'strength': 0.4, 'norm': norm},
                    'regularize_predictors': {'strength': 0.5, 'norm': norm},
                    'regularize_weights': {'strength': 0.7, 'norm': norm},
                    'regularize_linear_features': {'strength': 0.3, 'norm': norm},
                    'regularize_means': {'strength': 0.6, 'norm': norm},
                })
            self.assertLess(err, 1e-6)
def main(argv):
    # load image and turn into grayscale
    img = rgb2gray(imread('media/newyork.png'))

    # generate data
    inputs, outputs = generate_data_from_image(
        img, input_mask, output_mask, 220000)

    # split data into training, test, and validation sets
    inputs = split(inputs, [100000, 200000], 1)
    outputs = split(outputs, [100000, 200000], 1)

    data_train = inputs[0], outputs[0]
    data_test = inputs[1], outputs[1]
    data_valid = inputs[2], outputs[2]

    # compute normalizing transformation
    pre = WhiteningPreconditioner(*data_train)

    # initialize model
    model = MCGSM(
        dim_in=data_train[0].shape[0],
        dim_out=data_train[1].shape[0],
        num_components=8,
        num_scales=4,
        num_features=32)

    # fit parameters
    model.initialize(*pre(*data_train))
    model.train(*chain(pre(*data_train), pre(*data_valid)),
        parameters={
            'verbosity': 1,
            'max_iter': 1000,
            'threshold': 1e-7,
            'val_iter': 5,
            'val_look_ahead': 10,
            'num_grad': 20,
        })

    # evaluate model
    print 'Average log-likelihood: {0:.4f} [bit/px]'.format(
        -model.evaluate(data_test[0], data_test[1], pre))

    # synthesize a new image
    img_sample = sample_image(img, model, input_mask, output_mask, pre)

    imwrite('newyork_sample.png', img_sample,
        cmap='gray',
        vmin=min(img),
        vmax=max(img))

    # save model
    with open('image_model.pck', 'wb') as handle:
        dump({
            'model': model,
            'input_mask': input_mask,
            'output_mask': output_mask}, handle, 1)

    return 0
def test_data_gradient(self):
    for dim_in in [5, 0]:
        mcgsm = MCGSM(dim_in, 3, 4, 5, 10)

        cholesky_factors = []
        for k in range(mcgsm.num_components):
            cholesky_factors.append(
                cholesky(cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))))
        mcgsm.cholesky_factors = cholesky_factors

        inputs = randn(mcgsm.dim_in, 100)
        outputs = ones_like(mcgsm.sample(inputs))

        # compute density gradient and loglikelihood
        dx, dy, ll = mcgsm._data_gradient(inputs, outputs)

        self.assertLess(
            max(abs(ll - mcgsm.loglikelihood(inputs, outputs))), 1e-8)

        h = 1e-5

        dx_ = zeros_like(dx)
        dy_ = zeros_like(dy)

        for i in range(mcgsm.dim_in):
            inputs_p = inputs.copy()
            inputs_m = inputs.copy()
            inputs_p[i] += h
            inputs_m[i] -= h
            dx_[i] = (mcgsm.loglikelihood(inputs_p, outputs)
                - mcgsm.loglikelihood(inputs_m, outputs)) / (2. * h)

        for i in range(mcgsm.dim_out):
            outputs_p = outputs.copy()
            outputs_m = outputs.copy()
            outputs_p[i] += h
            outputs_m[i] -= h
            dy_[i] = (mcgsm.loglikelihood(inputs, outputs_p)
                - mcgsm.loglikelihood(inputs, outputs_m)) / (2. * h)

        self.assertLess(max(abs(dy_ - dy)), 1e-8)

        if mcgsm.dim_in > 0:
            self.assertLess(max(abs(dx_ - dx)), 1e-8)
def test_sample_video(self):
    xmask = dstack([
        asarray([
            [1, 1, 1],
            [1, 1, 1],
            [1, 1, 1]], dtype='bool'),
        asarray([
            [1, 1, 1],
            [1, 0, 0],
            [0, 0, 0]], dtype='bool')])
    ymask = dstack([
        asarray([
            [0, 0, 0],
            [0, 0, 0],
            [0, 0, 0]], dtype='bool'),
        asarray([
            [0, 0, 0],
            [0, 1, 0],
            [0, 0, 0]], dtype='bool')])

    model = MCGSM(13, 1)

    video_init = randn(64, 64, 5)
    video_sample = sample_video(video_init, model, xmask, ymask)

    # the first frame should be untouched
    self.assertLess(max(abs(video_init[:, :, 0] - video_sample[:, :, 0])), 1e-10)
def __init__(self,
        num_channels=1,
        num_hiddens=10,
        num_components=4,
        num_scales=4,
        num_features=16,
        num_layers=1,
        nb_size=3,
        nonlinearity='TanH',
        verbosity=1,
        extended=False,
        input_mask=None,
        output_mask=None):
    self.verbosity = verbosity

    self.num_channels = num_channels
    self.num_hiddens = num_hiddens
    self.num_layers = num_layers
    self.nonlinearity = nonlinearity
    self.extended = extended

    self.input_mask, self.output_mask = generate_masks([nb_size] * num_channels)

    # truth-testing a numpy mask is ambiguous; compare against None instead
    if input_mask is not None:
        self.input_mask = input_mask
    if output_mask is not None:
        self.output_mask = output_mask
        self.num_channels = sum(self.output_mask)

    self.slstm = [None] * num_layers
    self.mcgsm = MCGSM(
        dim_in=self.num_hiddens,
        dim_out=self.num_channels,
        num_components=num_components,
        num_scales=num_scales,
        num_features=num_features)

    self.preconditioner = None

    # see PatchRIDE
    self._indicators = False
def test_sample_conditionally(self):
    mcgsm = MCGSM(3, 2, 2, 2, 4)

    # make sure there are differences between components
    mcgsm.weights = -log(rand(*mcgsm.weights.shape)) * 10.
    mcgsm.scales = square(mcgsm.scales * 3.)

    inputs = randn(mcgsm.dim_in, 100000)

    # sample directly
    outputs0 = mcgsm.sample(inputs)

    # sample indirectly
    labels = mcgsm.sample_prior(inputs)
    outputs1 = mcgsm.sample(inputs, labels)

    p = ks_2samp(outputs0.ravel(), outputs1.ravel())[1]

    self.assertGreater(p, 1e-5)
def robust_linear_regression(x, y, num_scales=3, max_iter=1000):
    """
    Performs linear regression with Gaussian scale mixture residuals.

    $$y = ax + b + \\varepsilon,$$

    where $\\varepsilon$ is assumed to be Gaussian scale mixture distributed.

    @type x: array_like
    @param x: list of one-dimensional inputs

    @type y: array_like
    @param y: list of one-dimensional outputs

    @type num_scales: int
    @param num_scales: number of Gaussian scale mixture components

    @type max_iter: int
    @param max_iter: number of optimization steps in parameter search

    @rtype: tuple
    @return: slope and y-intercept
    """

    x = asarray(x).reshape(1, -1)
    y = asarray(y).reshape(1, -1)

    # preprocess inputs
    m = mean(x)
    s = std(x)
    x = (x - m) / s

    # preprocess outputs using simple linear regression
    C = cov(x, y)
    a = C[0, 1] / C[0, 0]
    b = mean(y) - a * mean(x)
    y = y - (a * x + b)

    # robust linear regression
    model = MCGSM(
        dim_in=1,
        dim_out=1,
        num_components=1,
        num_scales=num_scales,
        num_features=0)
    model.initialize(x, y)
    model.train(x, y, parameters={
        'train_means': True,
        'max_iter': max_iter})

    # undo preprocessing to obtain slope and intercept in the original units
    a = (a + float(model.predictors[0])) / s
    b = (b + float(model.means)) - a * m

    return a, b
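# Hypothetical usage: recover slope and intercept from data contaminated by
# outliers, where ordinary least squares would be thrown off.
def example_robust_regression():
    x = randn(300)
    y = 2. * x + 1. + randn(300) / 10.
    y[::10] += randn(30) * 20.  # corrupt every tenth point

    a, b = robust_linear_regression(x, y, num_scales=3)

    print 'slope: {0:.2f}, intercept: {1:.2f}'.format(a, b)  # roughly 2 and 1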
def test_sample_image(self):
    xmask = asarray([
        [1, 1],
        [1, 0]], dtype='bool')
    ymask = asarray([
        [0, 0],
        [0, 1]], dtype='bool')

    img_init = asarray([
        [1., 2.],
        [3., 4.]])

    model = MCGSM(3, 1)

    img_sample = sample_image(img_init, model, xmask, ymask)

    # only the bottom-right pixel should have been replaced
    self.assertLess(max(abs((img_init - img_sample).ravel()[:3])), 1e-10)

    # test using preconditioner
    wt = WhiteningPreconditioner(randn(3, 1000), randn(1, 1000))
    sample_image(img_init, model, xmask, ymask, wt)

    # test what happens if an invalid preconditioner is given; arguments are
    # passed separately so that assertRaises actually calls sample_image
    self.assertRaises(TypeError, sample_image, img_init, model, xmask, ymask, 10.)
    self.assertRaises(TypeError, sample_image, img_init, model, xmask, ymask, model)
def test_patchmcgsm_train(self):
    xmask = ones([2, 2], dtype='bool')
    ymask = zeros([2, 2], dtype='bool')
    xmask[-1, -1] = False
    ymask[-1, -1] = True

    model = PatchMCGSM(2, 2, xmask, ymask, model=MCGSM(sum(xmask), 1, 1, 1))

    data = randn(4, 10000)

    model.initialize(data)
    converged = model.train(data, parameters={
        'verbosity': 0,
        'max_iter': 200,
        'threshold': 1e-4})

    self.assertTrue(converged)
class RIDE(object):
    """
    An implementation of the recurrent image density estimator (RIDE).

    B{References:}
        - Theis, L. and Bethge, M. (2015). I{Generative Image Modeling Using Spatial LSTMs.}
    """

    # maximum batch size used by Caffe internally
    MAX_BATCH_SIZE = 200

    def __init__(self,
            num_channels=1,
            num_hiddens=10,
            num_components=8,
            num_scales=4,
            num_features=16,
            num_layers=1,
            nb_size=5,
            nonlinearity="TanH",
            verbosity=1,
            extended=False,
            input_mask=None,
            output_mask=None):
        """
        @type num_channels: C{int}
        @param num_channels: dimensionality of each pixel

        @type num_hiddens: C{int}
        @param num_hiddens: number of LSTM units in each spatial LSTM layer

        @type num_components: C{int}
        @param num_components: number of mixture components used by the MCGSM

        @type num_scales: C{int}
        @param num_scales: number of scales used by the MCGSM

        @type num_features: C{int}
        @param num_features: number of quadratic features used by the MCGSM

        @type num_layers: C{int}
        @param num_layers: number of layers of spatial LSTM units

        @type nb_size: C{int}
        @param nb_size: controls the neighborhood of pixels read from an image

        @type nonlinearity: C{str}
        @param nonlinearity: nonlinearity used by spatial LSTM (e.g., TanH, ReLU)

        @type verbosity: C{int}
        @param verbosity: controls how much information is printed during training, etc.

        @type extended: C{bool}
        @param extended: use previous memory states as additional inputs to LSTM (more parameters)

        @type input_mask: C{ndarray}
        @param input_mask: Boolean mask used to define custom input neighborhood of pixels

        @type output_mask: C{ndarray}
        @param output_mask: determines the position of the output pixel relative to the neighborhood
        """

        self.verbosity = verbosity

        self.num_channels = num_channels
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.nonlinearity = nonlinearity
        self.extended = extended

        self.input_mask, self.output_mask = generate_masks([nb_size] * num_channels)

        if input_mask is not None:
            self.input_mask = input_mask
        if output_mask is not None:
            self.output_mask = output_mask
            self.num_channels = sum(self.output_mask)

        self.slstm = [None] * num_layers
        self.mcgsm = MCGSM(
            dim_in=self.num_hiddens,
            dim_out=self.num_channels,
            num_components=num_components,
            num_scales=num_scales,
            num_features=num_features)

        self.preconditioner = None

    def add_layer(self):
        """
        Add another spatial LSTM to the network and reinitialize MCGSM.
        """

        self.num_layers += 1

        # reinitialize MCGSM
        self.mcgsm = MCGSM(
            dim_in=self.num_hiddens,
            dim_out=self.num_channels,
            num_components=self.mcgsm.num_components,
            num_scales=self.mcgsm.num_scales,
            num_features=self.mcgsm.num_features)

        # add slot for another layer
        self.slstm.append(None)

    def _precondition(self, inputs, outputs=None):
        """
        Remove any correlations within and between inputs and outputs (conditional whitening).
        @type inputs: C{ndarray}
        @param inputs: pixel neighborhoods stored column-wise

        @type outputs: C{ndarray}
        @param outputs: output pixels stored column-wise
        """

        shape = inputs.shape

        if outputs is None:
            if self.preconditioner is None:
                raise RuntimeError("No preconditioning possible.")

            inputs = inputs.reshape(-1, inputs.shape[-1]).T
            inputs = self.preconditioner(inputs)
            inputs = inputs.T.reshape(*shape)

            return inputs

        else:
            inputs = inputs.reshape(-1, inputs.shape[-1]).T
            outputs = outputs.reshape(-1, outputs.shape[-1]).T

            # avoids memory issues
            MAX_SAMPLES = 500000

            if self.preconditioner is None:
                if inputs.shape[1] > MAX_SAMPLES:
                    idx = random_select(MAX_SAMPLES, inputs.shape[1])
                    self.preconditioner = WhiteningPreconditioner(inputs[:, idx], outputs[:, idx])
                else:
                    self.preconditioner = WhiteningPreconditioner(inputs, outputs)

            for b in range(0, inputs.shape[1], MAX_SAMPLES):
                inputs[:, b:b + MAX_SAMPLES], outputs[:, b:b + MAX_SAMPLES] = \
                    self.preconditioner(
                        inputs[:, b:b + MAX_SAMPLES],
                        outputs[:, b:b + MAX_SAMPLES])

            inputs = inputs.T.reshape(*shape)
            outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1)

            return inputs, outputs

    def _precondition_inverse(self, inputs, outputs=None):
        """
        Reintroduce correlations removed by conditional whitening.

        @type inputs: C{ndarray}
        @param inputs: pixel neighborhoods stored column-wise

        @type outputs: C{ndarray}
        @param outputs: output pixels stored column-wise
        """

        if self.preconditioner is None:
            raise RuntimeError("No preconditioner set.")

        shape = inputs.shape

        if outputs is None:
            inputs = inputs.reshape(-1, inputs.shape[-1]).T
            inputs = self.preconditioner.inverse(inputs)
            inputs = inputs.T.reshape(*shape)

            return inputs

        else:
            inputs = inputs.reshape(-1, inputs.shape[-1]).T
            outputs = outputs.reshape(-1, outputs.shape[-1]).T

            inputs, outputs = self.preconditioner.inverse(inputs, outputs)

            inputs = inputs.T.reshape(*shape)
            outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1)

            return inputs, outputs

    def _adjust_gradient(self, inputs, outputs):
        """
        Adjust gradients to take into account preconditioning.

        @type inputs: C{ndarray}
        @param inputs: gradient with respect to conditionally whitened inputs

        @type outputs: C{ndarray}
        @param outputs: gradient with respect to conditionally whitened outputs
        """

        if self.preconditioner is None:
            raise RuntimeError("No preconditioner set.")

        shape = inputs.shape

        inputs = inputs.reshape(-1, inputs.shape[-1]).T
        outputs = outputs.reshape(-1, outputs.shape[-1]).T

        inputs, outputs = self.preconditioner.adjust_gradient(inputs, outputs)

        inputs = inputs.T.reshape(*shape)
        outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1)

        return inputs, outputs

    def _preprocess(self, images):
        """
        Extract causal neighborhoods from images.

        @type images: C{ndarray}/C{list}
        @param images: array or list of images to process

        @rtype: C{tuple}
        @return: one array storing inputs (neighborhoods) and one array storing outputs (pixels)
        """

        def process(image):
            inputs, outputs = generate_data_from_image(image, self.input_mask, self.output_mask)
            inputs = asarray(
                inputs.T.reshape(
                    image.shape[0] - self.input_mask.shape[0] + 1,
                    image.shape[1] - self.input_mask.shape[1] + 1,
                    -1),
                dtype="float32")
            outputs = asarray(
                outputs.T.reshape(
                    image.shape[0] - self.input_mask.shape[0] + 1,
                    image.shape[1] - self.input_mask.shape[1] + 1,
                    -1),
                dtype="float32")
            return inputs, outputs

        inputs, outputs = zip(*mapp(process, images))

        return asarray(inputs), asarray(outputs)

    def loglikelihood(self, images):
        """
        Returns a log-likelihood for each reachable pixel (in nats).
        @type images: C{ndarray}/C{list}
        @param images: array or list of images for which to evaluate log-likelihood

        @rtype: C{ndarray}
        @return: an array of log-likelihoods for each image and predicted pixel
        """

        inputs, outputs = self._preprocess(images)

        if self.preconditioner is not None:
            if self.verbosity > 0:
                print "Computing Jacobian..."

            logjacobian = self.preconditioner.logjacobian(
                inputs.reshape(-1, sum(self.input_mask)).T,
                outputs.reshape(-1, self.num_channels).T)

            if self.verbosity > 0:
                print "Preconditioning..."

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        else:
            logjacobian = 0.0

        # compute hidden unit activations
        hiddens = inputs

        batch_size = min([hiddens.shape[0], self.MAX_BATCH_SIZE])

        if self.verbosity > 0:
            print "Computing hidden states..."

        for l in range(self.num_layers):
            # create SLSTM
            if self.slstm[l].num_rows != hiddens.shape[1] \
                    or self.slstm[l].num_cols != hiddens.shape[2] \
                    or self.slstm[l].batch_size != batch_size:
                self.slstm[l] = SLSTM(
                    num_rows=hiddens.shape[1],
                    num_cols=hiddens.shape[2],
                    num_channels=hiddens.shape[3],
                    num_hiddens=self.num_hiddens,
                    batch_size=batch_size,
                    nonlinearity=self.nonlinearity,
                    extended=self.extended,
                    slstm=self.slstm[l],
                    verbosity=self.verbosity)

            hiddens = self.slstm[l].forward(hiddens)

        if self.verbosity > 0:
            print "Computing likelihood..."

        # evaluate log-likelihood
        loglik = self.mcgsm.loglikelihood(
            hiddens.reshape(-1, self.num_hiddens).T,
            outputs.reshape(-1, self.num_channels).T) + logjacobian

        return loglik.reshape(hiddens.shape[0], hiddens.shape[1], hiddens.shape[2])

    def evaluate(self, images):
        """
        Computes the average negative log-likelihood in bits per pixel.

        @type images: C{ndarray}/C{list}
        @param images: an array or list of test images

        @rtype: C{float}
        @return: average negative log-likelihood in bits per pixel
        """

        return -mean(self.loglikelihood(images)) / log(2.0) / self.num_channels

    def train(self, images,
            batch_size=50,
            num_epochs=20,
            method="SGD",
            train_means=False,
            train_top_layer=False,
            momentum=0.9,
            learning_rate=1.0,
            decay1=0.9,
            decay2=0.999,
            precondition=True):
        """
        Train model via stochastic gradient descent (SGD) or sum-of-functions optimizer (SFO).

        @type images: C{ndarray}/C{list}
        @param images: an array or a list of training images (e.g., Nx32x32x3)

        @type batch_size: C{int}
        @param batch_size: batch size used by SGD

        @type num_epochs: C{int}
        @param num_epochs: number of passes through the training set

        @type method: C{str}
        @param method: either 'SGD', 'SFO', or 'ADAM'

        @type train_means: C{bool}
        @param train_means: whether or not to optimize the mean parameters of the MCGSM

        @type train_top_layer: C{bool}
        @param train_top_layer: if true, only the MCGSM and the top-layer spatial LSTM are trained

        @type momentum: C{float}
        @param momentum: momentum rate used by SGD

        @type learning_rate: C{float}
        @param learning_rate: learning rate used by SGD

        @type decay1: C{float}
        @param decay1: hyperparameter used by ADAM

        @type decay2: C{float}
        @param decay2: hyperparameter used by ADAM

        @type precondition: C{bool}
        @param precondition: whether or not to perform conditional whitening

        @rtype: C{list}
        @return: evolution of negative log-likelihood (bits per pixel) over the training
        """

        if images.shape[1] < self.input_mask.shape[0] or images.shape[2] < self.input_mask.shape[1]:
            raise ValueError("Images too small.")

        if self.verbosity > 0:
            print "Preprocessing..."

        inputs, outputs = self._preprocess(images)

        if precondition:
            if self.verbosity > 0:
                print "Preconditioning..."
            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # indicates which layers will be trained
        train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

        if self.verbosity > 0:
            print "Creating SLSTMs..."

        # create SLSTMs
        for l in range(self.num_layers):
            self.slstm[l] = SLSTM(
                num_rows=inputs.shape[1],
                num_cols=inputs.shape[2],
                num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
                num_hiddens=self.num_hiddens,
                batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
                nonlinearity=self.nonlinearity,
                extended=self.extended,
                slstm=self.slstm[l],
                verbosity=self.verbosity)

        # compute loss function and its gradient
        def f_df(params, idx):
            # set model parameters
            for l in train_layers:
                self.slstm[l].set_parameters(params["slstm"][l])
            self.mcgsm._set_parameters(params["mcgsm"], {"train_means": train_means})

            # select batch and compute hidden activations
            Y = outputs[idx:idx + batch_size]
            H = inputs[idx:idx + batch_size]

            for l in range(self.num_layers):
                H = self.slstm[l].forward(H)

            # form inputs to MCGSM
            H_flat = H.reshape(-1, self.num_hiddens).T
            Y_flat = Y.reshape(-1, self.num_channels).T

            norm_const = -H_flat.shape[1]

            # compute gradients
            df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
            df_dh = df_dh.T.reshape(*H.shape) / norm_const

            # average log-likelihood
            f = sum(loglik) / norm_const

            df_dtheta = {}
            df_dtheta["slstm"] = [0.0] * self.num_layers

            for l in range(self.num_layers)[::-1]:
                if l not in train_layers:
                    break

                if l > min(train_layers):
                    # derivatives with respect to the inputs of layer l are
                    # derivatives of the hidden states of layer l - 1
                    df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh, force_backward=True)
                    df_dh = df_dtheta["slstm"][l]["inputs"]
                    del df_dtheta["slstm"][l]["inputs"]
                else:
                    # no need to compute derivatives with respect to input units
                    df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh)

            # compute gradient of MCGSM
            df_dtheta["mcgsm"] = self.mcgsm._parameter_gradient(
                H_flat, Y_flat,
                parameters={"train_means": train_means}) * log(2.0) * self.mcgsm.dim_out

            return f, df_dtheta

        # collect current parameters
        params = {}
        params["slstm"] = [0.0] * self.num_layers
        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            params["slstm"][l] = self.slstm[l].parameters()
        params["mcgsm"] = self.mcgsm._parameters({"train_means": train_means})

        # a start index for each batch
        start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size)

        if self.verbosity > 0:
            print "Training..."
if method.upper() == "SFO": try: # optimize using sum-of-functions optimizer optimizer = SFO(f_df, params, start_indices, display=self.verbosity) params_opt = optimizer.optimize(num_passes=num_epochs) # set model parameters for l in range(self.num_layers): self.slstm[l].set_parameters(params_opt["slstm"][l]) self.mcgsm._set_parameters(params_opt["mcgsm"], {"train_means": train_means}) except KeyboardInterrupt: pass return optimizer.hist_f_flat elif method.upper() == "SGD": loss = [] diff = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])} for l in train_layers: diff["slstm"][l] = {} for key in params["slstm"][l]: diff["slstm"][l][key] = zeros_like(params["slstm"][l][key]) for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f / log(2.0) / self.num_channels) # update SLSTM parameters for l in train_layers: for key in params["slstm"][l]: diff["slstm"][l][key] = momentum * diff["slstm"][l][key] - df["slstm"][l][key] params["slstm"][l][key] = params["slstm"][l][key] + learning_rate * diff["slstm"][l][key] # update MCGSM parameters diff["mcgsm"] = momentum * diff["mcgsm"] - df["mcgsm"] params["mcgsm"] = params["mcgsm"] + learning_rate * diff["mcgsm"] if self.verbosity > 0: print "{0:>5} {1:>10.4f} {2:>10.4f}".format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :]) ) return loss elif method.upper() == "ADAM": loss = [] diff_mean = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])} diff_sqrd = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])} for l in train_layers: diff_mean["slstm"][l] = {} diff_sqrd["slstm"][l] = {} for key in params["slstm"][l]: diff_mean["slstm"][l][key] = zeros_like(params["slstm"][l][key]) diff_sqrd["slstm"][l][key] = zeros_like(params["slstm"][l][key]) # step counter t = 1 for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f / log(2.0) / self.num_channels) # include bias correction in step width step_width = learning_rate / (1.0 - power(decay1, t)) * sqrt(1.0 - power(decay2, t)) t += 1 # update SLSTM parameters for l in train_layers: for key in params["slstm"][l]: diff_mean["slstm"][l][key] = ( decay1 * diff_mean["slstm"][l][key] + (1.0 - decay1) * df["slstm"][l][key] ) diff_sqrd["slstm"][l][key] = decay2 * diff_sqrd["slstm"][l][key] + (1.0 - decay2) * square( df["slstm"][l][key] ) params["slstm"][l][key] = params["slstm"][l][key] - step_width * diff_mean["slstm"][l][ key ] / (1e-8 + sqrt(diff_sqrd["slstm"][l][key])) # update MCGSM parameters diff_mean["mcgsm"] = decay1 * diff_mean["mcgsm"] + (1.0 - decay1) * df["mcgsm"] diff_sqrd["mcgsm"] = decay2 * diff_sqrd["mcgsm"] + (1.0 - decay2) * square(df["mcgsm"]) params["mcgsm"] = params["mcgsm"] - step_width * diff_mean["mcgsm"] / ( 1e-8 + sqrt(diff_sqrd["mcgsm"]) ) if self.verbosity > 0: print "{0:>5} {1:>10.4f} {2:>10.4f}".format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :]) ) return loss else: raise ValueError("Unknown method '{0}'.".format(method)) def finetune(self, images, max_iter=1000, train_means=False, num_samples_train=500000, num_samples_valid=100000): """ Train MCGSM using L-BFGS while keeping parameters of spatial LSTMs fixed. 
        @type images: C{ndarray}/C{list}
        @param images: an array or a list of images

        @type max_iter: C{int}
        @param max_iter: maximum number of L-BFGS iterations

        @type train_means: C{bool}
        @param train_means: whether or not to optimize the mean parameters of the MCGSM

        @type num_samples_train: C{int}
        @param num_samples_train: number of training examples extracted from images

        @type num_samples_valid: C{int}
        @param num_samples_valid: number of validation examples used for early stopping

        @rtype: C{bool}
        @return: true if training converged, false otherwise
        """

        if images.shape[0] > min([200000, num_samples_train]):
            images = images[random_select(min([200000, num_samples_train]), images.shape[0])]

        if self.verbosity > 0:
            print "Preprocessing..."

        inputs, outputs = self._preprocess(images)

        if self.preconditioner:
            if self.verbosity > 0:
                print "Preconditioning..."

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # compute hidden unit activations
        hiddens = inputs

        if self.verbosity > 0:
            print "Computing hidden states..."

        for l in range(self.num_layers):
            self.slstm[l] = SLSTM(
                num_rows=hiddens.shape[1],
                num_cols=hiddens.shape[2],
                num_channels=hiddens.shape[3],
                num_hiddens=self.num_hiddens,
                batch_size=min([hiddens.shape[0], self.MAX_BATCH_SIZE]),
                nonlinearity=self.nonlinearity,
                extended=self.extended,
                slstm=self.slstm[l],
                verbosity=self.verbosity)

            hiddens = self.slstm[l].forward(hiddens)

        if self.verbosity > 0:
            print "Preparing inputs and outputs..."

        # form inputs to MCGSM
        hiddens = hiddens.reshape(-1, self.num_hiddens).T
        outputs = outputs.reshape(-1, self.num_channels).T

        if hiddens.shape[1] > num_samples_train:
            num_samples_valid = min([num_samples_valid, hiddens.shape[1] - num_samples_train])

            # select subset of data points for finetuning
            idx = random_select(num_samples_train + num_samples_valid, hiddens.shape[1])

            if num_samples_valid > 0:
                # split data into training and validation set
                hiddens_train = asarray(hiddens[:, idx[:num_samples_train]], order="F")
                outputs_train = asarray(outputs[:, idx[:num_samples_train]], order="F")
                hiddens_valid = asarray(hiddens[:, idx[num_samples_train:]], order="F")
                outputs_valid = asarray(outputs[:, idx[num_samples_train:]], order="F")

                # finetune with early stopping based on validation performance
                return self.mcgsm.train(
                    hiddens_train, outputs_train,
                    hiddens_valid, outputs_valid,
                    parameters={
                        "verbosity": self.verbosity,
                        "train_means": train_means,
                        "max_iter": max_iter})

            else:
                hiddens = asarray(hiddens[:, idx], order="F")
                outputs = asarray(outputs[:, idx], order="F")

        if self.verbosity > 0:
            print "Finetuning..."

        return self.mcgsm.train(
            hiddens, outputs,
            parameters={
                "verbosity": self.verbosity,
                "train_means": train_means,
                "max_iter": max_iter})

    def hidden_states(self, images, return_all=False, layer=None):
        """
        Compute hidden states of LSTM units for given images.

        By default, the last layer's hidden units are computed.

        @type images: C{ndarray}/C{list}
        @param images: array or list of images to process

        @type return_all: C{bool}
        @param return_all: if true, also return preconditioned inputs and outputs

        @type layer: C{int}
        @param layer: a positive integer controlling which layer's hidden units to compute

        @rtype: C{ndarray}/C{tuple}
        @return: hidden states or a tuple of inputs, hidden states, and outputs
        """

        if self.verbosity > 0:
            print "Preprocessing..."

        inputs, outputs = self._preprocess(images)

        if self.preconditioner is not None:
            if self.verbosity > 0:
                print "Preconditioning..."
            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # compute hidden unit activations
        hiddens = inputs

        batch_size = min([hiddens.shape[0], self.MAX_BATCH_SIZE])

        if layer is None or layer < 1:
            layer = self.num_layers

        for l in range(layer):
            if self.slstm[l].num_rows != hiddens.shape[1] \
                    or self.slstm[l].num_cols != hiddens.shape[2] \
                    or self.slstm[l].batch_size != batch_size:
                self.slstm[l] = SLSTM(
                    num_rows=hiddens.shape[1],
                    num_cols=hiddens.shape[2],
                    num_channels=hiddens.shape[3],
                    num_hiddens=self.num_hiddens,
                    batch_size=batch_size,
                    nonlinearity=self.nonlinearity,
                    extended=self.extended,
                    slstm=self.slstm[l],
                    verbosity=self.verbosity)

            hiddens = self.slstm[l].forward(hiddens)

        if return_all:
            return inputs, hiddens, outputs
        return hiddens

    def gradient(self, images):
        """
        Returns the average log-likelihood [nat] and its gradient with respect to pixel values.

        @type images: C{ndarray}
        @param images: images at which to evaluate the density's gradient

        @rtype: C{tuple}
        @return: average log-likelihood and gradient with respect to images
        """

        if self.verbosity > 0:
            print "Preprocessing..."

        inputs, outputs = self._preprocess(images)

        if self.preconditioner:
            if self.verbosity > 0:
                print "Preconditioning..."

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        if self.verbosity > 0:
            print "Creating SLSTMs..."

        # create SLSTMs
        batch_size = min([images.shape[0], self.MAX_BATCH_SIZE])
        for l in range(self.num_layers):
            if self.slstm[l] is None \
                    or self.slstm[l].batch_size != batch_size \
                    or self.slstm[l].num_rows != inputs.shape[1] \
                    or self.slstm[l].num_cols != inputs.shape[2]:
                self.slstm[l] = SLSTM(
                    num_rows=inputs.shape[1],
                    num_cols=inputs.shape[2],
                    num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
                    num_hiddens=self.num_hiddens,
                    batch_size=batch_size,
                    nonlinearity=self.nonlinearity,
                    extended=self.extended,
                    slstm=self.slstm[l],
                    verbosity=self.verbosity)

        # compute hidden unit activations
        hiddens = inputs
        for l in range(self.num_layers):
            hiddens = self.slstm[l].forward(hiddens)

        # form inputs to MCGSM
        H_flat = hiddens.reshape(-1, self.num_hiddens).T
        Y_flat = outputs.reshape(-1, self.num_channels).T

        # compute gradients
        df_dh, df_dy, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
        df_dh = df_dh.T.reshape(*hiddens.shape) / H_flat.shape[1]
        df_dy = df_dy.T.reshape(*outputs.shape) / H_flat.shape[1]

        # average log-likelihood
        f = sum(loglik) / H_flat.shape[1]

        for l in range(self.num_layers)[::-1]:
            df_dh = self.slstm[l].backward(df_dh, force_backward=True)["inputs"]

        if self.preconditioner:
            df_dh, df_dy = self._adjust_gradient(df_dh, df_dy)

        # locate output pixel in output mask
        for i_off, j_off in zip(
                range(self.output_mask.shape[0]),
                range(self.output_mask.shape[1])):
            if any(self.output_mask[i_off, j_off]):
                break

        gradient = zeros_like(images)

        # make sure mask and gradient have compatible dimensionality
        if gradient.ndim == 4 and self.input_mask.ndim == 2:
            gradient = gradient[:, :, :, 0]

        for i in range(images.shape[1] - self.input_mask.shape[0] + 1):
            for j in range(images.shape[2] - self.input_mask.shape[1] + 1):
                patch = gradient[:, i:i + self.input_mask.shape[0], j:j + self.output_mask.shape[1]]
                patch[:, self.input_mask] += df_dh[:, i, j]
                patch[:, self.output_mask] += df_dy[:, i, j]

        return f, gradient.reshape(*images.shape)

    def sample(self, images,
            min_values=None,
            max_values=None,
            mask=None,
            return_loglik=False):
        """
        Sample one or several images.
        @type images: C{ndarray}/C{list}
        @param images: an array or a list of images to initialize pixels at boundaries

        @type min_values: C{ndarray}/C{list}
        @param min_values: list of lower bounds for each channel (for increased stability)

        @type max_values: C{ndarray}/C{list}
        @param max_values: list of upper bounds for each channel (for increased stability)

        @type mask: C{ndarray}
        @param mask: replace only certain pixels indicated by this Boolean mask

        @rtype: C{ndarray}
        @return: sampled images of the size of the images given as input
        """

        # reshape images into four-dimensional arrays
        shape = images.shape
        if images.ndim == 2:
            images = images[None, :, :, None]
        elif images.ndim == 3:
            if self.num_channels > 1:
                images = images[None]
            else:
                images = images[:, :, :, None]

        # create spatial LSTMs for sampling
        for l in range(self.num_layers):
            if self.slstm[l].num_rows != 1 \
                    or self.slstm[l].num_cols != 1 \
                    or self.slstm[l].batch_size != images.shape[0]:
                self.slstm[l] = SLSTM(
                    num_rows=1,
                    num_cols=1,
                    num_channels=sum(self.input_mask) if l < 1 else self.num_hiddens,
                    num_hiddens=self.num_hiddens,
                    batch_size=images.shape[0],
                    nonlinearity=self.nonlinearity,
                    slstm=self.slstm[l],
                    extended=self.extended)

        # container for hidden and memory unit activations
        hiddens = []
        memory = []
        for l in range(self.num_layers):
            hiddens.append(defaultdict(lambda: 0.0))
            memory.append(defaultdict(lambda: 0.0))

        # locate output pixel
        for i_off, j_off in zip(
                range(self.output_mask.shape[0]),
                range(self.output_mask.shape[1])):
            if any(self.output_mask[i_off, j_off]):
                break

        if min_values is not None:
            min_values = asarray(min_values).reshape(1, 1, 1, -1)
            if self.output_mask.ndim > 2:
                min_values = min_values[:, :, :, self.output_mask[i_off, j_off]]
        if max_values is not None:
            max_values = asarray(max_values).reshape(1, 1, 1, -1)
            if self.output_mask.ndim > 2:
                max_values = max_values[:, :, :, self.output_mask[i_off, j_off]]

        # unnormalized log-density of generated sample
        logq = 0.0

        for i in range(images.shape[1] - self.input_mask.shape[0] + 1):
            for j in range(images.shape[2] - self.input_mask.shape[1] + 1):
                # extract patches from images
                patches = images[:,
                    i:i + self.input_mask.shape[0],
                    j:j + self.input_mask.shape[1]]

                # extract causal neighborhoods from patches
                inputs = []
                for k in range(images.shape[0]):
                    inputs.append(
                        generate_data_from_image(
                            patches[k, :, :], self.input_mask, self.output_mask)[0])
                inputs = asarray(inputs)
                inputs = inputs.reshape(inputs.shape[0], 1, 1, -1)

                if self.preconditioner:
                    inputs = self._precondition(inputs)

                # set hidden unit activations
                for l in range(self.num_layers):
                    self.slstm[l].net.blobs["h_init_i_jm1"].data[:] = hiddens[l][i, j - 1]
                    self.slstm[l].net.blobs["h_init_im1_j"].data[:] = hiddens[l][i - 1, j]
                    self.slstm[l].net.blobs["c_init_i_jm1"].data[:] = memory[l][i, j - 1]
                    self.slstm[l].net.blobs["c_init_im1_j"].data[:] = memory[l][i - 1, j]

                # compute hidden unit activations
                activations = inputs
                for l in range(self.num_layers):
                    activations = self.slstm[l].forward(activations)

                # store hidden unit activations
                for l in range(self.num_layers):
                    hiddens[l][i, j] = self.slstm[l].net.blobs["outputs"].data.copy()
                    memory[l][i, j] = self.slstm[l].net.blobs["c_0_0"].data.copy()

                if mask is not None and not mask[i + i_off, j + j_off]:
                    # skip sampling of this pixel
                    continue

                for _ in range(10):
                    # sample MCGSM
                    outputs = self.mcgsm.sample(
                        hiddens[-1][i, j].reshape(-1, self.num_hiddens).T)

                    if not any(isnan(outputs)):
                        break
                    print "Warning: NaNs detected."
                if return_loglik:
                    logq += self.mcgsm.loglikelihood(
                        hiddens[-1][i, j].reshape(-1, self.num_hiddens).T,
                        outputs)

                outputs = outputs.T.reshape(outputs.shape[1], 1, 1, outputs.shape[0])

                if self.preconditioner:
                    inputs, outputs = self._precondition_inverse(inputs, outputs)

                if max_values is not None:
                    outputs[outputs > max_values] = max_values[outputs > max_values]
                if min_values is not None:
                    outputs[outputs < min_values] = min_values[outputs < min_values]

                # insert sampled pixels into images
                if self.output_mask.ndim > 2:
                    images[:, i + i_off, j + j_off][:, self.output_mask[i_off, j_off]] = outputs
                else:
                    images[:, i + i_off, j + j_off] = outputs

        images = images.reshape(*shape)

        if return_loglik:
            return images, logq
        return images

    def _logq(self, images, mask):
        """
        Computes an unnormalized conditional log-likelihood used by Metropolis-Hastings (e.g., for inpainting).
        """

        inputs, hiddens, outputs = self.hidden_states(images, return_all=True)

        # locate output pixel
        for i_off, j_off in zip(
                range(self.output_mask.shape[0]),
                range(self.output_mask.shape[1])):
            if any(self.output_mask[i_off, j_off]):
                break

        # unnormalized log-density of generated sample
        logq = 0.0

        for i in range(images.shape[1] - self.input_mask.shape[0] + 1):
            for j in range(images.shape[2] - self.input_mask.shape[1] + 1):
                if not mask[i + i_off, j + j_off]:
                    # skip evaluation of this pixel
                    continue

                logq += self.mcgsm.loglikelihood(
                    hiddens[:, i, j, :].reshape(-1, self.num_hiddens).T,
                    outputs[:, i, j, :])

        return logq

    def __setstate__(self, state):
        """
        Method used by pickle module, for backwards-compatibility reasons.
        """

        self.__dict__ = state

        if not hasattr(self, "nonlinearity"):
            self.nonlinearity = "TanH"
        if not hasattr(self, "extended"):
            self.extended = False
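# Hypothetical end-to-end sketch of driving RIDE; shapes, epoch counts, and
# random data are placeholders for a real training run.
def example_ride():
    images = rand(20, 32, 32, 1)  # N x rows x cols x channels

    model = RIDE(num_channels=1, num_hiddens=10, num_layers=1, nb_size=5)

    # SGD training of LSTM and MCGSM, then L-BFGS finetuning of the MCGSM
    model.train(images, batch_size=10, num_epochs=1, method='SGD')
    model.finetune(images, max_iter=100)

    print 'Cross-entropy: {0:.4f} [bit/px]'.format(model.evaluate(images))

    # generate new images, initializing boundary pixels from the given ones
    return model.sample(images.copy())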
class RIDE(object): """ An implementation of the recurrent image density estimator (RIDE). B{References:} - Theis, L. and Bethge, M. (2015). I{Generative Image Modeling Using Spatial LSTMs.} """ # maximum batch size used by Caffe internally MAX_BATCH_SIZE = 200 def __init__(self, num_channels=1, num_hiddens=10, num_components=8, num_scales=4, num_features=16, num_layers=1, nb_size=5, nonlinearity='TanH', verbosity=1, extended=False, input_mask=None, output_mask=None): """ @type num_channels: C{int} @param num_channels: dimensionality of each pixel @type num_hiddens: C{int} @param num_hiddens: number of LSTM units in each spatial LSTM layer @type num_components: C{int} @param num_components: number of mixture components used by the MCGSM @type num_scales: C{int} @param num_scales: number of scales used by the MCGSM @type num_features: C{int} @param num_features: number of quadratic features used by the MCGSM @type num_layers: C{int} @param num_layers: number of layers of spatial LSTM units @type nb_size: C{int} @param nb_size: controls the neighborhood of pixels read from an image @type nonlinearity: C{str} @param nonlinearity: nonlinearity used by spatial LSTM (e.g., TanH, ReLU) @type verbosity: C{int} @param verbosity: controls how much information is printed during training, etc. @type extended: C{bool} @param extended: use previous memory states as additional inputs to LSTM (more parameters) @type input_mask C{ndarray} @param input_mask: Boolean mask used to define custom input neighborhood of pixels @type output_mask C{ndarray} @param output_mask: determines the position of the output pixel relative to the neighborhood """ self.verbosity = verbosity self.num_channels = num_channels self.num_hiddens = num_hiddens self.num_layers = num_layers self.nonlinearity = nonlinearity self.extended = extended self.input_mask, self.output_mask = generate_masks([nb_size] * num_channels) if input_mask is not None: self.input_mask = input_mask if output_mask is not None: self.output_mask = output_mask self.num_channels = sum(self.output_mask) self.slstm = [None] * num_layers self.mcgsm = MCGSM(dim_in=self.num_hiddens, dim_out=self.num_channels, num_components=num_components, num_scales=num_scales, num_features=num_features) self.preconditioner = None def add_layer(self): """ Add another spatial LSTM to the network and reinitialize MCGSM. """ self.num_layers += 1 # reinitialize MCGSM self.mcgsm = MCGSM(dim_in=self.num_hiddens, dim_out=self.num_channels, num_components=self.mcgsm.num_components, num_scales=self.mcgsm.num_scales, num_features=self.mcgsm.num_features) # add slot for another layer self.slstm.append(None) def _precondition(self, inputs, outputs=None): """ Remove any correlations within and between inputs and outputs (conditional whitening). 
@type inputs: C{ndarray} @param inputs: pixel neighborhoods stored column-wise @type outputs: C{ndarray} @param outputs: output pixels stored column-wise """ shape = inputs.shape if outputs is None: if self.preconditioner is None: raise RuntimeError('No preconditioning possible.') inputs = inputs.reshape(-1, inputs.shape[-1]).T inputs = self.preconditioner(inputs) inputs = inputs.T.reshape(*shape) return inputs else: inputs = inputs.reshape(-1, inputs.shape[-1]).T outputs = outputs.reshape(-1, outputs.shape[-1]).T # avoids memory issues MAX_SAMPLES = 500000 if self.preconditioner is None: if inputs.shape[1] > MAX_SAMPLES: idx = random_select(MAX_SAMPLES, inputs.shape[1]) self.preconditioner = WhiteningPreconditioner( inputs[:, idx], outputs[:, idx]) else: self.preconditioner = WhiteningPreconditioner( inputs, outputs) for b in range(0, inputs.shape[1], MAX_SAMPLES): inputs[:, b:b + MAX_SAMPLES], outputs[:, b:b + MAX_SAMPLES] = \ self.preconditioner(inputs[:, b:b + MAX_SAMPLES], outputs[:, b:b + MAX_SAMPLES]) inputs = inputs.T.reshape(*shape) outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1) return inputs, outputs def _precondition_inverse(self, inputs, outputs=None): """ Reintroduce correlations removed by conditional whitening. @type inputs: C{ndarray} @param inputs: pixel neighborhoods stored column-wise @type outputs: C{ndarray} @param outputs: output pixels stored column-wise """ if self.preconditioner is None: raise RuntimeError('No preconditioner set.') shape = inputs.shape if outputs is None: inputs = inputs.reshape(-1, inputs.shape[-1]).T inputs = self.preconditioner.inverse(inputs) inputs = inputs.T.reshape(*shape) return inputs else: inputs = inputs.reshape(-1, inputs.shape[-1]).T outputs = outputs.reshape(-1, outputs.shape[-1]).T inputs, outputs = self.preconditioner.inverse(inputs, outputs) inputs = inputs.T.reshape(*shape) outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1) return inputs, outputs def _adjust_gradient(self, inputs, outputs): """ Adjust gradients to take into account preconditioning. @type inputs: C{ndarray} @param inputs: gradient with respect to conditionally whitened inputs @type outputs: C{ndarray} @param outputs: gradient with respect to conditionally whitened outputs """ if self.preconditioner is None: raise RuntimeError('No preconditioner set.') shape = inputs.shape inputs = inputs.reshape(-1, inputs.shape[-1]).T outputs = outputs.reshape(-1, outputs.shape[-1]).T inputs, outputs = self.preconditioner.adjust_gradient(inputs, outputs) inputs = inputs.T.reshape(*shape) outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1) return inputs, outputs def _preprocess(self, images): """ Extract causal neighborhoods from images. @type images: C{ndarray}/C{list} @param images: array or list of images to process @rtype: C{tuple} @return: one array storing inputs (neighborhoods) and one array storing outputs (pixels) """ def process(image): inputs, outputs = generate_data_from_image(image, self.input_mask, self.output_mask) inputs = asarray(inputs.T.reshape( image.shape[0] - self.input_mask.shape[0] + 1, image.shape[1] - self.input_mask.shape[1] + 1, -1), dtype='float32') outputs = asarray(outputs.T.reshape( image.shape[0] - self.input_mask.shape[0] + 1, image.shape[1] - self.input_mask.shape[1] + 1, -1), dtype='float32') return inputs, outputs inputs, outputs = zip(*mapp(process, images)) return asarray(inputs), asarray(outputs) def loglikelihood(self, images): """ Returns a log-likelihood for each reachable pixel (in nats). 
@type images: C{ndarray}/C{list} @param images: array or list of images for which to evaluate log-likelihood @rtype: C{ndarray} @return: an array of log-likelihoods for each image and predicted pixel """ inputs, outputs = self._preprocess(images) if self.preconditioner is not None: if self.verbosity > 0: print 'Computing Jacobian...' logjacobian = self.preconditioner.logjacobian( inputs.reshape(-1, sum(self.input_mask)).T, outputs.reshape(-1, self.num_channels).T) if self.verbosity > 0: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) else: logjacobian = 0. # compute hidden unit activations hiddens = inputs batch_size = min([hiddens.shape[0], self.MAX_BATCH_SIZE]) if self.verbosity > 0: print 'Computing hidden states...' for l in range(self.num_layers): # create SLSTM if self.slstm[l].num_rows != hiddens.shape[1] \ or self.slstm[l].num_cols != hiddens.shape[2] \ or self.slstm[l].batch_size != batch_size: self.slstm[l] = SLSTM(num_rows=hiddens.shape[1], num_cols=hiddens.shape[2], num_channels=hiddens.shape[3], num_hiddens=self.num_hiddens, batch_size=batch_size, nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) hiddens = self.slstm[l].forward(hiddens) if self.verbosity > 0: print 'Computing likelihood...' # evaluate log-likelihood loglik = self.mcgsm.loglikelihood( hiddens.reshape(-1, self.num_hiddens).T, outputs.reshape(-1, self.num_channels).T) + logjacobian return loglik.reshape(hiddens.shape[0], hiddens.shape[1], hiddens.shape[2]) def evaluate(self, images): """ Computes the average negative log-likelihood in bits per pixel. @type images: C{ndarray}/C{list} @param images: an array or list of test images @rtype: C{float} @return: average negative log-likelihood in bits per pixel """ return -mean(self.loglikelihood(images)) / log(2.) / self.num_channels def train(self, images, batch_size=50, num_epochs=20, method='SGD', train_means=False, train_top_layer=False, momentum=0.9, learning_rate=1., decay1=0.9, decay2=0.999, precondition=True): """ Train model via stochastic gradient descent (SGD) or sum-of-functions optimizer (SFO). @type images: C{ndarray}/C{list} @param images: an array or a list of training images (e.g., Nx32x32x3) @type batch_size: C{int} @param batch_size: batch size used by SGD @type num_epochs: C{int} @param num_epochs: number of passes through the training set @type method: C{str} @param method: either 'SGD', 'SFO', or 'ADAM' @type train_means: C{bool} @param train_means: whether or not to optimize the mean parameters of the MCGSM @type train_top_layer: C{bool} @param train_top_layer: if true, only the MCGSM and the spatial LSTM at the top layer are trained @type momentum: C{float} @param momentum: momentum rate used by SGD @type learning_rate: C{float} @param learning_rate: learning rate used by SGD @type decay1: C{float} @param decay1: hyperparameter used by ADAM @type decay2: C{float} @param decay2: hyperparameter used by ADAM @type precondition: C{bool} @param precondition: whether or not to perform conditional whitening @rtype: C{list} @return: evolution of negative log-likelihood (bits per pixel) over the course of training """ if images.shape[1] < self.input_mask.shape[0] or images.shape[2] < self.input_mask.shape[1]: raise ValueError('Images too small.') if self.verbosity > 0: print 'Preprocessing...' inputs, outputs = self._preprocess(images) if precondition: if self.verbosity > 0: print 'Preconditioning...' 
# remove correlations inputs, outputs = self._precondition(inputs, outputs) # indicates which layers will be trained train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers) if self.verbosity > 0: print 'Creating SLSTMs...' # create SLSTMs for l in range(self.num_layers): self.slstm[l] = SLSTM( num_rows=inputs.shape[1], num_cols=inputs.shape[2], num_channels=inputs.shape[3] if l < 1 else self.num_hiddens, num_hiddens=self.num_hiddens, batch_size=min([batch_size, self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) # compute loss function and its gradient def f_df(params, idx): # set model parameters for l in train_layers: self.slstm[l].set_parameters(params['slstm'][l]) self.mcgsm._set_parameters(params['mcgsm'], {'train_means': train_means}) # select batch and compute hidden activations Y = outputs[idx:idx + batch_size] H = inputs[idx:idx + batch_size] for l in range(self.num_layers): H = self.slstm[l].forward(H) # form inputs to MCGSM H_flat = H.reshape(-1, self.num_hiddens).T Y_flat = Y.reshape(-1, self.num_channels).T norm_const = -H_flat.shape[1] # compute gradients df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat) df_dh = df_dh.T.reshape(*H.shape) / norm_const # average negative log-likelihood f = sum(loglik) / norm_const df_dtheta = {} df_dtheta['slstm'] = [0.] * self.num_layers for l in range(self.num_layers)[::-1]: if l not in train_layers: break if l > min(train_layers): # derivatives with respect to inputs of layer l are derivatives # of hidden states of layer l - 1 df_dtheta['slstm'][l] = self.slstm[l].backward( df_dh, force_backward=True) df_dh = df_dtheta['slstm'][l]['inputs'] del df_dtheta['slstm'][l]['inputs'] else: # no need to compute derivatives with respect to input units df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh) # compute gradient of MCGSM df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient( H_flat, Y_flat, parameters={'train_means': train_means }) * log(2.) * self.mcgsm.dim_out return f, df_dtheta # collect current parameters params = {} params['slstm'] = [0.] * self.num_layers for l in range(self.num_layers)[::-1]: if l not in train_layers: break params['slstm'][l] = self.slstm[l].parameters() params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means}) # a start index for each batch start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size) if self.verbosity > 0: print 'Training...' if method.upper() == 'SFO': try: # optimize using sum-of-functions optimizer optimizer = SFO(f_df, params, start_indices, display=self.verbosity) params_opt = optimizer.optimize(num_passes=num_epochs) # set model parameters for l in range(self.num_layers): self.slstm[l].set_parameters(params_opt['slstm'][l]) self.mcgsm._set_parameters(params_opt['mcgsm'], {'train_means': train_means}) except KeyboardInterrupt: pass return optimizer.hist_f_flat elif method.upper() == 'SGD': loss = [] diff = { 'slstm': [0.] * self.num_layers, 'mcgsm': zeros_like(params['mcgsm']) } for l in train_layers: diff['slstm'][l] = {} for key in params['slstm'][l]: diff['slstm'][l][key] = zeros_like(params['slstm'][l][key]) for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f / log(2.) 
/ self.num_channels) # update SLSTM parameters for l in train_layers: for key in params['slstm'][l]: diff['slstm'][l][key] = momentum * diff['slstm'][ l][key] - df['slstm'][l][key] params['slstm'][l][key] = params['slstm'][l][ key] + learning_rate * diff['slstm'][l][key] # update MCGSM parameters diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm'] params['mcgsm'] = params[ 'mcgsm'] + learning_rate * diff['mcgsm'] if self.verbosity > 0: print '{0:>5} {1:>10.4f} {2:>10.4f}'.format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])) return loss elif method.upper() == 'ADAM': loss = [] diff_mean = { 'slstm': [0.] * self.num_layers, 'mcgsm': zeros_like(params['mcgsm']) } diff_sqrd = { 'slstm': [0.] * self.num_layers, 'mcgsm': zeros_like(params['mcgsm']) } for l in train_layers: diff_mean['slstm'][l] = {} diff_sqrd['slstm'][l] = {} for key in params['slstm'][l]: diff_mean['slstm'][l][key] = zeros_like( params['slstm'][l][key]) diff_sqrd['slstm'][l][key] = zeros_like( params['slstm'][l][key]) # step counter t = 1 for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f / log(2.) / self.num_channels) # include bias correction in step width step_width = learning_rate / ( 1. - power(decay1, t)) * sqrt(1. - power(decay2, t)) t += 1 # update SLSTM parameters for l in train_layers: for key in params['slstm'][l]: diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \ + (1. - decay1) * df['slstm'][l][key] diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \ + (1. - decay2) * square(df['slstm'][l][key]) params['slstm'][l][key] = params['slstm'][l][key] - \ step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key])) # update MCGSM parameters diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + ( 1. - decay1) * df['mcgsm'] diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + ( 1. - decay2) * square(df['mcgsm']) params['mcgsm'] = params['mcgsm'] - \ step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm'])) if self.verbosity > 0: print '{0:>5} {1:>10.4f} {2:>10.4f}'.format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])) return loss else: raise ValueError('Unknown method \'{0}\'.'.format(method)) def finetune(self, images, max_iter=1000, train_means=False, num_samples_train=500000, num_samples_valid=100000): """ Train MCGSM using L-BFGS while keeping parameters of spatial LSTMs fixed. @type images: C{ndarray}/C{list} @param images: an array or a list of images @type max_iter: C{int} @param max_iter: maximum number of L-BFGS iterations @type train_means: C{bool} @param train_means: whether or not to optimize the mean parameters of the MCGSM @type num_samples_train: C{int} @param num_samples_train: number of training examples extracted from images @type num_samples_valid: C{int} @param num_samples_valid: number of validation examples used for early stopping @rtype: C{bool} @return: true if training converged, false otherwise """ if images.shape[0] > min([200000, num_samples_train]): images = images[random_select(min([200000, num_samples_train]), images.shape[0])] if self.verbosity > 0: print 'Preprocessing...' inputs, outputs = self._preprocess(images) if self.preconditioner: if self.verbosity > 0: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) # compute hidden unit activations hiddens = inputs if self.verbosity > 0: print 'Computing hidden states...' 
for l in range(self.num_layers): self.slstm[l] = SLSTM(num_rows=hiddens.shape[1], num_cols=hiddens.shape[2], num_channels=hiddens.shape[3], num_hiddens=self.num_hiddens, batch_size=min( [hiddens.shape[0], self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) hiddens = self.slstm[l].forward(hiddens) if self.verbosity > 0: print 'Preparing inputs and outputs...' # form inputs to MCGSM hiddens = hiddens.reshape(-1, self.num_hiddens).T outputs = outputs.reshape(-1, self.num_channels).T if hiddens.shape[1] > num_samples_train: num_samples_valid = min( [num_samples_valid, hiddens.shape[1] - num_samples_train]) # select subset of data points for finetuning idx = random_select(num_samples_train + num_samples_valid, hiddens.shape[1]) if num_samples_valid > 0: # split data into training and validation set hiddens_train = asarray(hiddens[:, idx[:num_samples_train]], order='F') outputs_train = asarray(outputs[:, idx[:num_samples_train]], order='F') hiddens_valid = asarray(hiddens[:, idx[num_samples_train:]], order='F') outputs_valid = asarray(outputs[:, idx[num_samples_train:]], order='F') # finetune with early stopping based on validation performance return self.mcgsm.train(hiddens_train, outputs_train, hiddens_valid, outputs_valid, parameters={ 'verbosity': self.verbosity, 'train_means': train_means, 'max_iter': max_iter }) else: hiddens = asarray(hiddens[:, idx], order='F') outputs = asarray(outputs[:, idx], order='F') if self.verbosity > 0: print 'Finetuning...' return self.mcgsm.train(hiddens, outputs, parameters={ 'verbosity': self.verbosity, 'train_means': train_means, 'max_iter': max_iter }) def hidden_states(self, images, return_all=False, layer=None): """ Compute hidden states of LSTM units for given images. By default, the last layer's hidden units are computed. @type images: C{ndarray}/C{list} @param images: array or list of images to process @type return_all: C{bool} @param return_all: if true, also return preconditioned inputs and outputs @type layer: C{int} @param layer: a positive integer controlling which layer's hidden units to compute @rtype: C{ndarray}/C{tuple} @return: hidden states or a tuple of inputs, hidden states, and outputs """ if self.verbosity > 0: print 'Preprocessing...' inputs, outputs = self._preprocess(images) if self.preconditioner is not None: if self.verbosity > 0: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) # compute hidden unit activations hiddens = inputs batch_size = min([hiddens.shape[0], self.MAX_BATCH_SIZE]) if layer is None or layer < 1: layer = self.num_layers for l in range(layer): if self.slstm[l].num_rows != hiddens.shape[1] \ or self.slstm[l].num_cols != hiddens.shape[2] \ or self.slstm[l].batch_size != batch_size: self.slstm[l] = SLSTM(num_rows=hiddens.shape[1], num_cols=hiddens.shape[2], num_channels=hiddens.shape[3], num_hiddens=self.num_hiddens, batch_size=batch_size, nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) hiddens = self.slstm[l].forward(hiddens) if return_all: return inputs, hiddens, outputs return hiddens def gradient(self, images): """ Returns the average log-likelihood [nat] and its gradient with respect to pixel values. @type images: C{ndarray} @param images: images at which to evaluate the density's gradient @rtype: C{tuple} @return: average log-likelihood and gradient with respect to images """ if self.verbosity > 0: print 'Preprocessing...' 
inputs, outputs = self._preprocess(images) if self.preconditioner: if self.verbosity > 0: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) if self.verbosity > 0: print 'Creating SLSTMs...' # create SLSTMs batch_size = min([images.shape[0], self.MAX_BATCH_SIZE]) for l in range(self.num_layers): if self.slstm[l] is None or \ self.slstm[l].batch_size != batch_size or \ self.slstm[l].num_rows != inputs.shape[1] or \ self.slstm[l].num_cols != inputs.shape[2]: self.slstm[l] = SLSTM(num_rows=inputs.shape[1], num_cols=inputs.shape[2], num_channels=inputs.shape[3] if l < 1 else self.num_hiddens, num_hiddens=self.num_hiddens, batch_size=batch_size, nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) # compute hidden unit activations hiddens = inputs for l in range(self.num_layers): hiddens = self.slstm[l].forward(hiddens) # form inputs to MCGSM H_flat = hiddens.reshape(-1, self.num_hiddens).T Y_flat = outputs.reshape(-1, self.num_channels).T # compute gradients df_dh, df_dy, loglik = self.mcgsm._data_gradient(H_flat, Y_flat) df_dh = df_dh.T.reshape(*hiddens.shape) / H_flat.shape[1] df_dy = df_dy.T.reshape(*outputs.shape) / H_flat.shape[1] # average log-likelihood f = sum(loglik) / H_flat.shape[1] for l in range(self.num_layers)[::-1]: df_dh = self.slstm[l].backward(df_dh, force_backward=True)['inputs'] if self.preconditioner: df_dh, df_dy = self._adjust_gradient(df_dh, df_dy) # locate output pixel in output mask for i_off, j_off in zip(range(self.output_mask.shape[0]), range(self.output_mask.shape[1])): if any(self.output_mask[i_off, j_off]): break gradient = zeros_like(images) # make sure mask and gradient have compatible dimensionality if gradient.ndim == 4 and self.input_mask.ndim == 2: gradient = gradient[:, :, :, 0] for i in range(images.shape[1] - self.input_mask.shape[0] + 1): for j in range(images.shape[2] - self.input_mask.shape[1] + 1): patch = gradient[:, i:i + self.input_mask.shape[0], j:j + self.input_mask.shape[1]] patch[:, self.input_mask] += df_dh[:, i, j] patch[:, self.output_mask] += df_dy[:, i, j] return f, gradient.reshape(*images.shape) def sample(self, images, min_values=None, max_values=None, mask=None, return_loglik=False): """ Sample one or several images. 
@type images: C{ndarray}/C{list} @param images: an array or a list of images to initialize pixels at boundaries @type min_values: C{ndarray}/C{list} @param min_values: list of lower bounds for each channel (for increased stability) @type max_values: C{ndarray}/C{list} @param max_values: list of upper bounds for each channel (for increased stability) @type mask: C{ndarray} @param mask: replace only certain pixels indicated by this Boolean mask @rtype: C{ndarray} @return: sampled images of the size of the images given as input """ # reshape images into four-dimensional arrays shape = images.shape if images.ndim == 2: images = images[None, :, :, None] elif images.ndim == 3: if self.num_channels > 1: images = images[None] else: images = images[:, :, :, None] # create spatial LSTMs for sampling for l in range(self.num_layers): if self.slstm[l].num_rows != 1 \ or self.slstm[l].num_cols != 1 \ or self.slstm[l].batch_size != images.shape[0]: self.slstm[l] = SLSTM(num_rows=1, num_cols=1, num_channels=sum(self.input_mask) if l < 1 else self.num_hiddens, num_hiddens=self.num_hiddens, batch_size=images.shape[0], nonlinearity=self.nonlinearity, slstm=self.slstm[l], extended=self.extended) # container for hidden and memory unit activations hiddens = [] memory = [] for l in range(self.num_layers): hiddens.append(defaultdict(lambda: 0.)) memory.append(defaultdict(lambda: 0.)) # locate output pixel for i_off, j_off in zip(range(self.output_mask.shape[0]), range(self.output_mask.shape[1])): if any(self.output_mask[i_off, j_off]): break if min_values is not None: min_values = asarray(min_values).reshape(1, 1, 1, -1) if self.output_mask.ndim > 2: min_values = min_values[:, :, :, self.output_mask[i_off, j_off]] if max_values is not None: max_values = asarray(max_values).reshape(1, 1, 1, -1) if self.output_mask.ndim > 2: max_values = max_values[:, :, :, self.output_mask[i_off, j_off]] # unnormalized log-density of generated sample logq = 0. for i in range(images.shape[1] - self.input_mask.shape[0] + 1): for j in range(images.shape[2] - self.input_mask.shape[1] + 1): # extract patches from images patches = images[:, i:i + self.input_mask.shape[0], j:j + self.input_mask.shape[1]] # extract causal neighborhoods from patches inputs = [] for k in range(images.shape[0]): inputs.append( generate_data_from_image(patches[k, :, :], self.input_mask, self.output_mask)[0]) inputs = asarray(inputs) inputs = inputs.reshape(inputs.shape[0], 1, 1, -1) if self.preconditioner: inputs = self._precondition(inputs) # set hidden unit activations for l in range(self.num_layers): self.slstm[l].net.blobs['h_init_i_jm1'].data[:] = hiddens[ l][i, j - 1] self.slstm[l].net.blobs['h_init_im1_j'].data[:] = hiddens[ l][i - 1, j] self.slstm[l].net.blobs['c_init_i_jm1'].data[:] = memory[ l][i, j - 1] self.slstm[l].net.blobs['c_init_im1_j'].data[:] = memory[ l][i - 1, j] # compute hidden unit activations activations = inputs for l in range(self.num_layers): activations = self.slstm[l].forward(activations) # store hidden unit activations for l in range(self.num_layers): hiddens[l][ i, j] = self.slstm[l].net.blobs['outputs'].data.copy() memory[l][ i, j] = self.slstm[l].net.blobs['c_0_0'].data.copy() if mask is not None and not mask[i + i_off, j + j_off]: # skip sampling of this pixel continue for _ in range(10): # sample MCGSM outputs = self.mcgsm.sample(hiddens[-1][i, j].reshape( -1, self.num_hiddens).T) if not any(isnan(outputs)): break print 'Warning: NaNs detected.' 
if return_loglik: logq += self.mcgsm.loglikelihood( hiddens[-1][i, j].reshape(-1, self.num_hiddens).T, outputs) outputs = outputs.T.reshape(outputs.shape[1], 1, 1, outputs.shape[0]) if self.preconditioner: inputs, outputs = self._precondition_inverse( inputs, outputs) if max_values is not None: outputs[outputs > max_values] = max_values[ outputs > max_values] if min_values is not None: outputs[outputs < min_values] = min_values[ outputs < min_values] # insert sampled pixels into images if self.output_mask.ndim > 2: images[:, i + i_off, j + j_off][:, self.output_mask[i_off, j_off]] = outputs else: images[:, i + i_off, j + j_off] = outputs images = images.reshape(*shape) if return_loglik: return images, logq return images def _logq(self, images, mask): """ Computes an unnormalized conditional log-likelihood used by Metropolis-Hastings (e.g., for inpainting). """ inputs, hiddens, outputs = self.hidden_states(images, return_all=True) # locate output pixel for i_off, j_off in zip(range(self.output_mask.shape[0]), range(self.output_mask.shape[1])): if any(self.output_mask[i_off, j_off]): break # unnormalized log-density of generated sample logq = 0. for i in range(images.shape[1] - self.input_mask.shape[0] + 1): for j in range(images.shape[2] - self.input_mask.shape[1] + 1): if not mask[i + i_off, j + j_off]: # skip evaluation of this pixel continue logq += self.mcgsm.loglikelihood( hiddens[:, i, j, :].reshape(-1, self.num_hiddens).T, outputs[:, i, j, :]) return logq def __setstate__(self, state): """ Method used by pickle module, for backwards compatibility reasons. """ self.__dict__ = state if not hasattr(self, 'nonlinearity'): self.nonlinearity = 'TanH' if not hasattr(self, 'extended'): self.extended = False
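# Usage sketch (not part of the original source): how the RIDE class above
# might be fit and used. The random arrays merely stand in for real data;
# any N x rows x cols array of grayscale images works the same way.
from numpy.random import randn

model = RIDE(num_channels=1, num_hiddens=64, num_layers=1, nb_size=5)

images = randn(100, 64, 64)                # placeholder training images
loss = model.train(images, method='ADAM', batch_size=20, num_epochs=1)
model.finetune(images)                     # L-BFGS on the MCGSM only

print 'Performance: {0:.4f} [bit/px]'.format(model.evaluate(images))
samples = model.sample(randn(10, 64, 64))  # boundary pixels taken from the given images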
def test_gradient(self): mcgsm = MCGSM(5, 2, 2, 4, 10) cholesky_factors = [] for k in range(mcgsm.num_components): cholesky_factors.append( cholesky(cov(randn(mcgsm.dim_out, mcgsm.dim_out**2)))) mcgsm.cholesky_factors = cholesky_factors mcgsm.linear_features = randn(mcgsm.num_components, mcgsm.dim_in) / 5. mcgsm.means = randn(mcgsm.dim_out, mcgsm.num_components) / 5. err = mcgsm._check_gradient(randn(mcgsm.dim_in, 1000), randn(mcgsm.dim_out, 1000), 1e-5) self.assertLess(err, 1e-8) # without regularization for param in [ 'priors', 'scales', 'weights', 'features', 'chol', 'pred', 'linear_features', 'means' ]: err = mcgsm._check_gradient( randn(mcgsm.dim_in, 1000), randn(mcgsm.dim_out, 1000), 1e-5, parameters={ 'train_prior': param == 'priors', 'train_scales': param == 'scales', 'train_weights': param == 'weights', 'train_features': param == 'features', 'train_cholesky_factors': param == 'chol', 'train_predictors': param == 'pred', 'train_linear_features': param == 'linear_features', 'train_means': param == 'means', }) self.assertLess(err, 1e-8) # with regularization for norm in ['L1', 'L2']: for param in [ 'priors', 'scales', 'weights', 'features', 'chol', 'pred', 'linear_features', 'means' ]: err = mcgsm._check_gradient( randn(mcgsm.dim_in, 1000), randn(mcgsm.dim_out, 1000), 1e-7, parameters={ 'train_prior': param == 'priors', 'train_scales': param == 'scales', 'train_weights': param == 'weights', 'train_features': param == 'features', 'train_cholesky_factors': param == 'chol', 'train_predictors': param == 'pred', 'train_linear_features': param == 'linear_features', 'train_means': param == 'means', 'regularize_features': { 'strength': 0.4, 'norm': norm }, 'regularize_predictors': { 'strength': 0.5, 'norm': norm }, 'regularize_weights': { 'strength': 0.7, 'norm': norm }, 'regularize_linear_features': { 'strength': 0.3, 'norm': norm }, 'regularize_means': { 'strength': 0.6, 'norm': norm }, }) self.assertLess(err, 1e-6)
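# For reference (hypothetical helper, not part of cmt): _check_gradient used in
# test_gradient above compares the analytic gradient against a finite-difference
# estimate and returns the size of the discrepancy, conceptually like this:
from numpy import zeros_like

def numerical_gradient_error(f, df, x, epsilon=1e-5):
    """Maximum difference between df(x) and a central-difference estimate."""
    num_grad = zeros_like(x)
    for i in range(x.size):
        x_plus, x_minus = x.copy(), x.copy()
        x_plus.flat[i] += epsilon
        x_minus.flat[i] -= epsilon
        # central differences are accurate to O(epsilon^2)
        num_grad.flat[i] = (f(x_plus) - f(x_minus)) / (2. * epsilon)
    return abs(df(x) - num_grad).max()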
def main(argv): parser = ArgumentParser(argv[0], description=__doc__) parser.add_argument('--data', '-d', type=str, default='data/BSDS300_8x8.mat') parser.add_argument('--num_train', '-N', type=int, default=1000000) parser.add_argument('--num_valid', '-V', type=int, default=200000) parser.add_argument('--num_components', '-n', type=int, default=128) parser.add_argument('--num_scales', '-s', type=int, default=4) parser.add_argument('--num_features', '-f', type=int, default=48) parser.add_argument('--train_means', '-M', type=int, default=0) parser.add_argument('--indices', '-I', type=int, default=[], nargs='+') parser.add_argument('--initialize', '-i', type=str, default=None) parser.add_argument('--verbosity', '-v', type=int, default=1) parser.add_argument('--max_iter', '-m', type=int, default=2000) args = parser.parse_args(argv[1:]) experiment = Experiment() data_train = loadmat(args.data)['patches_train'] data_valid = loadmat(args.data)['patches_valid'] if args.initialize: results = Experiment(args.initialize) models = results['models'] preconditioners = results['preconditioners'] else: models = [None] * data_train.shape[1] preconditioners = [None] * data_train.shape[1] def preprocess(data, i, N): if N > 0 and N < data.shape[0]: # select subset of data idx = random_select(N, data.shape[0]) return data[idx, :i].T, data[idx, i][None, :] return data.T[:i], data.T[[i]] for i in range(data_train.shape[1]): if args.indices and i not in args.indices: # skip this one continue print 'Training model {0}/{1}...'.format(i + 1, data_train.shape[1]) inputs_train, outputs_train = preprocess(data_train, i, args.num_train) inputs_valid, outputs_valid = preprocess(data_valid, i, args.num_valid) if i > 0: if preconditioners[i] is None: preconditioners[i] = WhiteningPreconditioner( inputs_train, outputs_train) inputs_train, outputs_train = preconditioners[i](inputs_train, outputs_train) inputs_valid, outputs_valid = preconditioners[i](inputs_valid, outputs_valid) if models[i] is None: models[i] = MCGSM(dim_in=i, dim_out=1, num_components=args.num_components, num_features=args.num_features, num_scales=args.num_scales) models[i].train(inputs_train, outputs_train, inputs_valid, outputs_valid, parameters={ 'verbosity': 1, 'max_iter': args.max_iter, 'train_means': args.train_means > 0 }) else: preconditioners[i] = None if models[i] is None: models[i] = MoGSM(dim=1, num_components=4, num_scales=8) models[i].train(outputs_train, outputs_valid, parameters={ 'verbosity': 1, 'threshold': -1., 'train_means': 1, 'max_iter': 100 }) experiment['args'] = args experiment['models'] = models experiment['preconditioners'] = preconditioners experiment.save( 'results/BSDS300/snapshots/mcgsm_{0}_{1}.{{0}}.{{1}}.xpck'.format( i, args.num_components)) if not args.indices: experiment['args'] = args experiment['models'] = models experiment['preconditioners'] = preconditioners experiment.save('results/BSDS300/mcgsm.{0}.{1}.xpck') return 0
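# Sketch (untested, assuming the conventions of main() above): the trained models
# factorize the patch density via the chain rule, p(x) = p(x_0) * prod_i p(x_i | x_0, ..., x_{i-1}),
# so evaluating a patch sums the conditional log-likelihoods, adding the
# whitening Jacobian wherever a preconditioner was used.
def patch_loglikelihood(patches, models, preconditioners):
    """Patches stored row-wise (N x num_pixels); returns log-likelihoods in nats."""
    loglik = 0.
    for i in range(patches.shape[1]):
        inputs, outputs = patches.T[:i], patches.T[[i]]
        if preconditioners[i] is not None:
            # account for the volume change introduced by whitening
            loglik = loglik + preconditioners[i].logjacobian(inputs, outputs)
            inputs, outputs = preconditioners[i](inputs, outputs)
            loglik = loglik + models[i].loglikelihood(inputs, outputs)
        else:
            # the first pixel is modeled unconditionally by a MoGSM
            loglik = loglik + models[i].loglikelihood(outputs)
    return loglik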
def main(argv): # load image and turn into grayscale img = rgb2gray(imread('media/newyork.png')) # generate data inputs, outputs = generate_data_from_image(img, input_mask, output_mask, 220000) # split data into training, test, and validation sets inputs = split(inputs, [100000, 200000], 1) outputs = split(outputs, [100000, 200000], 1) data_train = inputs[0], outputs[0] data_test = inputs[1], outputs[1] data_valid = inputs[2], outputs[2] # compute normalizing transformation pre = WhiteningPreconditioner(*data_train) # initialize model model = MCGSM(dim_in=data_train[0].shape[0], dim_out=data_train[1].shape[0], num_components=8, num_scales=4, num_features=32) # fit parameters model.initialize(*pre(*data_train)) model.train(*chain(pre(*data_train), pre(*data_valid)), parameters={ 'verbosity': 1, 'max_iter': 1000, 'threshold': 1e-7, 'val_iter': 5, 'val_look_ahead': 10, 'num_grad': 20, }) # evaluate model print 'Average log-likelihood: {0:.4f} [bit/px]'.format( -model.evaluate(data_test[0], data_test[1], pre)) # synthesize a new image img_sample = sample_image(img, model, input_mask, output_mask, pre) imwrite('newyork_sample.png', img_sample, cmap='gray', vmin=min(img), vmax=max(img)) # save model with open('image_model.pck', 'wb') as handle: dump( { 'model': model, 'input_mask': input_mask, 'output_mask': output_mask }, handle, 1) return 0
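# Sketch of restoring what main() above pickled. Note that the preconditioner
# `pre` is not part of the pickle, so it has to be recomputed (or stored
# alongside the model) before the loaded model can be used for sampling.
from pickle import load

with open('image_model.pck', 'rb') as handle:
    stored = load(handle)

model = stored['model']
input_mask = stored['input_mask']
output_mask = stored['output_mask']

# sampling would additionally require recomputing `pre` as in main():
# img_sample = sample_image(img, model, input_mask, output_mask, pre)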
class RIDE_BSDS300(object): """ Basically the same model as L{RIDE} but for the BSDS300 dataset where the bottom-right pixel is commonly ignored. This model should be used in combination with L{PatchRIDE}. """ MAX_BATCH_SIZE = 10000 def __init__(self, num_channels=1, num_hiddens=10, num_components=4, num_scales=4, num_features=16, num_layers=1, nb_size=3, nonlinearity='TanH', verbosity=1, extended=False, input_mask=None, output_mask=None): self.verbosity = verbosity self.num_channels = num_channels self.num_hiddens = num_hiddens self.num_layers = num_layers self.nonlinearity = nonlinearity self.extended = extended self.input_mask, self.output_mask = generate_masks([nb_size] * num_channels) if input_mask is not None: self.input_mask = input_mask if output_mask is not None: self.output_mask = output_mask self.num_channels = sum(self.output_mask) self.slstm = [None] * num_layers self.mcgsm = MCGSM(dim_in=num_hiddens, dim_out=num_channels, num_components=num_components, num_scales=num_scales, num_features=num_features) self.preconditioner = None # see PatchRIDE self._indicators = False def add_layer(self): """ Add another spatial LSTM to the network and reinitialize MCGSM. """ self.num_layers += 1 # reinitialize MCGSM self.mcgsm = MCGSM(dim_in=self.num_hiddens, dim_out=self.num_channels, num_components=self.mcgsm.num_components, num_scales=self.mcgsm.num_scales, num_features=self.mcgsm.num_features) # add slot for another layer self.slstm.append(None) def _precondition(self, inputs, outputs=None): """ Remove any correlations within and between inputs and outputs. """ shape = inputs.shape if outputs is None: if self.preconditioner is None: raise RuntimeError('No preconditioning possible.') inputs = inputs.reshape(-1, inputs.shape[-1]).T inputs = self.preconditioner(inputs) inputs = inputs.T.reshape(*shape) return inputs else: inputs = inputs.reshape(-1, inputs.shape[-1]).T outputs = outputs.reshape(-1, outputs.shape[-1]).T # avoids memory issues MAX_SAMPLES = 5000000 if self.preconditioner is None: inputs_ = inputs if self._indicators: # half of the inputs are indicators, don't preprocess them inputs_ = inputs.copy() inputs_[inputs.shape[0] // 2:] = randn( inputs.shape[0] // 2, *inputs.shape[1:]) if inputs.shape[1] > MAX_SAMPLES: idx = random_select(MAX_SAMPLES, inputs.shape[1]) self.preconditioner = WhiteningPreconditioner( inputs_[:, idx], outputs[:, idx]) else: self.preconditioner = WhiteningPreconditioner( inputs_, outputs) # precondition for b in range(0, inputs.shape[1], MAX_SAMPLES): inputs[:, b:b + MAX_SAMPLES], outputs[:, b:b + MAX_SAMPLES] = \ self.preconditioner(inputs[:, b:b + MAX_SAMPLES], outputs[:, b:b + MAX_SAMPLES]) inputs = inputs.T.reshape(*shape) outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1) return inputs, outputs def _precondition_inverse(self, inputs, outputs=None): """ Reintroduce correlations removed by conditional whitening. """ if self.preconditioner is None: raise RuntimeError('No preconditioner set.') shape = inputs.shape if outputs is None: inputs = inputs.reshape(-1, inputs.shape[-1]).T inputs = self.preconditioner.inverse(inputs) inputs = inputs.T.reshape(*shape) return inputs else: inputs = inputs.reshape(-1, inputs.shape[-1]).T outputs = outputs.reshape(-1, outputs.shape[-1]).T inputs, outputs = self.preconditioner.inverse(inputs, outputs) inputs = inputs.T.reshape(*shape) outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1) return inputs, outputs def _preprocess(self, images): """ Extract causal neighborhoods from images. 
""" def process(image): inputs, outputs = generate_data_from_image(image, self.input_mask, self.output_mask) inputs = asarray(inputs.T.reshape( image.shape[0] - self.input_mask.shape[0] + 1, image.shape[1] - self.input_mask.shape[1] + 1, -1), dtype='float32') outputs = asarray(outputs.T.reshape( image.shape[0] - self.input_mask.shape[0] + 1, image.shape[1] - self.input_mask.shape[1] + 1, -1), dtype='float32') return inputs, outputs inputs, outputs = zip(*mapp(process, images)) return asarray(inputs), asarray(outputs) def loglikelihood(self, images): """ Returns a log-likelihood for each pixel except the bottom-right pixel (in nats). """ inputs, outputs = self._preprocess(images) if self.preconditioner is not None: if self.verbosity > 0: print 'Computing Jacobian...' logjacobian = self.preconditioner.logjacobian( inputs.reshape(-1, sum(self.input_mask)).T, outputs.reshape(-1, self.num_channels).T) if self.verbosity > 0: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) else: logjacobian = 0. # compute hidden unit activations hiddens = inputs for l in range(self.num_layers): # create SLSTM self.slstm[l] = SLSTM(num_rows=hiddens.shape[1], num_cols=hiddens.shape[2], num_channels=hiddens.shape[3], num_hiddens=self.num_hiddens, batch_size=min( [hiddens.shape[0], self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) hiddens = self.slstm[l].forward(hiddens) if self.verbosity > 0: print 'Computing likelihood...' # evaluate log-likelihood loglik = self.mcgsm.loglikelihood( hiddens.reshape(-1, self.num_hiddens).T, outputs.reshape(-1, self.num_channels).T) + logjacobian # remove bottom-right pixel loglik = loglik.reshape(hiddens.shape[0], -1) loglik = loglik[:, :-1] return loglik def evaluate(self, images): """ Computes the average negative log-likelihood in bits per pixel. """ MAX_IMAGES = 100000 loglik = [] for b in range(0, len(images), MAX_IMAGES): loglik.append(mean(self.loglikelihood(images[b:b + MAX_IMAGES]))) return -mean(loglik) / log(2.) def train(self, images, batch_size=50, num_epochs=20, method='SGD', train_means=False, train_top_layer=False, momentum=0.9, learning_rate=1., decay1=0.9, decay2=0.999, precondition=True): """ @type images: C{ndarray}/C{list} @param images: an array or a list of images """ print 'Preprocessing...' inputs, outputs = self._preprocess(images) if precondition: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) # indicates which layers will be trained train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers) print 'Creating SLSTMs...' 
# create SLSTMs for l in range(self.num_layers): self.slstm[l] = SLSTM( num_rows=inputs.shape[1], num_cols=inputs.shape[2], num_channels=inputs.shape[3] if l < 1 else self.num_hiddens, num_hiddens=self.num_hiddens, batch_size=min([batch_size, self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) # compute loss function and its gradient def f_df(params, idx): # set model parameters for l in train_layers: self.slstm[l].set_parameters(params['slstm'][l]) self.mcgsm._set_parameters(params['mcgsm'], {'train_means': train_means}) # select batch and compute hidden activations Y = outputs[idx:idx + batch_size] H = inputs[idx:idx + batch_size] for l in range(self.num_layers): H = self.slstm[l].forward(H) # form inputs to MCGSM H_flat = H.reshape(-1, self.num_hiddens).T Y_flat = Y.reshape(-1, self.num_channels).T norm_const = -H_flat.shape[1] # compute gradients df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat) df_dh = df_dh.T.reshape(*H.shape) / norm_const # ignore bottom-right pixel (BSDS300) df_dh[:, -1, -1] = 0. # average negative log-likelihood f = sum(loglik) / norm_const df_dtheta = {} df_dtheta['slstm'] = [0.] * self.num_layers for l in range(self.num_layers)[::-1]: if l not in train_layers: break if l > min(train_layers): # derivative with respect to inputs of layer l are derivatives # of hidden states of layer l - 1 df_dtheta['slstm'][l] = self.slstm[l].backward( df_dh, force_backward=True) df_dh = df_dtheta['slstm'][l]['inputs'] del df_dtheta['slstm'][l]['inputs'] else: # no need to compute derivatives with respect to input units df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh) # compute gradient of MCGSM df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient( H_flat, Y_flat, parameters={'train_means': train_means }) * log(2.) * self.mcgsm.dim_out return f, df_dtheta # collect current parameters params = {} params['slstm'] = [0.] * self.num_layers for l in range(self.num_layers)[::-1]: if l not in train_layers: break params['slstm'][l] = self.slstm[l].parameters() params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means}) # a start index for each batch start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size) print 'Training...' if method.upper() == 'SFO': try: # optimize using sum-of-functions optimizer optimizer = SFO(f_df, params, start_indices, display=self.verbosity) params_opt = optimizer.optimize(num_passes=num_epochs) # set model parameters for l in range(self.num_layers): self.slstm[l].set_parameters(params_opt['slstm'][l]) self.mcgsm._set_parameters(params_opt['mcgsm'], {'train_means': train_means}) except KeyboardInterrupt: pass return optimizer.hist_f_flat elif method.upper() == 'SGD': loss = [] diff = { 'slstm': [0.] 
* self.num_layers, 'mcgsm': zeros_like(params['mcgsm']) } for l in train_layers: diff['slstm'][l] = {} for key in params['slstm'][l]: diff['slstm'][l][key] = zeros_like(params['slstm'][l][key]) for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f) # update SLSTM parameters for l in train_layers: for key in params['slstm'][l]: diff['slstm'][l][key] = momentum * diff['slstm'][ l][key] - df['slstm'][l][key] params['slstm'][l][key] = params['slstm'][l][ key] + learning_rate * diff['slstm'][l][key] # update MCGSM parameters diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm'] params['mcgsm'] = params[ 'mcgsm'] + learning_rate * diff['mcgsm'] if self.verbosity > 0: print '{0:>5} {1:>10.4f} {2:>10.4f}'.format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])) return loss elif method.upper() == 'ADAM': loss = [] diff_mean = { 'slstm': [0.] * self.num_layers, 'mcgsm': zeros_like(params['mcgsm']) } diff_sqrd = { 'slstm': [0.] * self.num_layers, 'mcgsm': zeros_like(params['mcgsm']) } for l in train_layers: diff_mean['slstm'][l] = {} diff_sqrd['slstm'][l] = {} for key in params['slstm'][l]: diff_mean['slstm'][l][key] = zeros_like( params['slstm'][l][key]) diff_sqrd['slstm'][l][key] = zeros_like( params['slstm'][l][key]) # step counter t = 1 for n in range(num_epochs): for b in range(0, inputs.shape[0] - batch_size + 1, batch_size): # compute gradients f, df = f_df(params, b) loss.append(f) # include bias correction in step width step_width = learning_rate / ( 1. - power(decay1, t)) * sqrt(1. - power(decay2, t)) t += 1 # update SLSTM parameters for l in train_layers: for key in params['slstm'][l]: diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \ + (1. - decay1) * df['slstm'][l][key] diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \ + (1. - decay2) * square(df['slstm'][l][key]) params['slstm'][l][key] = params['slstm'][l][key] - \ step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key])) # update MCGSM parameters diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + ( 1. - decay1) * df['mcgsm'] diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + ( 1. - decay2) * square(df['mcgsm']) params['mcgsm'] = params['mcgsm'] - \ step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm'])) if self.verbosity > 0: print '{0:>5} {1:>10.4f} {2:>10.4f}'.format( n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])) return loss else: raise ValueError('Unknown method \'{0}\'.'.format(method)) def finetune(self, images, max_iter=1000, train_means=False, num_samples_train=500000, num_samples_valid=100000): """ Train MCGSM using L-BFGS while keeping parameters of SLSTM fixed. @type images: C{ndarray}/C{list} @param images: an array or a list of images """ if images.shape[0] > num_samples_train: images = images[random_select(num_samples_train, images.shape[0])] print 'Preprocessing...' inputs, outputs = self._preprocess(images) if self.preconditioner: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) # compute hidden unit activations hiddens = inputs print 'Forward...' 
for l in range(self.num_layers): self.slstm[l] = SLSTM(num_rows=hiddens.shape[1], num_cols=hiddens.shape[2], num_channels=hiddens.shape[3], num_hiddens=self.num_hiddens, batch_size=min( [hiddens.shape[0], self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) hiddens = self.slstm[l].forward(hiddens) print 'Reshape...' # remove bottom-right pixels (BSDS300) hiddens = hiddens.reshape(hiddens.shape[0], -1, self.num_hiddens) outputs = outputs.reshape(outputs.shape[0], -1, self.num_channels) hiddens = hiddens[:, :-1] outputs = outputs[:, :-1] # form inputs to MCGSM hiddens = hiddens.reshape(-1, self.num_hiddens).T outputs = outputs.reshape(-1, self.num_channels).T print 'Finetuning...' if hiddens.shape[1] > num_samples_train: num_samples_valid = min( [num_samples_valid, hiddens.shape[1] - num_samples_train]) # select subset of data points for finetuning idx = random_select(num_samples_train + num_samples_valid, hiddens.shape[1]) if num_samples_valid > 0: # split data into training and validation set hiddens_train = asarray(hiddens[:, idx[:num_samples_train]], order='F') outputs_train = asarray(outputs[:, idx[:num_samples_train]], order='F') hiddens_valid = asarray(hiddens[:, idx[num_samples_train:]], order='F') outputs_valid = asarray(outputs[:, idx[num_samples_train:]], order='F') # finetune with early stopping based on validation performance return self.mcgsm.train(hiddens_train, outputs_train, hiddens_valid, outputs_valid, parameters={ 'verbosity': self.verbosity, 'train_means': train_means, 'max_iter': max_iter }) else: hiddens = asarray(hiddens[:, idx], order='F') outputs = asarray(outputs[:, idx], order='F') return self.mcgsm.train(hiddens, outputs, parameters={ 'verbosity': self.verbosity, 'train_means': train_means, 'max_iter': max_iter }) def hidden_states(self, images, return_all=False): print 'Preprocessing...' inputs, outputs = self._preprocess(images) if self.preconditioner: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) # compute hidden unit activations hiddens = inputs for l in range(self.num_layers): self.slstm[l] = SLSTM(num_rows=hiddens.shape[1], num_cols=hiddens.shape[2], num_channels=hiddens.shape[3], num_hiddens=self.num_hiddens, batch_size=min( [hiddens.shape[0], self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) hiddens = self.slstm[l].forward(hiddens) if return_all: return inputs, hiddens, outputs return hiddens def sample(self, images, min_values=None, max_values=None): """ Sample one or several images. 
@type images: C{ndarray} @param images: an array or a list of images to initialize pixels at boundaries """ if min_values is not None: min_values = asarray(min_values).reshape(1, 1, 1, -1) if max_values is not None: max_values = asarray(max_values).reshape(1, 1, 1, -1) # reshape images into four-dimensional arrays shape = images.shape if images.ndim == 2: images = images[None, :, :, None] elif images.ndim == 3: if self.num_channels > 1: images = images[None] else: images = images[:, :, :, None] # create spatial LSTMs for sampling slstm = [] for l in range(self.num_layers): slstm.append( SLSTM(num_rows=1, num_cols=1, num_channels=sum(self.input_mask) if l < 1 else self.num_hiddens, num_hiddens=self.num_hiddens, batch_size=images.shape[0], nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity)) # container for hidden and memory unit activations hiddens = [] memory = [] for l in range(self.num_layers): hiddens.append(defaultdict(lambda: 0.)) memory.append(defaultdict(lambda: 0.)) # locate output pixel for i_off, j_off in zip(range(self.output_mask.shape[0]), range(self.output_mask.shape[1])): if any(self.output_mask[i_off, j_off]): break for i in range(images.shape[1] - self.input_mask.shape[0] + 1): for j in range(images.shape[2] - self.input_mask.shape[1] + 1): # extract patches from images patches = images[:, i:i + self.input_mask.shape[0], j:j + self.input_mask.shape[1]] # extract causal neighborhoods from patches inputs = [] for k in range(images.shape[0]): inputs.append( generate_data_from_image(patches[k, :, :], self.input_mask, self.output_mask)[0]) inputs = asarray(inputs) inputs = inputs.reshape(inputs.shape[0], 1, 1, -1) if self.preconditioner: inputs = self._precondition(inputs) # set hidden unit activations for l in range(self.num_layers): slstm[l].net.blobs['h_init_i_jm1'].data[:] = hiddens[l][i, j - 1] slstm[l].net.blobs['h_init_im1_j'].data[:] = hiddens[l][i - 1, j] slstm[l].net.blobs['c_init_i_jm1'].data[:] = memory[l][i, j - 1] slstm[l].net.blobs['c_init_im1_j'].data[:] = memory[l][i - 1, j] # compute hidden unit activations activations = inputs for l in range(self.num_layers): activations = slstm[l].forward(activations) # store hidden unit activations for l in range(self.num_layers): hiddens[l][i, j] = slstm[l].net.blobs['outputs'].data.copy() memory[l][i, j] = slstm[l].net.blobs['c_0_0'].data.copy() for _ in range(10): # sample MCGSM outputs = self.mcgsm.sample(hiddens[-1][i, j].reshape( -1, self.num_hiddens).T) outputs = outputs.T.reshape(outputs.shape[1], 1, 1, outputs.shape[0]) if not any(isnan(outputs)): break print 'Warning: NaNs detected.' if self.preconditioner: inputs, outputs = self._precondition_inverse( inputs, outputs) if max_values is not None: outputs[outputs > max_values] = max_values[ outputs > max_values] if min_values is not None: outputs[outputs < min_values] = min_values[ outputs < min_values] # insert sampled pixels into images images[:, i + i_off, j + j_off][self.output_mask[i_off, j_off]] = outputs return images.reshape(*shape) def __setstate__(self, state): self.__dict__ = state if not hasattr(self, 'nonlinearity'): self.nonlinearity = 'TanH' if not hasattr(self, 'extended'): self.extended = False
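# Rough standalone sketch (an assumption: in practice RIDE_BSDS300 is wrapped
# by PatchRIDE). Random arrays stand in for actual BSDS300 patches treated as
# tiny images. Since the dataset convention drops the bottom-right pixel,
# loglikelihood() returns one value per remaining pixel and evaluate()
# averages over those predicted pixels only.
from numpy.random import randn

model = RIDE_BSDS300(num_channels=1, num_hiddens=32, nb_size=3)

patches = randn(1000, 8, 8)  # placeholder for 8x8 BSDS300 patches
model.train(patches, method='ADAM', batch_size=50, num_epochs=1)
model.finetune(patches)

print 'Performance: {0:.4f} [bit/px]'.format(model.evaluate(patches))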
parser.add_argument('--repetitions', '-r', type=int, default=2) args = parser.parse_args(sys.argv[1:]) ### print socket.gethostname() print datetime.now() print args print ### data = randn(args.dim_in, args.num_data), randn(args.dim_out, args.num_data) model = MCGSM( dim_in=args.dim_in, dim_out=args.dim_out, num_components=12, num_features=40, num_scales=6) ### print 'model.loglikelihood' t = time() for r in range(args.repetitions): model.loglikelihood(*data) print '{0:12.8f} seconds'.format((time() - t) / float(args.repetitions)) print ### print 'model._check_performance' for batch_size in [1000, 2000, 5000]: t = model._check_performance(*data, repetitions=args.repetitions, parameters={'batch_size': batch_size})
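# An analogous timing block for the data gradient could be appended along the
# same lines (sketch; _data_gradient is the method the RIDE training loop
# above calls):
print 'model._data_gradient'
t = time()
for r in range(args.repetitions):
    model._data_gradient(*data)
print '{0:12.8f} seconds'.format((time() - t) / float(args.repetitions))
print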
class RIDE_BSDS300(object): """ Basically the same model as L{RIDE} but for the BSDS300 dataset where the bottom-right pixel is commonly ignored. This model should be used in combination with L{PatchRIDE}. """ MAX_BATCH_SIZE = 10000 def __init__(self, num_channels=1, num_hiddens=10, num_components=4, num_scales=4, num_features=16, num_layers=1, nb_size=3, nonlinearity='TanH', verbosity=1, extended=False, input_mask=None, output_mask=None): self.verbosity = verbosity self.num_channels = num_channels self.num_hiddens = num_hiddens self.num_layers = num_layers self.nonlinearity = nonlinearity self.extended = extended self.input_mask, self.output_mask = generate_masks([nb_size] * num_channels) if input_mask: self.input_mask = input_mask if output_mask: self.output_mask = output_mask self.num_channels = sum(self.output_mask) self.slstm = [None] * num_layers self.mcgsm = MCGSM( dim_in=num_hiddens, dim_out=num_channels, num_components=num_components, num_scales=num_scales, num_features=num_features) self.preconditioner = None # see PatchRIDE self._indicators = False def add_layer(self): """ Add another spatial LSTM to the network and reinitialize MCGSM. """ self.num_layers += 1 # reinitialize MCGSM self.mcgsm = MCGSM( dim_in=self.num_hiddens, dim_out=self.num_channels, num_components=self.mcgsm.num_components, num_scales=self.mcgsm.num_scales, num_features=self.mcgsm.num_features) # add slot for another layer self.slstm.append(None) def _precondition(self, inputs, outputs=None): """ Remove any correlations within and between inputs and outputs. """ shape = inputs.shape if outputs is None: if self.preconditioner is None: raise RuntimeError('No preconditioning possible.') inputs = inputs.reshape(-1, inputs.shape[-1]).T inputs = self.preconditioner(inputs) inputs = inputs.T.reshape(*shape) return inputs else: inputs = inputs.reshape(-1, inputs.shape[-1]).T outputs = outputs.reshape(-1, outputs.shape[-1]).T # avoids memory issues MAX_SAMPLES = 5000000 if self.preconditioner is None: inputs_ = inputs if self._indicators: # half of the inputs are indicators, don't preprocess them inputs_ = inputs.copy() inputs_[inputs.shape[0] // 2:] = randn(inputs.shape[0] // 2, *inputs.shape[1:]) if inputs.shape[1] > MAX_SAMPLES: idx = random_select(MAX_SAMPLES, inputs.shape[1]) self.preconditioner = WhiteningPreconditioner(inputs_[:, idx], outputs[:, idx]) else: self.preconditioner = WhiteningPreconditioner(inputs_, outputs) # precondition for b in range(0, inputs.shape[1], MAX_SAMPLES): inputs[:, b:b + MAX_SAMPLES], outputs[:, b:b + MAX_SAMPLES] = \ self.preconditioner(inputs[:, b:b + MAX_SAMPLES], outputs[:, b:b + MAX_SAMPLES]) inputs = inputs.T.reshape(*shape) outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1) return inputs, outputs def _precondition_inverse(self, inputs, outputs=None): """ Remove any correlations within and between inputs and outputs. """ if self.preconditioner is None: raise RuntimeError('No preconditioner set.') shape = inputs.shape if outputs is None: inputs = inputs.reshape(-1, inputs.shape[-1]).T inputs = self.preconditioner.inverse(inputs) inputs = inputs.T.reshape(*shape) return inputs else: inputs = inputs.reshape(-1, inputs.shape[-1]).T outputs = outputs.reshape(-1, outputs.shape[-1]).T inputs, outputs = self.preconditioner.inverse(inputs, outputs) inputs = inputs.T.reshape(*shape) outputs = outputs.T.reshape(shape[0], shape[1], shape[2], -1) return inputs, outputs def _preprocess(self, images): """ Extract causal neighborhoods from images. 
""" def process(image): inputs, outputs = generate_data_from_image( image, self.input_mask, self.output_mask) inputs = asarray( inputs.T.reshape( image.shape[0] - self.input_mask.shape[0] + 1, image.shape[1] - self.input_mask.shape[1] + 1, -1), dtype='float32') outputs = asarray( outputs.T.reshape( image.shape[0] - self.input_mask.shape[0] + 1, image.shape[1] - self.input_mask.shape[1] + 1, -1), dtype='float32') return inputs, outputs inputs, outputs = zip(*mapp(process, images)) return asarray(inputs), asarray(outputs) def loglikelihood(self, images): """ Returns a log-likelihood for each pixel except the bottom-right pixel (in nats). """ inputs, outputs = self._preprocess(images) if self.preconditioner is not None: if self.verbosity > 0: print 'Computing Jacobian...' logjacobian = self.preconditioner.logjacobian( inputs.reshape(-1, sum(self.input_mask)).T, outputs.reshape(-1, self.num_channels).T) if self.verbosity > 0: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) else: logjacobian = 0. # compute hidden unit activations hiddens = inputs for l in range(self.num_layers): # create SLSTM self.slstm[l] = SLSTM( num_rows=hiddens.shape[1], num_cols=hiddens.shape[2], num_channels=hiddens.shape[3], num_hiddens=self.num_hiddens, batch_size=min([hiddens.shape[0], self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) hiddens = self.slstm[l].forward(hiddens) if self.verbosity > 0: print 'Computing likelihood...' # evaluate log-likelihood loglik = self.mcgsm.loglikelihood( hiddens.reshape(-1, self.num_hiddens).T, outputs.reshape(-1, self.num_channels).T) + logjacobian # remove bottom-right pixel loglik = loglik.reshape(hiddens.shape[0], -1) loglik = loglik[:, :-1] return loglik def evaluate(self, images): """ Computes the average negative log-likelihood in bits per pixel. """ MAX_IMAGES = 100000 loglik = [] for b in range(0, len(images), MAX_IMAGES): loglik.append(mean(self.loglikelihood(images[b:b + MAX_IMAGES]))) return -mean(loglik) / log(2.) def train(self, images, batch_size=50, num_epochs=20, method='SGD', train_means=False, train_top_layer=False, momentum=0.9, learning_rate=1., decay1=0.9, decay2=0.999, precondition=True): """ @type images: C{ndarray}/C{list} @param images: an array or a list of images """ print 'Preprocessing...' inputs, outputs = self._preprocess(images) if precondition: print 'Preconditioning...' # remove correlations inputs, outputs = self._precondition(inputs, outputs) # indicates which layers will be trained train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers) print 'Creating SLSTMs...' 
# create SLSTMs for l in range(self.num_layers): self.slstm[l] = SLSTM( num_rows=inputs.shape[1], num_cols=inputs.shape[2], num_channels=inputs.shape[3] if l < 1 else self.num_hiddens, num_hiddens=self.num_hiddens, batch_size=min([batch_size, self.MAX_BATCH_SIZE]), nonlinearity=self.nonlinearity, extended=self.extended, slstm=self.slstm[l], verbosity=self.verbosity) # compute loss function and its gradient def f_df(params, idx): # set model parameters for l in train_layers: self.slstm[l].set_parameters(params['slstm'][l]) self.mcgsm._set_parameters(params['mcgsm'], {'train_means': train_means}) # select batch and compute hidden activations Y = outputs[idx:idx + batch_size] H = inputs[idx:idx + batch_size] for l in range(self.num_layers): H = self.slstm[l].forward(H) # form inputs to MCGSM H_flat = H.reshape(-1, self.num_hiddens).T Y_flat = Y.reshape(-1, self.num_channels).T norm_const = -H_flat.shape[1] # compute gradients df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat) df_dh = df_dh.T.reshape(*H.shape) / norm_const # ignore bottom-right pixel (BSDS300) df_dh[:, -1, -1] = 0. # average negative log-likelihood f = sum(loglik) / norm_const df_dtheta = {} df_dtheta['slstm'] = [0.] * self.num_layers for l in range(self.num_layers)[::-1]: if l not in train_layers: break if l > min(train_layers): # derivative with respect to inputs of layer l are derivatives # of hidden states of layer l - 1 df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh, force_backward=True) df_dh = df_dtheta['slstm'][l]['inputs'] del df_dtheta['slstm'][l]['inputs'] else: # no need to compute derivatives with respect to input units df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh) # compute gradient of MCGSM df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(H_flat, Y_flat, parameters={'train_means': train_means}) * log(2.) * self.mcgsm.dim_out return f, df_dtheta # collect current parameters params = {} params['slstm'] = [0.] * self.num_layers for l in range(self.num_layers)[::-1]: if l not in train_layers: break params['slstm'][l] = self.slstm[l].parameters() params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means}) # a start index for each batch start_indices = range( 0, inputs.shape[0] - batch_size + 1, batch_size) print 'Training...' if method.upper() == 'SFO': try: # optimize using sum-of-functions optimizer optimizer = SFO(f_df, params, start_indices, display=self.verbosity) params_opt = optimizer.optimize(num_passes=num_epochs) # set model parameters for l in range(self.num_layers): self.slstm[l].set_parameters(params_opt['slstm'][l]) self.mcgsm._set_parameters(params_opt['mcgsm'], {'train_means': train_means}) except KeyboardInterrupt: pass return optimizer.hist_f_flat elif method.upper() == 'SGD': loss = [] diff = { 'slstm': [0.] 
		diff = {
			'slstm': [0.] * self.num_layers,
			'mcgsm': zeros_like(params['mcgsm'])}

		for l in train_layers:
			diff['slstm'][l] = {}
			for key in params['slstm'][l]:
				diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

		for n in range(num_epochs):
			for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
				# compute gradients
				f, df = f_df(params, b)

				loss.append(f)

				# update SLSTM parameters
				for l in train_layers:
					for key in params['slstm'][l]:
						diff['slstm'][l][key] = momentum * diff['slstm'][l][key] - df['slstm'][l][key]
						params['slstm'][l][key] = params['slstm'][l][key] + learning_rate * diff['slstm'][l][key]

				# update MCGSM parameters
				diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
				params['mcgsm'] = params['mcgsm'] + learning_rate * diff['mcgsm']

				if self.verbosity > 0:
					print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
						n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):]))

		return loss

	elif method.upper() == 'ADAM':
		loss = []
		diff_mean = {
			'slstm': [0.] * self.num_layers,
			'mcgsm': zeros_like(params['mcgsm'])}
		diff_sqrd = {
			'slstm': [0.] * self.num_layers,
			'mcgsm': zeros_like(params['mcgsm'])}

		for l in train_layers:
			diff_mean['slstm'][l] = {}
			diff_sqrd['slstm'][l] = {}
			for key in params['slstm'][l]:
				diff_mean['slstm'][l][key] = zeros_like(params['slstm'][l][key])
				diff_sqrd['slstm'][l][key] = zeros_like(params['slstm'][l][key])

		# step counter
		t = 1

		for n in range(num_epochs):
			for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
				# compute gradients
				f, df = f_df(params, b)

				loss.append(f)

				# include bias correction in step width
				step_width = learning_rate / (1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
				t += 1

				# update SLSTM parameters
				for l in train_layers:
					for key in params['slstm'][l]:
						diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
							+ (1. - decay1) * df['slstm'][l][key]
						diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
							+ (1. - decay2) * square(df['slstm'][l][key])
						params['slstm'][l][key] = params['slstm'][l][key] - \
							step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

				# update MCGSM parameters
				diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (1. - decay1) * df['mcgsm']
				diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (1. - decay2) * square(df['mcgsm'])
				params['mcgsm'] = params['mcgsm'] - \
					step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

				if self.verbosity > 0:
					print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
						n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):]))

		return loss

	else:
		raise ValueError('Unknown method \'{0}\'.'.format(method))

def finetune(self, images,
		max_iter=1000,
		train_means=False,
		num_samples_train=500000,
		num_samples_valid=100000):
	"""
	Train MCGSM using L-BFGS while keeping parameters of SLSTM fixed.

	@type images: C{ndarray}/C{list}
	@param images: an array or a list of images
	"""

	if images.shape[0] > num_samples_train:
		images = images[random_select(num_samples_train, images.shape[0])]

	print 'Preprocessing...'

	inputs, outputs = self._preprocess(images)

	if self.preconditioner:
		print 'Preconditioning...'

		# remove correlations
		inputs, outputs = self._precondition(inputs, outputs)

	# compute hidden unit activations
	hiddens = inputs

	print 'Forward...'
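	# the SLSTMs are rebuilt because the spatial dimensions of the data may
	# differ from those seen during training; passing `slstm=self.slstm[l]`
	# presumably carries the already trained parameters over to the new network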
	for l in range(self.num_layers):
		self.slstm[l] = SLSTM(
			num_rows=hiddens.shape[1],
			num_cols=hiddens.shape[2],
			num_channels=hiddens.shape[3],
			num_hiddens=self.num_hiddens,
			batch_size=min([hiddens.shape[0], self.MAX_BATCH_SIZE]),
			nonlinearity=self.nonlinearity,
			extended=self.extended,
			slstm=self.slstm[l],
			verbosity=self.verbosity)

		hiddens = self.slstm[l].forward(hiddens)

	print 'Reshape...'

	# remove bottom-right pixels (BSDS300)
	hiddens = hiddens.reshape(hiddens.shape[0], -1, self.num_hiddens)
	outputs = outputs.reshape(outputs.shape[0], -1, self.num_channels)
	hiddens = hiddens[:, :-1]
	outputs = outputs[:, :-1]

	# form inputs to MCGSM
	hiddens = hiddens.reshape(-1, self.num_hiddens).T
	outputs = outputs.reshape(-1, self.num_channels).T

	print 'Finetuning...'

	if hiddens.shape[1] > num_samples_train:
		num_samples_valid = min([num_samples_valid, hiddens.shape[1] - num_samples_train])

		# select subset of data points for finetuning
		idx = random_select(num_samples_train + num_samples_valid, hiddens.shape[1])

		if num_samples_valid > 0:
			# split data into training and validation set
			hiddens_train = asarray(hiddens[:, idx[:num_samples_train]], order='F')
			outputs_train = asarray(outputs[:, idx[:num_samples_train]], order='F')
			hiddens_valid = asarray(hiddens[:, idx[num_samples_train:]], order='F')
			outputs_valid = asarray(outputs[:, idx[num_samples_train:]], order='F')

			# finetune with early stopping based on validation performance
			return self.mcgsm.train(
				hiddens_train, outputs_train,
				hiddens_valid, outputs_valid,
				parameters={
					'verbosity': self.verbosity,
					'train_means': train_means,
					'max_iter': max_iter})
		else:
			hiddens = asarray(hiddens[:, idx], order='F')
			outputs = asarray(outputs[:, idx], order='F')

	return self.mcgsm.train(hiddens, outputs, parameters={
		'verbosity': self.verbosity,
		'train_means': train_means,
		'max_iter': max_iter})

def hidden_states(self, images, return_all=False):
	print 'Preprocessing...'

	inputs, outputs = self._preprocess(images)

	if self.preconditioner:
		print 'Preconditioning...'

		# remove correlations
		inputs, outputs = self._precondition(inputs, outputs)

	# compute hidden unit activations
	hiddens = inputs

	for l in range(self.num_layers):
		self.slstm[l] = SLSTM(
			num_rows=hiddens.shape[1],
			num_cols=hiddens.shape[2],
			num_channels=hiddens.shape[3],
			num_hiddens=self.num_hiddens,
			batch_size=min([hiddens.shape[0], self.MAX_BATCH_SIZE]),
			nonlinearity=self.nonlinearity,
			extended=self.extended,
			slstm=self.slstm[l],
			verbosity=self.verbosity)

		hiddens = self.slstm[l].forward(hiddens)

	if return_all:
		return inputs, hiddens, outputs
	return hiddens

def sample(self, images,
		min_values=None,
		max_values=None):
	"""
	Sample one or several images.
	@type images: C{ndarray}
	@param images: an array or a list of images to initialize pixels at boundaries
	"""

	if min_values is not None:
		min_values = asarray(min_values).reshape(1, 1, 1, -1)
	if max_values is not None:
		max_values = asarray(max_values).reshape(1, 1, 1, -1)

	# reshape images into four-dimensional arrays
	shape = images.shape
	if images.ndim == 2:
		images = images[None, :, :, None]
	elif images.ndim == 3:
		if self.num_channels > 1:
			images = images[None]
		else:
			images = images[:, :, :, None]

	# create spatial LSTMs for sampling
	slstm = []
	for l in range(self.num_layers):
		slstm.append(SLSTM(
			num_rows=1,
			num_cols=1,
			num_channels=sum(self.input_mask) if l < 1 else self.num_hiddens,
			num_hiddens=self.num_hiddens,
			batch_size=images.shape[0],
			nonlinearity=self.nonlinearity,
			extended=self.extended,
			slstm=self.slstm[l],
			verbosity=self.verbosity))

	# container for hidden and memory unit activations
	hiddens = []
	memory = []
	for l in range(self.num_layers):
		hiddens.append(defaultdict(lambda: 0.))
		memory.append(defaultdict(lambda: 0.))

	# locate output pixel
	for i_off, j_off in zip(
			range(self.output_mask.shape[0]),
			range(self.output_mask.shape[1])):
		if any(self.output_mask[i_off, j_off]):
			break

	for i in range(images.shape[1] - self.input_mask.shape[0] + 1):
		for j in range(images.shape[2] - self.input_mask.shape[1] + 1):
			# extract patches from images
			patches = images[:,
				i:i + self.input_mask.shape[0],
				j:j + self.input_mask.shape[1]]

			# extract causal neighborhoods from patches
			inputs = []
			for k in range(images.shape[0]):
				inputs.append(
					generate_data_from_image(
						patches[k, :, :], self.input_mask, self.output_mask)[0])
			inputs = asarray(inputs)
			inputs = inputs.reshape(inputs.shape[0], 1, 1, -1)

			if self.preconditioner:
				inputs = self._precondition(inputs)

			# set hidden unit activations
			for l in range(self.num_layers):
				slstm[l].net.blobs['h_init_i_jm1'].data[:] = hiddens[l][i, j - 1]
				slstm[l].net.blobs['h_init_im1_j'].data[:] = hiddens[l][i - 1, j]
				slstm[l].net.blobs['c_init_i_jm1'].data[:] = memory[l][i, j - 1]
				slstm[l].net.blobs['c_init_im1_j'].data[:] = memory[l][i - 1, j]

			# compute hidden unit activations
			activations = inputs
			for l in range(self.num_layers):
				activations = slstm[l].forward(activations)

			# store hidden unit activations
			for l in range(self.num_layers):
				hiddens[l][i, j] = slstm[l].net.blobs['outputs'].data.copy()
				memory[l][i, j] = slstm[l].net.blobs['c_0_0'].data.copy()

			for _ in range(10):
				# sample MCGSM
				outputs = self.mcgsm.sample(
					hiddens[-1][i, j].reshape(-1, self.num_hiddens).T)
				outputs = outputs.T.reshape(outputs.shape[1], 1, 1, outputs.shape[0])

				if not any(isnan(outputs)):
					break
				print 'Warning: NaNs detected.'

			if self.preconditioner:
				inputs, outputs = self._precondition_inverse(inputs, outputs)

			if max_values is not None:
				outputs[outputs > max_values] = max_values[outputs > max_values]
			if min_values is not None:
				outputs[outputs < min_values] = min_values[outputs < min_values]

			# insert sampled pixels into images
			images[:, i + i_off, j + j_off][self.output_mask[i_off, j_off]] = outputs

	return images.reshape(*shape)

def __setstate__(self, state):
	self.__dict__ = state

	if not hasattr(self, 'nonlinearity'):
		self.nonlinearity = 'TanH'
	if not hasattr(self, 'extended'):
		self.extended = False
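A minimal sketch of how sampling might be driven (hypothetical: `model` stands for a previously trained instance of the class above; image count and size are made up):

from numpy import zeros

# four 64x64 grayscale images; the zeros at the top/left boundary act as the
# initial context, and sample() fills in the remaining pixels row by row
images = zeros([4, 64, 64])

samples = model.sample(images)

The optional min_values/max_values arguments can be used to clip sampled pixels to a valid intensity range.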
def __init__(self,
		num_channels=1,
		num_hiddens=10,
		num_components=8,
		num_scales=4,
		num_features=16,
		num_layers=1,
		nb_size=5,
		nonlinearity='TanH',
		verbosity=1,
		extended=False,
		input_mask=None,
		output_mask=None):
	"""
	@type num_channels: C{int}
	@param num_channels: dimensionality of each pixel

	@type num_hiddens: C{int}
	@param num_hiddens: number of LSTM units in each spatial LSTM layer

	@type num_components: C{int}
	@param num_components: number of mixture components used by the MCGSM

	@type num_scales: C{int}
	@param num_scales: number of scales used by the MCGSM

	@type num_features: C{int}
	@param num_features: number of quadratic features used by the MCGSM

	@type num_layers: C{int}
	@param num_layers: number of layers of spatial LSTM units

	@type nb_size: C{int}
	@param nb_size: controls the neighborhood of pixels read from an image

	@type nonlinearity: C{str}
	@param nonlinearity: nonlinearity used by spatial LSTM (e.g., TanH, ReLU)

	@type verbosity: C{int}
	@param verbosity: controls how much information is printed during training, etc.

	@type extended: C{bool}
	@param extended: use previous memory states as additional inputs to LSTM (more parameters)

	@type input_mask: C{ndarray}
	@param input_mask: Boolean mask used to define a custom input neighborhood of pixels

	@type output_mask: C{ndarray}
	@param output_mask: determines the position of the output pixel relative to the neighborhood
	"""

	self.verbosity = verbosity

	self.num_channels = num_channels
	self.num_hiddens = num_hiddens
	self.num_layers = num_layers
	self.nonlinearity = nonlinearity
	self.extended = extended

	self.input_mask, self.output_mask = generate_masks([nb_size] * num_channels)

	if input_mask is not None:
		self.input_mask = input_mask
	if output_mask is not None:
		self.output_mask = output_mask
		self.num_channels = sum(self.output_mask)

	self.slstm = [None] * num_layers
	self.mcgsm = MCGSM(
		dim_in=self.num_hiddens,
		dim_out=self.num_channels,
		num_components=num_components,
		num_scales=num_scales,
		num_features=num_features)

	self.preconditioner = None
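For orientation, a hypothetical construction call; the name `RIDE` is assumed for the surrounding class, which is not shown in this excerpt:

# two-layer spatial LSTM with 64 hidden units per layer, feeding an MCGSM
# with 8 components; input/output masks are generated from the 5x5
# neighborhood size, but custom masks could be passed instead
model = RIDE(
	num_channels=1,
	num_hiddens=64,
	num_components=8,
	num_layers=2,
	nb_size=5)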
def main(argv):
	experiment = Experiment()

	parser = ArgumentParser(argv[0], description=__doc__)
	parser.add_argument('--data', '-d', type=str, default='data/vanhateren_deq2_train.mat')
	parser.add_argument('--num_data', '-N', type=int, default=1000000)
	parser.add_argument('--num_valid', '-V', type=int, default=200000)
	parser.add_argument('--input_size', '-i', type=int, default=9)
	parser.add_argument('--max_iter', '-I', type=int, default=3000)
	parser.add_argument('--num_components', '-c', type=int, default=128)
	parser.add_argument('--num_features', '-f', type=int, default=48)
	parser.add_argument('--num_scales', '-s', type=int, default=4)
	parser.add_argument('--verbosity', '-v', type=int, default=1)
	parser.add_argument('--output', '-o', type=str, default='results/vanhateren_deq2/mcgsm.{0}.{1}.xpck')

	args = parser.parse_args(argv[1:])

	### DATA HANDLING

	if args.verbosity > 0:
		print 'Loading data...'

	# load data
	images = loadmat(args.data)['data']

	# define causal neighborhood
	input_mask, output_mask = generate_masks(input_size=args.input_size, output_size=1)

	# extract causal neighborhoods
	num_samples = int((args.num_data + args.num_valid) / images.shape[0] + .9)

	def extract(image):
		return generate_data_from_image(
			image, input_mask, output_mask, num_samples)

	inputs, outputs = zip(*mapp(extract, images))
	inputs, outputs = hstack(inputs), hstack(outputs)

	inputs_train = inputs[:, :args.num_data]
	outputs_train = outputs[:, :args.num_data]
	inputs_valid = inputs[:, args.num_data:]
	outputs_valid = outputs[:, args.num_data:]

	if inputs_valid.size < 100:
		print 'Not enough data for validation.'
		inputs_valid = None
		outputs_valid = None

	### MODEL TRAINING

	if args.verbosity > 0:
		print 'Preconditioning...'

	preconditioner = WhiteningPreconditioner(inputs_train, outputs_train)

	inputs_train, outputs_train = preconditioner(inputs_train, outputs_train)
	if inputs_valid is not None:
		inputs_valid, outputs_valid = preconditioner(inputs_valid, outputs_valid)

	# free memory
	del inputs
	del outputs

	if args.verbosity > 0:
		print 'Training model...'

	model = MCGSM(
		dim_in=inputs_train.shape[0],
		dim_out=outputs_train.shape[0],
		num_components=args.num_components,
		num_features=args.num_features,
		num_scales=args.num_scales)

	def callback(i, mcgsm):
		experiment['args'] = args
		experiment['model'] = mcgsm
		experiment['preconditioner'] = preconditioner
		experiment['input_mask'] = input_mask
		experiment['output_mask'] = output_mask
		experiment.save(args.output)

	model.train(
		inputs_train, outputs_train,
		inputs_valid, outputs_valid,
		parameters={
			'verbosity': args.verbosity,
			'cb_iter': 500,
			'callback': callback,
			'max_iter': args.max_iter})

	### SAVE RESULTS

	experiment['args'] = args
	experiment['model'] = model
	experiment['preconditioner'] = preconditioner
	experiment['input_mask'] = input_mask
	experiment['output_mask'] = output_mask
	experiment.save(args.output)

	return 0
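The script presumably ends with a standard entry point like the following (not part of this excerpt). Note that main() expects the full argument vector, since it passes argv[0] to ArgumentParser and argv[1:] to parse_args:

import sys

if __name__ == '__main__':
	sys.exit(main(sys.argv))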
parser.add_argument('--repetitions', '-r', type=int, default=2)

args = parser.parse_args(sys.argv[1:])

###

print socket.gethostname()
print datetime.now()
print args
print

###

data = randn(args.dim_in, args.num_data), randn(args.dim_out, args.num_data)

model = MCGSM(
	dim_in=args.dim_in,
	dim_out=args.dim_out,
	num_components=12,
	num_features=40,
	num_scales=6)

###

print 'model.loglikelihood'

t = time()
for r in range(args.repetitions):
	model.loglikelihood(*data)
print '{0:12.8f} seconds'.format((time() - t) / float(args.repetitions))
print

###

print 'model._check_performance'

for batch_size in [1000, 2000, 5000]:
	t = model._check_performance(*data,