Example #1
def explore_MN(burnin_steps=2, test_steps=2):

    M_arr = []
    N_arr = []
    N = 100
    #N = 50
    for M in np.linspace(1, 1e6, 5):
        #for M in np.linspace(1, 1e3, 4):
        M_arr.append(int(M))
        N_arr.append(int(N))
    M = 1e6
    #M = 1e3
    for N in np.linspace(1, 200, 5):
        #for N in np.linspace(1,50,4):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []

    for ii in range(len(M_arr)):
        M = M_arr[ii]
        N = N_arr[ii]

        print "case %d of %d, M=%g, N=%g" % (ii + 1, len(M_arr), M, N)

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df,
                        model.theta_init,
                        model.subfunction_references,
                        display=1)
        # burn in the optimizer, to make sure the subspace has eg. reached its full size
        optimizer.optimize(num_passes=burnin_steps)

        # time spent in optimizer during burning
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)
        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)
        actual_test_steps = float(steps1 - steps0) / float(N)
        T_arr.append(t_diff / actual_test_steps)
        print T_arr[-1]

    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
def explore_MN(burnin_steps=2, test_steps=2):

    M_arr = []
    N_arr = []
    N = 100
    #N = 50
    for M in np.linspace(1, 1e6, 5):
    #for M in np.linspace(1, 1e3, 4):
        M_arr.append(int(M))
        N_arr.append(int(N))
    M = 1e6
    #M = 1e3
    for N in np.linspace(1,200,5):
    #for N in np.linspace(1,50,4):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []

    for ii in range(len(M_arr)):
        M = M_arr[ii]
        N = N_arr[ii]

        print "case %d of %d, M=%g, N=%g"%(ii+1, len(M_arr), M, N)

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df, model.theta_init, model.subfunction_references, display=1)
        # burn in the optimizer, to make sure the subspace has eg. reached its full size
        optimizer.optimize(num_passes=burnin_steps)

        # time spent in optimizer during burning
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)
        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)
        actual_test_steps = float(steps1 - steps0)/float(N)
        T_arr.append(t_diff/actual_test_steps)
        print T_arr[-1]
        
    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
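# Usage sketch (added for illustration, not part of the original example): run the
# sweep above and plot the measured per-pass optimizer overhead against M and N.
# Assumes the same environment as explore_MN (numpy as np, the models module, SFO).
import matplotlib.pyplot as plt

M_arr, N_arr, T_arr = explore_MN(burnin_steps=2, test_steps=2)

# the first half of the sweep varies M at fixed N=100,
# the second half varies N at fixed M=1e6
half = len(M_arr) // 2

plt.subplot(1, 2, 1)
plt.plot(M_arr[:half], T_arr[:half], 'o-')
plt.xlabel('M (number of dimensions)')
plt.ylabel('optimizer overhead per pass (s)')

plt.subplot(1, 2, 2)
plt.plot(N_arr[half:], T_arr[half:], 'o-')
plt.xlabel('N (number of subfunctions)')
plt.show()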
Example #3
def main(shape, spacing, origin, nbl, space_order, xs, xr, tn, f0, npasses,
         batch_size, **kwargs):

    # Get true model
    true_model = get_true_model(shape, spacing, origin, nbl, space_order)

    # Get smooth model
    smooth_model = get_smooth_model(shape, spacing, origin, nbl, space_order)

    # Compute initial born perturbation from m - m0
    dm = (true_model.vp.data**(-2) - smooth_model.vp.data**(-2))

    # Geometry
    nsrc = xs.shape[0]
    nrec = xr.shape[0]
    geometry0 = set_geometry(smooth_model, nsrc, nrec, f0, tn, t0=0)

    # Compute observed data in parallel (inverse crime).
    # In real life we would read the SEG-Y data here.
    futures = []
    for i in range(geometry0.nsrc):
        args = [dm, i, smooth_model, geometry0, space_order]
        futures.append(forward_modeling.remote(*args))
    dobs = np.zeros((geometry0.nt * geometry0.nrec, geometry0.nsrc),
                    dtype=np.float32)
    for i in range(geometry0.nsrc):
        dobs[:, i] = ray.get(futures[i])

    # List containing an identifying element for each subfunction
    sub_refs = set_subreferences(dobs, geometry0, batch_size)

    # Initial guess
    theta_init = np.zeros(smooth_model.shape, dtype=np.float32)

    # # initialize the optimizer
    optimizer = SFO(f_df_multi_shots, theta_init, sub_refs,
                    [geometry0, smooth_model, space_order])

    # # run the optimizer for npasses pass through the data
    theta = optimizer.optimize(num_passes=npasses)

    # Write inverted reflectivity to disk
    file = open('output/dvel-final.bin', "wb")
    scopy = theta.reshape(smooth_model.shape).astype(
        np.float32).copy(order='C')
    file.write(scopy)

    # Create a plot with the minibatch function values
    plt.plot(np.array(optimizer.hist_f_flat))
    plt.xlabel('Iteration')
    plt.ylabel('Minibatch Function Value')
    plt.title('Convergence Trace')
    plt.savefig('output/history_sfo.png')
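# Illustration (not part of the original example): main() writes the inverted
# reflectivity to output/dvel-final.bin as raw float32; the sketch below reads it
# back and saves an image. The shape value is a placeholder for the model shape
# passed to main(), assuming smooth_model.shape matches it.
import numpy as np
import matplotlib.pyplot as plt

shape = (101, 101)  # placeholder: the model shape passed to main()

dvel = np.fromfile('output/dvel-final.bin', dtype=np.float32).reshape(shape)

plt.figure()
plt.imshow(dvel.T, cmap='gray', aspect='auto')
plt.title('Inverted reflectivity')
plt.colorbar()
plt.savefig('output/dvel-final.png')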
sample_T=theano.function([muW0T, muW1T, muW2T, mub0T, mub1T, mub2T, 
					covW0T, covW1T, covW2T, covb0T, covb1T, covb2T],
					samplesT,
					allow_input_downcast=True)

def sample(params):
	out = sample_T(params[0],params[1],params[2],params[3],params[4],params[5],
						params[6],params[7],params[8],params[9],params[10],params[11])
	return out

# Creating the optimizer

optimizer = SFO(f_df, init_params, subfuncs)

# Running the optimization

init_loss = f_df(init_params,subfuncs[0])[0]
print init_loss

keyin=''
while keyin!='y':
	opt_params = optimizer.optimize(num_passes=24*4)
	end_loss = f_df(opt_params,subfuncs[0])[0]
	print 'Current loss: ', end_loss
	W=opt_params[0]
	pp.scatter(W[0,:],W[1,:]); pp.show()
	keyin=raw_input('End optimization? (y)')

samples=sample(opt_params)
pp.scatter(samples[:,0],samples[:,1]); pp.show()
Example #5
def optim_vae_sfo(model,
                  x,
                  v_init,
                  w_init,
                  n_batch,
                  n_passes,
                  hook,
                  n_resample=20,
                  resample_keepmem=False,
                  bernoulli_x=False,
                  display=0):

    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # create minibatches
    n_tot = x.itervalues().next().shape[1]
    minibatches = []
    n_minibatches = n_tot / n_batch
    if (n_tot % n_batch) != 0: raise Exception()

    # Divide into minibatches
    def make_minibatch(i):
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x: _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    for i in range(n_minibatches):
        minibatches.append(make_minibatch(i))

    L = [0.]
    n_L = [0]

    def f_df(w, minibatch):

        i_minibatch = minibatch[0]
        x_minibatch = minibatch[1]
        eps_minibatch = minibatch[2]

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch,
                                                  eps_minibatch)

        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + float(n_batch) / n_tot * gv_prior[i] for i in gv}
        gw = {i: gw[i] + float(n_batch) / n_tot * gw_prior[i] for i in gw}

        f = (logpx.sum() + logpz.sum() - logqz.sum())
        L[0] += -f / (1. * n_batch)
        n_L[0] += 1
        f += float(n_batch) / n_tot * logpv
        f += float(n_batch) / n_tot * logpw

        for i in gv:
            gv[i] *= -1. / n_batch
        for i in gw:
            gw[i] *= -1. / n_batch
        f *= -1. / n_batch

        #print 'norms gv:'
        #ndict.pNorm(gv)
        #print 'norms gw'
        #ndict.pNorm(gw)

        return f, {'v': gv, 'w': gw}

    w_init = {'v': v_init, 'w': w_init}

    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)

    #optimizer.check_grad()

    # loop
    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0] / (1. * n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0
        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i % n_resample == j % n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem,
                                              minibatches[j])

    print "Finished!"
Example #6
    def train(self,
              images,
              batch_size=50,
              num_epochs=20,
              method='SGD',
              train_means=False,
              train_top_layer=False,
              momentum=0.9,
              learning_rate=1.,
              decay1=0.9,
              decay2=0.999,
              precondition=True):
        """
		@type  images: C{ndarray}/C{list}
		@param images: an array or a list of images
		"""

        print 'Preprocessing...'

        inputs, outputs = self._preprocess(images)

        if precondition:
            print 'Preconditioning...'

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # indicates which layers will be trained
        train_layers = [self.num_layers -
                        1] if train_top_layer else range(self.num_layers)

        print 'Creating SLSTMs...'

        # create SLSTMs
        for l in range(self.num_layers):
            self.slstm[l] = SLSTM(
                num_rows=inputs.shape[1],
                num_cols=inputs.shape[2],
                num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
                num_hiddens=self.num_hiddens,
                batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
                nonlinearity=self.nonlinearity,
                extended=self.extended,
                slstm=self.slstm[l],
                verbosity=self.verbosity)

        # compute loss function and its gradient
        def f_df(params, idx):
            # set model parameters
            for l in train_layers:
                self.slstm[l].set_parameters(params['slstm'][l])
            self.mcgsm._set_parameters(params['mcgsm'],
                                       {'train_means': train_means})

            # select batch and compute hidden activations
            Y = outputs[idx:idx + batch_size]
            H = inputs[idx:idx + batch_size]

            for l in range(self.num_layers):
                H = self.slstm[l].forward(H)

            # form inputs to MCGSM
            H_flat = H.reshape(-1, self.num_hiddens).T
            Y_flat = Y.reshape(-1, self.num_channels).T

            norm_const = -H_flat.shape[1]

            # compute gradients
            df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
            df_dh = df_dh.T.reshape(*H.shape) / norm_const

            # ignore bottom-right pixel (BSDS300)
            df_dh[:, -1, -1] = 0.

            # average negative log-likelihood
            f = sum(loglik) / norm_const

            df_dtheta = {}
            df_dtheta['slstm'] = [0.] * self.num_layers

            for l in range(self.num_layers)[::-1]:
                if l not in train_layers:
                    break
                if l > min(train_layers):
                    # derivative with respect to inputs of layer l are derivatives
                    # of hidden states of layer l - 1
                    df_dtheta['slstm'][l] = self.slstm[l].backward(
                        df_dh, force_backward=True)
                    df_dh = df_dtheta['slstm'][l]['inputs']
                    del df_dtheta['slstm'][l]['inputs']

                else:
                    # no need to compute derivatives with respect to input units
                    df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

            # compute gradient of MCGSM
            df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(
                H_flat, Y_flat, parameters={'train_means': train_means
                                            }) * log(2.) * self.mcgsm.dim_out

            return f, df_dtheta

        # collect current parameters
        params = {}
        params['slstm'] = [0.] * self.num_layers
        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            params['slstm'][l] = self.slstm[l].parameters()
        params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

        # a start index for each batch
        start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size)

        print 'Training...'

        if method.upper() == 'SFO':
            try:
                # optimize using sum-of-functions optimizer
                optimizer = SFO(f_df,
                                params,
                                start_indices,
                                display=self.verbosity)
                params_opt = optimizer.optimize(num_passes=num_epochs)

                # set model parameters
                for l in range(self.num_layers):
                    self.slstm[l].set_parameters(params_opt['slstm'][l])
                self.mcgsm._set_parameters(params_opt['mcgsm'],
                                           {'train_means': train_means})

            except KeyboardInterrupt:
                pass

            return optimizer.hist_f_flat

        elif method.upper() == 'SGD':
            loss = []
            diff = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }

            for l in train_layers:
                diff['slstm'][l] = {}
                for key in params['slstm'][l]:
                    diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1,
                               batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f)

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params['slstm'][l]:
                            diff['slstm'][l][key] = momentum * diff['slstm'][
                                l][key] - df['slstm'][l][key]
                            params['slstm'][l][key] = params['slstm'][l][
                                key] + learning_rate * diff['slstm'][l][key]

                    # update MCGSM parameters
                    diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
                    params['mcgsm'] = params[
                        'mcgsm'] + learning_rate * diff['mcgsm']

                    if self.verbosity > 0:
                        print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                            n, loss[-1],
                            mean(loss[-max([10, 20000 // batch_size]):]))

            return loss

        elif method.upper() == 'ADAM':
            loss = []
            diff_mean = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }
            diff_sqrd = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }

            for l in train_layers:
                diff_mean['slstm'][l] = {}
                diff_sqrd['slstm'][l] = {}
                for key in params['slstm'][l]:
                    diff_mean['slstm'][l][key] = zeros_like(
                        params['slstm'][l][key])
                    diff_sqrd['slstm'][l][key] = zeros_like(
                        params['slstm'][l][key])

            # step counter
            t = 1

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1,
                               batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f)

                    # include bias correction in step width
                    step_width = learning_rate / (
                        1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
                    t += 1

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params['slstm'][l]:
                            diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
                             + (1. - decay1) * df['slstm'][l][key]
                            diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
                             + (1. - decay2) * square(df['slstm'][l][key])

                            params['slstm'][l][key] = params['slstm'][l][key] - \
                             step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

                    # update MCGSM parameters
                    diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (
                        1. - decay1) * df['mcgsm']
                    diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (
                        1. - decay2) * square(df['mcgsm'])
                    params['mcgsm'] = params['mcgsm'] - \
                     step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

                    if self.verbosity > 0:
                        print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                            n, loss[-1],
                            mean(loss[-max([10, 20000 // batch_size]):]))

            return loss

        else:
            raise ValueError('Unknown method \'{0}\'.'.format(method))
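# Hypothetical usage of the train() method above (added for illustration; `model`
# and `images` are placeholders, since the class defining train() is not shown).
# With method='SFO' the method returns optimizer.hist_f_flat, which can be plotted
# like the other convergence traces on this page.
import matplotlib.pyplot as plt

hist = model.train(images, batch_size=50, num_epochs=20, method='SFO')

plt.plot(hist)
plt.xlabel('Iteration')
plt.ylabel('Minibatch Function Value')
plt.title('Convergence Trace')
plt.show()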
M = 20  # number visible units
J = 10  # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.)  # number minibatches
# generate random training data
v = randn(M, D)

# create the array of subfunction specific arguments
sub_refs = []
for i in range(N):
    # extract a single minibatch of training data.
    sub_refs.append(v[:, i::N])

# initialize parameters
theta_init = {'W': randn(J, M), 'b_h': randn(J, 1), 'b_v': randn(M, 1)}
# initialize the optimizer
optimizer = SFO(f_df, theta_init, sub_refs)
# # uncomment the following line to test the gradient of f_df
# optimizer.check_grad()
# run the optimizer for 1 pass through the data
theta = optimizer.optimize(num_passes=1)
# continue running the optimizer for another 20 passes through the data
theta = optimizer.optimize(num_passes=20)

# plot the convergence trace
plt.plot(np.array(optimizer.hist_f_flat))
plt.xlabel('Iteration')
plt.ylabel('Minibatch Function Value')
plt.title('Convergence Trace')
plt.show()
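# The snippet above is the canonical SFO usage pattern but leaves f_df undefined.
# Minimal sketch (my own illustration, not part of the original example): a
# tied-weight linear autoencoder with the same {'W', 'b_h', 'b_v'} parameter
# structure. SFO expects f_df(theta, minibatch) to return the objective value and
# a gradient with the same structure as theta.
import numpy as np

def f_df(theta, v_minibatch):
    W, b_h, b_v = theta['W'], theta['b_h'], theta['b_v']
    K = v_minibatch.shape[1]

    # encode, decode with tied weights, mean squared reconstruction error
    H = np.dot(W, v_minibatch) + b_h      # hidden activations, shape (J, K)
    R = np.dot(W.T, H) + b_v              # reconstruction, shape (M, K)
    G = (R - v_minibatch) / K             # d f / d R
    f = 0.5 * np.sum((R - v_minibatch) ** 2) / K

    # gradients mirror the structure of theta_init
    dW = np.dot(np.dot(W, G), v_minibatch.T) + np.dot(H, G.T)
    db_h = np.dot(W, G).sum(axis=1, keepdims=True)
    db_v = G.sum(axis=1, keepdims=True)
    return f, {'W': dW, 'b_h': db_h, 'b_v': db_v}

# a finite-difference check of this gradient is available via the
# optimizer.check_grad() call commented out above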
	#pp.hist(np.sqrt(np.sum(samples[-1]**2,axis=1)),50,normed=True,color='r')
	#pp.figure(8)
	#pp.suptitle(r'Learned $\beta$ Schedule')
	#pp.axes(xlabel='t', ylabel=r'$\beta$')
	#pp.plot(np.arange(nsteps),(1.0/(1.0+np.exp(-opt_params[-1])))*beta_max)
	pp.show()

exit()

if automate_training:
	optimizer = SFO(f_df, init_params, subfuncs)
	end_loss=99.0
	while end_loss>-2.50:
		linalgerror=False
		try:
			opt_params = optimizer.optimize(num_passes=2)
			end_loss = f_df(opt_params,fdata)[0]
		except np.linalg.linalg.LinAlgError:
			linalgerror=True
		
		if np.isnan(end_loss) or linalgerror:
			mu_centers=(np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
			mu_spreads=(np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
			mu_biases=np.zeros(nhid_mu).astype(np.float32)
			mu_M=(np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
			mu_b=np.zeros((ntgates, nx)).astype(np.float32)
			cov_centers=(np.random.randn(nx, nhid_cov)*1.0).astype(np.float32)
			cov_spreads=(np.zeros((nx, nhid_cov))-1.0).astype(np.float32)
			cov_biases=np.zeros(nhid_cov).astype(np.float32)
			cov_M=(np.random.randn(nhid_cov, ntgates*nx)*0.01).astype(np.float32)
			cov_b=np.zeros(ntgates).astype(np.float32)
M = 20  # number visible units
J = 10  # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.0)  # number minibatches
# generate random training data
v = randn(M, D)

# create the array of subfunction specific arguments
sub_refs = []
for i in range(N):
    # extract a single minibatch of training data.
    sub_refs.append(v[:, i::N])

# initialize parameters
theta_init = {"W": randn(J, M), "b_h": randn(J, 1), "b_v": randn(M, 1)}
# initialize the optimizer
optimizer = SFO(f_df, theta_init, sub_refs)
# # uncomment the following line to test the gradient of f_df
# optimizer.check_grad()
# run the optimizer for 1 pass through the data
theta = optimizer.optimize(num_passes=1)
# continue running the optimizer for another 20 passes through the data
theta = optimizer.optimize(num_passes=20)

# plot the convergence trace
plt.plot(np.array(optimizer.hist_f_flat))
plt.xlabel("Iteration")
plt.ylabel("Minibatch Function Value")
plt.title("Convergence Trace")
plt.show()
Example #10
    def fit(self, train_X, optimizer, param_init = None, sample_every=None):
		self.opt = optimizer
		n_train, n_vis = train_X.shape
		batch_size = self.batch_size

		if sample_every == None:
			sample_every = 10000000

		#theano.config.profile = True
		#theano.config.exception_verbosity='high'

		assert(n_vis == self.nv)

		train_X = self.shared_dataset(train_X)
		n_batches = np.ceil(n_train / float(batch_size)).astype('int')

		# theano variables for managing data (index minibatches, n examples in batch)
		index, n_ex = T.iscalars('batch_index', 'n_ex')
		batch_start = index*batch_size
		batch_stop = T.minimum(n_ex, (index + 1)*batch_size)
		effective_batch_size = batch_stop - batch_start

		# theano variables for learning
		lr = T.scalar('lr', dtype=theano.config.floatX)
		mom = T.scalar('mom', dtype=theano.config.floatX)

		if self.k == 1:
			# this one is for scaning over a batch and getting connectivity for each example
			# return grads too because T.grads through scan is awful
			# takes ~3x longer, but can experiment connectivity
			#K, grads = self.mpf.rbm_K2G(self.X, effective_batch_size)

			# this tiles out the minibatch matrix into a 3D tensor to compute connectivity
			#K, offs, y, y1, z= self.mpf.rbm_K(self.X, effective_batch_size)
			K = self.mpf.rbm_K(self.X, effective_batch_size)

		elif self.k == 2:
			if DEBUG:
				return_values = self.mpf.debug_rbm_K_2wise(self.X, effective_batch_size)	
				K = return_values[-1]
			else:
				K = self.mpf.rbm_K_2wise(self.X, effective_batch_size)
		else:
			raise NotImplementedError()

		reg = self.L1_reg * self.mpf.L1 + self.L2_reg * self.mpf.L2
		reg_grad = T.grad(reg, self.mpf.theta)

		# if not scan (tile out matrix into tensor)
		cost = K + reg
		grads = T.grad(cost, self.mpf.theta)

		# otherwise
		#grads = grads + reg_grad

		if param_init == None:
			self.mpf.theta.set_value(random_theta(D, DH, k=self.k))
		else:
			self.mpf.theta.set_value(np.asarray(np.concatenate(param_init), dtype=theano.config.floatX))

		if optimizer == 'sgd':
			updates = []
			theta = self.mpf.theta
			theta_update = self.mpf.theta_update

			upd = mom * theta_update - lr * grads
			updates.append((theta_update, upd))
			updates.append((theta, theta + upd))

			print 'compiling theano function'
			if DEBUG:
				return_values = list(return_values)
				return_values.append(cost)
				return_values.append(grads)
				train_model = theano.function(inputs=[index, n_ex, lr, mom], outputs=return_values, updates=updates, givens={self.X: train_X[batch_start:batch_stop]})
			else:
				train_model = theano.function(inputs=[index, n_ex, lr, mom], outputs=cost, updates=updates, givens={self.X: train_X[batch_start:batch_stop]})

			self.current_epoch = 0
			start = time.time()
			learning_rate_init = self.learning_rate
			while self.current_epoch < self.n_epochs:
				print 'epoch:', self.current_epoch
				self.current_epoch += 1
				effective_mom = self.final_momentum if self.current_epoch > self.momentum_switchover else self.initial_momentum

				avg_epoch_cost = 0
				last_debug = None
				for minibatch_idx in xrange(n_batches):
					avg_cost = train_model(minibatch_idx, n_train, self.learning_rate, effective_mom)
					#print '\t\t', np.isnan(gr).sum(), np.isnan(yy).sum(), np.isnan(yy1).sum(), np.isnan(zz).sum()
					if DEBUG:
						return_values, avg_cost, gradients = avg_cost[:-2], avg_cost[-2], avg_cost[-1]
						print_debug(return_values, last_debug)
						last_debug = return_values
					avg_epoch_cost += avg_cost
					#print '\t', minibatch_idx, avg_cost
				print '\t avg epoch cost:', avg_epoch_cost/n_batches
				self.learning_rate *= self.learning_rate_decay

				theta_fit = split_theta(self.mpf.theta.get_value(), self.mpf.n_visible, self.mpf.n_hidden, k=self.mpf.k)
				if (self.current_epoch % sample_every == 0):
					sample_and_save(theta_fit, self.mpf.n_hidden, self.current_epoch, learning_rate_init, self.mpf.k, self.opt)

			theta_opt = self.mpf.theta.get_value()
			end = time.time()

		elif optimizer == 'cg' or optimizer == 'bfgs':
			print "compiling theano functions"
			get_batch_size = theano.function([index, n_ex], effective_batch_size, name='get_batch_size')
			batch_cost_grads = theano.function([index, n_ex], [cost, grads], givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_cost = theano.function([index, n_ex], cost, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_grads = theano.function([index, n_ex], grads, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')


			def train_fn_cost_grads(theta_value):
				print 'nbatches', n_batches

				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
				train_losses_grads = [batch_cost_grads(i, n_train) for i in xrange(n_batches)]

				train_losses = [i[0] for i in train_losses_grads]
				train_grads = [i[1] for i in train_losses_grads]

				train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]

				print len(train_losses), len(train_grads)
				print train_losses[0].shape, train_grads[0].shape
				returns = np.average(train_losses, weights=train_batch_sizes), np.average(train_grads, weights=train_batch_sizes, axis=0)
				return returns


			def train_fn_cost(theta_value):
				print 'nbatches', n_batches

				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
				train_costs = [batch_cost(i, n_train) for i in xrange(n_batches)]
				train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]

				return np.average(train_costs, weights=train_batch_sizes)

			def train_fn_grads(theta_value):
				print 'nbatches', n_batches

				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
				train_grads = [batch_grads(i, n_train) for i in xrange(n_batches)]
				train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]

				return np.average(train_grads, weights=train_batch_sizes, axis=0)


			###############
			# TRAIN MODEL #
			###############
			def my_callback():
				print 'wtf'

			from scipy.optimize import minimize
			from scipy.optimize import fmin_bfgs, fmin_l_bfgs_b
			if optimizer == 'cg':
				pass
			elif optimizer == 'bfgs':
				print 'using bfgs'
				#theta_opt, f_theta_opt, info = fmin_l_bfgs_b(train_fn, self.mpf.theta.get_value(), iprint=1, maxfun=self.n_epochs)
				start = time.time()
				disp = True
				print 'ready to minimize'
				#result_obj = minimize(train_fn, self.mpf.theta.get_value(), jac=True, method='BFGS', options={'maxiter':self.n_epochs, 'disp':disp}, callback=my_callback())
				#theta_opt = fmin_bfgs(f=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs)
				theta_opt, fff, ddd = fmin_l_bfgs_b(func=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs)
				print 'done minimize ya right'
				end = time.time()

		elif optimizer == 'sof':
			print "compiling theano functions"
			batch_cost_grads = theano.function([index, n_ex], [cost, grads], givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_cost = theano.function([index, n_ex], cost, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_grads = theano.function([index, n_ex], grads, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')


			def train_fn(theta_value, i):
				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)

				train_losses, train_grads = batch_cost_grads(i, n_train)
				
				return train_losses, train_grads

			###############
			# TRAIN MODEL #
			###############
			if param_init == None:
				self.mpf.theta.set_value(random_theta(D, DH))
			else:
				w0, bh0, bv0 = param_init
				self.mpf.theta.set_value(np.asarray(np.concatenate((w0, bh0, bv0)), dtype=theano.config.floatX))


			print 'using sof'
			sys.path.append('/export/mlrg/ebuchman/Programming/Sum-of-Functions-Optimizer')
			from sfo import SFO
			print 'n batches', n_batches
			print 'n epochs' , self.n_epochs
			optimizer = SFO(train_fn, self.mpf.theta.get_value(), np.arange(n_batches))
			start = time.time()
			theta_opt = optimizer.optimize(num_passes = self.n_epochs)
			end = time.time()

		
		self.mpf.theta.set_value(theta_opt.astype(theano.config.floatX), borrow=True)
		return end-start
					samplesT,
					allow_input_downcast=True)

def sample(params):
	out = sample_T(params[0],params[1],params[2],params[3],params[4],params[5],
						params[6],params[7],params[8],params[9])
	return out


if automate_training:
	optimizer = SFO(f_df, init_params, subfuncs)
	end_loss=99.0
	while end_loss>-2.50:
		linalgerror=False
		try:
			opt_params = optimizer.optimize(num_passes=2)
			end_loss = f_df(opt_params,fdata)[0]
		except np.linalg.linalg.LinAlgError:
			linalgerror=True
		
		if np.isnan(end_loss) or linalgerror:
			mu_centers=(np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
			mu_spreads=(np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
			mu_biases=np.zeros(nhid_mu).astype(np.float32)
			mu_M=(np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
			mu_b=np.zeros((ntgates, nx)).astype(np.float32)
			cov_centers=(np.random.randn(nx, nhid_cov)*1.0).astype(np.float32)
			cov_spreads=(np.zeros((nx, nhid_cov))-1.0).astype(np.float32)
			cov_biases=np.zeros(nhid_cov).astype(np.float32)
			cov_M=(np.random.randn(nhid_cov, ntgates*nx)*0.01).astype(np.float32)
			cov_b=np.zeros(ntgates).astype(np.float32)
Example #12
def optim_vae_sfo(model, x, v_init, w_init, n_batch, n_passes, hook, n_resample=20, resample_keepmem=False, bernoulli_x=False, display=0):
    
    # Shuffle columns of dataset x
    ndict.shuffleCols(x)
    
    # create minibatches
    n_tot = x.itervalues().next().shape[1]
    minibatches = []
    n_minibatches = n_tot / n_batch
    if (n_tot%n_batch) != 0: raise Exception()
    
    # Divide into minibatches
    def make_minibatch(i):
        _x = ndict.getCols(x, i * n_batch, (i+1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x: _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    for i in range(n_minibatches):
        minibatches.append(make_minibatch(i))
      
    L = [0.]
    n_L = [0]
    
    def f_df(w, minibatch):
        
        i_minibatch = minibatch[0]
        x_minibatch = minibatch[1]
        eps_minibatch = minibatch[2]
        
        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch, eps_minibatch)
        
        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + float(n_batch)/n_tot * gv_prior[i] for i in gv}
        gw = {i: gw[i] + float(n_batch)/n_tot * gw_prior[i] for i in gw}
        
        f = (logpx.sum() + logpz.sum() - logqz.sum())
        L[0] += -f/(1.*n_batch)
        n_L[0] += 1
        f += float(n_batch)/n_tot * logpv
        f += float(n_batch)/n_tot * logpw
        
        for i in gv: gv[i] *= -1./n_batch
        for i in gw: gw[i] *= -1./n_batch
        f *= -1./n_batch
        
        #print 'norms gv:'
        #ndict.pNorm(gv)
        #print 'norms gw'
        #ndict.pNorm(gw)
        
        return f, {'v':gv,'w':gw}
    
    w_init = {'v':v_init, 'w':w_init}
    
    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)
    
    #optimizer.check_grad()
    
    # loop
    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0]/(1.*n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0
        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i%n_resample == j%n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem, minibatches[j])
        
    print "Finished!"
Example #13
	def train(self, images,
			batch_size=50,
			num_epochs=20,
			method='SGD',
			train_means=False,
			train_top_layer=False,
			momentum=0.9,
			learning_rate=1.,
			decay1=0.9,
			decay2=0.999,
			precondition=True):
		"""
		@type  images: C{ndarray}/C{list}
		@param images: an array or a list of images
		"""

		print 'Preprocessing...'

		inputs, outputs = self._preprocess(images)

		if precondition:
			print 'Preconditioning...'

			# remove correlations
			inputs, outputs = self._precondition(inputs, outputs)

		# indicates which layers will be trained
		train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

		print 'Creating SLSTMs...'

		# create SLSTMs
		for l in range(self.num_layers):
			self.slstm[l] = SLSTM(
				num_rows=inputs.shape[1],
				num_cols=inputs.shape[2],
				num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
				num_hiddens=self.num_hiddens,
				batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
				nonlinearity=self.nonlinearity,
				extended=self.extended,
				slstm=self.slstm[l],
				verbosity=self.verbosity)

		# compute loss function and its gradient
		def f_df(params, idx):
			# set model parameters
			for l in train_layers:
				self.slstm[l].set_parameters(params['slstm'][l])
			self.mcgsm._set_parameters(params['mcgsm'], {'train_means': train_means})

			# select batch and compute hidden activations
			Y = outputs[idx:idx + batch_size]
			H = inputs[idx:idx + batch_size]

			for l in range(self.num_layers):
				H = self.slstm[l].forward(H)

			# form inputs to MCGSM
			H_flat = H.reshape(-1, self.num_hiddens).T
			Y_flat = Y.reshape(-1, self.num_channels).T

			norm_const = -H_flat.shape[1]

			# compute gradients
			df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
			df_dh = df_dh.T.reshape(*H.shape) / norm_const

			# ignore bottom-right pixel (BSDS300)
			df_dh[:, -1, -1] = 0.

			# average negative log-likelihood
			f = sum(loglik) / norm_const

			df_dtheta = {}
			df_dtheta['slstm'] = [0.] * self.num_layers

			for l in range(self.num_layers)[::-1]:
				if l not in train_layers:
					break
				if l > min(train_layers):
					# derivative with respect to inputs of layer l are derivatives
					# of hidden states of layer l - 1
					df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh, force_backward=True)
					df_dh = df_dtheta['slstm'][l]['inputs']
					del df_dtheta['slstm'][l]['inputs']

				else:
					# no need to compute derivatives with respect to input units
					df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

			# compute gradient of MCGSM
			df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(H_flat, Y_flat,
				parameters={'train_means': train_means}) * log(2.) * self.mcgsm.dim_out

			return f, df_dtheta

		# collect current parameters
		params = {}
		params['slstm'] = [0.] * self.num_layers
		for l in range(self.num_layers)[::-1]:
			if l not in train_layers:
				break
			params['slstm'][l] = self.slstm[l].parameters()
		params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

		# a start index for each batch
		start_indices = range(
			0, inputs.shape[0] - batch_size + 1, batch_size)

		print 'Training...'

		if method.upper() == 'SFO':
			try:
				# optimize using sum-of-functions optimizer
				optimizer = SFO(f_df, params, start_indices, display=self.verbosity)
				params_opt = optimizer.optimize(num_passes=num_epochs)

				# set model parameters
				for l in range(self.num_layers):
					self.slstm[l].set_parameters(params_opt['slstm'][l])
				self.mcgsm._set_parameters(params_opt['mcgsm'], {'train_means': train_means})

			except KeyboardInterrupt:
				pass

			return optimizer.hist_f_flat

		elif method.upper() == 'SGD':
			loss = []
			diff = {
				'slstm': [0.] * self.num_layers,
				'mcgsm': zeros_like(params['mcgsm'])}

			for l in train_layers:
				diff['slstm'][l] = {}
				for key in params['slstm'][l]:
					diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

			for n in range(num_epochs):
				for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
					# compute gradients
					f, df = f_df(params, b)

					loss.append(f)

					# update SLSTM parameters
					for l in train_layers:
						for key in params['slstm'][l]:
							diff['slstm'][l][key] = momentum * diff['slstm'][l][key] - df['slstm'][l][key]
							params['slstm'][l][key] = params['slstm'][l][key] + learning_rate * diff['slstm'][l][key]

					# update MCGSM parameters
					diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
					params['mcgsm'] = params['mcgsm'] + learning_rate * diff['mcgsm']

					if self.verbosity > 0:
						print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
							n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):]))

			return loss

		elif method.upper() == 'ADAM':
			loss = []
			diff_mean = {
				'slstm': [0.] * self.num_layers,
				'mcgsm': zeros_like(params['mcgsm'])}
			diff_sqrd = {
				'slstm': [0.] * self.num_layers,
				'mcgsm': zeros_like(params['mcgsm'])}

			for l in train_layers:
				diff_mean['slstm'][l] = {}
				diff_sqrd['slstm'][l] = {}
				for key in params['slstm'][l]:
					diff_mean['slstm'][l][key] = zeros_like(params['slstm'][l][key])
					diff_sqrd['slstm'][l][key] = zeros_like(params['slstm'][l][key])

			# step counter
			t = 1

			for n in range(num_epochs):
				for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
					# compute gradients
					f, df = f_df(params, b)

					loss.append(f)

					# include bias correction in step width
					step_width = learning_rate / (1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
					t += 1

					# update SLSTM parameters
					for l in train_layers:
						for key in params['slstm'][l]:
							diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
								+ (1. - decay1) * df['slstm'][l][key]
							diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
								+ (1. - decay2) * square(df['slstm'][l][key])

							params['slstm'][l][key] = params['slstm'][l][key] - \
								step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

					# update MCGSM parameters
					diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (1. - decay1) * df['mcgsm']
					diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (1. - decay2) * square(df['mcgsm'])
					params['mcgsm'] = params['mcgsm'] - \
						step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

					if self.verbosity > 0:
						print '{0:>5} {1:>10.4f} {2:>10.4f}'.format(
							n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):]))

			return loss


		else:
			raise ValueError('Unknown method \'{0}\'.'.format(method))
					samplesT,
					allow_input_downcast=True)

def sample(params):
	out = sample_T(params[0],params[1],params[2],params[3],params[4],params[5],
						params[6],params[7],params[8],params[9],params[10],params[11],params[12],params[13])
	return out


if automate_training:
	optimizer = SFO(f_df, init_params, subfuncs)
	end_loss=99.0
	while end_loss>-2.50:
		linalgerror=False
		try:
			opt_params = optimizer.optimize(num_passes=2)
			end_loss = f_df(opt_params,fdata)[0]
		except np.linalg.linalg.LinAlgError:
			linalgerror=True
		
		if np.isnan(end_loss) or linalgerror:
			mu_centers=(np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
			mu_spreads=(np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
			mu_biases=np.zeros(nhid_mu).astype(np.float32)
			mu_M=(np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
			mu_b=np.zeros((ntgates, nx)).astype(np.float32)
			cov_centers=(np.random.randn(nx, nhid_cov)*1.0).astype(np.float32)
			cov_spreads=(np.zeros((nx, nhid_cov))-1.0).astype(np.float32)
			cov_biases=np.zeros(nhid_cov).astype(np.float32)
			cov_M=(np.random.randn(nhid_cov, ntgates*nx)*0.01).astype(np.float32)
			cov_b=np.zeros(ntgates).astype(np.float32)
Example #15
    def train(
        self,
        images,
        batch_size=50,
        num_epochs=20,
        method="SGD",
        train_means=False,
        train_top_layer=False,
        momentum=0.9,
        learning_rate=1.0,
        decay1=0.9,
        decay2=0.999,
        precondition=True,
    ):
        """
		Train model via stochastic gradient descent (SGD) or sum-of-functions optimizer (SFO).

		@type  images: C{ndarray}/C{list}
		@param images: an array or a list of training images (e.g., Nx32x32x3)

		@type  batch_size: C{int}
		@param batch_size: batch size used by SGD

		@type  num_epochs: C{int}
		@param num_epochs: number of passes through the training set

		@type  method: C{str}
		@param method: either 'SGD', 'SFO', or 'ADAM'

		@type  train_means: C{bool}
		@param train_means: whether or not to optimize the mean parameters of the MCGSM

		@type  train_top_layer: C{bool}
		@param train_top_layer: if true, only the MCGSM and spatial LSTM at the top layer is trained

		@type  momentum: C{float}
		@param momentum: momentum rate used by SGD

		@type  learning_rate: C{float}
		@param learning_rate: learning rate used by SGD

		@type  decay1: C{float}
		@param decay1: hyperparameter used by ADAM

		@type  decay2: C{float}
		@param decay2: hyperparameter used by ADAM

		@type  precondition: C{bool}
		@param precondition: whether or not to perform conditional whitening

		@rtype: C{list}
		@return: evolution of negative log-likelihood (bits per pixel) over the training
		"""

        if images.shape[1] < self.input_mask.shape[0] or images.shape[2] < self.input_mask.shape[1]:
            raise ValueError("Images too small.")

        if self.verbosity > 0:
            print "Preprocessing..."

        inputs, outputs = self._preprocess(images)

        if precondition:
            if self.verbosity > 0:
                print "Preconditioning..."

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # indicates which layers will be trained
        train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

        if self.verbosity > 0:
            print "Creating SLSTMs..."

        # create SLSTMs
        for l in range(self.num_layers):
            self.slstm[l] = SLSTM(
                num_rows=inputs.shape[1],
                num_cols=inputs.shape[2],
                num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
                num_hiddens=self.num_hiddens,
                batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
                nonlinearity=self.nonlinearity,
                extended=self.extended,
                slstm=self.slstm[l],
                verbosity=self.verbosity,
            )

        # compute loss function and its gradient
        def f_df(params, idx):
            # set model parameters
            for l in train_layers:
                self.slstm[l].set_parameters(params["slstm"][l])
            self.mcgsm._set_parameters(params["mcgsm"], {"train_means": train_means})

            # select batch and compute hidden activations
            Y = outputs[idx : idx + batch_size]
            H = inputs[idx : idx + batch_size]

            for l in range(self.num_layers):
                H = self.slstm[l].forward(H)

            # form inputs to MCGSM
            H_flat = H.reshape(-1, self.num_hiddens).T
            Y_flat = Y.reshape(-1, self.num_channels).T

            norm_const = -H_flat.shape[1]

            # compute gradients
            df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
            df_dh = df_dh.T.reshape(*H.shape) / norm_const

            # average negative log-likelihood
            f = sum(loglik) / norm_const

            df_dtheta = {}
            df_dtheta["slstm"] = [0.0] * self.num_layers

            for l in range(self.num_layers)[::-1]:
                if l not in train_layers:
                    break
                if l > min(train_layers):
                    # derivative with respect to inputs of layer l are derivatives
                    # of hidden states of layer l - 1
                    df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh, force_backward=True)
                    df_dh = df_dtheta["slstm"][l]["inputs"]
                    del df_dtheta["slstm"][l]["inputs"]

                else:
                    # no need to compute derivatives with respect to input units
                    df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh)

            # compute gradient of MCGSM
            df_dtheta["mcgsm"] = (
                self.mcgsm._parameter_gradient(H_flat, Y_flat, parameters={"train_means": train_means})
                * log(2.0)
                * self.mcgsm.dim_out
            )

            return f, df_dtheta

        # collect current parameters
        params = {}
        params["slstm"] = [0.0] * self.num_layers
        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            params["slstm"][l] = self.slstm[l].parameters()
        params["mcgsm"] = self.mcgsm._parameters({"train_means": train_means})

        # a start index for each batch
        start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size)

        if self.verbosity > 0:
            print "Training..."

        if method.upper() == "SFO":
            try:
                # optimize using sum-of-functions optimizer
                optimizer = SFO(f_df, params, start_indices, display=self.verbosity)
                params_opt = optimizer.optimize(num_passes=num_epochs)

                # set model parameters
                for l in range(self.num_layers):
                    self.slstm[l].set_parameters(params_opt["slstm"][l])
                self.mcgsm._set_parameters(params_opt["mcgsm"], {"train_means": train_means})

            except KeyboardInterrupt:
                pass

            return optimizer.hist_f_flat

        elif method.upper() == "SGD":
            loss = []
            diff = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}

            for l in train_layers:
                diff["slstm"][l] = {}
                for key in params["slstm"][l]:
                    diff["slstm"][l][key] = zeros_like(params["slstm"][l][key])

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f / log(2.0) / self.num_channels)

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params["slstm"][l]:
                            diff["slstm"][l][key] = momentum * diff["slstm"][l][key] - df["slstm"][l][key]
                            params["slstm"][l][key] = params["slstm"][l][key] + learning_rate * diff["slstm"][l][key]

                    # update MCGSM parameters
                    diff["mcgsm"] = momentum * diff["mcgsm"] - df["mcgsm"]
                    params["mcgsm"] = params["mcgsm"] + learning_rate * diff["mcgsm"]

                    if self.verbosity > 0:
                        print "{0:>5} {1:>10.4f} {2:>10.4f}".format(
                            n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :])
                        )

            return loss

        elif method.upper() == "ADAM":
            loss = []
            diff_mean = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}
            diff_sqrd = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}

            for l in train_layers:
                diff_mean["slstm"][l] = {}
                diff_sqrd["slstm"][l] = {}
                for key in params["slstm"][l]:
                    diff_mean["slstm"][l][key] = zeros_like(params["slstm"][l][key])
                    diff_sqrd["slstm"][l][key] = zeros_like(params["slstm"][l][key])

            # step counter
            t = 1

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f / log(2.0) / self.num_channels)

                    # include bias correction in step width
                    step_width = learning_rate / (1.0 - power(decay1, t)) * sqrt(1.0 - power(decay2, t))
                    t += 1

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params["slstm"][l]:
                            diff_mean["slstm"][l][key] = (
                                decay1 * diff_mean["slstm"][l][key] + (1.0 - decay1) * df["slstm"][l][key]
                            )
                            diff_sqrd["slstm"][l][key] = decay2 * diff_sqrd["slstm"][l][key] + (1.0 - decay2) * square(
                                df["slstm"][l][key]
                            )

                            params["slstm"][l][key] = params["slstm"][l][key] - step_width * diff_mean["slstm"][l][
                                key
                            ] / (1e-8 + sqrt(diff_sqrd["slstm"][l][key]))

                    # update MCGSM parameters
                    diff_mean["mcgsm"] = decay1 * diff_mean["mcgsm"] + (1.0 - decay1) * df["mcgsm"]
                    diff_sqrd["mcgsm"] = decay2 * diff_sqrd["mcgsm"] + (1.0 - decay2) * square(df["mcgsm"])
                    params["mcgsm"] = params["mcgsm"] - step_width * diff_mean["mcgsm"] / (
                        1e-8 + sqrt(diff_sqrd["mcgsm"])
                    )

                    if self.verbosity > 0:
                        print "{0:>5} {1:>10.4f} {2:>10.4f}".format(
                            n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :])
                        )

            return loss

        else:
            raise ValueError("Unknown method '{0}'.".format(method))
sample_T=theano.function([muW0T, muW1T, muW2T, mub0T, mub1T, mub2T, 
					covW0T, covW1T, covW2T, covb0T, covb1T, covb2T],
					samplesT,
					allow_input_downcast=True)

def sample(params):
	out = sample_T(params[0],params[1],params[2],params[3],params[4],params[5],
						params[6],params[7],params[8],params[9],params[10],params[11])
	return out

# Creating the optimizer

optimizer = SFO(f_df, init_params, subfuncs)

# Running the optimization

init_loss = f_df(init_params,subfuncs[0])[0]
print init_loss

keyin=''
while keyin!='y':
	opt_params = optimizer.optimize(num_passes=12)
	end_loss = f_df(opt_params,subfuncs[0])[0]
	print 'Current loss: ', end_loss
	W=opt_params[0]
	pp.scatter(W[0,:],W[1,:]); pp.show()
	keyin=raw_input('End optimization? (y)')

samples=sample(opt_params)
pp.scatter(samples[:,0],samples[:,1]); pp.show()