Example
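This example is a fragment of a larger module, so its module-level imports are not shown; judging from the code it relies on at least the following (the import path of DGGPLVM_model is not given in the source, so it is only noted in a comment):

import time
from copy import deepcopy

import numpy as np
# DGGPLVM_model (the model class instantiated below) must also be importable
# from the accompanying model-definition module; its path is not shown here.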
class DGGPLVM_opt:
    def __init__(self, params, X, Y, samples=20, batch_size=None):
        self.Y = Y
        self.X = X
        self.N = X.shape[0]
        if batch_size is None:
            batch_size = self.N

        # N / batch_size: presumably the factor that rescales minibatch
        # estimates to the full dataset.
        correct = self.N / batch_size

        self.dggplvm = DGGPLVM_model(params,
                                     correct,
                                     samples=samples,
                                     batch_size=batch_size)

        self.ELBO = self.dggplvm.ELBO
        self.f = self.dggplvm.f
        self.params = self.dggplvm.params
        self.estimate_U = self.dggplvm.estimate_U
        self.exec_f = self.dggplvm.exec_f
        self.estimate = self.dggplvm.estimate
        self.callback_counter = [0]
        self.print_interval = 10

        # For RMSPROP
        self.param_updates = {n: np.zeros_like(v)
                              for n, v in params.items()
                              }  # zero arrays of the same shape, to hold the updates
        self.moving_mean_squared = {
            n: np.zeros_like(v)
            for n, v in params.items()
        }  # per-parameter running mean of squared gradients (the RMSPROP history)
        self.learning_rates = {
            n: 1e-2 * np.ones_like(v)
            for n, v in params.items()
        }  # per-parameter learning rates of the same shape

        # For packing the parameters into the SciPy optimizers
        self.opt_param_names = ['Z', 'm', 'S_b', 'lhyp', 'ls']
        self.opt_param_values = [
            np.atleast_2d(params[n]) for n in self.opt_param_names
        ]

        self.shapes = [v.shape for v in self.opt_param_values]
        # The shapes are kept in a list, e.g. Z's shape might be (40, 16)
        self.sizes = [
            sum([np.prod(x) for x in self.shapes[:i]])
            for i in range(len(self.shapes) + 1)
        ]
        # Cumulative sums of the flattened parameter sizes, e.g. [0, 40, 41, 59, 1659, 2299]
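        # Worked example under hypothetical shapes: if self.shapes were
        # [(40, 16), (40, 2), (40, 2), (1, 17), (1, 1)], the flattened sizes
        # would be [640, 80, 80, 17, 1] and self.sizes would be
        # [0, 640, 720, 800, 817, 818]; unpack() below then recovers
        # parameter i as x[sizes[i]:sizes[i+1]].reshape(shapes[i]).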

        # For the two-stage (local / global) optimization
        self.opt_local_names = ['m', 'S_b']
        self.opt_local_values = [
            np.atleast_2d(params[n]) for n in self.opt_local_names
        ]

        self.shapes_local = [v.shape for v in self.opt_local_values]
        # Shapes of the local parameters, kept in a list as above
        self.sizes_local = [
            sum([np.prod(x) for x in self.shapes_local[:i]])
            for i in range(len(self.shapes_local) + 1)
        ]

        self.opt_global_names = ['Z', 'lhyp', 'ls']
        self.opt_global_values = [
            np.atleast_2d(params[n]) for n in self.opt_global_names
        ]

        self.shapes_global = [v.shape for v in self.opt_global_values]
        # Shapes of the global parameters, kept in a list as above
        self.sizes_global = [
            sum([np.prod(x) for x in self.shapes_global[:i]])
            for i in range(len(self.shapes_global) + 1)
        ]

    def get_grad(self, param_name, X, minibatch):
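        # Stochastic gradient of the ELBO w.r.t. param_name: the KL term is
        # evaluated with exec_f and the likelihood term is estimated by
        # sampling via estimate(); for 'lhyp' the estimate is recomputed with
        # 10x more samples when it looks too noisy.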
        #wrt = {'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b, 'lhyp': lhyp, 'ls': ls, 'KmmInv': KmmInv}

        if param_name in ['m', 'S_b']:
            grad = self.exec_f(self.dggplvm.g[param_name]['KL_X'], X,
                               minibatch) + self.estimate(
                                   self.dggplvm.g[param_name]['LL'], minibatch,
                                   X)[0]

        elif param_name in ['mu', 'Sigma_b']:
            grad = self.exec_f(
                self.dggplvm.g[param_name]['KL_U'], X) + self.estimate(
                    self.dggplvm.g[param_name]['LL'], minibatch, X)[0]

        elif param_name in ['Z', 'lhyp', 'ls']:
            grad_ls, grad_std = self.estimate(self.dggplvm.g[param_name]['LL'],
                                              minibatch, X)
            grad = self.exec_f(self.dggplvm.g[param_name]['KL_U'], X) + grad_ls

        # DEBUG
        if param_name == 'lhyp' and np.any(
                np.abs(grad) < grad_std / np.sqrt(self.dggplvm.samples)):
            #print 'Large noise, recomputing. lhyp grad mean:', grad, ', std:', grad_std / np.sqrt(self.clgp.samples)
            samples = self.dggplvm.samples * 10
            grad_ls, grad_std = self.estimate(self.dggplvm.g[param_name]['LL'],
                                              minibatch,
                                              X,
                                              samples=samples)
            grad = self.exec_f(self.dggplvm.g[param_name]['KL_U'], X) + grad_ls
            self.grad_std = grad_std

        return np.array(grad)

    # The minibatch argument specifies which samples to optimize over
    def opt_one_step(self,
                     params,
                     iteration,
                     minibatch,
                     opt='rmsprop',
                     learning_rate_adapt=0.2,
                     use_einsum=True):
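        # One stochastic update over the given minibatch: S_b uses plain
        # gradient ascent, m uses the minibatch variant of RMSPROP, everything
        # else uses full RMSPROP; lhyp is afterwards clipped to [-8, 8].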

        for param_name in params:
            # DEBUG
            if param_name in ['S_b']:  # ls is the variance of x
                self.grad_ascent_one_step(
                    param_name,
                    minibatch, [param_name, self.X, minibatch],
                    learning_rate_decay=learning_rate_adapt * 100 /
                    (iteration + 100.0))

            elif param_name in ['m']:
                self.rmsprop_one_step_minibatch(
                    param_name,
                    minibatch, [param_name, self.X, minibatch],
                    learning_rate_adapt=learning_rate_adapt
                )  #, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))

            else:
                self.rmsprop_one_step(
                    param_name,
                    minibatch, [param_name, self.X, minibatch],
                    learning_rate_adapt=learning_rate_adapt
                )  #, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))

            if param_name in ['lhyp']:
                self.params[param_name] = np.clip(self.params[param_name], -8,
                                                  8)

            #if param_name in ['lhyp', 'Z']:
            #    self.dggplvm.update_KmmInv_cache()

    def opt_local_step(self,
                       local_params,
                       iteration,
                       minibatch,
                       opt='rmsprop',
                       learning_rate_adapt=0.2,
                       use_einsum=True):
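        # Same scheme as opt_one_step, but restricted to the local
        # parameters ('m', 'S_b').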

        for param_name in local_params:
            # DEBUG
            if param_name in ['S_b']:  # ls is the variance of x
                self.grad_ascent_one_step(
                    param_name,
                    minibatch, [param_name, self.X, minibatch],
                    learning_rate_decay=learning_rate_adapt * 100 /
                    (iteration + 100.0))

            elif param_name in ['m']:
                self.rmsprop_one_step_minibatch(
                    param_name,
                    minibatch, [param_name, self.X, minibatch],
                    learning_rate_adapt=learning_rate_adapt
                )  #, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))

    def opt_global_step(self,
                        global_params,
                        iteration,
                        minibatch,
                        opt='rmsprop',
                        learning_rate_adapt=0.2,
                        use_einsum=True):
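        # RMSPROP step for the global parameters ('Z', 'lhyp', 'ls'),
        # with lhyp clipped to [-8, 8] afterwards.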

        for param_name in global_params:
            # DEBUG
            self.rmsprop_one_step(
                param_name,
                minibatch, [param_name, self.X, minibatch],
                learning_rate_adapt=learning_rate_adapt
            )  #, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))

            if param_name in ['lhyp']:
                self.params[param_name] = np.clip(self.params[param_name],
                                                  -8, 8)

    def grad_ascent_one_step(self,
                             param_name,
                             minibatch,
                             grad_args,
                             momentum=0.9,
                             learning_rate_decay=1):
        # grad_args = [param_name, self.X, minibatch]
        # Apply the previously stored gradient as a (decayed) step, then
        # compute and store the current gradient for the next call.
        self.dggplvm.params[param_name][minibatch] += (
            learning_rate_decay * self.learning_rates[param_name][minibatch] *
            self.param_updates[param_name][minibatch])
        grad = self.get_grad(*grad_args)
        self.param_updates[param_name][minibatch] = grad

    def rmsprop_one_step_minibatch(self,
                                   param_name,
                                   minibatch,
                                   grad_args,
                                   decay=0.9,
                                   momentum=0,
                                   learning_rate_adapt=0.05,
                                   learning_rate_min=1e-6,
                                   learning_rate_max=10):
        # RMSPROP: Tieleman, T. and Hinton, G. (2012), Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning
        # Implementation based on https://github.com/BRML/climin/blob/master/climin/rmsprop.py

        # We use Nesterov momentum: first, we make a step according to the momentum and then we calculate the gradient.
        step1 = self.param_updates[param_name][minibatch] * momentum
        self.params[param_name][minibatch] += step1
        grad = self.get_grad(*grad_args)

        self.moving_mean_squared[param_name][minibatch] = (
            decay * self.moving_mean_squared[param_name][minibatch] +
            (1 - decay) * grad**2)
        step2 = self.learning_rates[param_name][minibatch] * grad / (
            self.moving_mean_squared[param_name][minibatch] + 1e-8)**0.5

        self.params[param_name][minibatch] += step2

        step = step1 + step2

        # Step rate adaption. If the current step and the momentum agree, we slightly increase the step rate for that dimension.
        if learning_rate_adapt:
            # This code might look weird, but it makes it work with both numpy and gnumpy.
            step_non_negative = step > 0
            step_before_non_negative = self.param_updates[param_name][
                minibatch] > 0
            agree = (step_non_negative
                     == step_before_non_negative) * 1.  # yields 0. or 1.
            adapt = 1 + agree * learning_rate_adapt * 2 - learning_rate_adapt
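            # agree == 1 -> multiply the step rate by (1 + learning_rate_adapt),
            # agree == 0 -> multiply it by (1 - learning_rate_adapt).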
            self.learning_rates[param_name][minibatch] *= adapt
            self.learning_rates[param_name][minibatch] = np.clip(
                self.learning_rates[param_name][minibatch], learning_rate_min,
                learning_rate_max)

        self.param_updates[param_name][minibatch] = step

    def rmsprop_one_step(self,
                         param_name,
                         minibatch,
                         grad_args,
                         decay=0.9,
                         momentum=0,
                         learning_rate_adapt=0.05,
                         learning_rate_min=1e-6,
                         learning_rate_max=10):
        # RMSPROP: Tieleman, T. and Hinton, G. (2012), Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning
        # Implementation based on https://github.com/BRML/climin/blob/master/climin/rmsprop.py

        # We use Nesterov momentum: first, we make a step according to the momentum and then we calculate the gradient.
        step1 = self.param_updates[param_name] * momentum
        self.params[param_name] += step1
        grad = self.get_grad(*grad_args)

        self.moving_mean_squared[param_name] = (
            decay * self.moving_mean_squared[param_name] +
            (1 - decay) * grad**2)
        step2 = self.learning_rates[param_name] * grad / (
            self.moving_mean_squared[param_name] + 1e-8)**0.5

        # DEBUG
        if param_name == 'lhyp':
            step2 = np.clip(step2, -0.1, 0.1)

        self.params[param_name] += step2

        step = step1 + step2

        # Step rate adaption. If the current step and the momentum agree, we slightly increase the step rate for that dimension.
        if learning_rate_adapt:
            # This code might look weird, but it makes it work with both numpy and gnumpy.
            step_non_negative = step > 0
            step_before_non_negative = self.param_updates[param_name] > 0
            agree = (step_non_negative
                     == step_before_non_negative) * 1.  # yields 0. or 1.
            adapt = 1 + agree * learning_rate_adapt * 2 - learning_rate_adapt
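            # agree == 1 -> multiply the step rate by (1 + learning_rate_adapt),
            # agree == 0 -> multiply it by (1 - learning_rate_adapt).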
            self.learning_rates[param_name] *= adapt
            self.learning_rates[param_name] = np.clip(
                self.learning_rates[param_name], learning_rate_min,
                learning_rate_max)

        self.param_updates[param_name] = step

    def choose_best_z(self, ind, Y_true, mask, samples=20):
        """
        Assign m[i] to the best location among all the inducing points.
        """
        orig_params = {'m': self.params['m'], 'ls': self.params['ls']}
        N = len(ind)
        M = self.params['Z'].shape[0]

        self.params['ls'] = self.params['ls'][ind]
        f = np.zeros((M + 1, N))
        for m in range(M + 1):
            if m < M:
                self.params['m'] = np.tile(self.params['Z'][m], (N, 1))
            else:
                self.params['m'] = orig_params['m'][ind]

            # KL.
            kl_x = self.exec_f(self.f['KL_X_all'])
            f[m] += kl_x

            # Likelihood.
            for modality in range(len(Y_true)):
                S, _ = self.estimate(self.f['S'],
                                     modality=modality,
                                     samples=samples)

                Y_ind = Y_true[modality][ind]
                mask_ind = mask[:, modality][ind]
                f[m] += np.log(np.maximum(np.sum(S * Y_ind, 1),
                                          1e-16)) * mask_ind

        self.params['m'], self.params['ls'] = orig_params['m'], orig_params[
            'ls']

        best_z = np.argmax(f, 0)

        # Do not change m if best_z == M.
        self.params['m'][ind[best_z < M]] = self.params['Z'][best_z[
            best_z < M]]

        return best_z

#########################################################################################################################################
### Optimization with L-BFGS

    def unpack(self, x):
        x_param_values = [
            x[self.sizes[i - 1]:self.sizes[i]].reshape(self.shapes[i - 1])
            for i in range(1,
                           len(self.shapes) + 1)
        ]  # one slice per entry of self.opt_param_names (['Z', 'm', 'S_b', 'lhyp', 'ls'])
        # Even though the parameters arrive as one flat vector x, parameter i is
        # recovered as x[sizes[i]:sizes[i+1]] (the right number of entries) and
        # reshaped back to shapes[i].
        params = {n: v for (n, v) in zip(self.opt_param_names, x_param_values)}
        if 'lhyp' in params:
            params['lhyp'] = params['lhyp'].squeeze()

        if 'ls' in params:
            params['ls'] = params['ls'].reshape(1)

        return params

    def _convert_to_array(self, params):
        # Flatten only the parameters being optimized, in the same order as
        # self.opt_param_names, so that unpack() can invert this packing.
        return np.hstack([params[n].flatten() for n in self.opt_param_names])

    #'Z', 'm', 'S_b', 'mu', 'Sigma_b', 'lhyp', 'ls'
    def _optimizer_f(self, hypInArray):
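        # Objective for scipy.optimize.minimize: the negative of the ELBO
        # estimate at the unpacked parameters.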
        params = self.unpack(hypInArray)
        # Update the shared parameter dict in place (it aliases self.dggplvm.params).
        for param_name in self.opt_param_names:
            self.params[param_name] = params[param_name]
        cost = self.ELBO(self.X, self.N)
        return -cost[0]

    def _optimizer_g(self, hypInArray):
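        # Gradient callback (jac) for scipy.optimize.minimize, with the
        # per-parameter gradients flattened and stacked in opt_param_names order.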
        params = self.unpack(hypInArray)
        for param_name in self.opt_param_names:
            self.params[param_name] = params[param_name]
        gradient = []
        minibatch = np.arange(self.N)
        for i in self.opt_param_names:
            g = self.get_grad(i, self.X, minibatch)
            gradient = np.hstack((gradient, g.flatten()))
        return gradient

    def train_by_optimizer(self, batch_size=None):

        print('start to optimize')
        likelihood = self.dggplvm.ELBO(self.X, self.N)
        print('BEGIN Training, Log Likelihood = %.2f' % likelihood[0])
        #import minimize
        #opt_results = minimize.run(self._optimizer_f, self._get_hypArray(params),length=number_epoch,verbose=True)
        init = []
        from scipy.optimize import minimize

        init = self._convert_to_array(self.params)

        opt_results = minimize(self._optimizer_f,
                               init,
                               method='L-BFGS-B',
                               jac=self._optimizer_g,
                               options={
                                   'ftol': 0,
                                   'disp': True,
                                   'maxiter': 500
                               },
                               tol=0,
                               callback=self.callback)
        optimalHyp = deepcopy(opt_results.x)
        hype = self.unpack(optimalHyp)
        for param_name in self.opt_param_names:
            self.params[param_name] = hype[param_name]

        likelihood = self.dggplvm.ELBO(self.X, self.N)
        print('END Training, Log Likelihood = %.2f' % likelihood[0])

    def callback(self, x):
        # Called by the optimizer after each iteration; print progress every
        # print_interval calls.
        if self.callback_counter[0] % self.print_interval == 0:
            opt_params = self.unpack(x)
            for param_name in self.opt_param_names:
                self.params[param_name] = opt_params[param_name]
            cost = self.ELBO(self.X, self.N)
            print('iter ' + str(self.callback_counter) + ': ' + str(cost[0]) +
                  ' +- ' + str(cost[1]))
        self.callback_counter[0] += 1

##################################################################################################################################################

    def train_by_optimizer_local_and_global(self, batch_size=None):
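        # Alternate L-BFGS over the local parameters ('m', 'S_b') and the
        # global parameters ('Z', 'lhyp', 'ls') for max_iteration outer
        # iterations, timing the whole run.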
        iteration = 0
        max_iteration = 100
        print('start to optimize')
        likelihood = self.dggplvm.ELBO(self.X, self.N)
        print('BEGIN Training, Log Likelihood = %.2f' % likelihood[0])
        #import minimize
        #opt_results = minimize.run(self._optimizer_f, self._get_hypArray(params),length=number_epoch,verbose=True)
        init = []
        from scipy.optimize import minimize
        start = time.time()
        while iteration < max_iteration:

            init = np.hstack(
                (self.params['m'].flatten(), self.params['S_b'].flatten()))

            opt_results = minimize(self.local_optimizer_f,
                                   init,
                                   method='L-BFGS-B',
                                   jac=self.local_optimizer_g,
                                   options={
                                       'ftol': 0,
                                       'disp': True,
                                       'maxiter': 5000
                                   },
                                   tol=0,
                                   callback=self.callback_local)
            optimalHyp = deepcopy(opt_results.x)
            hype = self.unpack_local(optimalHyp)
            for param_name in self.opt_local_names:
                self.params[param_name] = hype[param_name]
            print('finished_local, Now iter' + str(self.callback_counter))

            # Pack only the parameters handled by unpack_global
            # ('Z', 'lhyp', 'ls'), in that order.
            init = np.hstack(
                [self.params[n].flatten() for n in self.opt_global_names])

            opt_results = minimize(self.global_optimizer_f,
                                   init,
                                   method='L-BFGS-B',
                                   jac=self.global_optimizer_g,
                                   options={
                                       'ftol': 0,
                                       'disp': True,
                                       'maxiter': 5000
                                   },
                                   tol=0,
                                   callback=self.callback_global)
            optimalHyp = deepcopy(opt_results.x)
            hype = self.unpack_global(optimalHyp)
            for param_name in self.opt_global_names:
                self.params[param_name] = hype[param_name]

            likelihood = self.dggplvm.ELBO(self.X, self.N)
            print('finished_global, Now iter' + str(self.callback_counter))
            print(iteration)
            iteration += 1

        likelihood = self.dggplvm.ELBO(self.X, self.N)
        elapsed_time = time.time() - start
        print(elapsed_time)
        print('END Training, Log Likelihood = %.2f' % likelihood[0])

    def unpack_local(self, x):
        x_param_values = [
            x[self.sizes_local[i - 1]:self.sizes_local[i]].reshape(
                self.shapes_local[i - 1])
            for i in range(1,
                           len(self.shapes_local) + 1)
        ]  # one slice per entry of self.opt_local_names (['m', 'S_b'])
        # Each local parameter is recovered from the flat vector x as
        # x[sizes_local[i]:sizes_local[i+1]] and reshaped back to shapes_local[i].
        params = {n: v for (n, v) in zip(self.opt_local_names, x_param_values)}

        return params

    def unpack_global(self, x):
        x_param_values = [
            x[self.sizes_global[i - 1]:self.sizes_global[i]].reshape(
                self.shapes_global[i - 1])
            for i in range(1,
                           len(self.shapes_global) + 1)
        ]  # one slice per entry of self.opt_global_names (['Z', 'lhyp', 'ls'])
        # Each global parameter is recovered from the flat vector x as
        # x[sizes_global[i]:sizes_global[i+1]] and reshaped back to shapes_global[i].
        params = {
            n: v
            for (n, v) in zip(self.opt_global_names, x_param_values)
        }
        if 'lhyp' in params:
            params['lhyp'] = params['lhyp'].squeeze()

        if 'ls' in params:
            params['ls'] = params['ls'].reshape(1)

        return params

    def local_optimizer_f(self, hypInArray):
        params = self.unpack_local(hypInArray)
        for param_name in self.opt_local_names:
            self.params[param_name] = params[param_name]
        cost = self.ELBO(self.X, self.N)
        return -cost[0]

    def local_optimizer_g(self, hypInArray):
        params = self.unpack_local(hypInArray)
        for param_name in self.opt_local_names:
            self.params[param_name] = params[param_name]
        gradient = []
        minibatch = np.arange(self.N)
        for i in self.opt_local_names:
            g = self.get_grad(i, self.X, minibatch)
            gradient = np.hstack((gradient, g.flatten()))
        return gradient

    def global_optimizer_f(self, hypInArray):
        params = self.unpack_global(hypInArray)
        for param_name in self.opt_global_names:
            self.params[param_name] = params[param_name]
        cost = self.ELBO(self.X, self.N)
        return -cost[0]

    def global_optimizer_g(self, hypInArray):
        params = self.unpack_global(hypInArray)
        for param_name in self.opt_global_names:
            self.params[param_name] = params[param_name]
        gradient = []
        minibatch = np.arange(self.N)
        for i in self.opt_global_names:
            g = self.get_grad(i, self.X, minibatch)
            gradient = np.hstack((gradient, g.flatten()))
        return gradient

    def callback_global(self, x):
        # Called by the optimizer after each iteration; print progress every print_interval calls.
        if self.callback_counter[0] % self.print_interval == 0:
            opt_params = self.unpack_global(x)
            for param_name in self.opt_global_names:
                self.params[param_name] = opt_params[param_name]
            cost = self.ELBO(self.X, self.N)
            print('iter ' + str(self.callback_counter) + ': ' + str(cost[0]) +
                  ' +- ' + str(cost[1]))
        self.callback_counter[0] += 1

    def callback_local(self, x):
        # Called by the optimizer after each iteration; print progress every print_interval calls.
        if self.callback_counter[0] % self.print_interval == 0:
            opt_params = self.unpack_local(x)
            for param_name in self.opt_local_names:
                self.params[param_name] = opt_params[param_name]
            cost = self.ELBO(self.X, self.N)
            print('iter ' + str(self.callback_counter) + ': ' + str(cost[0]) +
                  ' +- ' + str(cost[1]))
        self.callback_counter[0] += 1

#############################for experiment

    def experiment_train_by_optimizer_local_and_global(self, batch_size=None):
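        # Same two-stage scheme as train_by_optimizer_local_and_global, but
        # each outer iteration restarts the global L-BFGS phase 20 times with
        # a looser ftol and a smaller maxiter.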
        iteration = 0
        max_iteration = 100
        print('start to optimize')
        likelihood = self.dggplvm.ELBO(self.X, self.N)
        print('BEGIN Training, Log Likelihood = %.2f' % likelihood[0])
        #import minimize
        #opt_results = minimize.run(self._optimizer_f, self._get_hypArray(params),length=number_epoch,verbose=True)
        init = []
        from scipy.optimize import minimize

        while iteration < max_iteration:

            init = np.hstack(
                (self.params['m'].flatten(), self.params['S_b'].flatten()))

            opt_results = minimize(self.local_optimizer_f,
                                   init,
                                   method='L-BFGS-B',
                                   jac=self.local_optimizer_g,
                                   options={
                                       'ftol': 0,
                                       'disp': True,
                                       'maxiter': 500
                                   },
                                   tol=0,
                                   callback=self.callback_local)
            optimalHyp = deepcopy(opt_results.x)
            hype = self.unpack_local(optimalHyp)
            for param_name in self.opt_local_names:
                self.params[param_name] = hype[param_name]
            print('finished_local, Now iter' + str(self.callback_counter))
            test = 0
            while test < 20:
                # Pack only the parameters handled by unpack_global
                # ('Z', 'lhyp', 'ls'), in that order.
                init = np.hstack(
                    [self.params[n].flatten() for n in self.opt_global_names])

                opt_results = minimize(self.global_optimizer_f,
                                       init,
                                       method='L-BFGS-B',
                                       jac=self.global_optimizer_g,
                                       options={
                                           'ftol': 1.0e-6,
                                           'disp': True,
                                           'maxiter': 200
                                       },
                                       tol=0,
                                       callback=self.callback_global)
                optimalHyp = deepcopy(opt_results.x)
                hype = self.unpack_global(optimalHyp)

                for param_name in self.opt_global_names:
                    self.params[param_name] = hype[param_name]
                if self.callback_counter[0] % 20 == 0:
                    print('Now_global_iter:' + str(test))
                test += 1

            likelihood = self.dggplvm.ELBO(self.X, self.N)
            print('finished_global, Now iter' + str(self.callback_counter))
            print('finished_global, Now iter' + str(iteration))
            iteration += 1

        likelihood = self.dggplvm.ELBO(self.X, self.N)
        print('END Training, Log Likelihood = %.2f' % likelihood[0])
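
A minimal, self-contained sketch (NumPy only, with made-up parameter names and shapes) of the flatten / cumulative-size / reshape round trip that the opt_param_names, shapes, sizes, unpack and _convert_to_array machinery above implements for the SciPy interface:

import numpy as np

# Hypothetical parameters (names and shapes are illustrative only).
params = {'Z': np.arange(8.0).reshape(4, 2),
          'lhyp': np.arange(3.0),
          'ls': np.zeros(1)}
opt_param_names = ['Z', 'lhyp', 'ls']

# Same construction as in __init__: per-parameter shapes and cumulative sizes.
values = [np.atleast_2d(params[n]) for n in opt_param_names]
shapes = [v.shape for v in values]
sizes = [sum(int(np.prod(s)) for s in shapes[:i]) for i in range(len(shapes) + 1)]
print(shapes)  # [(4, 2), (1, 3), (1, 1)]
print(sizes)   # [0, 8, 11, 12]

# Pack: flatten in opt_param_names order (what _convert_to_array does).
x = np.hstack([params[n].flatten() for n in opt_param_names])

# Unpack: slice the flat vector with the cumulative sizes and reshape each
# slice back to its stored shape (what unpack / unpack_local / unpack_global do).
unpacked = {n: x[sizes[i]:sizes[i + 1]].reshape(shapes[i])
            for i, n in enumerate(opt_param_names)}
assert all(np.array_equal(np.atleast_2d(params[n]), unpacked[n])
           for n in opt_param_names)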