class DGGPLVM_opt:
    def __init__(self,D, M,Q,Domain_number,train_set_x,train_weight,train_label,batch_size,Hiddenlayerdim1,Hiddenlayerdim2,load=None):
        
        
        self.dggplvm = DGGPLVM_model(D, M,Q,Domain_number,Hiddenlayerdim1,Hiddenlayerdim2)
        
        self.wrt = self.dggplvm.wrt
        
        if load is None:
            self.dggplvm.compile_F(train_set_x,train_weight,train_label,batch_size)        
        
        if load is not None:
            with open('params.dump', 'rb') as f:
                mydict_load = pickle.load(f)
        
            for i in mydict_load[0]:
                self.wrt[i].set_value(mydict_load[0][i])
            self.dggplvm.import_F(train_set_x,train_weight,train_label,batch_size)        
        
        self.global_params=self.dggplvm.global_params
        

        self.f = self.dggplvm.f

        self.estimate = self.dggplvm.estimate
        
        self.callback_counter = [0]
        self.print_interval = 10
        
        # Scaling factor N / batch_size: rescales the minibatch stochastic gradient of the
        # expected likelihood to an estimate over the full training set.
        self.correct = train_set_x.get_value().shape[0] / batch_size
        
        # Buffers for RMSPROP
        self.param_updates = {n: np.zeros_like(v.get_value()) for n, v in self.wrt.items()}  # previous update, one zero array per parameter

        self.moving_mean_squared = {n: np.zeros_like(v.get_value()) for n, v in self.wrt.items()}  # per-parameter moving average of squared gradients (RMSPROP history)

        self.learning_rates = {n: 1e-2*np.ones_like(v.get_value()) for n, v in self.wrt.items()}  # per-parameter learning rates, same shape as each parameter

        if load is not None:
            self.param_updates = mydict_load[1]['param_updates']
            self.moving_mean_squared = mydict_load[1]['moving_mean_squared']
            self.learning_rates = mydict_load[1]['learning_rates']
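
    # A minimal usage sketch (hypothetical sizes and variable names; train_set_x,
    # train_weight and train_label are assumed to be theano shared variables, as
    # expected by DGGPLVM_model.compile_F):
    #
    #   opt = DGGPLVM_opt(D=5, M=20, Q=2, Domain_number=3,
    #                     train_set_x=X_shared, train_weight=W_shared,
    #                     train_label=Y_shared, batch_size=100,
    #                     Hiddenlayerdim1=50, Hiddenlayerdim2=50)
    #   for it in range(1000):
    #       idx = np.random.permutation(X_shared.get_value().shape[0])[:100]   # minibatch indices
    #       opt.opt_one_step(it, idx)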
            

    def get_grad(self, param_name, index):
        #wrt = {'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma_b': Sigma_b, 'lhyp': lhyp, 'ls': ls, 'KmmInv': KmmInv}
                
        if param_name in self.global_params:
            # Analytic gradient of the KL(q(U)||p(U)) term plus the Monte-Carlo estimate of
            # the expected-likelihood gradient on the minibatch, rescaled to the full data set.
            grad1, grad_std = self.estimate(param_name, index)
            grad = self.dggplvm.g[param_name]['KL_U']() + grad1*self.correct

        else:
            # Same composition for local parameters, with the KL term over X.
            grad = self.dggplvm.g[param_name]['KL_X']() + self.estimate(param_name, index)[0]*self.correct
                                  
        # DEBUG: if the Monte-Carlo noise dominates the lhyp gradient (its magnitude is below
        # the standard error of the estimate), re-estimate it with more samples.
        if param_name == 'lhyp' and np.any(np.abs(grad) < grad_std / np.sqrt(30)):
            #print 'Large noise, recomputing. lhyp grad mean:', grad, ', std:', grad_std / np.sqrt(self.clgp.samples)
            grad_ls, grad_std = self.estimate(param_name, index, 300)
            grad = self.dggplvm.g[param_name]['KL_U']() + grad_ls
            self.grad_std = grad_std

        return np.array(grad)

    def opt_one_step(self, iteration,index, opt = 'rmsprop', learning_rate_adapt = 0.2, use_einsum = True):        
        #for param_name in self.dggplvm.local_params:
        #        self.rmsprop_one_step(param_name, index, [param_name, index], learning_rate_adapt = learning_rate_adapt)#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
    
        for param_name in  self.dggplvm.wrt:
            if opt == 'grad_ascent' or param_name in ['gamma_S','beta_S','b_S','W_S']:  # these parameters are updated by plain gradient ascent
                self.grad_ascent_one_step(param_name, index,[param_name, index], 
                    learning_rate_decay = learning_rate_adapt * 100 / (iteration + 100.0))
            else:
                self.rmsprop_one_step(param_name, index, [param_name, index], learning_rate_adapt = learning_rate_adapt)#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
            if param_name in ['lhyp','ls']:
                self.wrt[param_name].set_value(np.clip(self.wrt[param_name].get_value(), -5, 5))
                
    def opt_one_step_local(self, iteration,index, opt = 'rmsprop', learning_rate_adapt = 0.2, use_einsum = True):        
        for param_name in self.dggplvm.local_params:
            if opt == 'grad_ascent' or param_name in ['S_b']:  # S_b is updated by plain gradient ascent
                self.grad_ascent_one_step(param_name, index,[param_name, index], 
                    learning_rate_decay = learning_rate_adapt * 100 / (iteration + 100.0))
            else:
                self.rmsprop_one_step(param_name, index, [param_name, index], learning_rate_adapt = learning_rate_adapt)#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
            if param_name in ['lhyp','ls']:
                self.wrt[param_name].set_value(np.clip(self.wrt[param_name].get_value(), -5, 5))
            
    def opt_one_step_global(self, iteration,index, opt = 'rmsprop', learning_rate_adapt = 0.2, use_einsum = True):        
        for param_name in self.dggplvm.global_params:
            self.rmsprop_one_step(param_name, index, [param_name, index], learning_rate_adapt = learning_rate_adapt)#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
    
    def opt_one_step2(self, iteration,index, opt = 'rmsprop', learning_rate_adapt = 0.2, use_einsum = True):        
        max_epoch=20
        epoch=0
        while epoch < max_epoch:
            for param_name in self.dggplvm.local_params:
                self.rmsprop_one_step(param_name, index, [param_name, index], learning_rate_adapt = learning_rate_adapt)#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
            epoch += 1

        epoch = 0  # reset the counter so the Z_params loop actually runs
        while epoch < max_epoch:
            for param_name in self.dggplvm.Z_params:
                self.rmsprop_one_step(param_name, index, [param_name, index], learning_rate_adapt = learning_rate_adapt)#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
            epoch += 1

        while epoch < 30:
            for param_name in self.dggplvm.hyp_params:
                self.rmsprop_one_step(param_name, index, [param_name, index], learning_rate_adapt = learning_rate_adapt)#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
            epoch += 1
       
        #if param_name in ['lhyp']:
        #    self.params[param_name] = np.clip(self.params[param_name], -8, 8)
            
            #if param_name in ['lhyp', 'Z']:
            #    self.dggplvm.update_KmmInv_cache()
    
    def opt_one_step_grad(self, iteration,index, opt = 'rmsprop', learning_rate_adapt = 0.2, use_einsum = True):        
        for param_name in self.dggplvm.local_params:
            self.grad_ascent_one_step(param_name, index, [param_name, index])#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
    
        for param_name in self.dggplvm.global_params:
            if opt == 'grad_ascent' or param_name in ['ls']:  # ls is the variance of x
                self.grad_ascent_one_step(param_name, index, [param_name, index],
                    learning_rate_decay = learning_rate_adapt * 100 / (iteration + 100.0))
            else:
                self.grad_ascent_one_step(param_name, index, [param_name, index])#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
    
    def grad_ascent_one_step(self, param_name, index, grad_args, momentum = 0.9, learning_rate_decay = 1):
        # grad_args = [param_name, index]
        # Apply the step computed from the previous call's gradient, then evaluate the
        # current gradient and store it as the next update.
        self.wrt[param_name].set_value(self.wrt[param_name].get_value(borrow=True) + (learning_rate_decay*self.learning_rates[param_name]* self.param_updates[param_name]))
        grad = self.get_grad(*grad_args)
        self.param_updates[param_name] = grad
    
    def opt_simple(self, iteration,index, opt = 'rmsprop', learning_rate_adapt = 0.2, use_einsum = True):        
        for param_name in self.dggplvm.local_params:
            self.grad_simple(param_name, index, [param_name, index])#, momentum = 0.9 - 0.4 * 100 / (iteration + 100.0))
    
        for param_name in  self.dggplvm.global_params:
            self.grad_simple(param_name, index, [param_name, index])
    
    def grad_simple(self, param_name, index, grad_args, momentum = 0.9, learning_rate_decay = 1):
        #grad_args=[param_name, self.Y, KmmInv_grad, self.mask]
        self.wrt[param_name].set_value(self.wrt[param_name].get_value(borrow=True) + self.get_grad(*grad_args)*0.01)
    
    def rmsprop_one_step(self, param_name, index, grad_args, decay = 0.9, momentum = 0, learning_rate_adapt = 0.05, 
        learning_rate_min = 1e-6, learning_rate_max = 10):
        # RMSPROP: Tieleman, T. and Hinton, G. (2012), Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning
        # Implementation based on https://github.com/BRML/climin/blob/master/climin/rmsprop.py
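        # A minimal sketch of the update performed below (assuming a single numpy
        # parameter `theta` with gradient `g`, and the same hyperparameters):
        #
        #   mms   = decay * mms + (1 - decay) * g**2      # moving mean of squared gradients
        #   step  = lr * g / np.sqrt(mms + 1e-8)           # gradient rescaled by its RMS
        #   theta = theta + step                           # gradient *ascent* on the ELBO
        #
        # preceded by a Nesterov-style momentum step and followed by per-dimension
        # step-rate adaptation.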
        
        # We use Nesterov momentum: first, we make a step according to the momentum and then we calculate the gradient.
        step1 = self.param_updates[param_name] * momentum
        self.wrt[param_name].set_value(self.wrt[param_name].get_value()+step1)
        grad = self.get_grad(*grad_args)

        self.moving_mean_squared[param_name] = (decay * self.moving_mean_squared[param_name] + (1 - decay) * grad ** 2)
        step2 = self.learning_rates[param_name] * grad / (self.moving_mean_squared[param_name] + 1e-8)**0.5

        # DEBUG: keep lhyp/ls steps small to avoid instability
        if param_name in ['lhyp', 'ls']:
            step2 = np.clip(step2, -0.1, 0.1)
        
        self.wrt[param_name].set_value(self.wrt[param_name].get_value()+step2)
        #self.params[param_name] += step2

        step = step1 + step2

        # Step rate adaption. If the current step and the momentum agree, we slightly increase the step rate for that dimension.
        if learning_rate_adapt:
            # This code might look weird, but it makes it work with both numpy and gnumpy.
            step_non_negative = step > 0
            step_before_non_negative = self.param_updates[param_name] > 0
            agree = (step_non_negative == step_before_non_negative) * 1.  # 1. where the signs agree, 0. otherwise
            adapt = 1 + agree * learning_rate_adapt * 2 - learning_rate_adapt
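            # e.g. with learning_rate_adapt = 0.2: adapt = 1.2 where the signs agree
            # (agree = 1) and 0.8 where they disagree (agree = 0).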
            self.learning_rates[param_name] *= adapt
            self.learning_rates[param_name] = np.clip(self.learning_rates[param_name], learning_rate_min, learning_rate_max)

        self.param_updates[param_name] = step



#########################################################################################################################################
### Optimization with L-BFGS
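### The methods below flatten the variational parameters into a single vector, hand
### scipy.optimize.minimize (method='L-BFGS-B') the negated ELBO as the objective and its
### gradient as the Jacobian, and unpack the optimum back into per-parameter arrays.
### They assume attributes set up elsewhere (self.params, self.opt_param_names,
### self.sizes, self.shapes, self.X, self.N, self.ELBO).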

    def unpack(self, x):
        # Slice the flat vector x back into one array per parameter in self.opt_param_names:
        # for the i-th parameter, take x[self.sizes[i-1]:self.sizes[i]] and reshape it to self.shapes[i-1].
        x_param_values = [x[self.sizes[i-1]:self.sizes[i]].reshape(self.shapes[i-1]) for i in range(1,len(self.shapes)+1)]
        params = {n:v for (n,v) in zip(self.opt_param_names, x_param_values)}
        if 'lhyp' in params:
            params['lhyp']=params['lhyp'].squeeze()
        
        if 'ls' in params:
            params['ls']=params['ls'].reshape(1) 

        return params
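
    # A small worked example of the unpacking convention (hypothetical values, assuming
    # self.sizes holds cumulative offsets starting at 0): with
    # opt_param_names = ['m', 'lhyp'], shapes = [(2, 3), (1,)] and sizes = [0, 6, 7],
    # unpack(x) returns {'m': x[0:6].reshape(2, 3), 'lhyp': x[6:7].reshape(1).squeeze()}.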
    
    def _convert_to_array(self, params):
        return np.hstack((params['Z'].flatten(),params['m'].flatten(),params['S_b'].flatten(),params['mu'].flatten(),params['Sigma_b'].flatten(),params['lhyp'].flatten(),params['ls'].flatten()))
    #'Z', 'm', 'S_b', 'mu', 'Sigma_b', 'lhyp', 'ls'
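    # Note: the concatenation order above must match self.opt_param_names and the
    # shapes/sizes used by unpack, so that unpack(self._convert_to_array(params))
    # reproduces params (up to the lhyp squeeze and ls reshape).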
    def _optimizer_f(self, hypInArray):
        params=self.unpack(hypInArray)
        self.params=params
        cost=self.ELBO(self.X,self.N)
        return -cost[0]
    
    def _optimizer_g(self, hypInArray):
        params=self.unpack(hypInArray)
        self.params=params
        gradient=[]
        minibatch = np.arange(self.N)
        for i in self.opt_param_names:
            g = self.get_grad(i, minibatch)
            gradient=np.hstack((gradient,g.flatten()))
        # The objective is the negated ELBO, so return the negated ascent gradient.
        return -gradient
    
    def train_by_optimizer(self,batch_size=None):
        
        print ('start to optimize')
        likelihood = self.dggplvm.ELBO(self.X,self.N)
        print ('BEGIN Training, Log Likelihood = %.2f'% likelihood[0])
        #import minimize 
        #opt_results = minimize.run(self._optimizer_f, self._get_hypArray(params),length=number_epoch,verbose=True)
        init=[]
        from scipy.optimize import minimize
        
        init=self._convert_to_array(self.params)
            
        opt_results = minimize(self._optimizer_f, init, method='L-BFGS-B', jac=self._optimizer_g, options={'ftol':0 , 'disp':True, 'maxiter': 500}, tol=0,callback=self.callback)
        optimalHyp = deepcopy(opt_results.x)
        hype=self.unpack(optimalHyp)
        self.params=hype
        
        likelihood = self.dggplvm.ELBO(self.X,self.N)
        print ('END Training, Log Likelihood = %.2f'% likelihood[0])
        
    def callback(self, x):
        # Called by the optimizer; print the ELBO every self.print_interval iterations.
        if self.callback_counter[0]%self.print_interval == 0:
            opt_params = self.unpack(x)
            self.params=opt_params
            cost=self.ELBO(self.X,self.N)
            print ('iter ' + str(self.callback_counter[0]) + ': ' + str(cost[0]) + ' +- ' + str(cost[1]))
        self.callback_counter[0] += 1
    

##################################################################################################################################################
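# The next method alternates two L-BFGS-B runs per outer iteration: first over the local
# variational parameters ('m', 'S_b'), then over the global parameters
# ('Z', 'mu', 'Sigma_b', 'lhyp', 'ls'), i.e. block coordinate ascent on the ELBO.
# A rough sketch of one outer iteration:
#
#   theta_local  <- argmax over the local block  of ELBO(theta_local, theta_global)
#   theta_global <- argmax over the global block of ELBO(theta_local, theta_global)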
                         
    def train_by_optimizer_local_and_global(self,batch_size=None):
        iteration = 0
        max_iteration = 100
        print ('start to optimize')
        likelihood = self.dggplvm.ELBO(self.X,self.N)
        print ('BEGIN Training, Log Likelihood = %.2f'% likelihood[0])
        #import minimize 
        #opt_results = minimize.run(self._optimizer_f, self._get_hypArray(params),length=number_epoch,verbose=True)
        init=[]
        from scipy.optimize import minimize
        start = time.time()
        while iteration < max_iteration:   
            
            init=np.hstack((self.params['m'].flatten(),self.params['S_b'].flatten()))
            
            opt_results = minimize(self.local_optimizer_f, init, method='L-BFGS-B', jac=self.local_optimizer_g, options={'ftol':0 , 'disp':True, 'maxiter': 5000}, tol=0,callback=self.callback_local)
            optimalHyp = deepcopy(opt_results.x)
            hype=self.unpack_local(optimalHyp)
            for param_name in self.opt_local_names:
                self.params[param_name]=hype[param_name]
            
            init=np.hstack((self.params['Z'].flatten(),self.params['mu'].flatten(),self.params['Sigma_b'].flatten(),self.params['lhyp'].flatten(),self.params['ls'].flatten()))
            
            opt_results = minimize(self.global_optimizer_f, init, method='L-BFGS-B', jac=self.global_optimizer_g, options={'ftol':0 , 'disp':True, 'maxiter': 5000}, tol=0,callback=self.callback_global)
            optimalHyp = deepcopy(opt_results.x)
            hype=self.unpack_global(optimalHyp)
            print('finished_local, now iter ' + str(self.callback_counter[0]))
            for param_name in self.opt_global_names:
                self.params[param_name]=hype[param_name]
            
            likelihood = self.dggplvm.ELBO(self.X,self.N)
            print('finished_global, now iter ' + str(self.callback_counter[0]))
            print(iteration)
            iteration += 1
        
        likelihood = self.dggplvm.ELBO(self.X,self.N)
        elapsed_time = time.time() - start
        print(elapsed_time)
        print ('END Training, Log Likelihood = %.2f'% likelihood[0])
        
    def unpack_local(self, x):
        # Slice the flat vector back into one array per local parameter
        # (same convention as unpack, using sizes_local/shapes_local).
        x_param_values = [x[self.sizes_local[i-1]:self.sizes_local[i]].reshape(self.shapes_local[i-1]) for i in range(1,len(self.shapes_local)+1)]
        params = {n:v for (n,v) in zip(self.opt_local_names, x_param_values)}

        return params
    
    def unpack_global(self, x):
        # Slice the flat vector back into one array per global parameter
        # (same convention as unpack, using sizes_global/shapes_global).
        x_param_values = [x[self.sizes_global[i-1]:self.sizes_global[i]].reshape(self.shapes_global[i-1]) for i in range(1,len(self.shapes_global)+1)]
        params = {n:v for (n,v) in zip(self.opt_global_names, x_param_values)}
        if 'lhyp' in params:
            params['lhyp']=params['lhyp'].squeeze()
        
        if 'ls' in params:
            params['ls']=params['ls'].reshape(1) 

        return params
    
    def local_optimizer_f(self, hypInArray):
        params=self.unpack_local(hypInArray)
        for param_name in self.opt_local_names:
            self.params[param_name]=params[param_name]
        cost=self.ELBO(self.X,self.N)
        return -cost[0]
    
    def local_optimizer_g(self, hypInArray):
        params=self.unpack_local(hypInArray)
        for param_name in self.opt_local_names:
            self.params[param_name]=params[param_name]
        gradient=[]
        minibatch = np.arange(self.N)
        for i in self.opt_local_names:
            g = self.get_grad(i, minibatch)
            gradient=np.hstack((gradient,g.flatten()))
        # The objective is the negated ELBO, so return the negated ascent gradient.
        return -gradient
    
    def global_optimizer_f(self, hypInArray):
        params=self.unpack_global(hypInArray)
        for param_name in self.opt_global_names:
            self.params[param_name]=params[param_name]
        cost=self.ELBO(self.X,self.N)
        return -cost[0]
    
    def global_optimizer_g(self, hypInArray):
        params=self.unpack_global(hypInArray)
        for param_name in self.opt_global_names:
            self.params[param_name]=params[param_name]
        gradient=[]
        minibatch = np.arange(self.N)
        for i in self.opt_global_names:
            g = self.get_grad(i, minibatch)
            gradient=np.hstack((gradient,g.flatten()))
        # The objective is the negated ELBO, so return the negated ascent gradient.
        return -gradient
    
    def callback_global(self, x):
        # Called by the optimizer; print the ELBO every self.print_interval iterations.
        if self.callback_counter[0]%self.print_interval == 0:
            opt_params = self.unpack_global(x)
            for param_name in self.opt_global_names:
                self.params[param_name]=opt_params[param_name]
            cost=self.ELBO(self.X,self.N)
            print ('iter ' + str(self.callback_counter[0]) + ': ' + str(cost[0]) + ' +- ' + str(cost[1]))
        self.callback_counter[0] += 1
    
    def callback_local(self, x):
        # Called by the optimizer; print the ELBO every self.print_interval iterations.
        if self.callback_counter[0]%self.print_interval == 0:
            opt_params = self.unpack_local(x)
            for param_name in self.opt_local_names:
                self.params[param_name]=opt_params[param_name]
            cost=self.ELBO(self.X,self.N)
            print ('iter ' + str(self.callback_counter[0]) + ': ' + str(cost[0]) + ' +- ' + str(cost[1]))
        self.callback_counter[0] += 1
                             
#############################for experiment
                     
    def experiment_train_by_optimizer_local_and_global(self,batch_size=None):
        iteration = 0
        max_iteration = 100
        print ('start to optimize')
        likelihood = self.dggplvm.ELBO(self.X,self.N)
        print ('BEGIN Training, Log Likelihood = %.2f'% likelihood[0])
        #import minimize 
        #opt_results = minimize.run(self._optimizer_f, self._get_hypArray(params),length=number_epoch,verbose=True)
        init=[]
        from scipy.optimize import minimize
        
        while iteration < max_iteration:   
            
            init=np.hstack((self.params['m'].flatten(),self.params['S_b'].flatten()))
            
            opt_results = minimize(self.local_optimizer_f, init, method='L-BFGS-B', jac=self.local_optimizer_g, options={'ftol':0 , 'disp':True, 'maxiter': 500}, tol=0,callback=self.callback_local)
            optimalHyp = deepcopy(opt_results.x)
            hype=self.unpack_local(optimalHyp)
            for param_name in self.opt_local_names:
                self.params[param_name]=hype[param_name]
            print('finished_local, now iter ' + str(self.callback_counter[0]))
            test=0
            while test < 20:
                init=np.hstack((self.params['Z'].flatten(),self.params['mu'].flatten(),self.params['Sigma_b'].flatten(),self.params['lhyp'].flatten(),self.params['ls'].flatten()))
            
                opt_results = minimize(self.global_optimizer_f, init, method='L-BFGS-B', jac=self.global_optimizer_g, options={'ftol':1.0e-6 , 'disp':True, 'maxiter': 200}, tol=0,callback=self.callback_global)
                optimalHyp = deepcopy(opt_results.x)
                hype=self.unpack_global(optimalHyp)
                
                for param_name in self.opt_global_names:
                    self.params[param_name]=hype[param_name]
                if self.callback_counter[0]%20 == 0:
                    print('Now_global_iter:' + str(test))
                test +=1

            likelihood = self.dggplvm.ELBO(self.X,self.N)
            print('finished_global, now iter ' + str(self.callback_counter[0]))
            print('finished outer iteration ' + str(iteration))
            iteration += 1
        
        likelihood = self.dggplvm.ELBO(self.X,self.N)
        print ('END Training, Log Likelihood = %.2f'% likelihood[0])