Example #1
def theano_f_df(theta, energy, dE_dtheta):
    n_step = T.iscalar('n_step')
    stepsizes = T.vector('stepsizes')
    initial_pos = T.matrix('initial_pos') #parameter for the representative samples     
    initial_vel = T.matrix('initial_vel')
    training = T.matrix('training')
    # theta = T.vector('theta')

    # params has two parts: the parameters of the energy-based model and the representative samples
    params = [initial_pos, theta]
    # do one-step HMC sampling
    [accept,accept1, final_pos, final_pos1, ndeltaH] = HMC.hmc_move(initial_vel, initial_pos, energy, stepsizes,n_step)
    initial_pos_vec = T.tile(initial_pos, [final_pos.shape[0],1])
    accept_flatten = accept.flatten()
    final_pos_flatten = T.reshape(final_pos, (final_pos.shape[0]*final_pos.shape[1],final_pos.shape[2]))
    # get sampler_cost
    sampler_cost = dE_dtheta(initial_pos_vec, accept_flatten) - dE_dtheta(final_pos_flatten, accept_flatten)
    #sampler_cost = dE_dtheta(initial_pos_vec, accept_flatten)
    sampler_cost = 1./(final_pos.shape[0]) * sampler_cost
    sampler_cost = T.mean(sampler_cost**2)
    # get param_cost
    param_cost = dE_dtheta(training) - dE_dtheta(initial_pos)
    param_cost = (final_pos.shape[1])*T.mean(param_cost**2)
        
    total_cost = param_cost + sampler_cost
    costs = [param_cost, sampler_cost]
    gparams = []
    for param in params:
        gparam = T.grad(total_cost, param)
        gparams.append(gparam)
    
    f_df=theano.function(params+[training, initial_vel,stepsizes,n_step], costs+gparams, name='func_f_df', allow_input_downcast=True)
    return f_df
def theano_funcs(theta,energy, dE_dtheta):
    n_step = T.iscalar('n_step')
    stepsizes = T.vector('stepsizes')
    initial_pos = T.matrix('initial_pos') #parameter for the representative samples     
    initial_vel = T.matrix('initial_vel')
    training = T.matrix('training')
     
    # params has two parts: the parameters of the energy-based model and the representative samples
    params = [initial_pos, theta]
    # do one-step HMC sampling
    [accept, initial_pos_vec, final_pos_vec, ndeltaH, final_pos] = HMC.hmc_move(initial_vel, initial_pos, energy, stepsizes,n_step)
    accept_flatten = accept.flatten()
    # get sampler_cost
    sampler_cost = dE_dtheta(initial_pos_vec, accept_flatten) - dE_dtheta(final_pos_vec, accept_flatten)
    #sampler_cost = dE_dtheta(initial_pos_vec, accept_flatten)
    #sampler_cost = 1./(final_pos.shape[0]) * sampler_cost
    sampler_cost = 1./(final_pos_vec.shape[0]) * sampler_cost
    sampler_cost = T.sum(sampler_cost**2)
    # get param_cost
    param_cost = dE_dtheta(training) - dE_dtheta(initial_pos)
    param_cost = T.sum(param_cost**2)

    # DEBUG counteract scaling of theta
    sampler_cost = sampler_cost * training.shape[0]
    param_cost   = param_cost * training.shape[0]

    total_cost = param_cost + sampler_cost
    report_scalars = [total_cost, param_cost, sampler_cost, T.mean(accept)]
    gparams = []
    for param in params:
        gparam = T.grad(total_cost, param)
        gparams.append(gparam)
    
    f_df=theano.function(params + [training, initial_vel,stepsizes,n_step], report_scalars+gparams, name='func_f_df', allow_input_downcast=True)
    f_samples = theano.function(params + [initial_vel, stepsizes, n_step], [initial_pos_vec, final_pos_vec, final_pos], name='func_samples', allow_input_downcast=True)
    
    return f_df, f_samples
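# hypothetical wiring sketch for theano_funcs (disabled): the names, shapes, and the
# gaussian energy here are illustrative assumptions; dE_dtheta follows the same
# contract as dE_dtheta1 defined below
"""
n_dim = 2
theta = theano.shared(numpy.zeros(n_dim, dtype=theano.config.floatX), name='theta')
def energy(x):
    # isotropic gaussian energy with mean theta (illustrative choice)
    return 0.5 * ((x - theta)**2).sum(axis=1)
def dE_dtheta(x, acpt=None):
    if acpt is None:
        return T.grad(T.mean(energy(x)), theta, consider_constant=[x])
    return T.grad(T.sum(acpt*energy(x)), theta, consider_constant=[acpt, x])
f_df, f_samples = theano_funcs(theta, energy, dE_dtheta)
"""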
"""
dE_dtheta1(x, acpt=None): gradient of the energy w.r.t. theta, treating x as constant.
    if acpt is None: return d/dtheta of mean_i E(x_i).        (used for param_cost)
    otherwise:       return \sum_i acpt_i * dE_i/dtheta.      (used for sampler_cost)
"""
def dE_dtheta1(x, acpt=None):
    if acpt is None:
       return T.grad(T.mean(gaussian_energy(x)), theta, consider_constant=[x])
    else:
       return T.grad(T.sum(acpt*gaussian_energy(x)), theta, consider_constant=[acpt, x])

observe = T.matrix('observe')       
initial_vel = T.matrix('initial_vel')     
#n_steps=30
n_step = T.iscalar('n_step')
stepsizes = T.vector('stepsizes')

# do HMC sampling
[accept,accept1, final_pos_new, final_pos_new1, ndeltaH] = hmc_sampling.hmc_move(initial_vel, initial_pos, gaussian_energy, stepsizes,n_step)

"""
reshape initial_pos, accept and final_pos_new
initial_pos : (n_sample, n_dim) ----> (n_steps*n_sample, n_dim); n_steps is the number of different leapfrog samplers we consider
accept:       (n_steps, n_samples) ----> (n_steps*n_samples, )
final_pos_new: (n_steps, n_sample, n_dim) ---> (n_steps*n_sample, n_dim)
"""
initial_pos_vec = T.tile(initial_pos, [final_pos_new.shape[0],1])
accept_flatten = accept.flatten()
final_pos_new_flatten = T.reshape(final_pos_new, (final_pos_new.shape[0]*final_pos_new.shape[1],final_pos_new.shape[2]))
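# a small numpy sketch (disabled) of the shape bookkeeping above, with made-up sizes
"""
n_steps, n_sample, n_dim = 3, 4, 2
pos = numpy.zeros((n_sample, n_dim))
acc = numpy.zeros((n_steps, n_sample))
fin = numpy.zeros((n_steps, n_sample, n_dim))
print numpy.tile(pos, [n_steps, 1]).shape           # (12, 2) = (n_steps*n_sample, n_dim)
print acc.flatten().shape                           # (12,)   = (n_steps*n_sample,)
print fin.reshape((n_steps*n_sample, n_dim)).shape  # (12, 2) = (n_steps*n_sample, n_dim)
"""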

"""
define the sampler_cost
"""
sampler_cost = dE_dtheta1(initial_pos_vec, accept_flatten) - dE_dtheta1(final_pos_new_flatten, accept_flatten)
def theano_funcs(energy, stats_dict):
    n_step = T.iscalar('n_step')
    stepsizes = T.vector('stepsizes')
    initial_pos = T.matrix('initial_pos') #parameter for the representative samples     
    initial_vel = T.matrix('initial_vel')
   
    sampler_cost = 0.

    # do one-step HMC sampling
    [accept, initial_pos_vec, final_pos_vec, ndeltaH, final_pos] = HMC.hmc_move(initial_vel, initial_pos, energy, stepsizes,n_step)
    nsamp = accept.shape[0].astype(theano.config.floatX)
    accept_matrix = accept.dimshuffle(0,'x')
    for stat in stats_dict.itervalues():
        initial_stat = stat(initial_pos_vec, T.ones_like(accept_matrix)/nsamp)
        final_stat = stat(
            T.concatenate((initial_pos_vec, final_pos_vec), axis=0),
            T.concatenate(
                (T.ones_like(accept_matrix)-accept_matrix, accept_matrix),
                axis=0)/nsamp,
            )
        sampler_cost = sampler_cost + T.sum((final_stat - initial_stat)**2)


    ## DEBUG
    #[accept, initial_pos_vec, final_pos_vec, ndeltaH, final_pos] = HMC.hmc_move(-initial_vel, initial_pos, energy, stepsizes,n_step)
    #nsamp = accept.shape[0].astype(theano.config.floatX)
    #accept_matrix = accept.dimshuffle(0,'x')
    #for stat in stats_dict.itervalues():
    #    initial_stat = stat(initial_pos_vec, T.ones_like(accept_matrix)/nsamp)
    #    final_stat = stat(
    #        T.concatenate((initial_pos_vec, final_pos_vec), axis=0),
    #        T.concatenate(
    #            (T.ones_like(accept_matrix)-accept_matrix, accept_matrix),
    #            axis=0)/nsamp,
    #        )
    #    sampler_cost = sampler_cost + T.sum((final_stat - initial_stat)**2)

        # difference = stat(initial_pos_vec) - stat(final_pos_vec)
        # weighted_difference = T.mean(accept_matrix*difference, axis=0)
        # sampler_cost = sampler_cost + T.sum(weighted_difference**2)

    # we want the gradient per-sample to stay large -- so scale by the number of samples!
    # this is # initial conditions * #steps
    sampler_cost *= nsamp

    ## and actually, let's make it really large -- see if this helps convergence
    #sampler_cost *= 1e10

    total_cost = sampler_cost   
    costs = [total_cost]
    gparams = [T.grad(total_cost, initial_pos)]
      
    f_df      = theano.function([initial_pos,initial_vel,stepsizes,n_step], costs+gparams, name='func_f_df', allow_input_downcast=True)
    f_samples = theano.function([initial_pos,initial_vel,stepsizes,n_step], [initial_pos_vec, final_pos_vec, final_pos], name='func_samples', allow_input_downcast=True)
    return f_df, f_samples
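# for reference, a hypothetical stats_dict for the theano_funcs above (disabled): each
# entry maps (positions, per-row weights) to a weighted statistic; the names and the
# choice of statistics here are illustrative assumptions, not from the original code
"""
stats_dict = {
    'mean':          lambda x, w: T.sum(w * x, axis=0),
    'second_moment': lambda x, w: T.sum(w * (x**2), axis=0),
}
f_df, f_samples = theano_funcs(gaussian_energy, stats_dict)
"""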
def test_hmc(n_sample, n_dim, mu,cov,cov_inv,rng,seed):
    """
    parameters:
    (maybe just input a gaussian energy function instead of mu and cov_inv)
    ------------
    n_sample : number of samples.
    n_dim    : number of dimensions.
    mu       : the ground-truth mean.
    cov_inv  : the inverse of the ground-truth covariance.
    rng      : random number generator for the initial value x0 of the optimization process.
    seed     : seed for the random momentum generator (for initial_vel).
    """
    #make the params a shared theano variable; each row is a sample.
    params = theano.shared(value=numpy.zeros(n_sample*n_dim, dtype=theano.config.floatX), name='params', borrow=True)
    initial_pos = params[0:n_sample*n_dim].reshape((n_sample, n_dim))
    
    #return the gaussian energy. 
    def gaussian_energy(x):
        return 0.5 * (T.dot((x - mu), cov_inv) *
                      (x - mu)).sum(axis=1)
    
   
    """
    initial_vel: random initial momentum
    n_step:      # of leapfrog steps
    stepsizes:   stepsizes for different particles.
    """
    initial_vel = T.matrix('initial_vel')    
    n_step = T.iscalar('n_step')
    stepsizes = T.vector('stepsizes')
    
    """
    return value of hmc_move:
    1st:  return accept prob. associated with all trajectories.
    2nd:  return accept prob. associated with only end trajectory.
    3rd:  return final positions associated with all trajectories.
    4th:  return final positions associated with only end trajectory.
    5th:  return difference of Hamiltonian energy
    """
    #_,accept,_, final_pos= hmc_sampling.hmc_move(initial_vel,initial_pos, gaussian_energy, 0.1,50)
    accept,_,final_pos,_,ndeltaH= hmc_sampling.hmc_move(initial_vel, initial_pos, gaussian_energy, stepsizes, n_step)
    accept_matrix = accept.dimshuffle(0,1, 'x')
    
    """
    define the objective function:
    uncomment the appropriate line and then update total_cost1 to match the desired statistic.
    Here, we use summation instead of mean to avoid large sample fluctuations.
    """
    #sampler_cost_first = accept_matrix*(initial_pos-final_pos)
    #sampler_cost_second = accept_matrix*(initial_pos**2-final_pos**2)
    #sampler_cost_third = accept_matrix*(initial_pos**3-final_pos**3)
    #sampler_cost_sin = accept_matrix*(T.sin(initial_pos)-T.sin(final_pos))
    sampler_cost_exp = accept_matrix*(T.exp(initial_pos)-T.exp(final_pos))
    total_cost1 = sampler_cost_exp
    
    """
    1): if we average samples along all the trajectories, use the first two lines.
    2): if we penalize each sampler along the trajectories, uncomment and use the last two lines
    
    """
    total_cost1 = T.mean(total_cost1, axis=0)
    total_cost1 = T.sum(total_cost1, axis=0)
    #total_cost = (total_cost**2).sum(axis=1)
    #total_cost = T.mean(total_cost)
    """
    total_cost2: matches the average Hamiltonian energy. If you don't want to match the
                 average Hamiltonian energy, just ignore total_cost2 by deleting the
                 corresponding term from total_cost.
    """    
    total_cost2 = T.mean(accept*ndeltaH, axis=0)
    total_cost2 = T.sum(total_cost2)
    
    total_cost = T.mean(total_cost1**2)+total_cost2**2
    
    func_eval = theano.function([initial_vel,stepsizes,n_step], total_cost, name='func_eval', allow_input_downcast=True)
    func_grad = theano.function([initial_vel,stepsizes,n_step], T.grad(total_cost, params), name='func_grad', allow_input_downcast=True)
  
    """
    set up stepsizes for different particles. 
    """
    rng_stepsize = numpy.random.RandomState(353)
    random_stepsizes = numpy.array(rng_stepsize.rand(n_sample), dtype=theano.config.floatX)
    random_interval = 1.5*random_stepsizes-1
    stepsize_baseline = 0.2
    noise_level = 2
    stepsizes0 = stepsize_baseline*noise_level**random_interval
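    # with these constants, random_interval lies in [-1, 0.5), so stepsizes0 spans
    # roughly [0.2*2**-1, 0.2*2**0.5) = [0.1, ~0.28)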
    
    """
    build the evaluation function and gradient function for scipy optimize
    """
    def train_fn(param_new):
        params.set_value(param_new.astype(theano.config.floatX), borrow=True)
        rng_temp = numpy.random.RandomState(seed)
        initial_v=numpy.array(rng_temp.randn(n_sample,n_dim), dtype=theano.config.floatX)
        res = func_eval(initial_v,stepsizes0,30)
        return res
        
    def train_fn_grad(param_new):
        params.set_value(param_new.astype(theano.config.floatX), borrow=True)
        rng_temp = numpy.random.RandomState(seed)
        initial_v=numpy.array(rng_temp.randn(n_sample,n_dim), dtype=theano.config.floatX)
        res = func_grad(initial_v,stepsizes0,30)
        return res
    n_epoch = 5000 

    best_samples_params = scipy.optimize.fmin_l_bfgs_b(func = train_fn, 
                                        x0= numpy.array(rng.randn(n_sample*n_dim), dtype=theano.config.floatX),
                                        #x0 = numpy.array((1.0/cov_inv)*rng.randn(n_sample*n_dim)+mu, dtype=theano.config.floatX),
                                        #x0 = samples_true.flatten(),
                                        fprime = train_fn_grad,
                                        maxiter = n_epoch)
    #res = best_samples_params.reshape((n_sample,n_dim))
    res = (best_samples_params[0]).reshape((n_sample, n_dim))
    
    
    """
    uncomment the proper one to print estimated value for different number of samples.
    """    
   # print "estimated mean from representative sample= ", res.mean(axis=0) 
   # print "true mu= ", mu   
   # print "estimated second moment from representative samples= ", (res**2).mean(axis=0)
   # print "true second moment= ", mu**2 + 1./cov_inv
   # print "estimated third moment from representative samples= ", (res**3).mean(axis=0)
   # print "estimated sin(x) from representative samples= ", (numpy.sin(res)).mean(axis=0)
    print "estimated exp(x) from representative samples= ", (numpy.exp(res)).mean(axis=0)    
  
    """
    uncomment the proper one to return estimated value for different number of samples
    """
    #return res.mean(axis=0)
    #return (res**2).mean(axis=0)
    #return (res**3).mean(axis=0)
    return (numpy.exp(res)).mean(axis=0)
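# hypothetical invocation sketch for test_hmc (disabled): the values below are
# illustrative assumptions, not from the original code
"""
rng = numpy.random.RandomState(0)
mu = numpy.array([1., 2.], dtype=theano.config.floatX)
cov = numpy.array([[0.8, 0.], [0., 0.6]], dtype=theano.config.floatX)
est_exp = test_hmc(n_sample=500, n_dim=2, mu=mu, cov=cov,
                   cov_inv=linalg.inv(cov), rng=rng, seed=123)
"""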
def test_hmc_all(n_sample=1000, n_dim=2):
    #make the params a shared theano variable; each row is a sample.
    params = theano.shared(value=numpy.zeros(n_sample*n_dim, dtype=theano.config.floatX), name='params', borrow=True)
    initial_pos = params[0:n_sample*n_dim].reshape((n_sample, n_dim))
    
    rng = numpy.random.RandomState(123)
    mu = numpy.array(rng.rand(n_dim)*5, dtype=theano.config.floatX)
    # rng1 = numpy.random.RandomState(444)
    # cov = numpy.eye(n_dim, dtype=theano.config.floatX)*rng1.rand(1)
    # cov = numpy.array([[1.,0.95],[0.95, 1.]], dtype=theano.config.floatX)
    # cov = numpy.array(rng1.rand(n_dim,n_dim), dtype=theano.config.floatX)
    # cov = (cov+cov.T)/2.
    # cov[numpy.arange(n_dim), numpy.arange(n_dim)] = 1.0
    cov = numpy.array([[0.8, 0.], [0., 0.6]], dtype=theano.config.floatX)
    cov_inv = linalg.inv(cov)
    #cov_inv = 1./(cov)
    #cov_inv = numpy.eye(n_dim, dtype=theano.config.floatX)
    print "begin process..."
    #return the gaussian energy. 
    def gaussian_energy(x):
        return 0.5 * (T.dot((x - mu), cov_inv) *
                      (x - mu)).sum(axis=1)
    
    """
    next, we draw the 2D gaussian contour and define the color map for the different sets of points along the optimization process
    """
               
    def gaussian_2d(x,y,mu, cov_inv):
        var_x = cov_inv[0,0]
        var_y = cov_inv[1,1]
        cov_xy = cov_inv[0,1]
        log_density = 0.5* (var_x*(x-mu[0])**2+var_y*(y-mu[1])**2+ 2.0 * cov_xy *(x-mu[0])*(y-mu[1]))
        return numpy.exp(-log_density)
    # draw an arrow connecting A and B, mainly used to show the trajectory of each point
    def drawArrow(A, B):
        plt.arrow(A[0], A[1], B[0] - A[0], B[1] - A[1],
              head_width=0.05, width = 0.001, length_includes_head=True)
              
    delta = 0.025
    plt.clf()
    gaussian_x = numpy.arange(-5.0, 10.0, delta)
    gaussian_y = numpy.arange(-5.0, 10.0, delta)
    mesh_X, mesh_Y = numpy.meshgrid(gaussian_x, gaussian_y)
    mesh_Z = gaussian_2d(mesh_X, mesh_Y, mu, cov_inv)
    gaussian_Contour =plt.contour(mesh_X,mesh_Y, mesh_Z, 20)
    #plt.clabel(gaussian_Contour, inline=1)
    #color_map = iter(cm.rainbow(numpy.linspace(0, 1, 20)))
    color_map = iter(['b','r', 'k', 'g', 'c', 'm', 'y'])
    
    initial_vel = T.matrix('initial_vel')     
    n_step = T.iscalar('n_step')
    stepsizes = T.vector('stepsizes')
    
    [accept, accept1,final_pos_new, final_pos_new1, ndeltaH] = hmc_sampling.hmc_move(initial_vel, initial_pos, gaussian_energy, stepsizes,n_step)
   
    accept_matrix = accept.dimshuffle(0,1, 'x')
    # average over the leapfrog-step dimension (axis 0): after this first reduction we
    # get a 2D matrix of shape (n_sample, n_dim), which is then summed over the samples
    
    sampler_cost_first = accept_matrix*(initial_pos-final_pos_new)
    #sampler_cost_second = accept_matrix*(initial_pos**2-final_pos_new**2)
    #sampler_cost_third = accept_matrix*(initial_pos**3-final_pos_new**3)
    #sampler_cost_sin = accept_matrix*(T.sin(initial_pos)-T.sin(final_pos_new))
    #sampler_cost_exp = accept_matrix*(T.exp(initial_pos)-T.exp(final_pos_new))
    total_cost1 = sampler_cost_first
    
    total_cost1 = T.mean(total_cost1, axis=0)
    total_cost1 = T.sum(total_cost1, axis=0)
    
    total_cost2 = T.mean(accept*ndeltaH, axis=0)
    total_cost2 = T.sum(total_cost2)
    
    total_cost = T.mean(total_cost1**2)+total_cost2**2
   
    start_time = timeit.default_timer()
    func_eval = theano.function([initial_vel,stepsizes,n_step], total_cost, name='func_eval', allow_input_downcast=True)
    func_grad = theano.function([initial_vel,stepsizes,n_step], T.grad(total_cost, params), name='func_grad', allow_input_downcast=True)
    end_time = timeit.default_timer()
    print "compiling time= ", end_time-start_time
   
    """
    define the varied stepsizes; if you want to use a fixed one, use the following lines instead
    stepsizes_center=numpy.ones((n_sample,), dtype=theano.config.floatX)*0.25
    stepsizes0  = stepsizes_center
    """
    
    rng_stepsize = numpy.random.RandomState(353)
    random_stepsizes = numpy.array(rng_stepsize.rand(n_sample), dtype=theano.config.floatX)
    random_interval = 1.5*random_stepsizes-1
    stepsize_baseline = 0.2
    noise_level = 2
    stepsizes0 = stepsize_baseline*noise_level**random_interval 
    
    
    def train_fn(param_new):
        """
        draw the obtained points on the contour of the previous 2D gaussian
        """     
        params.set_value(param_new.astype(theano.config.floatX), borrow=True)
        rng_temp = numpy.random.RandomState(1234)
        initial_v=numpy.array(rng_temp.randn(n_sample,n_dim), dtype=theano.config.floatX)
        res = func_eval(initial_v, stepsizes0,30)
        print "cost= ", res
        return res
        
    def train_fn_grad(param_new):
        params.set_value(param_new.astype(theano.config.floatX), borrow=True)
        rng_temp = numpy.random.RandomState(1234)
        initial_v=numpy.array(rng_temp.randn(n_sample,n_dim), dtype=theano.config.floatX)
        res = func_grad(initial_v,stepsizes0,30)
        return res
        
    n_epoch = 5000
    
    samples_sd_Normal = numpy.array(rng.randn(n_sample, n_dim), dtype=theano.config.floatX)
    samples_true = (linalg.sqrtm(cov).dot(samples_sd_Normal.T)).T + mu    #samples_true : nsamples*ndim
    
    #initial_draw = samples_true
    initial_points = numpy.array(rng.randn(n_sample*n_dim), dtype=theano.config.floatX)
    """
    draw the initial points onto the 2D gaussian contour
    """
    initial_draw = initial_points.reshape(n_sample, n_dim)
    color_current = next(color_map)
    plt.scatter(initial_draw[:,0], initial_draw[:,1], s=2, color = color_current)
    
   
    best_samples_params = scipy.optimize.fmin_l_bfgs_b(func = train_fn, 
                                        x0= initial_points,
                                        #x0 = samples_true.flatten(),
                                        fprime = train_fn_grad,
                                        maxiter = n_epoch)
   
    res = (best_samples_params[0]).reshape((n_sample, n_dim))
    """
    draw the optimized points onto the 2D gaussian contour
    """
    color_final = next(color_map)
    plt.scatter(res[:,0], res[:,1],s=2, color=color_final )
    
    """
    we can also plot the set of points obtained by running one more HMC step from the optimized
    points; in this case, the initial positions are the optimized points we just found. (the
    following block is left disabled as a string; enable it to plot the intermediate step)
    func_final_pos: compiled function to get the new positions from the optimized points; returns a 3D tensor [n_steps]*[n_samples]*[n_dim]
    pos_final_step: extracts the new points for the specified number of LF steps ([n_samples]*[n_dim])
    """
    """
    func_final_pos =theano.function([initial_vel, stepsizes, n_step], final_pos_new, name='final_pos', allow_input_downcast=True)
    rng_temp = numpy.random.RandomState(1234)
    initial_v_final = numpy.array(rng_temp.randn(n_sample,n_dim), dtype=theano.config.floatX)
    params.set_value(best_samples_params[0].astype(theano.config.floatX))
    
    pos_final = func_final_pos(initial_v_final, stepsizes0, 30)
    #next get the positions after running the sampler which has 2 steps of LF. pos_final[k,:] represents the sampler which has k+1 LF steps,
    # 0<=k<29 because here we consider LF steps up to 30, i.e., 2, 3, 4, ..., 30
    pos_final_step = pos_final[0,:]  
    plt.scatter(pos_final_step[:,0], pos_final_step[:,1], s=2, color='k')
    """
    # for initialP, finalP in zip(initial_draw, res):
    #    drawArrow(initialP, finalP)
    """
    draw the ground-truth points sampled from the true underlying distribution
    """
    plt.scatter(samples_true[:,0], samples_true[:,1], s=2, color='k')
    
    """
    get the results for the representative samples and for the independent samples.
    """
    print "estimated mean from representative sample= ", res.mean(axis=0)
    print "true mean= ", mu
    #print "estimated second moment from representive sample= ", (res**2).mean(axis=0)
    #print "estimated third moment from representative samples= ", (res**3).mean(axis=0)
    #print "estimated sin(x) from representative samples= ", (numpy.sin(res)).mean(axis=0)
    #print "estimated exp(x) from representative samples= ", (numpy.exp(res)).mean(axis=0)
    #print "true second moment= ", numpy.outer(mu, mu)+cov
    independent_samples = samples_true
    #independent_samples = cov*numpy.array(rng.randn(n_sample,n_dim), dtype=theano.config.floatX) + mu
    print "estimated mean from independent samples= ", independent_samples.mean(axis=0)