Example #1
import numpy as np

import posterior_sampling


def PCRL(S, A, H, d, L, eps):
    """
    Posterior-sampling RL: each episode, sample one MDP (rewards and
    transitions) from the posterior, compute a policy for it, and play
    one episode. Returns the average reward over blocks of 10 episodes.
    """
    # Make a very simple normal-gamma prior for the rewards
    mu = 0.
    n_mu = 1.
    tau = 1.
    n_tau = 1.
    prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)
    rew = 0
    av_rew = []
    c1 = len(S)
    prior_dir = np.ones(c1)  # uniform Dirichlet prior over next-state counts
    R = {}  # observed rewards, keyed by (t, s, a)
    P = {}  # next-state transition counts, keyed by (t, s, a)
    for l in range(L):
        Rl = {}
        Pl = {}
        for t in range(H):
            for s in S:
                for a in A:
                    if (t, s, a) not in R:
                        R[(t, s, a)] = []
                        P[(t, s, a)] = np.zeros(c1)
                    # If (t, s, a) has never been visited, keep the prior
                    if len(R[(t, s, a)]) == 0:
                        Rpost = prior_ng
                        Ppost = prior_dir
                    else:
                        data = np.array(R[(t, s, a)])
                        counts = P[(t, s, a)]
                        # Conjugate posterior updates from the observed data
                        Rpost = posterior_sampling.update_normal_ig(prior_ng, data)
                        Ppost = posterior_sampling.update_dirichlet(prior_dir, counts)

                    # Sample one reward mean and one transition distribution
                    Rl[(t, s, a)] = posterior_sampling.sample_normal_ig(Rpost)[0]
                    Pl[(t, s, a)] = posterior_sampling.sample_dirichlet(Ppost)

        # Solve the sampled MDP, then play one episode with that policy
        mu = policy(R, P, Rl, Pl, S, A, H)
        rew += play(mu, H, R, P, A, eps)

        # Record the average reward over each block of 10 episodes
        if (l + 1) % 10 == 0:
            av_rew.append(rew / 10.0)
            rew = 0
    return av_rew
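
For orientation, here is a hypothetical call sketch. The `policy` and `play` helpers and the `posterior_sampling` module come from the surrounding project and are not shown here, so the state/action encoding and the values of `d` and `eps` below are placeholder assumptions, not tested settings.

# Hypothetical usage sketch: integer-coded states and actions,
# horizon 5, 100 episodes. `d` and `eps` are passed through to the
# project's policy/play helpers, whose exact semantics are assumed.
S = [0, 1]
A = [0, 1]
av_rew = PCRL(S, A, H=5, d=1, L=100, eps=0.1)
print(av_rew)  # one entry per block of 10 episodes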
Example #2

import numpy as np
import posterior_sampling

#-----------------------------------------------------------------------
# Updating rewards (normal gamma)

# Make a very simple prior
mu = 0.
n_mu = 1.
tau = 1.
n_tau = 1.

# Convert it to our nice format
prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)

# Generate some real data
real_mu = 1.
real_prec = 4.
n_data = 100

# Draw n_data points from the true normal (std dev = 1 / sqrt(precision))
data = np.random.normal(real_mu, np.sqrt(1. / real_prec), size=n_data)

print('True Normal distribution: ' + str((real_mu, real_prec)) + '\n')

# Sampled data from the posterior
posterior_ng = posterior_sampling.update_normal_ig(prior_ng, data)
n_samp = 10
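
The snippet cuts off here. A plausible continuation, assuming `sample_normal_ig` returns an indexable `(mean, precision)` pair as the other examples' use of `[0]` suggests, would draw `n_samp` samples from the updated posterior and print them:

# Hedged continuation (not from the original snippet): draw n_samp
# (mean, precision) samples from the updated normal-gamma posterior.
for i in range(n_samp):
    sample = posterior_sampling.sample_normal_ig(posterior_ng)
    print('Posterior sample ' + str(i) + ': ' + str(sample))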
Example #3
import numpy as np

import posterior_sampling


def PSRL(S, A, H, L):
    """
    Runs posterior sampling for reinforcement learning (PSRL) for L episodes.

    IN
    S: list
        States
    A: list
        Actions
    H: int
        Episode horizon (number of time steps per episode)
    L: int
        Number of episodes
    OUT
    av_rew: list
        Average reward over each successive block of 10 episodes.
    """
    # Make a very simple normal-gamma prior for the rewards
    mu = 0.
    n_mu = 1.
    tau = 1.
    n_tau = 1.
    prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)

    c1 = len(S)
    prior_dir = np.ones(c1)  # uniform Dirichlet prior over next-state counts
    R = {}  # observed rewards, keyed by (t, s, a)
    P = {}  # next-state transition counts, keyed by (t, s, a)
    av_rew = []
    rew = 0
    for l in range(L):
        Rl = {}
        Pl = {}
        for t in range(H):
            for s in S:
                for a in A:
                    if (t, s, a) not in R:
                        R[(t, s, a)] = []
                        P[(t, s, a)] = np.zeros(c1)
                    # If (t, s, a) has never been visited, keep the prior
                    if len(R[(t, s, a)]) == 0:
                        Rpost = prior_ng
                        Ppost = prior_dir
                    else:
                        data = np.array(R[(t, s, a)])
                        counts = P[(t, s, a)]
                        # Conjugate posterior updates from the observed data
                        Rpost = posterior_sampling.update_normal_ig(prior_ng, data)
                        Ppost = posterior_sampling.update_dirichlet(prior_dir, counts)
                    # Posterior sampling: draw one MDP from the posterior
                    Rl[(t, s, a)] = posterior_sampling.sample_normal_ig(Rpost)[0]
                    Pl[(t, s, a)] = posterior_sampling.sample_dirichlet(Ppost)
        # Compute the optimal policy for the sampled MDP
        mu = policy(R, P, Rl, Pl, S, A, H)
        # Play one episode with that policy
        rew += play(mu, H, R, P)
        
        # Report and record the average reward per block of 10 episodes
        if (l + 1) % 10 == 0:
            print(rew / 10.0)
            av_rew.append(rew / 10.0)
            rew = 0
    return av_rew
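
The `posterior_sampling` module these examples depend on is not shown. As a point of reference only, here is a minimal sketch of what its routines could look like, built from the standard normal-gamma and Dirichlet conjugacy formulas; the hyperparameter packing in `convert_prior` and the return layout of `sample_normal_ig` are assumptions inferred from the call sites above, not the project's actual API.

import numpy as np

# Hypothetical stand-ins for the real posterior_sampling module.
# Assumed prior format: (mu0, n_mu, alpha, beta) for the normal-gamma.

def convert_prior(mu, n_mu, tau, n_tau):
    # One common parameterization (assumed here): treat tau as the prior
    # mean of the precision, observed with n_tau pseudo-observations, so
    # alpha = n_tau / 2 and beta = n_tau / (2 * tau).
    return (mu, n_mu, n_tau / 2.0, n_tau / (2.0 * tau))

def update_normal_ig(prior, data):
    # Standard normal-gamma conjugate update for normally distributed data.
    mu0, n0, alpha0, beta0 = prior
    n = len(data)
    xbar = np.mean(data)
    mu_n = (n0 * mu0 + n * xbar) / (n0 + n)
    n_n = n0 + n
    alpha_n = alpha0 + n / 2.0
    beta_n = (beta0 + 0.5 * np.sum((data - xbar) ** 2)
              + n0 * n * (xbar - mu0) ** 2 / (2.0 * (n0 + n)))
    return (mu_n, n_n, alpha_n, beta_n)

def sample_normal_ig(post):
    # Draw the precision from a Gamma (numpy takes scale = 1 / rate),
    # then the mean given that precision.
    mu_n, n_n, alpha_n, beta_n = post
    tau = np.random.gamma(alpha_n, 1.0 / beta_n)
    mu = np.random.normal(mu_n, np.sqrt(1.0 / (n_n * tau)))
    return (mu, tau)

def update_dirichlet(prior, counts):
    # Dirichlet-multinomial conjugacy: add observed counts to the prior.
    return prior + counts

def sample_dirichlet(post):
    # Draw one transition distribution from the Dirichlet posterior.
    return np.random.dirichlet(post)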