Example #1
import numpy as np

import posterior_sampling


def PCRL(S, A, H, d, L, eps):
    """Run posterior-sampling RL for L episodes and return the average
    reward over each consecutive block of 10 episodes."""
    # Make a very simple Normal-Inverse-Gamma prior for the rewards
    mu = 0.
    n_mu = 1.
    tau = 1.
    n_tau = 1.
    prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)
    rew = 0
    av_rew = []
    # Symmetric Dirichlet prior over next states for the transitions
    c1 = len(S)
    prior_dir = np.ones(c1)
    R = {}  # observed rewards, keyed by (t, s, a)
    P = {}  # next-state visit counts, keyed by (t, s, a)
    time = range(H)
    for l in range(L):
        # Sampled reward means and transition vectors for this episode
        Rl = {}
        Pl = {}
        for t in time:
            for s in S:
                for a in A:
                    if (t, s, a) not in R:
                        R[(t, s, a)] = []
                        P[(t, s, a)] = np.zeros(c1, dtype=int)
                    if len(R[(t, s, a)]) == 0:
                        Rpost = prior_ng
                        Ppost = prior_dir
                    else:
                        data = np.array(R[(t, s, a)])
                        counts = P[(t, s, a)]
                        Rpost = posterior_sampling.update_normal_ig(
                            prior_ng, data)
                        Ppost = posterior_sampling.update_dirichlet(
                            prior_dir, counts)

                    # Posterior sampling for this (t, s, a)
                    Rl[(t, s, a)] = posterior_sampling.sample_normal_ig(Rpost)[0]
                    Pl[(t, s, a)] = posterior_sampling.sample_dirichlet(Ppost)

        # Optimal policy for the sampled MDP, then play one episode with it
        mu = policy(R, P, Rl, Pl, S, A, H)
        rew += play(mu, H, R, P, A, eps)

        # Record the average reward over each block of 10 episodes
        if (l + 1) % 10 == 0:
            av_rew.append(rew / 10.0)
            rew = 0
    return av_rew
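
The policy, play, and posterior_sampling helpers used above come from the surrounding project and are not shown here. As a rough guide, a minimal, hypothetical driver for PCRL might look like the sketch below; the state and action sets and the values of d, L, and eps are purely illustrative, and d is accepted but not used in the excerpt shown.

# Hypothetical driver for PCRL; all values below are illustrative.
S = list(range(5))   # five states
A = [0, 1]           # two actions
H = 5                # horizon: time steps per episode
d = 1                # extra problem parameter, unused in the excerpt above
L = 200              # number of episodes
eps = 0.0            # exploration/noise parameter consumed by play()
av_rew = PCRL(S, A, H, d, L, eps)
print('Average reward per block of 10 episodes: ' + str(av_rew))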
Example #2
# Sample reward models from the Normal-Inverse-Gamma posterior
n_samp = 10
for i in range(n_samp):
    sample_norm = posterior_sampling.sample_normal_ig(posterior_ng)
    print('Sampled Normal distribution: ' + str(sample_norm))

print('\n \n ')

#---------------------------------------------------------------------
# Updating transitions

# Make a very simple prior
n_state = 5
prior_dir = np.ones(n_state)

# Imagine we have observed the following
p_true = np.random.gamma(shape=1, size=n_state)
p_true = p_true / np.sum(p_true)
n_data = 100
counts = np.random.multinomial(n_data, p_true)

print('True multinomial distribution: ' + str(p_true) + '\n')

# Update the Dirichlet posterior with the observed counts and sample from it
posterior_dir = posterior_sampling.update_dirichlet(prior_dir, counts)
n_samp = 10
for i in range(n_samp):
    sample_mult = posterior_sampling.sample_dirichlet(posterior_dir)
    print('Sampled multinomial distribution: ' + str(sample_mult))

print('\n')
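
The excerpt above starts partway through a demo script: posterior_ng is assumed to have been produced earlier by updating a Normal-Inverse-Gamma prior with observed reward data, mirroring the Dirichlet workflow shown for the transitions. A minimal sketch of that missing setup, using the same posterior_sampling calls (the simulated rewards are purely illustrative), could look like:

import numpy as np

import posterior_sampling

# Make a very simple Normal-Inverse-Gamma prior, as in the PCRL example
mu, n_mu, tau, n_tau = 0., 1., 1., 1.
prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)

# Imagine we have observed some rewards (illustrative simulated data)
data = np.random.normal(loc=1.0, scale=1.0, size=100)
posterior_ng = posterior_sampling.update_normal_ig(prior_ng, data)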
Example #3
import numpy as np

import posterior_sampling


def PSRL(S, A, H, L):
    """
    Computes the number of episodes it takes for PSRL to experience a positive 
    reward.
    
    IN
    S: list
        States
    A: list
        Actions
    H: int
        Number of states and time frame
    L: int
        Number of episodes
    OUT 
    success:  int
        Number of episodes before UCRL experiences a positive reward.
    
    """
    # Make a very simple prior
    mu = 0.
    n_mu = 1.
    tau = 1.
    n_tau = 1.
    prior_ng = posterior_sampling.convert_prior(mu, n_mu, tau, n_tau)
    
    # Symmetric Dirichlet prior over next states for the transitions
    c1 = len(S)
    prior_dir = np.ones(c1)
    R = {}  # observed rewards, keyed by (t, s, a)
    P = {}  # next-state visit counts, keyed by (t, s, a)
    time = range(H)
    av_rew = []
    rew = 0
    for l in range(L):
        # Sampled reward means and transition vectors for this episode
        Rl = {}
        Pl = {}
        for t in time:
            for s in S:
                for a in A:
                    if (t, s, a) not in R:
                        R[(t, s, a)] = []
                        P[(t, s, a)] = np.zeros(c1, dtype=int)
                    # If we have not visited (t, s, a) we do not update the prior
                    if len(R[(t, s, a)]) == 0:
                        Rpost = prior_ng
                        Ppost = prior_dir
                    else:
                        data = np.array(R[(t, s, a)])
                        counts = P[(t, s, a)]
                        # Posterior updating
                        Rpost = posterior_sampling.update_normal_ig(prior_ng, data)
                        Ppost = posterior_sampling.update_dirichlet(prior_dir, counts)
                    # Posterior sampling
                    Rl[(t, s, a)] = posterior_sampling.sample_normal_ig(Rpost)[0]
                    Pl[(t, s, a)] = posterior_sampling.sample_dirichlet(Ppost)
        # Optimal policy for the sampled MDP
        mu = policy(R, P, Rl, Pl, S, A, H)
        # Play one episode with that policy
        rew += play(mu, H, R, P)

        # Report and record the average reward over each block of 10 episodes
        if (l + 1) % 10 == 0:
            print(rew / 10.0)
            av_rew.append(rew / 10.0)
            rew = 0
    return av_rew
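
The policy and play helpers used by PSRL (and by PCRL above) live elsewhere in the project and are not shown. As a rough guide to how the sampled model is consumed, here is a hypothetical stand-in for policy(): finite-horizon backward induction on the sampled MDP, assuming each Pl[(t, s, a)] vector is ordered like S. The project's actual implementation may differ.

import numpy as np

# Hypothetical sketch of policy(): backward induction on the sampled MDP.
# Rl holds sampled mean rewards, Pl sampled transition vectors; R and P are
# accepted only to match the call signature used above.
def policy(R, P, Rl, Pl, S, A, H):
    V = {(H, s): 0.0 for s in S}  # terminal values
    mu = {}
    for t in reversed(range(H)):
        for s in S:
            best_a, best_q = None, -np.inf
            for a in A:
                q = Rl[(t, s, a)] + sum(
                    Pl[(t, s, a)][i] * V[(t + 1, s2)] for i, s2 in enumerate(S))
                if q > best_q:
                    best_a, best_q = a, q
            mu[(t, s)] = best_a
            V[(t, s)] = best_q
    return mu

A play() counterpart would presumably roll the real environment forward for H steps, following mu and appending the observed rewards and next-state counts to R and P.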