Example #1
import RL

def train():
    # Build a network with input/output width `squ` (assumed defined elsewhere) and three hidden layers
    nn = RL.RL([squ, 10 * squ, 10 * squ, 10 * squ, squ])
    RL.train(False, nn)
Example #2
import numpy as np

import MDP
import RL

# Set rewards (the transition array T and reward array R are assumed to have been built earlier)
R[:, 15] = 100   # goal state
R[:, 9] = -70    # bad state
R[:, 16] = 0     # end state

# Discount factor: scalar in [0,1)
discount = 0.95

# MDP object
mdp = MDP.MDP(T, R, discount)

# RL problem
rlProblem = RL.RL(mdp, np.random.normal)

# Test Q-learning
print("\nepsilon = 0.05")
Q = np.zeros([mdp.nActions, mdp.nStates])
policy = np.zeros(mdp.nStates, int)
c_reward = np.zeros(200)
for i in range(100):
    Q_t, policy_t, cum_reward_t = rlProblem.qLearning(
        s0=0,
        initialQ=np.zeros([mdp.nActions, mdp.nStates]),
        nEpisodes=200,
        nSteps=100,
        epsilon=0.05)
    Q += Q_t
    c_reward += cum_reward_t
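
# A minimal follow-up sketch, assuming the accumulators filled in the loop above:
# average the Q-values and per-episode cumulative rewards over the 100 runs and
# plot the resulting learning curve (matplotlib assumed available).
import matplotlib.pyplot as plt

Q_avg = Q / 100.0                 # average Q-table over the 100 independent runs
c_reward_avg = c_reward / 100.0   # average cumulative reward per episode
print("Average Q-values:\n", Q_avg)

plt.plot(c_reward_avg)
plt.xlabel("Episode")
plt.ylabel("Average cumulative reward")
plt.title("Q-learning, epsilon = 0.05")
plt.show()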
Example #3
from gridworld import GridWorld1
import RL
import gridrender as gui
import matplotlib.pyplot as plt
import numpy as np
import time

################################################################################
# Initialization
################################################################################

env = GridWorld1
n_states = env.n_states
n_actions = len(env.action_names)

model = RL.RL(env)

# Estimating initial state distribution
n_start = 10000
model.estimate_start_distribution(n_start)
print(
    f'Estimated start state distribution is {model.mu} after {n_start} throws')
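
# A minimal sketch of one way such an estimate can be computed; it assumes
# env.reset() returns a start-state index (an assumption, not shown in the example):
counts = np.zeros(n_states)
for _ in range(n_start):
    counts[env.reset()] += 1
mu_hat = counts / n_start   # empirical start-state distribution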

# Compute Tmax such that the discounted, truncated sum of rewards is delta-close to the infinite sum
delta = 0.01
tmax = -int(np.log(delta) / (1 - env.gamma))
print(f'Tmax (max number of iterations in an episode) is chosen as: {tmax}')
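
# Sketch of the standard argument behind this choice of tmax: with rewards bounded
# by some Rmax, the tail of the discounted sum after T steps is at most
#     gamma^T * Rmax / (1 - gamma).
# Requiring gamma^T <= delta (i.e. the tail is below delta up to the constant
# Rmax / (1 - gamma)) gives T >= log(delta) / log(gamma), and since
# log(gamma) ~= -(1 - gamma) for gamma close to 1, this is approximately
#     T ~= -log(delta) / (1 - gamma),
# which matches the tmax computed above.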

################################################################################
# Q4: Policy evaluation
################################################################################