def train():
    # `squ` and the RL module are assumed to be defined elsewhere in the project.
    nn = RL.RL([squ, 10 * squ, 10 * squ, 10 * squ, squ])
    RL.train(False, nn)
# Set rewards
R[:, 15] = 100   # goal state
R[:, 9] = -70    # bad state
R[:, 16] = 0     # end state

# Discount factor: scalar in [0, 1)
discount = 0.95

# MDP object
mdp = MDP.MDP(T, R, discount)

# RL problem
rlProblem = RL.RL(mdp, np.random.normal)

# Test Q-learning
print("\nepsilon = 0.05")
Q = np.zeros([mdp.nActions, mdp.nStates])
policy = np.zeros(mdp.nStates, int)
c_reward = np.zeros(200)
for i in range(100):
    [Q_t, policy_t, cum_reward_t] = rlProblem.qLearning(
        s0=0,
        initialQ=np.zeros([mdp.nActions, mdp.nStates]),
        nEpisodes=200,
        nSteps=100,
        epsilon=0.05)
    Q += Q_t
    c_reward += cum_reward_t
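# --- A minimal follow-up sketch (an editorial addition, not part of the ---
# --- original script). The loop above accumulates Q_t and cum_reward_t  ---
# --- over 100 runs but never averages them or uses policy_t. Assuming   ---
# --- Q is indexed [action, state] as initialized above, the averaged    ---
# --- results and the induced greedy policy could be read off as:        ---
Q /= 100
c_reward /= 100
policy = np.argmax(Q, axis=0)  # greedy action per state
print("Averaged Q-values:\n", Q)
print("Greedy policy:", policy)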
from gridworld import GridWorld1
import RL
import gridrender as gui
import matplotlib.pyplot as plt
import numpy as np
import time

################################################################################
# Initialization
################################################################################
env = GridWorld1
n_states = env.n_states
n_actions = len(env.action_names)
model = RL.RL(env)

# Estimate the initial state distribution
n_start = 10000
model.estimate_start_distribution(n_start)
print(f'Estimated start state distribution is {model.mu} after {n_start} throws')

# Compute Tmax such that the discounted truncated sum of rewards is
# delta-close to the infinite sum: since log(1/gamma) >= 1 - gamma,
# T = -log(delta) / (1 - gamma) guarantees gamma^T <= delta.
delta = 0.01
tmax = -int(np.log(delta) / (1 - env.gamma))
print(f'Tmax (max number of iterations in an episode) is chosen as: {tmax}')

################################################################################
# Q4: Policy evaluation
################################################################################
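################################################################################
# Aside: numeric check of the Tmax bound (an editorial sketch, not part of the
# original script; gamma = 0.95 is an assumed value for illustration).
# The tail of the discounted return satisfies
#     sum_{t >= T} gamma^t * r_t  <=  gamma^T * r_max / (1 - gamma),
# so once gamma^T <= delta the truncated sum is delta-close to the infinite
# sum (up to the r_max / (1 - gamma) factor).
################################################################################
import numpy as np

gamma = 0.95                                 # assumed discount for illustration
delta = 0.01
tmax = -int(np.log(delta) / (1 - gamma))     # same formula as in the script above
print(gamma ** tmax <= delta)                # True: the tail weight is below delta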