def test_evaluate_episode_delta(self):
    MC = MonteCarlo()
    deltas = []
    for _ in range(100):
        deltas.append(MC.evaluate_episode_delta(MC.generate_episode()))
        # each per-episode value delta must stay within the reward bounds
        self.assertTrue(-1 <= deltas[-1] <= 1)
    # at least one episode should actually move the value estimates
    self.assertTrue(any(deltas))
    print('sample path value max deltas:', deltas)
def test_generate_episode(self):
    MC = MonteCarlo()
    paths = []
    for _ in range(10):
        states = MC.generate_episode()
        # sanity-check the final recorded state against the current board's hash
        self.assertLess(states[-1], hash(MC.board))
        paths.append(states)
    print('sample state paths:', paths)
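
# --- Hypothetical sketch (not the repo's implementation) ---------------------
# test_generate_episode above only checks the shape of what generate_episode
# returns: a list of board-state hashes produced by playing one game out.
# The stub board and random move choice below are illustrative assumptions.
import random


class _StubBoard:
    """Minimal stand-in for the real board: nine cells, push() fills one."""

    def __init__(self):
        self.cells = [0] * 9

    def legal_moves(self):
        return [i for i, c in enumerate(self.cells) if c == 0]

    def push(self, move):
        self.cells[move] = 1

    def __hash__(self):
        return hash(tuple(self.cells))


def generate_episode_sketch(board):
    """Play random moves until the board is full, recording each state hash."""
    states = []
    while board.legal_moves():
        board.push(random.choice(board.legal_moves()))
        states.append(hash(board))
    return states

# usage: generate_episode_sketch(_StubBoard()) -> list of nine state hashes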
def test_evaluate_episode(self):
    MC = MonteCarlo()
    for _ in range(10):
        states = MC.generate_episode()
        # snapshot visit counts and values (including the root state 0) before the update
        pre_visits = np.array([MC.visits[s] for s in [0] + states])
        pre_values = np.array([MC.values[s] for s in [0] + states])
        MC.evaluate_episode(states)
        post_visits = np.array([MC.visits[s] for s in [0] + states])
        post_values = np.array([MC.values[s] for s in [0] + states])
        diff_values = post_values - pre_values
        diff_visits = post_visits - pre_visits
        print('sample path value deltas:', diff_values)
        # every state on the path (plus the root) should gain exactly one visit
        self.assertTrue(np.all(diff_visits == 1))
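
# Sketch only: the assertions in test_evaluate_episode are consistent with an
# every-visit incremental-mean update, where each state on the path gets its
# visit count bumped by one and its value nudged toward the episode's outcome.
# This is an assumption about evaluate_episode, not a copy of it.
from collections import defaultdict


def evaluate_episode_sketch(states, outcome, visits, values):
    """Update visit counts and running-mean values for every state in `states`."""
    for s in states:
        visits[s] += 1
        # sample-average update keeps values bounded by the outcome range [-1, 1]
        values[s] += (outcome - values[s]) / visits[s]

# usage:
#   visits, values = defaultdict(int), defaultdict(float)
#   evaluate_episode_sketch([11, 22, 33], outcome=1.0, visits=visits, values=values)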
def test_eg_policy(self):
    MC = MonteCarlo()
    keys = np.zeros(9, int)
    # with no visits recorded, epsilon-greedy should pick essentially at random
    for _ in range(10**3):
        keys[MC.epsilon_greedy_policy(hash(MC.board))] += 1
    print('eg policy random?', keys)
    # record enough visits to shrink epsilon, then make move 1's successor look attractive
    MC.visits[hash(MC.board)] = MC.epsilon_constant
    keys[:] = 0
    MC.board.push(1)
    MC.values[hash(MC.board)] = 10
    MC.board.pop()
    for _ in range(10**4):
        keys[MC.epsilon_greedy_policy(hash(MC.board))] += 1
    print('eg policy tripled 1,3,5,7?', keys)
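
# Sketch of the epsilon-greedy selection that test_eg_policy appears to probe:
# explore uniformly with probability epsilon, otherwise exploit the move whose
# successor state has the highest value estimate. The helper names
# (legal_moves, successor_value) are illustrative, not the repo's API.
import random


def epsilon_greedy_sketch(legal_moves, successor_value, epsilon):
    if random.random() < epsilon:
        return random.choice(legal_moves)           # explore
    return max(legal_moves, key=successor_value)    # exploit

# usage: with successor_value favouring move 1 and a small epsilon, the counts
# collected in test_eg_policy should concentrate on index 1.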
def test_policy(self):
    MC = MonteCarlo()
    keys = np.zeros(9, int)
    # with no value estimates the policy should spread moves roughly uniformly
    for _ in range(10**3):
        keys[MC.policy()] += 1
    print('random policy?', keys)
    # make move 4 lead to a state with a strongly negative value estimate
    keys[:] = 0
    MC.board.push(4)
    state = hash(MC.board)
    MC.values[state] = -10
    MC.board.pop()
    for _ in range(10**3):
        keys[MC.policy()] += 1
    # the policy should now never pick move 4
    self.assertEqual(keys[4], 0)
    print('policy null 4', keys)
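
# Sketch consistent with test_policy's final assertion: sample among legal moves
# but never pick one whose successor state carries a negative value estimate.
# successor_value is again an assumed helper, not the actual implementation.
import random


def policy_sketch(legal_moves, successor_value):
    candidates = [m for m in legal_moves if successor_value(m) >= 0]
    return random.choice(candidates or legal_moves)

# usage: policy_sketch(list(range(9)), lambda m: -10.0 if m == 4 else 0.0)
#        never returns 4, matching assertEqual(keys[4], 0) above.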
def get_forward_vf(self, pol: SAf) -> Mapping[S, float]:
    sa_dict = self.mdp_rep.state_action_dict
    vf_dict = {s: 0. for s in sa_dict.keys()}
    episodes = 0
    monte = MonteCarlo(
        self.mdp_rep,
        True,
        self.num_episodes,
        self.max_steps
    )
    while episodes < self.num_episodes:
        start_state = self.mdp_rep.init_state_gen()
        mc_path = monte.get_mc_path(pol, start_state)
        # unpack rewards and states for every step except the terminal entry
        rew_arr = np.array([x for _, _, x, _ in mc_path[:-1]])
        state_list = [x for x, _, _, _ in mc_path[:-1]]
        val_arr = np.array([vf_dict[s] for s in state_list])
        if mc_path[-1][0] in self.mdp_rep.terminal_states:
            returns = self.get_returns(rew_arr, val_arr)
        else:
            raise RuntimeError('Episode did not terminate within max_steps')
        # move each visited state's value estimate toward its return
        for i, r in enumerate(returns):
            s, _, _, _ = mc_path[i]
            vf_dict[s] += self.learning_rate * (r - vf_dict[s])
        episodes += 1
    return vf_dict
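
# get_returns is not shown in this excerpt. As a point of reference, a plain
# Monte-Carlo version would accumulate discounted rewards backwards through the
# episode; a forward-view TD(lambda) variant would additionally blend in the
# bootstrapped values passed as val_arr. The sketch below is the plain version,
# with gamma as an assumed discount parameter.
import numpy as np


def mc_returns_sketch(rew_arr: np.ndarray, gamma: float) -> np.ndarray:
    """Discounted return G_t = r_t + gamma * G_{t+1}, computed right to left."""
    returns = np.zeros(len(rew_arr))
    running = 0.0
    for t in reversed(range(len(rew_arr))):
        running = rew_arr[t] + gamma * running
        returns[t] = running
    return returns

# usage: mc_returns_sketch(np.array([0., 0., 1.]), gamma=0.9) -> [0.81, 0.9, 1.]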
def test_run(self):
    MC = MonteCarlo()
    # full training run with periodic convergence checks
    MC.run(max_episodes=10**5, threshold=.01, interval=100, checks=30)
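
# Hypothetical reading of run()'s arguments, inferred from the call above and
# not from its source: train for at most max_episodes, check the value delta
# every `interval` episodes, and stop early once `checks` consecutive readings
# fall below `threshold`.
def run_sketch(step_delta, max_episodes, threshold, interval, checks):
    below = 0
    for episode in range(1, max_episodes + 1):
        delta = step_delta()                       # one episode of learning
        if episode % interval == 0:
            below = below + 1 if delta < threshold else 0
            if below >= checks:
                return episode                     # converged early
    return max_episodes                            # hit the episode cutoff

# usage: run_sketch(lambda: 0.001, 10**5, .01, 100, 30) returns 3000
#        (30 consecutive passing checks at 100-episode intervals).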
def test_run_cutoff(self):
    MC = MonteCarlo()
    # small episode budget: exercises the max_episodes cutoff path
    MC.run(max_episodes=1000, threshold=.01)