def main(argv):
    del argv
    gw = Gridworld(10, 10, 0, 99)
    # for i in range(7):
    #     gw.grid[7][i] = 1
    agent_module = importlib.import_module("agents." + FLAGS.agent)
    avg_num_steps = np.zeros(FLAGS.num_episodes)
    policy = getattr(policies.tabular_policies, FLAGS.policy)
    for _ in range(FLAGS.num_trials):
        agent = agent_module.Agent(FLAGS.agent, gw.width * gw.height,
                                   FLAGS.gamma, policy, FLAGS.alpha)
        steps_per_episode = []
        for _ in range(FLAGS.num_episodes):
            state = gw.start
            action = agent.select_action(state)
            step = 0
            terminate = False
            while step < FLAGS.max_steps and not terminate:
                next_state = gw.apply_action(state, action)
                terminate, reward = gw.is_goal(next_state)
                next_action = agent.select_action(next_state)
                agent.update(state, action, reward, next_state, next_action)
                state = next_state
                action = next_action
                step += 1
            steps_per_episode.append(step)
        avg_num_steps += np.array(steps_per_episode)
    avg_num_steps = avg_num_steps / FLAGS.num_trials
    plt.plot(avg_num_steps)
    plt.show()
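# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the interface the driver above assumes:
# a tabular epsilon-greedy policy (as selected from policies.tabular_policies)
# and an Agent whose select_action/update pair performs a one-step SARSA
# backup. The names and internals below are illustrative assumptions, not
# necessarily the repo's actual agents.<name>.Agent implementation.
# ---------------------------------------------------------------------------
import numpy as np


def epsilon_greedy(q_values, epsilon=0.1, rng=np.random):
    """Pick a random action with probability epsilon, otherwise the greedy one."""
    if rng.random_sample() < epsilon:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))


class SketchSarsaAgent:
    """Illustrative one-step SARSA agent matching the call signature above."""

    def __init__(self, name, num_states, gamma, policy, alpha, num_actions=4):
        self.name = name
        self.gamma = gamma
        self.policy = policy
        self.alpha = alpha
        self.q = np.zeros((num_states, num_actions))

    def select_action(self, state):
        return self.policy(self.q[state])

    def update(self, state, action, reward, next_state, next_action):
        # One-step SARSA target: r + gamma * Q(s', a')
        target = reward + self.gamma * self.q[next_state, next_action]
        self.q[state, action] += self.alpha * (target - self.q[state, action])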
def main(argv):
    del argv
    gw = Gridworld(10, 10, 0, 80)
    for i in range(7):
        gw.grid[7][i] = 1
    agent_module = importlib.import_module("agents." + FLAGS.agent)
    avg_num_steps = np.zeros(FLAGS.num_episodes)
    policy = getattr(policies.tabular_policies, FLAGS.policy)
    for _ in range(FLAGS.num_trials):
        agent = agent_module.Agent(FLAGS.agent, gw.width * gw.height, FLAGS.n,
                                   FLAGS.gamma, policy, FLAGS.alpha)
        steps_per_episode = []
        for _ in range(FLAGS.num_episodes):
            T = np.Inf
            state = gw.start
            agent.reset_agent()
            agent.stored_states.append(state)
            action = agent.select_action(state)
            agent.stored_actions.append(action)
            step = 0
            tau = 0
            while tau != T - 1:
                if step < T:
                    next_state = gw.apply_action(state, action)
                    terminate, reward = gw.is_goal(next_state)
                    agent.stored_states.append(next_state)
                    agent.stored_rewards.append(reward)
                    if terminate or step == FLAGS.max_steps - 1:
                        T = step + 1
                    else:
                        next_action = agent.select_action(next_state)
                        agent.stored_actions.append(next_action)
                        state = next_state
                        action = next_action
                tau = step - agent.n + 1
                if tau >= 0:
                    agent.update(tau, T)
                step += 1
            steps_per_episode.append(step)
        avg_num_steps += np.array(steps_per_episode)
    avg_num_steps = avg_num_steps / FLAGS.num_trials
    np.save(FLAGS.log_path + "/" + FLAGS.log_file, avg_num_steps)
    plt.plot(avg_num_steps)
    plt.savefig(FLAGS.log_path + "/" + FLAGS.log_file)
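# ---------------------------------------------------------------------------
# Sketch of the n-step SARSA backup that agent.update(tau, T) above is assumed
# to perform (Sutton & Barto, Section 7.2), written against the attributes the
# driver touches (n, stored_states/actions/rewards) plus an assumed gamma,
# alpha and tabular q array. The actual agents.<name>.Agent may differ.
# ---------------------------------------------------------------------------
def n_step_sarsa_update(agent, tau, T):
    # n-step return: discounted rewards from step tau+1 up to min(tau+n, T) ...
    upper = min(tau + agent.n, T)
    G = 0.0
    for i in range(tau + 1, int(upper) + 1):
        G += agent.gamma ** (i - tau - 1) * agent.stored_rewards[i - 1]
    # ... plus a bootstrap from Q(S_{tau+n}, A_{tau+n}) if the episode is not over
    if tau + agent.n < T:
        s_n = agent.stored_states[tau + agent.n]
        a_n = agent.stored_actions[tau + agent.n]
        G += agent.gamma ** agent.n * agent.q[s_n, a_n]
    s_tau = agent.stored_states[tau]
    a_tau = agent.stored_actions[tau]
    agent.q[s_tau, a_tau] += agent.alpha * (G - agent.q[s_tau, a_tau])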
def get_pi_eval():
    alpha = 0.1
    env = Gridworld()
    agent = QAgent(25, 4, 1, alpha)
    episode = 0
    while True:
        env.reset()
        agent.new_episode()
        G = 0
        s = env.get_state()
        t = 0
        while True:
            a = agent.get_action(s)
            _, r, _ = env.step(a)
            G += r
            if env.terminal():
                agent.train_inf(s, a, r)
                break
            s_prime = env.get_state()
            agent.train(s, a, r, s_prime)
            s = s_prime
            t += 1
        episode += 1
        print("episode=", episode, " G", G)
        if episode > 800:
            break
    return agent.get_pi()
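# ---------------------------------------------------------------------------
# Sketch of the two QAgent updates the loop above relies on: train() as a
# standard tabular Q-learning step, and train_inf() for the terminal
# transition, where there is no successor state to bootstrap from. The q array
# and the alpha/gamma arguments are assumptions, not the repo's exact code.
# ---------------------------------------------------------------------------
import numpy as np


def q_update(q, s, a, r, s_prime, alpha=0.1, gamma=1.0):
    """Non-terminal transition: target is r + gamma * max_a' Q(s', a')."""
    q[s, a] += alpha * (r + gamma * np.max(q[s_prime]) - q[s, a])


def q_update_terminal(q, s, a, r, alpha=0.1):
    """Terminal transition: target is just the reward."""
    q[s, a] += alpha * (r - q[s, a])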
def __init__(self, nrows=8, ncols=8, **kwargs):
    # we have to make some assertions to make sure that we can fit
    # a lake in the middle of the gridworld
    assert nrows > 2 and ncols > 2
    self.nstates = nrows * ncols
    self.lake = []
    for x in range(self.nstates):
        if (0 < (x % ncols) < ncols - 1) and (ncols < x < (nrows - 1) * ncols):
            self.lake.append(x)
    Gridworld.__init__(self, nrows=nrows, ncols=ncols, **kwargs)
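# For a concrete picture of the index arithmetic above: with nrows = ncols = 4
# the states are numbered row-major 0..15, and the two conditions keep exactly
# the cells that lie in neither the first/last row nor the first/last column,
# i.e. the bracketed interior below, so self.lake == [5, 6, 9, 10].
#
#    0   1   2   3
#    4  [5] [6]  7
#    8  [9][10] 11
#   12  13  14  15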
def getCliffGrid():
    grid = [[ -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1],
            [ -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1],
            [ -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1],
            ['S', -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 'TERMINAL_STATE']]
    return Gridworld(grid)
def iteration_example():
    grid = Gridworld.negative_grid()
    print('grid:')
    print_values(grid.rewards, grid)
    values, policy = policy_iteration(grid, 0.9)
    print('values:')
    print_values(values, grid)
    print('policy:')
    print_policy(policy, grid)
def q_example():
    grid = Gridworld.negative_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    values, policy, deltas = gradient_q(grid)
    plt.plot(deltas)
    plt.show()
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
def gridworld():
    height = 5
    width = 5
    state_A = (0, 1)
    state_B = (0, 3)
    from_A = (4, 1)
    from_B = (2, 3)
    reward_leaving_state_A = 10
    reward_leaving_state_B = 5
    gamma = 0.9
    return Gridworld(height, width, state_A, state_B, from_A, from_B,
                     reward_leaving_state_A, reward_leaving_state_B, gamma)
def ex_4_5(size=None):
    """Testing policy evaluation and policy iteration on gridworld using Q values."""
    if size is None:
        size = DEF_EX_4_4_SIZE
    env = Gridworld(size)
    pi_rand = random_policy(env)
    pi_init = {(a, s): pi_rand(s, a) for s in env.states for a in env.moves}
    alg = DynamicProgramming(env, pi=pi_init, theta=1e-4, gamma=1)
    alg.policy_iteration_Q()
    alg.print_policy()
def example_1():
    """
    Example 1: Obtains the solution for a given infinite gridworld problem
    using value iteration and policy iteration.
    """
    # Initialises all required inputs for the Gridworld. In this example, the
    # actions list contains up, down, left and right, and the reward dict is
    # stored in the form of key: (coord, action index), value: (reward, new coord).
    name = 'base_problem'
    size = (5, 5)
    actions_list = [[-1, 0], [1, 0], [0, -1], [0, 1]]
    reward_dict = {
        ((0, 1), 1): (10, (4, 1)),
        ((0, 2), 0): (3, (0, 2)),
        ((0, 2), 1): (3, (0, 2)),
        ((0, 2), 2): (3, (0, 2)),
        ((0, 2), 3): (3, (0, 2)),
        ((1, 1), 3): (4, (1, 4))
    }
    discount = 0.9
    # Initialises the Gridworld.
    gridworld = Gridworld(name, size, actions_list, reward_dict, discount)
    # Obtains the optimum value estimate and policy from the Gridworld using
    # value iteration.
    gridworld.obtain_optimum('value_iteration')
    # Obtains the optimum value estimate and policy from the Gridworld using
    # policy iteration.
    gridworld.obtain_optimum('policy_iteration')
def ex_4_4(size=None):
    """
    Testing a policy iteration that stops when a policy is encountered twice,
    on an environment where all policies are equally bad (gridworld with cost
    of move equal to zero).
    """
    if size is None:
        size = DEF_EX_4_4_SIZE
    env = Gridworld(size, cost_move=0)
    det_pi = {s: env.moves[0] for s in env.states}
    alg = DynamicProgramming(env, det_pi=det_pi, theta=1e-7, gamma=1)
    # uncomment/comment for the difference between improvement and not improvement
    alg.policy_iteration_improved()
def q_example():
    grid = Gridworld.negative_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    values, policy, deltas, count_updates = q_learning(grid)
    plt.plot(deltas)
    plt.show()
    print('Updates:')
    print_values(count_updates, grid)
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
def fig_4_1(size=None):
    if size is None:
        size = DEF_FIG_4_1_SIZE
    env = Gridworld(size)
    pi_rand = random_policy(env)
    pi_init = {(a, s): pi_rand(s, a) for s in env.states for a in env.moves}
    alg = DynamicProgramming(env, pi=pi_init, theta=1e-4, gamma=1)  # undiscounted
    alg.policy_evaluation()
    alg.print_values()
    # show the optimal policy
    while not alg.policy_improvement():
        pass
    alg.print_policy()
def simple_example():
    grid = Gridworld.negative_grid(-0.9)
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(POSSIBLE_ACTIONS)
    values, deltas, policy = policy_iteration(grid, 2000, policy)
    plt.plot(deltas)
    plt.show()
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
def simple_example():
    grid = Gridworld.default_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    values = first_visit_monte_carlo(grid, 100, policy)
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
def simple_example():
    grid = Gridworld.default_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    values = td_zero(grid, policy)
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
def simple_example():
    grid = Gridworld.default_grid()
    values_uniform = policy_evaluation(grid, 1)
    print('Values for uniformly random actions:')
    print_values(values_uniform, grid)
    print('\n\n')
    fixed_policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    print_policy(fixed_policy, grid)
    fixed_values = policy_evaluation(grid, 0.9, fixed_policy)
    print('Values for fixed policy:')
    print_values(fixed_values, grid)
def mc_prediction():
    grid = Gridworld.default_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        (2, 1): 'L',
        (2, 2): 'U',
        (2, 3): 'L',
    }
    values, deltas = approx_monte_carlo(grid, policy)
    plt.plot(deltas)
    plt.show()
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
            target_weights[i] = (self.tau * weights[i] +
                                 (1 - self.tau) * target_weights[i])
        self.target_model.set_weights(target_weights)

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            self.learn(state, action, reward, next_state, done)


if __name__ == '__main__':
    world = Gridworld(3, 3, goal_position=(3, 3), traps=[(2, 2)])
    agent = DQNAgent(2, 4)
    ANGLES = [math.pi / 2, 0, -math.pi / 2, math.pi]

    episodes = 500
    batch_size = 32
    memory_length = 20
    last_results = deque(maxlen=memory_length)

    for e in range(episodes):
        state = world.reset()
        state = np.expand_dims(state, 0)
        for time_step in range(500):
            action = agent.act(state)
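# ---------------------------------------------------------------------------
# Sketch of the learn() step that replay() above calls: a plain DQN update
# that bootstraps from the slowly-updated target network. model/target_model
# are assumed to be Keras models with predict()/fit(); these internals are
# assumptions about the DQNAgent class, not its exact code.
# ---------------------------------------------------------------------------
import numpy as np


def dqn_learn(agent, state, action, reward, next_state, done):
    target = agent.model.predict(state)
    if done:
        target[0][action] = reward
    else:
        # Bellman target: r + gamma * max_a' Q_target(s', a')
        q_next = agent.target_model.predict(next_state)[0]
        target[0][action] = reward + agent.gamma * np.amax(q_next)
    agent.model.fit(state, target, epochs=1, verbose=0)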
class Experiment(object):
    """
    This is the base class for all experiment implementations.

    The experiment organizes all objects and directs the training in a given
    scenario.
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, params_file):
        """ Initializes an experiment. """
        self.params = self.get_parameter(params_file)
        self.exp_dir = self.set_exp_dir()
        _logger = self.set_logger()
        _logger.info("Initializing new experiment of type %s" %
                     str(self.params['type']))
        _logger.info("Loading parameters from %s" % str(params_file))
        _logger.info("Saving logs in %s" % str(self.exp_dir))
        # self.set_status('Initializing')

        # copy parameter source
        helper.copy_file(params_file,
                         os.path.join(self.exp_dir, 'params.yaml'))

        # Mersenne Twister pseudo-random number generator
        self.rng = np.random.RandomState(self.params['random_seed'])

        # set environment
        self.env = Gridworld(grid=os.path.join(os.getcwd(), 'maps',
                                               self.params['grid']),
                             max_steps=self.params['max_steps'],
                             visual=self.params['visual'],
                             rng=self.rng)

        self.current_task = 'None'
        self.current_run = 0
        self.current_episode = 0
        self.exp_steps = 0

    def get_parameter(self, file_name):
        path_to_file = os.path.join(os.getcwd(), file_name)
        with open(path_to_file, 'r') as ymlfile:
            params = yaml.load(ymlfile, Loader=yaml.Loader)
        return params

    def set_exp_dir(self):
        folder = "%s_%s_%s" % (str(time.strftime("%Y-%m-%d_%H-%M")),
                               str(self.params['type']).lower(),
                               str(self.params['grid']).lower())
        path_to_dir = os.path.join(os.getcwd(), 'logs', folder)
        return helper.create_dir(path_to_dir)

    def set_logger(self):
        # make sure no loggers are already active
        try:
            logging.root.handlers.pop()
        except IndexError:
            # if no logger exists the list will be empty and we need
            # to catch the resulting error
            pass
        if self.params['log_type'] == 'stdout':
            logging.basicConfig(level=getattr(logging,
                                              self.params['log_level'], None),
                                stream=sys.stdout,
                                format='[%(asctime)s][%(levelname)s]'
                                '[%(module)s][%(funcName)s] '
                                '%(message)s')
        else:
            logging.basicConfig(level=getattr(logging,
                                              self.params['log_level'], None),
                                format='[%(asctime)s][%(levelname)s]'
                                '[%(module)s][%(funcName)s] '
                                '%(message)s',
                                filename=os.path.join(self.exp_dir,
                                                      'experiment.log'),
                                filemode='w')
        return logging.getLogger(__name__)

    def set_status(self, status):
        self.status = status
        _logger.debug("[T:%s,R:%s,E:%s] %s" %
                      (str(self.current_task['name']), str(self.current_run),
                       str(self.current_episode), str(self.status)))

    def init_episode(self):
        self.steps_in_episode = 0
        self.reward_in_episode = 0
        self._init_episode()

    @abc.abstractmethod
    def _init_episode(self):
        pass

    def cleanup_episode(self):
        self._cleanup_episode()
        if self.status == 'training':
            if self.learner.epsilon > self.params['epsilon_limit']:
                self.learner.set_epsilon(self.learner.epsilon +
                                         self.learner.epsilon_change)

    @abc.abstractmethod
    def _cleanup_episode(self):
        pass

    def init_run(self):
        _logger.info("..... Starting run %s" % str(self.current_run))
        run_dir = os.path.join(self.task_dir, 'run_' + str(self.current_run))
        self.run_dir = helper.create_dir(run_dir)
        # Create run stats file: run_stats.csv
        self.run_stats_file = os.path.join(self.run_dir, 'stats_run.csv')
        self.run_steps = 0
        helper.write_stats_file(self.run_stats_file, 'episode', 'steps_total',
                                'steps_mean', 'reward_total', 'reward_mean',
                                'epsilon', 'step_count')
        self._init_run()

    @abc.abstractmethod
    def _init_run(self):
        pass

    def cleanup_run(self):
        self.save_best_episode()
        helper.delete_dirs(self.run_dir)
        helper.plot_run(self.run_dir)
        self._cleanup_run()
        _logger.info("..... Finished run %s" % str(self.current_run))

    @abc.abstractmethod
    def _cleanup_run(self):
        pass

    def init_task(self):
        _logger.info("##### Starting task %s" %
                     str(self.current_task['name']))
        task_dir = os.path.join(self.exp_dir,
                                'task_' + self.current_task['name'])
        self.task_dir = helper.create_dir(task_dir)
        self._init_task()

    @abc.abstractmethod
    def _init_task(self):
        pass

    def cleanup_task(self):
        helper.plot_runs(self.task_dir)
        helper.summarize_runs_results(self.task_dir)
        helper.plot_task(self.task_dir)
        self.save_best_run()
        self._cleanup_task()
        # self.set_status('idle')
        _logger.info("##### Finished task %s" %
                     str(self.current_task['name']))

    @abc.abstractmethod
    def _cleanup_task(self):
        pass

    # def evaluate_current_library(self):
    #     pass

    def get_action_id(self, state, policy_name):
        return self._get_action_id(state, policy_name)

    @abc.abstractmethod
    def _get_action_id(self, state, policy_name):
        pass

    @abc.abstractmethod
    def _specific_updates(self, policy_name):
        pass

    def write_test_results(self):
        helper.write_stats_file(
            self.run_stats_file, self.current_episode, sum(self.test_steps),
            np.mean(self.test_steps), sum(self.test_rewards),
            np.mean(self.test_rewards),
            float("{0:.5f}".format(self.learner.last_epsilon)),
            self.run_steps)
        self._write_test_results()

    @abc.abstractmethod
    def _write_test_results(self):
        pass

    def run_tests(self):
        self.learner.set_epsilon(0.0)
        self.episode_dir = os.path.join(self.run_dir,
                                        'episode_' + str(self.current_episode))
        self.episode_dir = helper.create_dir(self.episode_dir)
        self.test_steps = []
        self.test_rewards = []
        for test_pos in self.params['test_positions']:
            self.init_episode()
            self.run_episode(test_pos, tuple(self.current_task['goal_pos']),
                             self.current_task['name'])
            self.test_steps.append(self.steps_in_episode)
            self.test_rewards.append(self.reward_in_episode)
        self.write_test_results()
        self.learner.save_Qs(os.path.join(self.episode_dir, 'Qs.npy'))
        # Make video from random position
        if self.params['visual']:
            self.set_status('recording')
            self.init_episode()
            self.run_episode(
                self.env.get_random_state(tuple(
                    self.current_task['goal_pos'])),
                tuple(self.current_task['goal_pos']),
                self.current_task['name'])
        self.learner.set_epsilon(self.learner.last_epsilon)

    def run_episode(self, agent_pos, goal_pos, policy_name=None):
        """ Function to run a single episode. """
        if self.status == 'training':
            _logger.debug("Start episode")
        self.env.reset_env()
        self.env.add_agent(agent_pos, self.agent_name)
        self.env.add_goal(goal_pos)
        if self.status == 'recording':
            self.env.draw_frame()
            self.env.save_current_frame(self.episode_dir)
        state = self.env.get_current_state(self.agent_name)
        action_id = self.get_action_id(state, policy_name)
        reward = self.env.step(self.env.actions[action_id], self.agent_name)
        state_prime = self.env.get_current_state(self.agent_name)
        if self.status in ['training', 'policy_eval']:
            self.run_steps += 1
        if self.status == 'recording':
            self.env.draw_frame()
            self.env.save_current_frame(self.episode_dir)
        self.steps_in_episode += 1
        self.reward_in_episode += reward
        if self.status == 'training' and not self.env.episode_ended:
            self.learner.update_Q(state[0:2], action_id, reward,
                                  state_prime[0:2])
            self._specific_updates(policy_name)
        while not self.env.episode_ended:
            state = state_prime
            action_id = self.get_action_id(state, policy_name)
            reward = self.env.step(self.env.actions[action_id],
                                   self.agent_name)
            state_prime = self.env.get_current_state(self.agent_name)
            if self.status in ['training', 'policy_eval']:
                self.run_steps += 1
            if self.status == 'recording':
                self.env.draw_frame()
                self.env.save_current_frame(self.episode_dir)
            # if self.status in ['testing', 'policy_eval']:
            self.steps_in_episode += 1
            self.reward_in_episode += reward
            if self.status == 'training':
                self.learner.update_Q(state[0:2], action_id, reward,
                                      state_prime[0:2])
            if self.env.step_count >= self.env.max_steps:
                self.env.episode_ended = True
            self._specific_updates(policy_name)
        if self.status == 'training':
            _logger.debug("End episode")
        if self.env.visual and self.status == 'recording':
            self.env.make_video(self.episode_dir)

    def save_best_episode(self):
        df = pd.read_csv(os.path.join(self.run_dir, 'stats_run.csv'))
        least_steps_row = df.loc[df['steps_mean'].idxmin()]
        run_best_file = os.path.join(self.run_dir, 'stats_run_best.csv')
        headers = ['run']
        content = [int(self.current_run)]
        for column in df:
            headers.append(str(column))
            content.append(least_steps_row[column])
        helper.write_stats_file(run_best_file, headers)
        helper.write_stats_file(run_best_file, content)
        helper.copy_file(
            os.path.join(self.run_dir,
                         'episode_' + str(int(least_steps_row['episode'])),
                         'Qs.npy'),
            os.path.join(self.run_dir, 'best_Qs.npy'))

    def save_best_run(self):
        # Save best Q-table for current task
        df = pd.read_csv(
            os.path.join(self.task_dir, 'run_' + str(1),
                         'stats_run_best.csv'))
        for i in range(2, self.params['runs'] + 1):
            df = df.append(pd.read_csv(
                os.path.join(self.task_dir, 'run_' + str(i),
                             'stats_run_best.csv')),
                           ignore_index=True)
        least_steps_row = df.loc[df['steps_mean'].idxmin()]
        task_best_file = os.path.join(self.task_dir, 'stats_task_best.csv')
        headers = ['task']
        content = [str(self.current_task['name'])]
        for column in df:
            headers.append(str(column))
            content.append(least_steps_row[column])
        helper.write_stats_file(task_best_file, headers)
        helper.write_stats_file(task_best_file, content)
        helper.copy_file(
            os.path.join(self.task_dir,
                         'run_' + str(int(least_steps_row['run'])),
                         'best_Qs.npy'),
            os.path.join(self.task_dir, 'best_Qs.npy'))

    def main(self):
        for task in self.params['tasks']:
            self.current_task = task
            self.init_task()
            for run in range(1, self.params['runs'] + 1):
                self.current_run = run
                self.current_episode = 0
                self.current_policy = self.current_task['name']
                self.init_run()
                self.set_status('testing')
                self.run_tests()
                self.set_status('training')
                for episode in range(1, self.params['episodes'] + 1):
                    self.current_episode = episode
                    self.init_episode()
                    self.run_episode(
                        self.env.get_random_state(
                            tuple(self.current_task['goal_pos'])),
                        tuple(self.current_task['goal_pos']),
                        self.current_policy)
                    self.cleanup_episode()
                    if episode % self.params['test_interval'] == 0:
                        self.set_status('testing')
                        self.run_tests()
                        self.set_status('training')
                self.cleanup_run()
            self.cleanup_task()
        _logger.info("Done")
def getBookGrid():
    grid = [[' ', ' ', ' ',  +1],
            [' ', '#', ' ',  -1],
            ['S', ' ', ' ', ' ']]
    return Gridworld(grid)
def getSimpleGrid():
    grid = [[' ', ' ',  +1],
            [' ', ' ',  -1],
            ['S', ' ', ' ']]
    return Gridworld(grid)
def getBridgeGrid():
    grid = [['#', -100, -100, -100, -100, -100, '#'],
            [  1,  'S',  ' ',  ' ',  ' ',  ' ',  10],
            ['#', -100, -100, -100, -100, -100, '#']]
    return Gridworld(grid)
def getDiscountGrid():
    grid = [[' ', ' ', ' ', ' ', ' '],
            [' ', '#', ' ', ' ', ' '],
            [' ', '#',   1, '#',  10],
            ['S', ' ', ' ', ' ', ' '],
            [-10, -10, -10, -10, -10]]
    return Gridworld(grid)
def getCliffGrid2():
    grid = [[ ' ',  ' ',  ' ',  ' ',  ' '],
            [   8,  'S',  ' ',  ' ',   10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(grid)
def getCliffGrid():
    grid = [[ ' ',  ' ',  ' ',  ' ',  ' '],
            [ 'S',  ' ',  ' ',  ' ',   10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(makeGrid(grid))
def getMyGrid():
    "Add your own grid definition here."
    grid = [[' ', ' ',  +1, ' ', ' ', ' '],
            [' ', ' ', 'S', ' ', ' ', ' '],
            [ -1,  -1,  -1, '#', ' ', ' '],
            [+30, ' ', ' ', ' ', ' ', +20]]
    return Gridworld(grid)
def getMazeGrid():
    grid = [[' ', ' ', ' ',  +1],
            ['#', '#', ' ', '#'],
            [' ', '#', ' ', ' '],
            [' ', '#', '#', ' '],
            ['S', ' ', ' ', ' ']]
    return Gridworld(grid)
    def learn(self, old_state, new_state, action, reward):
        old_val = self.q_table[old_state][action]
        next_value = np.max(self.q_table[new_state])
        # print(old_state, action, reward, new_state)
        new_q_value = self.compute_new_q_value(old_val, reward, next_value)
        self.q_table[old_state][action] = new_q_value

    def print_values(self):
        height, width, _ = self.q_table.shape
        for r in range(1, height - 1):
            for c in range(1, width - 1):
                for a_id, a in enumerate(self.actions):
                    print("q(s{}{}, {}) = {:.3f}".format(
                        r, c, a, self.q_table[r, c, a_id]))
                print()


if __name__ == '__main__':
    from gridworld import Gridworld

    env = Gridworld(5, 5, goal_position=(1, 3), traps=[(2, 1)])
    env.render()
    agent = SARSA(env)
    agent.train(episodes=1000)
    agent.print_values()
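# ---------------------------------------------------------------------------
# Sketch of the compute_new_q_value() helper used by learn() above: the usual
# tabular TD update with an assumed learning rate and discount factor (the
# class's real attribute names and defaults may differ).
# ---------------------------------------------------------------------------
def compute_new_q_value_sketch(old_value, reward, next_value, alpha=0.1, gamma=0.9):
    """Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    return old_value + alpha * (reward + gamma * next_value - old_value)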
from gridworld import Gridworld

import pygame as pg
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

from vicero.algorithms.deepqlearning import DQN

scale = 24
env = Gridworld(scale, width=8, height=8)

pg.init()
screen = pg.display.set_mode(
    (scale * len(env.board[0]), scale * len(env.board)))
env.screen = screen
clock = pg.time.Clock()

"""
while True:
    env.step(env.action_space.sample())
    env.render()
"""


def plot(history):
    plt.figure(2)
    plt.clf()
    durations_t = torch.DoubleTensor(history)
    plt.title('Training...')