def test_sarsa_aggregate(self):
    def on_step_end(agent, reward, observation, done, action):
        if agent.total_step % 1000 == 0:
            print('test_sarsa_aggregate', agent.total_step)
        if done:
            print('episode terminated at', agent.total_step)

    env = gym.make('MountainCar-v0').env
    env.seed(self.seed)

    agent = rl.AgentQ(state_space=env.observation_space,
                      action_space=env.action_space,
                      discount=0.99,
                      q_fun_approx=rl.QFunctAggregate(step_size=0.3,
                                                      bins=[64, 64],
                                                      init_val=0),
                      policy=rl.PolicyEpsGreedy(expl_start=False,
                                                nb_rand_steps=0,
                                                e_rand_start=0.1,
                                                e_rand_target=0.1,
                                                e_rand_decay=1 / 10000))

    agent.register_callback('on_step_end', on_step_end)

    rl.train_agent(env=env, agent=agent, total_steps=30000)

    # This is used to test for any numerical discrepancy between runs
    fp, ws, st, act, rew, done = agent.get_fingerprint()
    print('FINGERPRINT:', fp)
    print('  weight sum:', ws)
    print('  st, act, rew, done:', st, act, rew, done)

    self.assertEqual(fp, -24059.666698709698)
    self.assertEqual(ws, -8850.374069905585)
    self.assertEqual(st, -15178.292628804113)
    self.assertEqual(act, 29967)
    self.assertEqual(rew, -29999.0)
    self.assertEqual(done, 1)
def test_sarsa_tiles(self):
    def on_step_end(agent, reward, observation, done, action):
        if agent.total_step % 1000 == 0:
            print('test_sarsa_tiles', agent.total_step)
        if done:
            print('episode terminated at', agent.total_step)

    env = gym.make('MountainCar-v0').env
    env.seed(self.seed)

    agent = rl.AgentQ(state_space=env.observation_space,
                      action_space=env.action_space,
                      discount=0.99,
                      q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                                  num_tillings=8,
                                                  init_val=0),
                      policy=rl.PolicyEpsGreedy(expl_start=False,
                                                nb_rand_steps=0,
                                                e_rand_start=1.0,
                                                e_rand_target=0.1,
                                                e_rand_decay=1 / 10000))

    agent.register_callback('on_step_end', on_step_end)

    rl.train_agent(env=env, agent=agent, total_steps=5000)

    # This is used to test for any numerical discrepancy between runs
    # (an illustrative sketch of the fingerprint idea follows below)
    fp, ws, st, act, rew, done = agent.get_fingerprint()
    print('FINGERPRINT:', fp)
    print('  weight sum:', ws)
    print('  st, act, rew, done:', st, act, rew, done)

    self.assertEqual(fp, -3667.665666738285)
    self.assertEqual(ws, -1297.1708778794816)
    self.assertEqual(st, -2430.494788858803)
    self.assertEqual(act, 5058)
    self.assertEqual(rew, -4999.0)
    self.assertEqual(done, 1)
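# Purely illustrative sketch (NOT the library's actual get_fingerprint()) of the
# idea behind the asserts above: reduce a whole training run to a few scalars and
# pin them, so any numerical drift between runs makes the test fail.
# All names below are hypothetical.
import numpy as np

def simple_fingerprint(weights, states, actions, rewards, dones):
    ws = float(np.sum(weights))        # sum of approximator weights
    st = float(np.sum(states))         # sum over all visited states
    act = int(np.sum(actions))         # sum over all taken actions
    rew = float(np.sum(rewards))       # sum of all received rewards
    done = int(np.sum(dones))          # number of episode terminations
    fp = ws + st + act + rew + done    # single combined fingerprint value
    return fp, ws, st, act, rew, done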
def main():
    env = rl.envs.Gridworld(4, 4, random_start=True)
    env.set_state(0, 3, 'terminal')
    env.set_state(3, 0, 'terminal')

    agent = rl.AgentQ(state_space=env.observation_space,
                      action_space=env.action_space,
                      discount=1.0,
                      q_fun_approx=rl.QFunctTabular(step_size=0.02, init_val=0),
                      policy=rl.PolicyEpsGreedy(expl_start=False,
                                                nb_rand_steps=0,
                                                e_rand_start=0.1,
                                                e_rand_target=0.001,
                                                e_rand_decay=0.001))

    done = True
    while True:
        if done:
            obs, rew, done = env.reset(), None, False
        else:
            obs, rew, done = env.step(act)

        agent.observe(obs, rew, done)
        agent.learn()
        act = agent.take_action(obs)

        env.render()
        print('obs:', obs)
        print('rew:', rew)
        print('done:', done)
        print('Q (mean over actions):')
        print(np.sum(agent.Q._weights, axis=1).reshape([4, 4]) / 4)
        print('===========')

        agent.next_step(done)
def main(self):
    args = rl.util.parse_common_args()
    rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

    #
    #   Environment
    #
    # The environment outputs a 3-tuple: cos(ang), sin(ang), angular velocity.
    # We translate that to a 2-tuple: angle [-pi, pi], ang. velocity [-8.0, 8.0],
    # so we can plot the 2D state space nicely.
    #
    # The environment expects a continuous 1-tuple action representing torque
    # in the range [-2.0, 2.0], but our agent outputs a categorical action 0-4,
    # so we need to translate that to torque. This is because continuous actions
    # are not implemented yet.
    # (A standalone sanity check of these translators follows this function.)
    def obs_trans(obs):
        """Translate from 3D obs space to 2D (for easier plotting)"""
        theta = np.arctan2(obs[1], obs[0])
        vel = obs[2]
        return np.array([theta, vel])

    def act_trans(act):
        """Translate from categorical actions to continuous torque"""
        torques = [-2.0, -0.5, 0.0, 0.5, 2.0]
        return np.array([torques[act]])

    self.env = rl.util.EnvTranslator(
        env=gym.make('Pendulum-v0'),
        observation_space=gym.spaces.Box(low=np.array([-np.pi, -8.0]),
                                         high=np.array([np.pi, 8.0])),
        observation_translator=obs_trans,
        action_space=gym.spaces.Discrete(5),
        action_translator=act_trans,
        reward_translator=None)
    self.env.seed(args.seed)

    #
    #   Agent
    #
    agent = rl.AgentQ(state_space=self.env.observation_space,
                      action_space=self.env.action_space,
                      discount=0.99,
                      q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                                  num_tillings=16,
                                                  init_val=0),
                      policy=rl.PolicyEpsGreedy(expl_start=False,
                                                nb_rand_steps=0,
                                                e_rand_start=0.0,
                                                e_rand_target=0.0,
                                                e_rand_decay=1 / 10000))

    #
    #   Plotting
    #
    # Need to re-think how plotting works
    if args.plot:
        fig1 = plt.figure()
        self.plotter = rl.util.Plotter(
            realtime_plotting=True,
            plot_every=1000,
            disp_len=1000,
            nb_actions=self.env.action_space.n,
            figures=(fig1, ),
            ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
            ax_qmax_im=fig1.add_subplot(2, 4, 2),
            ax_policy=fig1.add_subplot(2, 4, 3),
            ax_trajectory=fig1.add_subplot(2, 4, 4),
            ax_stats=None,
            ax_memory=None,
            ax_q_series=None,
            ax_reward=fig1.add_subplot(2, 1, 2),
        )
        self.plotter.set_state_action_spaces(
            self.env.observation_space.low,
            self.env.observation_space.high,
            h_line=0.0,
            v_line=0.0)

    #
    #   Logging
    #
    if args.logfile is not None or args.plot:
        self.logger = rl.util.Logger()

        self.logger.agent = rl.util.Log('Agent')
        self.logger.q_val = rl.util.Log('Q_Val')
        self.logger.env = rl.util.Log('Environment')
        self.logger.hist = rl.util.Log('History', 'All states visited')
        self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
        self.logger.approx = rl.util.Log('Approx', 'Approximator')
        self.logger.epsumm = rl.util.Log('Episodes')

        agent.log_episodes = self.logger.epsumm
        agent.log_hist = self.logger.hist
        agent.Q.install_logger(self.logger.q_val, log_every=1000, samples=(64, 64))

    #
    #   Callback
    #
    agent.register_callback('on_step_end', self.on_step_end)

    #
    #   Runner
    #
    try:
        rl.train_agent(env=self.env,
                       agent=agent,
                       total_steps=1000000,
                       target_avg_reward=-200)
    finally:
        if args.logfile is not None:
            self.logger.save(args.logfile)
            print('Log saved')

        if self.plotter is not None:
            plt.show()
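# Illustrative, self-contained check of the translation logic used above. It
# re-defines obs_trans/act_trans locally (the originals live inside main()), so
# these definitions are copies, not imports from the library.
import numpy as np

def obs_trans(obs):
    return np.array([np.arctan2(obs[1], obs[0]), obs[2]])

def act_trans(act):
    return np.array([[-2.0, -0.5, 0.0, 0.5, 2.0][act]])

obs3 = np.array([np.cos(0.5), np.sin(0.5), 1.0])   # cos(ang), sin(ang), ang-vel
assert np.allclose(obs_trans(obs3), [0.5, 1.0])    # angle recovered in [-pi, pi]
assert act_trans(0)[0] == -2.0                     # categorical 0 -> full negative torque
assert act_trans(4)[0] == 2.0                      # categorical 4 -> full positive torque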
def main(self):
    args = rl.util.parse_common_args()
    rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

    #
    #   Environment
    #
    # .env at the end removes the 200-step time limit (a short check follows
    # this function); see:
    # https://stackoverflow.com/questions/42787924/
    # why-is-episode-done-after-200-time-steps-gym-environment-mountaincar
    self.env = gym.make('MountainCar-v0').env
    self.env.seed(args.seed)

    test_dqn = False
    if test_dqn:
        #
        #   Model
        #
        q_model = tf.keras.models.Sequential()
        q_model.add(tf.keras.layers.Dense(256, 'relu', input_dim=2))
        q_model.add(tf.keras.layers.Dense(256, 'relu'))
        q_model.add(tf.keras.layers.Dense(3, 'linear'))
        q_model.compile(loss='mse',
                        optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

        #
        #   Agent - DQN with memory
        #
        agent = rl.AgentDQN(state_space=self.env.observation_space,
                            action_space=self.env.action_space,
                            discount=0.99,
                            start_learning_at=100000,
                            memory=rl.MemoryDQN(max_len=100000,
                                                batch_size=1024,
                                                enable_pmr=False,
                                                initial_pmr_error=1000.0),
                            q_fun_approx=rl.QFunctKeras(q_model),
                            policy=rl.PolicyEpsGreedy(expl_start=False,
                                                      nb_rand_steps=100000,
                                                      e_rand_start=1.0,
                                                      e_rand_target=0.1,
                                                      e_rand_decay=1 / 10000))
    else:
        #
        #   Agent - tiles or aggregate
        #
        agent = rl.AgentQ(
            state_space=self.env.observation_space,
            action_space=self.env.action_space,
            discount=0.99,
            q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                        num_tillings=8,
                                        init_val=0),
            # q_fun_approx=rl.QFunctAggregate(step_size=0.3,
            #                                 bins=[64, 64],
            #                                 init_val=0),
            policy=rl.PolicyEpsGreedy(expl_start=False,
                                      nb_rand_steps=0,
                                      e_rand_start=1.0,
                                      e_rand_target=0.1,
                                      e_rand_decay=1 / 10000))

    #
    #   Plotting
    #
    # Need to re-think how plotting works
    if args.plot:
        fig1 = plt.figure()
        # fig2 = plt.figure()
        self.plotter = rl.util.Plotter(
            realtime_plotting=True,
            plot_every=1000,
            disp_len=1000,
            nb_actions=self.env.action_space.n,
            figures=(fig1, ),
            ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
            ax_qmax_im=fig1.add_subplot(2, 4, 2),
            ax_policy=fig1.add_subplot(2, 4, 3),
            ax_trajectory=fig1.add_subplot(2, 4, 4),
            ax_stats=None,
            ax_memory=None,  # fig2.add_subplot(1, 1, 1),
            ax_q_series=None,
            ax_reward=fig1.add_subplot(2, 1, 2),
        )
        self.plotter.set_state_action_spaces(
            self.env.observation_space.low,
            self.env.observation_space.high,
            h_line=0.0,
            v_line=-0.5)

    #
    #   Logging
    #
    if args.logfile is not None or args.plot:
        self.logger = rl.util.Logger()

        self.logger.agent = rl.util.Log('Agent')
        self.logger.q_val = rl.util.Log('Q_Val')
        self.logger.env = rl.util.Log('Environment')
        self.logger.hist = rl.util.Log('History', 'All states visited')
        self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
        self.logger.approx = rl.util.Log('Approx', 'Approximator')
        self.logger.epsumm = rl.util.Log('Episodes')

        agent.log_episodes = self.logger.epsumm
        agent.log_hist = self.logger.hist
        if isinstance(agent, rl.AgentDQN):
            agent.memory.install_logger(self.logger.memory, log_every=1000)
        agent.Q.install_logger(self.logger.q_val, log_every=1000, samples=(64, 64))

    agent.register_callback('on_step_end', self.on_step_end)

    #
    #   Runner
    #
    try:
        rl.train_agent(env=self.env,
                       agent=agent,
                       total_steps=1000000,
                       target_avg_reward=-200)
    finally:
        if args.logfile is not None:
            self.logger.save(args.logfile)
            print('Log saved')

        if self.plotter is not None:
            plt.show()
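# Illustrative check of the `.env` unwrapping used above. This assumes the classic
# gym API, where gym.make() wraps MountainCar-v0 in a 200-step TimeLimit wrapper;
# newer gym/gymnasium versions add more wrappers, so the printed names may differ.
import gym

wrapped = gym.make('MountainCar-v0')
print(type(wrapped).__name__)        # TimeLimit      - episodes cut off after 200 steps
print(type(wrapped.env).__name__)    # MountainCarEnv - episodes end only at the goal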