Example #1: SARSA on MountainCar-v0 with a state-aggregation Q-function approximator (QFunctAggregate); the test asserts a numeric fingerprint to catch discrepancies between runs.
    def test_sarsa_aggregate(self):
        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_sarsa_aggregate', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.AgentQ(state_space=env.observation_space,
                          action_space=env.action_space,
                          discount=0.99,
                          q_fun_approx=rl.QFunctAggregate(step_size=0.3,
                                                          bins=[64, 64],
                                                          init_val=0),
                          policy=rl.PolicyEpsGreedy(expl_start=False,
                                                    nb_rand_steps=0,
                                                    e_rand_start=0.1,
                                                    e_rand_target=0.1,
                                                    e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=30000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -24059.666698709698)
        self.assertEqual(ws, -8850.374069905585)
        self.assertEqual(st, -15178.292628804113)
        self.assertEqual(act, 29967)
        self.assertEqual(rew, -29999.0)
        self.assertEqual(done, 1)
Example #2: the same SARSA test on MountainCar-v0, but with tile coding (QFunctTiles) in place of state aggregation.
    def test_sarsa_tiles(self):
        def on_step_end(agent, reward, observation, done, action):
            if agent.total_step % 1000 == 0:
                print('test_sarsa_tiles', agent.total_step)
            if done:
                print('episode terminated at', agent.total_step)

        env = gym.make('MountainCar-v0').env
        env.seed(self.seed)

        agent = rl.AgentQ(state_space=env.observation_space,
                          action_space=env.action_space,
                          discount=0.99,
                          q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                                      num_tillings=8,
                                                      init_val=0),
                          policy=rl.PolicyEpsGreedy(expl_start=False,
                                                    nb_rand_steps=0,
                                                    e_rand_start=1.0,
                                                    e_rand_target=0.1,
                                                    e_rand_decay=1 / 10000))

        agent.register_callback('on_step_end', on_step_end)

        rl.train_agent(env=env, agent=agent, total_steps=5000)

        # This is used to test for any numerical discrepancy between runs
        fp, ws, st, act, rew, done = agent.get_fingerprint()
        print('FINGERPRINT:', fp)
        print('  weight sum:', ws)
        print('  st, act, rew, done:', st, act, rew, done)

        self.assertEqual(fp, -3667.665666738285)
        self.assertEqual(ws, -1297.1708778794816)
        self.assertEqual(st, -2430.494788858803)
        self.assertEqual(act, 5058)
        self.assertEqual(rew, -4999.0)
        self.assertEqual(done, 1)
Example #3: a tabular Q-learning agent on a 4x4 Gridworld, driven by a hand-written reset/step loop that renders the grid and prints Q-values.
def main():

    env = rl.envs.Gridworld(4, 4, random_start=True)
    env.set_state(0, 3, 'terminal')
    env.set_state(3, 0, 'terminal')

    agent = rl.AgentQ(state_space=env.observation_space,
                      action_space=env.action_space,
                      discount=1.0,
                      q_fun_approx=rl.QFunctTabular(step_size=0.02,
                                                    init_val=0),
                      policy=rl.PolicyEpsGreedy(expl_start=False,
                                                nb_rand_steps=0,
                                                e_rand_start=0.1,
                                                e_rand_target=0.001,
                                                e_rand_decay=0.001))

    done = True
    while True:
        if done:
            obs, rew, done = env.reset(), None, False
        else:
            obs, rew, done = env.step(act)
        agent.observe(obs, rew, done)
        agent.learn()
        act = agent.take_action(obs)

        env.render()
        print('obs:', obs)
        print('rew:', rew)
        print('done:', done)
        print('Q (mean over actions):')
        print(np.sum(agent.Q._weights, axis=1).reshape([4, 4]) / 4)
        print('===========')

        agent.next_step(done)
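
The hand-written loop above spells out the observe/learn/act cycle that the library's runner otherwise hides. For comparison, below is a minimal sketch of the same Gridworld setup driven by rl.train_agent, the helper used in the other examples; it is an assumption that the helper accepts the Gridworld environment directly, and total_steps=10000 is an arbitrary value chosen for illustration.

# Sketch only: same agent as above, but letting rl.train_agent drive the loop.
# Assumes rl.train_agent works with rl.envs.Gridworld the same way it works
# with the gym environments in the other examples; total_steps is arbitrary.
env = rl.envs.Gridworld(4, 4, random_start=True)
env.set_state(0, 3, 'terminal')
env.set_state(3, 0, 'terminal')

agent = rl.AgentQ(state_space=env.observation_space,
                  action_space=env.action_space,
                  discount=1.0,
                  q_fun_approx=rl.QFunctTabular(step_size=0.02, init_val=0),
                  policy=rl.PolicyEpsGreedy(expl_start=False,
                                            nb_rand_steps=0,
                                            e_rand_start=0.1,
                                            e_rand_target=0.001,
                                            e_rand_decay=0.001))

rl.train_agent(env=env, agent=agent, total_steps=10000)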
Example #4: a tile-coding agent on Pendulum-v0, with observation and action translation via EnvTranslator, plus optional plotting and logging.
    def main(self):

        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        # The environment outputs a 3-tuple: cos(angle), sin(angle), angular velocity.
        # We translate that to a 2-tuple: angle [-pi, pi], angular velocity [-8.0, 8.0],
        # so we can plot the 2D observation space nicely.
        #
        # The environment expects a continuous 1-tuple action representing torque
        # in the range [-2.0, 2.0], but our agent outputs a categorical action 0-4,
        # so we need to translate that to torque. This is because continuous
        # actions are not implemented yet.
        def obs_trans(obs):
            """Translate from 3d obs space to 2d (for easier plotting)"""
            theta = np.arctan2(obs[1], obs[0])
            vel = obs[2]
            return np.array([theta, vel])

        def act_trans(act):
            """Translate from categorical actions to continous"""
            torques = [-2.0, -0.5, 0.0, 0.5, 2.0]
            return np.array([torques[act]])

        self.env = rl.util.EnvTranslator(env=gym.make('Pendulum-v0'),
                                         observation_space=gym.spaces.Box(
                                             low=np.array([-np.pi, -8.0]),
                                             high=np.array([np.pi, 8.0])),
                                         observation_translator=obs_trans,
                                         action_space=gym.spaces.Discrete(5),
                                         action_translator=act_trans,
                                         reward_translator=None)

        self.env.seed(args.seed)

        #
        #   Agent
        #
        agent = rl.AgentQ(state_space=self.env.observation_space,
                          action_space=self.env.action_space,
                          discount=0.99,
                          q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                                      num_tillings=16,
                                                      init_val=0),
                          policy=rl.PolicyEpsGreedy(expl_start=False,
                                                    nb_rand_steps=0,
                                                    e_rand_start=0.0,
                                                    e_rand_target=0.0,
                                                    e_rand_decay=1 / 10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(
                self.env.observation_space.low,
                self.env.observation_space.high,
                h_line=0.0,
                v_line=0.0)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

        #
        #   Callback
        #
        agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

        if self.plotter is not None:
            plt.show()
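
Example #4 registers self.on_step_end, but the callback body is not part of this snippet. A minimal placeholder with the callback signature used in Examples #1 and #2 could look like the sketch below; the real callback in the source presumably also updates the plotter and logger configured above.

    # Hypothetical placeholder for the callback registered above. The signature
    # mirrors the on_step_end functions shown in Examples #1 and #2.
    def on_step_end(self, agent, reward, observation, done, action):
        if agent.total_step % 1000 == 0:
            print('step', agent.total_step)
        if done:
            print('episode terminated at', agent.total_step)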
Example #5: MountainCar-v0 trained with either a Keras DQN with replay memory or a tile-coding agent, plus optional plotting and logging.
    def main(self):

        args = rl.util.parse_common_args()
        rl.util.try_freeze_random_seeds(args.seed, args.reproducible)

        #
        #   Environment
        #
        # .env at the end removes time limit, see:
        # https://stackoverflow.com/questions/42787924/
        # why-is-episode-done-after-200-time-steps-gym-environment-mountaincar
        self.env = gym.make('MountainCar-v0').env

        self.env.seed(args.seed)

        test_dqn = False
        if test_dqn:

            #
            #   Model
            #
            q_model = tf.keras.models.Sequential()
            q_model.add(tf.keras.layers.Dense(256, 'relu', input_dim=2))
            q_model.add(tf.keras.layers.Dense(256, 'relu'))
            q_model.add(tf.keras.layers.Dense(3, 'linear'))
            q_model.compile(loss='mse',
                            optimizer=tf.keras.optimizers.RMSprop(lr=0.00025))

            #
            #   Agent - DQN with memory
            #
            agent = rl.AgentDQN(state_space=self.env.observation_space,
                                action_space=self.env.action_space,
                                discount=0.99,
                                start_learning_at=100000,
                                memory=rl.MemoryDQN(max_len=100000,
                                                    batch_size=1024,
                                                    enable_pmr=False,
                                                    initial_pmr_error=1000.0),
                                q_fun_approx=rl.QFunctKeras(q_model),
                                policy=rl.PolicyEpsGreedy(expl_start=False,
                                                          nb_rand_steps=100000,
                                                          e_rand_start=1.0,
                                                          e_rand_target=0.1,
                                                          e_rand_decay=1 /
                                                          10000))

        else:

            #
            #   Agent - tiles or aggregate
            #
            agent = rl.AgentQ(
                state_space=self.env.observation_space,
                action_space=self.env.action_space,
                discount=0.99,
                q_fun_approx=rl.QFunctTiles(step_size=0.3,
                                            num_tillings=8,
                                            init_val=0),
                # q_fun_approx=rl.QFunctAggregate(
                #     step_size=0.3,
                #     bins=[64, 64],
                #     init_val=0),
                policy=rl.PolicyEpsGreedy(expl_start=False,
                                          nb_rand_steps=0,
                                          e_rand_start=1.0,
                                          e_rand_target=0.1,
                                          e_rand_decay=1 / 10000))

        #
        #   Plotting
        #
        # Need to re-think how plotting works
        if args.plot:
            fig1 = plt.figure()
            #fig2 = plt.figure()
            self.plotter = rl.util.Plotter(
                realtime_plotting=True,
                plot_every=1000,
                disp_len=1000,
                nb_actions=self.env.action_space.n,
                figures=(fig1, ),
                ax_qmax_wf=fig1.add_subplot(2, 4, 1, projection='3d'),
                ax_qmax_im=fig1.add_subplot(2, 4, 2),
                ax_policy=fig1.add_subplot(2, 4, 3),
                ax_trajectory=fig1.add_subplot(2, 4, 4),
                ax_stats=None,
                ax_memory=None,  #fig2.add_subplot(1,1,1),
                ax_q_series=None,
                ax_reward=fig1.add_subplot(2, 1, 2),
            )
            self.plotter.set_state_action_spaces(
                self.env.observation_space.low,
                self.env.observation_space.high,
                h_line=0.0,
                v_line=-0.5)

        #
        #   Logging
        #
        if args.logfile is not None or args.plot:
            self.logger = rl.util.Logger()

            self.logger.agent = rl.util.Log('Agent')
            self.logger.q_val = rl.util.Log('Q_Val')
            self.logger.env = rl.util.Log('Environment')
            self.logger.hist = rl.util.Log('History', 'All states visited')
            self.logger.memory = rl.util.Log('Memory', 'Full memory dump')
            self.logger.approx = rl.util.Log('Approx', 'Approximator')
            self.logger.epsumm = rl.util.Log('Episodes')

            agent.log_episodes = self.logger.epsumm
            agent.log_hist = self.logger.hist
            if isinstance(agent, rl.AgentDQN):
                agent.memory.install_logger(self.logger.memory, log_every=1000)
            agent.Q.install_logger(self.logger.q_val,
                                   log_every=1000,
                                   samples=(64, 64))

            agent.register_callback('on_step_end', self.on_step_end)

        #
        #   Runner
        #
        try:
            rl.train_agent(env=self.env,
                           agent=agent,
                           total_steps=1000000,
                           target_avg_reward=-200)
        finally:
            if args.logfile is not None:
                self.logger.save(args.logfile)
                print('Log saved')

        if self.plotter is not None:
            plt.show()