Example #1
    def run_step(
            self,
            t: int,
            agent: Agent,
            monitor: Monitor
    ) -> bool:
        """
        Run a step of the environment with an agent.

        :param t: Step.
        :param agent: Agent.
        :param monitor: Monitor.
        :return: True if a terminal state was entered and the run should terminate, and False otherwise.
        """

        # let the agent select an action for the current step
        a = agent.act(t=t)

        # advance the environment with the agent's action, obtaining the next state and reward
        self.state, next_reward = self.advance(
            state=self.state,
            t=t,
            a=a,
            agent=agent
        )

        # let the agent sense the new state and receive the reward, and report the reward to the monitor
        agent.sense(
            state=self.state,
            t=t+1
        )

        agent.reward(next_reward.r)

        monitor.report(t=t+1, action_reward=next_reward.r)

        return self.state.terminal
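
For context, run_step is the per-step body of the environment's run loop (Example #5 below calls environment.run(agent=agent, monitor=monitor)). The following is only a hypothetical sketch of what that outer loop might look like, assuming an optional horizon self.T as in Example #2; it is not the library's actual code.

    def run(self, agent: Agent, monitor: Monitor):
        """
        Hypothetical sketch of the outer run loop:  step until run_step reports
        a terminal state or the optional horizon self.T is reached.
        """

        t = 0
        while self.T is None or t < self.T:
            if self.run_step(t=t, agent=agent, monitor=monitor):
                break
            t += 1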
Example #2
def test_run():

    random_state = RandomState(12345)

    # create the gambler's-problem MDP environment
    mdp_environment: GamblersProblem = GamblersProblem(
        'gamblers problem', random_state=random_state, T=None, p_h=0.4)

    # create a tabular action-value agent for the environment
    agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))

    # run the environment with the agent, collecting results in a monitor
    monitor = Monitor()
    state = mdp_environment.reset_for_new_run(agent)
    agent.reset_for_new_run(state)
    mdp_environment.run(agent, monitor)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_run.pickle', 'wb') as file:
    #     pickle.dump(monitor, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_run.pickle',
              'rb') as file:
        fixture = pickle.load(file)

    assert monitor.t_average_reward == fixture.t_average_reward
Example #3
def test_monitor():

    T = 100

    monitor = Monitor()

    rng = RandomState(12345)

    actions = rng.randint(0, 2, T)
    optimal_actions = rng.randint(0, 2, T)
    rewards = rng.random(T)

    for t in range(T):

        monitor.report(
            t=t,
            agent_action=actions[t],
            optimal_action=optimal_actions[t],
            action_reward=rewards[t]
        )

    # with a single run, the per-step optimal-action count is 1 if and only if
    # the agent's action matched the optimal action at that step
    assert np.array_equal(
        [
            monitor.t_count_optimal_action[t]
            for t in sorted(monitor.t_count_optimal_action)
        ],
        [
            1 if action == optimal else 0
            for action, optimal in zip(actions, optimal_actions)
        ]
    )

    # with a single run, the average cumulative reward at each step equals the
    # running sum of the rewards reported so far
    assert np.array_equal(
        [
            monitor.t_average_cumulative_reward[t].get_value()
            for t in sorted(monitor.t_average_cumulative_reward)
        ],
        np.cumsum(rewards)
    )
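
Pieced together from the calls shown above and in Example #5, a minimal standalone use of Monitor might look like the following sketch. The import location of Monitor is not shown in these examples, so only the calls that appear in them are used here.

from numpy.random import RandomState

monitor = Monitor()
rng = RandomState(12345)

# report a reward at each of 10 steps
for t in range(10):
    monitor.report(t=t, action_reward=float(rng.random_sample()))

# per-step average reward and average cumulative reward, keyed by step
average_rewards = [
    monitor.t_average_reward[t].get_value()
    for t in sorted(monitor.t_average_reward)
]
cumulative_rewards = [
    monitor.t_average_cumulative_reward[t].get_value()
    for t in sorted(monitor.t_average_cumulative_reward)
]
print(average_rewards, cumulative_rewards)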
Example #4
    def run_step(self, t: int, agent: Agent, monitor: Monitor) -> bool:
        """
        Run a step of the environment with an agent.

        :param t: Step.
        :param agent: Agent.
        :param monitor: Monitor.
        :return: True if a terminal state was entered and the run should terminate, and False otherwise.
        """

        # with probability reset_probability, reset the environment for a new run
        if self.random_state.random_sample() < self.reset_probability:
            self.reset_for_new_run(agent)

        # let the agent act, and report the action alongside the optimal action
        action = agent.act(t=t)
        monitor.report(
            t=t,
            agent_action=action,
            optimal_action=Action(self.best_arm.i)
        )

        # pull the selected arm, and report the resulting reward
        reward = self.pull(action.i)
        monitor.report(t=t, action_reward=reward)

        agent.reward(reward)

        return False
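
The reset check at the top of this method means the environment is reset, on average, once every 1/reset_probability steps, presumably re-randomizing the arms and making the bandit nonstationary. The standalone sketch below (independent of the environment class) simply confirms that per-step reset rate.

from numpy.random import RandomState

reset_probability = 0.01
random_state = RandomState(12345)

# count how many of the simulated steps would trigger a reset
steps = 100000
resets = sum(
    1 for _ in range(steps)
    if random_state.random_sample() < reset_probability
)

# the empirical reset rate should be close to reset_probability
print(resets / steps)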
Example #5
def run(args: List[str]) -> List[Monitor]:
    """
    Run an agent within an environment.

    :param args: Arguments.
    :return: List of run monitors.
    """

    parser = get_argument_parser_for_run()

    parsed_args, unparsed_args = parser.parse_known_args(args)

    # set logging level
    if parsed_args.log is not None:
        logging.getLogger().setLevel(parsed_args.log)
    del parsed_args.log

    if parsed_args.random_seed is None:
        warnings.warn(
            'No random seed provided to the trainer. Results will not be replicable. Consider passing --random-seed argument.'
        )
        random_state = RandomState()
    else:
        random_state = RandomState(parsed_args.random_seed)

    # init environment
    environment_class = load_class(parsed_args.environment)
    environment, unparsed_args = environment_class.init_from_arguments(
        args=unparsed_args, random_state=random_state)

    # init agent from file if it's a path
    if os.path.exists(os.path.expanduser(parsed_args.agent)):
        with open(os.path.expanduser(parsed_args.agent), 'rb') as f:
            agents = [pickle.load(f)]

    # otherwise, parse arguments for agent.
    else:
        agent_class = load_class(parsed_args.agent)
        agents, unparsed_args = agent_class.init_from_arguments(
            args=unparsed_args,
            random_state=random_state,
            pi=None  # there can't be a policy in this case, as policies only come from prior training/pickling.
        )

    # no unparsed arguments should remain
    if len(unparsed_args) > 0:
        raise ValueError(f'Unparsed arguments remain:  {unparsed_args}')

    # set up plotting
    pdf = None
    reward_ax = cum_reward_ax = optimal_action_ax = None
    if parsed_args.plot:

        if parsed_args.pdf_save_path:
            pdf = PdfPages(parsed_args.pdf_save_path)

        _, axs = plt.subplots(2, 1, sharex='all', figsize=(6, 9))

        reward_ax = axs[0]
        cum_reward_ax = reward_ax.twinx()
        optimal_action_ax = axs[1]

    # run each agent in the environment
    monitors = []
    for agent in agents:

        logging.info(f'Running {agent} agent in {environment} environment.')

        # manually set the environment on continuous action policies, as they require a reference but do not pickle it.
        if hasattr(agent, 'pi') and isinstance(agent.pi, ContinuousActionPolicy):
            agent.pi.environment = environment

        monitor = Monitor()
        monitors.append(monitor)

        num_runs_per_print = math.ceil(parsed_args.n_runs * 0.05)
        for r in range(parsed_args.n_runs):

            state = environment.reset_for_new_run(agent)
            agent.reset_for_new_run(state)
            monitor.reset_for_new_run()

            environment.run(agent=agent, monitor=monitor)

            num_runs_finished = r + 1
            if (num_runs_finished % num_runs_per_print) == 0:
                percent_done = 100 * (num_runs_finished / parsed_args.n_runs)
                logging.info(
                    f'{percent_done:.0f}% complete (finished {num_runs_finished} of {parsed_args.n_runs} runs).'
                )

        if parsed_args.plot:

            reward_ax.plot(
                [
                    monitor.t_average_reward[t].get_value()
                    for t in sorted(monitor.t_average_reward)
                ],
                linewidth=1,
                label=agent.name
            )

            cum_reward_ax.plot(
                [
                    monitor.t_average_cumulative_reward[t].get_value()
                    for t in sorted(monitor.t_average_cumulative_reward)
                ],
                linewidth=1,
                linestyle='--',
                label=agent.name
            )

            optimal_action_ax.plot(
                [
                    monitor.t_count_optimal_action[t] / parsed_args.n_runs
                    for t in sorted(monitor.t_count_optimal_action)
                ],
                linewidth=1,
                label=agent.name
            )

    # finish plotting
    if parsed_args.plot:

        if parsed_args.figure_name is not None:
            reward_ax.set_title(parsed_args.figure_name)

        reward_ax.set_xlabel('Time step')
        reward_ax.set_ylabel(
            f'Per-step reward (averaged over {parsed_args.n_runs} run(s))')
        reward_ax.grid()
        reward_ax.legend()
        cum_reward_ax.set_ylabel(
            f'Cumulative reward (averaged over {parsed_args.n_runs} run(s))')
        cum_reward_ax.legend(loc='lower right')

        optimal_action_ax.set_xlabel('Time step')
        optimal_action_ax.set_ylabel('% optimal action selected')
        optimal_action_ax.grid()
        optimal_action_ax.legend()

        plt.tight_layout()

        if pdf is None:
            plt.show(block=False)
        else:
            pdf.savefig()
            pdf.close()

    return monitors
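
Since run takes a flat list of arguments, it can be driven programmatically as well as from the command line. The sketch below is only illustrative:  apart from --random-seed (mentioned in the warning above), the flag names are assumptions derived from the parsed_args attributes, and the environment/agent class paths are placeholders rather than real classes.

monitors = run([
    '--random-seed', '12345',                     # confirmed by the warning message above
    '--environment', 'my_package.MyEnvironment',  # placeholder fully-qualified class name
    '--agent', 'my_package.MyAgent',              # placeholder fully-qualified class name
    '--n-runs', '100',                            # assumed flag name for parsed_args.n_runs
    '--plot'                                      # assumed flag name for parsed_args.plot
])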