Example #1
def train_agent(agent, env, steps, outdir, max_episode_len=None,
                step_offset=0, evaluator=None, successful_score=None,
                step_hooks=[], logger=None):

    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:

            # a_t
            action = agent.act_and_train(obs, r)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            if done or episode_len == max_episode_len or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                logger.info('outdir:%s step:%s episode:%s R:%s',
                            outdir, t, episode_idx, episode_r)
                logger.info('statistics:%s', agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(
                        t=t, episodes=episode_idx + 1)
                    if (successful_score is not None and
                            evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
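For reference, a minimal usage sketch of the loop above, assuming the classic Gym step API (obs, reward, done, info) and stand-in RandomAgent / save_agent helpers (both hypothetical; the real save_agent comes from chainerrl):

import logging
import os

import gym  # assumes the classic Gym API: step() -> (obs, reward, done, info)


class RandomAgent:
    """Minimal stand-in exposing the interface train_agent expects."""

    def __init__(self, action_space):
        self.action_space = action_space
        self.t = 0

    def act_and_train(self, obs, reward):
        return self.action_space.sample()

    def stop_episode_and_train(self, obs, reward, done=False):
        pass

    def get_statistics(self):
        return []

    def save(self, dirname):
        pass  # a real agent would serialize its model here


def save_agent(agent, t, outdir, logger, suffix=''):
    """Stand-in for the save_agent helper used by train_agent."""
    dirname = os.path.join(outdir, '{}{}'.format(t, suffix))
    os.makedirs(dirname, exist_ok=True)
    agent.save(dirname)
    logger.info('Saved the agent to %s', dirname)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    env = gym.make('CartPole-v0')
    agent = RandomAgent(env.action_space)
    train_agent(agent, env, steps=1000, outdir='results', max_episode_len=200)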
Example #2
def train_agent(agent,
                env,
                steps,
                outdir,
                max_episode_len=None,
                step_offset=0,
                evaluator=None,
                successful_score=None):

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0
    done = False

    t = step_offset
    agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:

            # a_t
            action = agent.act_and_train(obs, r)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            if done or episode_len == max_episode_len or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                print('outdir:{} step:{} episode:{} R:{}'.format(
                    outdir, t, episode_idx, episode_r))
                print('statistics:{}'.format(agent.get_statistics()))
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(t)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
                done = False

    except Exception:
        # Save the current model before being killed
        save_agent(agent, t, outdir, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, suffix='_finish')
Example #3
    def sending_mission_quit_commands(self, overall_reward_agent_Tom,
                                      overall_reward_agent_Jerry, time_step,
                                      obs1, r1, obs2, r2, outdir, t, tom,
                                      jerry, experiment_ID):

        self.agent_host1.sendCommand("quit")
        self.agent_host2.sendCommand("quit")
        self.agent_host3.sendCommand("quit")

        dirname = os.path.join(outdir, 'plots')
        print("dirname: ", dirname)
        """ save and show results of reward calculations """
        self.save_results(overall_reward_agent_Tom, overall_reward_agent_Jerry,
                          time_step)
        print("Final Reward Tom:   ", overall_reward_agent_Tom)
        print("Final Reward Jerry: ", overall_reward_agent_Jerry)
        """ end episode, save results """
        tom.stop_episode_and_train(obs1, r1, done=True)
        jerry.stop_episode_and_train(obs2, r2, done=True)
        print("outdir: %s step: %s " % (outdir, t))
        print("Tom's statistics:   ", tom.get_statistics())
        print("Jerry's statistics: ", jerry.get_statistics())
        """ save the final model and results """
        save_agent(tom, t, outdir, logger, suffix='_finish_01')
        save_agent(jerry, t, outdir, logger, suffix='_finish_02')
        """ save all the collected data for evaluation graphs """
        self.save_data_for_evaluation_plots(t, time_step,
                                            overall_reward_agent_Tom,
                                            overall_reward_agent_Jerry,
                                            dirname)
        time.sleep(2)
        """ initialisation for the next episode, reset parameters, build new world """
        t += 1
        self.episode_counter += 1
        r1 = r2 = 0
        done1 = done2 = self.mission_end = False
        overall_reward_agent_Jerry = overall_reward_agent_Tom = 0
        self.save_new_round(t)
        obs1, obs2 = self.reset_world(experiment_ID)
        self.too_close_counter = 0
        self.winner_agent = "-"
        self.time_step_tom_won = self.time_step_jerry_won = None
        self.time_step_tom_captured_the_flag = self.time_step_jerry_captured_the_flag = None
        self.time_step_agents_ran_into_each_other = None
        self.steps_tom = 0
        self.steps_jerry = 0
        """ recover """
        """if evaluator1 and evaluator2 is not None:
            evaluator1.evaluate_if_necessary(
                t=t, episodes=episode_idx + 1)
            evaluator2.evaluate_if_necessary(
                t=t, episodes=episode_idx + 1)
            if (successful_score is not None and
                    evaluator1.max_score >= successful_score and evaluator2.max_score >= successful_score):
                break"""
        return t, obs1, obs2, r1, r2, done1, done2, overall_reward_agent_Jerry, overall_reward_agent_Tom
Example #4
def train_agent_batch(agent,
                      env,
                      steps,
                      outdir,
                      log_interval=None,
                      max_episode_len=None,
                      eval_interval=None,
                      step_offset=0,
                      evaluator=None,
                      successful_score=None,
                      step_hooks=[],
                      return_window_size=100,
                      logger=None):
    """Train an agent in a batch environment.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to estimate
            the average returns of the current agent.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value. Ignored if None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
    """

    logger = logger or logging.getLogger(__name__)
    recent_returns = deque(maxlen=return_window_size)

    num_envs = env.num_envs
    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype='i')
    episode_len = np.zeros(num_envs, dtype='i')

    # o_0, r_0
    obss = env.reset()
    rs = np.zeros(num_envs, dtype='f')

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    try:
        while True:
            # a_t
            actions = agent.batch_act_and_train(obss)
            # o_{t+1}, r_{t+1}
            obss, rs, dones, infos = env.step(actions)
            episode_r += rs
            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = (episode_len == max_episode_len)
            resets = np.logical_or(
                resets, [info.get('needs_reset', False) for info in infos])
            # Agent observes the consequences
            agent.batch_observe_and_train(obss, rs, dones, resets)

            # Make mask. 0 if done/reset, 1 if pass
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            # 3-5 are skipped when training is already finished.
            episode_idx += end
            recent_returns.extend(episode_r[end])

            for _ in range(num_envs):
                t += 1
                for hook in step_hooks:
                    hook(env, agent, t)

            if (log_interval is not None and t >= log_interval
                    and t % log_interval < num_envs):
                logger.info(
                    'outdir:{} step:{} episode:{} last_R: {} average_R:{}'.
                    format(  # NOQA
                        outdir,
                        t,
                        np.sum(episode_idx),
                        recent_returns[-1] if recent_returns else np.nan,
                        np.mean(recent_returns) if recent_returns else np.nan,
                    ))
                logger.info('statistics: {}'.format(agent.get_statistics()))
            if evaluator:
                if evaluator.evaluate_if_necessary(
                        t=t, episodes=np.sum(episode_idx)):
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break

            if t >= steps:
                break

            # Start new episodes if needed
            episode_r[end] = 0
            episode_len[end] = 0
            obss = env.reset(not_end)

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        env.close()
        if evaluator:
            evaluator.env.close()
        raise
    else:
        # Save the final model
        save_agent(agent, t, outdir, logger, suffix='_finish')
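The step_hooks argument documented above takes plain callables with signature (env, agent, step). A minimal sketch of such a hook; the agent.optimizer.lr attribute is an assumption, so adapt the setter to whatever your agent actually exposes:

class LinearLRDecayHook:
    """Step hook that linearly anneals a (hypothetical) agent.optimizer.lr."""

    def __init__(self, total_steps, start_lr, end_lr=0.0):
        self.total_steps = total_steps
        self.start_lr = start_lr
        self.end_lr = end_lr

    def __call__(self, env, agent, step):
        frac = min(step / float(self.total_steps), 1.0)
        agent.optimizer.lr = (1.0 - frac) * self.start_lr + frac * self.end_lr


# e.g. train_agent_batch(..., step_hooks=[LinearLRDecayHook(10 ** 6, 3e-4)])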
Example #5
def train_agent(agent,
                env,
                steps,
                outdir,
                max_episode_len=None,
                step_offset=0,
                evaluator=None,
                successful_score=None,
                step_hooks=[],
                episode_hooks=[],
                logger=None):
    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    last_obs = []
    r = 0
    done = False

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:

            # a_t
            action = agent.act_and_train(obs, r)
            last_obs = copy.copy(obs)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            # 'path' is assumed to be a log-file path defined elsewhere in the module
            with open(path, 'a+') as f:
                f.write("episode {} step {}: action is {}\n".format(
                    episode_idx, episode_len, action))
                f.write("obs is {}\n".format(obs - last_obs))

            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            if done or episode_len == max_episode_len or t == steps:
                with open(path, 'a+') as f:
                    f.write("done\n")
                agent.stop_episode_and_train(obs, r, done=done)
                logger.info('outdir:%s step:%s episode:%s R:%s', outdir, t,
                            episode_idx, episode_r)
                logger.info('statistics:%s', agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(t=t,
                                                    episodes=episode_idx + 1)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                for hook in episode_hooks:
                    hook(env, agent, episode_idx, episode_len)
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
                done = False

    except Exception:
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
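This variant additionally runs episode_hooks, invoked as hook(env, agent, episode_idx, episode_len) when an episode finishes. A minimal sketch of such a hook; the CSV filename is hypothetical:

import csv


def log_episode_length(env, agent, episode_idx, episode_len):
    """Append the finished episode's index and length to a CSV file."""
    with open('episode_lengths.csv', 'a', newline='') as f:
        csv.writer(f).writerow([episode_idx, episode_len])


# e.g. train_agent(..., episode_hooks=[log_episode_length])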
Example #6
def parallel_train_agent_batch(start_weighted_size,
                               all_agents,
                               env,
                               steps,
                               outdir,
                               log_interval=None,
                               max_episode_len=None,
                               eval_interval=None,
                               step_offset=0,
                               evaluator=None,
                               before_evaluator=None,
                               successful_score=None,
                               step_hooks=[],
                               return_window_size=100,
                               logger=None,
                               step_callback=None,
                               schedule_args={}):
    """Train an agent in a batch environment.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to estimate
            the average returns of the current agent.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value. Ignored if None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
    """

    logger = logger or logging.getLogger(__name__)

    # TODO: set a buffer for recent returns
    n_agents = len(all_agents)
    select_probs = np.array([1 / n_agents] * n_agents)
    agent_ids = [id(agent) for agent in all_agents]
    assert len(np.unique(agent_ids)) == n_agents

    all_recent_returns = [
        deque(maxlen=return_window_size) for i in range(len(all_agents))
    ]
    all_eval_returns = [0 for i in range(len(all_agents))]
    num_envs = env.num_envs
    assert num_envs == 1

    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype='i')
    episode_len = np.zeros(num_envs, dtype='i')

    # o_0, r_0
    obss = env.reset()
    rs = np.zeros(num_envs, dtype='f')

    t = step_offset

    # TODO: initialize an agent index for the first episode (initially we use uniform distribution)
    perf_metric = schedule_args['perf_metric']
    selected_agent_idx = np.random.randint(low=0, high=len(all_agents))
    logger.info('selected_idx: {}'.format(selected_agent_idx))
    try:
        while True:
            # TODO: take the actions from the selected agent
            agent = all_agents[selected_agent_idx]
            # a_t
            actions = agent.batch_act_and_train(obss)
            # HACK: to set the other agents' batch_last_obs as []
            for idx in range(n_agents):
                if idx != selected_agent_idx:
                    all_agents[idx].batch_last_obs = [None] * num_envs

            # o_{t+1}, r_{t+1}
            obss_next, rs, dones, infos = env.step(actions)
            obss = obss_next
            episode_r += rs
            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = (episode_len == max_episode_len)
            resets = np.logical_or(
                resets, [info.get('needs_reset', False) for info in infos])

            # Agent observes the consequences
            for idx, agent_in_list in enumerate(all_agents):
                # Only the selected agent adds to its replay buffer; the others just update.
                agent_in_list.batch_observe_and_train(obss, rs, dones, resets)
                logger.debug('agent_{}: t = {}, selected_idx = {}'.format(
                    idx, agent_in_list.t, selected_agent_idx))
            after_ts = [agent_in_list.t for agent_in_list in all_agents]
            assert len(np.unique(after_ts)) == 1

            # Make mask. 0 if done/reset, 1 if pass
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            if before_evaluator:
                before_max_mean, before_all_means = before_evaluator.evaluate_if_necessary(
                    t=t + 1, episodes=np.sum(episode_idx))

            if step_callback is not None:
                new_select_probs = step_callback(t, all_recent_returns,
                                                 all_eval_returns, np.any(end))
                if new_select_probs is not None and schedule_args[
                        'select_prob_update'] == 'interval':
                    select_probs = new_select_probs

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            # 3-5 are skipped when training is already finished.
            episode_idx += end

            # TODO: append to the selected returns
            all_recent_returns[selected_agent_idx].extend(episode_r[end])

            for _ in range(num_envs):
                t += 1
                for hook in step_hooks:
                    hook(env, agent, t)

            if (log_interval is not None and t >= log_interval
                    and t % log_interval < num_envs):
                logger.info(
                    'outdir:{} agent_idx: {} step:{} episode:{} last_R: {} average_R:{}'
                    .format(  # NOQA
                        outdir,
                        selected_agent_idx,
                        t,
                        np.sum(episode_idx),
                        all_recent_returns[selected_agent_idx][-1]
                        if all_recent_returns[selected_agent_idx] else np.nan,
                        np.mean(all_recent_returns[selected_agent_idx])
                        if all_recent_returns[selected_agent_idx] else np.nan,
                    ))
                logger.info('statistics: {}'.format(agent.get_statistics()))
            if evaluator:
                max_mean, all_means = evaluator.evaluate_if_necessary(
                    t=t, episodes=np.sum(episode_idx))
                all_eval_returns = all_means
                if max_mean:
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break

            if t >= steps:
                break

            # Start new episodes if needed
            episode_r[end] = 0
            episode_len[end] = 0
            obss = env.reset(not_end)

            # TODO: update the selected agent index
            if np.any(end):
                logger.debug('selected_agent.last_obs: {}'.format(
                    agent.batch_last_obs))
                #TODO: weighted selection, uniform, or eps-greedy
                if t < start_weighted_size or schedule_args[
                        'select_algo'] == 'uniform':
                    # still in warm-up: no evaluation results to rank agents yet
                    if t < start_weighted_size:
                        all_mean_returns = '(warm up)'
                    else:
                        if perf_metric == 'train':
                            all_mean_returns = np.asarray([
                                np.mean(recent_returns)
                                for recent_returns in all_recent_returns
                            ])
                        else:
                            all_mean_returns = np.asarray(all_eval_returns)
                    logger.info(
                        't: {} (new agent idx) selection prob.: (uniform), mean returns: {}'
                        .format(t, all_mean_returns))
                    selected_agent_idx = np.random.randint(
                        low=0, high=len(all_agents))
                else:
                    if perf_metric == 'train':
                        all_mean_returns = np.asarray([
                            np.mean(recent_returns)
                            for recent_returns in all_recent_returns
                        ])
                    else:
                        all_mean_returns = np.asarray(all_eval_returns)

                    # determine sample or greedy using eps or algo
                    if schedule_args['select_algo'] == 'eps-greedy':
                        eps = schedule_args['eps_schedule'].value(t)
                        use_sample = (np.random.random() < eps)
                        logger.info(
                            't: {}, eps: {}, sample: {}, mean returns: {}'.
                            format(t, eps, use_sample, all_mean_returns))
                        selected_agent_idx = np.random.choice(
                            n_agents, 1)[0] if use_sample else np.argmax(
                                all_mean_returns)
                    else:
                        temp = 1.0
                        if 'select_prob_temp' in schedule_args:
                            temp = schedule_args['select_prob_temp']
                            logger.info('use temperature: {}'.format(
                                schedule_args['select_prob_temp']))
                        if schedule_args['select_prob_update'] == 'episode':
                            select_probs = softmax(all_mean_returns / temp)
                        else:
                            logger.info('Use periodically update')
                        logger.info(
                            't: {} (new agent idx) selection prob.: {}, mean returns: {}'
                            .format(t, select_probs, all_mean_returns))
                        selected_agent_idx = np.random.choice(
                            n_agents, 1, p=select_probs)[0]

                logger.info('t: {} selected_idx: {}'.format(
                    t, selected_agent_idx))

            # TODO: Call step callbacks

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        for idx, agent in enumerate(all_agents):
            save_agent(agent,
                       t,
                       outdir,
                       logger,
                       suffix='_except_{}'.format(idx))
        env.close()
        if evaluator:
            evaluator.env.close()
        raise
    else:
        # Save the final model
        for idx, agent in enumerate(all_agents):
            save_agent(agent,
                       t,
                       outdir,
                       logger,
                       suffix='_finish_{}'.format(idx))
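When select_prob_update is 'episode', the loop above converts per-agent mean returns into selection probabilities with a temperature-scaled softmax. A self-contained sketch of that computation; the softmax helper is a stand-in for whatever the snippet imports:

import numpy as np


def softmax(x):
    """Numerically stable softmax over a 1-D array."""
    z = np.asarray(x, dtype=np.float64)
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()


mean_returns = np.array([1.0, 2.0, 3.0])  # e.g. recent average return per agent
temp = 0.5                                # lower temperature -> greedier selection
select_probs = softmax(mean_returns / temp)
selected_agent_idx = np.random.choice(len(mean_returns), p=select_probs)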
Example #7
def train_agent_batch(agent,
                      env,
                      steps,
                      outdir,
                      log_interval=None,
                      max_episode_len=None,
                      eval_interval=None,
                      step_offset=0,
                      evaluator=None,
                      successful_score=None,
                      step_hooks=[],
                      return_window_size=100,
                      logger=None,
                      use_humans_reward=False,
                      humans_reward_interval=2048):
    """Train an agent in a batch environment.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to estimate
            the average returns of the current agent.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value. Ignored if None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
        use_humans_reward (bool): Whether to use the reward derived from human sessions.
        humans_reward_interval (int): Number of episodes between re-evaluations
            of agent performance.
    """
    env_prop = update_global_env_prop_from_cfg()
    schema_name = SchemaName(cfg.schema)

    agent_humanity_rate = -1
    log_idx = 0
    logger = logger or logging.getLogger(__name__)

    arch = ArchName(cfg.arch)

    recent_returns = deque(maxlen=return_window_size)
    recent_returns_without_humanity = deque(maxlen=return_window_size)
    recent_rewards = defaultdict(lambda: deque(maxlen=return_window_size))

    # a deque that stores the entropy values of the action distributions seen during training
    entropy_values = deque(maxlen=100)

    # a counter for the number of actions of each type
    action_types_cntr = {"back": 0, "filter": 0, "group": 0}

    # a counter where key is a function index (numerical ID) and value
    # is number of occurrences
    actions_cntr = defaultdict(int)

    agent_humanity_info = {}

    num_envs = env.num_envs
    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_r_without_humanity = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype='i')
    episode_len = np.zeros(num_envs, dtype='i')

    # o_0, r_0
    obss = env.reset()
    rs = np.zeros(num_envs, dtype='f')

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    try:
        while t < steps:

            # a_t
            #actions = agent.batch_act_and_train(obss)
            actions, actions_distrib = batch_act_and_train(agent, obss)

            # trace entropy_values
            entropy_values.append(actions_distrib.entropy.data[0])

            # o_{t+1}, r_{t+1}
            obss, rs, dones, infos = env.step(actions)

            # Update actions of agent to actual actions made in environment
            # This step is being done because we change illegal filter actions
            # to legal actions inside the environment
            # if arch is ArchName.FF_PARAM_SOFTMAX:
            #     for info_idx, info in enumerate(infos):
            #         actions[info_idx] = info["actual_parametric_softmax_idx"]

            # add action types to counter
            action_types = []
            for action in actions:
                if arch is ArchName.FF_GAUSSIAN:
                    action = env_prop.compressed2full_range(action)
                elif arch is ArchName.FF_PARAM_SOFTMAX or arch is ArchName.FF_SOFTMAX:
                    actions_cntr[action] += 1
                    action = env_prop.static_param_softmax_idx_to_action_type(
                        action)
                action_disc = ATENAEnvCont.cont2dis(action)
                action_type = ATENAUtils.OPERATOR_TYPE_LOOKUP.get(
                    action_disc[0])
                action_types_cntr[action_type] += 1
                action_types.append(action_type)

            episode_r_without_humanity += rs

            # save rewards for logging purposes
            for i, info in enumerate(infos):
                action_type = action_types[i]
                step_reward_info = info["reward_info"]
                for reward_type, value in step_reward_info.items():
                    if (value != 0 or reward_type in {
                            "back", "same_display_seen_already",
                            "empty_display", "empty_groupings", "humanity"
                    } or (action_type == "group"
                          and reward_type in {"compaction_gain", "diversity"})
                            or
                        (action_type == "filter"
                         and reward_type in {"kl_distance", "diversity"})):
                        recent_rewards[reward_type].append(value)

            # add rewards to rewards of episode
            episode_r += rs

            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = (episode_len == max_episode_len)
            # Agent observes the consequences
            agent.batch_observe_and_train(obss, rs, dones, resets)

            # Make mask. 0 if done/reset, 1 if pass
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            episode_idx += end
            recent_returns.extend(episode_r[end])
            recent_returns_without_humanity.extend(
                episode_r_without_humanity[end])
            episode_r[end] = 0
            episode_r_without_humanity[end] = 0
            episode_len[end] = 0

            obss = env.reset(not_end)

            for _ in range(num_envs):
                t += 1
                for hook in step_hooks:
                    hook(env, agent, t)

            # log and save using Tensorboard
            if (log_interval is not None and t >= log_interval
                    and t % log_interval < num_envs):
                logger.info(
                    'outdir:{} step:{} episode:{} last_R: {} average_R:{}'.
                    format(  # NOQA
                        outdir,
                        t,
                        np.sum(episode_idx),
                        recent_returns[-1] if recent_returns else np.nan,
                        np.mean(recent_returns) if recent_returns else np.nan,
                    ))
                summary_writer.add_scalar('episode_reward',
                                          np.mean(recent_returns), t)
                summary_writer.add_scalar(
                    'episode_r_without_humanity',
                    np.mean(recent_returns_without_humanity), t)
                summary_writer.add_scalar('agent_humanity_rate',
                                          agent_humanity_rate, t)
                for reward_type, reward_vals in recent_rewards.items():
                    summary_writer.add_scalar(reward_type,
                                              np.mean(reward_vals), t)

                if not cfg.obs_with_step_num and cfg.stack_obs_num == 1:
                    for elem in [
                            'success_count_per_action_type',
                            'failure_count_per_action_type'
                    ]:
                        summary_writer.add_scalars(elem,
                                                   agent_humanity_info[elem],
                                                   t)

                summary_writer.add_scalars('action_types_count',
                                           action_types_cntr, t)

                log_idx += 1
                logger.info('statistics: {}'.format(agent.get_statistics()))
                # k_probs = 18
                # k_highest_act_probs = actions_distrib.k_highest_probablities(k_probs)
                # avg_k_highest_act_probs = np.mean(k_highest_act_probs, axis=0)
                # logger.info('actions_distribution ({} highest probs):\n{}'.format(
                #     k_probs, k_highest_act_probs))
                # summary_writer.add_scalar('avg_highest_act_prob', avg_k_highest_act_probs[0], t)
                # summary_writer.add_scalar('avg_second_highest_act_prob', avg_k_highest_act_probs[1], t)
                summary_writer.add_scalar('avg_entropy',
                                          np.mean(entropy_values), t)

            if evaluator:
                if evaluator.evaluate_if_necessary(
                        t=t, episodes=np.sum(episode_idx)):
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break

    except (Exception, KeyboardInterrupt):
        # save the actions counter before killed
        # Store data (serialize)
        if arch is ArchName.FF_PARAM_SOFTMAX or arch is ArchName.FF_SOFTMAX:
            actions_cntr_path = os.path.join(outdir, 'actions_cntr.pickle')
            with open(actions_cntr_path, 'wb') as handle:
                pickle.dump(actions_cntr,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        env.close()
        if evaluator:
            evaluator.env.close()

        # save the current difficult human observations before killed
        # Store data (serialize)
        if not cfg.obs_with_step_num and cfg.stack_obs_num == 1:
            failure_obs_path = os.path.join(outdir, 'hard_obs.pickle')
            with open(failure_obs_path, 'wb') as handle:
                pickle.dump(agent_humanity_info['failure_obs'],
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

        raise
    else:
        # save the actions counter
        # Store data (serialize)
        if arch is ArchName.FF_PARAM_SOFTMAX or arch is ArchName.FF_SOFTMAX:
            actions_cntr_path = os.path.join(outdir, 'actions_cntr.pickle')
            with open(actions_cntr_path, 'wb') as handle:
                pickle.dump(actions_cntr,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

        # Save the final model
        save_agent(agent, t, outdir, logger, suffix='_finish')

        # save the final difficult human observations
        # Store data (serialize)
        if not cfg.obs_with_step_num and cfg.stack_obs_num == 1:
            failure_obs_path = os.path.join(outdir, 'hard_obs.pickle')
            with open(failure_obs_path, 'wb') as handle:
                pickle.dump(agent_humanity_info['failure_obs'],
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
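The snippet assumes a module-level summary_writer. The original may construct it with tensorboardX, but torch.utils.tensorboard exposes the same add_scalar/add_scalars calls used above; a minimal setup sketch with a hypothetical log directory:

from torch.utils.tensorboard import SummaryWriter

# Module-level writer assumed by the logging code above.
summary_writer = SummaryWriter(log_dir='results/tensorboard')

# Calls mirroring the ones in the training loop:
summary_writer.add_scalar('episode_reward', 1.5, 100)
summary_writer.add_scalars('action_types_count',
                           {'back': 0, 'filter': 3, 'group': 1}, 100)
summary_writer.close()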
Example #8
def train_agent(agent,
                env,
                steps,
                outdir,
                max_episode_len=None,
                step_offset=0,
                evaluator=None,
                successful_score=None,
                step_hooks=[],
                logger=None):
    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    # list whose i-th entry is the numeric action type taken at step i of the
    # current episode; 0, 1 and 2 stand for back, filter and group, respectively
    episode_action_type_hist = []
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:

            # a_t
            action = agent.act_and_train(obs, r)
            if episode_idx % SUMMARY_EPISODE_SLOT == 0:
                # env.env.env exists only if args.monitor is set to True
                action = env.env_prop.compressed2full_range(action)
                action_disc = env.cont2dis(action)
                episode_action_type_hist.append(action_disc[0])
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            if done or episode_len == max_episode_len or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                # logger.info('outdir:%s step:%s episode:%s R:%s',
                #            outdir, t, episode_idx, episode_r)
                # logger.info('statistics:%s', agent.get_statistics())
                log_results(logger, outdir, t, episode_idx, episode_r,
                            agent.get_statistics(), episode_action_type_hist)

                if episode_idx % SUMMARY_EPISODE_SLOT == 0:
                    summary_writer.add_scalar('episode_reward', episode_r,
                                              episode_idx)
                    summary_writer.add_histogram(
                        'operators_hist', np.array(episode_action_type_hist),
                        episode_idx)
                    episode_action_type_hist = []

                if evaluator is not None:
                    evaluator.evaluate_if_necessary(t=t,
                                                    episodes=episode_idx + 1)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
Example #9
def train_agent(agent,
                env,
                steps,
                outdir,
                checkpoint_freq=None,
                max_episode_len=None,
                step_offset=0,
                evaluator=None,
                successful_score=None,
                step_hooks=(),
                logger=None):

    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    start_time = time.time()
    last_chunk_end_time = start_time
    chunk = 1000

    episode_len = 0
    try:
        while t < steps:
            # print(t, t % 1000 == 0)
            if t % chunk == 0:
                now = time.time()
                chunk_elapsed = now - last_chunk_end_time
                last_chunk_end_time = now
                logger.critical(
                    "STEPS: {} / {}, {:0.0f}s elapsed in total, {:0.0f}s for this chunk of {}"
                    .format(t, steps, now - start_time, chunk_elapsed, chunk))

            # a_t
            action = agent.act_and_train(obs, r)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            reset = (episode_len == max_episode_len
                     or info.get('needs_reset', False))
            if done or reset or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                logger.info('outdir:%s step:%s episode:%s R:%s', outdir, t,
                            episode_idx, episode_r)
                logger.info('statistics:%s', agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(t=t,
                                                    episodes=episode_idx + 1)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
            if checkpoint_freq and t % checkpoint_freq == 0:
                save_agent(agent, t, outdir, logger, suffix='_checkpoint')

    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
                    hook(env, jerry, t)

                """ check, if an agent captured the flag """
                env.check_inventory(time_step)

                print("--------------------------------------------------------------------")

                """ end mission when both agents finished or time is over, start over again """
                if env.mission_end or done1 or done2 or (time_step > 1920):  # 960 = 16 min | 1920 = 32 min

                    """ send mission QuitCommands to tell Malmo that the mission has ended,save and reset everything """
                    t, obs1, obs2, r1, r2, done1, done2, overall_reward_agent_Jerry, overall_reward_agent_Tom = \
                        env.sending_mission_quit_commands(overall_reward_agent_Tom, overall_reward_agent_Jerry,
                                                          time_step, obs1, r1, obs2, r2, outdir, t, tom, jerry,
                                                          experiment_ID)

                    time_stamp_start = time.time()

                    """ recover """
                    time.sleep(5)

                if t == 1001:
                    print("Mission-Set finished. Congratulations! Check results and parameters. Start over.")
                    break

        except (Exception, KeyboardInterrupt):
            # Save the current model before being killed
            save_agent(tom, t, outdir, logger, suffix='_except01')
            save_agent(jerry, t, outdir, logger, suffix='_except02')
            raise