Example #1
def run_episode(episode_prep, args, logger):

    method_id = episode_prep['method_id']
    params_episode = episode_prep['params']
    params_method = episode_prep['params_method']
    action_dict = episode_prep['action_dict']

    if params_method['method'] == 'Double_Q_Learning':
        (Q1, Q2) = episode_prep['Q']
    else:
        Q = episode_prep['Q']

    evo_episode = {
        'n_episode_steps': 0,
        'done': False,
        'action': [],
        'action_taken': [],
        'reward': [],
        'happiness': [],
        'food': [],
        'inv_fat': [],
        'affection': []
    }

    # Start episode and get initial observation
    state = env_reset()
    evo_episode['happiness'].append(get_happiness(state))
    evo_episode['food'].append(state['food'])
    evo_episode['inv_fat'].append(1 - state['fat'])
    evo_episode['affection'].append(state['affection'])

    # pbar = tqdm(total=params['nmax_steps'])

    while (not evo_episode['done']) and (evo_episode['n_episode_steps'] <
                                         params_episode['nmax_steps']):

        # Get an action
        if params_method['method'] == 'Double_Q_Learning':
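            # Double Q-Learning: evaluate the greedy action on the sum of both estimators.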
            action = select_best_action(Q_state=Q1[state['state_id']] +
                                        Q2[state['state_id']])
        else:
            action = select_best_action(Q_state=Q[state['state_id']])
        evo_episode['action'].append(action)

        # Perform a step
        state, reward, evo_episode['done'], info = env_step(state, action)
        evo_episode['reward'].append(reward)
        evo_episode['happiness'].append(get_happiness(state))
        if info['action_taken_while_not_possible']:
            evo_episode['action_taken'].append(0)
        else:
            evo_episode['action_taken'].append(action)
        evo_episode['food'].append(state['food'])
        evo_episode['inv_fat'].append(1 - state['fat'])
        evo_episode['affection'].append(state['affection'])

        # Update n_steps
        evo_episode['n_episode_steps'] += 1

        # pbar.update(1)

    # pbar.close()

    evo_episode['avg_reward'] = sum(
        evo_episode['reward']) / evo_episode['n_episode_steps']
    evo_episode['n_steps'] = evo_episode['n_episode_steps']
    evo_episode['avg_happiness'] = sum(
        evo_episode['happiness']) / evo_episode['n_episode_steps']

    info_params = get_info_params({
        'method': params_method['method'],
        'method_id': method_id,
        'Avg Reward': round(evo_episode['avg_reward'], 4),
        'N-Steps': '{}/{}'.format(evo_episode['n_steps'], params_episode['nmax_steps']),
        'Avg Happiness': round(evo_episode['avg_happiness'], 4)
    })

    name_episode = '{}__{}'.format(method_id, args['episode'])
    if args['plot_episode']:
        plot_episode_happiness(evo_episode, action_dict, name_episode,
                               info_params)
    if args['save_episode']:
        save_models({'evo_episode': evo_episode},
                    name_episode,
                    final=(args['episode'] == 'final'))

    logger.debug(evo_episode.keys())

    return {
        'avg_reward': evo_episode['avg_reward'],
        'sum_reward': sum(evo_episode['reward']),
        'n_steps': evo_episode['n_steps'],
        'avg_happiness': evo_episode['avg_happiness'],
        'sum_happiness': sum(evo_episode['happiness']),
        'n_actions': len([a for a in evo_episode['action_taken'] if a > 0]),
        'cause_of_death': info['cause_of_death'],
        'avg_food': sum(evo_episode['food']) / evo_episode['n_episode_steps'],
        'sum_food': sum(evo_episode['food']),
        'avg_inv_fat': sum(evo_episode['inv_fat']) / evo_episode['n_episode_steps'],
        'sum_inv_fat': sum(evo_episode['inv_fat']),
        'avg_affection': sum(evo_episode['affection']) / evo_episode['n_episode_steps'],
        'sum_affection': sum(evo_episode['affection'])
    }
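
run_episode above evaluates the learned greedy policy: for Double_Q_Learning the action is chosen on the combined estimate Q1 + Q2, otherwise on Q alone. select_best_action itself is not part of this excerpt; below is a minimal sketch of what such a helper could look like, assuming Q_state is a 1-D array of per-action values (the NumPy dependency and the random tie-breaking are assumptions, not the project's actual implementation).

import numpy as np

def select_best_action(Q_state):
    """Greedy action: index of the largest entry in a 1-D array of action values.

    Q_state is assumed to be Q[state_id] or, for Double Q-Learning,
    Q1[state_id] + Q2[state_id], as passed in run_episode above.
    """
    Q_state = np.asarray(Q_state, dtype=float)
    # Break ties at random instead of always returning the lowest index.
    best_actions = np.flatnonzero(Q_state == Q_state.max())
    return int(np.random.choice(best_actions))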
Example #2
def main(args, logger):

    method_id = args.method_id
    logger.info(method_id)

    logger.debug(args)

    # Parametrisation
    with open('src/models/value_based/monte_carlo/config/{}.json'.format(method_id)) as config_file:
        params = json.load(config_file)
    with open('src/models/run_one_episode.json') as episode_file:
        params_episode = json.load(episode_file)

    logger.debug(params)

    info_params_dict = define_info_params_dict(params, method_id)
    info_params = get_info_params(info_params_dict)

    # Initializing environment
    action_dict, n_actions = get_env_actions()
    _, n_states = get_env_space()

    # Initializing the Q-matrix
    Q = init_Q(n_actions, params)
    Q_saved = Q.copy()

    # Initializing steps_per_state (count number of times we have been to each state)
    steps_per_state = init_steps_per_state()

    # Initializing the N-matrix
    N = init_N(n_actions)

    # Visualisation
    if args.update_episode_division == 0:
        n_episodes_save = 1e10
    else:
        n_episodes_save = int(np.ceil(params['n_episodes'] / 100 * args.update_episode_division))
    logger.debug('n_episodes_save :: {}'.format(n_episodes_save))

    if args.run_episode == 0:
        n_episodes_run = 1e10
    else:
        n_episodes_run = int(np.ceil(params['n_episodes'] / 100 * args.run_episode))
    logger.debug('n_episodes_run :: {}'.format(n_episodes_run))

    evolution_real_episode = initialize_real_episode()

    # Initializing the training
    evo_training = initialize_evo_training()

    # Training

    # Starting the learning
    pbar = tqdm(total=params['n_episodes'])

    while (not evo_training['convergence']) and (evo_training['episode'] < params['n_episodes']):

        # Get episode
        evo_episode = initialize_evo_episode()

        state1 = env_reset(params['start_at_random'])
        evo_episode['episode_step_happiness'].append(get_happiness(state1))

        # Update parameters
        epsilon = get_epsilon(
            params_epsilon=params['epsilon'], episode=evo_training['episode'],
            steps_state=steps_per_state[state1['state_id']])
        evo_episode['evo_epsilon'].append(epsilon)

        alpha = get_alpha(
            params_alpha=params['alpha'], episode=evo_training['episode'],
            steps_state=steps_per_state[state1['state_id']])
        evo_episode['evo_alpha'].append(alpha)

        action1 = epsilon_greedy(Q, state1['state_id'], n_actions, epsilon)
        steps_per_state = update_steps_per_state(steps_per_state, state1['state_id'])

        while (not evo_episode['done']) and (evo_episode['n_episode_steps'] < params['nmax_steps']):

            # Getting the next state
            state2, reward1, evo_episode['done'], info = env_step(state1, action1)

            # Update parameters
            epsilon = get_epsilon(
                params_epsilon=params['epsilon'], episode=evo_training['episode'],
                steps_state=steps_per_state[state2['state_id']])

            # Choosing the next action
            action2 = epsilon_greedy(Q, state2['state_id'], n_actions, epsilon)
            steps_per_state = update_steps_per_state(steps_per_state, state2['state_id'])

            evo_episode = update_evo_episode(evo_episode, reward1, state2, epsilon, alpha)
            evo_episode['steps_episode'].append({
                'state': state1,
                'action': action1,
                'reward': reward1})

            # Updating the respective values
            state1 = state2
            action1 = action2
            evo_episode['n_episode_steps'] += 1

        # Add discounted reward
        evo_episode['steps_episode'] = add_discounted_reward(evo_episode['steps_episode'], params['gamma'])

        # Update N and Q
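        # states_already_visited collects the state ids already processed in this episode;
        # it is passed to update_N_MC / update_Q_MC together with params['method_MC']
        # so the update can distinguish first visits from repeat visits.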
        states_already_visited = []
        for step_episode in evo_episode['steps_episode']:
            N = update_N_MC(N, step_episode, params['method_MC'], states_already_visited)
            Q = update_Q_MC(Q, N, step_episode, params['method_MC'], states_already_visited)
            states_already_visited.append(step_episode['state']['state_id'])

        # At the end of learning process
        if args.render_episode:
            logger.debug(env_render_episode(evo_training['episode'], evo_episode, epsilon, alpha))

        evo_training = update_evo_training(evo_training, evo_episode)

        # Run a real episode
        info_episode = run_episode(
            episode_prep={
                'method_id': method_id,
                'params': params_episode,
                'params_method': params,
                'action_dict': action_dict,
                'Q': Q
            }, args={
                'episode': evo_training['episode'],
                'plot_episode': False,
                'save_episode': False
            }, logger=logger)

        evolution_real_episode = update_real_episode(evolution_real_episode, info_episode)

        # if (evo_training['episode'] + 1) % n_episodes_run == 0:
        # save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=False)
        #     os.system("python -m src.models.run_one_episode value_based/monte_carlo/config/{} {}".format(
        #         method_id, evo_training['episode']))

        if (evo_training['episode'] + 1) % n_episodes_save == 0:

            save_models({'evo_training': evo_training}, method_id, final=False)

            # info_params_dict['n_episodes'] = '{}/{}'.format(evo_training['episode'] + 1, params['n_episodes'])
            # info_params = get_info_params(info_params_dict)

            # evo_training['checking'], Q_saved = launch_checking(
            #    evo_training['checking'], Q_saved, Q, method_id, info_params, final=False)
            # save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=False)

            # plot_evolution_reward(evo_training, method_id, info_params, final=False)
            # plot_evolution_steps(evo_training, method_id, params['nmax_steps'], info_params, final=False)
            # plot_evolution_happiness(evo_training, method_id, info_params, final=False)

            # evo_training['convergence'] = define_training_convergence(evo_training['checking']['evo_KLdiv'][-1], params)

        pbar.update(1)

    pbar.close()

    save_models({'evolution_real_episode': evolution_real_episode}, method_id, final=True)

    info_params_dict['n_episodes'] = '{}/{}'.format(evo_training['episode'] + 1, params['n_episodes'])
    info_params = get_info_params(info_params_dict)

    logger.info('checking')
    _, _ = launch_checking(
        evo_training['checking'], Q_saved, Q, method_id, info_params, final=True)
    save_models({'Q': Q, 'evo_training': evo_training}, method_id, final=True)

    logger.info('reward')
    plot_evolution_reward(evo_training, method_id, info_params, final=True)
    logger.info('steps')
    plot_evolution_steps(evo_training, method_id, params['nmax_steps'], info_params, final=True)
    logger.info('happiness')
    plot_evolution_happiness(evo_training, method_id, info_params, final=True)

    logger.info('episode')
    os.system("python -m src.models.run_one_episode value_based/monte_carlo/config/{} {} --save_episode --plot_episode".format(
        method_id, 'final'))
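
After every training episode the loop turns the recorded steps into Monte Carlo targets: add_discounted_reward attaches the discounted return to each step, and update_N_MC / update_Q_MC maintain visit counts and an incremental mean of those returns. Those helpers are not part of this excerpt; here is a minimal first-visit sketch under these assumptions (the function names, signatures and the Q[state_id][action] layout are illustrative, not the project's actual code).

def add_discounted_reward(steps_episode, gamma):
    """Backward pass attaching the discounted return G_t to every step."""
    G = 0.0
    for step in reversed(steps_episode):
        G = step['reward'] + gamma * G
        step['discounted_reward'] = G
    return steps_episode

def monte_carlo_update(Q, N, steps_episode, first_visit=True):
    """Incremental-mean Monte Carlo update of Q and N from one finished episode."""
    visited = set()
    for step in steps_episode:
        state_id = step['state']['state_id']
        action = step['action']
        if first_visit and state_id in visited:
            continue
        visited.add(state_id)
        N[state_id][action] += 1
        # Running average of the returns: Q <- Q + (G - Q) / N
        Q[state_id][action] += (step['discounted_reward'] - Q[state_id][action]) / N[state_id][action]
    return Q, N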
Example #3
from src.visualization.rl_plots_comparison import (
    plot_comparison_evolution_reward, plot_comparison_evolution_steps)

# python -m src.models.run_comparison

setup_logging(file_handler_name='run_comparison')

logger = logging.getLogger(__name__)

with open('src/models/run_comparison.json') as params_file:
    params = json.load(params_file)

info_params_dict = {
    "nmax_steps": params['nmax_steps'],
    "gamma": params['gamma']
}
info_params = get_info_params(info_params_dict)

evo_training__evo_avg_reward_per_step = {}
evo_training__evo_n_steps = {}
evo_training__evo_avg_happiness = {}

evo_episode__happiness = {}

for method_id in params['list_method_ids']:
    logger.info(method_id)
    with open("models/{}__evo_training.pkl".format(method_id),
              "rb") as input_file:
        evo_training = dill.load(input_file)

    evo_training__evo_avg_reward_per_step[method_id] = evo_training['evo_avg_reward_per_step']