Example #1
 def __init__(self, random_seed, frame_skip, repeat_action_probability,
              sound, display_screen, alpha, gamma, epsilon,
              unexplored_threshold, unexplored_reward, exploration,
              distance_metric):
     state_repr = 'verbose'
     self.world = QbertWorld(random_seed, frame_skip,
                             repeat_action_probability, sound,
                             display_screen)
     self.learner = QLearner(self.world, alpha, gamma, epsilon,
                             unexplored_threshold, unexplored_reward,
                             exploration, distance_metric, state_repr)
Example #2
 def __init__(self, random_seed, frame_skip, repeat_action_probability,
              sound, display_screen, alpha, gamma, epsilon,
              unexplored_threshold, unexplored_reward, exploration,
              distance_metric, state_representation):
     if state_representation == 'simple':
         state_repr = 'along_direction'
     else:
         state_repr = 'verbose'
     self.world = QbertWorld(random_seed,
                             frame_skip,
                             repeat_action_probability,
                             sound,
                             display_screen,
                             block_state_repr=state_repr)
     self.block_learner = QLearner(self.world, alpha, gamma, epsilon,
                                   unexplored_threshold, unexplored_reward,
                                   exploration, distance_metric, state_repr)
Example #3
class QbertCombinedVerboseAgent(Agent):
    """
    Qbert agent which uses a verbose state representation for enemies, blocks and friendlies.
    """
    def __init__(self, random_seed, frame_skip, repeat_action_probability,
                 sound, display_screen, alpha, gamma, epsilon,
                 unexplored_threshold, unexplored_reward, exploration,
                 distance_metric):
        state_repr = 'verbose'
        self.world = QbertWorld(random_seed, frame_skip,
                                repeat_action_probability, sound,
                                display_screen)
        self.learner = QLearner(self.world, alpha, gamma, epsilon,
                                unexplored_threshold, unexplored_reward,
                                exploration, distance_metric, state_repr)

    def action(self):
        s = self.world.to_state_combined_verbose()
        a = self.learner.get_best_single_action(s)
        block_score, friendly_score, enemy_score, enemy_penalty = self.world.perform_action(
            a)
        s_next = self.world.to_state_combined_verbose()
        self.learner.update(
            s, a, s_next,
            block_score + friendly_score + enemy_score + enemy_penalty)
        return block_score + friendly_score + enemy_score

    def q_size(self):
        return len(self.learner.Q)

    def save(self, filename):
        self.learner.save(filename)

    def load(self, filename):
        self.learner.load(filename)
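A minimal usage sketch for the class above. The constructor values, the choice of exploration and distance_metric, and the fixed step budget are illustrative placeholders (valid settings are not shown in this snippet), and episode handling is omitted.

agent = QbertCombinedVerboseAgent(
    random_seed=123, frame_skip=4, repeat_action_probability=0.0,
    sound=False, display_screen=False, alpha=0.1, gamma=0.95, epsilon=0.1,
    unexplored_threshold=1, unexplored_reward=50, exploration='optimistic',
    distance_metric=None)

total_score = 0
for _ in range(1000):  # fixed step budget, purely illustrative
    total_score += agent.action()

print('Score:', total_score, 'Q-table size:', agent.q_size())
agent.save('combined_verbose_q')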
Example #4
 def __init__(self, random_seed, frame_skip, repeat_action_probability,
              sound, display_screen, alpha, gamma, epsilon,
              unexplored_threshold, unexplored_reward, exploration,
              distance_metric, combined_reward, state_representation):
     if state_representation == 'simple':
         block_state_repr = 'adjacent'
         enemy_state_repr = 'adjacent_dangerous'
         friendly_state_repr = 'simple'
     else:
         block_state_repr = 'verbose'
         enemy_state_repr = 'verbose'
         friendly_state_repr = 'verbose'
     self.world = QbertWorld(random_seed,
                             frame_skip,
                             repeat_action_probability,
                             sound,
                             display_screen,
                             block_state_repr=block_state_repr,
                             enemy_state_repr=enemy_state_repr,
                             friendly_state_repr=friendly_state_repr)
     self.block_learner = QLearner(self.world,
                                   alpha,
                                   gamma,
                                   epsilon,
                                   unexplored_threshold,
                                   unexplored_reward,
                                   exploration,
                                   distance_metric,
                                   state_repr=block_state_repr,
                                   tag='blocks')
     self.friendly_learner = QLearner(self.world,
                                      alpha,
                                      gamma,
                                      epsilon,
                                      unexplored_threshold,
                                      unexplored_reward,
                                      exploration,
                                      distance_metric,
                                      state_repr=friendly_state_repr,
                                      tag='friendlies')
     enemy_epsilon = 0
     self.enemy_learner = QLearner(self.world,
                                   alpha,
                                   gamma,
                                   enemy_epsilon,
                                   unexplored_threshold,
                                   unexplored_reward,
                                   exploration,
                                   distance_metric,
                                   state_repr=enemy_state_repr,
                                   tag='enemies')
     self.combined_reward = combined_reward
Example #5
class QbertFriendlyAgent(Agent):
    """
    Qbert agent which learns to capture friendlies (green agents).
    """
    def __init__(self, random_seed, frame_skip, repeat_action_probability,
                 sound, display_screen, alpha, gamma, epsilon,
                 unexplored_threshold, unexplored_reward, exploration,
                 distance_metric, state_representation):
        if state_representation == 'simple':
            state_repr = 'simple'
        else:
            state_repr = 'verbose'
        self.world = QbertWorld(random_seed,
                                frame_skip,
                                repeat_action_probability,
                                sound,
                                display_screen,
                                friendly_state_repr=state_repr)
        self.friendly_learner = QLearner(self.world, alpha, gamma, epsilon,
                                         unexplored_threshold,
                                         unexplored_reward, exploration,
                                         distance_metric, state_repr)

    def action(self):
        s = self.world.to_state_friendlies()
        a = self.friendly_learner.get_best_single_action(s)
        block_score, friendly_score, enemy_score, enemy_penalty = self.world.perform_action(
            a)
        s_next = self.world.to_state_friendlies()
        self.friendly_learner.update(s, a, s_next, friendly_score)
        return block_score + friendly_score + enemy_score

    def q_size(self):
        return len(self.friendly_learner.Q)

    def save(self, filename):
        self.friendly_learner.save(filename)

    def load(self, filename):
        self.friendly_learner.load(filename)
Example #6
def main():

    args = get_args()

    if torch.cuda.is_available():
        device = torch.device('cuda', index=args.device_id)
        torch.cuda.set_device(args.device_id)
    else:
        device = torch.device('cpu')

    if args.log_path:
        output_path = OutputPath(args.log_path)
    else:
        output_path = OutputPath()


    # monitor = Monitor(output_path.path)

    tbw = SummaryWriter(output_path.path)

    # Create an atari env.
    from atari_utils import make_atari_deepmind
    env = make_atari_deepmind(args.gym_env, valid=False)
    env_val = make_atari_deepmind(args.gym_env, valid=True)
    print('Observation:', env.observation_space)
    print('Action:', env.action_space)

    # 10000 * 4 frames
    val_replay_memory = ReplayMemory(env.observation_space.shape,
                                     env.action_space.shape,
                                     max_memory=args.num_frames)
    replay_memory = ReplayMemory(env.observation_space.shape,
                                 env.action_space.shape,
                                 max_memory=40000)

    learner = QLearner(env.action_space.n,
                       device,
                       sync_freq=1000,
                       save_freq=250000,
                       gamma=0.99,
                       learning_rate=1e-4,
                       save_path=output_path)

    explorer = LinearDecayEGreedyExplorer(env.action_space.n,
                                          device,
                                          network=learner.get_network(),
                                          eps_start=1.0,
                                          eps_end=0.01,
                                          eps_steps=1e6)

    sampler = Sampler(args.num_frames)
    obs_sampler = ObsSampler(args.num_frames)

    validator = Validator(env_val,
                          val_replay_memory,
                          explorer,
                          obs_sampler,
                          num_episodes=args.num_val_episodes,
                          num_eval_steps=args.num_eval_steps,
                          render=args.render_val,
                          tbw=tbw)

    trainer_with_validator = Trainer(env,
                                     replay_memory,
                                     learner,
                                     sampler,
                                     explorer,
                                     obs_sampler,
                                     inter_eval_steps=args.inter_eval_steps,
                                     num_episodes=args.num_episodes,
                                     train_start=10000,
                                     batch_size=32,
                                     render=args.render_train,
                                     validator=validator,
                                     tbw=tbw)

    for e in range(args.num_epochs):
        trainer_with_validator.step()
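For reference, a small sketch of the linear decay rule described by the eps_start, eps_end and eps_steps arguments above; this illustrates the schedule only and is not LinearDecayEGreedyExplorer's actual implementation.

def linear_epsilon(step, eps_start=1.0, eps_end=0.01, eps_steps=1e6):
    # Interpolate linearly from eps_start to eps_end over eps_steps steps,
    # then hold at eps_end.
    frac = min(step / eps_steps, 1.0)
    return eps_start + frac * (eps_end - eps_start)

print(linear_epsilon(0))          # 1.0
print(linear_epsilon(500_000))    # ~0.505
print(linear_epsilon(2_000_000))  # 0.01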
Example #7
        '--number-of-evaluation-intervals',
        default=10,
        type=int,
        help='The number of evaluation intervals (Default is 10).')
    parser.add_argument('--alpha',
                        default=0.01,
                        type=float,
                        help='The learning rate (Default is 0.01).')
    parser.add_argument('--gamma',
                        default=0.5,
                        type=float,
                        help='The discount factor (Default is 0.5).')
    parser.add_argument(
        '--epsilon',
        default=0.1,
        type=float,
        help='The chance of performing a random action (Default is 0.1).')

    return parser.parse_args()


if __name__ == '__main__':
    args = _arguments_definition()

    learner = QLearner()
    _, episodes_rewards = learner.learn(args.number_of_episodes, args.alpha,
                                        args.gamma, args.epsilon)

    Evaluator.evaluate(episodes_rewards, args.number_of_evaluation_intervals,
                       '')
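The --alpha, --gamma and --epsilon options parameterize a standard tabular Q-learning update with an epsilon-greedy policy. A minimal sketch of that update is below; storing Q as a dict keyed by (state, action) is an assumption for illustration, not this project's QLearner internals.

import random
from collections import defaultdict

Q = defaultdict(float)  # Q[(state, action)] -> estimated return

def epsilon_greedy(state, actions, epsilon=0.1):
    # With probability epsilon explore, otherwise act greedily w.r.t. Q.
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: Q[(state, a)])

def q_update(s, a, r, s_next, actions, alpha=0.01, gamma=0.5):
    # One-step Q-learning backup towards r + gamma * max_a' Q(s', a').
    best_next = max(Q[(s_next, a2)] for a2 in actions)
    Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])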
Example #8
def main():

    args = get_args()

    nn.set_default_context(
        get_extension_context(args.extension, device_id=args.device_id))

    if args.log_path:
        output_path = OutputPath(args.log_path)
    else:
        output_path = OutputPath()
    monitor = Monitor(output_path.path)

    tbw = SummaryWriter(output_path.path)

    # Create an atari env.
    from atari_utils import make_atari_deepmind
    env = make_atari_deepmind(args.gym_env, valid=False)
    env_val = make_atari_deepmind(args.gym_env, valid=True)
    print('Observation:', env.observation_space)
    print('Action:', env.action_space)

    # 10000 * 4 frames
    val_replay_memory = ReplayMemory(env.observation_space.shape,
                                     env.action_space.shape,
                                     max_memory=args.num_frames)
    replay_memory = ReplayMemory(env.observation_space.shape,
                                 env.action_space.shape,
                                 max_memory=40000)

    learner = QLearner(q_cnn,
                       env.action_space.n,
                       sync_freq=1000,
                       save_freq=250000,
                       gamma=0.99,
                       learning_rate=1e-4,
                       name_q='q',
                       save_path=output_path)

    explorer = LinearDecayEGreedyExplorer(env.action_space.n,
                                          eps_start=1.0,
                                          eps_end=0.01,
                                          eps_steps=1e6,
                                          q_builder=q_cnn,
                                          name='q')

    sampler = Sampler(args.num_frames)
    obs_sampler = ObsSampler(args.num_frames)

    validator = Validator(env_val,
                          val_replay_memory,
                          explorer,
                          obs_sampler,
                          num_episodes=args.num_val_episodes,
                          num_eval_steps=args.num_eval_steps,
                          render=args.render_val,
                          monitor=monitor,
                          tbw=tbw)

    trainer_with_validator = Trainer(env,
                                     replay_memory,
                                     learner,
                                     sampler,
                                     explorer,
                                     obs_sampler,
                                     inter_eval_steps=args.inter_eval_steps,
                                     num_episodes=args.num_episodes,
                                     train_start=10000,
                                     batch_size=32,
                                     render=args.render_train,
                                     validator=validator,
                                     monitor=monitor,
                                     tbw=tbw)

    for e in range(args.num_epochs):
        trainer_with_validator.step()
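The replay memories above keep at most 40000 (training) and args.num_frames (validation) transitions and are sampled in batches of 32. Below is a minimal ring-buffer sketch of that idea; the field layout and method names are assumptions, not the ReplayMemory API used here.

import random
from collections import deque

class SimpleReplayMemory:
    """Fixed-capacity transition buffer with uniform random sampling."""

    def __init__(self, max_memory=40000):
        self.buffer = deque(maxlen=max_memory)  # oldest entries drop off first

    def append(self, obs, action, reward, next_obs, done):
        self.buffer.append((obs, action, reward, next_obs, done))

    def sample(self, batch_size=32):
        return random.sample(self.buffer, batch_size)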
Example #9
class QbertSubsumptionAgent(Agent):
    """
    Qbert agent which uses a subsumption model, with separate learners for exploring blocks, avoiding enemies and
    capturing friendlies.
    """
    def __init__(self, random_seed, frame_skip, repeat_action_probability,
                 sound, display_screen, alpha, gamma, epsilon,
                 unexplored_threshold, unexplored_reward, exploration,
                 distance_metric, combined_reward, state_representation):
        if state_representation == 'simple':
            block_state_repr = 'adjacent'
            enemy_state_repr = 'adjacent_dangerous'
            friendly_state_repr = 'simple'
        else:
            block_state_repr = 'verbose'
            enemy_state_repr = 'verbose'
            friendly_state_repr = 'verbose'
        self.world = QbertWorld(random_seed,
                                frame_skip,
                                repeat_action_probability,
                                sound,
                                display_screen,
                                block_state_repr=block_state_repr,
                                enemy_state_repr=enemy_state_repr,
                                friendly_state_repr=friendly_state_repr)
        self.block_learner = QLearner(self.world,
                                      alpha,
                                      gamma,
                                      epsilon,
                                      unexplored_threshold,
                                      unexplored_reward,
                                      exploration,
                                      distance_metric,
                                      state_repr=block_state_repr,
                                      tag='blocks')
        self.friendly_learner = QLearner(self.world,
                                         alpha,
                                         gamma,
                                         epsilon,
                                         unexplored_threshold,
                                         unexplored_reward,
                                         exploration,
                                         distance_metric,
                                         state_repr=friendly_state_repr,
                                         tag='friendlies')
        enemy_epsilon = 0
        self.enemy_learner = QLearner(self.world,
                                      alpha,
                                      gamma,
                                      enemy_epsilon,
                                      unexplored_threshold,
                                      unexplored_reward,
                                      exploration,
                                      distance_metric,
                                      state_repr=enemy_state_repr,
                                      tag='enemies')
        self.combined_reward = combined_reward

    def action(self):
        enemy_present = self.world.is_enemy_nearby()
        friendly_present = self.world.is_friendly_nearby()
        a_enemies = None
        a_friendlies = None
        s_enemies = None
        s_friendlies = None
        if enemy_present:
            logging.debug('Enemy present!')
            s_enemies = self.world.to_state_enemies()
            a_enemies = self.enemy_learner.get_best_actions(s_enemies)
        if friendly_present:
            logging.debug('Friendly present!')
            s_friendlies = self.world.to_state_friendlies()
            a_friendlies = self.friendly_learner.get_best_actions(s_friendlies)
        s = self.world.to_state_blocks()
        a = self.block_learner.get_best_actions(s)
        if enemy_present and len(a_enemies) > 0:
            logging.debug('Chose enemy action!')
            if len(a_enemies) > 1:
                logging.debug('Broke tie!')
                chosen_action = self.block_learner.get_best_action(
                    s, a_enemies)
            else:
                chosen_action = a_enemies[0]
        elif friendly_present and len(a_friendlies) > 0:
            logging.debug('Chose friendly action!')
            chosen_action = a_friendlies[0]
        else:
            logging.debug('Chose block action!')
            chosen_action = random.choice(a)
        block_score, friendly_score, enemy_score, enemy_penalty = self.world.perform_action(
            chosen_action)
        if enemy_present:
            s_next_enemies = self.world.to_state_enemies()
            self.enemy_learner.update(s_enemies, chosen_action, s_next_enemies,
                                      enemy_score + enemy_penalty)
        if friendly_present:
            s_next_friendlies = self.world.to_state_friendlies()
            self.friendly_learner.update(s_friendlies, chosen_action,
                                         s_next_friendlies, friendly_score)
        s_next = self.world.to_state_blocks()
        # With combined_reward enabled, credit the block learner with all
        # reward components; otherwise use the block score alone.
        combined_score = (block_score + friendly_score + enemy_score +
                          enemy_penalty) if self.combined_reward else block_score
        self.block_learner.update(s, chosen_action, s_next, combined_score)
        return block_score + friendly_score + enemy_score

    def q_size(self):
        return len(self.block_learner.Q) + \
               len(self.friendly_learner.Q) + \
               len(self.enemy_learner.Q)

    def save(self, filename):
        self.block_learner.save('{}_{}'.format(filename, 'block'))
        self.friendly_learner.save('{}_{}'.format(filename, 'friendly'))
        self.enemy_learner.save('{}_{}'.format(filename, 'enemy'))

    def load(self, filename):
        self.block_learner.load('{}_{}'.format(filename, 'block'))
        self.friendly_learner.load('{}_{}'.format(filename, 'friendly'))
        self.enemy_learner.load('{}_{}'.format(filename, 'enemy'))
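An illustrative save/load round trip for the class above: one Q-table per learner, written with the _block, _friendly and _enemy suffixes. The constructor values are placeholders only; valid exploration and distance_metric settings are not shown in this snippet.

agent = QbertSubsumptionAgent(
    random_seed=123, frame_skip=4, repeat_action_probability=0.0,
    sound=False, display_screen=False, alpha=0.1, gamma=0.95, epsilon=0.1,
    unexplored_threshold=1, unexplored_reward=50, exploration='optimistic',
    distance_metric=None, combined_reward=True,
    state_representation='simple')

agent.save('subsumption_run')  # writes subsumption_run_block / _friendly / _enemy
agent.load('subsumption_run')
print('Total Q entries across learners:', agent.q_size())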