    def __init__(self, random_seed, frame_skip, repeat_action_probability, sound, display_screen,
                 alpha, gamma, epsilon, unexplored_threshold, unexplored_reward, exploration,
                 distance_metric, state_representation):
        # Use the compact 'along_direction' block representation when a simple state is
        # requested; otherwise fall back to the full 'verbose' representation.
        if state_representation == 'simple':
            state_repr = 'along_direction'
        else:
            state_repr = 'verbose'
        self.world = QbertWorld(random_seed, frame_skip, repeat_action_probability, sound,
                                display_screen, block_state_repr=state_repr)
        self.block_learner = QLearner(self.world, alpha, gamma, epsilon, unexplored_threshold,
                                      unexplored_reward, exploration, distance_metric, state_repr)
class QbertCombinedVerboseAgent(Agent):
    """
    Qbert agent which uses a verbose state for enemies, blocks and friendlies.
    """

    def __init__(self, random_seed, frame_skip, repeat_action_probability, sound, display_screen,
                 alpha, gamma, epsilon, unexplored_threshold, unexplored_reward, exploration,
                 distance_metric):
        state_repr = 'verbose'
        self.world = QbertWorld(random_seed, frame_skip, repeat_action_probability, sound,
                                display_screen)
        self.learner = QLearner(self.world, alpha, gamma, epsilon, unexplored_threshold,
                                unexplored_reward, exploration, distance_metric, state_repr)

    def action(self):
        # One Q-learning step: observe, act, observe the next state, then update
        # on the combined reward signal.
        s = self.world.to_state_combined_verbose()
        a = self.learner.get_best_single_action(s)
        block_score, friendly_score, enemy_score, enemy_penalty = self.world.perform_action(a)
        s_next = self.world.to_state_combined_verbose()
        self.learner.update(s, a, s_next,
                            block_score + friendly_score + enemy_score + enemy_penalty)
        return block_score + friendly_score + enemy_score

    def q_size(self):
        return len(self.learner.Q)

    def save(self, filename):
        self.learner.save(filename)

    def load(self, filename):
        self.learner.load(filename)
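# A minimal usage sketch (not part of the module): it assumes an outer episode loop that
# repeatedly calls action() until the emulator reports game over. The constructor argument
# values below are placeholders, and agent.world.ale is an assumed ALEInterface handle on
# the QbertWorld wrapper used above.
if __name__ == '__main__':
    agent = QbertCombinedVerboseAgent(random_seed=123, frame_skip=4,
                                      repeat_action_probability=0.0, sound=False,
                                      display_screen=False, alpha=0.1, gamma=0.95,
                                      epsilon=0.1, unexplored_threshold=1,
                                      unexplored_reward=100, exploration='random',
                                      distance_metric=None)  # placeholder values
    for episode in range(10):
        total = 0
        while not agent.world.ale.game_over():  # assumed ALE handle on the world wrapper
            total += agent.action()
        print('Episode', episode, 'score:', total, 'Q-table size:', agent.q_size())
        agent.world.ale.reset_game()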
class QbertFriendlyAgent(Agent):
    """
    Qbert agent which learns to capture friendlies (green agents).
    """

    def __init__(self, random_seed, frame_skip, repeat_action_probability, sound, display_screen,
                 alpha, gamma, epsilon, unexplored_threshold, unexplored_reward, exploration,
                 distance_metric, state_representation):
        if state_representation == 'simple':
            state_repr = 'simple'
        else:
            state_repr = 'verbose'
        self.world = QbertWorld(random_seed, frame_skip, repeat_action_probability, sound,
                                display_screen, friendly_state_repr=state_repr)
        self.friendly_learner = QLearner(self.world, alpha, gamma, epsilon, unexplored_threshold,
                                         unexplored_reward, exploration, distance_metric,
                                         state_repr)

    def action(self):
        s = self.world.to_state_friendlies()
        a = self.friendly_learner.get_best_single_action(s)
        block_score, friendly_score, enemy_score, enemy_penalty = self.world.perform_action(a)
        s_next = self.world.to_state_friendlies()
        # Only the friendly component of the reward drives this learner.
        self.friendly_learner.update(s, a, s_next, friendly_score)
        return block_score + friendly_score + enemy_score

    def q_size(self):
        return len(self.friendly_learner.Q)

    def save(self, filename):
        self.friendly_learner.save(filename)

    def load(self, filename):
        self.friendly_learner.load(filename)
def main():
    args = get_args()

    device = torch.device('cuda', index=args.device_id) if torch.cuda.is_available() \
        else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.device_id)

    if args.log_path:
        output_path = OutputPath(args.log_path)
    else:
        output_path = OutputPath()
    # monitor = Monitor(output_path.path)
    tbw = SummaryWriter(output_path.path)

    # Create an atari env.
    from atari_utils import make_atari_deepmind
    env = make_atari_deepmind(args.gym_env, valid=False)
    env_val = make_atari_deepmind(args.gym_env, valid=True)
    print('Observation:', env.observation_space)
    print('Action:', env.action_space)

    # 10000 * 4 frames
    val_replay_memory = ReplayMemory(env.observation_space.shape, env.action_space.shape,
                                     max_memory=args.num_frames)
    replay_memory = ReplayMemory(env.observation_space.shape, env.action_space.shape,
                                 max_memory=40000)
    learner = QLearner(env.action_space.n, device, sync_freq=1000, save_freq=250000,
                       gamma=0.99, learning_rate=1e-4, save_path=output_path)
    explorer = LinearDecayEGreedyExplorer(env.action_space.n, device,
                                          network=learner.get_network(),
                                          eps_start=1.0, eps_end=0.01, eps_steps=1e6)
    sampler = Sampler(args.num_frames)
    obs_sampler = ObsSampler(args.num_frames)
    validator = Validator(env_val, val_replay_memory, explorer, obs_sampler,
                          num_episodes=args.num_val_episodes,
                          num_eval_steps=args.num_eval_steps,
                          render=args.render_val, tbw=tbw)
    trainer_with_validator = Trainer(env, replay_memory, learner, sampler, explorer, obs_sampler,
                                     inter_eval_steps=args.inter_eval_steps,
                                     num_episodes=args.num_episodes,
                                     train_start=10000, batch_size=32,
                                     render=args.render_train,
                                     validator=validator, tbw=tbw)
    for e in range(args.num_epochs):
        trainer_with_validator.step()
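# A minimal sketch (an assumption, not the repository's actual network) of the kind of
# Q-network a DQN learner like the one above typically wraps: the standard DeepMind
# architecture mapping a stack of 4 grayscale 84x84 frames to one Q-value per action.
# The class name DQNCnn is hypothetical.
import torch
import torch.nn as nn

class DQNCnn(nn.Module):
    def __init__(self, num_actions):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),
            nn.Linear(512, num_actions),
        )

    def forward(self, x):
        # x: (batch, 4, 84, 84) float frames scaled to [0, 1]
        return self.head(self.features(x))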
    parser.add_argument('--number-of-evaluation-intervals', default=10, type=int,
                        help='The number of evaluation intervals (Default is 10).')
    parser.add_argument('--alpha', default=0.01, type=float,
                        help='The learning rate (Default is 0.01).')
    parser.add_argument('--gamma', default=0.5, type=float,
                        help='The discount factor (Default is 0.5).')
    parser.add_argument('--epsilon', default=0.1, type=float,
                        help='The chance of performing a random action (Default is 0.1).')
    return parser.parse_args()


if __name__ == '__main__':
    args = _arguments_definition()
    learner = QLearner()
    _, episodes_rewards = learner.learn(args.number_of_episodes, args.alpha, args.gamma,
                                        args.epsilon)
    Evaluator.evaluate(episodes_rewards, args.number_of_evaluation_intervals, '')
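# A minimal sketch of the tabular update a QLearner like the one driven above typically
# performs with these alpha/gamma/epsilon arguments. The class and method names here
# (TabularQLearner, choose, update) are illustrative assumptions, not the repository's
# actual implementation.
import random
from collections import defaultdict

class TabularQLearner:
    def __init__(self, num_actions):
        self.num_actions = num_actions
        self.Q = defaultdict(lambda: [0.0] * num_actions)  # state -> per-action values

    def choose(self, s, epsilon):
        # Epsilon-greedy action selection over the current Q estimates.
        if random.random() < epsilon:
            return random.randrange(self.num_actions)
        values = self.Q[s]
        return max(range(self.num_actions), key=values.__getitem__)

    def update(self, s, a, r, s_next, alpha, gamma):
        # One-step Q-learning backup: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        target = r + gamma * max(self.Q[s_next])
        self.Q[s][a] += alpha * (target - self.Q[s][a])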
def main():
    args = get_args()

    nn.set_default_context(get_extension_context(args.extension, device_id=args.device_id))

    if args.log_path:
        output_path = OutputPath(args.log_path)
    else:
        output_path = OutputPath()
    monitor = Monitor(output_path.path)
    tbw = SummaryWriter(output_path.path)

    # Create an atari env.
    from atari_utils import make_atari_deepmind
    env = make_atari_deepmind(args.gym_env, valid=False)
    env_val = make_atari_deepmind(args.gym_env, valid=True)
    print('Observation:', env.observation_space)
    print('Action:', env.action_space)

    # 10000 * 4 frames
    val_replay_memory = ReplayMemory(env.observation_space.shape, env.action_space.shape,
                                     max_memory=args.num_frames)
    replay_memory = ReplayMemory(env.observation_space.shape, env.action_space.shape,
                                 max_memory=40000)
    learner = QLearner(q_cnn, env.action_space.n, sync_freq=1000, save_freq=250000,
                       gamma=0.99, learning_rate=1e-4, name_q='q', save_path=output_path)
    explorer = LinearDecayEGreedyExplorer(env.action_space.n, eps_start=1.0, eps_end=0.01,
                                          eps_steps=1e6, q_builder=q_cnn, name='q')
    sampler = Sampler(args.num_frames)
    obs_sampler = ObsSampler(args.num_frames)
    validator = Validator(env_val, val_replay_memory, explorer, obs_sampler,
                          num_episodes=args.num_val_episodes,
                          num_eval_steps=args.num_eval_steps,
                          render=args.render_val, monitor=monitor, tbw=tbw)
    trainer_with_validator = Trainer(env, replay_memory, learner, sampler, explorer, obs_sampler,
                                     inter_eval_steps=args.inter_eval_steps,
                                     num_episodes=args.num_episodes,
                                     train_start=10000, batch_size=32,
                                     render=args.render_train,
                                     validator=validator, monitor=monitor, tbw=tbw)
    for e in range(args.num_epochs):
        trainer_with_validator.step()
class QbertSubsumptionAgent(Agent):
    """
    Qbert agent which uses a subsumption model, separating the learning of block exploration,
    enemy avoidance and friendly capture into independent learners.
    """

    def __init__(self, random_seed, frame_skip, repeat_action_probability, sound, display_screen,
                 alpha, gamma, epsilon, unexplored_threshold, unexplored_reward, exploration,
                 distance_metric, combined_reward, state_representation):
        if state_representation == 'simple':
            block_state_repr = 'adjacent'
            enemy_state_repr = 'adjacent_dangerous'
            friendly_state_repr = 'simple'
        else:
            block_state_repr = 'verbose'
            enemy_state_repr = 'verbose'
            friendly_state_repr = 'verbose'
        self.world = QbertWorld(random_seed, frame_skip, repeat_action_probability, sound,
                                display_screen, block_state_repr=block_state_repr,
                                enemy_state_repr=enemy_state_repr,
                                friendly_state_repr=friendly_state_repr)
        self.block_learner = QLearner(self.world, alpha, gamma, epsilon, unexplored_threshold,
                                      unexplored_reward, exploration, distance_metric,
                                      state_repr=block_state_repr, tag='blocks')
        self.friendly_learner = QLearner(self.world, alpha, gamma, epsilon, unexplored_threshold,
                                         unexplored_reward, exploration, distance_metric,
                                         state_repr=friendly_state_repr, tag='friendlies')
        # The enemy learner acts greedily: random exploration near enemies is too costly.
        enemy_epsilon = 0
        self.enemy_learner = QLearner(self.world, alpha, gamma, enemy_epsilon,
                                      unexplored_threshold, unexplored_reward, exploration,
                                      distance_metric, state_repr=enemy_state_repr, tag='enemies')
        self.combined_reward = combined_reward

    def action(self):
        enemy_present = self.world.is_enemy_nearby()
        friendly_present = self.world.is_friendly_nearby()
        a_enemies = None
        a_friendlies = None
        s_enemies = None
        s_friendlies = None
        if enemy_present:
            logging.debug('Enemy present!')
            s_enemies = self.world.to_state_enemies()
            a_enemies = self.enemy_learner.get_best_actions(s_enemies)
        if friendly_present:
            logging.debug('Friendly present!')
            s_friendlies = self.world.to_state_friendlies()
            a_friendlies = self.friendly_learner.get_best_actions(s_friendlies)
        s = self.world.to_state_blocks()
        a = self.block_learner.get_best_actions(s)
        # Subsumption arbitration: enemy avoidance overrides friendly capture, which in turn
        # overrides block exploration. Ties between enemy actions are broken by block values.
        if enemy_present and len(a_enemies) > 0:
            logging.debug('Chose enemy action!')
            if len(a_enemies) > 1:
                logging.debug('Broke tie!')
                chosen_action = self.block_learner.get_best_action(s, a_enemies)
            else:
                chosen_action = a_enemies[0]
        elif friendly_present and len(a_friendlies) > 0:
            logging.debug('Chose friendly action!')
            chosen_action = a_friendlies[0]
        else:
            logging.debug('Chose block action!')
            chosen_action = random.choice(a)
        block_score, friendly_score, enemy_score, enemy_penalty = self.world.perform_action(
            chosen_action)
        if enemy_present:
            s_next_enemies = self.world.to_state_enemies()
            self.enemy_learner.update(s_enemies, chosen_action, s_next_enemies,
                                      enemy_score + enemy_penalty)
        if friendly_present:
            s_next_friendlies = self.world.to_state_friendlies()
            self.friendly_learner.update(s_friendlies, chosen_action, s_next_friendlies,
                                         friendly_score)
        s_next = self.world.to_state_blocks()
        # Assumed intent of combined_reward: credit the block learner with the full reward
        # signal; otherwise it only sees the block score.
        combined_score = (block_score + friendly_score + enemy_score + enemy_penalty
                          if self.combined_reward else block_score)
        self.block_learner.update(s, chosen_action, s_next, combined_score)
        return block_score + friendly_score + enemy_score

    def q_size(self):
        return len(self.block_learner.Q) + \
            len(self.friendly_learner.Q) + \
            len(self.enemy_learner.Q)

    def save(self, filename):
        self.block_learner.save('{}_{}'.format(filename, 'block'))
        self.friendly_learner.save('{}_{}'.format(filename, 'friendly'))
        self.enemy_learner.save('{}_{}'.format(filename, 'enemy'))

    def load(self, filename):
        self.block_learner.load('{}_{}'.format(filename, 'block'))
        self.friendly_learner.load('{}_{}'.format(filename, 'friendly'))
        self.enemy_learner.load('{}_{}'.format(filename, 'enemy'))
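# A short usage note (illustrative; constructor argument values are placeholders): because
# the subsumption agent keeps three independent Q-tables, save()/load() fan out to
# per-learner names suffixed _block, _friendly and _enemy, so a training script can
# checkpoint and resume all three together.
agent = QbertSubsumptionAgent(random_seed=123, frame_skip=4, repeat_action_probability=0.0,
                              sound=False, display_screen=False, alpha=0.1, gamma=0.95,
                              epsilon=0.1, unexplored_threshold=1, unexplored_reward=100,
                              exploration='random', distance_metric=None,
                              combined_reward=True, state_representation='simple')
agent.save('qbert_checkpoint')  # writes qbert_checkpoint_block, _friendly and _enemy
agent.load('qbert_checkpoint')  # restores all three learners
print('Total Q-table entries:', agent.q_size())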