def main(_):
    if FLAGS.symbolic:
        params = {'seed': FLAGS.seed, 'level_name': FLAGS.level_name}
        env_generator = symbolic_alchemy.get_symbolic_alchemy_level
    else:
        env_settings = dm_alchemy.EnvironmentSettings(
            seed=FLAGS.seed, level_name=FLAGS.level_name)
        params = {'name': FLAGS.docker_image_name, 'settings': env_settings}
        env_generator = dm_alchemy.load_from_docker

    with env_generator(**params) as env:
        agent = RandomAgent(env.action_spec())

        timestep = env.reset()
        score = 0
        while not timestep.last():
            action = agent.act(timestep)
            timestep = env.step(action)

            if timestep.reward:
                score += timestep.reward
                print('Total score: {:.2f}, reward: {:.2f}'.format(
                    score, timestep.reward))
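# The RandomAgent used above is not defined in this file. Below is a minimal
# sketch, assuming the environment exposes a dm_env-style BoundedArray action
# spec; the class name and the uniform-sampling behaviour are assumptions for
# illustration, not dm_alchemy's API.
import numpy as np


class RandomAgent:
    """Samples actions uniformly at random within the bounds of the action spec."""

    def __init__(self, action_spec):
        self._action_spec = action_spec

    def act(self, timestep):
        del timestep  # a random policy ignores the observation
        spec = self._action_spec
        return np.random.uniform(
            spec.minimum, spec.maximum, size=spec.shape).astype(spec.dtype)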
def run(env_name, agent_name, nb_episodes, render_freq, render_mode):
    logger.set_level(logger.INFO)
    env = gym.make(env_name)

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    # outdir = '/tmp/random-agent-results'
    # video_callable = None if render_mode == 'human' else False
    # env = wrappers.Monitor(env, directory=outdir, force=True, video_callable=video_callable)
    # env = DynamicMonitor(env, directory=outdir, force=True, video_callable=video_callable)

    env.render(mode=render_mode)
    env.seed(0)

    if agent_name == 'RandomAgent':
        agent = RandomAgent(env.env.action_space)
    elif agent_name == 'EpsilonGreedyAgent':
        agent = EpsilonGreedy(env.env.action_space)
    elif agent_name == 'GradientBanditAgent':
        agent = GradientBandit(env.env.action_space)
    elif agent_name == 'ucb':
        agent = ucb(env.env.action_space)
    elif agent_name == 'ThompsonSampling':
        agent = ThompsonSampling(env.env.action_space)

    step = 0
    reward = 0
    done = False
    for episode in range(nb_episodes):
        print(f'--------- Episode {episode} ---------')
        ob = env.reset()
        agent = agent.reset()
        while True:
            step += 1
            # action space may have changed
            # agent = EpsilonGreedy(env.env.action_space)
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            if step % render_freq == 0:
                env.render()
        # The environment can still open a window and render if asked by
        # env.monitor: it calls env.render('rgb_array') to record video.
        # Video is not recorded every episode; see capped_cubic_video_schedule for details.

    # Close the env and write monitor result info to disk
    env.env.close()
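# None of the bandit agents dispatched above are defined in this file. Below is
# a minimal epsilon-greedy sketch matching the act(ob, reward, done) / reset()
# interface run() expects; the constructor signature and the incremental-mean
# value update are assumptions for illustration.
import numpy as np


class EpsilonGreedy:
    def __init__(self, action_space, epsilon=0.1):
        self.action_space = action_space
        self.epsilon = epsilon
        self.reset()

    def reset(self):
        # run() does `agent = agent.reset()`, so reset() returns the agent
        self.counts = np.zeros(self.action_space.n)
        self.values = np.zeros(self.action_space.n)
        self.last_action = None
        return self

    def act(self, observation, reward, done):
        # credit the reward to the previously chosen arm (incremental mean)
        if self.last_action is not None:
            self.counts[self.last_action] += 1
            n = self.counts[self.last_action]
            self.values[self.last_action] += (reward - self.values[self.last_action]) / n
        # explore with probability epsilon, otherwise exploit the best estimate
        if np.random.random() < self.epsilon:
            action = self.action_space.sample()
        else:
            action = int(np.argmax(self.values))
        self.last_action = action
        return action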
def main(episode_count):
    env = gym.make('CartPole-v0')
    agent = RandomAgent(env.action_space.n)

    for i in range(episode_count):
        observation = env.reset()  # initialize the environment
        done = False
        step = 0
        while not done:
            env.render()
            action = agent.act(observation)
            next_observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(step + 1))
            observation = next_observation
            step += 1
class Learner(object):
    def __init__(self, agent_type, tensorboard_dir, save_dir=None, player='offense',
                 pretrained=None, seed=1, episodes=20000, server_port=6000,
                 max_steps=15000, save_freq=500, start=0):
        if player == 'offense':
            self.team = 'base_left'
            self.mid_action = offense_mid_action
        else:
            self.team = 'base_right'
            if player == 'goalie':
                self.mid_action = goalie_mid_action
            else:
                self.mid_action = defense_mid_action

        self.seed = seed
        self.save_freq = save_freq
        self.episodes = episodes
        self.scale_actions = True
        self.server_port = server_port
        self.max_steps = max_steps
        self.save_dir = os.path.join('./saved_models', save_dir)
        self.tensorboard_dir = os.path.join('./tensorboard-log', tensorboard_dir)
        self.pretrained = pretrained
        self.start = start

        self.env = make_env(player, self.server_port, self.team, self.scale_actions)

        # configure redis
        self.redis_instance = connect_redis()
        # initialize teammate set
        self.redis_instance.sadd('teammates', self.env.unum)
        print('Number of teammates:', self.redis_instance.scard('teammates'))

        if agent_type == 'PDDPG':
            self.agent = PDDPGAgent(
                self.env.observation_space, self.env.action_space,
                actor_kwargs={'hidden_layers': [1024, 512, 256, 256, 128, 128],
                              'init_type': "kaiming", 'init_std': 0.01,
                              'activation': 'leaky_relu'},
                critic_kwargs={'hidden_layers': [1024, 512, 256, 256, 128, 128],
                               'init_type': "kaiming", 'init_std': 0.01,
                               'activation': 'leaky_relu'},
                batch_size=32,
                learning_rate_actor=0.001,  # 0.0001 also used
                learning_rate_critic=0.001,
                gamma=0.99,
                tau_actor=0.001,
                tau_critic=0.001,
                n_step_returns=True,
                epsilon_steps=1000,
                epsilon_final=0.1,
                replay_memory_size=500000,
                inverting_gradients=True,
                initial_memory_threshold=1000,
                beta=0.2,
                clip_grad=1.,
                use_ornstein_noise=False,
                adam_betas=(0.9, 0.999),  # default (0.95, 0.999)
                seed=self.seed)
        elif agent_type == 'MAPDDPG':
            self.agent = MAPDDPGAgent(
                self.env.observation_space, self.env.action_space,
                actor_kwargs={'hidden_layers': [1024, 512, 256, 256, 128, 128],
                              'init_type': "kaiming", 'init_std': 0.01,
                              'activation': 'leaky_relu'},
                critic_kwargs={'hidden_layers': [1024, 512, 256, 256, 128, 128],
                               'init_type': "kaiming", 'init_std': 0.01,
                               'activation': 'leaky_relu'},
                batch_size=32,
                learning_rate_actor=0.001,  # 0.0001 also used
                learning_rate_critic=0.001,
                gamma=0.99,
                tau_actor=0.001,
                tau_critic=0.001,
                n_step_returns=True,
                epsilon_steps=1000,
                epsilon_final=0.1,
                replay_memory_size=500000,
                inverting_gradients=True,
                initial_memory_threshold=1000,
                beta=0.2,
                clip_grad=1.,
                use_ornstein_noise=False,
                adam_betas=(0.9, 0.999),  # default (0.95, 0.999)
                seed=self.seed,
                num_agents=int(self.redis_instance.get('num_agents')),
                unum=self.env.unum)
        elif agent_type == 'RANDOM':
            self.agent = RandomAgent(observation_space=self.env.observation_space,
                                     action_space=self.env.action_space)
        else:
            raise NotImplementedError

        self.writer = SummaryWriter(self.tensorboard_dir)
        print('Agent:', self.agent)

    def load_model(self, dir, i):
        prefix = os.path.join(dir, str(i))
        print('Loading model from', prefix, '...')
        self.agent.actor.load_state_dict(torch.load(prefix + '_actor.pt', map_location='cpu'))
        self.agent.critic.load_state_dict(torch.load(prefix + '_critic.pt', map_location='cpu'))
        self.agent.actor.eval()
        self.agent.critic.eval()
        print('Models loaded successfully')

    def run(self):
        # Random seed
        # self.seed += 10000 * proc_id()
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)

        # later can sort by proximity (needs to match the order in redis_manager)
        self.all_agents = list(self.redis_instance.smembers('teammates'))
        self.all_agents.sort()

        # Prepare for interaction with environment
        start_time = time.time()

        # if isinstance(self.agent, RandomAgent):
        #     for i in range(self.episodes):
        #         obs = self.env.reset()
        #         obs = np.array(obs, dtype=np.float32, copy=False)
        #         print(obs)
        #         for j in range(self.local_steps_per_episode):
        #             act, act_param = self.agent.act(obs)
        #             action = mid_action(act, act_param)
        #             next_obs, reward, terminal, info = self.env.step(action)
        #             obs = next_obs

        n_step_returns = True
        update_ratio = 0.1

        if self.save_freq > 0 and self.save_dir:
            self.save_dir = os.path.join(self.save_dir, 'agent' + str(self.env.unum))
            os.makedirs(self.save_dir, exist_ok=True)
        if self.pretrained:
            self.load_model(self.pretrained, self.start)

        # train log
        total_reward = 0.
        returns = []
        timesteps = []
        goals = []

        if isinstance(self.agent, PDDPGAgent) and not isinstance(self.agent, MAPDDPGAgent):
            print('\n===========Start Training PDDPG===========')
            # main loop
            for i in range(self.start, self.start + self.episodes):
                # save model
                if self.save_freq > 0 and self.save_dir and (i + 1) % self.save_freq == 0:
                    prefix = os.path.join(self.save_dir, str(i + 1))
                    torch.save(self.agent.actor.state_dict(), prefix + '_actor.pt')
                    torch.save(self.agent.critic.state_dict(), prefix + '_critic.pt')
                    print('Models saved successfully at episode ' + str(i + 1))

                info = {'status': "NOT_SET"}
                # initialize environment, reward and transitions
                obs = self.env.reset()
                obs = np.array(obs, dtype=np.float32, copy=False)
                episode_reward = 0.
                transitions = []

                # get discrete action and continuous parameters
                act, act_param, all_actions, all_action_parameters = self.agent.act(obs)
                action = self.mid_action(act, act_param)

                for j in range(self.max_steps):
                    next_obs, reward, terminal, info = self.env.step(action)
                    next_obs = np.array(next_obs, dtype=np.float32, copy=False)

                    # get discrete action and continuous parameters
                    next_act, next_act_param, next_all_actions, next_all_action_parameters = self.agent.act(next_obs)
                    next_action = self.mid_action(next_act, next_act_param)

                    if n_step_returns:
                        transitions.append(
                            [obs,
                             np.concatenate((all_actions.data, all_action_parameters.data)).ravel(),
                             reward,
                             next_obs,
                             np.concatenate((next_all_actions.data, next_all_action_parameters.data)).ravel(),
                             terminal])
                    else:
                        self.agent.step(obs, (act, act_param, all_actions, all_action_parameters),
                                        reward, next_obs,
                                        (next_act, next_act_param, next_all_actions, next_all_action_parameters),
                                        terminal, optimise=False)

                    act, act_param, all_actions, all_action_parameters = \
                        next_act, next_act_param, next_all_actions, next_all_action_parameters
                    action = next_action
                    obs = next_obs

                    episode_reward += reward
                    # env.render()

                    if terminal:
                        break

                # decay epsilon
                self.agent.end_episode()

                # calculate n-step returns
                if n_step_returns:
                    nsreturns = compute_n_step_returns(transitions, self.agent.gamma)
                    for t, nsr in zip(transitions, nsreturns):
                        t.append(nsr)
                        self.agent.replay_memory.append(state=t[0], action=t[1], reward=t[2],
                                                        next_state=t[3], next_action=t[4],
                                                        terminal=t[5], time_steps=None,
                                                        n_step_return=nsr)

                # update networks at the end of each episode
                n_updates = int(update_ratio * j)
                for _ in range(n_updates):
                    self.agent.update()

                # train log
                returns.append(episode_reward)
                timesteps.append(j)
                goals.append(info['status'] == 'GOAL')
                total_reward += episode_reward
                self.writer.add_scalar('Episode reward', episode_reward, i)
                if i % 100 == 0:
                    print('{0:5s} : Total mean reward:{1:.4f} | Episode reward:{2:.4f}'
                          .format(str(i + 1), total_reward / (i + 1), episode_reward))
                    self.writer.add_scalar('Last 100 episodes mean reward',
                                           np.array(returns[-100:]).mean(), i)

        elif isinstance(self.agent, MAPDDPGAgent):
            print('\n===========Start Training MAPDDPG===========')
            # main loop
            for i in range(self.start, self.start + self.episodes):
                # save model
                if self.save_freq > 0 and self.save_dir and (i + 1) % self.save_freq == 0:
                    prefix = os.path.join(self.save_dir, str(i + 1))
                    torch.save(self.agent.actor.state_dict(), prefix + '_actor.pt')
                    torch.save(self.agent.critic.state_dict(), prefix + '_critic.pt')
                    print('Models saved successfully at episode ' + str(i + 1))

                info = {'status': "NOT_SET"}
                # initialize environment, reward and transitions
                obs = self.env.reset()
                obs = np.array(obs, dtype=np.float32, copy=False)
                episode_reward = 0.
                transitions = []

                # get discrete action and continuous parameters
                act, act_param, all_actions, all_action_parameters = self.agent.act(obs)
                action = self.mid_action(act, act_param)

                # update the observation and action of agent i in redis
                sync_agent_obs_actions(self.redis_instance, self.env.unum, obs,
                                       all_actions, all_action_parameters)
                # query all agents' observations and actions
                all_agent_obs, all_agent_actions, success1 = query_all_obs_actions(self.redis_instance)

                for j in range(self.max_steps):
                    # TODO: query other agents' actions (for inference use)
                    # take action in environment
                    next_obs, reward, terminal, info = self.env.step(action)
                    next_obs = np.array(next_obs, dtype=np.float32, copy=False)

                    # get discrete action and continuous parameters
                    next_act, next_act_param, next_all_actions, next_all_action_parameters = self.agent.act(next_obs)
                    next_action = self.mid_action(next_act, next_act_param)

                    # update the observation and action of agent i in redis
                    sync_agent_obs_actions(self.redis_instance, self.env.unum, next_obs,
                                           next_all_actions, next_all_action_parameters)
                    # query all agents' observations and actions
                    all_agent_next_obs, all_agent_next_actions, success2 = query_all_obs_actions(self.redis_instance)

                    if n_step_returns and success1 and success2:
                        transitions.append([all_agent_obs.ravel(), all_agent_actions.ravel(),
                                            reward, all_agent_next_obs.ravel(),
                                            all_agent_next_actions.ravel(), terminal])
                    # else:
                    #     self.agent.step(obs, (act, act_param, all_actions, all_action_parameters),
                    #                     reward, next_obs,
                    #                     (next_act, next_act_param, next_all_actions, next_all_action_parameters),
                    #                     terminal, optimise=False)

                    all_agent_actions = all_agent_next_actions
                    action = next_action
                    all_agent_obs = all_agent_next_obs
                    success1 = True

                    episode_reward += reward
                    if terminal:
                        break

                # decay epsilon
                self.agent.end_episode()

                # calculate n-step returns
                if n_step_returns:
                    nsreturns = compute_n_step_returns(transitions, self.agent.gamma)
                    for t, nsr in zip(transitions, nsreturns):
                        t.append(nsr)
                        if not any(elem is None for elem in t):
                            self.agent.replay_memory.append(state=t[0], action=t[1], reward=t[2],
                                                            next_state=t[3], next_action=t[4],
                                                            terminal=t[5], time_steps=None,
                                                            n_step_return=nsr)

                # update networks at the end of each episode
                n_updates = int(update_ratio * j)
                for _ in range(n_updates):
                    # sync policy in redis
                    sync_agent_policy(self.redis_instance, self.env.unum, self.agent.actor_target)
                    self.agent.update()

                # train log
                returns.append(episode_reward)
                timesteps.append(j)
                goals.append(info['status'] == 'GOAL')
                total_reward += episode_reward
                self.writer.add_scalar('Episode reward', episode_reward, i)
                if i % 100 == 0:
                    print('{0:5s} : Total mean reward:{1:.4f} | Episode reward:{2:.4f}'
                          .format(str(i + 1), total_reward / (i + 1), episode_reward))
                    self.writer.add_scalar('Last 100 episodes mean reward',
                                           np.array(returns[-100:]).mean(), i)

        train_goal_ratio = goals.count(True) / self.episodes
        print("Training goal ratio: %.2f" % train_goal_ratio)
        end_time = time.time()
        print("Training time: %.2f seconds" % (end_time - start_time))
        print("==========TRAINING END==========")
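# compute_n_step_returns() is called in run() but not defined in this file.
# Below is a sketch of one common implementation: the discounted return-to-go
# for every transition, computed by walking the episode backwards. It assumes
# the reward sits at index 2 of each transition, as in the lists built above;
# the exact semantics of the original helper may differ.
import numpy as np


def compute_n_step_returns(episode_transitions, gamma):
    n = len(episode_transitions)
    n_step_returns = np.zeros((n,))
    if n == 0:
        return n_step_returns
    n_step_returns[n - 1] = episode_transitions[n - 1][2]  # last step: just its reward
    for i in range(n - 2, -1, -1):
        reward = episode_transitions[i][2]
        n_step_returns[i] = reward + gamma * n_step_returns[i + 1]
    return n_step_returns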
    agent = DistanceMinimizing(env.action_space)
elif agent_arg == 'deg-min':
    agent = DegreeMinimizing(env.action_space)

seed = asteroids_args.seed if isinstance(asteroids_args.seed, int) else asteroids_args.seed[0]
runs = asteroids_args.runs if isinstance(asteroids_args.runs, int) else asteroids_args.runs[0]
env.seed(seed)

total_runs = 0
total_score = 0
while total_runs < runs:
    reward = 0
    done = False
    score = 0
    special_data = {'ale.lives': 3}
    ob = env.reset()
    i = 0
    while not done:
        action = agent.act(ob, reward, done)
        ob, reward, done, x = env.step(action)
        score += reward
        # env.render()
        i += 1
    print(f'Asteroids score [{i}]: {score}')
    total_runs += 1
    total_score += score

print(f'Average score across {total_runs} runs: {total_score / total_runs}')

# Close the env and write monitor result info to disk
env.close()
import gym

from agents.random_agent import RandomAgent

env = gym.make("CartPole-v0")
agent = RandomAgent(env.action_space)

for i_episode in range(20):
    observation = env.reset()
    reward = 0
    done = False
    for t in range(100):
        env.render()
        print(observation)
        action = agent.act(observation, reward, done)
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break

env.close()
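# A matching agents/random_agent.py, assuming the three-argument act()
# signature used above (a minimal sketch; the real module may differ):
class RandomAgent:
    """Samples uniformly from the action space, ignoring the observation."""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()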