def test_header(self):
    """Define the arena with several agents and check the CSV header is sorted."""
    arena = Arena([("Random A", lambda seed: AgentRandom(seed)),
                   ("Random C", lambda seed: AgentRandom(seed)),
                   ("Random B", lambda seed: AgentRandom(seed))], 5)
    self.assertListEqual(arena.csv_header(),
                         ["opponent", "Random A", "Random B", "Random C"])
def test_valid_actions(self):
    """Test if the random agent acts correctly."""
    game = TestGames.replay(1, [
        1, 1, 3, 0, 4, 1, 0, 0, 1, 1, 3, 0, 1, 2, 6, 0,
        7, 0, 0, 0, 1, 2, 2, 0, 8, 2, 0, 0, 1, 1, 5, 0
    ])
    actions = Agent.valid_actions(game, 782)

    # player can use baron or priest, at 1 or 3
    self.assertEqual(len(actions), 2)
    self.assertListEqual(actions, [
        PlayerAction(
            discard=Card.priest, player_target=3, guess=0, revealed_card=0),
        PlayerAction(
            discard=Card.baron, player_target=3, guess=0, revealed_card=0)
    ])

    agent = AgentRandom(4)
    action = agent.move(game)
    self.assertEqual(
        action,
        PlayerAction(
            discard=Card.baron, player_target=1, guess=0, revealed_card=0))
def test_init_multiple(self):
    """Define the arena with several agents"""
    arena = Arena([("Random A", lambda seed: AgentRandom(seed)),
                   ("Random C", lambda seed: AgentRandom(seed)),
                   ("Random B", lambda seed: AgentRandom(seed))], 5)
    self.assertListEqual(arena.names(), ["Random A", "Random B", "Random C"])
def test_list(self):
    """Define the arena with several agents and check the CSV result rows."""
    arena = Arena([("Random A", lambda seed: AgentRandom(seed)),
                   ("Random C", lambda seed: AgentRandom(seed)),
                   ("Random B", lambda seed: AgentRandom(seed))], 5)
    self.assertEqual(len(arena.csv_results_lists()), 3)
    self.assertListEqual(arena.csv_results_lists(),
                         [['Random A', 0.2, 0.8, 0.8],
                          ['Random B', 0.2, 0.2, 0.2],
                          ['Random C', 0.2, 0.8, 0.2]])
def test_init_multiple(self):
    """Define the arena with several agents"""
    arena = Arena([("Random A", lambda seed: AgentRandom(seed)),
                   ("Random C", lambda seed: AgentRandom(seed)),
                   ("Random B", lambda seed: AgentRandom(seed))], 5)
    results = arena.results()
    self.assertEqual(len(results), 6)
    self.assertListEqual(results, [('Random A', 'Random A', 1),
                                   ('Random A', 'Random B', 1),
                                   ('Random A', 'Random C', 1),
                                   ('Random B', 'Random B', 1),
                                   ('Random C', 'Random B', 1),
                                   ('Random C', 'Random C', 1)])
def __init__(self, model_path, dtype, seed=451):
    self._seed = seed
    self._idx = 0
    self._dtype = dtype
    self.env = LoveLetterEnv(AgentRandom(seed), seed)

    state = self.env.reset()
    self._model = ActorCritic(state.shape[0],
                              self.env.action_space).type(dtype)
    # Map GPU-saved weights onto the CPU so the agent also loads without CUDA
    self._model.load_state_dict(
        torch.load(model_path, map_location={'cuda:0': 'cpu'}))
def __init__(self, model_path, seed=451):
    self._seed = seed
    self._idx = 0
    self.env = LoveLetterEnv(AgentRandom(seed), seed)
    # The algorithms require a vectorized environment to run
    self.vec_env = DummyVecEnv([lambda: self.env])
    state = self.env.reset()

    # PPO2.load is a classmethod that returns the restored model, so keep its
    # return value instead of discarding it.
    self._model = PPO2.load(model_path, env=self.vec_env)
    default='arena.results.csv',
    help='Path to write arena results')
ARGS = PARSER.parse_args()

print('Starting arena')

A3C_PATH = os.path.join("models",
                        "stated_2017-05-01T22-59-33.510476_best_0.45875")
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() \
    else torch.FloatTensor

ARENA = Arena(
    [
        # Place agents in this list as created.
        # First in the tuple is the readable name;
        # second is a lambda that ONLY takes a random seed. The seed can be
        # discarded if the Agent does not require one.
        ("A3C", lambda seed: AgentA3C(A3C_PATH, dtype, seed)),
        ("Random", lambda seed: AgentRandom(seed))
    ], 500)

print('Run the arena for: ', ARENA.csv_header())

with open(ARGS.output, 'w') as f:
    WRITER = csv.writer(f)
    WRITER.writerow(ARENA.csv_header())
    WRITER.writerows(ARENA.csv_results_lists())

print('Complete')
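# --- Not part of the original script: a minimal sketch for reading the arena
# CSV back in for a quick look. It only assumes the header/row layout produced
# by Arena.csv_header() and Arena.csv_results_lists() above; the function name
# is hypothetical.
def summarize_arena_csv(path='arena.results.csv'):
    """Print each row of win rates keyed by the header's agent names."""
    import csv

    with open(path, newline='') as handle:
        rows = list(csv.reader(handle))
    header, body = rows[0], rows[1:]
    for row in body:
        opponent, rates = row[0], row[1:]
        print(opponent, dict(zip(header[1:], rates)))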
def train(rank, args, shared_model, dtype):
    torch.manual_seed(args.seed + rank)

    env = LoveLetterEnv(AgentRandom(args.seed + rank), args.seed + rank)
    env.seed(args.seed + rank)

    state = env.reset()
    model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()

    state = torch.from_numpy(state).type(dtype)
    done = True
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256).type(dtype))
            hx = Variable(torch.zeros(1, 256).type(dtype))
        else:
            cx = Variable(cx.data.type(dtype))
            hx = Variable(hx.data.type(dtype))

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.cpu().numpy()[0][0])
            done = done or episode_length >= args.max_episode_length

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).type(dtype)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the critic if the episode did not finish
        R = torch.zeros(1, 1).type(dtype)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1).type(dtype)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.beta * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
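# --- Not part of train(): a standalone illustration of the GAE recursion used
# above, written with plain Python floats so the bootstrapping is easy to
# follow. The gamma/tau defaults here are illustrative, not the training
# arguments' defaults.
def gae_advantages(rewards, values, bootstrap_value, gamma=0.99, tau=1.0):
    """Return one advantage per reward via Generalized Advantage Estimation."""
    values = list(values) + [bootstrap_value]
    gae = 0.0
    advantages = []
    for i in reversed(range(len(rewards))):
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), then accumulate
        # gae = (gamma * tau) * gae + delta_t, exactly as in the loop above.
        delta = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gamma * tau * gae + delta
        advantages.append(gae)
    return list(reversed(advantages))

# Example: three steps with a zero bootstrap value at the end of the episode.
# print(gae_advantages([0.0, 0.0, 1.0], [0.1, 0.2, 0.5], 0.0))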
def test(rank, args, shared_model, dtype):
    test_ctr = 0
    torch.manual_seed(args.seed + rank)

    # Set up logger
    timestring = str(
        date.today()) + '_' + datetime.datetime.now().strftime('%H-%M-%S')
    run_name = args.save_name + '_' + timestring
    configure("logs/run_" + run_name, flush_secs=5)

    env = LoveLetterEnv(AgentRandom(args.seed + rank), args.seed + rank)
    env.seed(args.seed + rank)

    state = env.reset()
    model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    model.eval()

    state = torch.from_numpy(state).type(dtype)
    reward_sum = 0
    max_reward = -99999999
    max_winrate = 0
    rewards_recent = deque([], 100)

    done = True
    start_time = time.time()
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
        else:
            cx = Variable(cx.data.type(dtype), volatile=True)
            hx = Variable(hx.data.type(dtype), volatile=True)

        value, logit, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            rewards_recent.append(reward_sum)
            rewards_recent_avg = sum(rewards_recent) / len(rewards_recent)
            print(
                "{} | Episode Reward {: >4}, Length {: >2} | Avg Reward {:0.2f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length, rewards_recent_avg))

            log_value('Reward', reward_sum, test_ctr)
            log_value('Reward Average', rewards_recent_avg, test_ctr)
            log_value('Episode length', episode_length, test_ctr)

            if reward_sum >= max_reward:
                path_output = args.save_name + '_max'
                torch.save(shared_model.state_dict(), path_output)
                path_now = "{}_{}".format(
                    args.save_name,
                    datetime.datetime.now().strftime('%H-%M-%S'))
                torch.save(shared_model.state_dict(), path_now)
                max_reward = reward_sum

                win_rate_v_random = Arena.compare_agents_float(
                    lambda seed: AgentA3C(path_output, dtype, seed),
                    lambda seed: AgentRandom(seed), 800)
                msg = " {} | VsRandom: {: >4}%".format(
                    datetime.datetime.now().strftime("%c"),
                    round(win_rate_v_random * 100, 2))
                print(msg)
                log_value('Win Rate vs Random', win_rate_v_random, test_ctr)

                if win_rate_v_random > max_winrate:
                    print("Found superior model at {}".format(
                        datetime.datetime.now().isoformat()))
                    torch.save(
                        shared_model.state_dict(),
                        "{}_{}_best_{}".format(
                            args.save_name,
                            datetime.datetime.now().isoformat(),
                            win_rate_v_random))
                    max_winrate = win_rate_v_random

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            test_ctr += 1

            if test_ctr % 10 == 0 and not args.evaluate:
                torch.save(shared_model.state_dict(), args.save_name)

            if not args.evaluate:
                time.sleep(60)
            elif test_ctr == evaluation_episodes:
                # Ensure the environment is closed so we can complete the
                # submission
                env.close()

        state = torch.from_numpy(state).type(dtype)
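# --- Not part of test(): a quick back-of-the-envelope helper for judging
# whether a win rate measured over n independent games (e.g. the 800 games
# used above) is a real improvement or noise. The function name is
# hypothetical and the example values are illustrative only.
import math

def win_rate_standard_error(win_rate, n_games):
    """Binomial standard error of an estimated win rate."""
    return math.sqrt(win_rate * (1.0 - win_rate) / n_games)

# Example: a 0.46 win rate over 800 games has a standard error of about 0.018.
# print(win_rate_standard_error(0.46, 800))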
def test_init_single(self):
    """Define the arena with one version"""
    arena = Arena([("Random", lambda seed: AgentRandom(seed))], 5)
    results = arena.results()
    self.assertEqual(len(results), 1)
    self.assertListEqual(results, [("Random", "Random", 1)])
def test_init_single(self):
    """Define the arena with one version"""
    arena = Arena([("Random", lambda seed: AgentRandom(seed))], 5)
    self.assertListEqual(arena.names(), ["Random"])
    help='path/prefix for the filename to save shared model\'s parameters')
parser.add_argument(
    '--load-name',
    default=None,
    metavar='SN',
    help='path/prefix for the filename to load shared model\'s parameters')

if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.FloatTensor

    env = LoveLetterEnv(AgentRandom(args.seed), args.seed)
    state = env.reset()

    shared_model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    if args.load_name is not None:
        shared_model.load_state_dict(torch.load(args.load_name))
    shared_model.share_memory()

    # train(1, args, shared_model, dtype)
    processes = []

    p = mp.Process(
        target=test, args=(args.num_processes, args, shared_model, dtype))
    p.start()
    processes.append(p)

    if not args.evaluate:
            **kwargs,
            net_arch=[512, dict(pi=[256, 128], vf=[256, 128])],
            feature_extraction="mlp")


if __name__ == '__main__':
    args = parser.parse_args()

    # Bind the loop variable as a default argument so each worker env gets its
    # own seed offset instead of the final value of i (late-binding closures).
    if args.load_name:
        env = SubprocVecEnv([
            lambda i=i: LoveLetterEnv(TFAgent(args.load_name, args.seed + i))
            for i in range(args.num_processes)
        ])
    else:
        env = SubprocVecEnv([
            lambda i=i: LoveLetterEnv(AgentRandom(args.seed + i))
            for i in range(args.num_processes)
        ])

    if args.load_name:
        # PPO2.load is a classmethod that returns the restored model; keep its
        # return value instead of discarding it.
        model = PPO2.load(args.load_name, env=env,
                          tensorboard_log=args.log_dir)
    else:
        model = PPO2(
            CustomPolicy,
            env,
            verbose=0,
            tensorboard_log=args.log_dir,
            learning_rate=args.lr,
            n_steps=args.num_steps,
            nminibatches=5)

    model.learn(total_timesteps=int(args.total_steps),