def run_training(args):
    step_mul = 16
    opt_eps = 1e-8

    with open(args.log_file, mode='w') as log_file:
        # Removes "Namespace" from printout
        print('Args:', str(args)[9:], file=log_file)

        env = BuildMarinesEnvironment(
            render=args.render,
            step_multiplier=step_mul,
            verbose=args.verbose,
            enable_scv_helper=(not args.no_scvs),
            enable_kill_helper=(not args.no_kill),
        )
        run_settings = RunSettings(
            num_episodes=args.episodes,
            num_epochs=args.epochs,
            batch_size=args.batch_size,
            train_every=args.train_every,
            save_every=args.save_every,
            graph_every=args.graph_every,
            averaging_window=args.window,
            graph_file=args.graph_file,
            log_file=log_file,
            verbose=args.verbose,
        )

        if args.testagent:
            agent = TestAgent()
        else:
            agent_settings = AgentSettings(
                optimizer=torch.optim.Adam,
                learning_rate=args.lr,
                lr_gamma=args.lr_gamma,
                lr_step_size=args.lr_step_size,
                opt_eps=opt_eps,
                epsilon_max=0,
                epsilon_min=0,
                epsilon_duration=0,
                verbose=args.verbose,
            )
            memory = PolicyGradientMemory(
                buffer_len=args.memsize,
                discount=args.discount,
                averaging_window=args.window,
            )
            model = PolicyGradientNet(
                num_blocks=args.resblocks,
                channels=args.channels,
                force_cpu=args.cpu,
            )
            agent = PolicyGradientAgent(
                init_temp=args.init_temp,
                temp_steps=args.temp_steps,
                save_file=args.model_file,
                log_file=log_file,
                model=model,
                settings=agent_settings,
                memory=memory,
                force_cpu=args.cpu,
            )
            agent.load()

        experiment = Experiment([agent], env, run_settings)
        experiment.train()
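# Hypothetical CLI wiring for run_training(). Every flag name and default below is
# an assumption inferred from the attributes the function reads off `args`; the
# project's actual argument parser may differ.
import argparse


def parse_args():
    p = argparse.ArgumentParser(description='BuildMarines policy-gradient training (sketch)')
    p.add_argument('--log-file', dest='log_file', default='train.log')
    p.add_argument('--graph-file', dest='graph_file', default='train.png')
    p.add_argument('--model-file', dest='model_file', default='model.pt')
    p.add_argument('--render', action='store_true')
    p.add_argument('--verbose', action='store_true')
    p.add_argument('--no-scvs', dest='no_scvs', action='store_true')
    p.add_argument('--no-kill', dest='no_kill', action='store_true')
    p.add_argument('--testagent', action='store_true')
    p.add_argument('--cpu', action='store_true')
    p.add_argument('--episodes', type=int, default=10000)
    p.add_argument('--epochs', type=int, default=1)
    p.add_argument('--batch-size', dest='batch_size', type=int, default=32)
    p.add_argument('--train-every', dest='train_every', type=int, default=512)
    p.add_argument('--save-every', dest='save_every', type=int, default=2048)
    p.add_argument('--graph-every', dest='graph_every', type=int, default=50)
    p.add_argument('--window', type=int, default=100)
    p.add_argument('--lr', type=float, default=1e-4)
    p.add_argument('--lr-gamma', dest='lr_gamma', type=float, default=0.95)
    p.add_argument('--lr-step-size', dest='lr_step_size', type=int, default=10000)
    p.add_argument('--memsize', type=int, default=100000)
    p.add_argument('--discount', type=float, default=0.99)
    p.add_argument('--resblocks', type=int, default=4)
    p.add_argument('--channels', type=int, default=32)
    p.add_argument('--init-temp', dest='init_temp', type=float, default=1.0)
    p.add_argument('--temp-steps', dest='temp_steps', type=int, default=100000)
    return p.parse_args()


if __name__ == '__main__':
    run_training(parse_args())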
def run_training(
        opponent,
        mcts_opp,
        game_state_file,
        graph_file,
        model_save_file,
        mcts_iters,
        temp,
        tempsteps,
        lr,
        discount,
        memsize,
        num_episodes,
        num_epochs,
        batch_size,
        train_every,
        save_every,
        graph_every,
        averaging_window,
        opt_eps=1e-8,
        ucb_c=1.5,
        boardsize=8,
        inputs=20,
        render=False,
        verbose=False,
):
    env = PommermanEnvironment(
        render=render,
        num_agents=2,
        game_state_file=game_state_file,
    )
    run_settings = RunSettings(
        num_episodes=num_episodes,
        num_epochs=num_epochs,
        batch_size=batch_size,
        train_every=train_every,
        save_every=save_every,
        graph_every=graph_every,
        averaging_window=averaging_window,
        graph_file=graph_file,
        verbose=verbose,
    )
    agent_settings = AgentSettings(
        optimizer=torch.optim.Adam,
        learning_rate=lr,
        opt_eps=opt_eps,
        epsilon_max=0,
        epsilon_min=0,
        epsilon_duration=0,
        verbose=verbose,
    )
    memory = MCTSMemory(buffer_len=memsize, discount=discount)

    if mcts_opp is None:
        mcts_opp = opponent
    if mcts_opp == 'rand':
        opp = pommerman.agents.RandomAgent()
    elif mcts_opp == 'noop':
        opp = PommermanNoopAgent()
    elif mcts_opp == 'simp':
        opp = pommerman.agents.SimpleAgent()
    else:
        raise Exception('Invalid MCTS opponent type', mcts_opp)

    mcts_model = ActorCriticNet(board_size=boardsize, in_channels=inputs)
    agent1 = MCTSAgent(
        mcts_iters=mcts_iters,
        discount=discount,
        c=ucb_c,
        temp=temp,
        tempsteps=tempsteps,
        agent_id=0,
        opponent=opp,
        model_save_file=model_save_file,
        model=mcts_model,
        settings=agent_settings,
        memory=memory,
    )
    agent1.load()

    if opponent == 'rand':
        agent2 = RandomAgent()
    elif opponent == 'noop':
        agent2 = NoopAgent()
    elif opponent == 'simp':
        agent2 = SimpleAgent()
    else:
        raise Exception('Invalid opponent type', opponent)

    experiment = Experiment([agent1, agent2], env, run_settings)
    experiment.train()
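# A hypothetical invocation of run_training() for the Pommerman MCTS agent. Every
# value below (file names, iteration counts, hyperparameters) is an illustrative
# assumption keyed to the parameter names above, not an original configuration.
if __name__ == '__main__':
    run_training(
        opponent='simp',        # train against pommerman's SimpleAgent
        mcts_opp=None,          # falls back to `opponent` inside run_training
        game_state_file=None,
        graph_file='pommerman_mcts.png',
        model_save_file='pommerman_mcts.pt',
        mcts_iters=100,
        temp=1.0,
        tempsteps=10000,
        lr=1e-4,
        discount=0.99,
        memsize=100000,
        num_episodes=10000,
        num_epochs=1,
        batch_size=32,
        train_every=256,
        save_every=1024,
        graph_every=50,
        averaging_window=100,
        render=False,
        verbose=True,
    )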
def index_spatial_probs(self, spatial_probs, indices):
    # Gather the probability at the sampled (y, x) coordinate for every element
    # along the leading (batch/depth) dimensions.
    index_tuple = torch.meshgrid(
        [torch.arange(x) for x in spatial_probs.size()[:-2]]) + (
            indices[:, 0],
            indices[:, 1],
        )
    output = spatial_probs[index_tuple]
    return output

def entropy(self, spatial_probs, nonspatial_probs):
    # Entropy bonus for the PPO objective: c3 weights the spatial policy head,
    # c4 the non-spatial head; eps_denom guards against log(0).
    c3 = self.PPO_settings['c3']
    c4 = self.PPO_settings['c4']
    eps_denom = self.PPO_settings['eps_denom']

    prod_s = spatial_probs[:, 0, :, :] * torch.log(spatial_probs[:, 0, :, :] + eps_denom)
    prod_n = nonspatial_probs * torch.log(nonspatial_probs + eps_denom)

    ent = -c3 * torch.mean(torch.sum(prod_s, dim=(1, 2)))
    ent = ent - c4 * torch.mean(torch.sum(prod_n, dim=1))
    return ent

settings_ppo = AgentSettings(
    optimizer=optim.Adam,
    learning_rate=0.00025,
    epsilon_max=1.0,
    epsilon_min=0.05,
    epsilon_duration=1000000,
)
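# Standalone sanity check of the entropy bonus above:
#   ent = c3 * E_batch[H(spatial)] + c4 * E_batch[H(nonspatial)],
# where H(p) = -sum p * log(p + eps_denom). The shapes, coefficients, and batch
# size below are illustrative assumptions, not values used by the agent.
import torch

c3, c4, eps_denom = 0.5, 1.0, 1e-6
spatial_probs = torch.softmax(torch.randn(32, 1, 64, 64).flatten(2), dim=-1).view(32, 1, 64, 64)
nonspatial_probs = torch.softmax(torch.randn(32, 17), dim=-1)

prod_s = spatial_probs[:, 0, :, :] * torch.log(spatial_probs[:, 0, :, :] + eps_denom)
prod_n = nonspatial_probs * torch.log(nonspatial_probs + eps_denom)
ent = -c3 * torch.mean(torch.sum(prod_s, dim=(1, 2))) - c4 * torch.mean(torch.sum(prod_n, dim=1))
print(ent)  # positive, and larger when both policy heads are closer to uniform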
def main():
    ### Change this map if you must
    map_name = "DefeatRoaches"
    render = False
    step_mul = 8

    ### Edit this to be a list of sc2_env.Agent() variables, one for each agent
    ### or bot you want, unless you are playing a minigame
    players = None

    env = FullStateActionEnvironment(map_name_=map_name,
                                     render=render,
                                     step_multiplier=step_mul,
                                     players=players)

    ### Set this to construct your desired network inheriting from BaseNetwork
    model = None

    ### Change these parameters and dicts to customize training
    lr = 1e-4
    eps_max = 0.3
    eps_min = 0.05
    eps_duration = 1e5
    history_size = 20

    num_episodes = 1000000
    num_epochs = 2
    batch_size = 32
    train_every = 2048
    save_every = 10240
    graph_every = 50
    averaging_window = 100

    """
    :param optimizer: A class from torch.optim (instantiated later)
    :param learning_rate: The learning rate for the network
    :param epsilon_max: The starting epsilon
    :param epsilon_min: The final epsilon
    :param epsilon_duration: The number of frames to reach the final epsilon
    """
    agent_settings = AgentSettings(torch.optim.Adam,
                                   lr,
                                   eps_max,
                                   eps_min,
                                   eps_duration)

    ### Unless you are changing code in interface, you shouldn't change this dict
    run_settings = RunSettings(num_episodes,
                               num_epochs,
                               batch_size,
                               train_every,
                               save_every,
                               graph_every,
                               averaging_window)

    ### Unless you are changing memory, you shouldn't change this
    memory = ReplayMemory(train_every, batch_size, hist_size=history_size)

    """
    Customizes how the agent is trained. Unless you are changing base_agent
    and its training algorithm, or you want to tune the training parameters,
    you should not change this dict.
    """
    train_settings = {
        "discount_factor": 0.99,
        "lambda": 0.95,
        "hist_size": history_size,
        "device": device,
        "eps_denom": 1e-6,
        "c1": 0.1,
        "c2": 0.05,
        "c3": 0.01,
        "c4": 0.01,
        "clip_param": 0.1,
        "map": map_name,
    }

    """
    Constructs the agent and trains it in an experiment.
    """
    agent = BaseAgent(model, agent_settings, memory, train_settings)
    experiment = Experiment([agent], env, run_settings)
    experiment.train()
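# Hypothetical example of filling in `players` for a non-minigame map, following
# the ### comment above: one learning agent plus one built-in bot. The races and
# difficulty are arbitrary assumptions; minigames can leave `players = None`.
from pysc2.env import sc2_env

players = [
    sc2_env.Agent(sc2_env.Race.terran),
    sc2_env.Bot(sc2_env.Race.zerg, sc2_env.Difficulty.easy),
]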
def main():
    map_name = "DefeatRoaches"
    render = False
    step_mul = 8

    env = MinigameEnvironment(state_modifier.graph_conv_modifier,
                              map_name_=map_name,
                              render=render,
                              step_multiplier=step_mul)
    nonspatial_act_size, spatial_act_depth = env.action_space
    model = GraphConvModel(nonspatial_act_size,
                           spatial_act_depth,
                           device=device).to(device)

    lr = 5e-3
    eps_max = 0.3
    eps_min = 0.05
    eps_duration = 1e5

    num_episodes = 1000000
    num_epochs = 3
    batch_size = 32
    train_every = 1024
    save_every = 10240
    graph_every = 50
    averaging_window = 100

    """
    :param optimizer: A class from torch.optim (instantiated later)
    :param learning_rate: The learning rate for the network
    :param epsilon_max: The starting epsilon
    :param epsilon_min: The final epsilon
    :param epsilon_duration: The number of frames to reach the final epsilon
    """
    agent_settings = AgentSettings(torch.optim.Adam,
                                   lr,
                                   eps_max,
                                   eps_min,
                                   eps_duration)
    run_settings = RunSettings(num_episodes,
                               num_epochs,
                               batch_size,
                               train_every,
                               save_every,
                               graph_every,
                               averaging_window)
    memory = ReplayMemory(train_every, 8, batch_size)

    PPO_settings = {
        "discount_factor": 0.99,
        "lambda": 0.95,
        "hist_size": 8,
        "device": device,
        "eps_denom": 1e-6,
        "c1": 1.0,
        "c2": 0.5,
        "c3": 0.5,
        "c4": 1.0,
        "clip_param": 0.1,
    }

    agent = PPOAgent(model, agent_settings, memory, PPO_settings)
    experiment = Experiment([agent], env, run_settings)
    experiment.train()
def main():
    map_name = "DefeatRoaches"
    render = False
    step_mul = 8

    env = FullStateActionEnvironment(map_name_=map_name,
                                     render=render,
                                     step_multiplier=step_mul)

    state_embed = 10
    action_embed = 16

    lr = 1e-4
    opt_eps = 1e-8
    eps_max = 0.3
    eps_min = 0.05
    eps_duration = 2e4
    history_size = 5

    num_episodes = 10000000
    num_epochs = 3
    batch_size = 32
    train_every = 1024
    save_every = 10240
    graph_every = 50
    averaging_window = 100

    net_config = {
        "state_embedding_size": state_embed,  # number of features output by embeddings
        "action_embedding_size": action_embed,
        "down_conv_features": 128,
        "down_residual_depth": 2,
        "up_features": 32,
        "up_conv_features": 128,
        "resnet_features": 128,
        "LSTM_in_size": 64,
        "LSTM_hidden_size": 96,
        "inputs2d_size": 64,
        "inputs3d_width": 8,
        "relational_features": 32,
        "relational_depth": 3,
        "relational_heads": 3,
        "spatial_out_depth": 64,
        "channels3": 16,
        "history_size": history_size,
        "device": device,
    }

    # action_space = np.zeros(full_action_space.shape)
    # action_space[[0, 3, 12, 13, 331, 332]] = 1
    action_space = np.ones(full_action_space.shape)
    model = RRLModel(net_config, device=device, action_space=action_space).to(device)
    print(model)

    """
    :param optimizer: A class from torch.optim (instantiated later)
    :param learning_rate: The learning rate for the network
    :param epsilon_max: The starting epsilon
    :param epsilon_min: The final epsilon
    :param epsilon_duration: The number of frames to reach the final epsilon
    """
    agent_settings = AgentSettings(torch.optim.Adam,
                                   lr,
                                   eps_max,
                                   eps_min,
                                   eps_duration,
                                   opt_eps)
    run_settings = RunSettings(num_episodes,
                               num_epochs,
                               batch_size,
                               train_every,
                               save_every,
                               graph_every,
                               averaging_window)
    memory = ReplayMemory(train_every, batch_size, hist_size=history_size)

    train_settings = {
        "discount_factor": 0.99,
        "lambda": 0.95,
        "hist_size": history_size,
        "device": device,
        "eps_denom": 1e-5,
        "c1": 0.1,
        "c2": 0.01,
        "c3": 1.0,
        "c4": 1.0,
        "minc2": 0.01,
        "clip_param": 0.1,
        "min_clip_param": 0.01,
        "clip_decay": 10000,
        "c2_decay": 10000,
        "map": map_name,
        "history_size": history_size,
    }

    agent = BaseAgent(model, agent_settings, memory, train_settings)
    # agent.load()
    experiment = Experiment([agent], env, run_settings)
    # experiment.test()
    experiment.train()