parser.add_argument('--eval_episodes', help='Set number of evaluation episodes', type=int, default=30)
parser.add_argument('--filename', help='Path to the pretrained model', default=None)
parser.add_argument('--mode', help='Evaluation mode: (normal | shooting | defense)', default='normal')
parser.add_argument('--show', help='Render the evaluation episodes', action='store_true')
parser.add_argument('--q', help='Quiet mode (no prints)', action='store_true')
parser.add_argument('--opposite', help='Evaluate agent on opposite side', action='store_true')
opts = parser.parse_args()

if __name__ == '__main__':
    if opts.mode == 'normal':
        mode = h_env.HockeyEnv_BasicOpponent.NORMAL
    elif opts.mode == 'shooting':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
    elif opts.mode == 'defense':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown evaluation mode. See --help')

    logger = Logger(prefix_path=os.path.dirname(os.path.realpath(__file__)) + '/logs',
                    mode=opts.mode, quiet=opts.q)

    # Load the pretrained agent and switch it to evaluation mode
    q_agent = logger.load_model(filename=opts.filename)
    q_agent._config['show'] = opts.show
    q_agent._config['max_steps'] = 250
    q_agent.eval()

    env = h_env.HockeyEnv(mode=mode)
    opponent = h_env.BasicOpponent(weak=False)

    evaluate(agent=q_agent, env=env, opponent=opponent,
             eval_episodes=opts.eval_episodes,
             action_mapping=q_agent.action_mapping,
             evaluate_on_opposite_side=opts.opposite)
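# Usage sketch for the evaluation script above. The script name and checkpoint path below are
# placeholders (they are not taken from this repo); only the flags correspond to the parser defined here:
#   python evaluate.py --mode shooting --filename <path/to/checkpoint> --eval_episodes 50 --show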
# note: argparse's type=bool treats any non-empty string as True
parser.add_argument('--automatic_entropy_tuning', type=bool, default=True, metavar='G')
parser.add_argument('--seed', type=int, default=111111, metavar='N')
parser.add_argument('--batch_size', type=int, default=4, metavar='N')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N')
parser.add_argument('--hidden_size', type=int, default=512, metavar='N')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N')
args = parser.parse_args()
args.cuda = torch.cuda.is_available()

env = h_env.HockeyEnv(mode=h_env.HockeyEnv.NORMAL)

# Agent: SAC initialized from a pretrained full-game checkpoint
agent = SAC(env.observation_space.shape[0], env.action_space, args)
agent.load_model(
    'full_player_models/sac_actor_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000',
    'full_player_models/sac_critic_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000')

# opponent = copy.deepcopy(agent)
basic_strong = h_env.BasicOpponent(weak=False)

time_ = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# TensorBoard
writer = SummaryWriter(
    f"strongplay-runs/ERE{time_}_batch_size-{args.batch_size}_gamma-{args.gamma}_tau-{args.tau}"
    f"_lr-{args.lr}_alpha-{args.alpha}_tuning-{args.automatic_entropy_tuning}"
    f"_hidden_size-{args.hidden_size}_updatesStep-{args.updates_per_step}"
    f"_startSteps-{args.start_steps}_targetIntervall-{args.target_update_interval}"
    f"_replaysize-{args.replay_size}")

# Memory: prioritized replay with Emphasizing Recent Experience (ERE) sampling
memory = ERE_PrioritizedReplay(args.replay_size)
# memory = ReplayMemory(args.replay_size, args.seed)

# Training loop
total_numsteps = 0
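# The training loop itself is not part of this excerpt. As a rough, hedged sketch of how the
# settings above are commonly consumed in SAC-style loops (the method names select_action,
# update_parameters, and memory.push are assumptions here, not taken from this file):
#
#     for i_episode in itertools.count(1):
#         state, done = env.reset(), False
#         while not done:
#             # pure exploration for the first start_steps interactions
#             a1 = env.action_space.sample()[:4] if total_numsteps < args.start_steps else agent.select_action(state)
#             a2 = basic_strong.act(env.obs_agent_two())
#             next_state, reward, done, info = env.step(np.hstack([a1, a2]))
#             memory.push(state, a1, reward, next_state, float(not done))
#             total_numsteps += 1
#             if len(memory) > args.batch_size:
#                 for _ in range(args.updates_per_step):
#                     agent.update_parameters(memory, args.batch_size, total_numsteps)
#             state = next_state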
    mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
else:
    raise ValueError('Unknown training mode. See --help')

opts.device = torch.device('cuda' if opts.cuda and torch.cuda.is_available() else 'cpu')

dirname = time.strftime(f'%y%m%d_%H%M%S_{random.randint(0, int(1e6)):06}', time.gmtime(time.time()))
abs_path = os.path.dirname(os.path.realpath(__file__))
logger = Logger(prefix_path=os.path.join(abs_path, dirname), mode=opts.mode, cleanup=True, quiet=opts.q)

env = h_env.HockeyEnv(mode=mode, verbose=(not opts.q))

# Pool of opponents; starts with the weak scripted opponent
opponents = [
    h_env.BasicOpponent(weak=True),
]

# Add absolute paths to pretrained agents here for self-play
pretrained_agents = []
if opts.selfplay:
    for p in pretrained_agents:
        a = SACAgent.load_model(p)
        a.eval()
        opponents.append(a)

if opts.preload_path is None:
    agent = SACAgent(logger=logger,
parser.add_argument('--batch_size', type=int, default=4, metavar='N')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N')
parser.add_argument('--hidden_size', type=int, default=512, metavar='N')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N')
args = parser.parse_args()
args.cuda = torch.cuda.is_available()

env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)

# Agent: SAC initialized from a pretrained attack checkpoint
agent = SAC(env.observation_space.shape[0], env.action_space, args)
# Alternative checkpoints with larger hidden layers:
# actor512 = 'hockey-hidden-models-attack/sac_actor_hockey_reward-8.385833864540086_episode-41000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-512_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-40-41'
# critic512 = 'hockey-hidden-models-attack/sac_critic_hockey_reward-8.385833864540086_episode-41000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-512_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-40-41'
# actor128 = 'hockey-hidden-models-attack/sac_actor_hockey_reward-8.184820100545167_episode-39000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-128_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-16'
# critic128 = 'hockey-hidden-models-attack/sac_critic_hockey_reward-8.184820100545167_episode-39000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-128_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-16'
actor64 = 'hockey-hidden-models-attack/sac_actor_hockey_reward-8.407677291229737_episode-33000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-64_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-10'
critic64 = 'hockey-hidden-models-attack/sac_critic_hockey_reward-8.407677291229737_episode-33000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-64_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-10'
agent.load_model(actor64, critic64)

time_ = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# TensorBoard
writer = SummaryWriter(
    f"hockey-hidden-runs-defence/{time_}_batch_size-{args.batch_size}_gamma-{args.gamma}_tau-{args.tau}"
    f"_lr-{args.lr}_alpha-{args.alpha}_tuning-{args.automatic_entropy_tuning}"
    f"_hidden_size-{args.hidden_size}_updatesStep-{args.updates_per_step}"
    f"_startSteps-{args.start_steps}_targetIntervall-{args.target_update_interval}"
    f"_replaysize-{args.replay_size}")
optParser = optparse.OptionParser()
optParser.add_option('-e', '--env', action='store', type='string', dest='env_name', default="hockey",
                     help='Environment (default %default)')
optParser.add_option('-c', '--eps', action='store', type='float', dest='eps_clip', default=0.2,
                     help='Clipping epsilon (default %default)')
optParser.add_option('-r', '--run', action='store', type='int', dest='test_run', default=0,
                     help='Test run (default %default)')

env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_SHOOTING)
opts, args = optParser.parse_args()

############## Hyperparameters ##############
run_number = opts.test_run
env_name = opts.env_name
# creating environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
render = False
solved_reward = 230          # stop training if avg_reward > solved_reward
log_interval = 20            # print avg reward in the interval
max_interactions = 10000001  # max training episodes
max_timesteps = 300          # max timesteps in one episode
n_latent_var = 256           # number of variables in hidden layer
update_timestep = 2000       # update policy every n timesteps
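# The remainder of the PPO script is not included in this excerpt. As a hedged sketch of how
# update_timestep, max_timesteps, log_interval, and solved_reward are typically consumed
# (the ppo and memory objects and their methods are assumptions, not taken from this file):
#
#     timestep = 0
#     for i_episode in range(1, max_interactions + 1):
#         state = env.reset()
#         for t in range(max_timesteps):
#             timestep += 1
#             # collect one transition with the current policy into memory ...
#             if timestep % update_timestep == 0:   # update the policy every update_timestep steps
#                 ppo.update(memory)
#                 memory.clear_memory()
#                 timestep = 0
#         # every log_interval episodes: log the running average reward,
#         # and stop training once it exceeds solved_reward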
def training_loop(hyperparameters):
    print(f"Starting training with hyperparameters: {hyperparameters}")
    save_path = hyperparameters["save_path"]
    load_path = hyperparameters["load_path"]

    # create the save path and store the hyperparameter configuration
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    else:
        a = input("Warning: directory already exists. Do you want to continue? [y/N] ")
        if a not in ["Y", "y"]:
            raise Exception("Path already exists, please start with another path.")
    with open(save_path + "/parameters.json", "w") as f:
        json.dump(hyperparameters, f)

    # general configuration
    state_dim = 18
    action_dim = 4
    max_action = 1
    iterations = hyperparameters["max_iterations"]
    batch_size = hyperparameters["batch_size"]
    max_episodes = hyperparameters["max_episodes"]
    train_mode = hyperparameters["train_mode"]
    closeness_factor = hyperparameters["closeness_factor"]
    c = closeness_factor

    # initialize the agent
    agent1 = TD3Agent([state_dim + action_dim, 256, 256, 1],
                      [state_dim, 256, 256, action_dim],
                      optimizer=hyperparameters["optimizer"],
                      policy_noise=hyperparameters["policy_noise"],
                      policy_noise_clip=hyperparameters["policy_noise_clip"],
                      gamma=hyperparameters["gamma"],
                      delay=hyperparameters["delay"],
                      tau=hyperparameters["tau"],
                      lr=hyperparameters["lr"],
                      max_action=max_action,
                      weight_decay=hyperparameters["weight_decay"])

    # load the agent if a checkpoint is given
    loaded_state = False
    if load_path:
        agent1.load(load_path)
        loaded_state = True

    # define the opponent
    if hyperparameters["self_play"]:
        agent2 = agent1
    else:
        agent2 = h_env.BasicOpponent(weak=hyperparameters["weak_agent"])

    # set up the environment and the replay buffer
    replay_buffer = ReplayBuffer(state_dim, action_dim)
    if train_mode == "defense":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
    elif train_mode == "shooting":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_SHOOTING)
    else:
        env = h_env.HockeyEnv()

    # add a figure for online plotting
    if hyperparameters["plot_performance"]:
        fig, (ax_loss, ax_reward) = plt.subplots(2)
        ax_loss.set_xlim(0, max_episodes)
        ax_loss.set_ylim(0, 20)
        ax_reward.set_xlim(0, max_episodes)
        ax_reward.set_ylim(-30, 20)

    with HiddenPrints():
        # first sample enough data to start training
        obs_last = env.reset()
        for i in range(batch_size * 100):
            a1 = env.action_space.sample()[:4] if not loaded_state else agent1.act(env.obs_agent_two())
            a2 = agent2.act(env.obs_agent_two())
            obs, r, d, info = env.step(np.hstack([a1, a2]))
            done = 1 if d else 0
            replay_buffer.add(obs_last, a1, obs, r, done)
            obs_last = obs
            if d:
                obs_last = env.reset()
    print("Finished collection of data prior to training")

    # performance tracking
    episode_critic_loss = []
    episode_rewards = []
    win_count = []
    if not os.path.isfile(save_path + "/performance.csv"):
        pd.DataFrame(data={"Episode_rewards": [], "Episode_critic_loss": [], "Win/Loss": []}).to_csv(
            save_path + "/performance.csv", sep=",", index=False)

    # then start training
    for episode_count in range(max_episodes + 1):
        obs_last = env.reset()
        total_reward = 0
        critic_loss = []
        for i in range(iterations):
            # run the environment
            with HiddenPrints():
                with torch.no_grad():
                    a1 = agent1.act(env.obs_agent_two()) + np.random.normal(
                        loc=0, scale=hyperparameters["exploration_noise"], size=action_dim)
                    a2 = agent2.act(env.obs_agent_two())
                obs, r, d, info = env.step(np.hstack([a1, a2]))
            total_reward += r
            done = 1 if d else 0

            # modify the reward with a closeness-to-puck bonus (optionally decayed over training)
            if hyperparameters["closeness_decay"]:
                c = closeness_factor * (1 - episode_count / max_episodes)
            newreward = r + c * info["reward_closeness_to_puck"]

            # add the transition to the replay buffer
            replay_buffer.add(obs_last, a1, obs, newreward, done)
            obs_last = obs

            # sample a minibatch and train
            states, actions, next_states, reward, done = replay_buffer.sample(batch_size)
            loss = agent1.train(states, actions, next_states, reward, done)
            critic_loss.append(loss.detach().numpy())

            # if done, finish the episode
            if d:
                episode_rewards.append(total_reward)
                episode_critic_loss.append(np.mean(critic_loss))
                win_count.append(info["winner"])
                print(f"Episode {episode_count} finished after {i} steps with a total reward of {total_reward}")

                # online plotting
                if hyperparameters["plot_performance"] and episode_count > 40:
                    ax_loss.plot(list(range(-1, episode_count - 29)), moving_average(episode_critic_loss, 30), 'r-')
                    ax_reward.plot(list(range(-1, episode_count - 29)), moving_average(episode_rewards, 30), "r-")
                    plt.draw()
                    plt.pause(1e-17)
                break

        # intermediate evaluation of win/loss and saving of the model
        if episode_count % 500 == 0 and episode_count != 0:
            print(f"The agent's win ratio in the last 500 episodes was {win_count[-500:].count(1) / 500}")
            print(f"The agent's loss ratio in the last 500 episodes was {win_count[-500:].count(-1) / 500}")
            try:
                agent1.save(save_path)
                print("saved model")
            except Exception:
                print("Saving the model failed")
            pd.DataFrame(data={"Episode_rewards": episode_rewards[-500:],
                               "Episode_critic_loss": episode_critic_loss[-500:],
                               "Win/Loss": win_count[-500:]}).to_csv(
                save_path + "/performance.csv", sep=",", index=False, mode="a", header=False)

    print(f"Finished training with a final mean reward of {np.mean(episode_rewards[-500:])}")

    # plot the performance summary
    if hyperparameters["plot_performance_summary"]:
        try:
            fig, (ax1, ax2) = plt.subplots(2)
            x = list(range(len(episode_critic_loss)))
            coef = np.polyfit(x, episode_critic_loss, 1)
            poly1d_fn = np.poly1d(coef)
            ax1.plot(episode_critic_loss)
            ax1.plot(poly1d_fn(list(range(len(episode_critic_loss)))))
            x = list(range(len(episode_rewards)))
            coef = np.polyfit(x, episode_rewards, 1)
            poly1d_fn = np.poly1d(coef)
            ax2.plot(episode_rewards)
            ax2.plot(poly1d_fn(list(range(len(episode_rewards)))))
            fig.show()
            fig.savefig(save_path + "/performance.png", bbox_inches="tight")
        except Exception:
            print("Failed to save the figure")
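# A minimal, hypothetical invocation of training_loop. The dictionary keys mirror exactly the
# hyperparameters accessed in the function above; the concrete values are illustrative
# placeholders, not the settings used for the reported models.
if __name__ == "__main__":
    example_hyperparameters = {
        "save_path": "td3_runs/example_run",  # placeholder output directory
        "load_path": None,                    # set to a checkpoint path to resume training
        "max_iterations": 250,                # max environment steps per episode
        "batch_size": 256,
        "max_episodes": 10000,
        "train_mode": "normal",               # "defense" | "shooting" | "normal"
        "closeness_factor": 1.0,
        "closeness_decay": True,
        "optimizer": "adam",                  # format depends on what TD3Agent expects here
        "policy_noise": 0.2,
        "policy_noise_clip": 0.5,
        "gamma": 0.99,
        "delay": 2,
        "tau": 0.005,
        "lr": 3e-4,
        "weight_decay": 0.0,
        "exploration_noise": 0.1,
        "self_play": False,
        "weak_agent": True,
        "plot_performance": False,
        "plot_performance_summary": True,
    }
    training_loop(example_hyperparameters)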