parser.add_argument('--eval_episodes', help='Set number of evaluation episodes', type=int, default=30)
parser.add_argument('--filename', help='Path to the pretrained model', default=None)
parser.add_argument('--mode', help='Evaluation mode: (normal | shooting | defense)', default='normal')
parser.add_argument('--show', help='Render the evaluation episodes', action='store_true')
parser.add_argument('--q', help='Quiet mode (no prints)', action='store_true')
parser.add_argument('--opposite', help='Evaluate agent on opposite side', action='store_true')

opts = parser.parse_args()

if __name__ == '__main__':
    if opts.mode == 'normal':
        mode = h_env.HockeyEnv_BasicOpponent.NORMAL
    elif opts.mode == 'shooting':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
    elif opts.mode == 'defense':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown training mode. See --help')

    logger = Logger(prefix_path=os.path.dirname(os.path.realpath(__file__)) + '/logs',
                    mode=opts.mode,
                    quiet=opts.q)
    q_agent = logger.load_model(filename=opts.filename)
    q_agent._config['show'] = opts.show
    q_agent._config['max_steps'] = 250
    q_agent.eval()
    env = h_env.HockeyEnv(mode=mode)
    opponent = h_env.BasicOpponent(weak=False)
    evaluate(agent=q_agent, env=env, opponent=opponent, eval_episodes=opts.eval_episodes,
             action_mapping=q_agent.action_mapping, evaluate_on_opposite_side=opts.opposite)
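For reference, the `evaluate` helper invoked above is not part of this excerpt. A minimal sketch of what such a routine could look like, inferred only from the call signature (the real implementation may differ), is:

import numpy as np

def evaluate(agent, env, opponent, eval_episodes, action_mapping,
             evaluate_on_opposite_side=False):
    """Hypothetical sketch: play eval_episodes games against the opponent and return win/loss rates."""
    wins, losses = 0, 0
    for _ in range(eval_episodes):
        obs = env.reset()
        done, info = False, {}
        while not done:
            if agent._config.get('show'):
                env.render()
            if evaluate_on_opposite_side:
                # the agent controls player two and therefore sees the mirrored observation
                a_agent = action_mapping[agent.act(env.obs_agent_two())]
                a_opp = opponent.act(obs)
                obs, r, done, info = env.step(np.hstack([a_opp, a_agent]))
            else:
                a_agent = action_mapping[agent.act(obs)]          # discrete index -> continuous action
                a_opp = opponent.act(env.obs_agent_two())
                obs, r, done, info = env.step(np.hstack([a_agent, a_opp]))
        winner = info.get('winner', 0)
        if evaluate_on_opposite_side:
            winner = -winner                                      # a win for player two is winner == -1
        wins += int(winner == 1)
        losses += int(winner == -1)
    return wins / eval_episodes, losses / eval_episodes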
Example #2
parser.add_argument('--automatic_entropy_tuning', type=bool, default=True, metavar='G')  # note: argparse's type=bool treats any non-empty string (even "False") as True
parser.add_argument('--seed', type=int, default=111111, metavar='N')
parser.add_argument('--batch_size', type=int, default=4, metavar='N')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N')
parser.add_argument('--hidden_size', type=int, default=512, metavar='N')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N')

args = parser.parse_args()

args.cuda = torch.cuda.is_available()


env = h_env.HockeyEnv(mode=h_env.HockeyEnv.NORMAL)
# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
actor_path = 'full_player_models/sac_actor_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000'
critic_path = 'full_player_models/sac_critic_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000'
agent.load_model(actor_path, critic_path)
# opponent = copy.deepcopy(agent)
basic_strong = h_env.BasicOpponent(weak=False)
time_ = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Tensorboard
writer = SummaryWriter(f"strongplay-runs/ERE{time_}_batch_size-{args.batch_size}_gamma-{args.gamma}_tau-{args.tau}_lr-{args.lr}_alpha-{args.alpha}_tuning-{args.automatic_entropy_tuning}_hidden_size-{args.hidden_size}_updatesStep-{args.updates_per_step}_startSteps-{args.start_steps}_targetIntervall-{args.target_update_interval}_replaysize-{args.replay_size}")

# Memory
memory = ERE_PrioritizedReplay(args.replay_size)
# memory = ReplayMemory(args.replay_size,args.seed)

# Training Loop
total_numsteps = 0
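The snippet ends just before the main loop. A minimal sketch of how such a SAC training loop typically continues is given below; method names such as `select_action`, `update_parameters` and `memory.push` are assumptions modelled on common PyTorch SAC implementations and on the replay/opponent objects created above, not code taken from this example.

import itertools
import numpy as np

updates = 0
for i_episode in itertools.count(1):
    episode_reward, episode_steps, done = 0, 0, False
    state = env.reset()
    while not done:
        if total_numsteps < args.start_steps:
            action = env.action_space.sample()[:4]        # pure exploration during warm-up
        else:
            action = agent.select_action(state)           # sample from the current policy

        if len(memory) > args.batch_size:
            for _ in range(args.updates_per_step):
                # one gradient step on the critics, the actor and (optionally) the entropy temperature
                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = \
                    agent.update_parameters(memory, args.batch_size, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)
                updates += 1

        opponent_action = basic_strong.act(env.obs_agent_two())
        next_state, reward, done, info = env.step(np.hstack([action, opponent_action]))
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward

        memory.push(state, action, reward, next_state, float(not done))
        state = next_state

    writer.add_scalar('reward/train', episode_reward, i_episode)
    if total_numsteps > args.num_steps:
        break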
Example #3
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown training mode. See --help')

    opts.device = torch.device(
        'cuda' if opts.cuda and torch.cuda.is_available() else 'cpu')

    dirname = time.strftime(f'%y%m%d_%H%M%S_{random.randint(0, 10**6):06}',
                            time.gmtime(time.time()))
    abs_path = os.path.dirname(os.path.realpath(__file__))
    logger = Logger(prefix_path=os.path.join(abs_path, dirname),
                    mode=opts.mode,
                    cleanup=True,
                    quiet=opts.q)

    env = h_env.HockeyEnv(mode=mode, verbose=(not opts.q))
    opponents = [
        h_env.BasicOpponent(weak=True),
    ]

    # Add absolute paths for pretrained agents
    pretrained_agents = []

    if opts.selfplay:
        for p in pretrained_agents:
            a = SACAgent.load_model(p)
            a.eval()
            opponents.append(a)

    if opts.preload_path is None:
        agent = SACAgent(logger=logger,
Example #4
parser.add_argument('--batch_size', type=int, default=4, metavar='N')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N')
parser.add_argument('--hidden_size', type=int, default=512, metavar='N')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N')
parser.add_argument('--target_update_interval',
                    type=int,
                    default=1,
                    metavar='N')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N')

args = parser.parse_args()

args.cuda = torch.cuda.is_available()

env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
# actor512 = 'hockey-hidden-models-attack/sac_actor_hockey_reward-8.385833864540086_episode-41000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-512_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-40-41'
# critic512 = 'hockey-hidden-models-attack/sac_critic_hockey_reward-8.385833864540086_episode-41000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-512_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-40-41'
# actor128 = 'hockey-hidden-models-attack/sac_actor_hockey_reward-8.184820100545167_episode-39000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-128_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-16'
# critic128 = 'hockey-hidden-models-attack/sac_critic_hockey_reward-8.184820100545167_episode-39000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-128_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-16'
actor64 = 'hockey-hidden-models-attack/sac_actor_hockey_reward-8.407677291229737_episode-33000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-64_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-10'
critic64 = 'hockey-hidden-models-attack/sac_critic_hockey_reward-8.407677291229737_episode-33000_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-64_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000_t-2021-03-10_22-36-10'
agent.load_model(actor64, critic64)
time_ = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Tensorboard
writer = SummaryWriter(
    f"hockey-hidden-runs-defence/{time_}_batch_size-{args.batch_size}_gamma-{args.gamma}_tau-{args.tau}_lr-{args.lr}_alpha-{args.alpha}_tuning-{args.automatic_entropy_tuning}_hidden_size-{args.hidden_size}_updatesStep-{args.updates_per_step}_startSteps-{args.start_steps}_targetIntervall-{args.target_update_interval}_replaysize-{args.replay_size}"
)
Example #5

optParser = optparse.OptionParser()
optParser.add_option('-e', '--env',action='store', type='string',
                        dest='env_name',default="hockey",
                        help='Environment (default %default)')
optParser.add_option('-c', '--eps',action='store',  type='float',
                        dest='eps_clip',default=0.2,
                        help='Clipping epsilon (default %default)')
optParser.add_option('-r', '--run',action='store',  type='int',
                        dest='test_run',default=0,
                        help='Test run (default %default)')



env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_SHOOTING)

opts, args = optParser.parse_args()
############## Hyperparameters ##############
run_number = opts.test_run
env_name = opts.env_name
# creating environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
render = False
solved_reward = 230         # stop training if avg_reward > solved_reward
log_interval = 20           # print avg reward in the interval
max_interactions = 10000001        # max training episodes
max_timesteps = 300         # max timesteps in one episode
n_latent_var = 256           # number of units in the hidden layer
update_timestep = 2000      # update policy every n timesteps
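These hyperparameters typically drive a loop of the following shape. The `PPO` and `Memory` classes, their method names and the optimizer settings (lr, betas, K_epochs) are assumptions modelled on common PPO reference implementations; they are not defined in this excerpt.

import numpy as np

lr, betas, K_epochs = 0.0003, (0.9, 0.999), 4     # placeholder optimizer/update settings
gamma = 0.99                                      # discount factor

memory = Memory()                                 # assumed rollout buffer
ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, opts.eps_clip)
opponent = h_env.BasicOpponent(weak=True)

timestep = 0
running_reward = 0
for i_episode in range(1, max_interactions + 1):
    state = env.reset()
    for t in range(max_timesteps):
        timestep += 1
        action = ppo.select_action(state, memory)             # action and log-prob stored in memory
        a2 = opponent.act(env.obs_agent_two())
        state, reward, done, info = env.step(np.hstack([action, a2]))
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

        if timestep % update_timestep == 0:                   # clipped PPO update every n timesteps
            ppo.update(memory)
            memory.clear_memory()
            timestep = 0

        running_reward += reward
        if render:
            env.render()
        if done:
            break

    if i_episode % log_interval == 0:
        avg_reward = running_reward / log_interval
        print(f'Episode {i_episode}\tavg reward: {avg_reward:.2f}')
        if avg_reward > solved_reward:
            print('Solved! Stopping training.')
            break
        running_reward = 0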
Example #6
def training_loop(hyperparameters):
    print(f"Starting training with hyperparameters: {hyperparameters}")
    save_path = hyperparameters["save_path"]
    load_path = hyperparameters["load_path"]

    # create the save path and save hyperparameter configuration
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    else:
        a = input("Warning, Directory already exists. Dou want to continue?")
        if a not in ["Y","y"]:
            raise Exception("Path already exists, please start with another path.")

    with open(save_path + "/parameters.json", "w") as f:
        json.dump(hyperparameters, f)

    # general configurations
    state_dim=18
    action_dim=4
    max_action=1
    iterations=hyperparameters["max_iterations"]
    batch_size=hyperparameters["batch_size"]
    max_episodes=hyperparameters["max_episodes"]
    train_mode = hyperparameters["train_mode"]
    closeness_factor=hyperparameters["closeness_factor"]
    c = closeness_factor

    # init the agent
    agent1 = TD3Agent([state_dim + action_dim, 256, 256, 1],
                        [state_dim, 256, 256, action_dim],
                        optimizer=hyperparameters["optimizer"],
                        policy_noise=hyperparameters["policy_noise"],
                        policy_noise_clip=hyperparameters["policy_noise_clip"],
                        gamma=hyperparameters["gamma"],
                        delay=hyperparameters["delay"],
                        tau=hyperparameters["tau"],
                        lr=hyperparameters["lr"],
                        max_action=max_action,
                        weight_decay=hyperparameters["weight_decay"])

    # load the agent if given
    loaded_state=False
    if load_path:
        agent1.load(load_path)
        loaded_state=True

    # define opponent
    if hyperparameters["self_play"]:
        agent2=agent1
    else:
        agent2 = h_env.BasicOpponent(weak=hyperparameters["weak_agent"])

    # load environment and replay buffer
    replay_buffer = ReplayBuffer(state_dim, action_dim)

    if train_mode == "defense":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
    elif train_mode == "shooting":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_SHOOTING)
    else:
        env = h_env.HockeyEnv()


    # add figure to plot later
    if hyperparameters["plot_performance"]:
        fig, (ax_loss, ax_reward) = plt.subplots(2)
        ax_loss.set_xlim(0, max_episodes)
        ax_loss.set_ylim(0, 20)
        ax_reward.set_xlim(0, max_episodes)
        ax_reward.set_ylim(-30, 20)

    # first sample enough data to start training
    with HiddenPrints():
        obs_last = env.reset()
        for i in range(batch_size * 100):
            # agent1 (player one) acts on its own observation; random actions until a pretrained model is loaded
            a1 = agent1.act(obs_last) if loaded_state else env.action_space.sample()[:4]
            a2 = agent2.act(env.obs_agent_two())
            obs, r, d, info = env.step(np.hstack([a1, a2]))
            done = 1 if d else 0
            replay_buffer.add(obs_last, a1, obs, r, done)
            obs_last = obs
            if d:
                obs_last = env.reset()

    print("Finished collection of data prior to training")

    # tracking of performance
    episode_critic_loss=[]
    episode_rewards=[]
    win_count=[]
    if not os.path.isfile(save_path + "/performance.csv"):
        pd.DataFrame(data={"Episode_rewards":[], "Episode_critic_loss":[], "Win/Loss":[]}).to_csv(save_path + "/performance.csv", sep=",", index=False)

    # Then start training
    for episode_count in range(max_episodes+1):
        obs_last = env.reset()
        total_reward=0
        critic_loss=[]

        for i in range(iterations):
            # run the environment
            with HiddenPrints():
                with torch.no_grad():
                    # agent1 acts on its own (player-one) observation, plus Gaussian exploration noise
                    a1 = agent1.act(obs_last) + np.random.normal(loc=0, scale=hyperparameters["exploration_noise"], size=action_dim)
                a2 = agent2.act(env.obs_agent_two())
                obs, r, d, info = env.step(np.hstack([a1, a2]))
            total_reward+=r
            done = 1 if d else 0

            # modify the reward with a closeness-to-puck bonus (optionally decayed over training)
            if hyperparameters["closeness_decay"]:
                c = closeness_factor * (1 - episode_count / max_episodes)
            newreward = r + c * info["reward_closeness_to_puck"]

            # add to replaybuffer
            replay_buffer.add(obs_last, a1, obs, newreward, done)
            obs_last=obs
            
            # sample minibatch and train
            states, actions, next_states, reward, done = replay_buffer.sample(batch_size)
            loss = agent1.train(states, actions, next_states, reward, done)
            critic_loss.append(loss.detach().numpy())

            # if done, finish episode
            if d:
                episode_rewards.append(total_reward)
                episode_critic_loss.append(np.mean(critic_loss))
                win_count.append(info["winner"])
                print(f"Episode {episode_count} finished after {i} steps with a total reward of {total_reward}")
                
                # Online plotting
                if hyperparameters["plot_performance"] and episode_count>40 :
                    ax_loss.plot(list(range(-1, episode_count-29)), moving_average(episode_critic_loss, 30), 'r-')
                    ax_reward.plot(list(range(-1, episode_count-29)), moving_average(episode_rewards, 30), "r-")
                    plt.draw()
                    plt.pause(1e-17)

                break
        
        # Intermediate evaluation of win/loss and saving of the model
        if episode_count % 500 == 0 and episode_count != 0:
            print(f"The agent's win ratio over the last 500 episodes was {win_count[-500:].count(1) / 500}")
            print(f"The agent's loss ratio over the last 500 episodes was {win_count[-500:].count(-1) / 500}")
            try:
                agent1.save(save_path)
                print("saved model")
            except Exception:
                print("Saving the model failed")
            pd.DataFrame(data={"Episode_rewards": episode_rewards[-500:], "Episode_critic_loss": episode_critic_loss[-500:], "Win/Loss": win_count[-500:]}).to_csv(save_path + "/performance.csv", sep=",", index=False, mode="a", header=False)
                    
    print(f"Finished training with a final mean reward of {np.mean(episode_rewards[-500:])}")

    # plot the performance summary
    if hyperparameters["plot_performance_summary"]:
            try:
                fig, (ax1, ax2) = plt.subplots(2)
                x = list(range(len(episode_critic_loss)))
                coef = np.polyfit(x, episode_critic_loss,1)
                poly1d_fn = np.poly1d(coef)
                ax1.plot(episode_critic_loss)
                ax1.plot(poly1d_fn(list(range(len(episode_critic_loss)))))


                x = list(range(len(episode_rewards)))
                coef = np.polyfit(x, episode_rewards,1)
                poly1d_fn = np.poly1d(coef)
                ax2.plot(episode_rewards)
                ax2.plot(poly1d_fn(list(range(len(episode_rewards)))))
                fig.show()
                fig.savefig(save_path + "/performance.png", bbox_inches="tight")
            except:
                print("Failed saving figure")