def main(env):
    player1 = lh.BasicOpponent()
    player2 = lh.BasicOpponent()

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0] // 2

    buffer = EfficientReplayMemory(Parameters.IMITATION_BUFFER_SIZE, state_size, action_size)

    while len(buffer) < Parameters.IMITATION_BUFFER_SIZE:
        state = env.reset()
        obs_agent2 = env.obs_agent_two()
        while True:
            #env.render()
            action = player1.act(state)
            # a2 = player2.act(obs_agent2)
            a2 = [0, 0, 0]  # keep the second player idle instead of letting player2 act
            next_state, reward, done, info = env.step(np.hstack([action, a2]))
            # shaped reward: scale the raw outcome reward and add the auxiliary reward terms from info
            reward = 100 * reward + 50 * info["reward_closeness_to_puck"] + 100 * info["reward_touch_puck"] + 80 * info["reward_puck_direction"]

            """ if done and info["winner"] == 0:
                reward -= 5 """

            # build transition
            action = torch.Tensor([action])
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            buffer.push(torch.Tensor([state]), action, reward, torch.Tensor([next_state]), mask)
            #buffer.push(torch.Tensor([state]), action, mask, torch.Tensor([next_state]), reward)

            obs_agent2 = env.obs_agent_two()
            if done: 
                break
            else:
                state = next_state



    buffer.save_memory("imitations_normal.pt")
    print("Saved imitation data")
Example #2
## test the outputs
#ob = env.reset()
#ac_output = ddpg_agent._Mu.As(ob)
#q_output = ddpg_agent._Q.Qs(ob, ac_output)

# start training
stats = []
losses = []
rewards = []

writer = None

show = False
mode = "DDPG"
playerComputer = lh.BasicOpponent()
for i in range(max_episodes):
    # linearly decay the exploration noise, keeping a floor of 0.01
    start_noise = max(start_noise - noise_step, 0.01)
    total_reward = 0
    ob = env.reset()
    for t in range(max_steps):
        done = False
        action = ddpg_agent.act(ob)

        # adding noise to action
        a_t = np.clip(np.random.normal(action, start_noise), -1, 1)
        # opponent does total random actions
        #if i < 1000:
        #    a_opp = np.clip(np.random.normal([0, 0, 0], start_noise), -1, 1)
        #else:
Example #3
    # close any open matplotlib figures before shutting down
    plt.clf()
    plt.cla()
    plt.close()

    agent.save_models()
    environment.close()
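
The script below only touches the agent through its constructor, act, save_models, and load_models. A minimal stand-in with that interface (an assumption for illustration; the project's real DDPGAgent implements actor-critic training behind these methods) might look like:

import numpy as np

class DDPGAgentStub:
    """Interface stand-in with the signature the script below expects (hypothetical)."""

    def __init__(self, state_size, action_size, action_high, action_low, imitation_data=None):
        self.action_size = action_size
        self.action_high = action_high
        self.action_low = action_low
        self.imitation_data = imitation_data  # path to pre-collected transitions

    def act(self, observation):
        # placeholder policy: sample uniformly inside the action bounds
        return np.random.uniform(self.action_low, self.action_high, self.action_size)

    def save_models(self):
        pass  # the real agent would persist actor and critic weights here

    def load_models(self):
        pass  # the real agent would restore actor and critic weights here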


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--testing', type=int, default=0)
    parser.add_argument('--mode', type=int, default=PENDULUM)
    args = parser.parse_args()

    mode = args.mode
    player2 = lh.BasicOpponent()

    if mode == TRAIN_SHOOTING:
        imitation_data = "imitations_shooting.pt"
    elif mode == TRAIN_DEFENSE:
        imitation_data = "imitations_defense.pt"
    else:
        imitation_data = "imitations_normal.pt"

    environment, action_size = create_environment(mode, args.testing)
    agent = DDPGAgent(environment.observation_space.shape[0], action_size,
                      environment.action_space.high[0],
                      environment.action_space.low[0], imitation_data)
    if args.testing:
        agent.load_models()
        for _ in range(20):