import numpy as np
import torch

# `lh` (the laser-hockey environment module), `EfficientReplayMemory`, and
# `Parameters` are project-level imports assumed available in this module.


def main(env):
    player1 = lh.BasicOpponent()
    player2 = lh.BasicOpponent()

    state_size = env.observation_space.shape[0]
    # the env expects actions for both players; each agent controls half
    action_size = env.action_space.shape[0] // 2
    buffer = EfficientReplayMemory(Parameters.IMITATION_BUFFER_SIZE,
                                   state_size, action_size)

    # roll out the scripted opponent until the imitation buffer is full
    while len(buffer) < Parameters.IMITATION_BUFFER_SIZE:
        state = env.reset()
        obs_agent2 = env.obs_agent_two()
        while True:
            # env.render()
            action = player1.act(state)
            # a2 = player2.act(obs_agent2)
            a2 = [0, 0, 0]
            next_state, reward, done, info = env.step(np.hstack([action, a2]))

            # shaped reward: weighted mix of the env's auxiliary reward signals
            reward = (100 * reward
                      + 50 * info["reward_closeness_to_puck"]
                      + 100 * info["reward_touch_puck"]
                      + 80 * info["reward_puck_direction"])
            # if done and info["winner"] == 0:
            #     reward -= 5

            # build transition
            action = torch.Tensor([action])
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])
            buffer.push(torch.Tensor([state]), action, reward,
                        torch.Tensor([next_state]), mask)

            obs_agent2 = env.obs_agent_two()
            if done:
                break
            state = next_state

    buffer.save_memory("imitations_normal.pt")
    print("Saved imitation data")
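# --------------------------------------------------------------------------
# Hedged sketch (NOT the repo's actual EfficientReplayMemory): the collection
# loop above only relies on __len__, push(state, action, reward, next_state,
# mask) with 1 x dim tensors, and save_memory(path). A minimal ring buffer
# consistent with that interface could look like this; the class name and
# storage layout are illustrative assumptions.

class MinimalReplayMemory:
    def __init__(self, capacity, state_size, action_size):
        self.capacity = capacity
        self.states = torch.zeros(capacity, state_size)
        self.actions = torch.zeros(capacity, action_size)
        self.rewards = torch.zeros(capacity, 1)
        self.next_states = torch.zeros(capacity, state_size)
        self.masks = torch.zeros(capacity, 1)
        self.pos = 0    # next write index (wraps around)
        self.size = 0   # number of valid entries

    def __len__(self):
        return self.size

    def push(self, state, action, reward, next_state, mask):
        # flatten the incoming 1 x dim tensors into the preallocated rows
        i = self.pos
        self.states[i] = state.reshape(-1)
        self.actions[i] = action.reshape(-1)
        self.rewards[i] = reward.reshape(-1)
        self.next_states[i] = next_state.reshape(-1)
        self.masks[i] = mask.reshape(-1)
        self.pos = (self.pos + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def save_memory(self, path):
        # one file holding every field, reloadable with torch.load(path)
        torch.save({"states": self.states[:self.size],
                    "actions": self.actions[:self.size],
                    "rewards": self.rewards[:self.size],
                    "next_states": self.next_states[:self.size],
                    "masks": self.masks[:self.size]}, path)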
## test the outputs
# ob = env.reset()
# ac_output = ddpg_agent._Mu.As(ob)
# q_output = ddpg_agent._Q.Qs(ob, ac_output)

# start training
stats = []
losses = []
rewards = []
writer = None
show = False
mode = "DDPG"
playerComputer = lh.BasicOpponent()

for i in range(max_episodes):
    start_noise -= noise_step
    start_noise = np.max([start_noise, 0.01])
    total_reward = 0
    ob = env.reset()
    for t in range(max_steps):
        done = False
        action = ddpg_agent.act(ob)
        # adding noise to action
        a_t = np.clip(np.random.normal(action, start_noise), -1, 1)
        # opponent does total random actions
        # if i < 1000:
        #     a_opp = np.clip(np.random.normal([0, 0, 0], start_noise), -1, 1)
        # else:
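# --------------------------------------------------------------------------
# Hedged sketch (illustrative values, not taken from the repo): the loop above
# assumes start_noise and noise_step realise a linear decay of the exploration
# std-dev down to a floor of 0.01. A self-contained version of that schedule:

def linear_noise_schedule(initial_noise, num_episodes, floor=0.01):
    """Yield one exploration std-dev per episode, decaying linearly to floor."""
    step = initial_noise / num_episodes
    noise = initial_noise
    for _ in range(num_episodes):
        noise = max(noise - step, floor)
        yield noise

# e.g. start_noise = 0.4 and noise_step = 0.4 / max_episodes reproduce the
# per-episode update "start_noise = max(start_noise - noise_step, 0.01)".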
    plt.clf()
    plt.cla()
    plt.close()
    agent.save_models()
    environment.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--testing', type=int, default=0)
    parser.add_argument('--mode', type=int, default=PENDULUM)
    args = parser.parse_args()

    mode = args.mode
    player2 = lh.BasicOpponent()

    # pick the imitation dataset matching the training mode
    if mode == TRAIN_SHOOTING:
        imitation_data = "imitations_shooting.pt"
    elif mode == TRAIN_DEFENSE:
        imitation_data = "imitations_defense.pt"
    else:
        imitation_data = "imitations_normal.pt"

    environment, action_size = create_environment(mode, args.testing)
    agent = DDPGAgent(environment.observation_space.shape[0],
                      action_size,
                      environment.action_space.high[0],
                      environment.action_space.low[0],
                      imitation_data)

    if args.testing:
        agent.load_models()
        for _ in range(20):
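# --------------------------------------------------------------------------
# Hedged sketch (assumed helper, not the repo's implementation): the __main__
# block above presumes mode constants and a create_environment(mode, testing)
# factory returning (env, action_size). One version consistent with that
# usage, where the agent supplies half of the hockey env's joint action
# vector; the gym id, mode constants, and LaserHockeyEnv API are assumptions.

import gym

PENDULUM, TRAIN_NORMAL, TRAIN_SHOOTING, TRAIN_DEFENSE = range(4)

def create_environment(mode, testing):
    # `testing` is accepted to match the call site; unused in this sketch
    if mode == PENDULUM:
        env = gym.make("Pendulum-v0")
        return env, env.action_space.shape[0]
    if mode == TRAIN_SHOOTING:
        env = lh.LaserHockeyEnv(mode=lh.LaserHockeyEnv.TRAIN_SHOOTING)
    elif mode == TRAIN_DEFENSE:
        env = lh.LaserHockeyEnv(mode=lh.LaserHockeyEnv.TRAIN_DEFENSE)
    else:
        env = lh.LaserHockeyEnv()
    # two-player env: the learning agent controls the first half of the action
    return env, env.action_space.shape[0] // 2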