def td3(actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=200300, epochs=100, replay_size=int(2e6), gamma=0.99,
        polyak=0.995, pi_lr=1e-4, q_lr=1e-3, batch_size=100,
        start_steps=2000000, update_after=1000, update_every=50,
        act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2,
        num_test_episodes=10, max_ep_len=3000, save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an
            ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2``
            module. The ``act`` method and ``pi`` module should accept batches
            of observations as inputs, and ``q1`` and ``q2`` should accept a
            batch of observations and a batch of actions as inputs. When
            called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current
                                           | estimate of Q* for the provided
                                           | observations and actions. (Critical:
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target
            policy.

        noise_clip (float): Limit for absolute value of target policy
            smoothing noise.

        policy_delay (int): Policy will only be updated once every
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)

    writer = SummaryWriter('./path/to/log')

    env, test_env = ICRABattleField(), ICRABattleField()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    red_ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    # red_ac.red_load()
    # blue_ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    # blue_ac.blue_load()
    print(red_ac)
    ac_targ = deepcopy(red_ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience).
    # Materialize the chain as a list so it can be re-iterated when
    # freezing/unfreezing the Q-networks below.
    q_params = list(itertools.chain(red_ac.q1.parameters(), red_ac.q2.parameters()))

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module)
                       for module in [red_ac.pi, red_ac.q1, red_ac.q2])

    # Set up function for computing TD3 Q-losses (the main point of difference from DDPG)
    def compute_loss_q(data):
        red_o, red_a, r, red_o2, d = (data['obs'], data['act'], data['rew'],
                                      data['obs2'], data['done'])

        q1 = red_ac.q1(red_o, red_a)
        q2 = red_ac.q2(red_o, red_a)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(red_o2)

            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            red_a2 = pi_targ + epsilon
            red_a2 = torch.clamp(red_a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(red_o2, red_a2)
            q2_pi_targ = ac_targ.q2(red_o2, red_a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().numpy(),
                         Q2Vals=q2.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        red_o = data['obs']
        q1_pi = red_ac.q1(red_o, red_ac.pi(red_o))
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(red_ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

            # Freeze Q-networks so you don't waste computational effort
            # computing gradients for them during the policy learning step.
            for p in q_params:
                p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

            # Unfreeze Q-networks so you can optimize them at the next step.
            for p in q_params:
                p.requires_grad = True

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(red_ac.parameters(), ac_targ.parameters()):
                    # NB: We use in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

        return loss_q

    def get_action(ac, o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            red_o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                red_o, r, d, _ = test_env.step(get_action(red_ac, red_o, 0))
                ep_ret += r
                ep_len += 1

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    red_o, blue_o = env.reset()
    ep_ret, ep_len = 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            red_a = get_action(red_ac, red_o, act_noise)
        else:
            red_a = env.action_space.sample()
        # blue_a = get_action(blue_ac, blue_o, act_noise)
        blue_a = env.action_space.sample()

        # Step the env
        red_o2, blue_o, r, d, _ = env.step(red_a, blue_a, ep_len)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(red_o, red_a, r, red_o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        red_o = red_o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            writer.add_scalar('Return', ep_ret, t)
            print("times:", t, "/", total_steps, "Return:", ep_ret)
            red_o, blue_o = env.reset()
            ep_ret, ep_len = 0, 0

        if t % 2000000 == 0:
            os.makedirs('./model1_8_2/' + str(int(t / 2000000)))
            red_ac.save('./model1_8_2/' + str(int(t / 2000000)) + '/')

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                loss_q = update(data=batch, timer=j)
            writer.add_scalar('Loss Q', loss_q, t)
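# --- Hedged usage sketch (not part of the original file) ---
# A minimal example of how td3() above might be invoked. The hidden_sizes
# value is an assumption borrowed from the MLPActorCritic call in the test
# script; the smaller epochs setting is illustrative only.
if __name__ == '__main__':
    td3(actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(256, 256, 256, 256)),
        seed=0,
        epochs=10,
        batch_size=100,
        act_noise=0.1)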
torch.random.manual_seed(args.seed)
torch.cuda.random.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

agent = ActorCriticAgent()
if args.load_model:
    agent.load_model(args.load_model_path)
if args.enemy == "hand":
    agent2 = HandAgent()
elif args.enemy == "AC":
    agent2 = ActorCriticAgent()
    agent2.load_model(args.load_model_path)

env = ICRABattleField()
env.seed(args.seed)

losses = []
rewards = []
for i_episode in range(1, args.epoch + 1):
    print("Epoch: [{}/{}]".format(i_episode, args.epoch))

    # Initialize the environment and state
    action = Action()
    pos = env.reset()
    if args.enemy == "hand":
        agent2.reset(pos)
    state, reward, done, info = env.step(action)

    for t in range(2 * 60 * 30):
        # Other agent
        if args.enemy == "hand":
            env.set_robot_action(ID_B1, agent2.select_action(state[ID_B1]))
agent = ActorCriticAgent()  # newly added red-side agent
aux_agent = ActorCriticAgent()
if args.load_model:
    aux_agent.load_model(args.save_model_path)
    agent.load_model(args.save_model_path)
if args.enemy == "hand":
    agent2 = HandAgent()
    aux_agent2 = HandAgent()
elif args.enemy == "AC":
    agent2 = ActorCriticAgent()
    agent2.load_model(args.load_model_path)
    aux_agent2 = ActorCriticAgent()
    aux_agent2.load_model(args.load_model_path)

env = ICRABattleField()
if args.enemy == "person":
    env.render()
    env.viewer.window.on_key_press = env.key_press
    env.viewer.window.on_key_release = env.key_release
env.seed(args.seed)

losses = []
rewards = []
for i_episode in range(1, args.epoch + 1):
    print("Epoch: [{}/{}]".format(i_episode, args.epoch))

    # Initialize the environment and state
    action = Action()
    pos = env.reset()
    if args.enemy == "hand":
        pass
        # agent2.reset([7.5, 0.5])
parser.add_argument("--load_model_path", type=str, default="ICRA_save.model", help="The path of trained model") parser.add_argument("--epoch", type=int, default=50, help="Number of epoches to test") args = parser.parse_args() torch.random.manual_seed(args.seed) torch.cuda.random.manual_seed(args.seed) np.random.seed(args.seed) #random.seed(args.seed) env = ICRABattleField() env.seed(args.seed) obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] act_limit = env.action_space.high[0] red_agent = MLPActorCritic(env.observation_space, env.action_space, hidden_sizes=(256, 256, 256, 256)) red_agent.red_load() # blue_agent = MLPActorCritic(env.observation_space, env.action_space, hidden_sizes=(256,256,256,256)) # blue_agent.blue_load() start_time = time.time()
        if dones or step == max_steps - 1:
            dones = [1 for _ in range(maddpg.num_agents)]
            maddpg.replay_buffer.push(states, actions, rewards, next_states, dones)
            episode_rewards.append(episode_reward)
            print("episode: {} | reward: {} \n".format(episode, episode_reward))
            writer.add_scalar('Return', episode_reward, episode)
            states = maddpg.env.reset()
            step = 0
        else:
            dones = [0 for _ in range(maddpg.num_agents)]
            maddpg.replay_buffer.push(states, actions, rewards, next_states, dones)
            states = next_states

        if len(maddpg.replay_buffer) > batch_size:
            maddpg.update(batch_size, writer, episode)
        # env.render()

        if episode % 2000000 == 0:
            os.makedirs('./model2_0_1/' + str(int(episode / 2000000)))
            maddpg.agents[0].save_ID_R1('./model2_0_1/' + str(int(episode / 2000000)) + '/')
            maddpg.agents[1].save_ID_R2('./model2_0_1/' + str(int(episode / 2000000)) + '/')


env = ICRABattleField()
maddpg = MADDPG(env, 1000000)
run(maddpg, 7000, 3000, 32, 2000000)