class Agent(object):
    def __init__(self, sess, hps, rm):
        self.sess = sess
        self.hps = hps
        self.rm = rm
        self.ou = OrnsteinUhlenbeck(hps['a_dim'])
        self.gamma = hps['gamma']
        self.tau = hps['tau']
        self.a_bound = hps['a_bound']
        self.noise_decay = hps['noise_decay']

        self.actor = Actor(self.sess, self.hps, 'actor', trainable=True)
        self.actor_target = Actor(self.sess, self.hps, 'actor_target',
                                  trainable=False)
        self.critic = Critic(self.sess, self.hps, 'critic', trainable=True)
        self.critic_target = Critic(self.sess, self.hps, 'critic_target',
                                    trainable=False)

        self.critic.build_train_op(self.actor, 'critic')
        self.actor.build_train_op(self.critic, 'actor')

        self.actor_soft_update_op = build_soft_update_op(
            self.sess, 'actor_target', 'actor', self.tau)
        self.critic_soft_update_op = build_soft_update_op(
            self.sess, 'critic_target', 'critic', self.tau)

    def explore(self, state, i):
        # Add exploration noise that decays with the episode index i.
        action = self.actor.act(state)
        # action += self.ou.sample() * self.a_bound * self.noise_decay ** i
        action += self.ou.sample() * self.noise_decay ** i
        return action

    def exploit(self, state):
        return self.actor.act(state)

    def learn(self):
        s1, a1, r1, s2 = self.rm.sample()

        # Optimize critic: regress Q(s1, a1) towards the Bellman target.
        a2 = self.actor_target.act(s2)
        q2 = self.critic_target.predict(s2, a2)
        y1 = r1 + self.gamma * q2
        critic_loss, _ = self.critic.backward(s1, a1, y1)

        # Optimize actor: follow the critic's action-value gradient.
        actor_loss, _ = self.actor.backward(s1)

        # Soft-update the target networks.
        self.sess.run(self.actor_soft_update_op)
        self.sess.run(self.critic_soft_update_op)
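# The OrnsteinUhlenbeck class used above is not defined in this file. Below is
# a minimal sketch of what it is assumed to look like: a standard
# Ornstein-Uhlenbeck process providing temporally correlated exploration noise,
# as used in DDPG. The parameter defaults (mu=0, theta=0.15, sigma=0.2) follow
# the DDPG paper; the values actually used here are an assumption.
import numpy as np


class OrnsteinUhlenbeck(object):
    def __init__(self, a_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1.0):
        self.a_dim = a_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # Start the process at its mean.
        self.x = np.ones(self.a_dim) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        dx = self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn(self.a_dim)
        self.x = self.x + dx
        return self.x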
class DDPGAgent(BaseAgent):
    def __init__(self, sess, hps, rm):
        # TODO: Presumably the parameters for the BaseAgent still need to be
        # passed in here as well.
        super(DDPGAgent, self).__init__()
        self.sess = sess
        self.hps = hps
        self.rm = rm
        self.ou = OrnsteinUhlenbeck(hps['a_dim'])
        self.gamma = hps['gamma']
        self.tau = hps['tau']
        self.a_bound = hps['a_bound']
        self.noise_decay = hps['noise_decay']

        self.actor = Actor(self.sess, self.hps, 'actor', trainable=True)
        self.actor_target = Actor(self.sess, self.hps, 'actor_target',
                                  trainable=False)
        self.critic = Critic(self.sess, self.hps, 'critic', trainable=True)
        self.critic_target = Critic(self.sess, self.hps, 'critic_target',
                                    trainable=False)

        self.critic.build_train_op(self.actor, 'critic')
        self.actor.build_train_op(self.critic, 'actor')

        self.actor_soft_update_op = build_soft_update_op(
            self.sess, 'actor_target', 'actor', self.tau)
        self.critic_soft_update_op = build_soft_update_op(
            self.sess, 'critic_target', 'critic', self.tau)

    def explore(self, state, i):
        # Add exploration noise that decays with the episode index i.
        action = self.actor.act(state)
        # action += self.ou.sample() * self.a_bound * self.noise_decay ** i
        action += self.ou.sample() * self.noise_decay ** i
        return action

    def exploit(self, state):
        return self.actor.act(state)

    def think(self, state, i):
        # The original did not return the action, so think() always yielded
        # None; the returns below fix that.
        if self.hps['mode'] == 'training':
            return self.explore(state, i)
        return self.exploit(state)

    def learn(self):
        s1, a1, r1, s2 = self.rm.sample()

        # Optimize critic: regress Q(s1, a1) towards the Bellman target.
        a2 = self.actor_target.act(s2)
        q2 = self.critic_target.predict(s2, a2)
        y1 = r1 + self.gamma * q2
        critic_loss, _ = self.critic.backward(s1, a1, y1)

        # Optimize actor: follow the critic's action-value gradient.
        actor_loss, _ = self.actor.backward(s1)

        # Soft-update the target networks.
        self.sess.run(self.actor_soft_update_op)
        self.sess.run(self.critic_soft_update_op)
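# build_soft_update_op is likewise undefined in this file. A minimal TF1-style
# sketch, under the assumption that 'actor'/'actor_target' etc. name the
# variable scopes of the corresponding networks and that variables of the two
# scopes line up when sorted by name: for each pair it builds the Polyak
# update target <- tau * source + (1 - tau) * target. sess is unused but kept
# to match the call sites above.
import tensorflow as tf


def build_soft_update_op(sess, target_scope, source_scope, tau):
    target_vars = sorted(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope),
        key=lambda v: v.name)
    source_vars = sorted(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=source_scope),
        key=lambda v: v.name)
    # Group all pairwise Polyak assignments into a single op.
    return tf.group(*[
        t.assign(tau * s + (1.0 - tau) * t)
        for t, s in zip(target_vars, source_vars)
    ])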
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if len(envs.observation_space.shape) == 3: actor_critic = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) target_actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) critic = Critic(in_channels=4, num_actions=envs.action_space.n) critic_target = Critic(in_channels=4, num_actions=envs.action_space.n) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if args.cuda: actor_critic.cuda() critic.cuda() critic_target.cuda() target_actor.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) critic_optim = optim.Adam(critic.parameters(), lr=1e-4) gamma = 0.99 tau = 0.001 #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) mem_buffer = ReplayBuffer() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, envs.action_space.n) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) value = critic.forward( Variable(rollouts.observations[step], volatile=True), action_log_prob) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks pre_state = rollouts.observations[step].cpu().numpy() update_current_obs(obs) mem_buffer.add((pre_state, current_obs, action_log_prob.data.cpu().numpy(), reward, done)) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True)) #[0].data next_value = critic.forward( Variable(rollouts.observations[-1], volatile=True), action_log_prob).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if True: state, next_state, action, reward, done = mem_buffer.sample(5) next_state = next_state.reshape([-1, *obs_shape]) state = state.reshape([-1, *obs_shape]) action = action.reshape([-1, 6]) next_q_values = critic_target( to_tensor(next_state, volatile=True), target_actor(to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True))[0]) next_q_values.volatile = False target_q_batch = to_tensor(reward) + args.gamma * to_tensor( done.astype(np.float)) * next_q_values critic.zero_grad() q_batch = critic(to_tensor(state), to_tensor(action)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() critic_optim.step() actor_critic.zero_grad() policy_loss = -critic( to_tensor(state), actor_critic(to_tensor(state), to_tensor(state), to_tensor(state))[0]) policy_loss = policy_loss.mean() policy_loss.backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() soft_update(target_actor, actor_critic, tau) soft_update(critic_target, critic, tau) ''' if args.algo in ['a2c', 'acktr']: action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) #advantages = Variable(rollouts.returns[:-1]) - values advantages = rollouts.returns[:-1] - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages) * action_log_probs).mean() #action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() critic_optim.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() critic_optim.step() ''' rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] 
torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss.data.cpu().numpy()[0], policy_loss.data.cpu().numpy()[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
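# soft_update and to_tensor are used by both training loops but not defined in
# this file. Minimal sketches, assuming the old (PyTorch <= 0.3)
# Variable/volatile API that the rest of this code uses, and the script-level
# args for the CUDA flag:
from torch.autograd import Variable


def to_tensor(ndarray, volatile=False, requires_grad=False):
    # Wrap a numpy array as a (CUDA) Variable.
    t = torch.from_numpy(ndarray).float()
    if args.cuda:
        t = t.cuda()
    return Variable(t, volatile=volatile, requires_grad=requires_grad)


def soft_update(target, source, tau):
    # Polyak-average the source parameters into the target network:
    # target <- tau * source + (1 - tau) * target.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)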
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' # logger = Logger(environment_name = args.env_name, entropy_coff= 'entropy_coeff_' + str(args.entropy_coef), folder = args.folder) # logger.save_args(args) # print ("---------------------------------------") # print ('Saving to', logger.save_folder) # print ("---------------------------------------") if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] ### for the number of processes to use if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) ## ALE Environments : mostly has Discrete action_space type if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] ### shape==3 for ALE Environments : States are 3D (Image Pi) if len(envs.observation_space.shape) == 3: actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) target_actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) critic = Critic(in_channels=4, num_actions=envs.action_space.n) critic_target = Critic(in_channels=4, num_actions=envs.action_space.n) baseline_target = Baseline_Critic(in_channels=4, num_actions=envs.action_space.n) if args.cuda: actor.cuda() critic.cuda() critic_target.cuda() target_actor.cuda() baseline_target.cuda() actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr) critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr) baseline_optim = optim.Adam(actor.parameters(), lr=1e-4) tau_soft_update = 0.001 mem_buffer = ReplayBuffer() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor.state_size, envs.action_space.n) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): temperature = 1.0 ## num_steps = 5 as in A2C for step in range(args.num_steps): temperature = temperature / (step + 1) # Sample actions action, action_log_prob, states, dist_entropy = actor.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True), temperature, envs.action_space.n, args.num_processes) value = critic.forward( Variable(rollouts.observations[step], volatile=True), action_log_prob) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks pre_state = rollouts.observations[step].cpu().numpy() update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, dist_entropy.data, value.data, reward, masks) nth_step_return = rollouts.returns[0].cpu().numpy() current_state = rollouts.observations[0].cpu().numpy() nth_state = rollouts.observations[-1].cpu().numpy() current_action = rollouts.action_log_probs[0].cpu().numpy() current_action_dist_entropy = rollouts.dist_entropy[0].cpu().numpy() mem_buffer.add((current_state, nth_state, current_action, nth_step_return, done, current_action_dist_entropy)) action, action_log_prob, states, dist_entropy = actor.act( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True), temperature, envs.action_space.n, args.num_processes) #[0].data next_value = critic.forward( Variable(rollouts.observations[-1], volatile=True), action_log_prob).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) bs_size = args.batch_size if len(mem_buffer.storage) >= bs_size: ##samples from the replay buffer state, next_state, action, returns, done, entropy_log_prob = mem_buffer.sample( bs_size) next_state = next_state.reshape([-1, *obs_shape]) state = state.reshape([-1, *obs_shape]) action = action.reshape([-1, envs.action_space.n]) #current Q estimate q_batch = critic(to_tensor(state), to_tensor(action)) # target Q estimate next_state_action_probs = target_actor( to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True)) next_q_values = critic_target(to_tensor(next_state, volatile=True), next_state_action_probs[1]) next_q_values.volatile = False target_q_batch = to_tensor(returns) + args.gamma * to_tensor( done.astype(np.float)) * next_q_values critic.zero_grad() value_loss = criterion(q_batch, target_q_batch) if args.gradient_penalty == True: gradients = torch.autograd.grad(value_loss, critic.parameters(), allow_unused=True, retain_graph=True, create_graph=True, only_inputs=True)[0] gradient_penalty = ((gradients.norm(2, dim=1) - 1)** 2).mean() * args.lambda_grad_penalty gradient_penalty.backward() else: value_loss = criterion(q_batch, 
target_q_batch) value_loss.backward() critic_optim.step() actor.zero_grad() policy_loss = -critic( to_tensor(state), actor(to_tensor(state), to_tensor(state), to_tensor(state))[0]) ### Soft trust region constraint for the actor current_action_probs = actor(to_tensor(state, volatile=False), to_tensor(state, volatile=False), to_tensor(state, volatile=False))[0] target_action_probs = target_actor(to_tensor(state, volatile=True), to_tensor(state, volatile=True), to_tensor(state, volatile=True))[0] policy_regularizer = criterion(current_action_probs, target_action_probs) ## Actor update with entropy penalty policy_loss = policy_loss.mean() - args.entropy_coef * Variable(torch.from_numpy(np.expand_dims(entropy_log_prob.mean(), axis=0))).cuda() \ + args.actor_kl_lambda * policy_regularizer if args.actor_several_updates == True: for p in range(args.actor_updates): policy_loss.backward(retain_variables=True) else: policy_loss.backward() ##clipping of gradient norms gradient_norms = nn.utils.clip_grad_norm(actor.parameters(), args.max_grad_norm) print("gradient_norms", gradient_norms) actor_optim.step() if args.second_order_grads == True: """ Training the Baseline critic (f(s, \mu(s))) """ baseline_target.zero_grad() ## f(s, \mu(s)) current_baseline = baseline_target( to_tensor(state), actor(to_tensor(state), to_tensor(state), to_tensor(state))[0]) ## \grad f(s,a) grad_baseline_params = torch.autograd.grad( current_baseline.mean(), actor.parameters(), retain_graph=True, create_graph=True) ## MSE : (Q - f)^{2} baseline_loss = (q_batch.detach() - current_baseline).pow(2).mean() # baseline_loss.volatile=True actor.zero_grad() baseline_target.zero_grad() grad_norm = 0 for grad_1, grad_2 in zip(grad_params, grad_baseline_params): grad_norm += grad_1.data.pow(2).sum() - grad_2.pow(2).sum() grad_norm = grad_norm.sqrt() ##Loss for the Baseline approximator (f) overall_loss = baseline_loss + args.lambda_second_order_grads * grad_norm overall_loss.backward() baseline_optim.step() soft_update(target_actor, actor, tau_soft_update) soft_update(critic_target, critic, tau_soft_update) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "" and len( mem_buffer.storage) >= bs_size: save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor if args.cuda: save_model = copy.deepcopy(actor).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(mem_buffer.storage) >= bs_size: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, Entropy {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss.data.cpu().numpy()[0], policy_loss.data.cpu().numpy()[0], entropy_log_prob.mean())) final_rewards_mean = [final_rewards.mean()] final_rewards_median = [final_rewards.median()] final_rewards_min = [final_rewards.min()] final_rewards_max = [final_rewards.max()] all_value_loss = [value_loss.data.cpu().numpy()[0]] all_policy_loss = [policy_loss.data.cpu().numpy()[0]] # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max, all_value_loss, 
all_policy_loss) # # logger.save() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
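# ReplayBuffer is also not defined in this file. Both training loops only rely
# on .storage, .add(transition) and .sample(batch_size) returning per-field
# arrays, so a minimal sketch could look like the following; uniform sampling
# and the capacity default are assumptions.
import random


class ReplayBuffer(object):
    def __init__(self, capacity=1000000):
        self.capacity = capacity
        self.storage = []

    def add(self, transition):
        # transition is a tuple, e.g. (state, next_state, action, return,
        # done, entropy); drop the oldest entry once capacity is reached.
        self.storage.append(transition)
        if len(self.storage) > self.capacity:
            self.storage.pop(0)

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        # Re-group the tuple fields into per-field numpy arrays, so callers
        # can unpack e.g. state, next_state, action, returns, done, entropy.
        return tuple(np.array(field) for field in zip(*batch))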