def __init__(self, env_id, input_shape, n_actions, icm, n_threads=8):
    names = [str(i) for i in range(1, n_threads + 1)]

    # Globally shared actor-critic (and optionally ICM) plus shared optimizers
    global_actor_critic = ActorCritic(input_shape, n_actions)
    global_actor_critic.share_memory()
    global_optim = SharedAdam(global_actor_critic.parameters())

    if not icm:
        global_icm = None
        global_icm_optim = None
    else:
        global_icm = ICM(input_shape, n_actions)
        global_icm.share_memory()
        global_icm_optim = SharedAdam(global_icm.parameters())

    # One worker process per thread name
    self.ps = [mp.Process(target=worker,
                          args=(name, input_shape, n_actions,
                                global_actor_critic, global_icm,
                                global_optim, global_icm_optim,
                                env_id, n_threads, icm))
               for name in names]

    for p in self.ps:
        p.start()
    for p in self.ps:
        p.join()
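# Hedged usage sketch (not from the source). It assumes the __init__ above belongs
# to a class named ParallelEnv, that the target environment is CartPole (so
# input_shape=[4], n_actions=2), and that the script is run as the main module so
# torch.multiprocessing can spawn the workers safely; these names and values are
# placeholders, not part of the original code.
if __name__ == '__main__':
    import torch.multiprocessing as mp
    mp.set_start_method('spawn', force=True)            # safer default across platforms
    parallel_env = ParallelEnv(env_id='CartPole-v1',     # placeholder environment id
                               input_shape=[4],          # CartPole observation size
                               n_actions=2,
                               icm=True,                 # enable the curiosity module
                               n_threads=8)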
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    # Pretrained environment model used by the imagination core
    env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    # Distilled (rollout) policy used to pick actions inside imagined trajectories
    distil_policy = ActorCritic(envs.observation_space.shape, envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1, state_shape, num_actions, num_rewards,
                                  env_model, distil_policy,
                                  full_rollout=full_rollout)

    actor_critic = I2A(state_shape, num_actions, num_rewards, 256, imagination,
                       full_rollout=full_rollout)

    # RMSprop hyperparameters
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    # if USE_CUDA:
    #     env_model = env_model.cuda()
    #     distil_policy = distil_policy.cuda()
    #     actor_critic = actor_critic.cuda()

    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    # rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))
    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):
        # Collect a short rollout from the vectorized environments
        for step in range(num_steps):
            # if USE_CUDA:
            #     current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            # if USE_CUDA:
            #     masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        # Distillation loss: cross-entropy between the I2A policy (teacher,
        # detached) and the rollout policy (student); note the leading minus sign.
        distil_loss = -0.01 * (F.softmax(logit).detach() *
                               F.log_softmax(distil_logit)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        # A2C update for the I2A policy
        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        # Separate update for the distilled rollout policy
        distil_optimizer.zero_grad()
        distil_loss.backward()
        distil_optimizer.step()

        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            # clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
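# Hedged sketch (not from the source) of the module-level names main() relies on.
# num_envs, mode, task_rewards, num_pixels, make_env and the MiniPacman-style
# constructor below are assumptions/placeholders, shown only to make the training
# loop's dependencies explicit.
num_envs = 16                                       # number of parallel environments
mode = 'regular'                                    # task mode, also the checkpoint suffix
task_rewards = {'regular': [0., 1., 2., 5., 10.]}   # placeholder reward values per mode
num_pixels = 8                                      # placeholder: pixel classes the EnvModel predicts

def make_env():
    def _thunk():
        return MiniPacman(mode, 1000)               # assumed environment constructor
    return _thunk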
class PPO:
    def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
                 gamma, K_epochs, eps_clip):
        self.lr = lr
        self.device = device
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        # self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        if np.any(np.isnan(state)):
            print('in select action: state is nan', state)
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of returns
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalize the returns
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors
        old_states_ = torch.squeeze(
            torch.stack(memory.states).to(self.device)).detach()
        old_actions_ = torch.squeeze(
            torch.stack(memory.actions).to(self.device)).detach()
        old_logprobs_ = torch.squeeze(
            torch.stack(memory.logprobs)).to(self.device).detach()

        batch_size = old_states_.shape[0]
        mini_batch_size = batch_size // 8  # 64

        # Optimize the policy for K epochs over random mini-batches
        for _ in range(self.K_epochs):
            for i in range(batch_size // mini_batch_size):
                rand_ids = np.random.randint(0, batch_size, mini_batch_size)
                old_states = old_states_[rand_ids, :]
                old_actions = old_actions_[rand_ids, :]
                old_logprobs = old_logprobs_[rand_ids, :]
                rewards_batch = rewards[rand_ids]

                # Evaluate old actions and values
                logprobs, state_values, dist_entropy = self.policy.evaluate(
                    old_states, old_actions)

                # Ratio pi_theta / pi_theta_old
                ratios = torch.exp(logprobs - old_logprobs.detach())

                # Surrogate loss
                advantages = rewards_batch - state_values.detach()
                advantages = advantages.reshape((advantages.shape[0], 1))
                surr1 = ratios * advantages
                # standard PPO clipping would use:
                # surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
                surr2 = 1 * advantages  # as per the paper
                surr = -torch.min(surr1, surr2).mean()

                w_crit_loss = 1
                loss = surr + w_crit_loss * (rewards_batch - state_values).pow(2).mean()
                # optionally: - 0.01 * dist_entropy

                # Take a gradient step
                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()

        # Copy new weights into the old policy
        self.policy_old.load_state_dict(self.policy.state_dict())
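# Hedged usage sketch (not from the source): a minimal rollout buffer with the
# attributes the PPO class above reads (states, actions, logprobs, rewards,
# is_terminals), plus an outline of the interaction loop. ActorCritic.act() is
# assumed to append state, action and log-prob to the memory; the environment id
# 'Pendulum-v1' and all hyperparameters below are placeholders.
import gym

class Memory:
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()

env = gym.make('Pendulum-v1')
device = torch.device('cpu')
ppo = PPO(device,
          state_dim=env.observation_space.shape[0],
          action_dim=env.action_space.shape[0],
          action_std=0.5, lr=3e-4, betas=(0.9, 0.999),
          gamma=0.99, K_epochs=80, eps_clip=0.2)

memory = Memory()
timestep = 0
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        timestep += 1
        action = ppo.select_action(state, memory)   # policy_old acts and logs to memory
        state, reward, done, _ = env.step(action)
        memory.rewards.append(reward)
        memory.is_terminals.append(done)
        if timestep % 4000 == 0:                    # update after a fixed horizon
            ppo.update(memory)
            memory.clear()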
def train(args, dynet):
    torch.manual_seed(args.seed)

    embedding_size = args.embedding_size
    lstm_size = args.lstm_size
    num_modules = len(dynet.library) + 1

    libr = librarian.SimpleLibrarian(num_modules, embedding_size)
    print(type(libr))
    model = ActorCritic(num_modules, libr, lstm_size)
    env = learning_env.Environment(args, dynet, libr)

    optimizer = optim.Adam(model.parameters(), lr=args.ac_lr)

    model.train()

    values = []
    log_probs = []

    state = env.reset()
    # state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        # model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, lstm_size))
            hx = Variable(torch.zeros(1, lstm_size))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Roll out up to num_steps actions with the LSTM policy
        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((state.unsqueeze(0)), (hx, cx))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy()[0, 0])
            done = done or episode_length >= args.num_steps
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()
                # state = torch.from_numpy(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap from the value estimate if the episode did not terminate
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0)), (hx, cx))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * Variable(gae) \
                - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()

        # Manual global gradient-norm clipping at 40
        global_norm = 0
        for param in model.parameters():
            global_norm += param.grad.data.pow(2).sum()
        global_norm = math.sqrt(global_norm)
        ratio = 40 / global_norm
        if ratio < 1:
            for param in model.parameters():
                param.grad.data.mul_(ratio)

        optimizer.step()
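# Hedged note (not from the source): the manual global-norm clipping block at the
# end of train() computes the same quantity as PyTorch's built-in utility, so it
# could be replaced by a single call placed after backward() and before
# optimizer.step():
#
#     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=40)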
class PPO(nn.Module):
    def __init__(self, state_dim, action_dim, eps=0.2, gamma=0.99, lambda_=0.95,
                 K_epoch=80, batch_size=64):
        super(PPO, self).__init__()
        self.eps = eps
        self.gamma = gamma
        self.lambda_ = lambda_
        self.K_epoch = K_epoch
        self.batch_size = batch_size

        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
        for param in self.model_old.parameters():
            param.requires_grad = False
        self.copy_weights()

    def forward(self, x):
        self.pi, self.v = self.model_old(x)
        return self.pi, self.v

    def copy_weights(self):
        self.model_old.load_state_dict(self.model.state_dict())

    def update(self, buffer, optimizer):
        self.model.train()
        self.model_old.eval()

        self.advantage_fcn(buffer.data)
        batch_loss, batch_clip_loss, batch_vf_loss = [], [], []
        for epoch in range(self.K_epoch):
            for state, action, next_s, reward, log_prob_old, entropy, advantage in \
                    buffer.get_data(self.batch_size):
                pi, v = self.model(state)
                log_prob_pi = pi.log_prob(action)
                prob_ratio = torch.exp(log_prob_pi - log_prob_old)

                first_term = prob_ratio * advantage
                second_term = self.clip_by_value(prob_ratio) * advantage
                loss_clip = (torch.min(first_term, second_term)).mean()

                _, v_next = self.model_old(next_s)
                v_target = reward + self.gamma * v_next
                loss_vf = ((v - v_target) ** 2).mean()  # squared error loss: (v(s_t) - v_target)**2

                loss = -(loss_clip - loss_vf)
                # loss = -(loss_clip - 0.5 * loss_vf + 0.01 * entropy.mean())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss.append(loss.detach().numpy())
                batch_clip_loss.append(loss_clip.detach().numpy())
                batch_vf_loss.append(loss_vf.detach().numpy())

        self.copy_weights()
        buffer.reset()

    def advantage_fcn(self, buffer, normalize=True):
        _, v_st1 = self.model(torch.stack(buffer['next_s']))
        _, v_s = self.model(torch.stack(buffer['s']))
        deltas = torch.stack(buffer['r']) + self.gamma * v_st1 - v_s

        advantage, temp = [], 0
        idxs = torch.tensor(range(len(deltas) - 1, -1, -1))  # reverse index order
        reverse_deltas = deltas.index_select(0, idxs)
        for delta_t in reverse_deltas:
            temp = delta_t + self.lambda_ * self.gamma * temp
            advantage.append(temp)

        advantage = torch.as_tensor(advantage[::-1])  # re-reverse
        if normalize:
            advantage = (advantage - advantage.mean()) / advantage.std()

        buffer['advantage'] = advantage.unsqueeze(1)

    def clip_by_value(self, x):
        return x.clamp(1 - self.eps, 1 + self.eps)  # clamp(min, max)
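# Hedged usage sketch (not from the source): the buffer passed to update() is
# assumed to expose a .data dict with keys 's', 'next_s' and 'r' (read by
# advantage_fcn), a get_data(batch_size) iterator yielding
# (state, action, next_s, reward, log_prob_old, entropy, advantage) mini-batches,
# and a reset() method. The dimensions and learning rate below are placeholders.
agent = PPO(state_dim=4, action_dim=2)
optimizer = torch.optim.Adam(agent.model.parameters(), lr=3e-4)

# ... collect one rollout into `buffer` by sampling actions from agent(state) ...
agent.update(buffer, optimizer)   # runs K_epoch PPO passes, then syncs model_old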
def worker(name, input_shape, n_actions, global_agent, global_icm,
           optimizer, icm_optimizer, env_id, n_threads, icm=False):
    T_MAX = 20

    local_agent = ActorCritic(input_shape, n_actions)

    if icm:
        local_icm = ICM(input_shape, n_actions)
        algo = 'ICM'
    else:
        intrinsic_reward = T.zeros(1)
        algo = 'A3C'

    memory = Memory()

    env = gym.make(env_id)

    t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0

    while episode < max_eps:
        obs = env.reset()
        hx = T.zeros(1, 256)
        score, done, ep_steps = 0, False, 0
        while not done:
            state = T.tensor([obs], dtype=T.float)
            action, value, log_prob, hx = local_agent(state, hx)
            obs_, reward, done, info = env.step(action)
            t_steps += 1
            ep_steps += 1
            score += reward
            reward = 0  # turn off extrinsic rewards
            memory.remember(obs, action, reward, obs_, value, log_prob)
            obs = obs_

            if ep_steps % T_MAX == 0 or done:
                states, actions, rewards, new_states, values, log_probs = \
                    memory.sample_memory()
                if icm:
                    intrinsic_reward, L_I, L_F = \
                        local_icm.calc_loss(states, new_states, actions)

                loss = local_agent.calc_loss(obs, hx, done, rewards, values,
                                             log_probs, intrinsic_reward)

                optimizer.zero_grad()
                hx = hx.detach_()
                if icm:
                    icm_optimizer.zero_grad()
                    (L_I + L_F).backward()
                loss.backward()
                T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)

                for local_param, global_param in zip(
                        local_agent.parameters(),
                        global_agent.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_agent.load_state_dict(global_agent.state_dict())

                if icm:
                    for local_param, global_param in zip(
                            local_icm.parameters(),
                            global_icm.parameters()):
                        global_param._grad = local_param.grad
                    icm_optimizer.step()
                    local_icm.load_state_dict(global_icm.state_dict())
                memory.clear_memory()

        if name == '1':
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} '
                  'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format(
                      algo, episode, name, n_threads,
                      t_steps / 1e6, score,
                      T.sum(intrinsic_reward), avg_score))
        episode += 1

    if name == '1':
        x = [z for z in range(episode)]
        fname = algo + '_CartPole_no_rewards.png'
        plot_learning_curve(x, scores, fname)
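# Hedged sketch (not from the source) of the Memory interface worker() relies on:
# remember() appends one transition, sample_memory() returns the stored fields as
# parallel lists in the order the worker unpacks them, clear_memory() empties the
# buffer. The class name and field names are assumptions consistent with the calls
# above.
class Memory:
    def __init__(self):
        self.states, self.actions, self.rewards = [], [], []
        self.new_states, self.values, self.log_probs = [], [], []

    def remember(self, state, action, reward, new_state, value, log_prob):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.new_states.append(new_state)
        self.values.append(value)
        self.log_probs.append(log_prob)

    def sample_memory(self):
        return (self.states, self.actions, self.rewards,
                self.new_states, self.values, self.log_probs)

    def clear_memory(self):
        self.__init__()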