def main():
    human_model = ActorCritic()
    human_model.load_state_dict(torch.load('ac_para.pkl'))
    env = gym.make('CartPole-v1')
    model = AskActorCritic()
    print_interval = 20
    score = 0.0

    for n_epi in range(10000):
        done = False
        s = env.reset()
        step, ask_step = 0, 0
        while not done:
            for t in range(n_rollout):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                if a == 2:  # human action
                    prob = human_model.pi(torch.from_numpy(s).float())
                    m = Categorical(prob)
                    a = m.sample().item()
                    model.put_human_data((s, a))
                    ask_step += 1
                s_prime, r, done, info = env.step(a)
                model.put_data((s, a, r, s_prime, done))
                s = s_prime
                score += r
                step += 1
                if done:
                    break
            model.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}, ask rate : {:.2f}".format(
                n_epi, score / print_interval, ask_step / step))
            score = 0.0
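# --- Illustration (not part of the original source) ---
# A minimal sketch of the interface main() above assumes for AskActorCritic: a
# CartPole actor-critic whose policy has a third "ask the human" action, plus buffers
# for on-policy transitions and human-labelled (state, action) pairs. The layer sizes,
# learning rate, reward scaling, and the cross-entropy imitation term on the human
# data are assumptions, not taken from the source.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class AskActorCritic(nn.Module):
    def __init__(self, lr=0.0002, gamma=0.98):
        super().__init__()
        self.gamma = gamma
        self.data, self.human_data = [], []
        self.fc1 = nn.Linear(4, 256)
        self.fc_pi = nn.Linear(256, 3)   # left, right, "ask the human"
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def pi(self, x, softmax_dim=0):
        return F.softmax(self.fc_pi(F.relu(self.fc1(x))), dim=softmax_dim)

    def v(self, x):
        return self.fc_v(F.relu(self.fc1(x)))

    def put_data(self, transition):
        self.data.append(transition)

    def put_human_data(self, pair):
        self.human_data.append(pair)

    def train_net(self):
        if not self.data:
            return
        s, a, r, s_prime, done = zip(*self.data)
        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a).unsqueeze(1)
        r = torch.tensor(r, dtype=torch.float).unsqueeze(1) / 100.0
        s_prime = torch.tensor(s_prime, dtype=torch.float)
        done_mask = torch.tensor([0.0 if d else 1.0 for d in done]).unsqueeze(1)

        # One-step TD target and advantage for the actor-critic update.
        td_target = r + self.gamma * self.v(s_prime) * done_mask
        delta = td_target - self.v(s)

        pi_a = self.pi(s, softmax_dim=1).gather(1, a)
        loss = -torch.log(pi_a) * delta.detach() + \
            F.smooth_l1_loss(self.v(s), td_target.detach())
        loss = loss.mean()

        # Optional imitation term on the human-labelled pairs (an assumption here).
        if self.human_data:
            hs, ha = zip(*self.human_data)
            hs = torch.tensor(hs, dtype=torch.float)
            ha = torch.tensor(ha)
            loss = loss + F.cross_entropy(self.fc_pi(F.relu(self.fc1(hs))), ha)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.data, self.human_data = [], []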
envs = SubprocVecEnv(envs)

state_shape = envs.observation_space.shape
num_actions = envs.action_space.n

env_model = EnvModel(envs.observation_space.shape, envs.action_space.n,
                     num_pixels, len(mode_rewards["regular"]))
actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(env_model.parameters())

env_model = env_model.to(DEVICE)
actor_critic = actor_critic.to(DEVICE)

checkpoint = torch.load(os.path.join(ACTOR_CRITIC_PATH, "actor_critic_checkpoint"))
actor_critic.load_state_dict(checkpoint['actor_critic_state_dict'])

reward_coef = 0.1
num_updates = args.epoch

losses = []
all_rewards = []

for frame_idx, states, actions, rewards, next_states, dones in play_games(envs, num_updates, actor_critic):
    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions)
    batch_size = states.size(0)
    onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
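# --- Illustration (not part of the original source) ---
# A sketch of the play_games() generator the training loop above consumes. It mirrors
# the nested play_games() defined in the MiniPacman script further below, except that
# the policy is passed in explicitly; actor_critic.act() returning one action per
# parallel env (shape [num_envs, 1]) is an assumption based on its use below.
import numpy as np
import torch


def play_games(envs, frames, actor_critic):
    states = envs.reset()
    for frame_idx in range(frames):
        state_t = torch.FloatTensor(np.float32(states)).to(DEVICE)
        with torch.no_grad():
            actions = actor_critic.act(state_t)
        actions = actions.squeeze(1).cpu().numpy()
        next_states, rewards, dones, _ = envs.step(actions)
        yield frame_idx, states, actions, rewards, next_states, dones
        states = next_states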
class PPO:
    def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
                 gamma, K_epochs, eps_clip):
        self.lr = lr
        self.device = device
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        # self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        if np.any(np.isnan(state)):
            print('in select action: state is nan', state)
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of returns:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors:
        old_states_ = torch.squeeze(torch.stack(memory.states).to(self.device)).detach()
        old_actions_ = torch.squeeze(torch.stack(memory.actions).to(self.device)).detach()
        old_logprobs_ = torch.squeeze(torch.stack(memory.logprobs)).to(self.device).detach()

        batch_size = old_states_.shape[0]
        mini_batch_size = batch_size // 8  # 64

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            for i in range(batch_size // mini_batch_size):
                # Sample a random mini-batch and evaluate old actions and values:
                rand_ids = np.random.randint(0, batch_size, mini_batch_size)
                old_states = old_states_[rand_ids, :]
                old_actions = old_actions_[rand_ids, :]
                old_logprobs = old_logprobs_[rand_ids, :]
                rewards_batch = rewards[rand_ids]
                logprobs, state_values, dist_entropy = self.policy.evaluate(
                    old_states, old_actions)

                # Finding the ratio (pi_theta / pi_theta_old):
                ratios = torch.exp(logprobs - old_logprobs.detach())

                # Finding the surrogate loss:
                advantages = rewards_batch - state_values.detach()
                advantages = advantages.reshape((advantages.shape[0], 1))
                surr1 = ratios * advantages
                # Clipped surrogate objective, as per the PPO paper:
                surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                    1 + self.eps_clip) * advantages
                surr = -torch.min(surr1, surr2).mean()

                w_crit_loss = 1
                loss = surr + w_crit_loss * (rewards_batch - state_values).pow(2).mean()
                # - 0.01 * dist_entropy

                # Take a gradient step:
                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
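# --- Illustration (not part of the original source) ---
# A minimal sketch of the rollout buffer and outer loop this PPO class expects. The
# field names follow their use in select_action()/update(); the environment name,
# horizon, hyperparameters, and the assumption that policy_old.act() appends the
# state, action, and log-prob to the memory are all placeholders, not from the source.
import gym
import torch


class Memory:
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env = gym.make('Pendulum-v1')                 # any continuous-action env works here
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

ppo = PPO(device, state_dim, action_dim, action_std=0.5,
          lr=3e-4, betas=(0.9, 0.999), gamma=0.99, K_epochs=10, eps_clip=0.2)
memory = Memory()

update_timestep = 2000
timestep = 0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        timestep += 1
        # policy_old.act() is assumed to record state/action/logprob in memory.
        action = ppo.select_action(state, memory)
        state, reward, done, _ = env.step(action)
        memory.rewards.append(reward)
        memory.is_terminals.append(done)
        if timestep % update_timestep == 0:
            ppo.update(memory)
            memory.clear()
            timestep = 0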
class PPO(nn.Module):
    def __init__(self, state_dim, action_dim, eps=0.2, gamma=0.99, lambda_=0.95,
                 K_epoch=80, batch_size=64):
        super(PPO, self).__init__()
        self.eps = eps
        self.gamma = gamma
        self.lambda_ = lambda_
        self.K_epoch = K_epoch
        self.batch_size = batch_size

        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
        for param in self.model_old.parameters():
            param.requires_grad = False
        self.copy_weights()

    def forward(self, x):
        self.pi, self.v = self.model_old(x)
        return self.pi, self.v

    def copy_weights(self):
        self.model_old.load_state_dict(self.model.state_dict())

    def update(self, buffer, optimizer):
        self.model.train()
        self.model_old.eval()

        self.advantage_fcn(buffer.data)

        batch_loss, batch_clip_loss, batch_vf_loss = [], [], []
        for epoch in range(self.K_epoch):
            for state, action, next_s, reward, log_prob_old, entropy, advantage in \
                    buffer.get_data(self.batch_size):
                pi, v = self.model(state)
                log_prob_pi = pi.log_prob(action)
                prob_ratio = torch.exp(log_prob_pi - log_prob_old)

                first_term = prob_ratio * advantage
                second_term = self.clip_by_value(prob_ratio) * advantage
                loss_clip = (torch.min(first_term, second_term)).mean()

                _, v_next = self.model_old(next_s)
                v_target = reward + self.gamma * v_next
                loss_vf = ((v - v_target) ** 2).mean()  # squared-error loss: (v(s_t) - v_target)**2

                loss = -(loss_clip - loss_vf)  # -(loss_clip - 0.5*loss_vf + 0.01*entropy.mean())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss.append(loss.detach().numpy())
                batch_clip_loss.append(loss_clip.detach().numpy())
                batch_vf_loss.append(loss_vf.detach().numpy())

        self.copy_weights()
        buffer.reset()

    def advantage_fcn(self, buffer, normalize=True):
        _, v_st1 = self.model(torch.stack(buffer['next_s']))
        _, v_s = self.model(torch.stack(buffer['s']))
        deltas = torch.stack(buffer['r']) + self.gamma * v_st1 - v_s

        advantage, temp = [], 0
        idxs = torch.tensor(range(len(deltas) - 1, -1, -1))  # reverse
        reverse_deltas = deltas.index_select(0, idxs)
        for delta_t in reverse_deltas:
            temp = delta_t + self.lambda_ * self.gamma * temp
            advantage.append(temp)

        advantage = torch.as_tensor(advantage[::-1])  # re-reverse
        if normalize:
            advantage = (advantage - advantage.mean()) / advantage.std()

        buffer['advantage'] = advantage.unsqueeze(1)

    def clip_by_value(self, x):
        return x.clamp(1 - self.eps, 1 + self.eps)  # clamp(min, max)
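# --- Illustration (not part of the original source) ---
# advantage_fcn() above is a GAE(gamma, lambda) pass: delta_t = r_t + gamma*V(s_{t+1}) - V(s_t),
# accumulated backwards as A_t = delta_t + gamma*lambda*A_{t+1}. The same computation
# over plain Python lists (no buffer object, and, like the original, no terminal
# masking) looks like this:
def gae(rewards, values, next_values, gamma=0.99, lam=0.95):
    advantages, running = [], 0.0
    for r, v, v_next in zip(reversed(rewards), reversed(values), reversed(next_values)):
        delta = r + gamma * v_next - v          # one-step TD error
        running = delta + gamma * lam * running  # discounted sum of future TD errors
        advantages.append(running)
    return advantages[::-1]

# e.g. gae([1.0, 1.0, 1.0], [0.5, 0.5, 0.5], [0.5, 0.5, 0.0]) returns one advantage per step.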
# Alternative (smaller) network configuration and checkpoint, kept commented out:
"""
policy = ActorCritic(state_size=g_state_size,
                     action_size=g_action_size,
                     shared_layers=[128, 64],
                     critic_hidden_layers=[],
                     actor_hidden_layers=[],
                     init_type='xavier-uniform',
                     seed=0).to(g_device)
saved_model = 'ppo_128x64_a0_c0_470e.pth'
"""

policy = ActorCritic(state_size=g_state_size,
                     action_size=g_action_size,
                     shared_layers=[128, 128],
                     critic_hidden_layers=[64],
                     actor_hidden_layers=[64],
                     init_type='xavier-uniform',
                     seed=0).to(g_device)
saved_model = 'ppo_128x128_a64_c64_193e.pth'

# load the model
policy.load_state_dict(torch.load(saved_model))

# evaluate the model
for e in range(episode):
    rewards = eval_policy(envs=g_env, policy=policy, tmax=1000)
    total_rewards = np.sum(rewards, 0)
    scores_window.append(total_rewards.mean())
    print("Episode: {0:d}, score: {1}".format(e + 1, np.mean(scores_window)), end="\n")

g_env.close()
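# --- Illustration (not part of the original source) ---
# Setup that the evaluation loop above relies on but that is missing from this
# fragment: an episode count and a sliding score window. The values are placeholders.
from collections import deque

episode = 10                         # number of evaluation episodes to run
scores_window = deque(maxlen=100)    # window of recent mean scores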
def worker(name, input_shape, n_actions, global_agent, global_icm, optimizer,
           icm_optimizer, env_id, n_threads, icm=False):
    T_MAX = 20

    local_agent = ActorCritic(input_shape, n_actions)
    if icm:
        local_icm = ICM(input_shape, n_actions)
        algo = 'ICM'
    else:
        intrinsic_reward = T.zeros(1)
        algo = 'A3C'

    memory = Memory()
    env = gym.make(env_id)

    t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0

    while episode < max_eps:
        obs = env.reset()
        hx = T.zeros(1, 256)
        score, done, ep_steps = 0, False, 0
        while not done:
            state = T.tensor([obs], dtype=T.float)
            action, value, log_prob, hx = local_agent(state, hx)
            obs_, reward, done, info = env.step(action)
            t_steps += 1
            ep_steps += 1
            score += reward
            reward = 0  # turn off extrinsic rewards
            memory.remember(obs, action, reward, obs_, value, log_prob)
            obs = obs_

            if ep_steps % T_MAX == 0 or done:
                states, actions, rewards, new_states, values, log_probs = \
                    memory.sample_memory()
                if icm:
                    intrinsic_reward, L_I, L_F = \
                        local_icm.calc_loss(states, new_states, actions)

                loss = local_agent.calc_loss(obs, hx, done, rewards, values,
                                             log_probs, intrinsic_reward)

                optimizer.zero_grad()
                hx = hx.detach_()
                if icm:
                    icm_optimizer.zero_grad()
                    (L_I + L_F).backward()
                loss.backward()
                T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)

                # Push local gradients to the shared global agent, then sync back.
                for local_param, global_param in zip(
                        local_agent.parameters(), global_agent.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_agent.load_state_dict(global_agent.state_dict())

                if icm:
                    for local_param, global_param in zip(
                            local_icm.parameters(), global_icm.parameters()):
                        global_param._grad = local_param.grad
                    icm_optimizer.step()
                    local_icm.load_state_dict(global_icm.state_dict())
                memory.clear_memory()

        if name == '1':
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} '
                  'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format(
                      algo, episode, name, n_threads, t_steps / 1e6, score,
                      T.sum(intrinsic_reward), avg_score))
        episode += 1

    if name == '1':
        x = [z for z in range(episode)]
        fname = algo + '_CartPole_no_rewards.png'
        plot_learning_curve(x, scores, fname)
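# --- Illustration (not part of the original source) ---
# A compact sketch of the ICM module the worker assumes when icm=True, following
# Pathak et al. (2017): an inverse model classifies the action from (phi(s), phi(s')),
# a forward model predicts phi(s') from (phi(s), action), and the forward-model error
# is the intrinsic reward. Layer sizes and the alpha/beta weights are assumptions.
import torch as T
import torch.nn as nn
import torch.nn.functional as F


class ICM(nn.Module):
    def __init__(self, input_dims, n_actions, feature_dim=32, alpha=1.0, beta=0.2):
        super().__init__()
        self.alpha, self.beta, self.n_actions = alpha, beta, n_actions
        self.phi = nn.Sequential(nn.Linear(input_dims[0], feature_dim), nn.ELU())
        self.inverse = nn.Linear(2 * feature_dim, n_actions)
        self.forward_model = nn.Linear(feature_dim + n_actions, feature_dim)

    def calc_loss(self, states, new_states, actions):
        states = T.tensor(states, dtype=T.float)
        new_states = T.tensor(new_states, dtype=T.float)
        actions = T.tensor(actions, dtype=T.long)

        phi = self.phi(states)
        phi_new = self.phi(new_states)

        # Inverse loss L_I: which action produced this transition?
        action_logits = self.inverse(T.cat([phi, phi_new], dim=1))
        L_I = (1 - self.beta) * F.cross_entropy(action_logits, actions)

        # Forward loss L_F: predict the next feature vector.
        one_hot = F.one_hot(actions, num_classes=self.n_actions).float()
        phi_new_pred = self.forward_model(T.cat([phi, one_hot], dim=1))
        L_F = self.beta * F.mse_loss(phi_new_pred, phi_new.detach())

        # Per-step intrinsic reward: scaled forward prediction error, detached so it
        # is used as a reward signal rather than a gradient path.
        intrinsic_reward = (self.alpha * 0.5 *
                            ((phi_new_pred - phi_new) ** 2).mean(dim=1)).detach()
        return intrinsic_reward, L_I, L_F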
def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())

    actor_critic.load_state_dict(torch.load("actor_critic_" + mode))

    def get_action(state):
        if state.ndim == 4:
            state = torch.FloatTensor(np.float32(state))
        else:
            state = torch.FloatTensor(np.float32(state)).unsqueeze(0)
        action = actor_critic.act(autograd.Variable(state, volatile=True))
        action = action.data.cpu().squeeze(1).numpy()
        return action

    def play_games(envs, frames):
        states = envs.reset()
        for frame_idx in range(frames):
            actions = get_action(states)
            next_states, rewards, dones, _ = envs.step(actions)
            yield frame_idx, states, actions, rewards, next_states, dones
            states = next_states

    reward_coef = 0.1
    num_updates = 5000

    losses = []
    all_rewards = []

    for frame_idx, states, actions, rewards, next_states, dones in tqdm(
            play_games(envs, num_updates), total=num_updates):
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)

        batch_size = states.size(0)

        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        inputs = autograd.Variable(torch.cat([states, onehot_actions], 1))
        # if USE_CUDA:
        #     inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)

        target_state = pix_to_target(next_states)
        target_state = autograd.Variable(torch.LongTensor(target_state))

        target_reward = rewards_to_target(mode, rewards)
        target_reward = autograd.Variable(torch.LongTensor(target_reward))

        optimizer.zero_grad()
        image_loss = criterion(imagined_state, target_state)
        reward_loss = criterion(imagined_reward, target_reward)
        loss = image_loss + reward_coef * reward_loss
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        all_rewards.append(np.mean(rewards))

        if frame_idx % num_updates == 0:
            plot(frame_idx, all_rewards, losses)

    torch.save(env_model.state_dict(), "env_model_" + mode)

    import time

    env = MiniPacman(mode, 1000)
    batch_size = 1

    done = False
    state = env.reset()
    iss = []
    ss = []

    steps = 0

    while not done:
        steps += 1
        actions = get_action(state)
        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        state = torch.FloatTensor(state).unsqueeze(0)

        inputs = autograd.Variable(torch.cat([state, onehot_actions], 1))
        # if USE_CUDA:
        #     inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)
        imagined_state = F.softmax(imagined_state)
        iss.append(imagined_state)

        next_state, reward, done, _ = env.step(actions[0])
        ss.append(state)
        state = next_state

        imagined_image = target_to_pix(
            imagined_state.view(batch_size, -1, len(pixels))[0].max(1)[1].data.cpu().numpy())
        imagined_image = imagined_image.reshape(15, 19, 3)
        state_image = torch.FloatTensor(next_state).permute(1, 2, 0).cpu().numpy()

        # clear_output()
        plt.figure(figsize=(10, 3))
        plt.subplot(131)
        plt.title("Imagined")
        plt.imshow(imagined_image)
        plt.subplot(132)
        plt.title("Actual")
        plt.imshow(state_image)
        plt.show()
        time.sleep(0.3)

        if steps > 30:
            break
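# --- Illustration (not part of the original source) ---
# What the one-hot trick above builds: the MiniPacman state tensor is
# (batch, 3, 15, 19); the chosen action is broadcast to a full (num_actions, 15, 19)
# stack of constant planes and concatenated along the channel axis, so the environment
# model sees "state channels + action planes". The num_actions value here is a
# placeholder; the script above takes it from envs.action_space.n.
import torch

batch_size, num_actions, H, W = 2, 5, 15, 19
states = torch.rand(batch_size, 3, H, W)
actions = torch.tensor([1, 4])

onehot_actions = torch.zeros(batch_size, num_actions, H, W)
onehot_actions[range(batch_size), actions] = 1         # fills the chosen action's plane with 1s

inputs = torch.cat([states, onehot_actions], dim=1)    # shape: (2, 3 + 5, 15, 19)
print(inputs.shape)                                    # torch.Size([2, 8, 15, 19])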