import os
from collections import deque

import numpy as np
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter  # or: from tensorboardX import SummaryWriter

# Actor, Critic, Discriminator, DialogEnvironment, the helpers
# (get_action, get_reward, get_cosine_sim, get_raw_action, train_discrim,
# train_actor_critic, save_checkpoint) and the parsed `args` / `device`
# globals are defined in the project's own modules.


def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1]  # model name
    torch.manual_seed(args.seed)

    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size, output_size=args.input_size)
    critic = Critic(hidden_size=args.hidden_size, num_layers=args.num_layers,
                    input_size=args.input_size, seq_len=args.seq_len)
    discrim = Discriminator(hidden_size=args.hidden_size, num_layers=args.num_layers,
                            input_size=args.input_size, seq_len=args.seq_len)
    actor.to(device)
    critic.to(device)
    discrim.to(device)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    writer = SummaryWriter(args.logdir)

    # Optionally resume from a saved checkpoint.
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        # Collect rollouts with the current policy.
        actor.eval(), critic.eval()
        memory = deque()
        steps = 0
        scores = []
        similarity_scores = []

        while steps < args.total_sample_size:
            state, expert_action, raw_state, raw_expert_action = env.reset()
            score = 0
            similarity_score = 0

            state = state[:args.seq_len, :].to(device)
            expert_action = expert_action[:args.seq_len, :].to(device)

            for _ in range(10000):
                steps += 1
                mu, std = actor(state.unsqueeze(0))  # add a batch dimension
                action = get_action(mu.cpu(), std.cpu())[0]

                # Zero out positions past the expert's padding so the generated
                # action has the same effective length.
                for i in range(5):
                    emb_sum = expert_action[i, :].sum().cpu().item()
                    if emb_sum == 0:
                        action[i:, :] = 0  # manual padding
                        break

                done = env.step(action)
                irl_reward = get_reward(discrim, state, action, args)
                mask = 0 if done else 1

                memory.append([state, torch.from_numpy(action).to(device),
                               irl_reward, mask, expert_action])

                score += irl_reward
                similarity_score += get_cosine_sim(expert=expert_action,
                                                   action=action.squeeze(), seq_len=5)
                if done:
                    break

            episodes += 1
            scores.append(score)
            similarity_scores.append(similarity_score)

        score_avg = np.mean(scores)
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            writer.add_scalar('log/expert_acc', float(expert_acc), iter)
            writer.add_scalar('log/learner_acc', float(learner_acc), iter)
            writer.add_scalar('log/avg_acc', float(learner_acc + expert_acc) / 2, iter)
            # Stop training the discriminator once it separates expert and
            # learner samples well enough (only checked if a threshold is set).
            if args.suspend_accu_exp is not None:
                if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                    train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        writer.add_scalar('log/score', float(score_avg), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        raw_action = get_raw_action(action)
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        # Periodically log a sample exchange and save a checkpoint.
        if iter % 100 == 0:
            score_avg = int(score_avg)
            result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
            with open(experiment_name + '.txt', 'a') as file_object:
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     experiment_name + '_ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'args': args,
                'score': score_avg,
            }, filename=ckpt_path)
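# The rollout loop above only needs a scalar IRL reward from the discriminator.
# The function below is a minimal, hypothetical sketch of what `get_reward`
# could look like, assuming the discriminator maps a (state, action) pair to
# the probability that it was produced by the learner (trained toward 1 for
# learner samples and 0 for expert samples); the discriminator call signature
# and the project's actual helper may differ.
import math

import torch


def get_reward_sketch(discrim, state, action, args):
    """Reward is large when the discriminator thinks the pair looks expert-like."""
    action = torch.as_tensor(action, device=state.device, dtype=state.dtype)
    with torch.no_grad():
        d = discrim(state.unsqueeze(0), action.unsqueeze(0))  # assumed signature
        prob = d.view(-1)[0].clamp(min=1e-8).item()
    # -log D(s, a): near 0 when D ~= 1 (clearly learner), large when D ~= 0.
    return -math.log(prob)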
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1]  # model name
    torch.manual_seed(args.seed)

    # Behaviour-cloning variant: only the actor is trained, with a supervised
    # regression loss against the expert actions.
    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size, output_size=args.input_size)
    actor.to(device)
    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)

    writer = SummaryWriter(args.logdir)

    # Optionally resume from a saved checkpoint.
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])

    episodes = 0

    for iter in range(args.max_iter_num):
        # Collect a batch of (state, expert_action) pairs from the environment.
        actor.eval()
        steps = 0
        states = []
        expert_actions = []

        while steps < args.batch_size:
            steps += 1
            state, expert_action, raw_state, raw_expert_action = env.reset()

            state = state[:args.seq_len, :].to(device)
            expert_action = expert_action[:args.seq_len, :].to(device)

            states.append(state)
            expert_actions.append(expert_action)
            episodes += 1

        states = torch.stack(states)
        expert_actions = torch.stack(expert_actions)
        actions_pred, _ = actor(states)

        # Track how close the predictions are to the expert turns.
        similarity_score_avg = np.mean(
            [get_cosine_sim(expert=e, action=a.detach().squeeze(), seq_len=5)
             for e, a in zip(expert_actions, actions_pred)])
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        # Supervised update: regress the predicted actions onto the expert actions.
        actor.train()
        loss = F.mse_loss(actions_pred, expert_actions)
        actor_optim.zero_grad()
        loss.backward()
        actor_optim.step()  # and this is basically all we need to do

        writer.add_scalar('log/loss', float(loss.item()), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        raw_action = get_raw_action(actions_pred[-1])
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        # Periodically log a sample exchange and save a checkpoint.
        if iter % 100 == 0:
            result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
            with open(experiment_name + '.txt', 'a') as file_object:
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     experiment_name + '_ckpt_' + str(iter) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'args': args,
                'score': float(similarity_score_avg),
            }, filename=ckpt_path)
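# Both training loops above report a cosine-similarity score between the
# generated action embeddings and the expert's. The helper below is a
# hypothetical stand-in for `get_cosine_sim`, assuming it averages the
# row-wise cosine similarity over the first `seq_len` positions; the project's
# actual implementation may mask or weight positions differently.
import torch
import torch.nn.functional as F


def get_cosine_sim_sketch(expert, action, seq_len=5):
    """Mean cosine similarity between corresponding rows of two (seq_len, dim) tensors."""
    expert = expert[:seq_len].detach().float()
    action = torch.as_tensor(action).to(expert.device)[:seq_len].detach().float()
    return F.cosine_similarity(expert, action, dim=1).mean().item()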
class DDPGAgent:

    def __init__(self, env, hp):
        self.env = env
        self.hp = hp
        self.critic = Critic(env.observation_space.shape[0], env.action_space.shape[0], hp)
        self.target_critic = Critic(env.observation_space.shape[0], env.action_space.shape[0], hp)
        self.actor = Actor(env.observation_space.shape[0], env.action_space.shape[0],
                           env.action_space.high[0], hp)
        self.target_actor = Actor(env.observation_space.shape[0], env.action_space.shape[0],
                                  env.action_space.high[0], hp)
        self.dataset = ReplayBuffer(self.hp['batch_size'], self.hp['max_buffer_size'])
        self.noise = OrnsteinUhlenbeckProcess(env.action_space.shape[0], sigma=self.hp['noise_sigma'])
        self.noise.reset_states()

    def take_action(self, state, greedy=False):
        state = torch.from_numpy(state).float()
        action = self.actor.predict(state)
        if greedy:
            return action.detach().numpy()
        # Exploration: add OU noise scaled to the action range.
        return action.detach().numpy() + (self.noise.sample() * self.env.action_space.high[0])

    def collect(self, n_episodes, max_episodes):
        """Run `n_episodes` greedy episodes, each capped at `max_episodes` steps,
        and return the mean episode reward."""
        state = self.env.reset()
        reward_list = []
        for _ in range(n_episodes):
            reward = 0
            for step in range(max_episodes):
                action = self.take_action(state, greedy=True)
                s_next, r, done, _ = self.env.step(action)
                state = s_next
                reward += r
                if done:
                    break
            reward_list.append(reward)
            state = self.env.reset()
        return np.mean(reward_list)

    def buffer_update(self, sample):
        self.dataset.add_sample(sample)

    def _critic_update(self, batch):
        s, a, r, s_next, done = batch
        # Bootstrapped TD target; `done` is a 0/1 mask that zeroes the
        # bootstrap term at terminal states.
        target_actions = self.target_actor.predict(s_next)
        Q_val = self.target_critic.predict(s_next, target_actions)
        y_target = r + done * (self.hp['gamma'] * Q_val)
        y_pred = self.critic.predict(s, a)
        self.critic.train(y_pred, y_target)

    def _actor_update(self, batch):
        s = batch[0]
        pred_a = self.actor.predict(s)
        # Deterministic policy gradient: maximise Q(s, pi(s)).
        loss = torch.mean(-self.critic.predict(s, pred_a))
        self.actor.train(loss)

    def update(self):
        if self.dataset.length < self.hp['batch_size']:
            return
        batch = self.dataset.get_batch()
        self._critic_update(batch)
        self._actor_update(batch)
        self._target_update(self.hp['tau'], self.target_critic, self.critic)
        self._target_update(self.hp['tau'], self.target_actor, self.actor)

    def _target_update(self, tau, target_network, network):
        # Polyak averaging of the online network into the target network.
        for target_param, param in zip(target_network.parameters(), network.parameters()):
            target_param.data.copy_(tau * param.data + target_param.data * (1.0 - tau))

    def save_models(self, episode):
        torch.save(self.target_actor.state_dict(), './trained_models/' + str(episode) + 'actor.pt')
        torch.save(self.target_critic.state_dict(), './trained_models/' + str(episode) + 'critic.pt')
        print('Models Saved!')

    def load_models(self, path):
        self.actor.load_state_dict(torch.load(path + 'actor.pt'))
        self.critic.load_state_dict(torch.load(path + 'critic.pt'))
        # Hard-copy the loaded weights into the target networks.
        self._target_update(1, self.target_actor, self.actor)
        self._target_update(1, self.target_critic, self.critic)
        print('Models Loaded!')
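# A minimal usage sketch for the agent above, assuming a classic Gym-style
# continuous-control task (reset returns the observation, step returns a
# 4-tuple). The hyperparameter keys mirror the ones DDPGAgent reads
# (`batch_size`, `max_buffer_size`, `noise_sigma`, `gamma`, `tau`); the
# environment name, episode limits, and the (s, a, r, s_next, mask) sample
# layout for the replay buffer are assumptions, not the repository's actual
# entry point.
import gym

hp = {'batch_size': 64, 'max_buffer_size': 100000, 'noise_sigma': 0.2,
      'gamma': 0.99, 'tau': 0.005}

env = gym.make('Pendulum-v1')
agent = DDPGAgent(env, hp)

for episode in range(200):
    state = env.reset()
    for step in range(200):
        action = agent.take_action(state)
        s_next, r, done, _ = env.step(action)
        # Store a 0/1 mask so _critic_update can zero the bootstrap at terminals.
        agent.buffer_update((state, action, r, s_next, 0.0 if done else 1.0))
        agent.update()
        state = s_next
        if done:
            break
    if episode % 20 == 0:
        print('eval reward:', agent.collect(5, 200))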