Example #1
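# NOTE: `args` (argparse namespace), `device`, and the imported helpers
# (Actor, Critic, Discriminator, DialogEnvironment, get_action, get_reward,
# get_cosine_sim, get_raw_action, train_discrim, train_actor_critic,
# save_checkpoint) are defined outside this excerpt.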
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1] #model name

    torch.manual_seed(args.seed)

    #TODO
    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size, output_size=args.input_size)
    critic = Critic(hidden_size=args.hidden_size, num_layers=args.num_layers,
                    input_size=args.input_size, seq_len=args.seq_len)
    discrim = Discriminator(hidden_size=args.hidden_size, num_layers=args.num_layers,
                            input_size=args.input_size, seq_len=args.seq_len)
    
    actor.to(device), critic.to(device), discrim.to(device)
    
    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None: #TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])


    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()
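        # Rollout buffer of (state, action, irl_reward, mask, expert_action) tuples.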

        steps = 0
        scores = []
        similarity_scores = []
        while steps < args.total_sample_size: 
            state, expert_action, raw_state, raw_expert_action = env.reset()
            score = 0
            similarity_score = 0
            state = state[:args.seq_len,:]
            expert_action = expert_action[:args.seq_len,:]
            state = state.to(device)
            expert_action = expert_action.to(device)
            for _ in range(10000): 

                steps += 1

                mu, std = actor(state.reshape(1, args.seq_len, args.input_size))  # add batch dimension
                action = get_action(mu.cpu(), std.cpu())[0]
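                # Rows of zeros in the expert embedding mark padding, so zero the
                # generated action from the first padded position onward.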
                for i in range(5):
                    emb_sum = expert_action[i, :].sum().cpu().item()
                    if emb_sum == 0:
                        action[i:, :] = 0  # manual padding
                        break

                done = env.step(action)
                irl_reward = get_reward(discrim, state, action, args)
                if done:
                    mask = 0
                else:
                    mask = 1


                memory.append([state, torch.from_numpy(action).to(device), irl_reward, mask,expert_action])
                score += irl_reward
                similarity_score += get_cosine_sim(expert=expert_action,action=action.squeeze(),seq_len=5)
                if done:
                    break

            episodes += 1
            scores.append(score)
            similarity_scores.append(similarity_score)

        score_avg = np.mean(scores)
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        actor.train(), critic.train(), discrim.train()
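        # GAIL-style alternation: first train the discriminator to separate
        # expert actions from generated ones (until the suspend-accuracy
        # thresholds are reached), then update the actor/critic on the
        # discriminator-based rewards stored in `memory`.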
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, args) 
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            writer.add_scalar('log/expert_acc', float(expert_acc), iter)
            writer.add_scalar('log/learner_acc', float(learner_acc), iter)
            writer.add_scalar('log/avg_acc', float(learner_acc + expert_acc) / 2, iter)
            if args.suspend_accu_exp is not None: #only if not None do we check.
                if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                    train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)
        writer.add_scalar('log/score', float(score_avg), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0],iter)
        raw_action = get_raw_action(action) #TODO
        writer.add_text('log/raw_action', raw_action,iter)
        writer.add_text('log/raw_expert_action', raw_expert_action,iter)

        if iter % 100 == 0:
            score_avg = int(score_avg)
            # Append the latest (state | generated action | expert action) sample
            # to a results file.
            with open(experiment_name + '.txt', 'a') as file_object:
                result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, experiment_name + '_ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'args': args,
                'score': score_avg,
            }, filename=ckpt_path)
Example #2
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1] #model name

    torch.manual_seed(args.seed)

    #TODO
    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size, output_size=args.input_size)
    
    actor.to(device)
    
    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
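    # Behavior-cloning variant of the example above: only the actor is trained,
    # supervised directly on the expert actions, so a single optimizer suffices.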
    # load demonstrations

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None: #TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
 

    
    episodes = 0


    for iter in range(args.max_iter_num):
        actor.eval()
 

        steps = 0
        similarity_scores = []
        states = []
        expert_actions = []
        while steps < args.batch_size:
            state, expert_action, raw_state, raw_expert_action = env.reset()

            state = state[:args.seq_len, :]
            expert_action = expert_action[:args.seq_len, :]
            state = state.to(device)
            expert_action = expert_action.to(device)
            states.append(state)
            expert_actions.append(expert_action)

            steps += 1
            episodes += 1

        # Run the actor over the whole collected batch at once.
        states = torch.stack(states)
        actions_pred, _ = actor(states)
        expert_actions = torch.stack(expert_actions)

        # Cosine similarity between each predicted sequence and its expert target.
        similarity_scores = [get_cosine_sim(expert=expert_actions[i],
                                            action=actions_pred[i].squeeze(),
                                            seq_len=5)
                             for i in range(expert_actions.size(0))]
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        actor.train()
        # Behavior cloning: regress the predicted action sequences onto the
        # expert action sequences with an MSE loss.
        loss = F.mse_loss(actions_pred, expert_actions)
        actor_optim.zero_grad()
        loss.backward()
        actor_optim.step()



        writer.add_scalar('log/loss', loss.item(), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        # Log the prediction for the last collected example; it corresponds to
        # the raw_state / raw_expert_action text kept from the final env.reset().
        raw_action = get_raw_action(actions_pred[-1])
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        if iter % 100 == 0:
            # Append the latest (state | predicted action | expert action) sample
            # to a results file.
            with open(experiment_name + '.txt', 'a') as file_object:
                result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            # Only the actor exists in this variant, so only its weights are saved;
            # the similarity score stands in for the RL score used above.
            ckpt_path = os.path.join(model_path,
                                     experiment_name + '_ckpt_' + '{:.3f}'.format(similarity_score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'args': args,
                'similarity_score': similarity_score_avg,
            }, filename=ckpt_path)
Example #3
class DDPGAgent():
    def __init__(self, env, hp):

        self.env = env
        self.hp = hp
        self.critic = Critic(env.observation_space.shape[0],
                             env.action_space.shape[0], hp)

        self.target_critic = Critic(env.observation_space.shape[0],
                                    env.action_space.shape[0], hp)

        self.actor = Actor(env.observation_space.shape[0],
                           env.action_space.shape[0], env.action_space.high[0],
                           hp)

        self.target_actor = Actor(env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  env.action_space.high[0], hp)

        self.dataset = ReplayBuffer(self.hp['batch_size'],
                                    self.hp['max_buffer_size'])

        self.noise = OrnsteinUhlenbeckProcess(env.action_space.shape[0],
                                              sigma=self.hp['noise_sigma'])
        self.noise.reset_states()

    def take_action(self, state, greedy=False):
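        # Greedy actions are used for evaluation; otherwise Ornstein-Uhlenbeck
        # noise scaled to the action range is added for exploration.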

        state = torch.from_numpy(state).float()
        action = self.actor.predict(state)
        if greedy:
            return action.detach().numpy()

        return action.detach().numpy() \
            + (self.noise.sample() * self.env.action_space.high[0])

    def collect(self, n_episodes, max_episodes):
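        # Evaluation rollouts: run n_episodes episodes of at most max_episodes
        # steps each with the greedy policy and return the mean episode return.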

        state = self.env.reset()
        reward_list = []

        for _ in range(n_episodes):
            reward = 0
            for step in range(max_episodes):
                action = self.take_action(state, greedy=True)
                s_next, r, done, _ = self.env.step(action)
                state = s_next
                reward += r
                if done:
                    break

            reward_list.append(reward)
            state = self.env.reset()

        return np.mean(reward_list)

    def buffer_update(self, sample):

        self.dataset.add_sample(sample)

    def _critic_update(self, batch):

        s = batch[0]
        a = batch[1]
        r = batch[2]
        s_next = batch[3]
        done = batch[4]
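        # One-step TD target: y = r + mask * gamma * Q_target(s', pi_target(s')).
        # `done` is assumed to hold a continuation mask (1 while the episode
        # continues, 0 at terminal states), so the bootstrap term vanishes at
        # episode ends.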
        target_actions = self.target_actor.predict(s_next)
        Q_val = self.target_critic.predict(s_next, target_actions)
        y_target = r + done * (self.hp['gamma'] * Q_val)
        y_pred = self.critic.predict(s, a)
        self.critic.train(y_pred, y_target)

    def _actor_update(self, batch):
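        # Deterministic policy gradient step: maximize Q(s, pi(s)) by
        # minimizing its negative mean.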

        s = batch[0]
        pred_a = self.actor.predict(s)
        loss = torch.mean(-self.critic.predict(s, pred_a))
        self.actor.train(loss)

    def update(self):

        if self.dataset.length < self.hp['batch_size']:
            return
        batch = self.dataset.get_batch()

        self._critic_update(batch)
        self._actor_update(batch)
        self._target_update(self.hp['tau'], self.target_critic, self.critic)
        self._target_update(self.hp['tau'], self.target_actor, self.actor)

    def _target_update(self, tau, target_network, network):
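        # Polyak soft update: target <- tau * online + (1 - tau) * target.
        # Called with tau=1 from load_models to hard-copy the online weights.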
        for target_param, param in zip(target_network.parameters(),
                                       network.parameters()):
            target_param.data.copy_(tau * param.data + target_param.data *
                                    (1.0 - tau))

    def save_models(self, episode):

        torch.save(self.target_actor.state_dict(),
                   './trained_models/' + str(episode) + 'actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './trained_models/' + str(episode) + 'critic.pt')
        print('Models Saved!')

    def load_models(self, path):

        self.actor.load_state_dict(torch.load(path + 'actor.pt'))
        self.critic.load_state_dict(torch.load(path + 'critic.pt'))
        self._target_update(1, self.target_actor, self.actor)
        self._target_update(1, self.target_critic, self.critic)
        print('Models Loaded!')
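
# --- Usage sketch (added; not part of the original example) ---
# A minimal training loop under stated assumptions: the classic Gym 4-tuple
# step API that the class itself uses, a hyperparameter dict with at least the
# keys read above (batch_size, max_buffer_size, gamma, tau, noise_sigma; the
# Actor/Critic constructors may require more), and ReplayBuffer.add_sample
# accepting (s, a, r, s_next, mask) tuples in the order get_batch returns them,
# with mask = 0 at terminal states. The env id and loop lengths are illustrative.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v0')
    hp = {'batch_size': 64, 'max_buffer_size': 100000,
          'gamma': 0.99, 'tau': 0.005, 'noise_sigma': 0.2}
    agent = DDPGAgent(env, hp)

    for episode in range(200):
        state = env.reset()
        agent.noise.reset_states()
        for step in range(500):
            action = agent.take_action(state)
            s_next, r, done, _ = env.step(action)
            # Continuation mask: 0 at terminal states, matching _critic_update.
            agent.buffer_update((state, action, r, s_next, 0.0 if done else 1.0))
            agent.update()
            state = s_next
            if done:
                break
        if episode % 50 == 0:
            print('episode', episode, 'eval return:', agent.collect(5, 500))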