예제 #1
0
    def collect_samples(self, min_batch_size):
        '''Collect a batch of samples with the main process plus workers.

        The requested size is divided evenly (rounded down) between
        ``self.num_threads`` samplers: ``self.num_threads - 1`` ``Worker``
        objects, which report back through a ``multiprocessing.Queue``, and
        one rollout executed directly in the calling process on
        ``self.env_list[0]``.

        Args:
            min_batch_size: Total number of samples requested; each sampler
                is asked for ``floor(min_batch_size / self.num_threads)``.

        Returns:
            A ``(batch, log)`` tuple. ``batch`` is ``memory.sample()`` taken
            after every worker memory was appended to the local one. ``log``
            is the local log (merged with the worker logs when more than one
            sampler is used) with the elapsed wall-clock time stored under
            ``'sample_time'``.
        '''
        t_start = time.time()
        # Sampling is done with the policy on the CPU.
        if use_gpu:
            self.policy = self.policy.cpu()
        try:
            thread_batch_size = int(
                math.floor(min_batch_size / self.num_threads))
            queue = multiprocessing.Queue()
            memory = Memory()

            # Environment 0 is reserved for the rollout in this process, so
            # worker i uses environment i + 1. Workers never render.
            workers = [
                Worker(queue, self.env_list[i + 1], self.policy,
                       self.custom_reward, self.tensor, False,
                       self.running_state, thread_batch_size)
                for i in range(self.num_threads - 1)
            ]
            for worker in workers:
                worker.start()

            # Sample in this process while the workers are running.
            log = collect_samples(self.env_list[0], memory, self.policy,
                                  self.custom_reward, self.tensor, self.render,
                                  self.running_state, True, thread_batch_size)

            # Each worker puts exactly one (memory, log) pair on the queue.
            worker_logs = []
            for _ in workers:
                worker_memory, worker_log = queue.get()
                memory.append(worker_memory)
                worker_logs.append(worker_log)
            batch = memory.sample()
            if self.num_threads > 1:
                log = merge_log([log] + worker_logs)
        finally:
            # Always move the policy back to the GPU, even when sampling
            # failed, so the caller never ends up with a CPU policy.
            if use_gpu:
                self.policy = self.policy.cuda()
        t_end = time.time()
        log['sample_time'] = t_end - t_start
        return batch, log
예제 #2
0
    def train_gail(self, expert):
        '''Train Info-GAIL.

        For each of ``args.num_epochs`` epochs:

        1. Roll out the current policy, one episode per sampled expert
           trajectory, until at least ``args.batch_size`` transitions have
           been collected. Every episode starts at the first state of the
           expert trajectory, runs for at most as many steps as that
           trajectory, and is conditioned on a randomly drawn one-hot
           context vector ``c`` of size ``self.num_goals``.
        2. Log reward statistics to ``self.logger.summary_writer``.
        3. Call ``self.update_params`` with the generated batch and
           ``args.num_expert_trajs`` freshly sampled expert trajectories.
        4. Pickle the accumulated results to ``results.pkl`` inside
           ``args.results_dir`` and, every ``args.save_interval`` epochs,
           save a model checkpoint.

        Args:
            expert: Source of expert data. Must provide ``sample(size=...)``;
                for ``size=1`` the result is unpacked as a 4-tuple whose
                first two items are indexable batches of state and action
                sequences.

        Raises:
            ValueError: If ``args.flag_true_reward`` is neither
                ``'grid_reward'`` nor ``'action_reward'``.
        '''
        args, dtype = self.args, self.dtype
        results = {
            'average_reward': [],
            'episode_reward': [],
            'true_traj': {},
            'pred_traj': {}
        }
        self.train_step_count, self.gail_step_count = 0, 0

        for ep_idx in range(args.num_epochs):
            memory = Memory()

            num_steps = 0
            reward_batch, true_reward_batch = [], []
            expert_true_reward_batch = []
            true_traj_curr_episode, gen_traj_curr_episode = [], []

            while num_steps < args.batch_size:
                traj_expert = expert.sample(size=1)
                state_expert, action_expert, _, _ = traj_expert

                # Expert state and actions (a single trajectory was sampled).
                state_expert = state_expert[0]
                action_expert = action_expert[0]
                expert_episode_len = len(state_expert)

                # The rollout starts from the first state of the expert
                # trajectory sampled above (sampling an independent start
                # state would be the alternative).
                curr_state_obj = State(state_expert[0], self.obstacles)
                curr_state_feat = self.get_state_features(
                    curr_state_obj, self.args.use_state_features)

                # Add history to state: a flat buffer of `history_size`
                # feature vectors, newest last, padded with -1 at the start.
                if args.history_size > 1:
                    feat_dim = curr_state_feat.shape[0]
                    curr_state = -1 * np.ones(
                        (args.history_size * feat_dim), dtype=np.float32)
                    curr_state[(args.history_size - 1) * feat_dim:] = \
                        curr_state_feat
                else:
                    curr_state = curr_state_feat

                # TODO: Make this a separate function. Can be parallelized.
                ep_reward, ep_true_reward, expert_true_reward = 0, 0, 0
                true_traj, gen_traj = [], []
                disc_reward, posterior_reward = 0.0, 0.0

                # Draw the context for this episode: a one-hot vector for the
                # policy input and its index as a LongTensor target for the
                # posterior criterion.
                c_sampled = np.zeros((self.num_goals), dtype=np.float32)
                c_sampled[np.random.randint(0, self.num_goals)] = 1.0
                c_sampled_tensor = torch.zeros((1)).type(torch.LongTensor)
                c_sampled_tensor[0] = int(np.argmax(c_sampled))
                if self.args.cuda:
                    c_sampled_tensor = torch.cuda.LongTensor(c_sampled_tensor)

                # The last expert state is used as the goal for the true
                # reward; it does not change during the episode.
                true_goal_state = [
                    int(x) for x in state_expert[-1].tolist()
                ]

                # Use a plain list to gather experience first; it is pushed
                # to the memory object once the episode is complete.
                memory_list = []
                for t in range(expert_episode_len):
                    action = self.select_action(
                        np.concatenate((curr_state, c_sampled)))
                    action_numpy = action.data.cpu().numpy()

                    # Save generated and true trajectories
                    true_traj.append((state_expert[t], action_expert[t]))
                    gen_traj.append((curr_state_obj.coordinates, action_numpy))

                    # Exploration: the action actually executed may differ
                    # from the policy output recorded above.
                    action = epsilon_greedy_linear_decay(action_numpy,
                                                         args.num_epochs * 0.5,
                                                         ep_idx,
                                                         self.action_size,
                                                         low=0.05,
                                                         high=0.3)
                    action_onehot = oned_to_onehot(action, self.action_size)

                    # Get the discriminator reward for (state, action)
                    state_var = Variable(
                        torch.from_numpy(curr_state).unsqueeze(0)).type(dtype)
                    action_var = Variable(
                        torch.from_numpy(action_onehot).unsqueeze(0)
                    ).type(dtype)
                    disc_output = self.reward_net(
                        torch.cat((state_var, action_var), 1))
                    disc_reward_t = float(disc_output.data.cpu().numpy()[0, 0])

                    if args.use_log_rewards:
                        # Keep the argument of the log away from zero.
                        if disc_reward_t < 1e-6:
                            disc_reward_t += 1e-6
                        disc_reward_t = -math.log(disc_reward_t)
                    else:
                        disc_reward_t = -disc_reward_t
                    disc_reward += disc_reward_t

                    # Predict c given (x_t)
                    predicted_posterior = self.posterior_net(state_var)
                    posterior_reward_t = self.criterion_posterior(
                        predicted_posterior,
                        Variable(c_sampled_tensor)).data.cpu().numpy()[0]

                    posterior_reward += (self.args.lambda_posterior *
                                         posterior_reward_t)

                    # Update Rewards
                    ep_reward += (disc_reward_t + posterior_reward_t)
                    if self.args.flag_true_reward == 'grid_reward':
                        ep_true_reward += self.true_reward.reward_at_location(
                            curr_state_obj.coordinates,
                            goals=[true_goal_state])
                        expert_true_reward += self.true_reward.reward_at_location(
                            state_expert[t], goals=[true_goal_state])
                    elif self.args.flag_true_reward == 'action_reward':
                        ep_true_reward += self.true_reward.reward_at_location(
                            np.argmax(action_expert[t]), action)
                        expert_true_reward += self.true_reward.corret_action_reward
                    else:
                        raise ValueError("Incorrect true reward type")

                    # Update next state
                    next_state_obj = self.transition_func(
                        curr_state_obj, Action(action), 0)
                    next_state_feat = self.get_state_features(
                        next_state_obj, self.args.use_state_features)

                    # mask == 0 marks the final transition of the episode.
                    mask = 0 if t == expert_episode_len - 1 else 1

                    # Push to memory. The state is copied because the history
                    # buffer is updated in place below; storing a reference
                    # would make every entry of this episode show the final
                    # step's contents.
                    # NOTE(review): the stored next state is the plain feature
                    # vector, without history - confirm this is what
                    # update_params expects when history_size > 1.
                    memory_list.append([
                        curr_state.copy(),
                        np.array([action_onehot]), mask,
                        next_state_feat, disc_reward_t + posterior_reward_t,
                        c_sampled, c_sampled
                    ])

                    if args.render:
                        # NOTE(review): `env` is not defined in this method;
                        # presumably a module-level name - confirm.
                        env.render()

                    if not mask:
                        break

                    curr_state_obj = next_state_obj
                    curr_state_feat = next_state_feat

                    if args.history_size > 1:
                        # Shift the history left by one feature vector and
                        # write the newest features into the last slot.
                        feat_dim = curr_state_feat.shape[0]
                        split = (args.history_size - 1) * feat_dim
                        curr_state[:split] = curr_state[feat_dim:]
                        curr_state[split:] = curr_state_feat
                    else:
                        curr_state = curr_state_feat

                assert memory_list[-1][2] == 0, \
                        "Mask for final end state is not 0."
                for memory_t in memory_list:
                    memory.push(*memory_t)

                self.logger.summary_writer.add_scalars(
                    'gen_traj/gen_reward', {
                        'discriminator': disc_reward,
                        'posterior': posterior_reward,
                    }, self.train_step_count)

                # Count the transitions actually collected so that the loop
                # always makes progress, even for very short trajectories.
                num_steps += len(memory_list)
                reward_batch.append(ep_reward)
                true_reward_batch.append(ep_true_reward)
                expert_true_reward_batch.append(expert_true_reward)
                results['episode_reward'].append(ep_reward)

                # Append trajectories
                true_traj_curr_episode.append(true_traj)
                gen_traj_curr_episode.append(gen_traj)

            results['average_reward'].append(np.mean(reward_batch))

            # Add to tensorboard
            self.logger.summary_writer.add_scalars(
                'gen_traj/reward', {
                    'average': np.mean(reward_batch),
                    'max': np.max(reward_batch),
                    'min': np.min(reward_batch)
                }, self.train_step_count)
            self.logger.summary_writer.add_scalars(
                'gen_traj/true_reward', {
                    'average': np.mean(true_reward_batch),
                    'max': np.max(true_reward_batch),
                    'min': np.min(true_reward_batch),
                    'expert_true': np.mean(expert_true_reward_batch)
                }, self.train_step_count)

            # Add predicted and generated trajectories to results
            if ep_idx % self.args.save_interval == 0:
                results['true_traj'][ep_idx] = copy.deepcopy(
                    true_traj_curr_episode)
                results['pred_traj'][ep_idx] = copy.deepcopy(
                    gen_traj_curr_episode)

            # Update parameters
            gen_batch = memory.sample()

            # We do not get the context variable from expert trajectories.
            # Hence we need to fill it in later.
            expert_batch = expert.sample(size=args.num_expert_trajs)

            self.update_params(gen_batch, expert_batch, ep_idx,
                               args.optim_epochs, args.optim_batch_size)

            self.train_step_count += 1

            if ep_idx > 0 and ep_idx % args.log_interval == 0:
                print('Episode [{}/{}]  Avg R: {:.2f}   Max R: {:.2f} \t' \
                      'True Avg {:.2f}   True Max R: {:.2f}   ' \
                      'Expert (Avg): {:.2f}'.format(
                          ep_idx, args.num_epochs, np.mean(reward_batch),
                          np.max(reward_batch), np.mean(true_reward_batch),
                          np.max(true_reward_batch),
                          np.mean(expert_true_reward_batch)))

            # The results file is rewritten after every epoch so that a
            # partial run still leaves usable results on disk.
            results_path = os.path.join(args.results_dir, 'results.pkl')
            with open(results_path, 'wb') as results_f:
                pickle.dump((results), results_f, protocol=2)

            if ep_idx % args.save_interval == 0:
                checkpoint_filepath = self.model_checkpoint_filepath(ep_idx)
                torch.save(self.checkpoint_data_to_save(), checkpoint_filepath)
                print("Did save checkpoint: {}".format(checkpoint_filepath))