Example #1
    def pretrain_q_with_bc_data(self):
        logger.remove_tabular_output('progress.csv',
                                     relative_to_snapshot_dir=True)
        logger.add_tabular_output('pretrain_q.csv',
                                  relative_to_snapshot_dir=True)
        self.update_policy = False
        # first train only the Q function
        for i in range(self.q_num_pretrain_steps):
            self.eval_statistics = dict()
            self._need_to_update_eval_statistics = True

            train_data = self.replay_buffer.random_batch(128)
            train_data = np_to_pytorch_batch(train_data)
            obs = train_data['observations']
            next_obs = train_data['next_observations']
            if self.goal_conditioned:
                goals = train_data['resampled_goals']
                train_data['observations'] = torch.cat((obs, goals), dim=1)
                train_data['next_observations'] = torch.cat((next_obs, goals),
                                                            dim=1)
            self.train_from_torch(train_data)

            logger.record_dict(self.eval_statistics)
            logger.dump_tabular(with_prefix=True, with_timestamp=False)

        self.update_policy = True
        # then train policy and Q function together
        for i in range(self.q_num_pretrain_steps):
            self.eval_statistics = dict()
            self._need_to_update_eval_statistics = True

            train_data = self.replay_buffer.random_batch(128)
            train_data = np_to_pytorch_batch(train_data)
            obs = train_data['observations']
            next_obs = train_data['next_observations']
            if self.goal_conditioned:
                goals = train_data['resampled_goals']
                train_data['observations'] = torch.cat((obs, goals), dim=1)
                train_data['next_observations'] = torch.cat((next_obs, goals),
                                                            dim=1)
            self.train_from_torch(train_data)

            logger.record_dict(self.eval_statistics)
            logger.dump_tabular(with_prefix=True, with_timestamp=False)

        logger.remove_tabular_output(
            'pretrain_q.csv',
            relative_to_snapshot_dir=True,
        )
        logger.add_tabular_output(
            'progress.csv',
            relative_to_snapshot_dir=True,
        )
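
The conversion helper itself is not part of any of these listings. As a rough sketch only (an assumption about its behavior, not the library's actual implementation), np_to_pytorch_batch can be thought of as casting each NumPy array in the sampled dict to a float torch tensor on the configured device:

    import numpy as np
    import torch

    def np_to_pytorch_batch_sketch(np_batch, device='cpu'):
        # Hypothetical stand-in for the real helper: convert every NumPy
        # array in the batch dict to a float tensor on the given device.
        return {
            k: torch.from_numpy(v).float().to(device)
            for k, v in np_batch.items()
            if isinstance(v, np.ndarray)
        }
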
Example #2
    def random_batch(self, batch_size):
        traj_i = np.random.choice(self.size, batch_size)
        trans_i = np.random.choice(self.traj_length - 1, batch_size)

        try:
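            # use the dataset's dedicated 'env' image when it is available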
            env = normalize_image(self.data['env'][traj_i, :])
        except KeyError:
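            # otherwise fall back to the first frame of each sampled trajectory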
            env = normalize_image(self.data['observations'][traj_i, 0, :])

        x_t = normalize_image(self.data['observations'][traj_i, trans_i, :])
        x_next = normalize_image(self.data['observations'][traj_i,
                                                           trans_i + 1, :])

        episode_num = np.random.randint(0, self.size)
        episode_obs = normalize_image(
            self.data['observations'][episode_num, :8, :])

        data_dict = {
            'x_t': x_t,
            'x_next': x_next,
            'env': env,
            'actions': self.data['actions'][traj_i, trans_i, :],
            'episode_obs': episode_obs,
            'episode_acts': self.data['actions'][episode_num, :7, :],
        }
        return np_to_pytorch_batch(data_dict)
Example #3
    def random_batch(self, batch_size):
        env_i = np.random.choice(self.num_envs, batch_size)
        trans_i = np.random.choice(self.sample_size, batch_size)

        match_i = np.random.choice(self.num_envs, batch_size)
        trans_x = np.random.choice(self.sample_size, batch_size)
        trans_y = np.random.choice(self.sample_size, batch_size)

        rand_a = np.random.choice(self.num_envs - 1, batch_size // 2)
        rand_b = np.add(rand_a, np.ones(batch_size // 2)).astype(int)

        trans_m = np.random.choice(self.sample_size, batch_size // 2)
        trans_n = np.random.choice(self.sample_size, batch_size // 2)

        matches = np.random.uniform(0, 0.1, batch_size // 2)
        nonmatches = np.random.uniform(0.9, 1, batch_size // 2)
        swap_count = int(batch_size * 0.05)

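        # label noise: swap targets for a small fraction of match / non-match pairs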
        matches[:swap_count], nonmatches[:swap_count] = \
            nonmatches[:swap_count], matches[:swap_count]
        labels = np.concatenate([matches, nonmatches])

        data_dict = {
            'observations': self.data['observations'][env_i, trans_i, :],
            'env_set_1': self.data['observations'][match_i, trans_x, :],
            'env_set_2': self.data['observations'][match_i, trans_y, :],
        }

        return np_to_pytorch_batch(data_dict)
Example #4
 def random_batch(self, batch_size):
     i = np.random.choice(self.size,
                          batch_size,
                          replace=(self.size < batch_size))
     data_dict = {
         'observations': self.data[i, :],
     }
     return np_to_pytorch_batch(data_dict)
Example #5
 def get_test_batch(self):
     batch = self.test_replay_buffer.random_batch(self.batch_size)
     batch = np_to_pytorch_batch(batch)
     obs = batch['observations']
     next_obs = batch['next_observations']
     goals = batch['resampled_goals']
     batch['observations'] = torch.cat((obs, goals), dim=1)
     batch['next_observations'] = torch.cat((next_obs, goals), dim=1)
     return batch
Example #6
    def random_batch(self, batch_size):
        traj_i = np.random.choice(np.arange(self.size), batch_size)
        trans_i = np.random.choice(np.arange(self.traj_length - 1), batch_size)
        data_dict = {
            'observations': self.data['observations'][traj_i, trans_i, :],
            'next_observations': self.data['observations'][traj_i,
                                                           trans_i + 1, :],
            'actions': self.data['actions'][traj_i, trans_i, :],
        }

        return np_to_pytorch_batch(data_dict)
Example #7
 def random_batch(self, batch_size):
     i = np.random.choice(self.size,
                          batch_size,
                          replace=(self.size < batch_size))
     obs = self.data[i, :]
     if self.normalize:
         obs = normalize_image(obs)
     data_dict = {
         'observations': obs,
     }
     return np_to_pytorch_batch(data_dict)
Example #8
 def get_batch(self):
     sample_size = self.batch_size // 2
     batch1 = self.replay_buffer1().random_batch(sample_size)
     batch2 = self.replay_buffer2().random_batch(sample_size)
     new_batch = {}
     for k, v in batch1.items():
         new_batch[k] = np.concatenate(
             (v, batch2[k]),
             axis=0,
         )
     return np_to_pytorch_batch(new_batch)
Example #9
    def random_batch(self, batch_size):
        num_traj = self.replay_buffer._size // self.horizon
        traj_i = np.random.choice(num_traj, batch_size)
        trans_i = np.random.choice(self.horizon - 2, batch_size)

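        # map (trajectory, timestep) pairs to flat indices into the replay buffer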
        indices = traj_i * self.horizon + trans_i
        batch = dict(
            x0=self.replay_buffer._obs["image_observation"][indices],
            x1=self.replay_buffer._obs["image_observation"][indices + 1],
            x2=self.replay_buffer._obs["image_observation"][indices + 2],
        )
        return np_to_pytorch_batch(batch)
Example #10
 def _statistics_from_paths(self, paths, stat_prefix):
     rewards, terminals, obs, actions, next_obs = split_paths(paths)
     np_batch = dict(
         rewards=rewards,
         terminals=terminals,
         observations=obs,
         actions=actions,
         next_observations=next_obs,
     )
     batch = np_to_pytorch_batch(np_batch)
     statistics = self._statistics_from_batch(batch, stat_prefix)
     statistics.update(
         create_stats_ordered_dict('Num Paths',
                                   len(paths),
                                   stat_prefix=stat_prefix))
     return statistics
Example #11
 def get_batch_from_buffer(self, replay_buffer):
     batch = replay_buffer.random_batch(self.bc_batch_size)
     batch = np_to_pytorch_batch(batch)
     # obs = batch['observations']
     # next_obs = batch['next_observations']
     # goals = batch['resampled_goals']
     # import ipdb; ipdb.set_trace()
     # batch['observations'] = torch.cat((
     #     obs,
     #     goals
     # ), dim=1)
     # batch['next_observations'] = torch.cat((
     #     next_obs,
     #     goals
     # ), dim=1)
     return batch
Example #12
    def get_batch(self):
        batch = self.replay_buffer.random_batch_random_tau(
            self.batch_size, self.max_tau)
        """
        Update the goal states/rewards
        """
        num_steps_left = self._sample_taus_for_training(batch)
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        goals = batch['training_goals']
        rewards = self._compute_rewards_np(batch, obs, actions, next_obs,
                                           goals)
        terminals = batch['terminals']

        # 'all_valid' pairs every transition with every tau in [0, max_tau],
        # so each array is repeated (max_tau + 1) times along the batch axis
        if self.tau_sample_strategy == 'all_valid':
            obs = np.repeat(obs, self.max_tau + 1, 0)
            actions = np.repeat(actions, self.max_tau + 1, 0)
            next_obs = np.repeat(next_obs, self.max_tau + 1, 0)
            goals = np.repeat(goals, self.max_tau + 1, 0)
            rewards = np.repeat(rewards, self.max_tau + 1, 0)
            terminals = np.repeat(terminals, self.max_tau + 1, 0)

        if self.finite_horizon:
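            # a transition also counts as terminal once no steps remain (num_steps_left == 0)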
            terminals = 1 - (1 - terminals) * (num_steps_left != 0)
        if self.terminate_when_goal_reached:
            diff = self.env.convert_obs_to_goals(next_obs) - goals
            goal_not_reached = (np.linalg.norm(diff, axis=1, keepdims=True) >
                                self.goal_reached_epsilon)
            terminals = 1 - (1 - terminals) * goal_not_reached

        if not self.dense_rewards:
            rewards = rewards * terminals
        """
        Update the batch
        """
        batch['rewards'] = rewards
        batch['terminals'] = terminals
        batch['actions'] = actions
        batch['num_steps_left'] = num_steps_left
        batch['goals'] = goals
        batch['observations'] = obs
        batch['next_observations'] = next_obs

        return np_to_pytorch_batch(batch)
Example #13
    def get_batch(self, training=True):
        if self.replay_buffer_is_split:
            replay_buffer = self.replay_buffer.get_replay_buffer(training)
        else:
            replay_buffer = self.replay_buffer
        batch = replay_buffer.random_batch(self.batch_size)
        """
        Update the goal states/rewards
        """
        num_steps_left = np.random.randint(0, self.max_tau + 1,
                                           (self.batch_size, 1))
        terminals = 1 - (1 - batch['terminals']) * (num_steps_left != 0)
        batch['terminals'] = terminals

        obs = batch['observations']
        next_obs = batch['next_observations']
        if self.sample_train_goals_from == 'her':
            goals = batch['goals']
        else:
            goals = self._sample_goals_for_training()
        goal_differences = np.abs(
            self.env.convert_obs_to_goals(next_obs)
            # - self.env.convert_obs_to_goals(obs)
            - goals)
        batch['goal_differences'] = goal_differences * self.reward_scale
        batch['goals'] = goals
        """
        Update the observations
        """
        batch['observations'] = merge_into_flat_obs(
            obs=batch['observations'],
            goals=batch['goals'],
            num_steps_left=num_steps_left,
        )
        batch['next_observations'] = merge_into_flat_obs(
            obs=batch['next_observations'],
            goals=batch['goals'],
            num_steps_left=num_steps_left - 1,
        )

        return np_to_pytorch_batch(batch)
Example #14
    def random_batch(self, batch_size):
        traj_i = np.random.choice(self.size, batch_size)
        trans_i = np.random.choice(self.traj_length, batch_size)
        # conditioning = np.random.choice(self.traj_length, batch_size)
        # env = normalize_image(self.data['observations'][traj_i, conditioning, :])
        try:
            env = normalize_image(self.data['env'][traj_i, :])
        except KeyError:
            env = normalize_image(self.data['observations'][traj_i, 0, :])
        x_t = normalize_image(self.data['observations'][traj_i, trans_i, :])

        episode_num = np.random.randint(0, self.size)
        episode_obs = normalize_image(
            self.data['observations'][episode_num, :8, :])

        data_dict = {
            'x_t': x_t,
            'env': env,
            'episode_obs': episode_obs,
        }
        return np_to_pytorch_batch(data_dict)
Example #15
 def fix_data_set(self):
     for training in [True, False]:
         replay_buffer = self.replay_buffer.get_replay_buffer(training)
         batch_dict = {}
         for i in range(self.num_unique_batches):
             batch_size = min(
                 replay_buffer.num_steps_can_sample(),
                 self.batch_size
             )
             batch = replay_buffer.random_batch(batch_size)
             goal_states = self.sample_goal_states(batch_size, training)
             new_rewards = self.env.compute_rewards(
                 batch['observations'],
                 batch['actions'],
                 batch['next_observations'],
                 goal_states,
             )
             batch['goal_states'] = goal_states
             batch['rewards'] = new_rewards
             torch_batch = np_to_pytorch_batch(batch)
             batch_dict[i] = torch_batch
         self.mode_to_batch_iterator[training] = create_batch_iterator(
             self.num_unique_batches, batch_dict
         )
Example #16
    def _do_training(self):
        beta = self.per_beta_schedule.get_value(self._n_train_steps_total, )
        batches = []
        if self.train_with in ['on_policy', 'both']:
            batches.append(
                self.replay_buffer.most_recent_path_batch(beta=beta))
        if self.train_with in ['off_policy', 'both']:
            batches.append(
                self.replay_buffer.random_batch(
                    self.batch_size,
                    beta=beta,
                ))
        for np_batch in batches:
            next_obs = np_batch['next_observations']
            goals = np_batch['goals']
            terminals = np_batch['terminals']
            indices = np_batch['indices']
            events = self.detect_event(next_obs, goals)
            np_batch['events'] = events
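            # OR on 0/1 arrays: terminal if it was already terminal or an event fired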
            terminals = 1 - (1 - terminals) * (1 - events)
            if self.finite_horizon:
                terminals = 1 - (1 - terminals) * (
                    np_batch['num_steps_left'] != 0)
            np_batch['terminals'] = terminals
            batch = np_to_pytorch_batch(np_batch)

            # self.train_batches.append(batch)

            terminals = batch['terminals']
            obs = batch['observations']
            actions = batch['actions']
            next_obs = batch['next_observations']
            num_steps_left = batch['num_steps_left']
            goals = batch['goals']
            events = batch['events']
            if self.finite_horizon:
                next_num_steps_left = num_steps_left - 1
            else:
                next_num_steps_left = num_steps_left

            # next_actions = self.target_policy(
            #     observations=next_obs,
            #     goals=goals,
            #     num_steps_left=next_num_steps_left,
            # )
            # noise = torch.normal(
            #     torch.zeros_like(next_actions),
            #     self.target_policy_noise,
            # )
            # noise = torch.clamp(
            #     noise,
            #     -self.target_policy_noise_clip,
            #     self.target_policy_noise_clip
            # )
            # noisy_next_actions = next_actions + noise
            # next_beta_1 = self.target_beta_q(
            #     observations=next_obs,
            #     actions=noisy_next_actions,
            #     goals=goals,
            #     num_steps_left=next_num_steps_left,
            # )
            # next_beta_2 = self.target_beta_q2(
            #     observations=next_obs,
            #     actions=noisy_next_actions,
            #     goals=goals,
            #     num_steps_left=next_num_steps_left,
            # )
            # next_beta = torch.min(next_beta_1, next_beta_2)
            # noisy_next_actions = self.policy(
            #     observations=next_obs,
            #     goals=goals,
            #     num_steps_left=next_num_steps_left,
            # )
            next_actions = self.policy(
                observations=next_obs,
                goals=goals,
                num_steps_left=next_num_steps_left,
            )
            next_beta = self.beta_q(
                observations=next_obs,
                actions=next_actions,
                goals=goals,
                num_steps_left=next_num_steps_left,
            )
            if not self.finite_horizon:
                next_beta = next_beta * self.discount
            targets = (terminals * events +
                       (1 - terminals) * next_beta).detach()
            predictions = self.beta_q(obs, actions, goals, num_steps_left)
            if self.prioritized_replay:
                weights = ptu.from_numpy(np_batch['is_weights']).float()
                self.q_criterion.weight = weights
                priorities = ptu.get_numpy(torch.abs(predictions - targets))
                self.replay_buffer.update_priorities(indices, priorities)

            beta_q_loss = self.q_criterion(predictions, targets)

            # predictions2 = self.beta_q2(obs, actions, goals, num_steps_left)
            # beta_q2_loss = self.q_criterion(predictions2, targets)
            # self.beta_q2_optimizer.zero_grad()
            # beta_q2_loss.backward()

            policy_actions = self.policy(obs, goals, num_steps_left)

            policy_actions.register_hook(
                self.create_save_gradient_norm_hook('dQ/da'))
            beta_q_output = self.beta_q(
                observations=obs,
                actions=policy_actions,
                goals=goals,
                num_steps_left=num_steps_left,
            )
            beta_v_loss = self.v_criterion(
                self.beta_v(obs, goals, num_steps_left),
                beta_q_output.detach())
            self.beta_v_optimizer.zero_grad()
            beta_v_loss.backward()
            self.beta_v_optimizer.step()

            policy_loss = -beta_q_output.mean()
            if self.training_policy or self.train_simultaneously:
                self.policy_optimizer.zero_grad()
                policy_loss.backward()
                policy_grad_norms = []
                for param in self.policy.parameters():
                    policy_grad_norms.append(param.grad.data.norm())
                self.eval_statistics.update(
                    create_stats_ordered_dict(
                        'Policy Gradient Norms',
                        policy_grad_norms,
                    ))
                self.policy_optimizer.step()
            if not self.training_policy or self.train_simultaneously:
                self.beta_q_optimizer.zero_grad()
                beta_q_loss.backward()
                beta_q_grad_norms = []
                for param in self.beta_q.parameters():
                    beta_q_grad_norms.append(param.grad.data.norm())
                self.eval_statistics.update(
                    create_stats_ordered_dict(
                        'Beta Q Gradient Norms',
                        beta_q_grad_norms,
                    ))
                self.beta_q_optimizer.step()
                # self.beta_q2_optimizer.step()
            if self._n_train_steps_total % self.flip_training_period == 0:
                self.training_policy = not self.training_policy
            # ptu.soft_update_from_to(
            #     self.policy, self.target_policy, self.soft_target_tau
            # )
            # ptu.soft_update_from_to(
            #     self.beta_q, self.target_beta_q, self.soft_target_tau
            # )
            # ptu.soft_update_from_to(
            #     self.beta_q2, self.target_beta_q2, self.soft_target_tau
            # )
            if self.need_to_update_eval_statistics:
                self.need_to_update_eval_statistics = False
                self.eval_statistics['Policy Loss'] = np.mean(
                    ptu.get_numpy(policy_loss))
                self.eval_statistics['Beta Q Loss'] = np.mean(
                    ptu.get_numpy(beta_q_loss))
                self.eval_statistics['Beta V Loss'] = np.mean(
                    ptu.get_numpy(beta_v_loss))
                self.eval_statistics.update(
                    create_stats_ordered_dict(
                        'Beta Q Targets',
                        ptu.get_numpy(targets),
                    ))
                self.eval_statistics.update(
                    create_stats_ordered_dict(
                        'Beta Q Predictions',
                        ptu.get_numpy(predictions),
                    ))
                # self.eval_statistics.update(create_stats_ordered_dict(
                #     'Beta Q1 - Q2',
                #     ptu.get_numpy(next_beta_1 - next_beta_2),
                # ))
                real_goal = np.array([0., 4.])
                is_real_goal = (np_batch['goals'] == real_goal).all(axis=1)
                goal_is_corner = (np.abs(np_batch['goals']) == 4).all(axis=1)
                self.eval_statistics['Event Prob'] = np_batch['events'].mean()
                self.eval_statistics['Goal is Current Obs Prob'] = (
                    self.detect_event(
                        np_batch['observations'],
                        np_batch['goals'],
                    ).mean())
                self.eval_statistics['Training Goal is (0, 4) Prob'] = (
                    is_real_goal.mean())
                self.eval_statistics['Training Goal is Corner'] = (
                    goal_is_corner.mean())
                self.eval_statistics.update(self.extra_eval_statistics)
                self.extra_eval_statistics = OrderedDict()
Example #17
 def get_batch(self):
     batch = self.replay_buffer.random_batch_random_tau(
         self.batch_size, max_tau=self.time_horizon)
     return np_to_pytorch_batch(batch)
Example #18
 def train(self, np_batch):
     self._num_train_steps += 1
     batch = np_to_pytorch_batch(np_batch)
     self.train_from_torch(batch)
Example #19
    def pretrain_q_with_bc_data(self):
        logger.remove_tabular_output(
            'progress.csv', relative_to_snapshot_dir=True
        )
        logger.add_tabular_output(
            'pretrain_q.csv', relative_to_snapshot_dir=True
        )

        self.update_policy = False
        # first train only the Q function
        for i in range(self.q_num_pretrain1_steps):
            self.eval_statistics = dict()

            train_data = self.replay_buffer.random_batch(self.bc_batch_size)
            train_data = np_to_pytorch_batch(train_data)
            obs = train_data['observations']
            next_obs = train_data['next_observations']
            # goals = train_data['resampled_goals']
            train_data['observations'] = obs # torch.cat((obs, goals), dim=1)
            train_data['next_observations'] = next_obs # torch.cat((next_obs, goals), dim=1)
            self.train_from_torch(train_data)
            if i % self.pretraining_logging_period == 0:
                logger.record_dict(self.eval_statistics)
                logger.dump_tabular(with_prefix=True, with_timestamp=False)

        self.update_policy = True
        # then train policy and Q function together
        prev_time = time.time()
        for i in range(self.q_num_pretrain2_steps):
            self.eval_statistics = dict()
            if i % self.pretraining_logging_period == 0:
                self._need_to_update_eval_statistics = True
            train_data = self.replay_buffer.random_batch(self.bc_batch_size)
            train_data = np_to_pytorch_batch(train_data)
            obs = train_data['observations']
            next_obs = train_data['next_observations']
            # goals = train_data['resampled_goals']
            train_data['observations'] = obs # torch.cat((obs, goals), dim=1)
            train_data['next_observations'] = next_obs # torch.cat((next_obs, goals), dim=1)
            self.train_from_torch(train_data)
            if self.do_pretrain_rollouts and i % self.pretraining_env_logging_period == 0:
                total_ret = self.do_rollouts()
                print("Return at step {} : {}".format(i, total_ret/20))

            if i % self.pretraining_logging_period == 0:
                if self.do_pretrain_rollouts:
                    self.eval_statistics["pretrain_bc/avg_return"] = total_ret / 20
                self.eval_statistics["batch"] = i
                self.eval_statistics["epoch_time"] = time.time() - prev_time
                logger.record_dict(self.eval_statistics)
                logger.dump_tabular(with_prefix=True, with_timestamp=False)
                prev_time = time.time()

        logger.remove_tabular_output(
            'pretrain_q.csv',
            relative_to_snapshot_dir=True,
        )
        logger.add_tabular_output(
            'progress.csv',
            relative_to_snapshot_dir=True,
        )

        self._need_to_update_eval_statistics = True
        self.eval_statistics = dict()

        if self.post_pretrain_hyperparams:
            self.set_algorithm_weights(**self.post_pretrain_hyperparams)
Example #20
 def get_batch_from_buffer(self, replay_buffer, batch_size):
     batch = replay_buffer.random_batch(batch_size)
     batch = np_to_pytorch_batch(batch)
     return batch
Example #21
 def get_batch(self):
     batch = self.replay_buffer.random_batch(self.batch_size)
     return np_to_pytorch_batch(batch)
Example #22
 def get_batch(self):
     batch = self.replay_buffer.get_training_data()
     return np_to_pytorch_batch(batch)
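
All of the helpers above end the same way: the sampled NumPy dict is converted once and the resulting tensors feed a torch update step. As a small illustration (the policy, optimizer, and behavior-cloning loss here are hypothetical placeholders, not taken from any example above), a consumer of such a batch could look like:

    import torch.nn.functional as F

    def behavior_cloning_step(policy, optimizer, batch):
        # `batch` is the dict of float tensors returned by np_to_pytorch_batch
        predictions = policy(batch['observations'])
        loss = F.mse_loss(predictions, batch['actions'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()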