Example No. 1
    def optimize_policy(self, itr, samples_data):
        """Optimize network using experiences from replay buffer.

        Args:
            itr (int): Iteration number. Unused.
            samples_data (list): Processed batch data. Unused.

        Returns:
            numpy.float64: Loss of policy.

        """
        del itr
        del samples_data

        transitions = self.replay_buffer.sample(self.buffer_batch_size)

        observations = transitions['observation']
        rewards = transitions['reward']
        actions = transitions['action']
        next_observations = transitions['next_observation']
        dones = transitions['terminal']

        # normalize pixel to range [0, 1] since the samples stored in the
        # replay buffer are of type uint8 and not normalized, for memory
        # optimization
        observations = normalize_pixel_batch(self.env_spec, observations)
        next_observations = normalize_pixel_batch(self.env_spec,
                                                  next_observations)
        loss, _ = self._train_qf(observations, actions, rewards, dones,
                                 next_observations)

        return loss
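Example 1 (and Examples 2 and 3 below) passes the environment spec as the first argument to normalize_pixel_batch. As a rough sketch of what such a two-argument helper could look like, assuming akro.Image observation spaces hold uint8 pixels in [0, 255] (illustrative only, not the library's actual implementation):

import akro
import numpy as np


def normalize_pixel_batch(env_spec, observations):
    """Sketch: rescale uint8 pixel observations to floats in [0, 1].

    Illustrative only; the real helper in the library may differ.
    """
    if isinstance(env_spec.observation_space, akro.Image):
        return np.asarray(observations, dtype=np.float32) / 255.0
    return observations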
Example No. 2
 def optimize_policy(self, itr, observations, rewards, actions,
                     next_observations, dones, jole_obs, jole_actions):
     """Optimize network using experiences from replay buffer."""
     # normalize pixel to range [0, 1] since the samples stored in the
     # replay buffer are of type uint8 and not normalized, for memory
     # optimization
     observations = normalize_pixel_batch(self.env_spec, observations)
     next_observations = normalize_pixel_batch(self.env_spec,
                                               next_observations)
     loss, _, qval, y = self._train_qf(observations, actions, rewards,
                                       dones, next_observations, jole_obs,
                                       jole_actions, self.use_jole_qf,
                                       self.jole_clip_return_max,
                                       self.jole_clip_return_min)
     return loss, qval, y
Example No. 3
    def optimize_policy(self, itr, sample_data):
        """Optimize network using experiences from replay buffer."""
        transitions = self.replay_buffer.sample(self.buffer_batch_size)

        observations = transitions['observation']
        rewards = transitions['reward']
        actions = transitions['action']
        next_observations = transitions['next_observation']
        dones = transitions['terminal']

        # normalize pixel to range [0, 1] since the samples stored in the
        # replay buffer are of type uint8 and not normalized, for memory
        # optimization
        observations = normalize_pixel_batch(self.env_spec, observations)
        next_observations = normalize_pixel_batch(self.env_spec,
                                                  next_observations)
        loss, _ = self._train_qf(observations, actions, rewards, dones,
                                 next_observations)

        return loss
Example No. 4
    def fit(self, paths):
        """Fit regressor based on paths.

        Args:
            paths (dict[numpy.ndarray]): Sample paths.

        """
        observations = np.concatenate([p['observations'] for p in paths])
        if isinstance(self.env_spec.observation_space, akro.Image):
            observations = normalize_pixel_batch(observations)

        returns = np.concatenate([p['returns'] for p in paths])
        self._regressor.fit(observations, returns.reshape((-1, 1)))
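Example 4 (and Examples 5 through 8 below) uses a one-argument form instead; here the akro.Image check happens at the call site, so the helper itself only needs to rescale. A minimal sketch under that assumption, again illustrative rather than the library's code:

import numpy as np


def normalize_pixel_batch(observations):
    """Sketch: divide uint8 pixel values by 255 so they lie in [0, 1]."""
    return np.asarray(observations, dtype=np.float32) / 255.0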
Example No. 5
    def predict(self, path):
        """Predict value based on paths.

        Args:
            path (dict[numpy.ndarray]): Sample paths.

        Returns:
            numpy.ndarray: Predicted value.

        """
        observations = path['observations']
        if isinstance(self.env_spec.observation_space, akro.Image):
            observations = normalize_pixel_batch(observations)

        return self._regressor.predict(observations).flatten()
Example No. 6
    def predict(self, paths):
        """Predict ys based on input xs.

        Args:
            paths (dict[numpy.ndarray]): Sample paths.

        Return:
            numpy.ndarray: The predicted ys.

        """
        xs = paths['observations']
        if isinstance(self.env_spec.observation_space, akro.Image):
            xs = normalize_pixel_batch(xs)

        return self._f_predict(xs).flatten()
Example No. 7
    def fit(self, paths):
        """Fit regressor based on paths.

        Args:
            paths (dict[numpy.ndarray]): Sample paths.

        """
        xs = np.concatenate([p['observations'] for p in paths])
        if isinstance(self.env_spec.observation_space, akro.Image):
            xs = normalize_pixel_batch(xs)

        ys = np.concatenate([p['returns'] for p in paths])
        ys = ys.reshape((-1, 1))

        if self._subsample_factor < 1:
            num_samples_tot = xs.shape[0]
            idx = np.random.randint(
                0, num_samples_tot,
                int(num_samples_tot * self._subsample_factor))
            xs, ys = xs[idx], ys[idx]

        if self._normalize_inputs:
            # recompute normalizing constants for inputs
            self._x_mean.load(np.mean(xs, axis=0, keepdims=True))
            self._x_std.load(np.std(xs, axis=0, keepdims=True) + 1e-8)
            self._old_network.x_mean.load(np.mean(xs, axis=0, keepdims=True))
            self._old_network.x_std.load(
                np.std(xs, axis=0, keepdims=True) + 1e-8)
        if self._normalize_outputs:
            # recompute normalizing constants for outputs
            self._y_mean.load(np.mean(ys, axis=0, keepdims=True))
            self._y_std.load(np.std(ys, axis=0, keepdims=True) + 1e-8)
            self._old_network.y_mean.load(np.mean(ys, axis=0, keepdims=True))
            self._old_network.y_std.load(
                np.std(ys, axis=0, keepdims=True) + 1e-8)
        inputs = [xs, ys]
        loss_before = self._optimizer.loss(inputs)
        tabular.record('{}/LossBefore'.format(self._name), loss_before)
        self._optimizer.optimize(inputs)
        loss_after = self._optimizer.loss(inputs)
        tabular.record('{}/LossAfter'.format(self._name), loss_after)
        if self._use_trust_region:
            tabular.record('{}/MeanKL'.format(self._name),
                           self._optimizer.constraint_val(inputs))
        tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
        self._old_model.parameters = self.parameters
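Besides pixel normalization, Example 7 also whitens inputs and targets with per-dimension mean/std constants before optimizing. The same computation written as a standalone NumPy snippet, with made-up data so the formula is easy to check:

import numpy as np

xs = np.random.rand(128, 4)   # stand-in for concatenated observations
ys = np.random.rand(128, 1)   # stand-in for reshaped returns

# recompute normalizing constants, mirroring the fit() above
x_mean = np.mean(xs, axis=0, keepdims=True)
x_std = np.std(xs, axis=0, keepdims=True) + 1e-8
y_mean = np.mean(ys, axis=0, keepdims=True)
y_std = np.std(ys, axis=0, keepdims=True) + 1e-8

# whitened inputs/targets: zero mean, unit variance per dimension
xs_whitened = (xs - x_mean) / x_std
ys_whitened = (ys - y_mean) / y_std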
Example No. 8
 def test_normalize_pixel_batch(self):
     env = GarageEnv(DummyDiscretePixelEnv(), is_image=True)
     obs = env.reset()
     obs_normalized = normalize_pixel_batch(obs)
     expected = [ob / 255.0 for ob in obs]
     assert np.allclose(obs_normalized, expected)
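The assertion above compares element-wise against ob / 255.0. The same property can be sanity-checked on a synthetic uint8 frame (the shape below is arbitrary):

import numpy as np

frame = np.random.randint(0, 256, size=(84, 84, 3), dtype=np.uint8)
normalized = frame.astype(np.float32) / 255.0
assert normalized.min() >= 0.0 and normalized.max() <= 1.0
assert np.allclose(normalized, frame / 255.0)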
Example No. 9
 def test_normalize_pixel_batch_not_trigger(self):
     env = TfEnv(DummyBoxEnv())
     obs = env.reset()
     obs_normalized = normalize_pixel_batch(env, obs)
     assert np.array_equal(obs, obs_normalized)
Example No. 10
 def test_normalize_pixel_batch(self):
     env = TfEnv(DummyDiscretePixelEnv())
     obs = env.reset()
     obs_normalized = normalize_pixel_batch(env, obs)
     expected = [ob / 255.0 for ob in obs]
     assert np.allclose(obs_normalized, expected)
Example No. 11
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.
            whole_paths(bool): No effect. Only kept here to comply
                with the base class.

        Returns:
            list: A list of paths.

        """
        assert batch_size is not None

        paths = []
        if not self._no_reset or self._last_obses is None:
            obses = self._vec_env.reset()
        else:
            obses = self._last_obses
        completes = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs
        n_samples = 0

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()

        while n_samples < batch_size:
            policy.reset(completes)
            if self.algo.input_include_goal:
                obs = [obs['observation'] for obs in obses]
                d_g = [obs['desired_goal'] for obs in obses]
                a_g = [obs['achieved_goal'] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self._env_spec, input_obses)
            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    itr, obs_normalized, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    obs_normalized)

            next_obses, rewards, dones, env_infos = \
                self._vec_env.step(actions)
            completes = env_infos['vec_env_executor.complete']
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs['observation'] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs['achieved_goal'] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transitions(
                    observation=obses,
                    action=actions,
                    reward=rewards,
                    terminal=dones,
                    next_observation=next_obses,
                )

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # Because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=np.asarray(running_paths[idx]['rewards']),
                            dones=np.asarray(running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]['success_count']))
                    running_paths[idx] = None

                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0

                    if self.algo.es:
                        self.algo.es.reset()
            obses = next_obses
        return paths
Example No. 12
    def obtain_samples_for_evaluation(self, num_paths=20):
        """Collect samples for the given iteration number.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.

        Returns:
            list: A list of paths.

        """
        paths = []

        policy = self.algo.policy

        for i in range(num_paths):
            obses = self.evaluate_env.reset()

            dones = np.asarray([True] * self.evaluate_env.num_envs)
            running_paths = [None] * self.evaluate_env.num_envs
            policy.reset(dones)
            end_of_path = False

            for j in range(500):
                input_obses = obses
                obs_normalized = tensor_utils.normalize_pixel_batch(
                    self.env_spec, input_obses)
                obses = obs_normalized

                # get_actions() may return (actions, agent_infos); keep only
                # the action array here.
                actions = self.algo.policy.get_actions(obs_normalized)
                if len(actions) > 1:
                    actions = actions[0]
                agent_infos = None

                next_obses, rewards, dones, env_infos = self.evaluate_env.step(
                    actions)
                original_next_obses = next_obses
                next_obses = tensor_utils.normalize_pixel_batch(
                    self.env_spec, next_obses)

                env_infos = tensor_utils.split_tensor_dict_list(env_infos)

                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self.evaluate_env.num_envs)
                    ]
                if env_infos is None:
                    env_infos = [
                        dict() for _ in range(self.evaluate_env.num_envs)
                    ]

                for idx, reward, env_info, done in zip(itertools.count(),
                                                       rewards, env_infos,
                                                       dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            rewards=[],
                            env_infos=[],
                            dones=[],
                            undiscounted_return=0,
                            # running_length: Length of path up to now
                            # Note that running_length is not len(rewards)
                            # Because a path may not be complete in one batch
                            running_length=0,
                            success_count=0)

                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['dones'].append(done)
                    running_paths[idx]['running_length'] += 1
                    running_paths[idx]['undiscounted_return'] += reward
                    running_paths[idx]['success_count'] += env_info.get(
                        'is_success') or 0

                    if done or j == 499:
                        paths.append(
                            dict(rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]['rewards']),
                                 dones=tensor_utils.stack_tensor_list(
                                     running_paths[idx]['dones']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     running_paths[idx]['env_infos']),
                                 running_length=running_paths[idx]
                                 ['running_length'],
                                 undiscounted_return=running_paths[idx]
                                 ['undiscounted_return'],
                                 success_count=running_paths[idx]
                                 ['success_count']))
                        running_paths[idx] = None

                        end_of_path = True
                if end_of_path:
                    break
                obses = original_next_obses
        return paths
Example No. 13
    def obtain_samples(self, itr, batch_size, is_evaluate=False):
        """Collect samples for the given iteration number.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.
            is_evaluate(bool): If True, skip the exploration strategy when
                selecting actions.

        Returns:
            list: A list of paths.

        """
        paths = []
        if not self.no_reset or self._last_obses is None:
            obses = self.vec_env.reset()
        else:
            obses = self._last_obses
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        n_samples = 0

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()

        while n_samples < batch_size:
            policy.reset(dones)
            if self.algo.input_include_goal:
                obs = [obs['observation'] for obs in obses]
                d_g = [obs['desired_goal'] for obs in obses]
                a_g = [obs['achieved_goal'] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses

            obs_normalized = tensor_utils.normalize_pixel_batch(
                self.env_spec, input_obses)
            obses = obs_normalized

            if self.algo.es and not is_evaluate:
                actions, agent_infos = self.algo.es.get_actions(
                    itr, obs_normalized, self.algo.policy)
                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            else:
                actions = self.algo.policy.get_actions(obs_normalized)
                if len(actions) > 1:
                    actions = actions[0]
                agent_infos = None

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            original_next_obses = next_obses
            next_obses = tensor_utils.normalize_pixel_batch(
                self.env_spec, next_obses)

            self._last_obses = next_obses

            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs['observation'] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs['achieved_goal'] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transitions(
                    observation=obs_normalized,
                    action=actions,
                    reward=rewards * self.algo.reward_scale,
                    terminal=dones,
                    next_observation=next_obses,
                )

            if not self._bound_start:
                self._bound_start = True
                self._obs_upper = obses[0]
                self._obs_lower = obses[0]
                self._action_upper = actions[0]
                self._action_lower = actions[0]

            for obs in obses:
                self._obs_upper = np.maximum(self._obs_upper, obs)
                self._obs_lower = np.minimum(self._obs_lower, obs)
            for action in actions:
                self._action_upper = np.maximum(self._action_upper, action)
                self._action_lower = np.minimum(self._action_lower, action)

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # Because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]['rewards']),
                            dones=tensor_utils.stack_tensor_list(
                                running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]['success_count']))
                    running_paths[idx] = None

                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0

                    if self.algo.es:
                        self.algo.es.reset()
            obses = original_next_obses
        return (paths, self._obs_upper, self._obs_lower, self._action_upper,
                self._action_lower)

    def obtain_samples(self, itr, batch_size):
        """Collect samples for the given iteration number.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.

        Returns:
            list: A list of paths.

        """
        paths = []
        if not self.no_reset or self._last_obses is None:
            obses = self.vec_env.reset()
        else:
            obses = self._last_obses
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        n_samples = 0

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()

        while n_samples < batch_size:
            policy.reset(dones)
            if self.algo.input_include_goal:
                obs = [obs['observation'] for obs in obses]
                d_g = [obs['desired_goal'] for obs in obses]
                a_g = [obs['achieved_goal'] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self.env_spec, input_obses)
            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    itr, obs_normalized, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    obs_normalized)

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            new_episode_obs = None
            if "reset_new_obs" in env_infos:
                new_episode_obs = next_obses.copy()
                for i, reset_new_obs in env_infos["reset_new_obs"][0]:
                    new_episode_obs[i] = reset_new_obs
                del env_infos["reset_new_obs"]

            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs['observation'] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs['achieved_goal'] for next_obs in next_obses
                    ],
                )
            else:
                payload = {
                    "observation": obses,
                    "action": actions,
                    "reward": rewards * self.algo.reward_scale,
                    "terminal": dones,
                    "next_observation": next_obses
                }
                if (env_infos
                        and env_infos[0].get("ground_truth_state") is not None):
                    payload["ground_truth_state"] = [
                        env_info.get("ground_truth_state")
                        for env_info in env_infos
                    ]

                self.algo.replay_buffer.add_transitions(**payload)

            for idx, reward, env_info, q_val, done in zip(
                    itertools.count(), rewards, env_infos,
                    agent_infos["q_vals"], dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        q_vals=self._last_q_vals[idx].copy(),
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # Because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['q_vals'].append(q_val)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_q_vals[idx].append(q_val)
                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=np.asarray(running_paths[idx]['rewards']),
                            dones=np.asarray(running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            q_vals=np.asarray(running_paths[idx]["q_vals"]),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]['success_count']))
                    running_paths[idx] = None

                    if done:
                        self._last_q_vals[idx] = []
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0

                    if self.algo.es:
                        self.algo.es.reset()
            if new_episode_obs is not None:
                obses = new_episode_obs
            else:
                obses = next_obses
            self._last_obses = obses
        return paths