Example #1
    def _collect_samples(self,
                         total_timesteps,
                         run_steps=None,
                         random_actions=False):
        """Perform the sample collection operation.

        This method is responsible for executing rollouts for a number of steps
        before training is executed. The data from the rollouts is stored in
        the policy's replay buffer(s).

        Parameters
        ----------
        total_timesteps : int
            the total number of samples to train on. Used by the fingerprint
            element
        run_steps : int, optional
            number of steps to collect samples from. If not provided, the value
            defaults to `self.nb_rollout_steps`.
        random_actions : bool
            if set to True, actions are sampled randomly from the action space
            instead of being computed by the policy. This is used for
            exploration purposes.
        """
        for _ in range(run_steps or self.nb_rollout_steps):
            # Collect the contextual term. None if it is not passed.
            context = [self.env.current_context] \
                if hasattr(self.env, "current_context") else None

            # Predict next action. Use random actions when initializing the
            # replay buffer.
            action = self._policy(
                self.obs,
                context,
                apply_noise=True,
                random_actions=random_actions,
            )

            # Execute next action.
            new_obs, reward, done, info = self.env.step(action)
            new_obs, new_all_obs = self._get_obs(new_obs)

            # Done mask for multi-agent policies is slightly different.
            if is_multiagent_policy(self.policy):
                done = done["__all__"]

            # Visualize the current step.
            if self.render:
                self.env.render()  # pragma: no cover

            # Add the fingerprint term, if needed. When collecting the initial
            # random actions, we assume the fingerprint does not change from
            # its initial value.
            new_obs = self._add_fingerprint(
                new_obs, 0 if random_actions else self.total_steps,
                total_timesteps)

            # Get the contextual term.
            context0 = context1 = getattr(self.env, "current_context", None)

            # Store a transition in the replay buffer. The terminal flag is
            # chosen to match the TD3 implementation (see Appendix 1 of their
            # paper).
            self._store_transition(
                obs0=self.obs,
                context0=context0,
                action=action,
                reward=reward,
                obs1=new_obs,
                context1=context1,
                terminal1=done,
                is_final_step=self.episode_step >= self.horizon - 1,
                all_obs0=self.all_obs,
                all_obs1=new_all_obs,
            )

            # Book-keeping.
            self.total_steps += 1
            self.episode_step += 1
            if isinstance(reward, dict):
                self.episode_reward += sum(reward[k] for k in reward.keys())
            else:
                self.episode_reward += reward

            # Update the current observation.
            self.obs = new_obs.copy()
            self.all_obs = new_all_obs

            if done:
                # Episode done.
                self.epoch_episode_rewards.append(self.episode_reward)
                self.episode_rew_history.append(self.episode_reward)
                self.epoch_episode_steps.append(self.episode_step)
                self.episode_reward = 0
                self.episode_step = 0
                self.epoch_episodes += 1
                self.episodes += 1

                # Reset the environment.
                obs = self.env.reset()
                self.obs, self.all_obs = self._get_obs(obs)

                # Add the fingerprint term, if needed.
                self.obs = self._add_fingerprint(self.obs, self.total_steps,
                                                 total_timesteps)
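
For context, here is a minimal sketch of how a training loop might drive this method: a warm-up phase with random actions to seed the replay buffer, followed by alternating rollout collection and training. The `alg` argument, the `initial_exploration_steps` name, and the `_train_step` call are assumptions for illustration, not part of the snippet above.

def example_training_loop(alg, total_timesteps, initial_exploration_steps):
    """Hypothetical driver loop (assumed names, for illustration only)."""
    # Warm up the replay buffer with uniformly sampled actions.
    alg._collect_samples(
        total_timesteps,
        run_steps=initial_exploration_steps,
        random_actions=True,
    )

    # Alternate between collecting rollouts and performing gradient updates.
    while alg.total_steps < total_timesteps:
        alg._collect_samples(total_timesteps)
        for _ in range(alg.nb_train_steps):
            alg._train_step()  # assumed per-step training method
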
Example #2
def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "save_replay_buffer": args.save_replay_buffer,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "l2_penalty": args.l2_penalty,
        "model_params": {
            "model_type":
            getattr(args, "model_params:model_type"),
            "layer_norm":
            getattr(args, "model_params:layer_norm"),
            "ignore_image":
            getattr(args, "model_params:ignore_image"),
            "image_height":
            getattr(args, "model_params:image_height"),
            "image_width":
            getattr(args, "model_params:image_width"),
            "image_channels":
            getattr(args, "model_params:image_channels"),
            "ignore_flat_channels":
            getattr(args, "model_params:ignore_flat_channels")
            or FEEDFORWARD_PARAMS["model_params"]["ignore_flat_channels"],
            "filters":
            getattr(args, "model_params:filters")
            or FEEDFORWARD_PARAMS["model_params"]["filters"],
            "kernel_sizes":
            getattr(args, "model_params:kernel_sizes")
            or FEEDFORWARD_PARAMS["model_params"]["kernel_sizes"],
            "strides":
            getattr(args, "model_params:strides")
            or FEEDFORWARD_PARAMS["model_params"]["strides"],
            "layers":
            getattr(args, "model_params:layers")
            or FEEDFORWARD_PARAMS["model_params"]["layers"],
        }
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "target_entropy": args.target_entropy,
        })

    # add PPO parameters
    if is_ppo_policy(policy):
        policy_kwargs.update({
            "learning_rate": args.learning_rate,
            "n_minibatches": args.n_minibatches,
            "n_opt_epochs": args.n_opt_epochs,
            "gamma": args.gamma,
            "lam": args.lam,
            "ent_coef": args.ent_coef,
            "vf_coef": args.vf_coef,
            "max_grad_norm": args.max_grad_norm,
            "cliprange": args.cliprange,
            "cliprange_vf": args.cliprange_vf,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "cooperative_gradients": args.cooperative_gradients,
            "cg_weights": args.cg_weights,
            "cg_delta": args.cg_delta,
            "pretrain_worker": args.pretrain_worker,
            "pretrain_path": args.pretrain_path,
            "pretrain_ckpt": args.pretrain_ckpt,
        })

    # add MultiActorCriticPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
            "n_agents": args.n_agents,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
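
An illustrative usage sketch follows. The `parse_options` helper, the import paths, and the `OffPolicyRLAlgorithm` constructor call are assumptions about the surrounding code base and may not match the actual module layout.

# Assumed import paths; the actual locations in the repository may differ.
from hbaselines.algorithms import OffPolicyRLAlgorithm
from hbaselines.fcnet.td3 import FeedForwardPolicy


def example_build_algorithm(argv):
    """Hypothetical driver: parse CLI flags, then instantiate the algorithm."""
    args = parse_options(argv)  # assumed argparse-based helper
    hp = get_hyperparameters(args, FeedForwardPolicy)

    # `hp` holds the algorithm-level settings plus the nested policy_kwargs
    # dict, so it can be splatted straight into the constructor.
    return OffPolicyRLAlgorithm(
        policy=FeedForwardPolicy, env=args.env_name, **hp)
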
Example #3
    def __init__(self,
                 policy,
                 env,
                 eval_env=None,
                 nb_train_steps=1,
                 nb_rollout_steps=1,
                 nb_eval_episodes=50,
                 actor_update_freq=2,
                 meta_update_freq=10,
                 reward_scale=1.,
                 render=False,
                 render_eval=False,
                 eval_deterministic=True,
                 verbose=0,
                 policy_kwargs=None,
                 _init_setup_model=True):
        """Instantiate the algorithm object.

        Parameters
        ----------
        policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use
        env : gym.Env or str
            the environment to learn from (if registered in Gym, can be str)
        eval_env : gym.Env or str
            the environment to evaluate from (if registered in Gym, can be str)
        nb_train_steps : int
            the number of training steps
        nb_rollout_steps : int
            the number of rollout steps
        nb_eval_episodes : int
            the number of evaluation episodes
        actor_update_freq : int
            number of training steps per actor policy update step. The critic
            policy is updated every training step.
        meta_update_freq : int
            number of training steps per meta policy update step. The actor
            policy of the meta-policy is further updated at the frequency
            provided by the actor_update_freq variable. Note that this value is
            only relevant when using the GoalConditionedPolicy policy.
        reward_scale : float
            the value the reward should be scaled by
        render : bool
            enable rendering of the training environment
        render_eval : bool
            enable rendering of the evaluation environment
        eval_deterministic : bool
            if set to True, the policy provides deterministic actions to the
            evaluation environment. Otherwise, stochastic or noisy actions are
            returned.
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        policy_kwargs : dict
            policy-specific hyperparameters
        _init_setup_model : bool
            Whether or not to build the network at the creation of the instance
        """
        shared = False if policy_kwargs is None else \
            policy_kwargs.get("shared", False)
        maddpg = False if policy_kwargs is None else \
            policy_kwargs.get("maddpg", False)

        self.policy = policy
        self.env_name = deepcopy(env) if isinstance(env, str) \
            else env.__str__()
        self.env = create_env(env, render, shared, maddpg, evaluate=False)
        self.eval_env = create_env(eval_env,
                                   render_eval,
                                   shared,
                                   maddpg,
                                   evaluate=True)
        self.nb_train_steps = nb_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.nb_eval_episodes = nb_eval_episodes
        self.actor_update_freq = actor_update_freq
        self.meta_update_freq = meta_update_freq
        self.reward_scale = reward_scale
        self.render = render
        self.render_eval = render_eval
        self.eval_deterministic = eval_deterministic
        self.verbose = verbose
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.context_space = getattr(self.env, "context_space", None)
        self.policy_kwargs = {'verbose': verbose}

        # add the default policy kwargs to the policy_kwargs term
        if is_feedforward_policy(policy):
            self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
        elif is_goal_conditioned_policy(policy):
            self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
            self.policy_kwargs['env_name'] = self.env_name.__str__()
        elif is_multiagent_policy(policy):
            self.policy_kwargs.update(MULTI_FEEDFORWARD_PARAMS.copy())
            self.policy_kwargs["all_ob_space"] = getattr(
                self.env, "all_observation_space",
                Box(-1, 1, (1, ), dtype=np.float32))

        if is_td3_policy(policy):
            self.policy_kwargs.update(TD3_PARAMS.copy())
        elif is_sac_policy(policy):
            self.policy_kwargs.update(SAC_PARAMS.copy())

        self.policy_kwargs.update(policy_kwargs or {})

        # Compute the time horizon, which is used to check if an environment
        # terminated early and used to compute the done mask as per TD3
        # implementation (see appendix A of their paper). If the horizon cannot
        # be found, a ValueError is raised.
        if hasattr(self.env, "horizon"):
            self.horizon = self.env.horizon
        elif hasattr(self.env, "_max_episode_steps"):
            self.horizon = self.env._max_episode_steps
        elif hasattr(self.env, "env_params"):
            # for Flow environments
            self.horizon = self.env.env_params.horizon
        else:
            raise ValueError("Horizon attribute not found.")

        # init
        self.graph = None
        self.policy_tf = None
        self.sess = None
        self.summary = None
        self.obs = None
        self.all_obs = None
        self.episode_step = 0
        self.episodes = 0
        self.total_steps = 0
        self.epoch_episode_steps = []
        self.epoch_episode_rewards = []
        self.epoch_episodes = 0
        self.epoch = 0
        self.episode_rew_history = deque(maxlen=100)
        self.episode_reward = 0
        self.rew_ph = None
        self.rew_history_ph = None
        self.eval_rew_ph = None
        self.eval_success_ph = None
        self.saver = None

        # Append the fingerprint dimension to the observation dimension, if
        # needed.
        if self.policy_kwargs.get("use_fingerprints", False):
            fingerprint_range = self.policy_kwargs["fingerprint_range"]
            low = np.concatenate(
                (self.observation_space.low, fingerprint_range[0]))
            high = np.concatenate(
                (self.observation_space.high, fingerprint_range[1]))
            self.observation_space = Box(low=low, high=high, dtype=np.float32)

        # Create the model variables and operations.
        if _init_setup_model:
            self.trainable_vars = self.setup_model()
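
For reference, a small standalone sketch of the fingerprint-space extension performed at the end of the constructor above; the example dimensions and the `fingerprint_range` values are made up for illustration.

import numpy as np
from gym.spaces import Box

# A 4-dimensional observation space and an assumed 1-dimensional fingerprint.
ob_space = Box(low=-1, high=1, shape=(4,), dtype=np.float32)
fingerprint_range = ([0.], [5.])  # assumed (low, high) bounds

# Append the fingerprint bounds to the observation bounds, as done above.
low = np.concatenate((ob_space.low, fingerprint_range[0]))
high = np.concatenate((ob_space.high, fingerprint_range[1]))
ob_space = Box(low=low, high=high, dtype=np.float32)

assert ob_space.shape == (5,)
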
Example #4
    def _collect_samples(self,
                         total_steps,
                         run_steps=None,
                         random_actions=False):
        """Perform the sample collection operation over multiple steps.

        This method is responsible for executing rollouts for a number of steps
        before training is executed. The data from the rollouts is stored in
        the policy's replay buffer(s).

        Parameters
        ----------
        total_steps : int
            the total number of samples to train on. Used by the fingerprint
            element
        run_steps : int, optional
            number of steps to collect samples from. If not provided, the value
            defaults to `self.nb_rollout_steps`.
        random_actions : bool
            if set to True, actions are sampled randomly from the action space
            instead of being computed by the policy. This is used for
            exploration purposes.
        """
        for _ in range(run_steps or self.nb_rollout_steps):
            # Predict next action. Use random actions when initializing the
            # replay buffer.
            action = self._policy(
                obs=self.obs,
                context=self.sampler.get_context(),
                apply_noise=True,
                random_actions=random_actions,
            )

            # Update the environment.
            ret = self.sampler.collect_sample(
                action=action,
                multiagent=is_multiagent_policy(self.policy),
                steps=0 if random_actions else self.total_steps,
                total_steps=total_steps,
                use_fingerprints=self.policy_kwargs.get(
                    "use_fingerprints", False),
            )

            obs = ret["obs"]
            context0 = context1 = ret["context"]
            action = ret["action"]
            reward = ret["reward"]
            done = ret["done"]
            all_obs = ret["all_obs"]

            # Store a transition in the replay buffer.
            self._store_transition(
                obs0=self.obs,
                context0=context0,
                action=action,
                reward=reward,
                obs1=obs[0] if done else obs,
                context1=context1,
                terminal1=done,
                is_final_step=self.episode_step >= self.horizon - 1,
                all_obs0=self.all_obs,
                all_obs1=all_obs[0] if done else all_obs,
            )

            # Book-keeping.
            self.total_steps += 1
            self.episode_step += 1
            if isinstance(reward, dict):
                self.episode_reward += sum(reward[k] for k in reward.keys())
            else:
                self.episode_reward += reward

            # Update the current observation.
            self.obs = obs[1].copy() if done else obs.copy()
            self.all_obs = all_obs[1] if done else all_obs

            if done:
                # Episode done.
                self.epoch_episode_rewards.append(self.episode_reward)
                self.episode_rew_history.append(self.episode_reward)
                self.epoch_episode_steps.append(self.episode_step)
                self.episode_reward = 0
                self.episode_step = 0
                self.epoch_episodes += 1
                self.episodes += 1
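
The `obs[0] if done else obs` pattern above relies on a particular return contract from the sampler: when an episode ends, `collect_sample` is expected to return both the terminal observation and the observation of the freshly reset environment. The stand-in below illustrates that assumed contract; it is not the library's sampler implementation.

class ExampleSampler(object):
    """Illustrative stand-in for the assumed collect_sample contract."""

    def __init__(self, env):
        self.env = env

    def get_context(self):
        # Contextual term of the environment, or None if it has none.
        return getattr(self.env, "current_context", None)

    def collect_sample(self, action, multiagent, **kwargs):
        obs, reward, done, info = self.env.step(action)
        if done:
            # Return both the terminal observation (stored as obs1 in the
            # transition) and the post-reset observation (used to seed the
            # next step of the rollout).
            obs = (obs, self.env.reset())
        return {
            "obs": obs,
            "context": self.get_context(),
            "action": action,
            "reward": reward,
            "done": done,
            "all_obs": None,  # per-agent observations are omitted here
        }
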
Example #5
def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "buffer_size": args.buffer_size,
        "batch_size": args.batch_size,
        "actor_lr": args.actor_lr,
        "critic_lr": args.critic_lr,
        "tau": args.tau,
        "gamma": args.gamma,
        "layer_norm": args.layer_norm,
        "use_huber": args.use_huber,
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "target_entropy": args.target_entropy,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "connected_gradients": args.connected_gradients,
            "cg_weights": args.cg_weights,
            "use_fingerprints": args.use_fingerprints,
            "centralized_value_functions": args.centralized_value_functions,
        })

    # add MultiFeedForwardPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
Example #6
    def _collect_samples(self, run_steps=None, random_actions=False):
        """Perform the sample collection operation over multiple steps.

        This method calls collect_sample for multiple steps, and attempts to
        run the operation in parallel if multiple environments are available.

        Parameters
        ----------
        run_steps : int, optional
            number of steps to collect samples from. If not provided, the value
            defaults to `self.nb_rollout_steps`.
        random_actions : bool
            if set to True, actions are sampled randomly from the action space
            instead of being computed by the policy. This is used for
            exploration purposes.
        """
        # Loop through the sampling procedure for as many iterations as are
        # needed to step every environment in parallel until the required
        # number of steps has been collected.
        run_steps = run_steps or self.nb_rollout_steps
        n_itr = math.ceil(run_steps / self.num_envs)
        for itr in range(n_itr):
            n_steps = self.num_envs if itr < n_itr - 1 \
                else run_steps - (n_itr - 1) * self.num_envs

            # Collect the most recent contextual term from every environment.
            if self.num_envs > 1:
                context = [
                    ray.get(self.sampler[env_num].get_context.remote())
                    for env_num in range(self.num_envs)
                ]
            else:
                context = [self.sampler[0].get_context()]

            # Predict next action. Use random actions when initializing the
            # replay buffer.
            action = [
                self._policy(
                    obs=self.obs[env_num],
                    context=context[env_num],
                    apply_noise=True,
                    random_actions=random_actions,
                    env_num=env_num,
                ) for env_num in range(n_steps)
            ]

            # Update the environment.
            if self.num_envs > 1:
                ret = ray.get([
                    self.sampler[env_num].collect_sample.remote(
                        action=action[env_num],
                        multiagent=is_multiagent_policy(self.policy),
                    ) for env_num in range(n_steps)
                ])
            else:
                ret = [
                    self.sampler[0].collect_sample(
                        action=action[0],
                        multiagent=is_multiagent_policy(self.policy),
                    )
                ]

            for ret_i in ret:
                num = ret_i["env_num"]
                context = ret_i["context"]
                action = ret_i["action"]
                reward = ret_i["reward"]
                obs = ret_i["obs"]
                done = ret_i["done"]
                all_obs = ret_i["all_obs"]

                # Store a transition in the replay buffer.
                self._store_transition(
                    obs0=self.obs[num],
                    context0=context,
                    action=action,
                    reward=reward,
                    obs1=obs[0] if done else obs,
                    context1=context,
                    terminal1=done,
                    is_final_step=(self.episode_step[num] >= self.horizon - 1),
                    all_obs0=self.all_obs[num],
                    all_obs1=all_obs[0] if done else all_obs,
                    env_num=num,
                )

                # Book-keeping.
                self.total_steps += 1
                self.episode_step[num] += 1
                if isinstance(reward, dict):
                    self.episode_reward[num] += sum(reward[k]
                                                    for k in reward.keys())
                else:
                    self.episode_reward[num] += reward

                # Update the current observation.
                self.obs[num] = (obs[1] if done else obs).copy()
                self.all_obs[num] = all_obs[1] if done else all_obs

                # Handle episode done.
                if done:
                    self.epoch_episode_rewards.append(self.episode_reward[num])
                    self.episode_rew_history.append(self.episode_reward[num])
                    self.epoch_episode_steps.append(self.episode_step[num])
                    self.episode_reward[num] = 0
                    self.episode_step[num] = 0
                    self.epoch_episodes += 1
                    self.episodes += 1
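
For context, a minimal sketch of how the per-environment samplers indexed by `self.sampler[env_num]` might be created. Wrapping the sampler class with `ray.remote` in the parallel case, and the sampler constructor signature, are assumptions for illustration and are not taken from the snippet above.

import ray


def example_setup_samplers(sampler_cls, env_name, num_envs):
    """Create one sampler per environment, as ray actors when parallel."""
    if num_envs > 1:
        remote_cls = ray.remote(sampler_cls)
        # Remote actors: their methods are invoked with .remote() and their
        # results retrieved with ray.get(), as in the loop above.
        return [remote_cls.remote(env_name, env_num=i)
                for i in range(num_envs)]
    else:
        # A single, local sampler whose methods are called directly.
        return [sampler_cls(env_name, env_num=0)]
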
Example #7
    def __init__(self,
                 policy,
                 env,
                 eval_env=None,
                 nb_train_steps=1,
                 nb_rollout_steps=1,
                 nb_eval_episodes=50,
                 actor_update_freq=2,
                 meta_update_freq=10,
                 reward_scale=1.,
                 render=False,
                 render_eval=False,
                 eval_deterministic=True,
                 save_replay_buffer=False,
                 num_envs=1,
                 verbose=0,
                 policy_kwargs=None,
                 _init_setup_model=True):
        """Instantiate the algorithm object.

        Parameters
        ----------
        policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use
        env : gym.Env or str
            the environment to learn from (if registered in Gym, can be str)
        eval_env : gym.Env or str
            the environment to evaluate from (if registered in Gym, can be str)
        nb_train_steps : int
            the number of training steps
        nb_rollout_steps : int
            the number of rollout steps
        nb_eval_episodes : int
            the number of evaluation episodes
        actor_update_freq : int
            number of training steps per actor policy update step. The critic
            policy is updated every training step.
        meta_update_freq : int
            number of training steps per meta policy update step. The actor
            policy of the meta-policy is further updated at the frequency
            provided by the actor_update_freq variable. Note that this value is
            only relevant when using the GoalConditionedPolicy policy.
        reward_scale : float
            the value the reward should be scaled by
        render : bool
            enable rendering of the training environment
        render_eval : bool
            enable rendering of the evaluation environment
        eval_deterministic : bool
            if set to True, the policy provides deterministic actions to the
            evaluation environment. Otherwise, stochastic or noisy actions are
            returned.
        save_replay_buffer : bool
            whether to save the data from the replay buffer, at the frequency
            that the model is saved. Only the most recent replay buffer is
            stored.
        num_envs : int
            number of environments used to run simulations in parallel. Each
            environment is run on a separate CPU and uses the same policy as
            the rest. Must be less than or equal to nb_rollout_steps.
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        policy_kwargs : dict
            policy-specific hyperparameters
        _init_setup_model : bool
            Whether or not to build the network at the creation of the instance

        Raises
        ------
        AssertionError
            if num_envs > nb_rollout_steps
        """
        shared = False if policy_kwargs is None else \
            policy_kwargs.get("shared", False)
        maddpg = False if policy_kwargs is None else \
            policy_kwargs.get("maddpg", False)

        # Run assertions.
        assert num_envs <= nb_rollout_steps, \
            "num_envs must be less than or equal to nb_rollout_steps"

        # Instantiate the ray instance.
        if num_envs > 1:
            ray.init(num_cpus=num_envs + 1, ignore_reinit_error=True)

        self.policy = policy
        self.env_name = deepcopy(env) if isinstance(env, str) \
            else env.__str__()
        self.eval_env, _ = create_env(eval_env,
                                      render_eval,
                                      shared,
                                      maddpg,
                                      evaluate=True)
        self.nb_train_steps = nb_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.nb_eval_episodes = nb_eval_episodes
        self.actor_update_freq = actor_update_freq
        self.meta_update_freq = meta_update_freq
        self.reward_scale = reward_scale
        self.render = render
        self.render_eval = render_eval
        self.eval_deterministic = eval_deterministic
        self.save_replay_buffer = save_replay_buffer
        self.num_envs = num_envs
        self.verbose = verbose
        self.policy_kwargs = {'verbose': verbose}

        # Create the environment and collect the initial observations.
        self.sampler, self.obs, self.all_obs = self.setup_sampler(
            env, render, shared, maddpg)

        # Collect the spaces of the environments.
        self.ac_space, self.ob_space, self.co_space, all_ob_space = \
            self.get_spaces()

        # Add the default policy kwargs to the policy_kwargs term.
        if is_feedforward_policy(policy):
            self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())

        if is_goal_conditioned_policy(policy):
            self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
            self.policy_kwargs['env_name'] = self.env_name.__str__()
            self.policy_kwargs['num_envs'] = num_envs

        if is_multiagent_policy(policy):
            self.policy_kwargs.update(MULTIAGENT_PARAMS.copy())
            self.policy_kwargs["all_ob_space"] = all_ob_space

        if is_td3_policy(policy):
            self.policy_kwargs.update(TD3_PARAMS.copy())
        elif is_sac_policy(policy):
            self.policy_kwargs.update(SAC_PARAMS.copy())

        self.policy_kwargs = recursive_update(
            self.policy_kwargs, policy_kwargs or {})

        # Compute the time horizon, which is used to check if an environment
        # terminated early and used to compute the done mask for TD3.
        if self.num_envs > 1:
            self.horizon = ray.get(self.sampler[0].horizon.remote())
        else:
            self.horizon = self.sampler[0].horizon()

        # init
        self.graph = None
        self.policy_tf = None
        self.sess = None
        self.summary = None
        self.episode_step = [0 for _ in range(num_envs)]
        self.episodes = 0
        self.total_steps = 0
        self.epoch_episode_steps = []
        self.epoch_episode_rewards = []
        self.epoch_episodes = 0
        self.epoch = 0
        self.episode_rew_history = deque(maxlen=100)
        self.episode_reward = [0 for _ in range(num_envs)]
        self.rew_ph = None
        self.rew_history_ph = None
        self.eval_rew_ph = None
        self.eval_success_ph = None
        self.saver = None

        # Create the model variables and operations.
        if _init_setup_model:
            self.trainable_vars = self.setup_model()
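
Finally, an illustrative instantiation sketch for this parallel variant. The import paths, the environment name, the policy_kwargs values, and the `learn()` call are assumptions; the `num_envs <= nb_rollout_steps` constraint comes from the assertion in the constructor above.

from hbaselines.algorithms import OffPolicyRLAlgorithm  # assumed path
from hbaselines.fcnet.td3 import FeedForwardPolicy      # assumed path

alg = OffPolicyRLAlgorithm(
    policy=FeedForwardPolicy,
    env="HalfCheetah-v2",          # assumed example environment
    nb_rollout_steps=4,
    num_envs=4,                    # must satisfy num_envs <= nb_rollout_steps
    policy_kwargs={"batch_size": 256, "use_huber": True},
)
alg.learn(1000000)                 # assumed training entry point and budget
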