示例#1
0
    def test_ddpg_double_pendulum(self):
        """Test DDPG with the InvertedDoublePendulum-v2 environment.

        Trains for 10 epochs and requires the value returned by
        ``runner.train`` to exceed 45. The environment is closed even if
        construction of the algorithm, training, or the assertion fails.
        """
        deterministic.set_seed(0)
        runner = LocalRunner(snapshot_config)
        env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
        try:
            policy = DeterministicMLPPolicy(env_spec=env.spec,
                                            hidden_sizes=[64, 64],
                                            hidden_nonlinearity=F.relu,
                                            output_nonlinearity=torch.tanh)

            # Exploration wraps the deterministic policy with OU noise.
            exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                           policy,
                                                           sigma=0.2)

            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu)

            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)

            algo = DDPG(env_spec=env.spec,
                        policy=policy,
                        qf=qf,
                        replay_buffer=replay_buffer,
                        steps_per_epoch=20,
                        n_train_steps=50,
                        min_buffer_size=int(1e4),
                        exploration_policy=exploration_policy,
                        target_update_tau=1e-2,
                        discount=0.9)

            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=100)
            assert last_avg_ret > 45
        finally:
            # Previously close() was only reached when the assertion passed,
            # so a failing run left the environment open.
            env.close()
示例#2
0
    def test_ppo_pendulum(self):
        """Check that MAMLPPO trained on ``self.env`` exceeds a return of -5.

        Runs 10 epochs; each batch holds 5 episodes per task, sized by the
        ``max_episode_length`` of the environment spec.
        """
        deterministic.set_seed(0)

        n_episodes = 5
        steps_per_batch = n_episodes * self.env.spec.max_episode_length

        trainer = LocalRunner(snapshot_config)
        maml_ppo = MAMLPPO(
            env=self.env,
            policy=self.policy,
            value_function=self.value_function,
            meta_batch_size=5,
            discount=0.99,
            gae_lambda=1.,
            inner_lr=0.1,
            num_grad_updates=1,
        )
        trainer.setup(maml_ppo, self.env, sampler_cls=LocalSampler)

        final_return = trainer.train(n_epochs=10, batch_size=steps_per_batch)
        assert final_return > -5
示例#3
0
    def test_ppo_pendulum(self):
        """Check that MAMLPPO trained on ``self.env`` exceeds a return of -5.

        Runs 10 epochs with a path length of 100 and a batch of 5 rollouts
        per task (500 steps per batch).
        """
        deterministic.set_seed(0)

        horizon = 100
        n_rollouts = 5

        trainer = LocalRunner(snapshot_config)
        maml_ppo = MAMLPPO(
            env=self.env,
            policy=self.policy,
            baseline=self.baseline,
            max_path_length=horizon,
            meta_batch_size=5,
            discount=0.99,
            gae_lambda=1.,
            inner_lr=0.1,
            num_grad_updates=1,
        )
        trainer.setup(maml_ppo, self.env)

        final_return = trainer.train(n_epochs=10,
                                     batch_size=n_rollouts * horizon)
        assert final_return > -5
示例#4
0
class TestVPG:
    """Test class for VPG."""

    @classmethod
    def setup_class(cls):
        """Seed the random number generators once for the whole class."""
        deterministic.set_seed(0)

    def setup_method(self):
        """Build a fresh environment, runner, policy and VPG kwargs per test."""
        env = GymEnv('InvertedDoublePendulum-v2')
        self._env = env
        self._runner = LocalRunner(snapshot_config)

        self._policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=[64, 64],
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        # Baseline keyword arguments; individual tests add to this dict.
        self._params = dict(
            env_spec=env.spec,
            policy=self._policy,
            value_function=GaussianMLPValueFunction(env_spec=env.spec),
            max_episode_length=100,
            discount=0.99,
        )

    def teardown_method(self):
        """Close the environment created for the test that just ran."""
        self._env.close()

    def _train_and_check(self):
        """Train VPG built from ``self._params``; the return must be > 0."""
        vpg = VPG(**self._params)
        self._runner.setup(vpg, self._env, sampler_cls=LocalSampler)
        avg_return = self._runner.train(n_epochs=10, batch_size=100)
        assert avg_return > 0

    @pytest.mark.mujoco
    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params.update(positive_adv=True, use_softplus_entropy=True)
        self._train_and_check()

    @pytest.mark.mujoco
    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params.update(center_adv=False,
                            stop_entropy_gradient=True,
                            entropy_method='max')
        self._train_and_check()

    @pytest.mark.mujoco
    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params.update(entropy_method='regularized')
        self._train_and_check()

    @pytest.mark.mujoco
    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Constructing VPG with an invalid entropy config must raise."""
        bad_params = self._params
        bad_params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**bad_params)
示例#5
0
def pearl_metaworld_ml10(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=10,
                         num_test_tasks=5,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL on the MetaWorld ML10 benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): Experiment configuration
            that LocalRunner uses to create the snapshotter.
        seed (int): Seed for the random number generator, for determinism.
        num_epochs (int): How many epochs to train for.
        num_train_tasks (int): How many training tasks to use.
        num_test_tasks (int): How many test tasks to use.
        latent_size (int): Dimension of the latent context vector.
        encoder_hidden_size (int): Width of each dense layer in the context
            encoder (three layers of this width are used).
        net_size (int): Width of each dense layer in the Q-function, value
            function and policy (three layers of this width are used).
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Iterations per epoch.
        num_initial_steps (int): Transitions collected per task before
            training starts.
        num_tasks_sample (int): Random tasks to collect data from in each
            iteration.
        num_steps_prior (int): Transitions collected per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Extra transitions collected per
            task with z ~ posterior; these train the policy only, NOT the
            encoder.
        batch_size (int): Transitions per RL batch.
        embedding_batch_size (int): Transitions per context batch.
        embedding_mini_batch_size (int): Transitions per mini context batch;
            should equal embedding_batch_size for a non-recurrent encoder.
        max_path_length (int): Maximum path length.
        reward_scale (float): Reward scale.
        use_gpu (bool): Whether to train on the GPU.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, ) * 3

    # Wrap every ML10 training task, then every test task, in a normalized
    # GarageEnv.
    train_envs = []
    for task_name in mwb.ML10.get_train_tasks().all_task_names:
        train_envs.append(GarageEnv(normalize(mwb.ML10.from_task(task_name))))

    test_envs = []
    for task_name in mwb.ML10.get_test_tasks().all_task_names:
        test_envs.append(GarageEnv(normalize(mwb.ML10.from_task(task_name))))

    train_sampler = EnvPoolSampler(train_envs)
    train_sampler.grow_pool(num_train_tasks)
    env = train_sampler.sample(num_train_tasks)

    test_env_sampler = EnvPoolSampler(test_envs)
    test_env_sampler.grow_pool(num_test_tasks)

    runner = LocalRunner(ctxt)

    # Networks: Q-function and policy see the latent-augmented spec, the
    # value function gets its own 'vf' spec.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size] * 3)

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                         hidden_sizes=[net_size] * 3)

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args={'max_path_length': max_path_length},
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=num_epochs, batch_size=batch_size)
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
    """Train MTSAC with the ML1 pick-place-v1 environment.

    Builds 50 train/test environment pairs, wraps each set in a round-robin
    MultiEnvWrapper, and trains MTSAC on them.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    train_envs = []
    test_envs = []
    env_names = []
    for i in range(50):
        # normalize_reward is an option of the normalize wrapper; it was
        # previously passed to GymEnv because of a misplaced parenthesis.
        train_env = normalize(GymEnv(
            mwb.ML1.get_train_tasks('pick-place-v1')),
                              normalize_reward=True)
        # A pickle round-trip gives the evaluation env its own copy.
        test_env = pickle.loads(pickle.dumps(train_env))
        env_names.append('pick_place_{}'.format(i))
        train_envs.append(train_env)
        test_envs.append(test_env)
    ml1_train_envs = MultiEnvWrapper(train_envs,
                                     sample_strategy=round_robin_strategy,
                                     env_names=env_names)
    ml1_test_envs = MultiEnvWrapper(test_envs,
                                    sample_strategy=round_robin_strategy,
                                    env_names=env_names)
    policy = TanhGaussianMLPPolicy(
        env_spec=ml1_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    # Split the total timestep budget into epochs so that roughly
    # num_evaluation_points epochs are run, each made of epoch_cycles
    # batches of 150 steps per task.
    timesteps = 10000000
    batch_size = int(150 * ml1_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=150,
                  eval_env=ml1_test_envs,
                  env_spec=ml1_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
示例#7
0
def diayn_half_cheetah_vel_batch_for_pearl(ctxt=None, seed=1):
    """Train DIAYN on a normalized HalfCheetahVelEnv and save a snapshot.

    The number of skills is read from the module-level ``skills_num``.

    Args:
        ctxt: Snapshot configuration handed to LocalRunner.
        seed (int): Value passed to ``deterministic.set_seed``.

    Returns:
        tuple: The discriminator and the DIAYN algorithm object.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(HalfCheetahVelEnv()))

    def make_qf():
        """Build one skill-conditioned Q-function with a fresh config."""
        return ContinuousMLPSkillQFunction(env_spec=env.spec,
                                           skills_num=skills_num,
                                           hidden_sizes=[256, 256],
                                           hidden_nonlinearity=F.relu)

    policy = TanhGaussianMLPSkillPolicy(env_spec=env.spec,
                                        skills_num=skills_num,
                                        hidden_sizes=[256, 256],
                                        hidden_nonlinearity=nn.ReLU,
                                        output_nonlinearity=None,
                                        min_std=np.exp(-20.),
                                        max_std=np.exp(2.))
    qf1 = make_qf()
    qf2 = make_qf()

    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    diayn = DIAYN(
        env_spec=env.spec,
        skills_num=skills_num,
        discriminator=discriminator,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        gradient_steps_per_itr=1000,
        max_path_length=300,
        replay_buffer=replay_buffer,
        min_buffer_size=1e4,
        recorded=True,  # turns on the video recording feature
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=256,
        reward_scale=1.,
        steps_per_epoch=1,
    )

    # Use the GPU exactly when CUDA reports one as available.
    tu.set_gpu_mode(torch.cuda.is_available())
    diayn.to()

    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args={"skills_num": skills_num})

    n_epochs = 1000
    runner.train(n_epochs=n_epochs, batch_size=1000)
    # Alternative kept from the original: instead of training, restore a
    # previous run with
    #   runner.restore(from_dir=os.path.join(
    #       os.getcwd(), 'data/local/experiment/diayn_half_cheetah_batch_50'))
    #   diayn = runner.get_algo()
    runner.save(n_epochs - 1)  # snapshot under the index of the last epoch

    return discriminator, diayn
示例#8
0
def pearl_half_cheetah(
        ctxt=None,
        seed=1,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        reward_scale=param_reward_scale,
        use_gpu=param_use_gpu):
    """Train PEARL on normalized HalfCheetahVelEnv tasks and save a snapshot.

    Every hyperparameter defaults to the matching module-level ``param_*``
    value and is forwarded to ``PEARL`` (or, for ``encoder_hidden_size`` and
    ``net_size``, used as the width of three hidden layers).

    Args:
        ctxt: Snapshot configuration handed to LocalRunner.
        seed (int): Value passed to ``set_seed``.

    Returns:
        The value returned by ``runner.train``.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, ) * 3

    def make_env():
        """Create one normalized HalfCheetahVelEnv wrapped in GarageEnv."""
        return GarageEnv(normalize(HalfCheetahVelEnv()))

    # Separate samplers for the training tasks and for evaluation.
    train_sampler = SetTaskSampler(make_env)
    env = train_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(make_env)

    runner = LocalRunner(ctxt)

    # Networks: Q-function and policy see the latent-augmented spec, the
    # value function gets its own 'vf' spec.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size] * 3)

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                         hidden_sizes=[net_size] * 3)

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args={'max_path_length': max_path_length},
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    # Snapshot under the index of the final epoch.
    runner.save(num_epochs - 1)

    return average_returns
示例#9
0
def diayn_pearl_half_cheeth(
        ctxt=None,
        seed=1,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        reward_scale=param_reward_scale,
        use_gpu=param_use_gpu):
    """Train PEARL on HalfCheetahVelEnv tasks wrapped by DiaynEnvWrapper.

    One training environment is built per skill index in
    ``range(skills_num)``; ``task_proposer`` and ``skills_num`` are read from
    module scope. Evaluation uses plain normalized HalfCheetahVelEnv tasks.
    Every hyperparameter defaults to the matching module-level ``param_*``
    value and is forwarded to ``PEARL`` (or, for ``encoder_hidden_size`` and
    ``net_size``, used as the width of three hidden layers).

    Args:
        ctxt: Snapshot configuration handed to LocalRunner.
        seed (int): Value passed to ``set_seed``.
        num_train_tasks (int): Number of training tasks; must equal
            ``skills_num``.

    Returns:
        The value returned by ``runner.train``.

    Raises:
        ValueError: If the module-level ``task_proposer`` is None.
        AssertionError: If ``num_train_tasks`` differs from ``skills_num``.

    """
    if task_proposer is None:
        raise ValueError("Task proposer is empty")

    # Compare by value: `is` tests object identity and can fail for equal
    # integers that happen to be distinct objects.
    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    # NOTE(review): earlier drafts of this function sketched (commented out)
    # DIAYN-wrapped test environments sampled from the skill range and
    # per-environment training trajectories; neither is used here.

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    # Snapshot under the index of the final epoch.
    runner.save(num_epochs - 1)

    return average_returns
示例#10
0
文件: sac.py 项目: roamlab/roam_rl
        def run(ctxt=None):
            """Build the environment and the SAC algorithm, then train.

            All hyperparameters, including the seed, are read from the
            attributes of the enclosing ``self``.

            Args:
                ctxt (garage.experiment.ExperimentContext): The experiment
                    configuration used by LocalRunner to create the snapshotter.

            """
            deterministic.set_seed(self.seed)
            runner = LocalRunner(snapshot_config=ctxt, max_cpus=32)
            env = GarageEnv(normalize(self.env_maker()))

            def make_qf():
                """Build one Q-function from the configured hidden sizes."""
                return ContinuousMLPQFunction(
                    env_spec=env.spec,
                    hidden_sizes=self.qf_hidden_sizes,
                    hidden_nonlinearity=F.relu)

            policy = TanhGaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=self.policy_hidden_sizes,
                hidden_nonlinearity=nn.ReLU,
                output_nonlinearity=None,
                min_std=np.exp(-20.),
                max_std=np.exp(2.),
            )
            qf1 = make_qf()
            qf2 = make_qf()

            replay_buffer = PathBuffer(
                capacity_in_transitions=self.buffer_capacity_in_transitions)

            algo = _SAC_(
                env_spec=env.spec,
                policy=policy,
                qf1=qf1,
                qf2=qf2,
                gradient_steps_per_itr=self.gradient_steps_per_itr,
                max_path_length=self.max_path_length,
                max_eval_path_length=self.max_eval_path_length,
                replay_buffer=replay_buffer,
                min_buffer_size=self.min_buffer_size,
                target_update_tau=self.target_update_tau,
                discount=self.discount,
                buffer_batch_size=self.buffer_batch_size,
                reward_scale=self.reward_scale,
                steps_per_epoch=self.steps_per_epoch,
            )

            # Use the GPU exactly when CUDA reports one as available.
            set_gpu_mode(torch.cuda.is_available())
            algo.to()

            # Parallel sampling uses Ray with n_workers; otherwise sample
            # locally with the runner's default worker count.
            setup_kwargs = {'algo': algo, 'env': env}
            if self.parallel_sampling:
                setup_kwargs.update(sampler_cls=RaySampler,
                                    n_workers=self.n_workers)
            else:
                setup_kwargs['sampler_cls'] = LocalSampler
            runner.setup(**setup_kwargs)

            runner.train(n_epochs=self.n_epochs, batch_size=self.batch_size)
示例#11
0
    def test_pearl_ml1_push(self):
        """Smoke-test PEARL for one epoch on the ML1 push-v1 tasks."""
        params = {
            'seed': 1,
            'num_epochs': 1,
            'num_train_tasks': 5,
            'num_test_tasks': 1,
            'latent_size': 7,
            'encoder_hidden_sizes': [10, 10, 10],
            'net_size': 30,
            'meta_batch_size': 16,
            'num_steps_per_epoch': 40,
            'num_initial_steps': 40,
            'num_tasks_sample': 15,
            'num_steps_prior': 15,
            'num_extra_rl_steps_posterior': 15,
            'batch_size': 256,
            'embedding_batch_size': 8,
            'embedding_mini_batch_size': 8,
            'max_episode_length': 50,
            'reward_scale': 10.,
            'use_information_bottleneck': True,
            'use_next_obs_in_context': False,
            'use_gpu': False,
        }

        net_size = params['net_size']
        latent_size = params['latent_size']
        episode_length = params['max_episode_length']

        set_seed(params['seed'])

        # Training and evaluation tasks come from separate samplers.
        train_sampler = SetTaskSampler(lambda: GarageEnv(
            normalize(ML1.get_train_tasks('push-v1'))))
        env = train_sampler.sample(params['num_train_tasks'])

        test_env_sampler = SetTaskSampler(lambda: GarageEnv(
            normalize(ML1.get_test_tasks('push-v1'))))

        # Q-function and policy use the latent-augmented spec; the value
        # function gets its own 'vf' spec.
        augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
        qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                    hidden_sizes=[net_size] * 3)

        vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
        vf = ContinuousMLPQFunction(env_spec=vf_env,
                                    hidden_sizes=[net_size] * 3)

        inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_env,
                                             hidden_sizes=[net_size] * 3)

        pearl = PEARL(
            env=env,
            policy_class=ContextConditionedPolicy,
            encoder_class=MLPEncoder,
            inner_policy=inner_policy,
            qf=qf,
            vf=vf,
            num_train_tasks=params['num_train_tasks'],
            num_test_tasks=params['num_test_tasks'],
            latent_dim=latent_size,
            encoder_hidden_sizes=params['encoder_hidden_sizes'],
            test_env_sampler=test_env_sampler,
            meta_batch_size=params['meta_batch_size'],
            num_steps_per_epoch=params['num_steps_per_epoch'],
            num_initial_steps=params['num_initial_steps'],
            num_tasks_sample=params['num_tasks_sample'],
            num_steps_prior=params['num_steps_prior'],
            num_extra_rl_steps_posterior=params[
                'num_extra_rl_steps_posterior'],
            batch_size=params['batch_size'],
            embedding_batch_size=params['embedding_batch_size'],
            embedding_mini_batch_size=params['embedding_mini_batch_size'],
            max_episode_length=episode_length,
            reward_scale=params['reward_scale'],
        )

        set_gpu_mode(params['use_gpu'], gpu_id=0)
        if params['use_gpu']:
            pearl.to()

        runner = LocalRunner(snapshot_config)
        runner.setup(algo=pearl,
                     env=env[0](),
                     sampler_cls=LocalSampler,
                     sampler_args={'max_episode_length': episode_length},
                     n_workers=1,
                     worker_class=PEARLWorker)

        runner.train(n_epochs=params['num_epochs'],
                     batch_size=params['batch_size'])
def meta_kant_cheetah_vel(
        ctxt=None,
        seed=seed,
        num_skills=skills_num,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        is_encoder_recurrent=False,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        num_skills_sample=param_num_skills_sample,
        num_skills_reason_steps=param_num_skills_reason_steps,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        skills_reason_reward_scale=param_skills_reason_reward_scale,
        tasks_adapt_reward_scale=param_tasks_adapt_reward_scale,
        use_gpu=param_use_gpu):
    """Train MetaKant on DIAYN-wrapped HalfCheetahVel environments.

    One ``DiaynEnvWrapper`` around a normalized ``HalfCheetahVelEnv`` is
    built per skill index in ``range(skills_num)``; training environments
    are sampled from that pool, while test tasks come from a
    ``SetTaskSampler`` over plain normalized ``HalfCheetahVelEnv`` instances.

    Note:
        ``skill_env``, ``skill_actor`` and ``task_proposer`` are not created
        here; they are resolved from the enclosing (module) scope, as are all
        of the default argument values.

    Args:
        ctxt: Passed to ``LocalRunner`` as its snapshot/experiment config.
        seed: Value handed to ``set_seed``.
        num_skills: Number of skills given to the env-spec builders, the
            algorithm and the worker.
        num_epochs: Number of training epochs; the runner state is saved
            for epoch ``num_epochs - 1`` after training.
        num_train_tasks: Number of training environments to sample. Must be
            equal to the module-level ``skills_num``.
        num_test_tasks: Forwarded to ``MetaKant``.
        is_encoder_recurrent: Forwarded to ``MetaKant``.
        latent_size: Latent dimension used for the env specs and passed to
            ``MetaKant`` as ``latent_dim``.
        encoder_hidden_size: Width of each of the three encoder hidden
            layers.
        net_size: Hidden layer width of the Q-function, value function
            (three layers each) and controller policy (two layers).
        meta_batch_size: Forwarded to ``MetaKant``.
        num_steps_per_epoch: Forwarded to ``MetaKant``.
        num_initial_steps: Forwarded to ``MetaKant``.
        num_tasks_sample: Forwarded to ``MetaKant``.
        num_steps_prior: Forwarded to ``MetaKant``.
        num_extra_rl_steps_posterior: Forwarded to ``MetaKant``.
        num_skills_sample: Forwarded to ``MetaKant``.
        num_skills_reason_steps: Forwarded to ``MetaKant``.
        batch_size: Forwarded to ``MetaKant`` and to ``runner.train``.
        embedding_batch_size: Forwarded to ``MetaKant``.
        embedding_mini_batch_size: Forwarded to ``MetaKant``.
        max_path_length: Forwarded to ``MetaKant`` and to the sampler.
        skills_reason_reward_scale: Forwarded to ``MetaKant``.
        tasks_adapt_reward_scale: Forwarded to ``MetaKant``.
        use_gpu: Enables GPU mode (device 0) and moves the algorithm to it.

    Returns:
        The value returned by ``runner.train``.

    """
    # Compare by value: ``is`` checks object identity, which is not
    # guaranteed to hold for two equal integers.
    assert num_train_tasks == skills_num, (
        'num_train_tasks ({}) must equal skills_num ({})'.format(
            num_train_tasks, skills_num))

    set_seed(seed)

    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    # One wrapped training environment per skill index.
    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]

    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # Each network gets its own env spec, derived from the first sampled
    # training environment.
    qf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, "qf")

    qf = ContinuousMLPQFunction(env_spec=qf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    controller_policy_env = MetaKant.get_env_spec(env[0](),
                                                  latent_size,
                                                  module="controller_policy",
                                                  num_skills=num_skills)

    controller_policy = CategoricalMLPPolicy(
        env_spec=controller_policy_env,
        hidden_sizes=[net_size, net_size],
        hidden_nonlinearity=functional.relu)

    metakant = MetaKant(
        env=env,
        skill_env=skill_env,
        controller_policy=controller_policy,
        skill_actor=skill_actor,
        qf=qf,
        vf=vf,
        num_skills=num_skills,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        sampler_class=LocalSkillSampler,
        is_encoder_recurrent=is_encoder_recurrent,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_per_epoch=num_steps_per_epoch,
        num_steps_prior=num_steps_prior,  # num_steps_posterior
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        num_skills_reason_steps=num_skills_reason_steps,
        num_skills_sample=num_skills_sample,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        skills_reason_reward_scale=skills_reason_reward_scale,
        tasks_adapt_reward_scale=tasks_adapt_reward_scale)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        metakant.to()

    worker_args = dict(num_skills=num_skills,
                       skill_actor_class=type(skill_actor),
                       controller_class=OpenContextConditionedControllerPolicy,
                       deterministic=False,
                       accum_context=True)

    runner.setup(algo=metakant,
                 env=env[0](),
                 sampler_cls=LocalSkillSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=KantWorker,
                 worker_args=worker_args)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
示例#13
0
def run_garage_pytorch(env, seed, log_dir):
    """Create garage PyTorch PPO model and training.

    Args:
        env: Environment of the task. It is wrapped with ``normalize`` and
            ``TfEnv`` before use.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))

    deterministic.set_seed(seed)

    runner = LocalRunner(snapshot_config)

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=(32, 32),
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)
    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=0.99,
                       gae_lambda=0.95,
                       center_adv=True,
                       lr_clip_range=0.2)

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    try:
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])
    finally:
        # Always detach the outputs, even if setup or training fails, so
        # they do not stay attached to the shared logger for later trials.
        dowel_logger.remove_all()

    return tabular_log_file
示例#14
0
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    The environment and its hyperparameters are selected by the
    module-level ``args.env``; a policy is then loaded from a saved state
    dict and trained with ``MBPG_HA`` for ``n_counts`` independent runs.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    Raises:
        ValueError: If ``args.env`` is not one of the supported names.

    """
    # Defaults; individual environments override them below.
    th = 1.8
    g_max = 0.1

    if args.env == 'CartPole':
        env = TfEnv(normalize(CartPoleEnv()))
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        # The values below were labelled "batchsize: 50" by the original
        # author; the alternative noted for "batchsize: 1" was
        # lr = 0.1, w = 2, c = 50.
        lr = 0.75
        c = 3
        w = 2

        discount = 0.995
        path = './init/CartPole_policy.pth'

    elif args.env == 'Walker':
        env = TfEnv(normalize(Walker2dEnv()))
        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6

        discount = 0.999

        name = 'Walk'
        path = './init/Walk_policy.pth'

    elif args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))

        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06

        discount = 0.999

        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'

    elif args.env == 'Hopper':
        env = TfEnv(normalize(HopperEnv()))

        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999

        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    else:
        # Fail early with a clear message instead of an unbound-name error
        # further down.
        raise ValueError(
            "Unsupported environment {!r}; expected one of 'CartPole', "
            "'Walker', 'HalfCheetah', 'Hopper'.".format(args.env))

    runner = LocalRunner(snapshot_config)

    for _ in range(n_counts):
        # CartPole has discrete actions; the other tasks use a Gaussian
        # policy.
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)

        # Every run starts from the same saved initial policy weights.
        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
示例#15
0
def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
    """Train MTSAC with MT50 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_gpu (bool): Used to enable usage of GPU in training.
        _gpu (int): The ID of the gpu (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    task_names = mwb.MT50.get_train_tasks().all_task_names
    train_envs = []
    test_envs = []
    # Build a separate train and test environment per task; only the
    # training environments normalize rewards.
    for task_name in task_names:
        train_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)),
                              normalize_reward=True)
        test_env = normalize(GarageEnv(mwb.MT50.from_task(task_name)))
        train_envs.append(train_env)
        test_envs.append(test_env)
    mt50_train_envs = MultiEnvWrapper(train_envs,
                                      sample_strategy=round_robin_strategy,
                                      mode='vanilla')
    mt50_test_envs = MultiEnvWrapper(test_envs,
                                     sample_strategy=round_robin_strategy,
                                     mode='vanilla')
    policy = TanhGaussianMLPPolicy(
        env_spec=mt50_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    timesteps = 100000000
    batch_size = int(150 * mt50_train_envs.num_tasks)
    num_evaluation_points = 500
    # Group the raw epochs into cycles so that the number of runner epochs
    # stays close to num_evaluation_points.
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=250,
                  eval_env=mt50_test_envs,
                  env_spec=mt50_train_envs.spec,
                  # Use the wrapper's actual task count (also used for
                  # batch_size above) rather than a hard-coded 10.
                  num_tasks=mt50_train_envs.num_tasks,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=7500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=6400)
    set_gpu_mode(use_gpu, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
示例#16
0
class TestVPG:
    """Tests for VPG on the InvertedDoublePendulum-v2 environment."""

    @classmethod
    def setup_class(cls):
        """Seed the random number generators once for the whole class."""
        deterministic.set_seed(0)

    def setup_method(self):
        """Create a fresh environment, runner and VPG keyword arguments."""
        self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
        self._runner = LocalRunner(snapshot_config)

        gaussian_policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                            hidden_sizes=[64, 64],
                                            hidden_nonlinearity=torch.tanh,
                                            output_nonlinearity=None)
        self._params = dict(
            env_spec=self._env.spec,
            policy=gaussian_policy,
            optimizer=torch.optim.Adam,
            baseline=LinearFeatureBaseline(env_spec=self._env.spec),
            max_path_length=100,
            discount=0.99,
            policy_lr=1e-2,
        )

    def teardown_method(self):
        """Close the environment created in ``setup_method``."""
        self._env.close()

    def _train_and_get_return(self):
        """Build VPG from ``self._params``, train it and return the result.

        Returns:
            The value returned by ``LocalRunner.train`` after 10 epochs with
            a batch size of 100.

        """
        vpg = VPG(**self._params)
        self._runner.setup(vpg, self._env)
        return self._runner.train(n_epochs=10, batch_size=100)

    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params.update(positive_adv=True, use_softplus_entropy=True)

        assert self._train_and_get_return() > 0

    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params.update(center_adv=False,
                            stop_entropy_gradient=True,
                            entropy_method='max')

        assert self._train_and_get_return() > 0

    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params.update(entropy_method='regularized')

        assert self._train_and_get_return() > 30

    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Check that invalid entropy settings make the constructor raise."""
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)
def diayn_point_mass_multigoal(ctxt=None, seed=1):
    """Train DIAYN with six skills on the MultiGoalEnv environment.

    Args:
        ctxt: Passed to ``LocalRunner`` as its ``snapshot_config``.
        seed (int): Value handed to ``deterministic.set_seed``.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = MultiGoalEnv()
    skills_num = 6

    def build_q_function():
        # Both critics share the same architecture.
        return ContinuousMLPSkillQFunction(env_spec=env.spec,
                                           skills_num=skills_num,
                                           hidden_sizes=[256, 256],
                                           hidden_nonlinearity=F.relu)

    policy = TanhGaussianMLPSkillPolicy(
        env_spec=env.spec,
        skills_num=skills_num,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = build_q_function()
    qf2 = build_q_function()

    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    diayn = DIAYN(
        env_spec=env.spec,
        skills_num=skills_num,
        discriminator=discriminator,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        gradient_steps_per_itr=1000,
        max_path_length=500,
        replay_buffer=replay_buffer,
        min_buffer_size=1e4,
        recorded=True,  # enable the video recording func
        is_gym_render=False,
        media_save_path='diayn_2d_multigoal/',
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=256,
        reward_scale=1.,
        steps_per_epoch=1)

    # Use the GPU exactly when CUDA is available.
    tu.set_gpu_mode(torch.cuda.is_available())
    diayn.to()

    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args=dict(skills_num=skills_num))
    runner.train(n_epochs=1000, batch_size=1000)