Example #1
def diayn_half_cheetah_vel_batch_for_pearl(ctxt=None, seed=1):
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(HalfCheetahVelEnv()))

    policy = TanhGaussianMLPSkillPolicy(
        env_spec=env.spec,
        skills_num=skills_num,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)
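    # The discriminator classifies which skill produced a state; DIAYN uses its
    # log-probability as the intrinsic reward.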

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    diayn = DIAYN(
        env_spec=env.spec,
        skills_num=skills_num,
        discriminator=discriminator,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        gradient_steps_per_itr=1000,
        max_path_length=300,
        replay_buffer=replay_buffer,
        min_buffer_size=1e4,
        recorded=True,  # enable video recording
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=256,
        reward_scale=1.,
        steps_per_epoch=1)

    tu.set_gpu_mode(torch.cuda.is_available())
    diayn.to()
    worker_args = {"skills_num": skills_num}
    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args=worker_args)
    runner.train(n_epochs=1000, batch_size=1000)
    # runner.restore(from_dir=os.path.join(os.getcwd(), 'data/local/experiment/diayn_half_cheetah_batch_50'))
    # diayn = runner.get_algo()
    runner.save(999)  # save the final epoch's snapshot

    return discriminator, diayn
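Example #1 follows garage's experiment-launcher convention: the function takes a snapshot context ctxt and a seed, and it reads skills_num from module scope. One way to launch it, sketched under the assumption that garage's wrap_experiment decorator is available and that everything lives in the same module; the wrapper name run_diayn and the value of skills_num are illustrative assumptions:

from garage import wrap_experiment

skills_num = 10  # assumed module-level constant: number of DIAYN skills to learn


@wrap_experiment
def run_diayn(ctxt=None, seed=1):
    # Delegate to the example above; it returns (discriminator, diayn).
    return diayn_half_cheetah_vel_batch_for_pearl(ctxt=ctxt, seed=seed)


if __name__ == '__main__':
    run_diayn(seed=1)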
Example #2
def diayn_pearl_half_cheeth(
        ctxt=None,
        seed=1,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        reward_scale=param_reward_scale,
        use_gpu=param_use_gpu):
    if task_proposer is None:
        raise ValueError("task_proposer is not set")

    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
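    # Pool the per-skill environments so PEARL-style meta-training can treat
    # each learned DIAYN skill as one training task.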
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    # train_trajs_dist = [train_env.get_training_traj(diayn_trained_agent)
    #               for train_env in ML_train_envs]

    # ML_test_envs = [
    #     GarageEnv(normalize(
    #         DiaynEnvWrapper(env, task_proposer, skills_num, task_name)))
    #     for task_name in random.sample(range(skills_num), test_tasks_num)
    # ]

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
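The PEARL-style launchers (Example #2 above and Example #3 below) read their defaults from module-level param_* constants that are not part of this listing. A sketch of what that settings block might look like; every value here is an illustrative assumption rather than the original configuration, and the MetaKant-specific settings (param_num_skills_sample, param_num_skills_reason_steps, the two reward scales, seed, skills_num) would follow the same pattern:

# Assumed defaults for the param_* globals; values are illustrative only.
param_num_epoches = 500
param_train_tasks_num = 10  # the DIAYN-based launchers assert this equals skills_num
param_test_tasks_num = 30
param_latent_size = 5
param_encoder_hidden_size = 200
param_net_size = 300
param_meta_batch_size = 16
param_num_steps_per_epoch = 2000
param_num_initial_steps = 2000
param_num_tasks_sample = 5
param_num_steps_prior = 400
param_num_extra_rl_steps_posterior = 600
param_batch_size = 256
param_embedding_batch_size = 100
param_embedding_mini_batch_size = 100
param_max_path_length = 200
param_reward_scale = 5.
param_use_gpu = True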
Example #3
def pearl_half_cheetah(
        ctxt=None,
        seed=1,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        reward_scale=param_reward_scale,
        use_gpu=param_use_gpu):
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
def meta_kant_cheetah_vel(
        ctxt=None,
        seed=seed,
        num_skills=skills_num,
        num_epochs=param_num_epoches,
        num_train_tasks=param_train_tasks_num,
        num_test_tasks=param_test_tasks_num,
        is_encoder_recurrent=False,
        latent_size=param_latent_size,
        encoder_hidden_size=param_encoder_hidden_size,
        net_size=param_net_size,
        meta_batch_size=param_meta_batch_size,
        num_steps_per_epoch=param_num_steps_per_epoch,
        num_initial_steps=param_num_initial_steps,
        num_tasks_sample=param_num_tasks_sample,
        num_steps_prior=param_num_steps_prior,
        num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
        num_skills_sample=param_num_skills_sample,
        num_skills_reason_steps=param_num_skills_reason_steps,
        batch_size=param_batch_size,
        embedding_batch_size=param_embedding_batch_size,
        embedding_mini_batch_size=param_embedding_mini_batch_size,
        max_path_length=param_max_path_length,
        skills_reason_reward_scale=param_skills_reason_reward_scale,
        tasks_adapt_reward_scale=param_tasks_adapt_reward_scale,
        use_gpu=param_use_gpu):
    assert num_train_tasks == skills_num

    set_seed(seed)

    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]

    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    qf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, "qf")

    qf = ContinuousMLPQFunction(env_spec=qf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    controller_policy_env = MetaKant.get_env_spec(env[0](),
                                                  latent_size,
                                                  module="controller_policy",
                                                  num_skills=num_skills)

    controller_policy = CategoricalMLPPolicy(
        env_spec=controller_policy_env,
        hidden_sizes=[net_size, net_size],
        hidden_nonlinearity=functional.relu)

    metakant = MetaKant(
        env=env,
        skill_env=skill_env,
        controller_policy=controller_policy,
        skill_actor=skill_actor,
        qf=qf,
        vf=vf,
        num_skills=num_skills,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        sampler_class=LocalSkillSampler,
        is_encoder_recurrent=is_encoder_recurrent,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_per_epoch=num_steps_per_epoch,
        num_steps_prior=num_steps_prior,  # num_steps_posterior
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        num_skills_reason_steps=num_skills_reason_steps,
        num_skills_sample=num_skills_sample,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        skills_reason_reward_scale=skills_reason_reward_scale,
        tasks_adapt_reward_scale=tasks_adapt_reward_scale)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        metakant.to()

    worker_args = dict(num_skills=num_skills,
                       skill_actor_class=type(skill_actor),
                       controller_class=OpenContextConditionedControllerPolicy,
                       deterministic=False,
                       accum_context=True)

    runner.setup(algo=metakant,
                 env=env[0](),
                 sampler_cls=LocalSkillSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=KantWorker,
                 worker_args=worker_args)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
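Read together, the listings form a pipeline: Example #1 trains the DIAYN skills and returns the discriminator, while meta_kant_cheetah_vel consumes the results through module-level globals (task_proposer, skill_actor, skill_env) that this listing never defines. A rough wiring sketch under the assumption that everything lives in one module; the global assignments, the reuse of the DIAYN policy as skill_actor, and the wrapper name diayn_then_metakant are inferred from the signatures above, not taken from the original code:

from garage import wrap_experiment


@wrap_experiment
def diayn_then_metakant(ctxt=None, seed=1):
    # Stage 1: unsupervised skill discovery (Example #1).
    global task_proposer, skill_actor, skill_env
    discriminator, diayn = diayn_half_cheetah_vel_batch_for_pearl(ctxt=ctxt,
                                                                  seed=seed)

    # Stage 2: expose the DIAYN outputs through the globals the MetaKant
    # launcher reads; `diayn.policy` is an assumed attribute.
    task_proposer = discriminator
    skill_actor = diayn.policy
    skill_env = GarageEnv(normalize(HalfCheetahVelEnv()))

    return meta_kant_cheetah_vel(ctxt=ctxt, seed=seed)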