def diayn_half_cheetah_vel_batch_for_pearl(ctxt=None, seed=1):
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(HalfCheetahVelEnv()))

    # Skill-conditioned policy and twin Q-functions.
    policy = TanhGaussianMLPSkillPolicy(
        env_spec=env.spec,
        skills_num=skills_num,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPSkillQFunction(env_spec=env.spec,
                                      skills_num=skills_num,
                                      hidden_sizes=[256, 256],
                                      hidden_nonlinearity=F.relu)

    # Discriminator that infers the active skill from visited states.
    discriminator = MLPDiscriminator(env_spec=env.spec,
                                     skills_num=skills_num,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    diayn = DIAYN(env_spec=env.spec,
                  skills_num=skills_num,
                  discriminator=discriminator,
                  policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=1000,
                  max_path_length=300,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e4,
                  recorded=True,  # enable the video recording func
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=256,
                  reward_scale=1.,
                  steps_per_epoch=1)

    tu.set_gpu_mode(torch.cuda.is_available())
    diayn.to()

    worker_args = {"skills_num": skills_num}
    runner.setup(algo=diayn,
                 env=env,
                 sampler_cls=LocalSkillSampler,
                 worker_class=SkillWorker,
                 worker_args=worker_args)
    runner.train(n_epochs=1000, batch_size=1000)
    # To resume a previous run instead of training from scratch:
    # runner.restore(from_dir=os.path.join(
    #     os.getcwd(), 'data/local/experiment/diayn_half_cheetah_batch_50'))
    # diayn = runner.get_algo()
    runner.save(999)  # saves the last epoch

    return discriminator, diayn
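# NOTE: the launchers below read the module-level globals `task_proposer`,
# `skill_actor` and `skill_env`, which are set elsewhere in this script. A
# minimal, hypothetical sketch of how they could be wired up from the DIAYN
# run above (the attribute names are assumptions, not this repo's exact API):
#
#     discriminator, diayn = diayn_half_cheetah_vel_batch_for_pearl(seed=1)
#     task_proposer = discriminator   # skill discriminator proposes per-skill tasks
#     skill_actor = diayn.policy      # assuming the algorithm exposes its skill policy
#     skill_env = GarageEnv(normalize(HalfCheetahVelEnv()))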
def diayn_pearl_half_cheeth(ctxt=None,
                            seed=1,
                            num_epochs=param_num_epoches,
                            num_train_tasks=param_train_tasks_num,
                            num_test_tasks=param_test_tasks_num,
                            latent_size=param_latent_size,
                            encoder_hidden_size=param_encoder_hidden_size,
                            net_size=param_net_size,
                            meta_batch_size=param_meta_batch_size,
                            num_steps_per_epoch=param_num_steps_per_epoch,
                            num_initial_steps=param_num_initial_steps,
                            num_tasks_sample=param_num_tasks_sample,
                            num_steps_prior=param_num_steps_prior,
                            num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                            batch_size=param_batch_size,
                            embedding_batch_size=param_embedding_batch_size,
                            embedding_mini_batch_size=param_embedding_mini_batch_size,
                            max_path_length=param_max_path_length,
                            reward_scale=param_reward_scale,
                            use_gpu=param_use_gpu):
    if task_proposer is None:
        raise ValueError("Task proposer is empty")
    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    # Create one skill-wrapped environment per DIAYN skill and sample
    # the training tasks from that pool.
    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)

    # train_trajs_dist = [train_env.get_training_traj(diayn_trained_agent)
    #                     for train_env in ML_train_envs]
    # ML_test_envs = [
    #     GarageEnv(normalize(
    #         DiaynEnvWrapper(env, task_proposer, skills_num, task_name)))
    #     for task_name in random.sample(range(skills_num), test_tasks_num)
    # ]
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # Instantiate networks on the latent-augmented environment spec.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
def pearl_half_cheetah(ctxt=None,
                       seed=1,
                       num_epochs=param_num_epoches,
                       num_train_tasks=param_train_tasks_num,
                       num_test_tasks=param_test_tasks_num,
                       latent_size=param_latent_size,
                       encoder_hidden_size=param_encoder_hidden_size,
                       net_size=param_net_size,
                       meta_batch_size=param_meta_batch_size,
                       num_steps_per_epoch=param_num_steps_per_epoch,
                       num_initial_steps=param_num_initial_steps,
                       num_tasks_sample=param_num_tasks_sample,
                       num_steps_prior=param_num_steps_prior,
                       num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                       batch_size=param_batch_size,
                       embedding_batch_size=param_embedding_batch_size,
                       embedding_mini_batch_size=param_embedding_mini_batch_size,
                       max_path_length=param_max_path_length,
                       reward_scale=param_reward_scale,
                       use_gpu=param_use_gpu):
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    # Create multi-task environment and sample tasks.
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # Instantiate networks on the latent-augmented environment spec.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
def meta_kant_cheetah_vel(ctxt=None,
                          seed=seed,
                          num_skills=skills_num,
                          num_epochs=param_num_epoches,
                          num_train_tasks=param_train_tasks_num,
                          num_test_tasks=param_test_tasks_num,
                          is_encoder_recurrent=False,
                          latent_size=param_latent_size,
                          encoder_hidden_size=param_encoder_hidden_size,
                          net_size=param_net_size,
                          meta_batch_size=param_meta_batch_size,
                          num_steps_per_epoch=param_num_steps_per_epoch,
                          num_initial_steps=param_num_initial_steps,
                          num_tasks_sample=param_num_tasks_sample,
                          num_steps_prior=param_num_steps_prior,
                          num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior,
                          num_skills_sample=param_num_skills_sample,
                          num_skills_reason_steps=param_num_skills_reason_steps,
                          batch_size=param_batch_size,
                          embedding_batch_size=param_embedding_batch_size,
                          embedding_mini_batch_size=param_embedding_mini_batch_size,
                          max_path_length=param_max_path_length,
                          skills_reason_reward_scale=param_skills_reason_reward_scale,
                          tasks_adapt_reward_scale=param_tasks_adapt_reward_scale,
                          use_gpu=param_use_gpu):
    assert num_train_tasks == skills_num

    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)

    # Create one skill-wrapped environment per DIAYN skill and sample
    # the training tasks from that pool.
    ML_train_envs = [
        DiaynEnvWrapper(task_proposer, skills_num, task_name,
                        normalize(HalfCheetahVelEnv()))
        for task_name in range(skills_num)
    ]
    env_sampler = EnvPoolSampler(ML_train_envs)
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(HalfCheetahVelEnv())))

    runner = LocalRunner(ctxt)

    # Instantiate networks on the skill- and latent-augmented environment specs.
    qf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, "qf")
    qf = ContinuousMLPQFunction(env_spec=qf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = MetaKant.get_env_spec(env[0](), latent_size, num_skills, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    controller_policy_env = MetaKant.get_env_spec(env[0](),
                                                  latent_size,
                                                  module="controller_policy",
                                                  num_skills=num_skills)
    controller_policy = CategoricalMLPPolicy(
        env_spec=controller_policy_env,
        hidden_sizes=[net_size, net_size],
        hidden_nonlinearity=functional.relu)

    metakant = MetaKant(
        env=env,
        skill_env=skill_env,
        controller_policy=controller_policy,
        skill_actor=skill_actor,
        qf=qf,
        vf=vf,
        num_skills=num_skills,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        sampler_class=LocalSkillSampler,
        is_encoder_recurrent=is_encoder_recurrent,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_per_epoch=num_steps_per_epoch,
        num_steps_prior=num_steps_prior,  # num_steps_posterior
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        num_skills_reason_steps=num_skills_reason_steps,
        num_skills_sample=num_skills_sample,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        skills_reason_reward_scale=skills_reason_reward_scale,
        tasks_adapt_reward_scale=tasks_adapt_reward_scale)

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        metakant.to()

    worker_args = dict(num_skills=num_skills,
                       skill_actor_class=type(skill_actor),
                       controller_class=OpenContextConditionedControllerPolicy,
                       deterministic=False,
                       accum_context=True)

    runner.setup(algo=metakant,
                 env=env[0](),
                 sampler_cls=LocalSkillSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=KantWorker,
                 worker_args=worker_args)

    average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size)
    runner.save(num_epochs - 1)

    return average_returns
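# A minimal launcher sketch, assuming garage's `wrap_experiment` supplies the
# `ctxt` snapshot configuration (suggested by the `ctxt=None` signatures above,
# but not confirmed by this file). The DIAYN-based and MetaKant launchers
# additionally require the module-level `task_proposer`/`skill_actor`/`skill_env`
# globals to be populated first (see the note after the DIAYN launcher).
if __name__ == '__main__':
    from garage import wrap_experiment

    # Run the plain PEARL baseline on HalfCheetahVelEnv.
    wrap_experiment(pearl_half_cheetah)(seed=1)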