def experiment(variant): # we have to generate the combinations for the env_specs env_specs = variant['env_specs'] env_specs_vg = VariantGenerator() env_spec_constants = {} env_spec_ranges = {} for k, v in env_specs.items(): if isinstance(v, list): env_specs_vg.add(k, v) env_spec_ranges[k] = v else: env_spec_constants[k] = v env_specs_list = [] for es in env_specs_vg.variants(): del es['_hidden_keys'] es.update(env_spec_constants) env_specs_list.append(es) env_sampler = EnvSampler(env_specs_list) # make the normalizer function for the env_params mean = [] half_diff = [] for k in sorted(env_spec_ranges.keys()): r = env_spec_ranges[k] if len(r) == 1: mean.append(0) half_diff.append(r[0]) else: mean.append((r[0] + r[1]) / 2.0) half_diff.append((r[1] - r[0]) / 2.0) mean = np.array(mean) half_diff = np.array(half_diff) def env_params_normalizer(params): return (params - mean) / half_diff variant['algo_params']['env_params_normalizer'] = env_params_normalizer # set up similar to non-meta version sample_env, _ = env_sampler() if variant['algo_params']['concat_env_params_to_obs']: meta_params_dim = sample_env.env_meta_params.shape[0] else: meta_params_dim = 0 obs_dim = int(np.prod(sample_env.observation_space.shape)) action_dim = int(np.prod(sample_env.action_space.shape)) net_size = variant['net_size'] vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + meta_params_dim, output_size=1, ) if exp_specs['use_new_sac']: qf1 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + meta_params_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + meta_params_dim, output_size=1, ) policy = ReparamTanhMultivariateGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + meta_params_dim, action_dim=action_dim, ) algorithm = NewMetaSoftActorCritic(env_sampler=env_sampler, policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['algo_params']) else: policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + meta_params_dim, action_dim=action_dim, ) qf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + meta_params_dim, output_size=1, ) algorithm = MetaSoftActorCritic(env_sampler=env_sampler, policy=policy, qf=qf, vf=vf, **variant['algo_params']) if ptu.gpu_enabled(): algorithm.cuda() algorithm.train() return 1
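# A minimal sketch of the `variant` layout the function above appears to expect: inside
# `env_specs`, list-valued entries are enumerated as env variants by VariantGenerator and
# also define the normalization range, while scalar entries stay constant. The top-level
# keys mirror what the code reads; the specific env parameters and numbers below are
# hypothetical, for illustration only.
example_variant = {
    'env_specs': {
        'gravity': [9.0, 11.0],     # two-element list: mean 10.0, half_diff 1.0
        'friction': [0.5],          # one-element list: mean 0, half_diff 0.5
        'frame_skip': 4,            # constant, copied into every sampled env spec
    },
    'net_size': 256,
    'algo_params': {'concat_env_params_to_obs': True},
}
# env_params_normalizer then maps each ranged parameter p to (p - mean) / half_diff,
# so the midpoint of a two-element range goes to 0 and its endpoints to -1 and +1.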
def run_rlkit(env, seed, log_dir):
    """Create the rlkit model and run training.

    :param env: Environment to train on.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the result csv file.
    """
    reset_execution_environment()
    gt.reset()
    setup_logger(log_dir=log_dir)
    expl_env = NormalizedBoxEnv(env)
    eval_env = NormalizedBoxEnv(env)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=params['qf_hidden_sizes'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=params['qf_hidden_sizes'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=params['qf_hidden_sizes'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=params['qf_hidden_sizes'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           hidden_sizes=params['policy_hidden_sizes'])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  hidden_sizes=params['policy_hidden_sizes'])
    es = RLkitGaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=params['sigma'],
        min_sigma=params['sigma'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(params['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         discount=params['discount'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        num_epochs=params['n_epochs'],
        num_train_loops_per_epoch=params['steps_per_epoch'],
        num_trains_per_train_loop=params['n_train_steps'],
        num_expl_steps_per_train_loop=params['n_rollout_steps'],
        num_eval_steps_per_epoch=params['n_rollout_steps'],
        min_num_steps_before_training=params['min_buffer_size'],
        max_path_length=params['n_rollout_steps'],
        batch_size=params['buffer_batch_size'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
    return osp.join(log_dir, 'progress.csv')
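# run_rlkit reads a `params` dict that is not defined inside the function (presumably a
# module-level global in the original script). A hypothetical sketch with the keys the
# code above accesses; the values are illustrative assumptions only.
example_params = {
    'qf_hidden_sizes': [256, 256],
    'policy_hidden_sizes': [256, 256],
    'sigma': 0.1,                  # constant exploration noise for the Gaussian strategy
    'replay_buffer_size': int(1e6),
    'discount': 0.99,
    'n_epochs': 100,
    'steps_per_epoch': 1,
    'n_train_steps': 1000,
    'n_rollout_steps': 1000,       # reused for expl/eval steps and max_path_length
    'min_buffer_size': 1000,
    'buffer_batch_size': 256,
}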
def experiment(variant):
    base_expl_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    base_eval_env = PointMassEnv(n=variant["num_tasks"],
                                 reward_type=variant["reward_type"])
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    print(expl_env.observation_space, expl_env.action_space)
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.train()
def experiment(variant): dummy_env = make_env(variant['env']) obs_dim = dummy_env.observation_space.low.size action_dim = dummy_env.action_space.low.size expl_env = VectorEnv([ lambda: make_env(variant['env']) for _ in range(variant['expl_env_num']) ]) expl_env.seed(variant["seed"]) expl_env.action_space.seed(variant["seed"]) eval_env = SubprocVectorEnv([ lambda: make_env(variant['env']) for _ in range(variant['eval_env_num']) ]) eval_env.seed(variant["seed"]) M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) policy = TanhMlpPolicy( input_size=obs_dim, output_size=action_dim, hidden_sizes=[M, M], ) target_policy = TanhMlpPolicy( input_size=obs_dim, output_size=action_dim, hidden_sizes=[M, M], ) es = GaussianStrategy( action_space=dummy_env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) eval_path_collector = VecMdpPathCollector( eval_env, policy, ) expl_path_collector = VecMdpStepCollector( expl_env, exploration_policy, ) replay_buffer = TorchReplayBuffer( variant['replay_buffer_size'], dummy_env, ) trainer = TD3Trainer( policy=policy, target_policy=target_policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'], ) algorithm = TorchVecOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs'], ) algorithm.to(ptu.device) algorithm.train()
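# Hypothetical `variant` for the vectorized TD3 experiment above. The top-level keys are
# the ones the function reads; the env id, numbers, and the contents of the nested kwargs
# are assumptions for illustration, not values taken from the original config.
example_variant = dict(
    env='Hopper-v2',
    seed=0,
    expl_env_num=4,       # parallel exploration envs (VectorEnv)
    eval_env_num=2,       # parallel evaluation envs (SubprocVectorEnv)
    layer_size=256,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(num_epochs=1000, max_path_length=1000, batch_size=256),
)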
def skewfit_experiment(variant): import rlkit.torch.pytorch_util as ptu from rlkit.data_management.online_vae_replay_buffer \ import OnlineVaeRelabelingBuffer from rlkit.torch.networks import FlattenMlp from rlkit.torch.sac.policies import TanhGaussianPolicy import rlkit.torch.vae.vae_schedules as vae_schedules #### getting parameter for training VAE and RIG env = get_envs(variant) observation_key = variant.get('observation_key', 'latent_observation') desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal') achieved_goal_key = desired_goal_key.replace("desired", "achieved") obs_dim = (env.observation_space.spaces[observation_key].low.size + env.observation_space.spaces[desired_goal_key].low.size) action_dim = env.action_space.low.size hidden_sizes = variant.get('hidden_sizes', [400, 300]) replay_buffer_kwargs = variant.get( 'replay_buffer_kwargs', dict( start_skew_epoch=10, max_size=int(100000), fraction_goals_rollout_goals=0.2, fraction_goals_env_goals=0.5, exploration_rewards_type='None', vae_priority_type='vae_prob', priority_function_kwargs=dict( sampling_method='importance_sampling', decoder_distribution='gaussian_identity_variance', num_latents_to_sample=10, ), power=0, relabeling_goal_sampling_mode='vae_prior', )) online_vae_trainer_kwargs = variant.get('online_vae_trainer_kwargs', dict(beta=20, lr=1e-3)) max_path_length = variant.get('max_path_length', 50) algo_kwargs = variant.get( 'algo_kwargs', dict( batch_size=1024, num_epochs=1000, num_eval_steps_per_epoch=500, num_expl_steps_per_train_loop=500, num_trains_per_train_loop=1000, min_num_steps_before_training=10000, vae_training_schedule=vae_schedules.custom_schedule_2, oracle_data=False, vae_save_period=50, parallel_vae_train=False, )) twin_sac_trainer_kwargs = variant.get( 'twin_sac_trainer_kwargs', dict( discount=0.99, reward_scale=1, soft_target_tau=1e-3, target_update_period=1, # 1 use_automatic_entropy_tuning=True, )) ############################################################################ qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes) policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=hidden_sizes) vae = variant['vae_model'] # create a replay buffer for training an online VAE replay_buffer = OnlineVaeRelabelingBuffer( vae=vae, env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **replay_buffer_kwargs) # create an online vae_trainer to train vae on the fly vae_trainer = ConvVAETrainer(variant['vae_train_data'], variant['vae_test_data'], vae, **online_vae_trainer_kwargs) # create a SACTrainer to learn a soft Q-function and appropriate policy trainer = SACTrainer(env=env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **twin_sac_trainer_kwargs) trainer = HERTrainer(trainer) eval_path_collector = VAEWrappedEnvPathCollector( variant.get('evaluation_goal_sampling_mode', 'reset_of_env'), env, MakeDeterministic(policy), max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = VAEWrappedEnvPathCollector( variant.get('exploration_goal_sampling_mode', 'vae_prior'), env, policy, max_path_length, 
observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = OnlineVaeAlgorithm( trainer=trainer, exploration_env=env, evaluation_env=env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, vae=vae, vae_trainer=vae_trainer, max_path_length=max_path_length, **algo_kwargs) if variant['custom_goal_sampler'] == 'replay_buffer': env.custom_goal_sampler = replay_buffer.sample_buffer_goals algorithm.to(ptu.device) vae.to(ptu.device) algorithm.train()
def grill_her_td3_experiment(variant): print("variant ") print(variant) env = get_envs(variant) es = get_exploration_strategy(variant, env) observation_key = variant.get('observation_key', 'latent_observation') desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal') achieved_goal_key = desired_goal_key.replace("desired", "achieved") obs_dim = (env.observation_space.spaces[observation_key].low.size + env.observation_space.spaces[desired_goal_key].low.size) action_dim = env.action_space.low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs']) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) algo_kwargs = variant['algo_kwargs'] algo_kwargs['replay_buffer'] = replay_buffer td3_kwargs = algo_kwargs['td3_kwargs'] td3_kwargs['training_env'] = env td3_kwargs['render'] = variant["render"] her_kwargs = algo_kwargs['her_kwargs'] her_kwargs['observation_key'] = observation_key her_kwargs['desired_goal_key'] = desired_goal_key algorithm = HerTd3(env, qf1=qf1, qf2=qf2, policy=policy, exploration_policy=exploration_policy, **variant['algo_kwargs']) if variant.get("save_video", True): rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=algorithm.max_path_length, observation_key=algorithm.observation_key, desired_goal_key=algorithm.desired_goal_key, ) video_func = get_video_save_func( rollout_function, env, algorithm.eval_policy, variant, ) algorithm.post_epoch_funcs.append(video_func) algorithm.to(ptu.device) env.vae.to(ptu.device) algorithm.train()
def experiment(variant):
    expl_env = gym.make(variant['env_name'])
    eval_env = expl_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],  # Making it easier to visualize
    )
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(eval_env)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer)

    trainer = BEARTrainer(env=eval_env,
                          policy=policy,
                          qf1=qf1,
                          qf2=qf2,
                          target_qf1=target_qf1,
                          target_qf2=target_qf2,
                          vae=vae_policy,
                          **variant['trainer_kwargs'])
    # variant['algorithm_kwargs']['max_path_length'] = expl_env._max_episode_steps
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,  # SET THIS TO TRUE, BEAR is a Q-learning algorithm
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): dummy_env = make_env(variant['env']) obs_dim = dummy_env.observation_space.low.size action_dim = dummy_env.action_space.low.size expl_env = VectorEnv([ lambda: make_env(variant['env']) for _ in range(variant['expl_env_num']) ]) expl_env.seed(variant["seed"]) expl_env.action_space.seed(variant["seed"]) eval_env = SubprocVectorEnv([ lambda: make_env(variant['env']) for _ in range(variant['eval_env_num']) ]) eval_env.seed(variant["seed"]) M = variant['layer_size'] num_quantiles = variant['num_quantiles'] zf1 = QuantileMlp( input_size=obs_dim + action_dim, output_size=1, num_quantiles=num_quantiles, hidden_sizes=[M, M], ) zf2 = QuantileMlp( input_size=obs_dim + action_dim, output_size=1, num_quantiles=num_quantiles, hidden_sizes=[M, M], ) target_zf1 = QuantileMlp( input_size=obs_dim + action_dim, output_size=1, num_quantiles=num_quantiles, hidden_sizes=[M, M], ) target_zf2 = QuantileMlp( input_size=obs_dim + action_dim, output_size=1, num_quantiles=num_quantiles, hidden_sizes=[M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M], ) eval_policy = MakeDeterministic(policy) target_policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M], ) # fraction proposal network fp = target_fp = None if variant['trainer_kwargs'].get('tau_type') == 'fqf': fp = FlattenMlp( input_size=obs_dim + action_dim, output_size=num_quantiles, hidden_sizes=[M // 2, M // 2], output_activation=softmax, ) target_fp = FlattenMlp( input_size=obs_dim + action_dim, output_size=num_quantiles, hidden_sizes=[M // 2, M // 2], output_activation=softmax, ) eval_path_collector = VecMdpPathCollector( eval_env, eval_policy, ) expl_path_collector = VecMdpStepCollector( expl_env, policy, ) replay_buffer = TorchReplayBuffer( variant['replay_buffer_size'], dummy_env, ) trainer = DSACTrainer( env=dummy_env, policy=policy, target_policy=target_policy, zf1=zf1, zf2=zf2, target_zf1=target_zf1, target_zf2=target_zf2, fp=fp, target_fp=target_fp, num_quantiles=num_quantiles, **variant['trainer_kwargs'], ) algorithm = TorchVecOnlineRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs'], ) algorithm.to(ptu.device) algorithm.train()
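# The fraction proposal networks (fp / target_fp) above are only built when the trainer
# is configured for FQF-style quantile fractions. A hypothetical trainer_kwargs fragment;
# the discount value is an illustrative assumption.
example_trainer_kwargs = dict(
    tau_type='fqf',   # any other value leaves fp and target_fp as None
    discount=0.99,
)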
def experiment(variant): # Or for a specific version (Daniel: doesn't work): # import gym # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1')) if 'Ant' in args.env: expl_env = NormalizedBoxEnv(AntEnv()) eval_env = NormalizedBoxEnv(AntEnv()) elif 'InvertedPendulum' in args.env: expl_env = NormalizedBoxEnv(InvertedPendulumEnv()) eval_env = NormalizedBoxEnv(InvertedPendulumEnv()) elif 'HalfCheetah' in args.env: expl_env = NormalizedBoxEnv(HalfCheetahEnv()) eval_env = NormalizedBoxEnv(HalfCheetahEnv()) elif 'Hopper' in args.env: expl_env = NormalizedBoxEnv(HopperEnv()) eval_env = NormalizedBoxEnv(HopperEnv()) elif 'Reacher' in args.env: expl_env = NormalizedBoxEnv(ReacherEnv()) eval_env = NormalizedBoxEnv(ReacherEnv()) elif 'Swimmer' in args.env: expl_env = NormalizedBoxEnv(SwimmerEnv()) eval_env = NormalizedBoxEnv(SwimmerEnv()) elif 'Walker2d' in args.env: expl_env = NormalizedBoxEnv(Walker2dEnv()) eval_env = NormalizedBoxEnv(Walker2dEnv()) else: raise ValueError(args.env) obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size qf = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs']) target_qf = copy.deepcopy(qf) target_policy = copy.deepcopy(policy) eval_path_collector = MdpPathCollector(eval_env, policy) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=OUStrategy(action_space=expl_env.action_space), policy=policy, ) expl_path_collector = MdpPathCollector(expl_env, exploration_policy) replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env) trainer = DDPGTrainer(qf=qf, target_qf=target_qf, policy=policy, target_policy=target_policy, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    # unwrap the TimeLimit wrapper since we manually terminate after 50 steps
    eval_env = gym.make('FetchReach-v1').env
    expl_env = gym.make('FetchReach-v1').env

    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    eval_policy = MakeDeterministic(policy)
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['sac_trainer_kwargs']
    )
    trainer = HERTrainer(trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
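# Hypothetical `variant` for the FetchReach HER + SAC setup above. The nested kwargs keys
# are assumptions chosen to match the constructors they are forwarded to; all values are
# illustrative only.
example_variant = dict(
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        fraction_goals_rollout_goals=0.2,
        fraction_goals_env_goals=0.0,
    ),
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    sac_trainer_kwargs=dict(discount=0.99, soft_target_tau=5e-3),
    algo_kwargs=dict(
        batch_size=256,
        num_epochs=100,
        max_path_length=50,   # matches the manual 50-step termination noted above
    ),
)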
def experiment(variant): from multiworld.envs.mujoco import register_mujoco_envs register_mujoco_envs() env_id = variant['env_id'] eval_env = gym.make(env_id) expl_env = gym.make(env_id) observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' eval_env.reward_type = variant['reward_type'] expl_env.reward_type = variant['reward_type'] achieved_goal_key = desired_goal_key.replace("desired", "achieved") es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, max_sigma=.2, min_sigma=.2, # constant sigma epsilon=.3, ) obs_dim = expl_env.observation_space.spaces['observation'].low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs']) target_policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs']) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(args, variant): #eval_env = gym.make('FetchReach-v1') #expl_env = gym.make('FetchReach-v1') core_env = env.DeepBuilderEnv(args.session_name, args.act_dim, args.box_dim, args.max_num_boxes, args.height_field_dim) eval_env = stuff.NormalizedActions(core_env) expl_env = stuff.NormalizedActions(core_env) obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size resumed = args.resume == 1 if resumed: variant, params = doc.load_rklit_file(args.session_name) variant['algorithm_kwargs']['min_num_steps_before_training'] = 0 M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) if not resumed else params['trainer/qf1'] qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) if not resumed else params['trainer/qf2'] target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) if not resumed else params['trainer/target_qf1'] target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) if not resumed else params['trainer/target_qf2'] policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M], ) if not resumed else params['trainer/policy'] eval_policy = MakeDeterministic( policy) if not resumed else params['evaluation/policy'] eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer_expl = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) replay_buffer_eval = EnvReplayBuffer( int(variant['replay_buffer_size'] * (float(args.num_plays_eval) / float(args.num_plays_expl))), eval_env, ) if resumed: replay_buffer_expl._actions = params['replay_buffer_expl/actions'] replay_buffer_expl._env_infos = params['replay_buffer_expl/env_infos'] replay_buffer_expl._next_obs = params['replay_buffer_expl/next_obs'] replay_buffer_expl._observations = params[ 'replay_buffer_expl/observations'] replay_buffer_expl._rewards = params['replay_buffer_expl/rewards'] replay_buffer_expl._size = params['replay_buffer_expl/size'] replay_buffer_expl._terminals = params['replay_buffer_expl/terminals'] replay_buffer_expl._top = params['replay_buffer_expl/top'] replay_buffer_eval._actions = params['replay_buffer_eval/actions'] replay_buffer_eval._env_infos = params['replay_buffer_eval/env_infos'] replay_buffer_eval._next_obs = params['replay_buffer_eval/next_obs'] replay_buffer_eval._observations = params[ 'replay_buffer_eval/observations'] replay_buffer_eval._rewards = params['replay_buffer_eval/rewards'] replay_buffer_eval._size = params['replay_buffer_eval/size'] replay_buffer_eval._terminals = params['replay_buffer_eval/terminals'] replay_buffer_eval._top = params['replay_buffer_eval/top'] elif args.replay_add_sess_name != '': _, other_params = doc.load_rklit_file(args.replay_add_sess_name) num_samples = int(args.replay_add_num_samples) replay_buffer_expl._size = 0 replay_buffer_expl._top = 0 print("Loading " + str(num_samples) + " batch samples from session " + args.replay_add_sess_name) zeroes = [] offset = 0 for i in range(num_samples): act = other_params['replay_buffer_expl/actions'][i] obs = other_params['replay_buffer_expl/observations'][i] if act.min() == 0.0 and act.max() == 0.0 and obs.min( ) == 0.0 and obs.max() == 0.0: zeroes.append(i) continue replay_buffer_expl._actions[offset] = copy.deepcopy(act.tolist()) replay_buffer_expl._next_obs[offset] = copy.deepcopy( 
other_params['replay_buffer_expl/next_obs'][i].tolist()) replay_buffer_expl._observations[offset] = copy.deepcopy( obs.tolist()) replay_buffer_expl._rewards[offset] = copy.deepcopy( other_params['replay_buffer_expl/rewards'][i].tolist()) replay_buffer_expl._terminals[offset] = copy.deepcopy( other_params['replay_buffer_expl/terminals'][i].tolist()) replay_buffer_expl._size += 1 replay_buffer_expl._top += 1 offset += 1 print( "Detected and ignored " + str(len(zeroes)) + " zero samples in replay buffer. Total num samples loaded into replay buffer: " + str(replay_buffer_expl._size)) other_params = {} trainer = SACTrainer( env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs'], starting_train_steps=0 if not resumed else (params['replay_buffer_expl/top'] * variant['algorithm_kwargs']['num_trains_per_train_loop']), ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer_eval=replay_buffer_eval, replay_buffer_expl=replay_buffer_expl, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): # create multi-task environment and sample tasks env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])) tasks = env.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) reward_dim = 1 # instantiate networks latent_dim = variant['latent_size'] context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[ 'algo_params'][ 'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder hidden_sizes = [200, 200, 200] if variant['algo_params']['snail']: encoder_model = SnailEncoder hidden_sizes = [20] context_encoder = encoder_model( hidden_sizes=hidden_sizes, input_size=context_encoder_input_dim, output_size=context_encoder_output_dim, ) context_encoder.use_next_obs_in_context = variant['algo_params'][ 'use_next_obs_in_context'] qf1 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) policy = PEARLTanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params']) qf1_exp = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + context_encoder_output_dim, output_size=1, ) qf2_exp = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + action_dim + context_encoder_output_dim, output_size=1, ) vf_exp = FlattenMlp( hidden_sizes=[net_size, net_size, net_size], input_size=obs_dim + context_encoder_output_dim, output_size=1, ) policy_exp = PEARLTanhGaussianPolicy( hidden_sizes=[net_size, net_size, net_size], obs_dim=obs_dim + context_encoder_output_dim, action_dim=action_dim, latent_dim=latent_dim) agent_exp = ExpAgent(latent_dim, context_encoder, policy_exp, **variant['algo_params']) algorithm = ExpSAC(env=env, train_tasks=list(tasks[:variant['n_train_tasks']]), eval_tasks=list(tasks[-variant['n_eval_tasks']:]), nets=[agent, qf1, qf2, vf], nets_exp=[agent_exp, qf1_exp, qf2_exp, vf_exp], encoder=context_encoder, **variant['algo_params']) # optionally load pre-trained weights if variant['path_to_weights'] is not None: path = variant['path_to_weights'] context_encoder.load_state_dict( torch.load(os.path.join(path, 'context_encoder.pth'))) qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth'))) qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth'))) vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth'))) # TODO hacky, revisit after model refactor algorithm.networks[-6].load_state_dict( torch.load(os.path.join(path, 'target_vf.pth'))) policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth'))) # optional GPU mode ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id']) if ptu.gpu_enabled(): device = torch.device('cuda:0') print(device) algorithm.to(device) context_encoder.to(device) # debugging triggers a lot of printing and logs to a debug directory 
DEBUG = variant['util_params']['debug'] os.environ['DEBUG'] = str(int(DEBUG)) # create logging directory # TODO support Docker exp_id = 'debug' if DEBUG else None experiment_log_dir = setup_logger( variant['env_name'], variant=variant, exp_id=exp_id, base_log_dir=variant['util_params']['base_log_dir']) # optionally save eval trajectories as pkl files if variant['algo_params']['dump_eval_paths']: pickle_dir = experiment_log_dir + '/eval_trajectories' pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True) # run the algorithm algorithm.train()
def experiment(variant): with open('expert_demos_listing.yaml', 'r') as f: listings = yaml.load(f.read()) expert_demos_path = listings[variant['expert_name']]['file_paths'][ variant['expert_idx']] buffer_save_dict = joblib.load(expert_demos_path) expert_replay_buffer = buffer_save_dict['train'] if 'minmax_env_with_demo_stats' in variant.keys(): if variant['minmax_env_with_demo_stats']: assert 'norm_train' in buffer_save_dict.keys() expert_replay_buffer = buffer_save_dict['norm_train'] env_specs = variant['env_specs'] env = get_env(env_specs) env.seed(env_specs['eval_env_seed']) training_env = get_env(env_specs) training_env.seed(env_specs['training_env_seed']) print('\n\nEnv: {}'.format(env_specs['env_name'])) print('kwargs: {}'.format(env_specs['env_kwargs'])) print('Obs Space: {}'.format(env.observation_space)) print('Act Space: {}\n\n'.format(env.action_space)) if variant['scale_env_with_demo_stats']: env = ScaledEnv( env, obs_mean=buffer_save_dict['obs_mean'], obs_std=buffer_save_dict['obs_std'], acts_mean=buffer_save_dict['acts_mean'], acts_std=buffer_save_dict['acts_std'], ) training_env = ScaledEnv( training_env, obs_mean=buffer_save_dict['obs_mean'], obs_std=buffer_save_dict['obs_std'], acts_mean=buffer_save_dict['acts_mean'], acts_std=buffer_save_dict['acts_std'], ) elif variant['minmax_env_with_demo_stats']: env = MinmaxEnv( env, obs_min=buffer_save_dict['obs_min'], obs_max=buffer_save_dict['obs_max'], ) training_env = MinmaxEnv( training_env, obs_min=buffer_save_dict['obs_min'], obs_max=buffer_save_dict['obs_max'], ) obs_space = env.observation_space act_space = env.action_space assert not isinstance(obs_space, Dict) assert len(obs_space.shape) == 1 assert len(act_space.shape) == 1 obs_dim = obs_space.shape[0] action_dim = act_space.shape[0] # build the policy models net_size = variant['policy_net_size'] num_hidden = variant['policy_num_hidden_layers'] qf1 = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim + action_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim, output_size=1, ) policy = ReparamTanhMultivariateGaussianPolicy( hidden_sizes=num_hidden * [net_size], obs_dim=obs_dim, action_dim=action_dim, ) # build the critic model critic_model = MLPDisc(variant['policy_net_size'], num_layer_blocks=variant['critic_num_blocks'], hid_dim=variant['critic_hid_dim'], hid_act=variant['critic_hid_act'], use_bn=variant['critic_use_bn']) algorithm = BC(env=env, training_env=training_env, exploration_policy=policy, critic=critic_model, expert_replay_buffer=expert_replay_buffer, **variant['adp_bc_params']) if ptu.gpu_enabled(): algorithm.to(ptu.device) algorithm.train() return 1
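# The listing file read at the top of the function above maps an expert name to demo
# files; each file is a joblib archive holding at least a 'train' buffer (and optionally
# 'norm_train' plus obs/act statistics). A hypothetical expert_demos_listing.yaml entry,
# with made-up names and paths:
#
#   hopper_expert:
#     file_paths:
#       - /path/to/hopper_demos.pkl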
def skewfit_experiment(variant, other_variant): import rlkit.torch.pytorch_util as ptu from rlkit.data_management.online_vae_replay_buffer import \ OnlineVaeRelabelingBuffer from rlkit.torch.networks import FlattenMlp from rlkit.torch.sac.policies import TanhGaussianPolicy from rlkit.torch.vae.vae_trainer import ConvVAETrainer skewfit_preprocess_variant(variant) env = get_envs(variant) uniform_dataset_fn = variant.get('generate_uniform_dataset_fn', None) if uniform_dataset_fn: uniform_dataset = uniform_dataset_fn( **variant['generate_uniform_dataset_kwargs']) else: uniform_dataset = None observation_key = variant.get('observation_key', 'latent_observation') desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal') achieved_goal_key = desired_goal_key.replace("desired", "achieved") obs_dim = (env.observation_space.spaces[observation_key].low.size + env.observation_space.spaces[desired_goal_key].low.size) action_dim = env.action_space.low.size hidden_sizes = variant.get('hidden_sizes', [400, 300]) qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes, ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes, ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes, ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes, ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=hidden_sizes, ) vae = env.vae replay_buffer = OnlineVaeRelabelingBuffer( automatic_policy_schedule=other_variant, vae=env.vae, env=env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs']) vae_trainer = ConvVAETrainer(variant['vae_train_data'], variant['vae_test_data'], env.vae, **variant['online_vae_trainer_kwargs'], mode='online_vae') assert 'vae_training_schedule' not in variant, "Just put it in algo_kwargs" max_path_length = variant['max_path_length'] trainer = SACTrainer(automatic_policy_schedule=other_variant, env=env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['twin_sac_trainer_kwargs']) trainer = HERTrainer(trainer) eval_path_collector = VAEWrappedEnvPathCollector( variant['evaluation_goal_sampling_mode'], env, MakeDeterministic(policy), max_path_length, other_variant=other_variant, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = VAEWrappedEnvPathCollector( variant['exploration_goal_sampling_mode'], env, policy, max_path_length, other_variant=other_variant, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = OnlineVaeAlgorithm( automatic_policy_schedule=other_variant, trainer=trainer, exploration_env=env, evaluation_env=env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, vae=vae, vae_trainer=vae_trainer, uniform_dataset=uniform_dataset, max_path_length=max_path_length, **variant['algo_kwargs']) if variant['custom_goal_sampler'] == 'replay_buffer': env.custom_goal_sampler = replay_buffer.sample_buffer_goals algorithm.to(ptu.device) vae.to(ptu.device) algorithm.train()
def experiment(variant): with open('expert_demos_listing.yaml', 'r') as f: listings = yaml.load(f.read()) demos_path = listings[variant['expert_name']]['file_paths'][ variant['expert_idx']] print(demos_path) buffer_save_dict = joblib.load(demos_path) target_state_buffer = buffer_save_dict['data'] # target_state_buffer /= variant['rescale'] state_indices = torch.LongTensor(variant['state_indices']) env_specs = variant['env_specs'] env = get_env(env_specs) env.seed(env_specs['eval_env_seed']) training_env = get_env(env_specs) training_env.seed(env_specs['training_env_seed']) print('\n\nEnv: {}'.format(env_specs['env_name'])) print('kwargs: {}'.format(env_specs['env_kwargs'])) print('Obs Space: {}'.format(env.observation_space)) print('Act Space: {}\n\n'.format(env.action_space)) obs_space = env.observation_space act_space = env.action_space assert not isinstance(obs_space, Dict) assert len(obs_space.shape) == 1 assert len(act_space.shape) == 1 obs_dim = obs_space.shape[0] action_dim = act_space.shape[0] # build the policy models net_size = variant['policy_net_size'] num_hidden = variant['policy_num_hidden_layers'] qf1 = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim + action_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim, output_size=1, ) policy = ReparamTanhMultivariateGaussianPolicy( hidden_sizes=num_hidden * [net_size], obs_dim=obs_dim, action_dim=action_dim, ) # build the energy model if variant['ebil_params']['mode'] == 'deen': """ ebm_model = MLPEBM( obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim, num_layer_blocks=variant['ebm_num_blocks'], hid_dim=variant['ebm_hid_dim'], hid_act=variant['ebm_hid_act'], use_bn=variant['ebm_use_bn'], clamp_magnitude=variant['ebm_clamp_magnitude'], ) """ ebm_exp_name = 'ebm-deen-' + variant['env_specs'][ 'env_name'] + '-' + str( variant['expert_traj_num']) + '-train--sigma-' + str( variant['ebm_sigma']) ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name) load_ebm_dir = ebm_dir load_epoch = variant['ebm_epoch'] load_name = 'itr_{}.pkl'.format(load_epoch) if load_epoch == 'best': load_name = 'best.pkl' load_ebm_path = os.path.join(load_ebm_dir, load_name) load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r') ebm_model = load_ebm_pkl['ebm'] else: raise NotImplementedError # Test if variant['test']: batch_data = target_state_buffer / variant['rescale'] obs = torch.Tensor(batch_data[:1000]).to(ptu.device) print("Not expert data", ebm_model(obs * 200).mean().item()) print("Expert data", ebm_model(obs).mean().item()) exit(1) # set up the algorithm trainer = SoftActorCritic(policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['sac_params']) algorithm = EBIL(env=env, training_env=training_env, exploration_policy=policy, rew_func=variant['rew_func'], cons=variant['cons'], rescale=variant['rescale'], ebm=ebm_model, policy_trainer=trainer, target_state_buffer=target_state_buffer, state_indices=state_indices, **variant['ebil_params']) if ptu.gpu_enabled(): algorithm.to(ptu.device) algorithm.train() return 1
def experiment(variant): # create multi-task environment and sample tasks env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])) env_eval = NormalizedBoxEnv( ENVS[variant['env_name']](**variant['env_params2'])) tasks = env.get_all_task_idx() tasks_eval = env_eval.get_all_task_idx() obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) # instantiate networks latent_dim = variant['latent_size'] context_encoder = latent_dim * 2 if variant['algo_params'][ 'use_information_bottleneck'] else latent_dim reward_dim = 1 net_size = variant['net_size'] recurrent = variant['algo_params']['recurrent'] encoder_model = RecurrentEncoder if recurrent else MlpEncoder context_encoder = encoder_model( hidden_sizes=[400, 400, 400], input_size=obs_dim + action_dim + reward_dim, output_size=context_encoder, ) qf1 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) #qnetwork1 qf2 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + latent_dim, output_size=1, ) #qnetwork2 vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + latent_dim, output_size=1, ) #qnetwork3? policy = TanhGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + latent_dim, latent_dim=latent_dim, action_dim=action_dim, ) #actornetwork agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params']) algorithm = PEARLSoftActorCritic(env=env, env_eval=env_eval, train_tasks=list(tasks), eval_tasks=list(tasks_eval), nets=[agent, qf1, qf2, vf], latent_dim=latent_dim, **variant['algo_params']) # optionally load pre-trained weights if variant['path_to_weights'] is not None: path = variant['path_to_weights'] context_encoder.load_state_dict( torch.load(os.path.join(path, 'context_encoder.pth'))) qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth'))) qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth'))) vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth'))) # TODO hacky, revisit after model refactor algorithm.networks[-2].load_state_dict( torch.load(os.path.join(path, 'target_vf.pth'))) policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth'))) # optional GPU mode ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id']) if ptu.gpu_enabled(): algorithm.to() # debugging triggers a lot of printing and logs to a debug directory DEBUG = variant['util_params']['debug'] os.environ['DEBUG'] = str(int(DEBUG)) # create logging directory # TODO support Docker exp_id = 'debug' if DEBUG else None experiment_log_dir = setup_logger( variant['env_name'], variant=variant, exp_id=exp_id, base_log_dir=variant['util_params']['base_log_dir']) # optionally save eval trajectories as pkl files if variant['algo_params']['dump_eval_paths']: pickle_dir = experiment_log_dir + '/eval_trajectories' pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True) # run the algorithm set_seed_everywhere(variant['random_seed']) algorithm.train()
def experiment(variant): eval_env = gym.make( variant['env_name'], **{ "headless": variant["headless"], "verbose": variant["verbose"] }) eval_env.seed(variant['seed']) expl_env = eval_env obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant['layer_size'] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M, M], ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) expl_path_collector = MdpPathCollector( expl_env, policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) dataset = get_dataset(variant["h5path"], eval_env) load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, eval_both=True, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): env = gym.make('RLkitUR-v0')._start_ros_services() eval_env = gym.make('RLkitUR-v0') expl_env = gym.make('RLkitUR-v0') eval_env = NormalizedBoxEnv(eval_env) expl_env = NormalizedBoxEnv(expl_env) obs_dim = expl_env.observation_space.low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs']) policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs']) target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs']) es = GaussianStrategy( action_space=expl_env.action_space, max_sigma=0.1, min_sigma=0.1, # Constant sigma ) exploration_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) eval_path_collector = MdpPathCollector( eval_env, policy, ) expl_path_collector = MdpPathCollector( expl_env, exploration_policy, ) replay_buffer = EnvReplayBuffer( variant['replay_buffer_size'], expl_env, ) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs']) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algorithm_kwargs']) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): num_agent = variant['num_agent'] from sequential_differential_game import SequentialDifferentialGame expl_env = SequentialDifferentialGame(**variant['env_kwargs']) eval_env = SequentialDifferentialGame(**variant['env_kwargs']) obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.low.size qf1_n, qf2_n, cactor_n, policy_n = [], [], [], [] target_qf1_n, target_qf2_n, target_policy_n = [], [], [] expl_policy_n, eval_policy_n = [], [] log_alpha_n, log_calpha_n = [], [] for i in range(num_agent): from rlkit.torch.networks import FlattenMlp qf1 = FlattenMlp( input_size=(obs_dim*num_agent+action_dim*num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2, ) target_qf1 = copy.deepcopy(qf1) qf2 = FlattenMlp( input_size=(obs_dim*num_agent+action_dim*num_agent), output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2, ) target_qf2 = copy.deepcopy(qf2) from rlkit.torch.layers import SplitLayer cactor = nn.Sequential( nn.Linear((obs_dim*num_agent+action_dim*(num_agent-1)),variant['cactor_kwargs']['hidden_dim']), nn.ReLU(), nn.Linear(variant['cactor_kwargs']['hidden_dim'],variant['cactor_kwargs']['hidden_dim']), nn.ReLU(), SplitLayer(layers=[nn.Linear(variant['cactor_kwargs']['hidden_dim'],action_dim), nn.Linear(variant['cactor_kwargs']['hidden_dim'],action_dim)]) ) from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy cactor = TanhGaussianPolicy(module=cactor) policy = nn.Sequential( nn.Linear(obs_dim,variant['policy_kwargs']['hidden_dim']), nn.ReLU(), nn.Linear(variant['policy_kwargs']['hidden_dim'],variant['policy_kwargs']['hidden_dim']), nn.ReLU(), SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim), nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)]) ) policy = TanhGaussianPolicy(module=policy) target_policy = copy.deepcopy(policy) from rlkit.torch.policies.make_deterministic import MakeDeterministic eval_policy = MakeDeterministic(policy) from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy if variant['random_exploration']: from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0), policy=policy, ) else: expl_policy = policy qf1_n.append(qf1) qf2_n.append(qf2) cactor_n.append(cactor) policy_n.append(policy) target_qf1_n.append(target_qf1) target_qf2_n.append(target_qf2) target_policy_n.append(target_policy) expl_policy_n.append(expl_policy) eval_policy_n.append(eval_policy) if variant['trainer_kwargs']['state_dependent_alpha']: log_alpha = FlattenMlp( input_size=obs_dim*num_agent, output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2, ) log_calpha = FlattenMlp( input_size=obs_dim*num_agent, output_size=1, hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2, ) log_alpha_n.append(log_alpha) log_calpha_n.append(log_calpha) from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n) expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n) from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent) from rlkit.torch.prg.prg import PRGTrainer trainer = PRGTrainer( env=expl_env, qf1_n=qf1_n, target_qf1_n=target_qf1_n, qf2_n = qf2_n, target_qf2_n = target_qf2_n, policy_n=policy_n, 
target_policy_n=target_policy_n, cactor_n=cactor_n, log_alpha_n=log_alpha_n, log_calpha_n=log_calpha_n, **variant['trainer_kwargs'] ) from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, log_path_function=get_generic_ma_path_information, **variant['algorithm_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): import multiworld multiworld.register_all_envs() eval_env = gym.make("SawyerReachXYZEnv-v0") expl_env = gym.make("SawyerReachXYZEnv-v0") observation_key = "state_observation" desired_goal_key = "state_desired_goal" achieved_goal_key = desired_goal_key.replace("desired", "achieved") es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, max_sigma=0.2, min_sigma=0.2, # constant sigma epsilon=0.3, ) obs_dim = expl_env.observation_space.spaces["observation"].low.size goal_dim = expl_env.observation_space.spaces["desired_goal"].low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant["qf_kwargs"]) qf2 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant["qf_kwargs"]) target_qf1 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant["qf_kwargs"]) target_qf2 = FlattenMlp(input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant["qf_kwargs"]) policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant["policy_kwargs"]) target_policy = TanhMlpPolicy(input_size=obs_dim + goal_dim, output_size=action_dim, **variant["policy_kwargs"]) expl_policy = PolicyWrappedWithExplorationStrategy(exploration_strategy=es, policy=policy) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant["replay_buffer_kwargs"]) trainer = TD3Trainer(policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant["trainer_kwargs"]) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algo_kwargs"]) algorithm.to(ptu.device) algorithm.train()
def experiment(variant): expl_env = gym.make(variant["env_name"]) eval_env = expl_env obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.low.size M = variant["layer_size"] qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M], ) target_qf1 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[ M, M, ], ) target_qf2 = FlattenMlp( input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[ M, M, ], ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[ M, M, ], # Making it easier to visualize ) # behavior_policy = TanhGaussianPolicy( # obs_dim=obs_dim, # action_dim=action_dim, # hidden_sizes=[M, M], # ) eval_policy = MakeDeterministic(policy) eval_path_collector = MdpPathCollector( eval_env, eval_policy, sparse_reward=False, target_goal=eval_env.unwrapped.wrapped_env.target_goal, ) expl_path_collector = MdpPathCollector( expl_env, policy, sparse_reward=False, target_goal=eval_env.unwrapped.wrapped_env.target_goal, ) replay_buffer = EnvReplayBuffer( variant["replay_buffer_size"], expl_env, with_per=False, ) if variant["load_buffer"]: load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer) trainer = SACTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, behavior_policy=None, **variant["trainer_kwargs"]) print(variant["algorithm_kwargs"]) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, batch_rl=variant["load_buffer"], **variant["algorithm_kwargs"]) algorithm.to(ptu.device) print("training!") algorithm.train()
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = CustomMDPPathCollector(eval_env)
    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    elif 'random-expert' in variant['env_name']:
        load_hdf5(d4rl.basic_dataset(eval_env), replay_buffer)
    else:
        load_hdf5(d4rl.qlearning_dataset(eval_env), replay_buffer)
    trainer = CQLTrainer(env=eval_env, policy=policy, qf1=qf1, qf2=qf2,
                         target_qf1=target_qf1, target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
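# --- Illustrative usage (assumption, not part of the original script) -------
# A minimal sketch of a `variant` for the CQL-on-d4rl launcher above. The task
# id and hyperparameter values are placeholders; only the keys mirror what the
# function reads.
example_cql_variant = dict(
    env_name='hopper-medium-v0',         # any d4rl task id
    layer_size=256,
    buffer_filename=None,                # falls through to d4rl.qlearning_dataset
    load_buffer=True,                    # also sets batch_rl=True above
    replay_buffer_size=int(2e6),
    trainer_kwargs=dict(discount=0.99),  # CQL-specific kwargs would also go here
    algorithm_kwargs=dict(
        num_epochs=1000,
        batch_size=256,
        max_path_length=1000,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=0,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=0,
    ),
)
# experiment(example_cql_variant)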
def experiment(variant):
    env = NormalizedBoxEnv(create_swingup())
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(Continuous_MountainCarEnv())
    # env = DIAYNWrappedEnv(NormalizedBoxEnv(HumanoidEnv()))
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    skill_dim = 0  # 50
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = variant['net_size']
    qf1 = FlattenMlp(hidden_sizes=[net_size, net_size],
                     input_size=obs_dim + skill_dim + action_dim, output_size=1)
    qf2 = FlattenMlp(hidden_sizes=[net_size, net_size],
                     input_size=obs_dim + skill_dim + action_dim, output_size=1)
    rf = FlattenMlp(hidden_sizes=[16, 16],
                    input_size=obs_dim + skill_dim, output_size=16)
    pf = FlattenMlp(hidden_sizes=[16, 16, 16],
                    input_size=obs_dim + skill_dim, output_size=16)
    vf = FlattenMlp(hidden_sizes=[net_size, net_size],
                    input_size=obs_dim + skill_dim, output_size=1)
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + skill_dim,
        action_dim=action_dim,
        # k=4,
    )
    disc = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=skill_dim if skill_dim > 0 else 1,
    )
    algorithm = RNDSoftActorCritic(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        rf=rf,
        pf=pf,
        vf=vf,
        # disc=disc,
        # skill_dim=skill_dim,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from rlkit.envs.zmq_env import ZMQEnv
    expl_env = ZMQEnv(variant['port'])
    eval_env = expl_env
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    qf_n, qf2_n, cactor_n, policy_n, target_qf_n, target_qf2_n, \
        target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent),
                        output_size=1, **variant['qf_kwargs'])
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent),
                         output_size=1, **variant['qf_kwargs'])
        cactor = GumbelSoftmaxMlpPolicy(
            input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)),
            output_size=action_dim, **variant['cactor_kwargs'])
        policy = GumbelSoftmaxMlpPolicy(input_size=obs_dim, output_size=action_dim,
                                        **variant['policy_kwargs'])
        target_qf = copy.deepcopy(qf)
        target_qf2 = copy.deepcopy(qf2)
        target_policy = copy.deepcopy(policy)
        eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        qf_n.append(qf)
        qf2_n.append(qf2)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_qf2_n.append(target_qf2)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                      num_agent=num_agent)
    trainer = PRGTrainer(env=expl_env, qf_n=qf_n, target_qf_n=target_qf_n,
                         qf2_n=qf2_n, target_qf2_n=target_qf2_n,
                         policy_n=policy_n, target_policy_n=target_policy_n,
                         cactor_n=cactor_n, **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    eval_env = gym.make(variant['env_name'])
    expl_env = eval_env
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim, action_dim=action_dim, hidden_sizes=[M, M])
    vae_policy = VAEPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[750, 750],
        latent_dim=action_dim * 2,
    )
    eval_path_collector = CustomMDPPathCollector(eval_env)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    load_hdf5(eval_env.unwrapped.get_dataset(), replay_buffer,
              max_size=variant['replay_buffer_size'])
    # Note: `args` is assumed to be a module-level argparse namespace that
    # provides `pre_model` and `env`.
    trainer = MUSATTrainer(
        pre_model=args.pre_model,
        env_name=args.env,
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        vae=vae_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        batch_rl=True,
        q_learning_alg=True,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    net_size = variant['net_size']
    num_hidden = variant['num_hidden_layers']
    qf1 = FlattenMlp(hidden_sizes=num_hidden * [net_size],
                     input_size=obs_dim + action_dim, output_size=1)
    qf2 = FlattenMlp(hidden_sizes=num_hidden * [net_size],
                     input_size=obs_dim + action_dim, output_size=1)
    vf = FlattenMlp(hidden_sizes=num_hidden * [net_size],
                    input_size=obs_dim, output_size=1)
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    trainer = SoftActorCritic(policy=policy, qf1=qf1, qf2=qf2, vf=vf,
                              **variant['sac_params'])
    algorithm = TorchRLAlgorithm(
        trainer=trainer,
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        **variant['rl_alg_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
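# --- Illustrative usage (assumption, not part of the original script) -------
# A rough sketch of the `variant` consumed by the SAC launcher above. The
# env_specs keys follow the ones the function seeds and prints; the values and
# the contents of sac_params / rl_alg_params are assumptions, not taken from
# any real config.
example_sac_variant = dict(
    env_specs=dict(
        env_name='halfcheetah',
        env_kwargs={},
        eval_env_seed=78236,
        training_env_seed=24495,
    ),
    net_size=256,
    num_hidden_layers=2,
    sac_params=dict(discount=0.99, reward_scale=5.0),
    rl_alg_params=dict(num_epochs=300, num_steps_per_epoch=1000, batch_size=256),
)
# experiment(example_sac_variant)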
def experiment(variant):
    domain = variant['domain']
    seed = variant['seed']
    exp_mode = variant['exp_mode']
    max_path_length = variant['algo_params']['max_path_length']
    bcq_interactions = variant['bcq_interactions']
    num_tasks = variant['num_tasks']

    filename = f'./goals/{domain}-{exp_mode}-goals.pkl'
    idx_list, train_goals, wd_goals, ood_goals = pickle.load(open(filename, 'rb'))
    idx_list = idx_list[:num_tasks]

    sub_buffer_dir = (f"buffers/{domain}/{exp_mode}/max_path_length_{max_path_length}/"
                      f"interactions_{bcq_interactions}k/seed_{seed}")
    buffer_dir = os.path.join(variant['data_models_root'], sub_buffer_dir)
    print("Buffer directory: " + buffer_dir)

    # Load buffer
    bcq_buffers = []
    buffer_loader_id_list = []
    for i, idx in enumerate(idx_list):
        bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl'
        filename = os.path.join(buffer_dir, bname)
        rp_buffer = ReplayBuffer.remote(
            index=i,
            seed=seed,
            num_trans_context=variant['num_trans_context'],
            in_mdp_batch_size=variant['in_mdp_batch_size'],
        )
        buffer_loader_id_list.append(rp_buffer.load_from_gzip.remote(filename))
        bcq_buffers.append(rp_buffer)
    ray.get(buffer_loader_id_list)
    assert len(bcq_buffers) == len(idx_list)
    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers)

    set_seed(variant['seed'])

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], seed=0)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    if variant['algo_params']['use_next_obs_in_context']:
        context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim
    else:
        context_encoder_input_dim = obs_dim + action_dim + reward_dim
    if variant['algo_params']['use_information_bottleneck']:
        context_encoder_output_dim = latent_dim * 2
    else:
        context_encoder_output_dim = latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy, **variant['algo_params'])
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_goals=train_goals,
        wd_goals=wd_goals,
        ood_goals=ood_goals,
        replay_buffers=train_buffer,
        nets=[agent, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params']
    )

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['domain'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'],
    )

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
def experiment(variant):
    expert_buffer = joblib.load(variant['xy_data_path'])['xy_data']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        assert False
        assert not env_specs['normalized']
        # Note: this branch is unreachable (guarded by `assert False`) and
        # relies on an `extra_data` dict of demo statistics that is not
        # defined in this function.
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(np.prod(
                        env.observation_space.spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1)
    qf2 = FlattenMlp(hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1)
    target_qf1 = FlattenMlp(hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1)
    target_qf2 = FlattenMlp(hidden_sizes=hidden_sizes, input_size=obs_dim + action_dim, output_size=1)
    policy = ReparamTanhMultivariateGaussianPolicy(
        # policy = ReparamMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        # std=0.1
    )

    # set up the discriminator models
    disc_model_class = ThreeWayResNetAIRLDisc if variant['threeway'] else ResNetAIRLDisc
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'],
    )
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the RL algorithm used to train the policy
    policy_optimizer = EntConstSAC(policy=policy, qf1=qf1, qf2=qf2,
                                   target_qf1=target_qf1, target_qf2=target_qf2,
                                   action_dim=action_dim, **variant['policy_params'])

    # set up the AIRL algorithm
    alg_class = ThreewayStateMarginalMatchingAlg if variant['threeway'] else StateMarginalMatchingAlg
    algorithm = alg_class(env, policy, disc_model, policy_optimizer, expert_buffer,
                          training_env=training_env, **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    intrinsic_reward = variant['intrinsic_reward']

    # Create environment.
    num_skills = (variant['smm_kwargs']['num_skills']
                  if variant['intrinsic_reward'] == 'smm' else 0)
    env, training_env = create_env(variant['env_id'], variant['env_kwargs'], num_skills)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Initialize networks.
    net_size = variant['net_size']
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    hidden_sizes=[net_size, net_size], output_size=1)
    vf = FlattenMlp(input_size=obs_dim,
                    hidden_sizes=[net_size, net_size], output_size=1)
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                hidden_sizes=[net_size, net_size],
                                action_dim=action_dim)
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )

    if intrinsic_reward == 'smm':
        discriminator = FlattenMlp(input_size=obs_dim - num_skills,
                                   hidden_sizes=[net_size, net_size],
                                   output_size=num_skills)
        density_model = VAEDensity(input_size=obs_dim, num_skills=num_skills,
                                   code_dim=128, **variant['vae_density_kwargs'])
        # Overwrite appropriate functions of algorithm.
        smm_algorithm_hook = SMMHook(base_algorithm=algorithm,
                                     discriminator=discriminator,
                                     density_model=density_model,
                                     **variant['smm_kwargs'])
    elif intrinsic_reward == 'icm':
        embedding_model = FlattenMlp(input_size=obs_dim,
                                     hidden_sizes=[net_size, net_size],
                                     output_size=net_size)
        forward_model = FlattenMlp(input_size=net_size + action_dim,
                                   hidden_sizes=[net_size, net_size],
                                   output_size=net_size)
        inverse_model = FlattenMlp(input_size=net_size + net_size,
                                   hidden_sizes=[],
                                   output_size=action_dim)
        # Overwrite appropriate functions of algorithm.
        ICMHook(base_algorithm=algorithm,
                embedding_model=embedding_model,
                forward_model=forward_model,
                inverse_model=inverse_model,
                **variant['icm_kwargs'])
    elif intrinsic_reward == 'count':
        count_algorithm_hook = CountHook(base_algorithm=algorithm, **variant['count_kwargs'])
    elif intrinsic_reward == 'pseudocount':
        density_model = VAEDensity(input_size=obs_dim, num_skills=0, code_dim=128,
                                   **variant['vae_density_kwargs'])
        # Overwrite appropriate functions of algorithm.
        PseudocountHook(base_algorithm=algorithm,
                        density_model=density_model,
                        **variant['pseudocount_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
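# --- Illustrative usage (assumption, not part of the original script) -------
# A sketch showing how the `intrinsic_reward` switch above might be driven.
# The keys mirror what the function reads; the env id, values, and the exact
# contents of the per-hook kwargs are placeholders.
example_smm_variant = dict(
    intrinsic_reward='smm',    # one of: 'smm', 'icm', 'count', 'pseudocount'
    env_id='ManipulationEnv',  # placeholder environment id for create_env
    env_kwargs={},
    net_size=300,
    algo_kwargs=dict(num_epochs=100, batch_size=128),
    smm_kwargs=dict(num_skills=4),
    vae_density_kwargs=dict(),
    # icm_kwargs=dict(), count_kwargs=dict(), pseudocount_kwargs=dict(),
)
# experiment(example_smm_variant)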