def create_env(env_id, env_kwargs, num_skills=0):
    if env_id == 'ManipulationEnv':
        env = NormalizedBoxEnv(ManipulationEnv(**env_kwargs))
        training_env = NormalizedBoxEnv(ManipulationEnv(**env_kwargs))
    elif env_id == 'StarEnv':
        env = NormalizedBoxEnv(StarEnv(**env_kwargs))
        training_env = NormalizedBoxEnv(StarEnv(**env_kwargs))
    elif env_id == 'PointEnv':
        env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
        training_env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
    elif env_id == 'PointEnv_evolve':
        env = NormalizedBoxEnv(PointEnv_SMM_evolution(**env_kwargs))
        training_env = NormalizedBoxEnv(PointEnv_SMM_evolution(**env_kwargs))
    elif env_id == 'ant_goal':
        env = NormalizedBoxEnv(AntGoalEnv_SMM(**env_kwargs))
        training_env = NormalizedBoxEnv(AntGoalEnv_SMM(**env_kwargs))
    else:
        raise NotImplementedError('Unrecognized environment:', env_id)

    # Append skill to observation vector.
    if num_skills > 0:
        env = AugmentedBoxObservationShapeEnv(env, num_skills)
        training_env = AugmentedBoxObservationShapeEnv(training_env, num_skills)
    return env, training_env

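# Hedged usage sketch for create_env above; the env id, kwargs, and skill count
# are illustrative assumptions, not values from the original project.
env, training_env = create_env(
    env_id='PointEnv',   # one of the ids handled above
    env_kwargs={},       # forwarded unchanged to the wrapped env constructor
    num_skills=4,        # > 0 appends the skill dimensions to the observation space
)
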
def simulate_policy(args):
    data = torch.load(str(args.file))
    # data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if args.collect:
        data = []
    for trial in tqdm(range(100)):
        path = rollout(
            env,
            policy,
            max_path_length=args.H + 1,
            render=not args.collect,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.collect:
            data.append([path['actions'], path['next_observations']])
    if args.collect:
        import pickle
        with open("data/expert.pkl", mode='wb') as f:
            pickle.dump(data, f)

def simulate_policy(args):
    # data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(Mani2dEnv())
    # env.reset()
    # print(env.step(env.action_space.sample()))
    # sys.exit()
    # env = env.wrapped_env.unwrapped
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        # policy.cuda()
    # import cv2
    # video = cv2.VideoWriter('diayn_bipedal_walker_hardcore.avi',
    #                         cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 30, (1200, 800))
    index = 0
    for skill in range(policy.stochastic_policy.skill_dim):
        print(skill)
        for _ in range(3):
            path = rollout(
                env,
                policy,
                skill,
                max_path_length=args.H,
                render=True,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()

def simulate_policy(args):
    # data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = cv2.VideoWriter('ppo_test.avi',
                            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                            30, (640, 480))
    index = 0
    path = rollout(
        env,
        policy,
        max_path_length=args.H,
        render=True,
    )
    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics([path])
    logger.dump_tabular()
    for i, img in enumerate(path['images']):
        print(i)
        video.write(img[:, :, ::-1].astype(np.uint8))
        cv2.imwrite("frames/ppo_test/%06d.png" % index, img[:, :, ::-1])
        index += 1
    video.release()
    print("wrote video")

def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.cuda()
        print("set gpu")
    print(ptu.device)
    config_file = get_config_file(args.config_file)
    env = NormalizedBoxEnv(
        load_env(args, config_file, args.env_mode, ptu.device.index))
    print("Policy loaded")
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()

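# The simulate_policy variants above all read attributes off an ``args`` namespace.
# A minimal, assumed argparse front end (flag names inferred from the attribute
# accesses above; defaults are illustrative, not from the original scripts):
import argparse


def parse_sim_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='path to the saved snapshot (torch.load-able)')
    parser.add_argument('--H', type=int, default=300,
                        help='max rollout length')
    parser.add_argument('--gpu', action='store_true',
                        help='run the policy on GPU')
    parser.add_argument('--collect', action='store_true',
                        help='save rollouts, e.g. to data/expert.pkl')
    return parser.parse_args()

# e.g. simulate_policy(parse_sim_args())
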
def get_meta_env(env_specs):
    base_env_name = env_specs['base_env_name']
    env_dict = meta_envs[base_env_name]
    meta_train_env = env_dict['meta_train']()
    meta_test_env = env_dict['meta_test']()

    if env_specs['need_pixels']:
        if env_dict['info']['is_dmcs_env']:
            meta_train_env = pixels.Wrapper(
                meta_train_env,
                pixels_only=False,
                render_kwargs=env_specs['render_kwargs'])
            meta_test_env = pixels.Wrapper(
                meta_test_env,
                pixels_only=False,
                render_kwargs=env_specs['render_kwargs'])
        else:
            raise NotImplementedError()

    # If it's a dmcs env, we need to wrap it to look like a gym env.
    if env_dict['info']['is_dmcs_env']:
        meta_train_env = DmControlWrapper(meta_train_env)
        meta_test_env = DmControlWrapper(meta_test_env)

    if env_specs['normalized']:
        meta_train_env = NormalizedBoxEnv(meta_train_env)
        meta_test_env = NormalizedBoxEnv(meta_test_env)

    return meta_train_env, meta_test_env

def experiment(variant):
    torch.autograd.set_detect_anomaly(True)
    # expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    # expl_env = NormalizedBoxEnv(PendulumEnv())
    # eval_env = NormalizedBoxEnv(PendulumEnv())
    expl_env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    eval_env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    # expl_env = NormalizedBoxEnv(gym.make("LunarLanderContinuous-v2"))
    # eval_env = NormalizedBoxEnv(gym.make("LunarLanderContinuous-v2"))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_step_collector = PPOMdpPathCollector(
        eval_env,
        eval_policy,
        calculate_advantages=False,
    )
    expl_step_collector = PPOMdpPathCollector(
        expl_env,
        policy,
        calculate_advantages=True,
        vf=vf,
        gae_lambda=0.97,
        discount=0.995,
    )
    replay_buffer = PPOEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = PPOTrainer(
        env=eval_env,
        policy=policy,
        vf=vf,
        **variant['trainer_kwargs']
    )
    algorithm = PPOTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_step_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(PointEnv(**variant['task_params']))
    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = 5
    task_enc_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim
    reward_dim = 1

    net_size = variant['net_size']
    # start with linear task encoding
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    task_enc = encoder_model(
        hidden_sizes=[200, 200, 200],  # deeper net + higher dim space generalize better
        input_size=obs_dim + action_dim + reward_dim,
        output_size=task_enc_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = ProtoAgent(
        latent_dim,
        [task_enc, policy, qf1, qf2, vf],
        **variant['algo_params']
    )

    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-20]),
        eval_tasks=list(tasks[-20:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.to()
    algorithm.train()

def experiment(variant):
    '''
    1. Build the experiment environments (eval, expl).
    2. Determine input/output dimensions and build the qf and policy networks.
    3. Copy the target qf and target policy networks.
    4. Build a path collector for evaluation.
    5. For training, build the exploration policy, path collector, and replay buffer.
    6. Build the DDPGTrainer (qf, policy).
    7. Build the algorithm (trainer, envs, replay buffer, path collectors, plus the evaluation parts).
    8. Start training.
    :param variant: config parameters
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    # Copies used as target networks.
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # Evaluation path collector.
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # Exploration (exploration strategy, path collector, replay buffer).
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # Move parameters to the target device.
    algorithm.to(ptu.device)
    algorithm.train()

def main(goal_idx=0, args=args):
    variant = default_config
    if args.config:
        with open(os.path.join(args.config)) as f:
            exp_params = json.load(f)
        variant = deep_update_dict(exp_params, variant)
    # variant['util_params']['gpu_id'] = gpu
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    env.reset_task(goal_idx)
    experiment(env=env, goal_idx=goal_idx)

def experiment(variant):
    torch.autograd.set_detect_anomaly(True)
    # expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(gym.make(str(args.env)))
    eval_env = NormalizedBoxEnv(gym.make(str(args.env)))
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    skill_dim = 10

    M = variant['layer_size']
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    worker = RandomSkillTanhGaussianPolicy(obs_dim=obs_dim + skill_dim,
                                           action_dim=action_dim,
                                           hidden_sizes=[M, M])
    torch.save(worker, "data/random_policy_params.pkl")
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=skill_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_step_collector = DirichletManagerPPOMdpPathCollector(
        eval_env,
        eval_policy,
        worker,
        calculate_advantages=False)
    expl_step_collector = DirichletManagerPPOMdpPathCollector(
        expl_env,
        policy,
        worker,
        calculate_advantages=True,
        vf=vf,
        gae_lambda=0.97,
        discount=0.995,
    )
    replay_buffer = ManagerPPOEnvReplayBuffer(variant['replay_buffer_size'],
                                              expl_env,
                                              skill_dim=skill_dim)
    trainer = PPOTrainer(env=eval_env,
                         policy=policy,
                         vf=vf,
                         **variant['trainer_kwargs'])
    algorithm = PPOTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_step_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def create_env(env_id, env_kwargs, num_skills=0):
    if env_id == 'PointEnv':
        env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
        training_env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
    else:
        raise NotImplementedError('Unrecognized environment:', env_id)

    # Append skill to observation vector.
    if num_skills > 0:
        env = AugmentedBoxObservationShapeEnv(env, num_skills)
        training_env = AugmentedBoxObservationShapeEnv(training_env, num_skills)
    return env, training_env

def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant["policy_kwargs"])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es, policy=policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    torch.autograd.set_detect_anomaly(True)
    # expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(Mani2dEnv())
    eval_env = NormalizedBoxEnv(Mani2dEnv())
    obs_dim = expl_env.observation_space.low.size

    worker = torch.load(str(args.worker))['trainer/policy']
    skill_dim = worker.skill_dim

    M = variant['layer_size']
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = DiscretePolicy(
        obs_dim=obs_dim,
        action_dim=skill_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_step_collector = ManagerPPOMdpPathCollector(
        eval_env,
        eval_policy,
        worker,
        calculate_advantages=False)
    expl_step_collector = ManagerPPOMdpPathCollector(
        expl_env,
        policy,
        worker,
        calculate_advantages=True,
        vf=vf,
        gae_lambda=0.97,
        discount=0.995,
    )
    replay_buffer = ManagerPPOEnvReplayBuffer(variant['replay_buffer_size'],
                                              expl_env,
                                              skill_dim=skill_dim)
    trainer = DiscretePPOTrainer(env=eval_env,
                                 policy=policy,
                                 vf=vf,
                                 **variant['trainer_kwargs'])
    algorithm = PPOTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_step_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    es = GaussianStrategy(action_space=env.action_space,
                          **variant['es_kwargs'])
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(CartpoleSwingupSparseEnv())
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    heads = 5

    net_size = variant['net_size']
    qf1 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    pqf1 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    pqf2 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[1],
        input_size=obs_dim,
        output_size=1,
    )
    policy = MultiTanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
        heads=heads,
    )
    algorithm = BigThompsonSoftActorCritic(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        pqf1=pqf1,
        pqf2=pqf2,
        prior_coef=10,
        vf=vf,
        # disc=disc,
        # skill_dim=skill_dim,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]
    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(gym.make(variant['env']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = BetaVirel(env=env,
                          policy=policy,
                          qf=qf,
                          vf=vf,
                          **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def example(variant):
    env = CartpoleEnv()
    env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def build_PEARL_envs(seed, env_name, params=None):
    '''
    Build env from PEARL
    '''
    from rlkit.envs import ENVS
    from rlkit.envs.wrappers import NormalizedBoxEnv

    if env_name == 'ant-dir':
        env_params = {
            'n_tasks': params.n_tasks,
            'randomize_tasks': params.randomize_tasks,
            # "low_gear": params.low_gear,
            "forward_backward": params.forward_backward,
        }
    elif env_name == 'ant-goal':
        env_params = {
            'n_tasks': params.n_tasks,
            'randomize_tasks': params.randomize_tasks,
            # "low_gear": params.low_gear,
        }
    else:
        env_params = {
            'n_tasks': params.n_tasks,
            'randomize_tasks': params.randomize_tasks,
        }
    env = ENVS[env_name](**env_params)
    env.seed(seed)
    env = NormalizedBoxEnv(env)
    env.action_space.np_random.seed(seed)
    return env

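# Hedged usage sketch for build_PEARL_envs above. The params object only needs the
# attributes read inside the function; the values here are illustrative assumptions.
from argparse import Namespace

pearl_params = Namespace(n_tasks=2, randomize_tasks=True, forward_backward=True)
env = build_PEARL_envs(seed=0, env_name='ant-dir', params=pearl_params)
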
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    env = NormalizedBoxEnv(create_swingup())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                   output_size=1,
                   **variant['qf_kwargs'])
    vf = ConcatMlp(input_size=obs_dim + goal_dim,
                   output_size=1,
                   **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    replay_buffer = SimpleHerReplayBuffer(env=env,
                                          **variant['replay_buffer_kwargs'])
    algorithm = HerSac(env=env,
                       policy=policy,
                       qf=qf,
                       vf=vf,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['vf_params'])
    vf = FlattenMlp(input_size=obs_dim,
                    output_size=1,
                    **variant['vf_params'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_params'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(env,
                      qf=qf,
                      vf=vf,
                      policy=policy,
                      exploration_policy=exploration_policy,
                      **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def make_env(name):
    env = gym.make(name)
    # Remove TimeLimit wrapper
    if isinstance(env, TimeLimit):
        env = env.unwrapped
    env = CustomInfoEnv(env)
    env = NormalizedBoxEnv(env)
    return env

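# Minimal usage sketch for make_env above; the env id is an illustrative assumption
# (any Box-action gym env registered in this project should work the same way).
env = make_env('HalfCheetah-v2')
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
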
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

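# Illustrative variant for the DDPG experiment above, assembled only from the keys
# the function actually reads (qf_kwargs, policy_kwargs, replay_buffer_size,
# trainer_kwargs, algorithm_kwargs). All hyperparameter values are assumptions for
# demonstration, not the original configuration.
variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=256,
    ),
)
# experiment(variant)
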
def experiment(variant):
    env = NormalizedBoxEnv(gym.make(args.env_name))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    algo_map = dict(
        sac=dict(
            algo=SoftActorCritic,
            network=dict(
                policy=TanhGaussianPolicy(
                    hidden_sizes=[net_size, net_size],
                    obs_dim=obs_dim,
                    action_dim=action_dim,
                ),
                qf=FlattenMlp(
                    hidden_sizes=[net_size, net_size],
                    input_size=obs_dim + action_dim,
                    output_size=1,
                ),
                vf=FlattenMlp(
                    hidden_sizes=[net_size, net_size],
                    input_size=obs_dim,
                    output_size=1,
                ),
            ),
        ),
        tsac=dict(
            algo=TwinSAC,
            network=dict(
                policy=TanhGaussianPolicy(
                    hidden_sizes=[net_size, net_size],
                    obs_dim=obs_dim,
                    action_dim=action_dim,
                ),
                qf1=FlattenMlp(
                    hidden_sizes=[net_size, net_size],
                    input_size=obs_dim + action_dim,
                    output_size=1,
                ),
                qf2=FlattenMlp(
                    hidden_sizes=[net_size, net_size],
                    input_size=obs_dim + action_dim,
                    output_size=1,
                ),
                vf=FlattenMlp(
                    hidden_sizes=[net_size, net_size],
                    input_size=obs_dim,
                    output_size=1,
                ),
            ),
        ),
    )
    algo_type = algo_map[args.algo]
    algorithm = algo_type['algo'](env=env,
                                  **algo_type['network'],
                                  **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()