def run_experiment(self):
    all_imgs = []
    # Bug fix: the original referenced a bare `env`; this method only has
    # access to the environment through `self.env`.
    policy = OUStrategy(self.env.action_space)
    for i in range(self.num_episodes):
        state = self.env.reset()
        # 6912 = 3 * 48 * 48 flattened image pixels.
        img = ptu.from_numpy(state['image_observation']).view(1, 6912)
        latent_state = self.vae.encode(img)[0]
        true_curr = state['image_observation'] * 255.0
        all_imgs.append(ptu.from_numpy(true_curr).view(3, 48, 48))
        actions = []
        # Roll out a random-action episode, recording the true images.
        for j in range(self.episode_len):
            u = policy.get_action_from_raw_action(
                self.env.action_space.sample())
            actions.append(u)
            state = self.env.step(u)[0]
            true_curr = state['image_observation'] * 255.0
            all_imgs.append(ptu.from_numpy(true_curr).view(3, 48, 48))
        # Replay the same actions through the learned latent dynamics and
        # decode the predicted images for side-by-side comparison.
        pred_curr = self.vae.decode(latent_state)[0] * 255.0
        all_imgs.append(pred_curr.view(3, 48, 48))
        for j in range(self.episode_len):
            u = ptu.from_numpy(actions[j]).view(1, 2)
            latent_state = self.vae.process_dynamics(latent_state, u)
            pred_curr = self.vae.decode(latent_state)[0] * 255.0
            all_imgs.append(pred_curr.view(3, 48, 48))
    all_imgs = torch.stack(all_imgs)
    save_image(
        all_imgs.data,
        "/home/khazatsky/rail/data/rail-khazatsky/sasha/dynamics_visualizer/dynamics.png",
        nrow=self.episode_len + 1,
    )

def generate_vae_dataset(
        N=10000, test_p=0.9, use_cached=True, imsize=84, show=False,
        dataset_path=None,
):
    filename = "/tmp/sawyer_push_new_easy_wider2_" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        env = SawyerPushXYEasyEnv(hide_goal=True)
        env = ImageMujocoEnv(
            env,
            imsize,
            transpose=True,
            init_camera=sawyer_init_camera_zoomed_in,
            # init_camera=sawyer_init_camera,
            normalize=True,
        )
        info['env'] = env
        policy = OUStrategy(env.action_space)
        dataset = np.zeros((N, imsize * imsize * 3))
        for i in range(N):
            # env.reset()
            if i % 100 == 0:
                g = env.sample_goal_for_rollout()
                env.set_goal(g)
                policy.reset()
            u = policy.get_action_from_raw_action(env.action_space.sample())
            img = env.step(u)[0]
            dataset[i, :] = img
            if show:
                # env.render()
                cv2.imshow('img', img.reshape(3, 84, 84).transpose())
                cv2.waitKey(1)
        print("done making training data", filename, time.time() - now)
        np.save(filename, dataset)

    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info

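# A minimal usage sketch for generate_vae_dataset above (the values are
# hypothetical). Note that, despite its name, test_p is effectively the
# *training* fraction: the first int(N * test_p) rows become the training set.
#
# train_data, test_data, info = generate_vae_dataset(N=1000, use_cached=False)
# print(train_data.shape)  # (900, 84 * 84 * 3)
# print(test_data.shape)   # (100, 84 * 84 * 3)
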
def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]
    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def generate_vae_dataset(
        N=10000, test_p=0.9, use_cached=False, imsize=84, show=False,
        dataset_path=None, env_class=SawyerReachTorqueEnv, env_kwargs=None,
        init_camera=sawyer_torque_reacher_camera,
):
    filename = "/tmp/sawyer_torque_data" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        if env_kwargs is None:
            env_kwargs = dict()
        env = env_class(**env_kwargs)
        env = ImageEnv(
            env,
            imsize,
            transpose=True,
            init_camera=init_camera,
            normalize=True,
        )
        info['env'] = env
        policy = RandomPolicy(env.action_space)
        es = OUStrategy(action_space=env.action_space, theta=0)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )
        dataset = np.zeros((N, imsize * imsize * 3), dtype=np.uint8)
        for i in range(N):
            if i % 50 == 0:
                print('Reset')
                env.reset_model()
                exploration_policy.reset()
            for _ in range(1):
                # Scale down the torque actions before stepping.
                action = exploration_policy.get_action()[0] * 1 / 10
                env.wrapped_env.step(action)
            img = env._get_flat_img()
            dataset[i, :] = unormalize_image(img)
            if show:
                cv2.imshow('img', img.reshape(3, 84, 84).transpose())
                cv2.waitKey(1)
            print(i)
        print("done making training data", time.time() - now)
        np.save(filename, dataset)

    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info

def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['vf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_params']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_params']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(
        env,
        qf=qf,
        vf=vf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def example(variant):
    env = CartpoleEnv()
    env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def get_ddpg(evaluation_environment, parameters):
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(
            action_space=evaluation_environment.action_space),
        policy=policy,
    )
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **parameters['trainer_params']
    )
    return exploration_policy, policy, trainer

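# A minimal usage sketch for get_ddpg, assuming a Gym-style continuous-control
# environment; `make_env` and the concrete hyperparameter values below are
# hypothetical placeholders.
#
# env = make_env()
# parameters = dict(
#     hidden_sizes_qf=[400, 300],
#     hidden_sizes_policy=[400, 300],
#     trainer_params=dict(discount=0.99),
# )
# exploration_policy, policy, trainer = get_ddpg(env, parameters)
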
def run_linear_ocm_exp(variant):
    from rlkit.tf.ddpg import DDPG
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from rlkit.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']
    algo_params = variant['algo_params']
    # Bug fix: memory_dim was previously undefined; it is assumed to come
    # from the variant dict, as in run_linear_ocm_exp elsewhere.
    memory_dim = variant['memory_dim']

    set_seed(seed)
    onehot_dim = num_values + 1
    env_action_dim = num_values + 1

    """
    Code for running the experiment.
    """
    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    # env = FlattenedProductBox(env)

    # qf = FeedForwardCritic(
    #     name_or_scope="critic",
    #     env_spec=env.spec,
    # )
    qf = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = ActionAwareMemoryPolicy(
        name_or_scope="noisy_policy",
        action_dim=env_action_dim,
        memory_dim=memory_dim,
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec)
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )
    algorithm.train()

def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    env = NormalizedBoxEnv(create_swingup())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    # env = NormalizedBoxEnv(env)
    # tdm_normalizer = TdmNormalizer(
    #     env,
    #     vectorized=True,
    #     max_tau=variant['algo_kwargs']['tdm_kwargs']['max_tau'],
    # )
    tdm_normalizer = None
    qf = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = variant['algo_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def get_exploration_strategy(variant, env):
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy

    exploration_type = variant['exploration_type']
    exploration_noise = variant.get('exploration_noise', 0.1)
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=exploration_noise,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    return es

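# For reference, a minimal self-contained sketch of the Ornstein-Uhlenbeck
# process that OUStrategy samples its action noise from. This is an
# illustration only, not the library implementation: rlkit's OUStrategy
# additionally clips the noisy action to the action space and anneals sigma
# from max_sigma to min_sigma.
import numpy as np

def ou_noise_step(state, theta=0.15, mu=0.0, sigma=0.3):
    """One Euler step of dx = theta * (mu - x) dt + sigma dW, with dt = 1."""
    return state + theta * (mu - state) + sigma * np.random.randn(*state.shape)

noise = np.zeros(2)  # one noise component per action dimension
for _ in range(3):
    noise = ou_noise_step(noise)  # temporally correlated, mean-reverting noise
    # noisy_action = np.clip(action + noise, low, high)  # what OUStrategy adds
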
def experiment(variant):
    # env = HalfCheetahEnv()
    # env = PointEnv()
    env = gym_env("Pendulum-v0")
    # env = HopperEnv()
    horizon = variant['algo_params']['max_path_length']
    env = TimeLimitedEnv(env, horizon)
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = MultiStepDdpg(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    algorithm.train()
    return algorithm.final_score

def generate_vae_dataset(
        N=10000, test_p=0.9, use_cached=True, imsize=84, show=False,
        dataset_path=None, env_class=None, env_kwargs=None,
        init_camera=sawyer_door_env_camera,
):
    filename = "/tmp/sawyer_door_push_open_and_reach" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        env = env_class(**env_kwargs)
        env = ImageEnv(
            env,
            imsize,
            transpose=True,
            init_camera=init_camera,
            normalize=True,
        )
        oracle_sampled_data = int(N / 2)
        dataset = np.zeros((N, imsize * imsize * 3))
        # First half: set the env directly to sampled goals and render.
        print('Goal Space Sampling')
        for i in range(oracle_sampled_data):
            goal = env.sample_goal()
            env.set_to_goal(goal)
            img = env._get_flat_img()
            dataset[i, :] = img
            if show:
                cv2.imshow('img', img.reshape(3, 84, 84).transpose())
                cv2.waitKey(1)
            print(i)
        env._wrapped_env.min_y_pos = .6
        policy = RandomPolicy(env.action_space)
        es = OUStrategy(action_space=env.action_space, theta=0)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )
        # Second half: random exploration rollouts.
        print('Random Sampling')
        for i in range(oracle_sampled_data, N):
            if i % 20 == 0:
                env.reset()
                exploration_policy.reset()
            for _ in range(10):
                action = exploration_policy.get_action()[0]
                env.wrapped_env.step(action)
            img = env._get_flat_img()
            dataset[i, :] = img
            if show:
                cv2.imshow('img', img.reshape(3, 84, 84).transpose())
                cv2.waitKey(1)
            print(i)
    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info

def tdm_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    tdm_normalizer = None
    qf1 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    qf2 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['td3_kwargs']['qf_criterion'] = qf_criterion
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()

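# A hypothetical variant skeleton for tdm_td3_experiment, inferred from the
# keys the function reads; the class names and values are placeholders, and
# the exact nested algo_kwargs structure should be checked against TdmTd3.
#
# variant = dict(
#     env_class=SomeMultitaskEnv,  # hypothetical
#     env_kwargs=dict(),
#     qf_kwargs=dict(hidden_sizes=[300, 300]),
#     policy_kwargs=dict(hidden_sizes=[300, 300]),
#     exploration_type='ou',
#     replay_buffer_class=HerReplayBuffer,
#     replay_buffer_kwargs=dict(max_size=int(1E6)),
#     qf_criterion_class=HuberLoss,
#     algo_kwargs=dict(
#         td3_kwargs=dict(),
#         tdm_kwargs=dict(max_tau=25),
#     ),
# )
# tdm_td3_experiment(variant)
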
def run_linear_ocm_exp(variant):
    from rlkit.tf.ddpg import DDPG
    from rlkit.envs.flattened_product_box import FlattenedProductBox
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.tf.policies.nn_policy import FeedForwardPolicy
    from rlkit.qfunctions.nn_qfunction import FeedForwardCritic
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from rlkit.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    memory_dim = variant['memory_dim']
    ou_params = variant['ou_params']

    set_seed(seed)

    """
    Code for running the experiment.
    """
    env = env_class(**env_params)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=memory_dim,
    )
    env = FlattenedProductBox(env)

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(
        env_spec=env.spec,
        **ou_params
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )
    algorithm.train()

def generate_goal_data_set(env=None, num_goals=1000, use_cached_dataset=False,
                           action_scale=1 / 10):
    if use_cached_dataset and osp.isfile(
            '/tmp/goals' + str(num_goals) + '.npy'):
        goal_dict = np.load('/tmp/goals' + str(num_goals) + '.npy').item()
        print("loaded data from saved file")
        return goal_dict
    cached_goal_keys = [
        'latent_desired_goal',
        'image_desired_goal',
        'state_desired_goal',
        'joint_desired_goal',
    ]
    goal_sizes = [
        env.observation_space.spaces['latent_desired_goal'].low.size,
        env.observation_space.spaces['image_desired_goal'].low.size,
        env.observation_space.spaces['state_desired_goal'].low.size,
        7,
    ]
    observation_keys = [
        'latent_observation',
        'image_observation',
        'state_observation',
        'state_observation',
    ]
    goal_generation_dict = dict()
    for goal_key, goal_size, obs_key in zip(
            cached_goal_keys,
            goal_sizes,
            observation_keys,
    ):
        goal_generation_dict[goal_key] = [goal_size, obs_key]

    goal_dict = dict()
    policy = RandomPolicy(env.action_space)
    es = OUStrategy(action_space=env.action_space, theta=0)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    for goal_key in goal_generation_dict:
        goal_size, obs_key = goal_generation_dict[goal_key]
        goal_dict[goal_key] = np.zeros((num_goals, goal_size))
    print('Generating Random Goals')
    for i in range(num_goals):
        if i % 50 == 0:
            print('Reset')
            env.reset_model()
            exploration_policy.reset()
        action = exploration_policy.get_action()[0] * action_scale
        obs, _, _, _ = env.step(action)
        print(i)
        for goal_key in goal_generation_dict:
            goal_size, obs_key = goal_generation_dict[goal_key]
            goal_dict[goal_key][i, :] = obs[obs_key]
    np.save('/tmp/goals' + str(num_goals) + '.npy', goal_dict)
    return goal_dict

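# A minimal usage sketch for generate_goal_data_set, assuming `env` is a
# multitask image env exposing the dict observation keys referenced above;
# the values are hypothetical.
#
# goal_dict = generate_goal_data_set(env=env, num_goals=100)
# goal_dict['image_desired_goal'].shape  # (100, image goal size)
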
def experiment(variant):
    """
    1. Build the experiment environments (eval, expl).
    2. Determine input/output dimensions and build the qf and policy networks.
    3. Deep-copy them to get the target qf and target policy networks.
    4. Build a path collector for evaluation.
    5. For exploration, build the exploration strategy, path collector,
       and replay buffer.
    6. Build the DDPGTrainer (qf, policy).
    7. Build the algorithm (trainer, envs, replay buffer, path collectors,
       and the evaluation components).
    8. Start training.

    :param variant: config parameters
    :return:
    """
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    # Target networks are deep copies of the originals.
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # Evaluation
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # Exploration (strategy, path collection, replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    # Move all networks to the configured device.
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    if variant['multitask']:
        env = CylinderXYPusher2DEnv(**variant['env_kwargs'])
        env = MultitaskToFlatEnv(env)
    else:
        env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    # For HER, the Q-functions and policy are conditioned on the goal too.
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(GoalXYPosAndVelAnt(
        goal_dim_weights=[0.1, 0.1, 0.9, 0.9],
        speed_weight=None,
    ))
    max_tau = variant['tdm_kwargs']['max_tau']
    # Normalizer isn't used unless you set num_pretrain_paths > 0
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=max_tau,
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        norm_order=1,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        max_size=int(1E6),
    )
    algorithm = TemporalDifferenceModel(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        qf_criterion=HuberLoss(),
        tdm_normalizer=tdm_normalizer,
        **variant['tdm_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_params']
    )
    algo_class = variant['algo_class']
    algo_params = variant['algo_params']
    hidden_size = variant['hidden_size']
    if algo_class == DDPG:
        # algo_params.pop('naf_policy_learning_rate')
        qf = FeedForwardQFunction(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        algorithm = DDPG(
            env,
            exploration_strategy=es,
            qf=qf,
            policy=policy,
            **variant['algo_params']
        )
    elif algo_class == NAF:
        algo_params.pop('qf_learning_rate')
        # algo_params.pop('policy_learning_rate')
        qf = NafPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
        )
        algorithm = NAF(
            env,
            policy=qf,
            exploration_strategy=es,
            **variant['algo_params']
        )
    else:
        raise Exception("Invalid algo class: {}".format(algo_class))
    algorithm.to(ptu.device)
    algorithm.train()

def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = NafPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
    )
    algorithm = NAF(
        env,
        naf_policy=qf,
        exploration_strategy=es,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    tdm_normalizer = None
    qf1 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    qf2 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['td3_kwargs']['qf_criterion'] = qf_criterion
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    policy = NafPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['policy_params']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = NAF(
        env,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        # policy=policy,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        render_eval_paths=True,
        plotter=plotter,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

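# A minimal variant dict for the experiment above. The values are hypothetical
# defaults; the exact keys accepted by DDPGTrainer and TorchBatchRLAlgorithm
# should be checked against the rlkit version in use.
#
# variant = dict(
#     qf_kwargs=dict(hidden_sizes=[400, 300]),
#     policy_kwargs=dict(hidden_sizes=[400, 300]),
#     trainer_kwargs=dict(discount=0.99),
#     replay_buffer_size=int(1E6),
#     algorithm_kwargs=dict(
#         num_epochs=100,
#         num_eval_steps_per_epoch=1000,
#         num_trains_per_train_loop=1000,
#         num_expl_steps_per_train_loop=1000,
#         min_num_steps_before_training=1000,
#         max_path_length=1000,
#         batch_size=128,
#     ),
# )
# experiment(variant)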