def experiment(variant):
    """Train TD3 on the env class named in ``variant`` with Gaussian exploration."""
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Twin critics over the concatenated (observation, action) input.
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    strategy = GaussianStrategy(
        action_space=env.action_space,
        **variant['es_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train DDPG on the physical replab arm (ROS-backed environment)."""
    # Robot
    env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    env = NormalizedBoxEnv(env)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Single critic: DDPG uses one Q-function.
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    strategy = GaussianStrategy(action_space=env.action_space)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def get_exploration_strategy(variant, env):
    """Build the exploration strategy selected by ``variant['exploration_type']``.

    ``variant.get('exploration_noise', 0.1)`` sets the (constant) noise level.
    Raises ``Exception`` for an unrecognized type.
    """
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy

    exploration_type = variant['exploration_type']
    noise = variant.get('exploration_noise', 0.1)
    if exploration_type == 'ou':
        # max_sigma == min_sigma keeps sigma constant over training.
        return OUStrategy(
            action_space=env.action_space,
            max_sigma=noise,
            min_sigma=noise,
        )
    if exploration_type == 'gaussian':
        return GaussianStrategy(
            action_space=env.action_space,
            max_sigma=noise,
            min_sigma=noise,
        )
    if exploration_type == 'epsilon':
        return EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=noise,
        )
    raise Exception("Invalid type: " + exploration_type)
def get_td3pg(evaluation_environment, parameters):
    """Construct a TD3 trainer and its policies.

    :param evaluation_environment: env supplying observation/action spaces
    :param parameters: dict with ``hidden_sizes_qf``, ``hidden_sizes_policy``
        and ``trainer_params``
    :return: (exploration_policy, policy, trainer)
    """
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size

    def make_critic():
        # Critic over the concatenated (obs, action) input.
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=parameters['hidden_sizes_qf'],
        )

    def make_actor():
        return TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=parameters['hidden_sizes_policy'],
        )

    qf1, qf2 = make_critic(), make_critic()
    target_qf1, target_qf2 = make_critic(), make_critic()
    policy = make_actor()
    target_policy = make_actor()

    strategy = GaussianStrategy(
        action_space=evaluation_environment.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **parameters['trainer_params'],
    )
    return exploration_policy, policy, trainer
def tdm_td3_experiment(variant):
    """Train TDM-TD3 on the env class named in ``variant``."""
    env = variant['env_class'](**variant['env_kwargs'])
    tdm_normalizer = None  # normalization disabled in this configuration

    # Vectorized twin TDM critics plus a goal-conditioned policy.
    qf1 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs'],
    )
    qf2 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs'],
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs'],
    )

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        strategy = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        strategy = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        strategy = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )

    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs'],
    )

    # Thread the criterion/normalizer through the nested algo kwargs.
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['td3_kwargs']['qf_criterion'] = variant['qf_criterion_class']()
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer

    algorithm = TdmTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_kwargs,
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 on the 2D pusher, optionally multitask and normalized."""
    if variant['multitask']:
        env = MultitaskToFlatEnv(CylinderXYPusher2DEnv(**variant['env_kwargs']))
    else:
        env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        strategy = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        strategy = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        strategy = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    """Train TD3 on a multitask env flattened to a plain gym interface."""
    env = MultitaskToFlatEnv(variant['env_class'](**variant['env_kwargs']))
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        strategy = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        strategy = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        strategy = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train HER-TD3 on a goal-conditioned env (flat obs + goal vector)."""
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        strategy = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        strategy = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        strategy = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim

    # Critics see (obs, action, goal); the policy sees (obs, goal).
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs'],
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 on HalfCheetah with the modern trainer/collector API."""
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    def make_critic():
        # Critic over the concatenated (obs, action) input.
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            **variant["qf_kwargs"],
        )

    def make_actor():
        return TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant["policy_kwargs"],
        )

    qf1, qf2 = make_critic(), make_critic()
    target_qf1, target_qf2 = make_critic(), make_critic()
    policy = make_actor()
    target_policy = make_actor()

    strategy = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    # Evaluation uses the deterministic policy; exploration adds noise.
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant["trainer_kwargs"],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 on a remote environment acquired from a Farmer pool."""
    # Local logging destinations.
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')

    # Acquire one remote env and pull its spaces before wrapping.
    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[256, 256],
    )
    strategy = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs'],
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    """Train TD3 on the physical replab arm with widened action bounds."""
    # Robot
    env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    # SIM
    # env = gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    # Widen the raw action bounds 10x before normalization.
    env.action_space.low *= 10
    env.action_space.high *= 10
    env = NormalizedBoxEnv(env)

    strategy = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant, env=None):
    """Train DDPG on ``env`` (defaults to a normalized Hopper).

    :param variant: dict with ``algo_params`` for the DDPG constructor
    :param env: optional pre-built environment; a fresh
        ``NormalizedBoxEnv(HopperEnv())`` is created when ``None``
    """
    if env is None:
        # default setting of environment
        env = NormalizedBoxEnv(HopperEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    # Fix: the original also built an identical, never-used second critic
    # (DDPG takes a single Q-function); the dead network has been removed.
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def get_exploration_strategy(variant, env):
    """Instantiate the exploration strategy named by ``variant``.

    Strategy-specific options come from ``variant.get('es_kwargs', {})``.
    Raises ``Exception`` for an unrecognized type.
    """
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
    from rlkit.exploration_strategies.gaussian_and_epislon import \
        GaussianAndEpislonStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.exploration_strategies.noop import NoopStrategy

    exploration_type = variant['exploration_type']
    es_kwargs = variant.get('es_kwargs', {})
    if exploration_type == 'ou':
        return OUStrategy(action_space=env.action_space, **es_kwargs)
    if exploration_type == 'gaussian':
        return GaussianStrategy(action_space=env.action_space, **es_kwargs)
    if exploration_type == 'epsilon':
        return EpsilonGreedy(action_space=env.action_space, **es_kwargs)
    if exploration_type == 'gaussian_and_epsilon':
        return GaussianAndEpislonStrategy(
            action_space=env.action_space,
            **es_kwargs,
        )
    if exploration_type == 'noop':
        return NoopStrategy(action_space=env.action_space)
    raise Exception("Invalid type: " + exploration_type)
def experiment(variant):
    """Train TD3 on a normalized Hopper with constant Gaussian noise."""
    env = NormalizedBoxEnv(HopperEnv())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    strategy = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    """Train HER-TD3 with an optional ensemble of extra critics."""
    env = variant['env_class'](**variant['env_kwargs'])

    # Dict-observation keys come from the HER sub-config; the achieved-goal
    # key mirrors the desired-goal key by naming convention.
    her_kwargs = variant['algo_kwargs']['her_kwargs']
    observation_key = her_kwargs['observation_key']
    desired_goal_key = her_kwargs['desired_goal_key']
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'],
    )

    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        strategy = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        strategy = GaussianStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'epsilon':
        strategy = EpsilonGreedy(action_space=env.action_space, **variant['es_kwargs'])
    else:
        raise Exception("Invalid type: " + exploration_type)

    def make_critic():
        # Critic over the concatenated (obs, action, goal) input.
        return ConcatMlp(
            input_size=obs_dim + action_dim + goal_dim,
            output_size=1,
            **variant['qf_kwargs'],
        )

    qf1 = make_critic()
    qf2 = make_critic()
    # Optional extra critics used for exploration bonuses.
    ensemble_qs = [
        make_critic() for _ in range(variant.get("num_ensemble_qs", 0))
    ]
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    render = variant.get("render", False)
    algorithm = HerExplorationTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        render=render,
        render_during_eval=render,
        ensemble_qs=ensemble_qs,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 with CNN critics/policy on an image-observation env.

    :param variant: dict with env name, domain-randomization flag ``dr``,
        observation transform ``t_fn``, noise type, network kwargs, replay
        buffer size, and trainer/algorithm kwargs
    """
    expl_env = envs[variant['env']](variant['dr'])
    expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation'])
    t_fn = variant["t_fn"]
    expl_env = TransformObservationWrapper(expl_env, t_fn)
    action_dim = expl_env.action_space.low.size

    # Shared convolutional trunk configuration for all networks.
    conv_args = {
        "input_width": 16,
        "input_height": 16,
        "input_channels": 8,
        "kernel_sizes": [4],
        "n_channels": [32],
        "strides": [4],
        "paddings": [0],
        "hidden_sizes": [1024, 512],
        "batch_norm_conv": False,
        "batch_norm_fc": False,
        'init_w': 1e-4,
        "hidden_init": nn.init.orthogonal_,
        "hidden_activation": nn.ReLU(),
    }
    qf1 = FlattenCNN(
        output_size=1,
        added_fc_input_size=action_dim,
        **variant['qf_kwargs'],
        **conv_args,
    )
    qf2 = FlattenCNN(
        output_size=1,
        added_fc_input_size=action_dim,
        **variant['qf_kwargs'],
        **conv_args,
    )
    target_qf1 = FlattenCNN(
        output_size=1,
        added_fc_input_size=action_dim,
        **variant['qf_kwargs'],
        **conv_args,
    )
    target_qf2 = FlattenCNN(
        output_size=1,
        added_fc_input_size=action_dim,
        **variant['qf_kwargs'],
        **conv_args,
    )
    policy = TanhCNNPolicy(
        output_size=action_dim,
        **variant['policy_kwargs'],
        **conv_args,
    )
    target_policy = TanhCNNPolicy(
        output_size=action_dim,
        **variant['policy_kwargs'],
        **conv_args,
    )
    if variant['noise'] == "eps":
        es = GaussianAndEpislonStrategy(
            action_space=expl_env.action_space,
            epsilon=0.3,
            max_sigma=0.0,
            min_sigma=0.0,  # constant sigma 0
            decay_period=1000000,
        )
    elif variant['noise'] == "gaussian":
        es = GaussianStrategy(
            action_space=expl_env.action_space,
            max_sigma=0.3,
            min_sigma=0.1,
            decay_period=1000000,
        )
    else:
        # Fix: was `print(...); assert False` — asserts are stripped under
        # `python -O`, so validation must raise instead.
        raise ValueError("unsupported param for --noise")
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train HER-TD3 on the state-based pick-and-lift task."""
    expl_env = gym.make(
        'pick_and_lift-state-v0',
        sparse=True,
        not_special_p=0.5,
        ground_p=0,
        special_is_grip=True,
        img_size=256,
        force_randomly_place=False,
        force_change_position=False,
    )
    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    achieved_goal_key = "achieved_goal"
    replay_buffer = ObsDictRelabelingBuffer(
        env=expl_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'],
    )
    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    action_dim = expl_env.action_space.low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size

    def make_critic():
        # Critic over the concatenated (obs, action, goal) input.
        return FlattenMlp(
            input_size=obs_dim + action_dim + goal_dim,
            output_size=1,
            **variant['qf_kwargs'],
        )

    def make_actor():
        return TanhMlpPolicy(
            input_size=obs_dim + goal_dim,
            output_size=action_dim,
            **variant['policy_kwargs'],
        )

    qf1, qf2 = make_critic(), make_critic()
    target_qf1, target_qf2 = make_critic(), make_critic()
    policy = make_actor()
    target_policy = make_actor()

    strategy = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.3,
        min_sigma=0.1,
        decay_period=1000000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['td3_trainer_kwargs'],
    )
    # HER relabels goals around the base TD3 trainer.
    trainer = HERTrainer(trainer)
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        exploration_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    """Train HER-TD3 on a flat multitask env, with optional history stacking."""
    env = variant['env_class'](**variant['env_kwargs'])
    if 'history_len' in variant:
        env = MultiTaskHistoryEnv(env, history_len=variant['history_len'])
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        strategy = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        strategy = GaussianStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'epsilon':
        strategy = EpsilonGreedy(action_space=env.action_space, **variant['es_kwargs'])
    else:
        raise Exception("Invalid type: " + exploration_type)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_space.low.size

    # Critics see (obs, action, goal); the policy sees (obs, goal).
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs'],
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def prepare_trainer(algorithm, expl_env, obs_dim, action_dim, pretrained_policy_load, variant):
    """Build (expl_policy, eval_policy, trainer) for the requested algorithm.

    :param algorithm: "SAC" or "TD3"
    :param expl_env: exploration environment (spaces + SAC trainer env)
    :param obs_dim: flat observation dimension
    :param action_dim: flat action dimension
    :param pretrained_policy_load: optional snapshot path; when set for SAC,
        networks are restored from it instead of freshly constructed
    :param variant: dict with network and trainer kwargs
    :raises NotImplementedError: pretrained loading requested for TD3
    :raises ValueError: unknown ``algorithm`` name
    """
    print("Preparing for {} trainer.".format(algorithm))
    if algorithm == "SAC":
        if not pretrained_policy_load:
            M = variant['layer_size']
            qf1 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )
            qf2 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )
            target_qf1 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )
            target_qf2 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )
            policy = TanhGaussianPolicy(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_sizes=[M, M],
            )
        else:
            # Restore networks (and entropy-tuning state) from the snapshot.
            snapshot = torch.load(pretrained_policy_load)
            qf1 = snapshot['trainer/qf1']
            qf2 = snapshot['trainer/qf2']
            target_qf1 = snapshot['trainer/target_qf1']
            target_qf2 = snapshot['trainer/target_qf2']
            policy = snapshot['exploration/policy']
            if variant['trainer_kwargs']['use_automatic_entropy_tuning']:
                variant['trainer_kwargs']['log_alpha'] = snapshot['trainer/log_alpha']
                variant['trainer_kwargs']['alpha_optimizer'] = snapshot['trainer/alpha_optimizer']
            print("loaded the pretrained policy {}".format(pretrained_policy_load))
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        trainer = SACTrainer(
            env=expl_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            **variant['trainer_kwargs'],
        )
    elif algorithm == "TD3":
        if not pretrained_policy_load:
            qf1 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                **variant['qf_kwargs'],
            )
            qf2 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                **variant['qf_kwargs'],
            )
            target_qf1 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                **variant['qf_kwargs'],
            )
            target_qf2 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                **variant['qf_kwargs'],
            )
            policy = TanhMlpPolicy(
                input_size=obs_dim,
                output_size=action_dim,
                **variant['policy_kwargs'],
            )
            target_policy = TanhMlpPolicy(
                input_size=obs_dim,
                output_size=action_dim,
                **variant['policy_kwargs'],
            )
            es = GaussianStrategy(
                action_space=expl_env.action_space,
                max_sigma=0.1,
                min_sigma=0.1,  # Constant sigma
            )
            exploration_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=es,
                policy=policy,
            )
            expl_policy = exploration_policy
            eval_policy = policy
        else:
            # Fix: the original branch was `pass`, leaving every network
            # undefined and crashing later with UnboundLocalError at the
            # TD3Trainer call. Fail fast with a clear message instead.
            raise NotImplementedError(
                "Loading a pretrained policy is not supported for TD3."
            )
        trainer = TD3Trainer(
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            target_policy=target_policy,
            **variant['trainer_kwargs'],
        )
    else:
        # Fix: an unknown algorithm previously fell through to the return
        # statement and crashed with UnboundLocalError.
        raise ValueError("Unknown algorithm: {}".format(algorithm))
    return expl_policy, eval_policy, trainer
def experiment(variant):
    """Train distributional TD4 on vectorized copies of the named env."""
    # A dummy instance only supplies spaces and the replay-buffer spec.
    dummy_env = make_env(variant['env'])
    obs_dim = dummy_env.observation_space.low.size
    action_dim = dummy_env.action_space.low.size

    expl_env = VectorEnv([
        lambda: make_env(variant['env'])
        for _ in range(variant['expl_env_num'])
    ])
    expl_env.seed(variant["seed"])
    expl_env.action_space.seed(variant["seed"])
    eval_env = SubprocVectorEnv([
        lambda: make_env(variant['env'])
        for _ in range(variant['eval_env_num'])
    ])
    eval_env.seed(variant["seed"])

    M = variant['layer_size']
    num_quantiles = variant['num_quantiles']

    def make_zf():
        # Quantile critic over the concatenated (obs, action) input.
        return QuantileMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            num_quantiles=num_quantiles,
            hidden_sizes=[M, M],
        )

    def make_actor():
        return TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[M, M],
        )

    zf1, zf2 = make_zf(), make_zf()
    target_zf1, target_zf2 = make_zf(), make_zf()
    policy = make_actor()
    target_policy = make_actor()

    strategy = GaussianStrategy(
        action_space=dummy_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )

    # Fraction-proposal network, only needed for the FQF risk type.
    fp = target_fp = None
    if variant['trainer_kwargs'].get('risk_type') == 'fqf':
        fp = FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=num_quantiles,
            hidden_sizes=[M // 2, M // 2],
            output_activation=softmax,
        )
        target_fp = FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=num_quantiles,
            hidden_sizes=[M // 2, M // 2],
            output_activation=softmax,
        )

    eval_path_collector = VecMdpPathCollector(
        eval_env,
        policy,
    )
    expl_path_collector = VecMdpStepCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = TorchReplayBuffer(
        variant['replay_buffer_size'],
        dummy_env,
    )
    trainer = TD4Trainer(
        policy=policy,
        target_policy=target_policy,
        zf1=zf1,
        zf2=zf2,
        target_zf1=target_zf1,
        target_zf2=target_zf2,
        fp=fp,
        target_fp=target_fp,
        num_quantiles=num_quantiles,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchVecOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train TD3 on the flat-goal PointMass task.

    :param variant: dict with ``num_tasks``, ``reward_type``, network kwargs,
        ``replay_buffer_size``, and trainer/algorithm kwargs
    """
    base_expl_env = PointMassEnv(n=variant["num_tasks"], reward_type=variant["reward_type"])
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    base_eval_env = PointMassEnv(n=variant["num_tasks"], reward_type=variant["reward_type"])
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    print(expl_env.observation_space, expl_env.action_space)
    qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim, output_size=1, **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim, **variant['policy_kwargs'])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # Fix: every sibling experiment moves the networks to ptu.device before
    # training; this one skipped it, so GPU runs would train on CPU tensors.
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    """Train HER-TD3 with an obs-dict relabeling buffer."""
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    # NOTE: the buffer wraps the raw (un-normalized) env, as in the original.
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs'],
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        strategy = OUStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'gaussian':
        strategy = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        strategy = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=strategy,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs'],
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Vectorized TD3: parallel exploration envs, subprocess eval envs."""
    dummy_env = make_env(variant['env'])
    obs_dim = dummy_env.observation_space.low.size
    action_dim = dummy_env.action_space.low.size

    expl_env = VectorEnv([
        lambda: make_env(variant['env'])
        for _ in range(variant['expl_env_num'])
    ])
    expl_env.seed(variant["seed"])
    expl_env.action_space.seed(variant["seed"])
    eval_env = SubprocVectorEnv([
        lambda: make_env(variant['env'])
        for _ in range(variant['eval_env_num'])
    ])
    eval_env.seed(variant["seed"])

    hidden = [variant['layer_size'], variant['layer_size']]

    def _critic():
        # Q-network: flattened (obs, action) pair -> scalar value.
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=hidden,
        )

    def _actor():
        # Deterministic tanh-squashed policy.
        return TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=hidden,
        )

    # Same construction order as before, so parameter initialization under a
    # fixed torch seed is unchanged.
    qf1, qf2 = _critic(), _critic()
    target_qf1, target_qf2 = _critic(), _critic()
    policy, target_policy = _actor(), _actor()

    es = GaussianStrategy(
        action_space=dummy_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = VecMdpPathCollector(eval_env, policy)
    expl_path_collector = VecMdpStepCollector(expl_env, exploration_policy)
    replay_buffer = TorchReplayBuffer(variant['replay_buffer_size'], dummy_env)
    trainer = TD3Trainer(
        policy=policy,
        target_policy=target_policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchVecOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant, args):
    """TD3 launcher for the standard MuJoCo benchmarks.

    The environment is picked by substring match on ``args.env``.
    NOTE: ``gym.make(args.env)`` did not work in this setup, so the raw
    environment classes are instantiated directly.
    """
    # (substring, env class) pairs — checked in order, first match wins.
    candidates = (
        ('Ant', AntEnv),
        ('InvertedPendulum', InvertedPendulumEnv),
        ('HalfCheetah', HalfCheetahEnv),
        ('Hopper', HopperEnv),
        ('Reacher', ReacherEnv),
        ('Swimmer', SwimmerEnv),
        ('Walker2d', Walker2dEnv),
    )
    for tag, env_cls in candidates:
        if tag in args.env:
            expl_env = NormalizedBoxEnv(env_cls())
            eval_env = NormalizedBoxEnv(env_cls())
            break
    else:
        raise ValueError(args.env)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    def _critic():
        # Q-network over concatenated (obs, action).
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            **variant['qf_kwargs'],
        )

    def _actor():
        # Deterministic tanh policy.
        return TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs'],
        )

    # Construction order matches the original (qf1, qf2, targets, policies)
    # so weight init under a fixed seed is unchanged.
    qf1, qf2 = _critic(), _critic()
    target_qf1, target_qf2 = _critic(), _critic()
    policy, target_policy = _actor(), _actor()

    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """TD3 on the ``activesearchrl-v0`` gym environment.

    Fix: both ``obs_dim`` and ``action_dim`` are read from ``expl_env``;
    the original mixed ``expl_env`` and ``eval_env``. Harmless here since
    both wrap the same env id, but inconsistent with every sibling launcher.

    Args:
        variant: dict with ``qf_kwargs``, ``policy_kwargs``,
            ``replay_buffer_size``, ``trainer_kwargs``,
            ``algorithm_kwargs``.
    """
    expl_env = NormalizedBoxEnv(gym.make('activesearchrl-v0'))
    eval_env = NormalizedBoxEnv(gym.make('activesearchrl-v0'))
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                    **variant['qf_kwargs'])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim, output_size=1,
                           **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                           **variant['policy_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim, output_size=action_dim,
                                  **variant['policy_kwargs'])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def tdm_td3_experiment(variant):
    """Launch TDM-TD3 on a multiworld (or legacy multitask) goal env.

    Mutates ``variant['algo_kwargs']`` in place to thread env-derived
    settings (vectorized rewards, norm order, obs/goal keys, tau schedule,
    training env) into the TDM/TD3 kwargs before constructing ``TdmTd3``.
    """
    variant['env_kwargs'].update(variant['reward_params'])
    env = variant['env_class'](**variant['env_kwargs'])

    multiworld_env = variant.get('multiworld_env', True)
    if multiworld_env is not True:
        # Legacy multitask envs are adapted to the silent-multitask API.
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant["render"]:
        env.pause_on_goal = True
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, max_sigma=0.1,
                        **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    if multiworld_env is True:
        obs_dim = env.observation_space.spaces['observation'].low.size
        action_dim = env.action_space.low.size
        goal_dim = env.observation_space.spaces['desired_goal'].low.size
    else:
        # None lets TdmQf/TdmPolicy derive their own sizes from `env`.
        obs_dim = action_dim = goal_dim = None

    vectorized = 'vectorized' in env.reward_type
    variant['algo_kwargs']['tdm_kwargs']['vectorized'] = vectorized
    norm_order = env.norm_order
    variant['algo_kwargs']['tdm_kwargs']['norm_order'] = norm_order

    qf1 = TdmQf(
        env=env,
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        vectorized=vectorized,
        norm_order=norm_order,
        **variant['qf_kwargs'],
    )
    qf2 = TdmQf(
        env=env,
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        vectorized=vectorized,
        norm_order=norm_order,
        **variant['qf_kwargs'],
    )
    policy = TdmPolicy(
        env=env,
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        **variant['policy_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    # Pickle round-trip gives the relabeling buffer its own env copy.
    relabeling_env = pickle.loads(pickle.dumps(env))
    algo_kwargs = variant['algo_kwargs']
    if multiworld_env is True:
        observation_key = variant.get('observation_key', 'state_observation')
        desired_goal_key = variant.get('desired_goal_key',
                                       'state_desired_goal')
        achieved_goal_key = variant.get('achieved_goal_key',
                                        'state_achieved_goal')
        replay_buffer = ObsDictRelabelingBuffer(
            env=relabeling_env,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            achieved_goal_key=achieved_goal_key,
            vectorized=vectorized,
            **variant['replay_buffer_kwargs'],
        )
        algo_kwargs['tdm_kwargs']['observation_key'] = observation_key
        algo_kwargs['tdm_kwargs']['desired_goal_key'] = desired_goal_key
    else:
        replay_buffer = RelabelingReplayBuffer(
            env=relabeling_env,
            **variant['replay_buffer_kwargs'],
        )
    algo_kwargs['td3_kwargs']['training_env'] = env
    if 'tau_schedule_kwargs' in variant:
        tau_schedule = IntPiecewiseLinearSchedule(
            **variant['tau_schedule_kwargs'])
    else:
        tau_schedule = None
    algo_kwargs['tdm_kwargs']['epoch_max_tau_schedule'] = tau_schedule

    algorithm = TdmTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
def td3_experiment(variant):
    """Plain TD3 (no HER relabeling) on a dict-observation multiworld env.

    Imports are function-local so the launcher can be shipped to remote
    workers without importing mujoco/multiworld at module load time.
    """
    import gym
    import multiworld.envs.mujoco
    import multiworld.envs.pygame
    import rlkit.samplers.rollout_functions as rf
    import rlkit.torch.pytorch_util as ptu
    from rlkit.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.torch.grill.launcher import (
        get_state_experiment_video_save_function)
    from rlkit.torch.her.her_td3 import HerTd3
    from rlkit.torch.td3.td3 import TD3
    from rlkit.torch.networks import ConcatMlp, TanhMlpPolicy
    from rlkit.data_management.obs_dict_replay_buffer import (
        ObsDictReplayBuffer)
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from rlkit.samplers.data_collector.path_collector import (
        ObsDictPathCollector)

    if 'env_id' in variant:
        eval_env = gym.make(variant['env_id'])
        expl_env = gym.make(variant['env_id'])
    else:
        eval_env_kwargs = variant.get('eval_env_kwargs', variant['env_kwargs'])
        eval_env = variant['env_class'](**eval_env_kwargs)
        expl_env = variant['env_class'](**variant['env_kwargs'])

    # Required key; read (and validated) even though it is overwritten below.
    observation_key = variant['observation_key']
    if variant.get('normalize', False):
        raise NotImplementedError()

    replay_buffer = ObsDictReplayBuffer(
        env=eval_env,
        observation_key=observation_key,
        **variant['replay_buffer_kwargs'],
    )
    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=eval_env.action_space,
                        **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=eval_env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=eval_env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    # Networks are goal-conditioned: critics see (obs, action, goal), the
    # policy sees (obs, goal).
    qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                    output_size=1, **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                    output_size=1, **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim + goal_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                           output_size=1, **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                           output_size=1, **variant['qf_kwargs'])
    target_policy = TanhMlpPolicy(input_size=obs_dim + goal_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    trainer = TD3(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs'],
    )
    # The collectors condition only on the raw observation; the goal key is
    # kept for parity but currently unused.
    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    eval_path_collector = ObsDictPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
    )
    expl_path_collector = ObsDictPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def grill_her_td3_experiment(variant):
    """GRILL: HER+TD3 in the latent space of a pretrained VAE.

    Wraps the raw env in ``ImageEnv`` (84x84 renders) and ``VAEWrappedEnv``,
    trains HER-TD3 on latent observations/goals, then optionally dumps
    evaluation videos from env-goal and VAE-goal copies of the environment.
    """
    env = variant["env_class"](**variant['env_kwargs'])
    render = variant["render"]
    rdim = variant["rdim"]
    vae_path = variant["vae_paths"][str(rdim)]
    reward_params = variant.get("reward_params", dict())
    init_camera = variant.get("init_camera", None)
    # Fall back to the named "topview" camera when no init fn is supplied.
    camera_name = "topview" if init_camera is None else None
    env = ImageEnv(
        env,
        84,
        init_camera=init_camera,
        camera_name=camera_name,
        transpose=True,
        normalize=True,
    )
    env = VAEWrappedEnv(
        env,
        vae_path,
        decode_goals=render,
        render_goals=render,
        render_rollouts=render,
        reward_params=reward_params,
        **variant.get('vae_wrapped_env_kwargs', {}),
    )
    if variant['normalize']:
        env = NormalizedBoxEnv(env)

    exploration_type = variant['exploration_type']
    exploration_noise = variant.get('exploration_noise', 0.1)
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=exploration_noise,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    observation_key = variant.get('observation_key', 'latent_observation')
    desired_goal_key = variant.get('desired_goal_key', 'latent_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    # Networks see the latent observation concatenated with the latent goal.
    obs_dim = (env.observation_space.spaces[observation_key].low.size
               + env.observation_space.spaces[desired_goal_key].low.size)
    action_dim = env.action_space.low.size
    hidden_sizes = variant.get('hidden_sizes', [400, 300])
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    training_mode = variant.get("training_mode", "train")
    testing_mode = variant.get("testing_mode", "test")

    def _env_copy(mode):
        # Pickle round-trip deep-copies the wrapped env (incl. VAE state).
        env_copy = pickle.loads(pickle.dumps(env))
        env_copy.mode(mode)
        return env_copy

    testing_env = _env_copy(testing_mode)
    training_env = _env_copy(training_mode)
    relabeling_env = _env_copy(training_mode)
    relabeling_env.disable_render()
    video_vae_env = _env_copy("video_vae")
    video_goal_env = _env_copy("video_env")

    replay_buffer = ObsDictRelabelingBuffer(
        env=relabeling_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_kwargs'],
    )
    variant["algo_kwargs"]["replay_buffer"] = replay_buffer
    algorithm = HerTd3(
        testing_env,
        training_env=training_env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        render=render,
        render_during_eval=render,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs'],
    )
    if ptu.gpu_enabled():
        print("using GPU")
        algorithm.to(ptu.device)
        for e in [testing_env, training_env, video_vae_env, video_goal_env]:
            e.vae.to(ptu.device)

    algorithm.train()

    if variant.get("save_video", True):
        logdir = logger.get_snapshot_dir()
        policy.train(False)
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        dump_video(video_goal_env, policy,
                   osp.join(logdir, 'video_final_env.mp4'), rollout_function)
        dump_video(video_vae_env, policy,
                   osp.join(logdir, 'video_final_vae.mp4'), rollout_function)
def her_td3_experiment(variant):
    """HER+TD3 launcher driven entirely by ``variant``; optionally registers
    a per-epoch video-saving hook."""
    if 'env_id' in variant:
        env = gym.make(variant['env_id'])
    else:
        env = variant['env_class'](**variant['env_kwargs'])

    observation_key = variant['observation_key']
    desired_goal_key = variant['desired_goal_key']
    variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    if variant.get('normalize', False):
        raise NotImplementedError()

    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'],
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    # Goal-conditioned networks: critics see (obs, action, goal), the policy
    # sees (obs, goal).
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    if variant.get("save_video", False):
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        video_func = get_video_save_func(
            rollout_function,
            env,
            policy,
            variant,
        )
        algorithm.post_epoch_funcs.append(video_func)
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    """HER+TD3 launcher with function-local imports.

    Defers heavy imports (mujoco/multiworld) until the launcher actually
    runs, which keeps module import cheap for remote workers.
    """
    import gym
    import multiworld.envs.mujoco
    import multiworld.envs.pygame
    import rlkit.samplers.rollout_functions as rf
    import rlkit.torch.pytorch_util as ptu
    from rlkit.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.torch.grill.launcher import get_video_save_func
    from rlkit.torch.her.her_td3 import HerTd3
    from rlkit.torch.networks import ConcatMlp, TanhMlpPolicy
    from rlkit.data_management.obs_dict_replay_buffer import (
        ObsDictRelabelingBuffer)

    if 'env_id' in variant:
        env = gym.make(variant['env_id'])
    else:
        env = variant['env_class'](**variant['env_kwargs'])

    observation_key = variant['observation_key']
    desired_goal_key = variant['desired_goal_key']
    variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    if variant.get('normalize', False):
        raise NotImplementedError()

    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'],
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size

    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)

    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    if variant.get("save_video", False):
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=algorithm.max_path_length,
            observation_key=algorithm.observation_key,
            desired_goal_key=algorithm.desired_goal_key,
        )
        video_func = get_video_save_func(
            rollout_function,
            env,
            policy,
            variant,
        )
        algorithm.post_epoch_funcs.append(video_func)
    algorithm.to(ptu.device)
    algorithm.train()