def run_experiment(variant):
    env = normalize(GymEnv(variant['env_name']))

    # Replay buffer for off-policy training.
    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    # Q-function, value function, and policy share the same hidden layer sizes.
    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        save_full_state=False,
    )

    algorithm.train()
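A minimal usage sketch for this launcher follows. The variant keys match the ones read above; the environment name and numeric values are illustrative assumptions, not settings taken from the source.

# Hypothetical configuration; every value below is illustrative only.
example_variant = dict(
    env_name='Hopper-v1',        # assumed Gym environment id
    max_pool_size=int(1e6),      # replay buffer capacity
    max_path_length=1000,
    epoch_length=1000,
    n_epochs=1000,
    batch_size=128,
    n_train_repeat=1,
    layer_size=128,              # hidden layer width M
    K=4,                         # number of GMM mixture components
    lr=3e-4,
    scale_reward=1.0,
    discount=0.99,
    tau=0.01,
)
run_experiment(example_variant)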
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    # Add a small artificial per-step delay; rollouts are collected
    # asynchronously by the RemoteSampler below.
    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'],
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
def run_experiment(variant): if variant['env_name'] == 'humanoid-rllab': from rllab.envs.mujoco.humanoid_env import HumanoidEnv env = normalize(HumanoidEnv()) elif variant['env_name'] == 'swimmer-rllab': from rllab.envs.mujoco.swimmer_env import SwimmerEnv env = normalize(SwimmerEnv()) elif variant["env_name"] == "Point2D-v0": import sac.envs.point2d_env env = GymEnv(variant["env_name"]) else: env = normalize(GymEnv(variant['env_name'])) obs_space = env.spec.observation_space assert isinstance(obs_space, spaces.Box) low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)]) high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)]) aug_obs_space = spaces.Box(low=low, high=high) aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space) pool = SimpleReplayBuffer( env_spec=aug_env_spec, max_replay_buffer_size=variant['max_pool_size'], ) base_kwargs = dict(min_pool_size=variant['max_path_length'], epoch_length=variant['epoch_length'], n_epochs=variant['n_epochs'], max_path_length=variant['max_path_length'], batch_size=variant['batch_size'], n_train_repeat=variant['n_train_repeat'], eval_render=False, eval_n_episodes=1, eval_deterministic=True, sampler=SimpleSampler( max_path_length=variant["max_path_length"], min_pool_size=variant["max_path_length"], batch_size=variant["batch_size"])) M = variant['layer_size'] qf = NNQFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) vf = NNVFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) policy = GaussianPolicy( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], reg=0.001, ) # policy = GMMPolicy( # env_spec=aug_env_spec, # K=variant['K'], # hidden_layer_sizes=[M, M], # qf=qf, # reg=0.001, # ) discriminator = NNDiscriminatorFunction( env_spec=env.spec, hidden_layer_sizes=[M, M], num_skills=variant['num_skills'], ) algorithm = DIAYN(base_kwargs=base_kwargs, env=env, policy=policy, discriminator=discriminator, pool=pool, qf=qf, vf=vf, lr=variant['lr'], scale_entropy=variant['scale_entropy'], discount=variant['discount'], tau=variant['tau'], num_skills=variant['num_skills'], save_full_state=False, include_actions=variant['include_actions'], learn_p_z=variant['learn_p_z'], add_p_z=variant['add_p_z'], reparametrize=variant["reparametrize"]) algorithm.train()
def run_experiment(env, seed, scale_reward, scale_entropy, tsallisQ, num_of_train):
    tf.set_random_seed(seed)

    environmentName = env
    # environmentName = "LunarLanderContinuous-v2"
    print("Experiment: {}".format(environmentName))

    # Set up the PyBullet environment.
    # env = normalize(gym.make(environmentName))
    env = GymEnv(environmentName)

    # Set up the replay buffer.
    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1000000)

    # Set up the sampler.
    sampler_params = {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    sampler = SimpleSampler(**sampler_params)

    # Set up the value function networks.
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    # Set up the policy network.
    # initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=False,
        reg=1e-3,
    )
    # policy = GMMPolicy(
    #     env_spec=env.spec,
    #     K=1,
    #     hidden_layer_sizes=(M, M),
    #     reparameterize=False,
    #     qf=qf1,
    #     reg=1.0e-3,
    # )
    initial_exploration_policy = policy

    base_kwargs = {
        'epoch_length': 1000,
        'n_train_repeat': num_of_train,
        'n_initial_exploration_steps': 1000,
        'eval_render': False,
        'eval_n_episodes': 3,
        'eval_deterministic': True,
    }
    base_kwargs = dict(base_kwargs, sampler=sampler)

    # Define schedules for the reward and entropy scales.
    def incrementor(itr):
        return 0.5 + (0.8 - 0.5) * tf.minimum(itr / 500000., 1.0)

    def decrementor(itr):
        return 0.8 - (0.8 - 0.6) * tf.minimum(itr / 500000., 1.0)

    algorithm = TAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3.0e-4,
        scale_reward=scale_reward,    # CG: default 1.0, 0.5 for the lunar lander problem, 3.0 for the pendulum problem.
        scale_entropy=scale_entropy,  # CG: default 1.0, 0.8 for the lunar lander problem.
        discount=0.99,
        tau=0.01,
        reparameterize=False,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        tsallisQ=tsallisQ,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
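Unlike the variant-based launchers, this one is called with explicit arguments. A hypothetical invocation is sketched below; the reward and entropy scales follow the lunar-lander values mentioned in the inline comments above, while the seed, Tsallis index, and training-repeat count are assumptions.

# Illustrative call; argument values are assumptions, not tuned settings.
run_experiment(
    env='LunarLanderContinuous-v2',
    seed=0,
    scale_reward=0.5,     # see the scale_reward comment above
    scale_entropy=0.8,    # see the scale_entropy comment above
    tsallisQ=2.0,         # assumed Tsallis entropic index q
    num_of_train=1,       # passed through as n_train_repeat
)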
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)

    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),
    )

    algorithm.train()
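Relative to the plain SAC launcher at the top of this section, this behaviour-tracking variant reads several additional keys from its variant dict. The sketch below lists just those extra entries with placeholder values; the metric and prefix strings are assumptions, and get_logdir and args are expected to exist at module scope.

# Placeholder values for the extra keys consumed by the DIAYN_BD launcher;
# these would be merged into the variant dict passed to run_experiment.
extra_keys = dict(
    num_skills=50,
    scale_entropy=1.0,
    include_actions=False,
    learn_p_z=False,
    add_p_z=True,
    metric='euclidean',      # assumed behaviour-distance metric name
    prefix='HalfCheetah',    # logged as env_id
    eval_freq=10,            # behaviour evaluation frequency (in epochs)
)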