def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = algorithm_params['cost_type']

    # Register all MECS environment variants. Most follow the regular
    # 'sac.envs.env_V_sweep_vN:MEC_vN' naming pattern; the four irregular
    # entries are listed explicitly.
    mecs_entry_points = {
        'MECS-v1': 'sac.envs.environment_V_sweep:MEC_v1',
        'MECS-v61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        'MECS-v7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        'MECS-v8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
    }
    for v in list(range(2, 7)) + list(range(9, 31)):
        mecs_entry_points['MECS-v{}'.format(v)] = \
            'sac.envs.env_V_sweep_v{0}:MEC_v{0}'.format(v)
    for env_id, entry_point in mecs_entry_points.items():
        register(id=env_id, entry_point=entry_point, max_episode_steps=5000)

    env = normalize(ENVIRONMENTS[domain][task](**env_params))
    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)
    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    # Function approximators: twin Q-functions, a V-function, and the policy.
    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }
        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale'] * algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
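# For reference, a sketch of the `variant` dictionary this runner expects.
# The keys mirror the accesses above; every concrete value below is an
# illustrative assumption, not a tuned configuration from the experiments.
EXAMPLE_VARIANT = {
    'domain': 'mecs',            # assumed key into ENVIRONMENTS
    'task': 'default',           # assumed key into ENVIRONMENTS[domain]
    'env_params': {},
    'policy_params': {
        'type': 'gaussian',
        'reparameterize': True,
        'action_prior': 'uniform',
    },
    'value_fn_params': {'layer_size': 256},
    'algorithm_params': {
        'base_kwargs': {
            'epoch_length': 1000,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 1000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 1000,
        },
        'cost_type': 1,          # assumed value for constants.COST_TYPE
        'lr': 3e-4,
        'scale': 1.0,
        'scale_reward': 1.0,
        'discount': 0.99,
        'tau': 0.005,
        'reparameterize': True,
        'target_update_interval': 1,
    },
    'replay_buffer_params': {'max_replay_buffer_size': 1e6},
    'sampler_params': {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    },
}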
def run_experiment(param):
    # `param` is accepted for interface compatibility but unused here;
    # all settings below are hard-coded.
    random_arm_init = [-0.1, 0.1]
    lower_goal_range = [-0.1, -0.1, -0.1]
    upper_goal_range = [0.1, 0.1, 0.1]
    render = False
    reward_shaping = True
    horizon = 250

    env = normalize(
        CRLWrapper(
            # IKWrapper(
            SawyerReach(
                # playable params
                random_arm_init=random_arm_init,
                lower_goal_range=lower_goal_range,
                upper_goal_range=upper_goal_range,
                has_renderer=render,
                reward_shaping=reward_shaping,
                horizon=horizon,

                # constant params
                has_offscreen_renderer=False,
                use_camera_obs=False,
                use_object_obs=True,
                control_freq=100,
            )
            # )
        )
    )

    replay_buffer_params = {
        'max_replay_buffer_size': 1e6,
    }
    # Keep sampled paths strictly inside the wrapper's horizon.
    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 2e3,
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
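# Usage sketch: since this runner ignores its argument and hard-codes all
# settings, launching it is a single call (hypothetical entry point):
# run_experiment(param=None)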
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))
    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)
    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }
        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(env, seed, scale_reward, scale_entropy, tsallisQ, num_of_train):
    tf.set_random_seed(seed)

    environmentName = env
    # environmentName = "LunarLanderContinuous-v2"
    print("Experiment: {}".format(environmentName))

    # Set up the environment.
    # env = normalize(gym.make(environmentName))
    env = GymEnv(environmentName)

    # Set up the replay buffer.
    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1000000)

    # Set up the sampler.
    sampler_params = {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    sampler = SimpleSampler(**sampler_params)

    # Set up the value function networks.
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    # Set up the policy network.
    # initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=False,
        reg=1e-3,
    )
    # policy = GMMPolicy(
    #     env_spec=env.spec,
    #     K=1,
    #     hidden_layer_sizes=(M, M),
    #     reparameterize=False,
    #     qf=qf1,
    #     reg=1.0e-3,
    # )
    initial_exploration_policy = policy

    base_kwargs = {
        'epoch_length': 1000,
        'n_train_repeat': num_of_train,
        'n_initial_exploration_steps': 1000,
        'eval_render': False,
        'eval_n_episodes': 3,
        'eval_deterministic': True,
    }
    base_kwargs = dict(base_kwargs, sampler=sampler)

    # Optional schedules for reward/entropy scaling. Neither function is
    # wired into the algorithm below; they are left here as hooks.
    def incrementor(itr):
        return 0.5 + (0.8 - 0.5) * tf.minimum(itr / 500000., 1.0)

    def decrementor(itr):
        return 0.8 - (0.8 - 0.6) * tf.minimum(itr / 500000., 1.0)

    algorithm = TAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3.0e-4,
        scale_reward=scale_reward,    # CG: default 1.0; 0.5 for the lunar lander problem, 3.0 for the pendulum problem.
        scale_entropy=scale_entropy,  # CG: default 1.0; 0.8 for the lunar lander problem.
        discount=0.99,
        tau=0.01,
        reparameterize=False,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        tsallisQ=tsallisQ,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
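# Usage sketch for the TAC runner above; the environment id and every
# hyperparameter value are illustrative assumptions, not tuned settings.
# run_experiment(env='LunarLanderContinuous-v2', seed=0, scale_reward=0.5,
#                scale_entropy=0.8, tsallisQ=2.0, num_of_train=1)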
def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
):
    """
    CG: the constructor.

    :param environment_name: the name of the environment, as a string.
    :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
    :param lr: the learning rate to be used in the ensemble.
    :param scale_reward: the reward scaling factor.
    :param scale_entropy: the entropy scaling factor.
    :param discount: the reward discount factor.
    :param tau: the target value function update factor.
    :param max_replay_buffer_size: the maximum size of the replay buffer.
    :param sampler_params: extra parameter settings for the random sampler.
    :param value_func_layers_number: the number of hidden layers for the value networks, i.e. the V function and Q functions.
    :param value_func_layer_size: the number of neurons in each hidden layer of the value networks.
    :param policy_func_layers_number: the number of hidden layers for the policy network.
    :param policy_func_layer_size: the number of neurons in each hidden layer of the policy network.
    :param base_ac_alg_params: base parameters for the AC algorithm.
    :param q_param_list: the list of q values for the ensemble; each q value in the list corresponds to one AC instance in the ensemble.
    :param use_ucb: whether to use UCB for selecting AC instances in the ensemble for exploration.
    :param evaluation_strategy: the strategy used for evaluation. Two strategies are available: 'ensemble' and 'best-policy'.
    """
    # Set up the environment.
    self._environment_name = environment_name
    self._env = GymEnv(self._environment_name)

    # Set up the algorithm parameters.
    self._algorithm_name = algorithm_name
    self._lr = lr
    self._scale_reward = scale_reward
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._use_ucb = use_ucb
    self._evaluation_strategy = evaluation_strategy

    # Set up the replay buffer.
    self._max_replay_buffer_size = max_replay_buffer_size
    self._pool = SimpleReplayBuffer(
        env_spec=self._env.spec,
        max_replay_buffer_size=self._max_replay_buffer_size)

    # Set up the environment sampler.
    self._sampler_params = sampler_params
    self._sampler = SimpleSampler(**self._sampler_params)

    # Set up the required number of AC instances in the ensemble.
    # Each AC instance has its own value networks and policy network.
    self._alg_instances = []
    self._base_ac_params = base_ac_alg_params
    self._base_alg_params = dict(self._base_ac_params, sampler=self._sampler)

    value_hidden_sizes = tuple(
        value_func_layer_size for _ in range(value_func_layers_number))
    policy_hidden_sizes = tuple(
        policy_func_layer_size for _ in range(policy_func_layers_number))

    for id, q_val in enumerate(q_param_list):
        # Set up the value function networks for an AC instance.
        qf1 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=value_hidden_sizes,
                          name=str(id) + 'qf1')
        qf2 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=value_hidden_sizes,
                          name=str(id) + 'qf2')
        vf = NNVFunction(env_spec=self._env.spec,
                         hidden_layer_sizes=value_hidden_sizes,
                         name=str(id) + 'vf')

        # Set up the policy network for an AC instance.
        policy = GaussianPolicy(
            env_spec=self._env.spec,
            hidden_layer_sizes=policy_hidden_sizes,
            squash=True,
            reparameterize=False,
            reg=1.e-3,
            name=str(id) + 'gaussian_policy')
        initial_exploration_policy = policy

        # Set up an AC instance.
        if self._algorithm_name == 'sac':
            algorithm = SACV1(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
            )
        elif self._algorithm_name == 'tac':
            algorithm = TAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                tsallisQ=q_val,
            )
        elif self._algorithm_name == 'rac':
            algorithm = RAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                renyiQ=q_val,
            )
        else:
            raise NotImplementedError

        # Initialize the AC instance.
        algorithm._sess.run(tf.global_variables_initializer())

        # Each entry of the instance list holds: the algorithm instance,
        # its moving-average performance, the number of times it has been
        # selected for exploration, and its UCB bound.
        self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])
def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
):
    """
    CG: the constructor.

    :param environment_name: the name of the environment, as a string.
    :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
    :param lr: the learning rate to be used in the ensemble.
    :param scale_reward: the reward scaling factor.
    :param scale_entropy: the entropy scaling factor.
    :param discount: the reward discount factor.
    :param tau: the target value function update factor.
    :param max_replay_buffer_size: the maximum size of the replay buffer.
    :param sampler_params: extra parameter settings for the random sampler.
    :param value_func_layers_number: the number of hidden layers for the value networks, i.e. the V function and Q functions.
    :param value_func_layer_size: the number of neurons in each hidden layer of the value networks.
    :param policy_func_layers_number: the number of hidden layers for the policy network.
    :param policy_func_layer_size: the number of neurons in each hidden layer of the policy network.
    :param base_ac_alg_params: base parameters for the AC algorithm.
    :param q_param_list: the list of q values for the ensemble; each q value in the list corresponds to one AC instance in the ensemble.
    :param use_ucb: whether to use UCB for selecting AC instances in the ensemble for exploration.
    :param evaluation_strategy: the strategy used for evaluation. Two strategies are available: 'ensemble' and 'best-policy'.
    """
    # Set up the environment.
    self._environment_name = environment_name
    self._env = GymEnv(self._environment_name)

    # Set up the algorithm parameters.
    self._algorithm_name = algorithm_name
    self._lr = lr
    self._scale_reward = scale_reward
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._use_ucb = use_ucb
    self._evaluation_strategy = evaluation_strategy

    # Set up the replay buffer.
    self._max_replay_buffer_size = max_replay_buffer_size
    self._pool = SimpleReplayBuffer(
        env_spec=self._env.spec,
        max_replay_buffer_size=self._max_replay_buffer_size)

    # Set up the environment sampler.
    self._sampler_params = sampler_params
    self._sampler = SimpleSampler(**self._sampler_params)

    # Set up the required number of AC instances in the ensemble.
    # Each AC instance has its own value networks and policy network.
    self._alg_instances = []
    self._base_ac_params = base_ac_alg_params
    self._base_alg_params = dict(self._base_ac_params, sampler=self._sampler)

    value_hidden_sizes = tuple(
        value_func_layer_size for _ in range(value_func_layers_number))
    policy_hidden_sizes = tuple(
        policy_func_layer_size for _ in range(policy_func_layers_number))

    for id, q_val in enumerate(q_param_list):
        # Set up the value function networks for an AC instance.
        qf1 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=value_hidden_sizes,
                          name=str(id) + 'qf1')
        qf2 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=value_hidden_sizes,
                          name=str(id) + 'qf2')
        vf = NNVFunction(env_spec=self._env.spec,
                         hidden_layer_sizes=value_hidden_sizes,
                         name=str(id) + 'vf')

        # Set up the policy network for an AC instance.
        policy = GaussianPolicy(
            env_spec=self._env.spec,
            hidden_layer_sizes=policy_hidden_sizes,
            squash=True,
            reparameterize=False,
            reg=1.e-3,
            name=str(id) + 'gaussian_policy')
        initial_exploration_policy = policy

        # Set up an AC instance.
        if self._algorithm_name == 'sac':
            algorithm = SACV1(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
            )
        elif self._algorithm_name == 'tac':
            algorithm = TAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                tsallisQ=q_val,
            )
        elif self._algorithm_name == 'rac':
            algorithm = RAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                renyiQ=q_val,
            )
        else:
            raise NotImplementedError

        # Initialization is deferred to the single global initializer call
        # at the end of this constructor.
        # algorithm._sess.run(tf.global_variables_initializer())

        # Each entry of the instance list holds: the algorithm instance,
        # its moving-average performance, the number of times it has been
        # selected for exploration, and its UCB bound.
        self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

    # Set up the ensemble Q-function for action selection.
    self._Q_ensemble = NNQFunction(
        env_spec=self._env.spec,
        hidden_layer_sizes=value_hidden_sizes,
        name='ensqf')

    # ========================================================================
    # Set up the training target for the ensemble Q-function for action
    # selection.
    # ========================================================================
    # Create the observation placeholder.
    self._observations_ens_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.observation_space.flat_dim),
        name='obv_ens',
    )
    # Create the next observation placeholder.
    self._observations_ens_next_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.observation_space.flat_dim),
        name='next_obv_ens',
    )
    # Create one next-action placeholder per ensemble member.
    self._acts_next_phs = []
    for i in range(len(q_param_list)):
        act_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name=str(i) + '_next_act_ens',
        )
        self._acts_next_phs.append(act_ens_ph)
    # Create the observed action placeholder.
    self._obv_act_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.action_space.flat_dim),
        name='act_obv_ens',
    )
    # Create the reward placeholder.
    self._rewards_ph = tf.placeholder(
        tf.float32,
        shape=(None,),
        name='rew_ens',
    )
    # Create the terminal placeholder.
    self._terminals_ph = tf.placeholder(
        tf.float32,
        shape=(None,),
        name='ter_ens',
    )

    # The next-step target is the maximum ensemble Q-value over the next
    # actions proposed by the individual AC instances.
    self._q_ens_targets = []
    for act_next_ph in self._acts_next_phs:
        qt = self._Q_ensemble.get_output_for(
            self._observations_ens_next_ph, act_next_ph, reuse=True)
        self._q_ens_targets.append(qt)

    for i, q_t in enumerate(self._q_ens_targets):
        if i == 0:
            self._q_ens_next = q_t
        else:
            self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
            # Averaging is an alternative to taking the maximum:
            # self._q_ens_next = self._q_ens_next + q_t
            # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

    # Determine the Q-loss: mean squared TD error against the bootstrapped
    # target.
    self._q_train = self._Q_ensemble.get_output_for(
        self._observations_ens_ph, self._obv_act_ph, reuse=True)
    self._q_ens_loss = 0.5 * tf.reduce_mean(
        (self._q_train - tf.stop_gradient(
            self._scale_reward * self._rewards_ph +
            (1 - self._terminals_ph) * self._discount * self._q_ens_next))**2)

    # Determine the Q-training operator.
    self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
        loss=self._q_ens_loss,
        var_list=self._Q_ensemble.get_params_internal())

    # Set up the TensorFlow session and initialize all variables at once.
    self._sess = tf_utils.get_default_session()
    self._sess.run(tf.global_variables_initializer())
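# A usage sketch for the ensemble constructor above. The enclosing class
# name (`ACEnsemble`) and every concrete value are assumptions made for
# illustration; only the parameter names come from the constructor itself.
# ensemble = ACEnsemble(
#     environment_name='HalfCheetah-v2',
#     algorithm_name='tac',            # one of 'sac', 'tac', 'rac'
#     lr=3e-4,
#     scale_reward=1.0,
#     scale_entropy=1.0,
#     discount=0.99,
#     tau=0.01,
#     max_replay_buffer_size=1000000,
#     sampler_params={'max_path_length': 1000,
#                     'min_pool_size': 1000,
#                     'batch_size': 256},
#     value_func_layers_number=2,
#     value_func_layer_size=128,
#     policy_func_layers_number=2,
#     policy_func_layer_size=128,
#     base_ac_alg_params={'epoch_length': 1000,
#                         'n_train_repeat': 1,
#                         'n_initial_exploration_steps': 1000,
#                         'eval_render': False,
#                         'eval_n_episodes': 1,
#                         'eval_deterministic': True},
#     q_param_list=[1.5, 2.0, 2.5],    # one AC instance per q value
#     use_ucb=True,
#     evaluation_strategy='ensemble',
# )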
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))
    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)
    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    # Hidden layer width: the variant overrides the default layer size.
    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']

    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            todropoutpi=(variant['dropoutpi'] < 1.0),
            dropoutpi=variant['dropoutpi'],
            batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }
        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    # Negative values act as "use the default from algorithm_params" sentinels.
    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())
    for v in tf.trainable_variables():
        print(v.name)
    algorithm.train()

    # Optionally export the trained weights as flat CSV vectors.
    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path, np.concatenate(toexport, axis=None),
                   delimiter=',')
    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path, np.concatenate(toexport, axis=None),
                   delimiter=',')
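# The exports above flatten every trainable variable into a single CSV
# vector (np.savetxt writes one value per line). A minimal sketch for
# reading such a file back; the path is hypothetical, and recovering the
# per-variable tensors would additionally require the variable shapes,
# which this format does not store:
# import os
# import numpy as np
# flat_w = np.loadtxt(os.path.expanduser('~/policy_weights.csv'), delimiter=',')
# print(flat_w.shape)  # (total number of exported parameters,)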