def run_experiment(variant):
    # low_level_policy = load_low_level_policy(
    #     policy_path='/home/rcorona/sac/data/humanoid-rllab/default-humanoid_base-00/itr_0.pkl')
    #     # policy_path=variant['low_level_policy_path'])

    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    # Collect the flat 'env_*' keys (except 'env_name') as constructor
    # kwargs for the environment class.
    env_args = {
        name.replace('env_', '', 1): value
        for name, value in variant.items()
        if name.startswith('env_') and name != 'env_name'
    }

    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in variant['env_name']:
        EnvClass = RLLAB_ENVS[variant['env_name']]
    else:
        raise NotImplementedError

    base_env = normalize(EnvClass(**env_args))
    env = base_env
    # env = HierarchyProxyEnv(wrapped_env=base_env,
    #                         low_level_policy=low_level_policy)

    # Replay buffer, sampler, and the shared base kwargs for the algorithm.
    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)

    # Twin Q-functions, value function, and uniform initial-exploration policy.
    M = variant['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    # High-level latent-space policy built from coupling-layer bijectors,
    # optionally preceded by an MLP observation preprocessor.
    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    observations_preprocessor = (
        MLPPreprocessor(env_spec=env.spec,
                        layer_sizes=preprocessing_hidden_sizes,
                        name='high_level_observations_preprocessor')
        if preprocessing_hidden_sizes is not None
        else None)

    policy_s_t_layers = variant['policy_s_t_layers']
    policy_s_t_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

    bijector_config = {
        "scale_regularization": 0.0,
        "num_coupling_layers": variant['policy_coupling_layers'],
        "translation_hidden_sizes": s_t_hidden_sizes,
        "scale_hidden_sizes": s_t_hidden_sizes,
    }

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config=bijector_config,
        q_function=qf1,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy")

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        initial_exploration_policy=initial_exploration_policy,
        save_full_state=False,
    )

    algorithm.train()
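
# --- Usage sketch (not part of the original script) ------------------------
# A hypothetical flat `variant` for the hierarchical run_experiment() above.
# Only the dictionary keys mirror what the function actually reads; every
# value below (environment name, layer sizes, learning rate, ...) is an
# illustrative assumption and would need to be adapted for a real run.
_example_variant_hierarchical = {
    'env_name': 'random-goal-ant',       # resolved via RANDOM_GOAL_ENVS['ant']
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'batch_size': 128,
    'epoch_length': 1000,
    'n_epochs': 1000,
    'n_train_repeat': 1,
    'layer_size': 128,
    'preprocessing_hidden_sizes': None,  # None skips the MLP preprocessor
    'policy_s_t_layers': 1,
    'policy_s_t_units': 128,
    'policy_coupling_layers': 2,
    'policy_fix_h_on_reset': False,
    'lr': 3e-4,
    'scale_reward': 1.0,
    'discount': 0.99,
    'tau': 0.01,
    'target_update_interval': 1,
    'action_prior': 'uniform',
}
# run_experiment(_example_variant_hierarchical)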
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
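
# --- Usage sketch (not part of the original script) ------------------------
# A hypothetical nested `variant` for the run_experiment() above, exercising
# the 'gaussian' policy branch. Only the key layout mirrors what the code
# reads; the domain/task names and all numeric values are assumptions chosen
# for illustration.
_example_variant_nested = {
    'domain': 'swimmer',                 # must be a key of ENVIRONMENTS
    'task': 'default',                   # must be a key of ENVIRONMENTS[domain]
    'env_params': {},
    'policy_params': {
        'type': 'gaussian',
        'reparameterize': True,
        'action_prior': 'uniform',
    },
    'value_fn_params': {'layer_size': 256},
    'algorithm_params': {
        'lr': 3e-4,
        'scale_reward': 1.0,
        'discount': 0.99,
        'tau': 0.005,
        'reparameterize': True,
        'target_update_interval': 1,
        'base_kwargs': {
            'epoch_length': 1000,
            'n_epochs': 1000,
            'n_train_repeat': 1,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
        },
    },
    'replay_buffer_params': {'max_replay_buffer_size': int(1e6)},
    'sampler_params': {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    },
}
# run_experiment(_example_variant_nested)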
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = variant['algorithm_params']['cost_type']

    # Register the MECS environment variants (MECS-v1 ... MECS-v30, plus
    # MECS-v61) with gym. All share a 5000-step episode limit. Most versions
    # follow the naming pattern env_V_sweep_vN:MEC_vN; the exceptions are
    # listed explicitly and take precedence over the generated entries.
    mecs_entry_points = {
        'MECS-v1': 'sac.envs.environment_V_sweep:MEC_v1',
        'MECS-v61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        'MECS-v7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        'MECS-v8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
    }
    for version in range(2, 31):
        mecs_entry_points.setdefault(
            'MECS-v{}'.format(version),
            'sac.envs.env_V_sweep_v{}:MEC_v{}'.format(version, version))
    for env_id, entry_point in mecs_entry_points.items():
        register(
            id=env_id,
            entry_point=entry_point,
            max_episode_steps=5000,
        )

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    # Select the policy class from policy_params['type'].
    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale'] * algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
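
# --- Usage sketch (not part of the original script) ------------------------
# Once the register() calls inside run_experiment() above have executed
# (which requires the sac.envs package to be importable), the MECS variants
# can also be instantiated directly through gym's registry. The helper name
# below is hypothetical.
import gym

def _make_mecs_env(version):
    # e.g. version=10 -> gym.make('MECS-v10'), which returns the environment
    # wrapped in the 5000-step time limit declared at registration time.
    return gym.make('MECS-v{}'.format(version))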
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    # variant['num_hidden'] overrides the preset layer size whenever it
    # differs from the 256 default.
    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']

    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            todropoutpi=(variant['dropoutpi'] < 1.0),
            dropoutpi=variant['dropoutpi'],
            batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    # A negative variant['reward_scale'] falls back to the preset
    # algorithm_params['scale_reward']; the same convention applies to
    # variant['lr'] below.
    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

    # Optionally export the trained policy and value-function weights as flat
    # CSV vectors.
    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path, np.concatenate(toexport, axis=None),
                   delimiter=',')

    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path, np.concatenate(toexport, axis=None),
                   delimiter=',')
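
# --- Usage sketch (not part of the original script) ------------------------
# The export above flattens every trainable variable of the selected scopes
# into one 1-D vector before writing it with np.savetxt. A minimal sketch of
# reading such a file back and re-splitting it, assuming the per-variable
# shapes (in the same collection order) are known from the training graph;
# the helper name is hypothetical.
import numpy as np

def _load_flat_params(path, shapes):
    flat = np.loadtxt(path, delimiter=',')
    params, offset = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        params.append(flat[offset:offset + size].reshape(shape))
        offset += size
    return params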