def run_experiment(param):
    random_arm_init = [-0.1, 0.1]
    lower_goal_range = [-0.1, -0.1, -0.1]
    upper_goal_range = [0.1, 0.1, 0.1]
    render = False
    reward_shaping = True
    horizon = 250
    env = normalize(
        CRLWrapper(
            # IKWrapper(
            SawyerReach(
                # playable params
                random_arm_init=random_arm_init,
                lower_goal_range=lower_goal_range,
                upper_goal_range=upper_goal_range,
                has_renderer=render,
                reward_shaping=reward_shaping,
                horizon=horizon,
                # constant params
                has_offscreen_renderer=False,
                use_camera_obs=False,
                use_object_obs=True,
                control_freq=100,
            )
            # )
        )
    )

    replay_buffer_params = {
        'max_replay_buffer_size': 1e6,
    }
    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 2e3,
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
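# Usage sketch (an assumption, not part of the original script): this
# standalone Sawyer-reach launcher never reads its `param` argument, so under
# that assumption it can be started with a placeholder value, e.g.
#
#     if __name__ == '__main__':
#         run_experiment(None)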
def run_experiment(variant):
    sub_level_policies_paths = []
    # args = parse_args()
    args = arg()
    domain = ENVIRONMENTS[args.domain][args.task]

    if args.domain == 'sawyer-reach':
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")

        random_arm_init = [-0.1, 0.1]
        lower_goal_range = [-0.1, -0.1, -0.1]
        upper_goal_range = [0.1, 0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    domain(
                        # playable params
                        random_arm_init=random_arm_init,
                        lower_goal_range=lower_goal_range,
                        upper_goal_range=upper_goal_range,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,
                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    else:
        raise ValueError("Domain not available")

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=2e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = variant['algorithm_params']['cost_type']

    # Register the MECS environment variants with gym. A few early versions use
    # irregular module names; v9 through v30 all follow a single pattern.
    mecs_entry_points = {
        'MECS-v1': 'sac.envs.environment_V_sweep:MEC_v1',
        'MECS-v2': 'sac.envs.env_V_sweep_v2:MEC_v2',
        'MECS-v3': 'sac.envs.env_V_sweep_v3:MEC_v3',
        'MECS-v4': 'sac.envs.env_V_sweep_v4:MEC_v4',
        'MECS-v5': 'sac.envs.env_V_sweep_v5:MEC_v5',
        'MECS-v6': 'sac.envs.env_V_sweep_v6:MEC_v6',
        'MECS-v61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        'MECS-v7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        'MECS-v8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
    }
    mecs_entry_points.update({
        'MECS-v{}'.format(i): 'sac.envs.env_V_sweep_v{}:MEC_v{}'.format(i, i)
        for i in range(9, 31)
    })
    for env_id, entry_point in mecs_entry_points.items():
        register(
            id=env_id,
            entry_point=entry_point,
            max_episode_steps=5000,
        )

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }
        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale'] * algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    # low_level_policy = load_low_level_policy(
    #     policy_path='/home/rcorona/sac/data/humanoid-rllab/default-humanoid_base-00/itr_0.pkl')  # variant['low_level_policy_path']

    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    env_args = {
        name.replace('env_', '', 1): value
        for name, value in variant.items()
        if name.startswith('env_') and name != 'env_name'
    }

    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in variant['env_name']:
        EnvClass = RLLAB_ENVS[variant['env_name']]
    else:
        raise NotImplementedError

    base_env = normalize(EnvClass(**env_args))
    env = base_env
    # env = HierarchyProxyEnv(wrapped_env=base_env,
    #                         low_level_policy=low_level_policy)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )
    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)

    M = variant['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    observations_preprocessor = (
        MLPPreprocessor(env_spec=env.spec,
                        layer_sizes=preprocessing_hidden_sizes,
                        name='high_level_observations_preprocessor')
        if preprocessing_hidden_sizes is not None
        else None)

    policy_s_t_layers = variant['policy_s_t_layers']
    policy_s_t_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

    bijector_config = {
        "scale_regularization": 0.0,
        "num_coupling_layers": variant['policy_coupling_layers'],
        "translation_hidden_sizes": s_t_hidden_sizes,
        "scale_hidden_sizes": s_t_hidden_sizes,
    }

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config=bijector_config,
        q_function=qf1,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy")

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf1=qf1,
        vf=vf,
        qf2=qf2,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        initial_exploration_policy=initial_exploration_policy,
        save_full_state=False,
    )

    algorithm.train()
def run_experiment(variant):
    # Note: this script selects the domain from command-line arguments rather
    # than from `variant`; `arg()` is the same argument helper used by the
    # other run_experiment scripts here, added so that `args` is defined.
    args = arg()

    domain = None
    goal_size = None
    sub_level_policies_paths = []

    if args.domain == 'ant-cross-maze':
        domain = CrossMazeAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'ant-random-goal':
        domain = RandomGoalAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'cheetah-hurdle':
        domain = HalfCheetahHurdleEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/hc/fwd/fwd.pkl")
        sub_level_policies_paths.append("primitive-policies/hc/jp-longz/jump.pkl")
    elif args.domain == 'pusher':
        domain = PusherEnv
        goal_size = 0
        sub_level_policies_paths.append("primitive-policies/pusher/bottom/bottom.pkl")
        sub_level_policies_paths.append("primitive-policies/pusher/left/left.pkl")
    else:
        raise ValueError("Domain not available")

    env = normalize(domain())  # CrossMazeAntEnv()

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )
    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=5e3,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }
        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
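# Example only: the dictionary below illustrates the exact keys that the
# variant-driven run_experiment above reads when policy_params['type'] is
# 'gaussian'. The concrete values (and the 'swimmer'/'default' keys) are
# illustrative assumptions, not taken from the project's real config files.
EXAMPLE_VARIANT = {
    'domain': 'swimmer',          # hypothetical key into ENVIRONMENTS
    'task': 'default',            # hypothetical task name
    'env_params': {},
    'policy_params': {
        'type': 'gaussian',
        'reparameterize': True,
        'action_prior': 'uniform',
    },
    'value_fn_params': {'layer_size': 128},
    'algorithm_params': {
        'base_kwargs': {
            'epoch_length': 1000,
            'n_epochs': 1e3,
            'n_train_repeat': 1,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
        },
        'lr': 3e-4,
        'scale_reward': 5,
        'discount': 0.99,
        'tau': 0.005,
        'reparameterize': True,
        'target_update_interval': 1,
    },
    'replay_buffer_params': {'max_replay_buffer_size': 1e6},
    'sampler_params': {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    },
}
# run_experiment(EXAMPLE_VARIANT)  # usage sketch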
def __init__(self,
             base_kwargs,
             env,
             policy,
             discriminator,
             qf,
             vf,
             pool,
             plotter=None,
             lr=3E-3,
             scale_entropy=1,
             discount=0.99,
             tau=0.01,
             num_skills=20,
             save_full_state=False,
             find_best_skill_interval=10,
             best_skill_n_rollouts=10,
             learn_p_z=False,
             include_actions=False,
             add_p_z=True,
             reparametrize=False):
    """
    Args:
        base_kwargs (dict): dictionary of base arguments that are directly
            passed to the base `RLAlgorithm` constructor.
        env (`rllab.Env`): rllab environment object.
        policy (`rllab.NNPolicy`): A policy function approximator.
        discriminator (`rllab.NNPolicy`): A discriminator for z.
        qf (`ValueFunction`): Q-function approximator.
        vf (`ValueFunction`): Soft value function approximator.
        pool (`PoolBase`): Replay buffer to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing the Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        scale_entropy (`float`): Scaling factor for entropy.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        num_skills (`int`): Number of skills/options to learn.
        save_full_state (`bool`): If True, save the full class in the
            snapshot. See `self.get_snapshot` for more information.
        find_best_skill_interval (`int`): How often to recompute the best
            skill.
        best_skill_n_rollouts (`int`): When finding the best skill, how many
            rollouts to do per skill.
        include_actions (`bool`): Whether to pass actions to the
            discriminator.
        add_p_z (`bool`): Whether to include log p(z) in the pseudo-reward.
    """
    Serializable.quick_init(self, locals())
    super(SAC, self).__init__(**base_kwargs)

    self._env = env
    self._policy = policy
    self._discriminator = discriminator
    self._include_actions = include_actions
    self._qf = qf
    self._vf = vf
    self._pool = pool
    self._plotter = plotter

    self._policy_lr = lr
    self._discriminator_lr = lr
    self._qf_lr = lr
    self._vf_lr = lr
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._num_skills = num_skills
    self._p_z = np.full(num_skills, 1.0 / num_skills)
    self._find_best_skill_interval = find_best_skill_interval
    self._best_skill_n_rollouts = best_skill_n_rollouts
    self._learn_p_z = learn_p_z
    self._save_full_state = save_full_state
    self._add_p_z = add_p_z

    self._Da = self._env.action_space.flat_dim
    self._Do = self._env.observation_space.flat_dim

    self._training_ops = list()

    super(DIAYN, self).__init__(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=UniformPolicy(env.spec),
        qf1=None,
        qf2=None,
        vf=vf,
        pool=pool,
        lr=lr,
        discount=discount,
        tau=tau,
        save_full_state=save_full_state,
        reparameterize=reparametrize)

    # self._init_placeholders()
    # self._init_actor_update()
    # self._init_critic_update()
    self._init_discriminator_update()
    # self._init_target_ops()

    self._sess.run(tf.global_variables_initializer())
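# Sketch (an assumption, not code from the original class): the uniform prior
# self._p_z built above is typically consumed DIAYN-style, i.e. a skill z is
# drawn once per rollout and its one-hot encoding is concatenated to the
# observation before it is fed to the policy and discriminator.
def sample_skill_and_augment(obs, p_z):
    """Hypothetical helper: draw z ~ p(z) and return (augmented obs, z)."""
    z = np.random.choice(len(p_z), p=p_z)
    one_hot = np.zeros(len(p_z))
    one_hot[z] = 1.0
    return np.concatenate([obs, one_hot]), z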
def run_experiment(variant):
    sub_level_policies_paths = []
    args = arg()

    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")

        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,
                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
        ep_length = 1500
    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 1500
    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False
        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.01, 0.01],
                        y_range=[-0.01, 0.01],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 3000
    else:
        raise ValueError("Domain not available")

    if args.demo:
        pool = DemoReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    else:
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )

    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']
    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)
    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']

    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])
    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(env_spec=env.spec,
                                hidden_layer_sizes=(M, M),
                                reparameterize=policy_params['reparameterize'],
                                todropoutpi=(variant['dropoutpi'] < 1.0),
                                dropoutpi=variant['dropoutpi'],
                                batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }
        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

    # Export trained policy weights as a single flat CSV, if requested.
    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')

    # Export value-function weights (qf1, qf2, vf) as a single flat CSV, if requested.
    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
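# Sketch (an assumption, not part of the original script): the CSVs written
# above contain one flat, concatenated parameter vector. Given the same graph
# and the same variable ordering, they could be restored roughly as follows;
# the helper name and signature are hypothetical.
def load_flat_weights(sess, flat_path, scope):
    """Reload a flat weight vector saved by np.savetxt into `scope`'s variables."""
    flat = np.loadtxt(os.path.expanduser(flat_path), delimiter=',')
    offset = 0
    for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope):
        shape = v.shape.as_list()
        size = int(np.prod(shape))
        sess.run(v.assign(flat[offset:offset + size].reshape(shape)))
        offset += size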