Example #1
def run_experiment(param):
    random_arm_init = [-0.1, 0.1]
    lower_goal_range = [-0.1, -0.1, -0.1]
    upper_goal_range = [0.1, 0.1, 0.1]
    render = False
    reward_shaping = True
    horizon = 250
    env = normalize(
        CRLWrapper(
            # IKWrapper(
                SawyerReach(
                    # playable params
                    random_arm_init=random_arm_init,
                    lower_goal_range=lower_goal_range,
                    upper_goal_range=upper_goal_range,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,

                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100, )
            # )
        )
    )
    replay_buffer_params = {
        'max_replay_buffer_size': 1e6,
    }

    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }


    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': 2e3
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #2
def run_experiment(variant):
    sub_level_policies_paths = []
    # args = parse_args()
    args = arg()
    domain = ENVIRONMENTS[args.domain][args.task]
    if args.domain == 'sawyer-reach':
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        lower_goal_range = [-0.1, -0.1, -0.1]
        upper_goal_range = [0.1, 0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    domain(
                        # playable params
                        random_arm_init=random_arm_init,
                        lower_goal_range=lower_goal_range,
                        upper_goal_range=upper_goal_range,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
    else:
        raise ValueError("Domain not available")

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )
    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=2e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler)
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #3
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = variant['algorithm_params']['cost_type']
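    # Register the MEC V-sweep environment variants (MECS-v1 through MECS-v30,
    # plus MECS-v61) with Gym before the experiment environment is constructed.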
    register(
        id='MECS-v1',
        entry_point='sac.envs.environment_V_sweep:MEC_v1',
        max_episode_steps=5000,
    )

    register(
        id='MECS-v2',
        entry_point='sac.envs.env_V_sweep_v2:MEC_v2',
        max_episode_steps=5000,
    )

    register(
        id='MECS-v3',
        entry_point='sac.envs.env_V_sweep_v3:MEC_v3',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v4',
        entry_point='sac.envs.env_V_sweep_v4:MEC_v4',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v5',
        entry_point='sac.envs.env_V_sweep_v5:MEC_v5',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v6',
        entry_point='sac.envs.env_V_sweep_v6:MEC_v6',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v61',
        entry_point='sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v7',
        entry_point='sac.envs.env_V_sweep_v7_new:MEC_v7',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v8',
        entry_point='sac.envs.env_V_sweep_v8_new:MEC_v8',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v9',
        entry_point='sac.envs.env_V_sweep_v9:MEC_v9',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v10',
        entry_point='sac.envs.env_V_sweep_v10:MEC_v10',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v11',
        entry_point='sac.envs.env_V_sweep_v11:MEC_v11',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v12',
        entry_point='sac.envs.env_V_sweep_v12:MEC_v12',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v13',
        entry_point='sac.envs.env_V_sweep_v13:MEC_v13',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v14',
        entry_point='sac.envs.env_V_sweep_v14:MEC_v14',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v15',
        entry_point='sac.envs.env_V_sweep_v15:MEC_v15',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v16',
        entry_point='sac.envs.env_V_sweep_v16:MEC_v16',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v17',
        entry_point='sac.envs.env_V_sweep_v17:MEC_v17',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v18',
        entry_point='sac.envs.env_V_sweep_v18:MEC_v18',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v19',
        entry_point='sac.envs.env_V_sweep_v19:MEC_v19',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v20',
        entry_point='sac.envs.env_V_sweep_v20:MEC_v20',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v21',
        entry_point='sac.envs.env_V_sweep_v21:MEC_v21',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v22',
        entry_point='sac.envs.env_V_sweep_v22:MEC_v22',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v23',
        entry_point='sac.envs.env_V_sweep_v23:MEC_v23',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v24',
        entry_point='sac.envs.env_V_sweep_v24:MEC_v24',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v25',
        entry_point='sac.envs.env_V_sweep_v25:MEC_v25',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v26',
        entry_point='sac.envs.env_V_sweep_v26:MEC_v26',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v27',
        entry_point='sac.envs.env_V_sweep_v27:MEC_v27',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v28',
        entry_point='sac.envs.env_V_sweep_v28:MEC_v28',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v29',
        entry_point='sac.envs.env_V_sweep_v29:MEC_v29',
        max_episode_steps=5000,
    )
    register(
        id='MECS-v30',
        entry_point='sac.envs.env_V_sweep_v30:MEC_v30',
        max_episode_steps=5000,
    )

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
                env_spec=env.spec,
                hidden_layer_sizes=(M,M),
                reparameterize=policy_params['reparameterize'],
                reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale']*algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #4
def run_experiment(variant):
    #low_level_policy = load_low_level_policy(
    #    policy_path='/home/rcorona/sac/data/humanoid-rllab/default-humanoid_base-00/itr_0.pkl')#variant['low_level_policy_path'])

    env_name = variant['env_name']
    env_type = env_name.split('-')[-1]

    env_args = {
        name.replace('env_', '', 1): value
        for name, value in variant.items()
        if name.startswith('env_') and name != 'env_name'
    }
    if 'random-goal' in env_name:
        EnvClass = RANDOM_GOAL_ENVS[env_type]
    elif 'rllab' in variant['env_name']:
        EnvClass = RLLAB_ENVS[variant['env_name']]
    else:
        raise NotImplementedError

    base_env = normalize(EnvClass(**env_args))
    env = base_env
    #env = HierarchyProxyEnv(wrapped_env=base_env,
    #                        low_level_policy=low_level_policy)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = variant['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    preprocessing_hidden_sizes = variant.get('preprocessing_hidden_sizes')
    observations_preprocessor = (
        MLPPreprocessor(env_spec=env.spec,
                        layer_sizes=preprocessing_hidden_sizes,
                        name='high_level_observations_preprocessor')
        if preprocessing_hidden_sizes is not None else None)

    policy_s_t_layers = variant['policy_s_t_layers']
    policy_s_t_units = variant['policy_s_t_units']
    s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

    bijector_config = {
        "scale_regularization": 0.0,
        "num_coupling_layers": variant['policy_coupling_layers'],
        "translation_hidden_sizes": s_t_hidden_sizes,
        "scale_hidden_sizes": s_t_hidden_sizes,
    }

    policy = LatentSpacePolicy(
        env_spec=env.spec,
        mode="train",
        squash=False,
        bijector_config=bijector_config,
        q_function=qf1,
        fix_h_on_reset=variant.get('policy_fix_h_on_reset', False),
        observations_preprocessor=observations_preprocessor,
        name="high_level_policy")

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf1=qf1,
        vf=vf,
        qf2=qf2,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        target_update_interval=variant['target_update_interval'],
        action_prior=variant['action_prior'],
        initial_exploration_policy=initial_exploration_policy,
        save_full_state=False,
    )

    algorithm.train()
Example #5
def run_experiment(variant):
    domain = None
    goal_size = None
    sub_level_policies_paths = []
    args = arg()  # command-line arguments parsed with the repo's arg() helper, as in the other examples
    if args.domain == 'ant-cross-maze':
        domain = CrossMazeAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'ant-random-goal':
        domain = RandomGoalAntEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/ant/fwrd/fwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/bwrd/bwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/uwrd/uwrd.pkl")
        sub_level_policies_paths.append("primitive-policies/ant/dwrd/dwrd.pkl")
    elif args.domain == 'cheetah-hurdle':
        domain = HalfCheetahHurdleEnv
        goal_size = 2
        sub_level_policies_paths.append("primitive-policies/hc/fwd/fwd.pkl")
        sub_level_policies_paths.append(
            "primitive-policies/hc/jp-longz/jump.pkl")
    elif args.domain == 'pusher':
        domain = PusherEnv
        goal_size = 0
        sub_level_policies_paths.append(
            "primitive-policies/pusher/bottom/bottom.pkl")
        sub_level_policies_paths.append(
            "primitive-policies/pusher/left/left.pkl")
    else:
        raise ValueError("Domain not available")

    env = normalize(domain())

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
        seq_len=len(sub_level_policies_paths),
    )

    sampler = SimpleSampler(max_path_length=1000,
                            min_pool_size=1000,
                            batch_size=256)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=5e3,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #6
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
                env_spec=env.spec,
                hidden_layer_sizes=(M,M),
                reparameterize=policy_params['reparameterize'],
                reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #7
    def __init__(self,
                 base_kwargs,
                 env,
                 policy,
                 discriminator,
                 qf,
                 vf,
                 pool,
                 plotter=None,
                 lr=3E-3,
                 scale_entropy=1,
                 discount=0.99,
                 tau=0.01,
                 num_skills=20,
                 save_full_state=False,
                 find_best_skill_interval=10,
                 best_skill_n_rollouts=10,
                 learn_p_z=False,
                 include_actions=False,
                 add_p_z=True,
                 reparametrize=False):
        """
        Args:
            base_kwargs (dict): dictionary of base arguments that are directly
                passed to the base `RLAlgorithm` constructor.
            env (`rllab.Env`): rllab environment object.
            policy: (`rllab.NNPolicy`): A policy function approximator.
            discriminator: (`rllab.NNPolicy`): A discriminator for z.
            qf (`ValueFunction`): Q-function approximator.
            vf (`ValueFunction`): Soft value function approximator.
            pool (`PoolBase`): Replay buffer to add gathered samples to.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            lr (`float`): Learning rate used for the function approximators.
            scale_entropy (`float`): Scaling factor for entropy.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            num_skills (`int`): Number of skills/options to learn.
            save_full_state (`bool`): If True, save the full class in the
                snapshot. See `self.get_snapshot` for more information.
            find_best_skill_interval (`int`): How often to recompute the best
                skill.
            best_skill_n_rollouts (`int`): When finding the best skill, how
                many rollouts to do per skill.
            learn_p_z (`bool`): Whether to learn the skill distribution p(z)
                instead of keeping it fixed and uniform.
            include_actions (`bool`): Whether to pass actions to the
                discriminator.
            add_p_z (`bool`): Whether to include log p(z) in the pseudo-reward.
            reparametrize (`bool`): Whether to use the reparameterization trick
                for the policy update.
        """

        Serializable.quick_init(self, locals())
        super(SAC, self).__init__(**base_kwargs)

        self._env = env
        self._policy = policy
        self._discriminator = discriminator
        self._include_actions = include_actions
        self._qf = qf
        self._vf = vf
        self._pool = pool
        self._plotter = plotter

        self._policy_lr = lr
        self._discriminator_lr = lr
        self._qf_lr = lr
        self._vf_lr = lr
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._num_skills = num_skills
        self._p_z = np.full(num_skills, 1.0 / num_skills)
        self._find_best_skill_interval = find_best_skill_interval
        self._best_skill_n_rollouts = best_skill_n_rollouts
        self._learn_p_z = learn_p_z
        self._save_full_state = save_full_state
        self._add_p_z = add_p_z

        self._Da = self._env.action_space.flat_dim
        self._Do = self._env.observation_space.flat_dim

        self._training_ops = list()

        super(DIAYN, self).__init__(base_kwargs=base_kwargs,
                                    env=env,
                                    policy=policy,
                                    initial_exploration_policy=UniformPolicy(
                                        env.spec),
                                    qf1=None,
                                    qf2=None,
                                    vf=vf,
                                    pool=pool,
                                    lr=lr,
                                    discount=discount,
                                    tau=tau,
                                    save_full_state=save_full_state,
                                    reparameterize=reparametrize)

        # self._init_placeholders()
        # self._init_actor_update()
        # self._init_critic_update()
        self._init_discriminator_update()
        # self._init_target_ops()

        self._sess.run(tf.global_variables_initializer())
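
For orientation, here is a minimal wiring sketch for the constructor above. It assumes the class is named DIAYN (as the second super() call suggests), reuses the helper classes seen in the other examples, and uses illustrative hyperparameter values; the discriminator and sampler are taken as given, since their constructors do not appear in these snippets.

def build_diayn(env, discriminator, sampler):
    # Network width, matching the other examples in this file.
    M = 128
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    # As noted in the GMM branches above, reparameterize should be False
    # when using a GMMPolicy. K=4 is an illustrative choice.
    policy = GMMPolicy(env_spec=env.spec,
                       K=4,
                       hidden_layer_sizes=(M, M),
                       reparameterize=False,
                       qf=qf,
                       reg=1e-3)
    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=1e6)

    base_kwargs = dict(epoch_length=1000,
                       n_epochs=1000,
                       n_train_repeat=1,
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=sampler)

    # Remaining arguments are left at the defaults documented in the
    # signature above (lr=3e-3, tau=0.01, num_skills=20, ...).
    return DIAYN(base_kwargs=base_kwargs,
                 env=env,
                 policy=policy,
                 discriminator=discriminator,
                 qf=qf,
                 vf=vf,
                 pool=pool)

Training would then be started by calling train() on the returned object, as in the other examples.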
Example #8
def run_experiment(variant):
    sub_level_policies_paths = []
    args = arg()

    if args.domain == 'sawyer-reach':
        print("Composition Reach")
        goal_size = 0
        sub_level_policies_paths.append("ikx")
        sub_level_policies_paths.append("iky")
        sub_level_policies_paths.append("ikz")
        random_arm_init = [-0.1, 0.1]
        render = False
        reward_shaping = True
        horizon = 250
        env = normalize(
            CRLWrapper(
                IKWrapper(
                    SawyerReach(
                        # playable params
                        random_arm_init=random_arm_init,
                        has_renderer=render,
                        reward_shaping=reward_shaping,
                        horizon=horizon,

                        # constant params
                        has_offscreen_renderer=False,
                        use_camera_obs=False,
                        use_object_obs=True,
                        control_freq=100,
                    ))))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick':
        print("Composition Reach and Pick")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 1000
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,

                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 1500

    elif args.domain == 'sawyer-reach-pick-simple':
        print("Composition Reach and Pick Simple")
        goal_size = 3
        sub_level_policies_paths.append(
            "log/prim/pick/2019-08-14-18-18-17-370041-PDT/itr_2000.pkl")
        sub_level_policies_paths.append(
            "log/prim/reach/2019-08-20-15-52-39-191438-PDT/itr_2000.pkl")

        render = False

        random_arm_init = [-0.0001, 0.0001]
        reward_shaping = False
        horizon = 500
        env = normalize(
            CRLWrapper(
                SawyerReachPick(
                    # playable params
                    random_arm_init=random_arm_init,
                    has_renderer=render,
                    reward_shaping=reward_shaping,
                    horizon=horizon,
                    placement_initializer=UniformRandomSampler(
                        x_range=[-0.01, 0.01],
                        y_range=[-0.01, 0.01],
                        ensure_object_boundary_in_range=False,
                        z_rotation=None,
                    ),
                    # constant params
                    has_offscreen_renderer=False,
                    use_camera_obs=False,
                    use_object_obs=True,
                    control_freq=100,
                )))
        ep_length = 3000
    else:
        raise ValueError("Domain not available")

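    # When demonstrations are enabled, use the demo-aware replay buffer;
    # otherwise fall back to the standard one.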
    if args.demo:
        pool = DemoReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )
    else:
        pool = SimpleReplayBuffer(
            env_spec=env.spec,
            max_replay_buffer_size=1e6,
            seq_len=len(sub_level_policies_paths),
        )

    sampler = SimpleSampler(
        max_path_length=horizon - 1,  # should be same as horizon
        min_pool_size=1000,
        batch_size=256)

    base_kwargs = dict(
        epoch_length=ep_length,
        n_epochs=5e3,
        # n_epochs=5,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=sampler,
        use_demos=args.demo,
    )
    M = 128
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)
    policy = GaussianPtrPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        g=goal_size,
        policy=policy,
        sub_level_policies_paths=sub_level_policies_paths,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=5,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )
    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
Example #9
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']
    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(env_spec=env.spec,
                                hidden_layer_sizes=(M, M),
                                reparameterize=policy_params['reparameterize'],
                                todropoutpi=(variant['dropoutpi'] < 1.0),
                                dropoutpi=variant['dropoutpi'],
                                batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']
    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

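    # Optionally export the trained policy and value-function weights
    # as flat CSV files.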
    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')