Example #1
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    constants.COST_TYPE = algorithm_params['cost_type']
    # Register every MECS environment variant with Gym. All variants share the
    # same episode horizon; only the entry point differs.
    mecs_entry_points = {
        'MECS-v1': 'sac.envs.environment_V_sweep:MEC_v1',
        'MECS-v2': 'sac.envs.env_V_sweep_v2:MEC_v2',
        'MECS-v3': 'sac.envs.env_V_sweep_v3:MEC_v3',
        'MECS-v4': 'sac.envs.env_V_sweep_v4:MEC_v4',
        'MECS-v5': 'sac.envs.env_V_sweep_v5:MEC_v5',
        'MECS-v6': 'sac.envs.env_V_sweep_v6:MEC_v6',
        'MECS-v61': 'sac.envs.env_V_sweep_v6_with_a:MEC_v6',
        'MECS-v7': 'sac.envs.env_V_sweep_v7_new:MEC_v7',
        'MECS-v8': 'sac.envs.env_V_sweep_v8_new:MEC_v8',
        'MECS-v9': 'sac.envs.env_V_sweep_v9:MEC_v9',
        'MECS-v10': 'sac.envs.env_V_sweep_v10:MEC_v10',
        'MECS-v11': 'sac.envs.env_V_sweep_v11:MEC_v11',
        'MECS-v12': 'sac.envs.env_V_sweep_v12:MEC_v12',
        'MECS-v13': 'sac.envs.env_V_sweep_v13:MEC_v13',
        'MECS-v14': 'sac.envs.env_V_sweep_v14:MEC_v14',
        'MECS-v15': 'sac.envs.env_V_sweep_v15:MEC_v15',
        'MECS-v16': 'sac.envs.env_V_sweep_v16:MEC_v16',
        'MECS-v17': 'sac.envs.env_V_sweep_v17:MEC_v17',
        'MECS-v18': 'sac.envs.env_V_sweep_v18:MEC_v18',
        'MECS-v19': 'sac.envs.env_V_sweep_v19:MEC_v19',
        'MECS-v20': 'sac.envs.env_V_sweep_v20:MEC_v20',
        'MECS-v21': 'sac.envs.env_V_sweep_v21:MEC_v21',
        'MECS-v22': 'sac.envs.env_V_sweep_v22:MEC_v22',
        'MECS-v23': 'sac.envs.env_V_sweep_v23:MEC_v23',
        'MECS-v24': 'sac.envs.env_V_sweep_v24:MEC_v24',
        'MECS-v25': 'sac.envs.env_V_sweep_v25:MEC_v25',
        'MECS-v26': 'sac.envs.env_V_sweep_v26:MEC_v26',
        'MECS-v27': 'sac.envs.env_V_sweep_v27:MEC_v27',
        'MECS-v28': 'sac.envs.env_V_sweep_v28:MEC_v28',
        'MECS-v29': 'sac.envs.env_V_sweep_v29:MEC_v29',
        'MECS-v30': 'sac.envs.env_V_sweep_v30:MEC_v30',
    }
    for env_id, entry_point in mecs_entry_points.items():
        register(
            id=env_id,
            entry_point=entry_point,
            max_episode_steps=5000,
        )

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
                env_spec=env.spec,
                hidden_layer_sizes=(M, M),
                reparameterize=policy_params['reparameterize'],
                reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale']*algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
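
A minimal sketch of the variant dictionary that the run_experiment above expects. It lists only the keys the function body actually reads; the domain/task names and all numeric values are hypothetical placeholders, not values from the original configuration.

# Hypothetical variant for the run_experiment above. Keys mirror the lookups
# in the function body; the concrete values are illustrative placeholders.
example_variant = {
    'domain': 'mecs',                      # looked up via ENVIRONMENTS[domain][task]
    'task': 'default',
    'env_params': {},
    'policy_params': {
        'type': 'gaussian',                # 'gaussian', 'lsp', or 'gmm'
        'reparameterize': True,
        'action_prior': 'uniform',
    },
    'value_fn_params': {'layer_size': 256},
    'replay_buffer_params': {'max_replay_buffer_size': int(1e6)},
    'sampler_params': {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    },
    'algorithm_params': {
        'cost_type': 0,                    # copied into constants.COST_TYPE
        'base_kwargs': {'n_epochs': 1000, 'epoch_length': 1000},
        'lr': 3e-4,
        'scale': 1.0,
        'scale_reward': 1.0,               # effective scaling is scale * scale_reward
        'discount': 0.99,
        'tau': 0.005,
        'reparameterize': True,
        'target_update_interval': 1,
    },
}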
Example #2
def run_experiment(param):  # note: 'param' is not used in this snippet
    random_arm_init = [-0.1, 0.1]
    lower_goal_range = [-0.1, -0.1, -0.1]
    upper_goal_range = [0.1, 0.1, 0.1]
    render = False
    reward_shaping = True
    horizon = 250
    env = normalize(
        CRLWrapper(
            # IKWrapper(
            SawyerReach(
                # playable params
                random_arm_init=random_arm_init,
                lower_goal_range=lower_goal_range,
                upper_goal_range=upper_goal_range,
                has_renderer=render,
                reward_shaping=reward_shaping,
                horizon=horizon,

                # constant params
                has_offscreen_renderer=False,
                use_camera_obs=False,
                use_object_obs=True,
                control_freq=100,
            )
            # )
        )
    )
    replay_buffer_params = {
        'max_replay_buffer_size': int(1e6),
    }

    sampler_params = {
        'max_path_length': horizon - 1,
        'min_pool_size': 1000,
        'batch_size': 256,
    }


    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(
        {
            'epoch_length': 1500,
            'n_train_repeat': 1,
            'n_initial_exploration_steps': 5000,
            'eval_render': False,
            'eval_n_episodes': 1,
            'eval_deterministic': True,
            'n_epochs': int(2e3)
        },
        sampler=sampler)

    M = 64
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec=env.spec,
        hidden_layer_sizes=(64, 64),
        reparameterize=True,
        reg=1e-3,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=3e-4,
        scale_reward=20,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #3
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
                env_spec=env.spec,
                hidden_layer_sizes=(M, M),
                reparameterize=policy_params['reparameterize'],
                reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get('preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
Example #4
def run_experiment(env, seed, scale_reward,
                   scale_entropy, tsallisQ, num_of_train):
    tf.set_random_seed(seed)

    environmentName = env
    # environmentName = "LunarLanderContinuous-v2"

    print("Experiment: {}".format(environmentName))

    # Set up the training environment.
    # env = normalize(gym.make(environmentName))
    env = GymEnv(environmentName)

    # Set up the replay buffer.
    pool = SimpleReplayBuffer(env_spec = env.spec, max_replay_buffer_size = 1000000)

    # Set up the sampler.
    sampler_params = {
        'max_path_length': 1000,
        'min_pool_size': 1000,
        'batch_size': 256,
    }
    sampler = SimpleSampler(**sampler_params)

    # Set up the value function networks.
    M = 128
    qf1 = NNQFunction(env_spec = env.spec, hidden_layer_sizes = (M, M), name = 'qf1')
    qf2 = NNQFunction(env_spec = env.spec, hidden_layer_sizes = (M, M), name = 'qf2')
    vf = NNVFunction(env_spec = env.spec, hidden_layer_sizes = (M, M))

    # Set up the policy network.
    # initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    policy = GaussianPolicy(
        env_spec = env.spec,
        hidden_layer_sizes = (M, M),
        reparameterize = False,
        reg = 1e-3,
    )
    # policy = GMMPolicy(
    #     env_spec=env.spec,
    #     K=1,
    #     hidden_layer_sizes=(M, M),
    #     reparameterize=False,
    #     qf=qf1,
    #     reg=1.0e-3,
    # )

    initial_exploration_policy = policy

    base_kwargs = {
        'epoch_length': 1000,
        'n_train_repeat': num_of_train,
        'n_initial_exploration_steps': 1000,
        'eval_render': False,
        'eval_n_episodes': 3,
        'eval_deterministic': True,
    }
    base_kwargs = dict(base_kwargs, sampler = sampler)

    # Reward/entropy scaling schedules; defined here but not used by the TAC call in this snippet.
    def incrementor(itr):
        return (0.5 + (0.8 - 0.5) * tf.minimum(itr / 500000., 1.0))

    def decrementor(itr):
        return (0.8 - (0.8 - 0.6) * tf.minimum(itr / 500000., 1.0))

    algorithm = TAC(
        base_kwargs = base_kwargs,
        env = env,
        policy = policy,
        initial_exploration_policy = initial_exploration_policy,
        pool = pool,
        qf1 = qf1,
        qf2 = qf2,
        vf = vf,
        lr = 3.0e-4,
        scale_reward = scale_reward,  # CG: default 1.0, 0.5 for the lunar lander problem, 3.0 for the pendulum problem.
        scale_entropy = scale_entropy,  # CG: default 1.0, 0.8 for the lunar lander problem.
        discount = 0.99,
        tau = 0.01,
        reparameterize = False,
        target_update_interval = 1,
        action_prior = 'uniform',
        save_full_state = False,
        tsallisQ = tsallisQ,
    )

    algorithm._sess.run(tf.global_variables_initializer())
    algorithm.train()
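
For completeness, a hypothetical invocation of the function above. The environment id and the 0.5/0.8 scaling values follow the comments in the snippet; tsallisQ and num_of_train are placeholder choices.

if __name__ == '__main__':
    # Hypothetical arguments: the environment id must be resolvable by GymEnv,
    # and 0.5 / 0.8 are the lunar-lander values mentioned in the comments above.
    run_experiment(env='LunarLanderContinuous-v2',
                   seed=0,
                   scale_reward=0.5,
                   scale_entropy=0.8,
                   tsallisQ=2.0,      # placeholder value for the Tsallis parameter
                   num_of_train=1)    # maps to n_train_repeat above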
Example #5
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment, as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers in the value networks, i.e. the V function and the Q functions.
        :param value_func_layer_size: the number of neurons in each hidden layer of the value networks.
        :param policy_func_layers_number: the number of hidden layers in the policy network.
        :param policy_func_layer_size: the number of neurons in each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: whether to use UCB for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the evaluation strategy; two strategies are available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])
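
Only the constructor is shown, so the enclosing class name is not visible in this snippet. The sketch below uses a hypothetical name, EnsembleTrainer, and illustrative hyperparameters purely to show how the documented parameters fit together.

# Hypothetical usage; 'EnsembleTrainer' stands in for the class whose
# __init__ is shown above, and every value is an illustrative placeholder.
trainer = EnsembleTrainer(
    environment_name='LunarLanderContinuous-v2',
    algorithm_name='tac',                  # 'sac', 'tac', or 'rac'
    lr=3e-4,
    scale_reward=1.0,
    scale_entropy=1.0,
    discount=0.99,
    tau=0.01,
    max_replay_buffer_size=int(1e6),
    sampler_params={'max_path_length': 1000,
                    'min_pool_size': 1000,
                    'batch_size': 256},
    value_func_layers_number=2,
    value_func_layer_size=128,
    policy_func_layers_number=2,
    policy_func_layer_size=128,
    base_ac_alg_params={'epoch_length': 1000,
                        'n_train_repeat': 1,
                        'n_initial_exploration_steps': 1000,
                        'eval_render': False,
                        'eval_n_episodes': 3,
                        'eval_deterministic': True},
    q_param_list=[1.0, 1.5, 2.0],          # one AC instance per q value
    use_ucb=False,
    evaluation_strategy='ensemble',        # or 'best-policy'
)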
Example #6
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment, as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers in the value networks, i.e. the V function and the Q functions.
        :param value_func_layer_size: the number of neurons in each hidden layer of the value networks.
        :param policy_func_layers_number: the number of hidden layers in the policy network.
        :param policy_func_layer_size: the number of neurons in each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: whether to use UCB for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the evaluation strategy; two strategies are available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialization of the AC instance is deferred here; all variables
            # are initialized together by the session created at the end of the
            # constructor.
            # algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

        # Set up the ensemble Q-function for action selection.
        self._Q_ensemble = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple([
                value_func_layer_size for _ in range(value_func_layers_number)
            ]),
            name='ensqf')

        # ========================================================================
        # Set up the training target for the ensemble Q-function for action selection.
        # ========================================================================
        # Create the observation placeholder.
        self._observations_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='obv_ens',
        )

        # Create the next observation placeholder.
        self._observations_ens_next_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='next_obv_ens',
        )

        # Create a list of next action placeholders.
        self._acts_next_phs = []
        for i in range(len(q_param_list)):
            act_ens_ph = tf.placeholder(
                tf.float32,
                shape=(None, self._env.spec.action_space.flat_dim),
                name=str(i) + '_next_act_ens',
            )
            self._acts_next_phs.append(act_ens_ph)

        # Create the observed action placeholder.
        self._obv_act_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name='act_obv_ens',
        )

        # Create the reward placeholder.
        self._rewards_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='rew_ens',
        )

        # Create the terminal placeholder.
        self._terminals_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='ter_ens',
        )

        # Determine the target Q-value for next step.
        self._q_ens_targets = []
        for act_next_ph in self._acts_next_phs:
            qt = self._Q_ensemble.get_output_for(
                self._observations_ens_next_ph, act_next_ph, reuse=True)
            self._q_ens_targets.append(qt)

        for i, q_t in enumerate(self._q_ens_targets):
            if i == 0:
                self._q_ens_next = q_t
            else:
                self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
                # self._q_ens_next = self._q_ens_next + q_t
        # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

        # Determine the Q-loss.
        self._q_train = self._Q_ensemble.get_output_for(
            self._observations_ens_ph, self._obv_act_ph, reuse=True)
        self._q_ens_loss = 0.5 * tf.reduce_mean(
            (self._q_train -
             tf.stop_gradient(self._scale_reward * self._rewards_ph +
                              (1 - self._terminals_ph) * self._discount *
                              self._q_ens_next))**2)

        # Determine the Q-training operator.
        self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
            loss=self._q_ens_loss,
            var_list=self._Q_ensemble.get_params_internal())

        # Set up the tensor flow session.
        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())
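
For reference, a NumPy paraphrase of the ensemble Q target and loss built in the constructor above. It is only a restatement of the TensorFlow graph to make it easier to read; the array names are hypothetical.

import numpy as np

# Paraphrase of the ensemble Q target and loss above. Each AC instance
# proposes a next action, the ensemble Q-function scores them, and the TD
# target uses the per-sample maximum over those scores.
def ensemble_q_loss(q_pred, rewards, terminals, q_next_per_instance,
                    scale_reward, discount):
    # q_next_per_instance: shape (num_instances, batch_size); the maximum
    # mirrors the tf.maximum chain over self._q_ens_targets.
    q_next = np.max(q_next_per_instance, axis=0)
    target = scale_reward * rewards + (1.0 - terminals) * discount * q_next
    # 0.5 * mean squared TD error, as in self._q_ens_loss; the target is
    # treated as a constant, i.e. tf.stop_gradient.
    return 0.5 * np.mean((q_pred - target) ** 2)
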
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    if variant['num_hidden'] != 256:
        M = variant['num_hidden']
    qf1 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf1',
                      batchnormvf=variant['batchnormvf'])
    qf2 = NNQFunction(env_spec=env.spec,
                      hidden_layer_sizes=(M, M),
                      name='qf2',
                      batchnormvf=variant['batchnormvf'])
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(M, M),
                     batchnormvf=variant['batchnormvf'],
                     dropoutvf_keep_prob=variant['dropoutvf'])

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(env_spec=env.spec,
                                hidden_layer_sizes=(M, M),
                                reparameterize=policy_params['reparameterize'],
                                todropoutpi=(variant['dropoutpi'] < 1.0),
                                dropoutpi=variant['dropoutpi'],
                                batchnormpi=variant['batchnormpi'])
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    if variant['reward_scale'] < 0:
        scale_rew = algorithm_params['scale_reward']
    else:
        scale_rew = variant['reward_scale']
    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'] if variant['lr'] < 0 else variant['lr'],
        scale_reward=scale_rew,
        discount=algorithm_params['discount'],
        tau=variant['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
        l1regpi=variant['l1regpi'],
        l2regpi=variant['l2regpi'],
        l1regvf=variant['l1regvf'],
        l2regvf=variant['l2regvf'],
        ent_coef=variant['ent_coef'],
        wclippi=variant['wclippi'],
        wclipvf=variant['wclipvf'],
        dropoutpi=variant['dropoutpi'],
        dropoutvf=variant['dropoutvf'],
        batchnormpi=variant['batchnormpi'],
        batchnormvf=variant['batchnormvf'])

    algorithm._sess.run(tf.global_variables_initializer())

    for v in tf.trainable_variables():
        print(v.name)

    algorithm.train()

    if variant['policypath'] != '':
        save_w_path = os.path.expanduser(variant['policypath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='gaussian_policy'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
    if variant['valuepath'] != '':
        save_w_path = os.path.expanduser(variant['valuepath'])
        toexport = []
        savesess = algorithm._sess
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf1'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='qf2'):
            toexport.append(savesess.run(v))
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='vf'):
            toexport.append(savesess.run(v))
        np.savetxt(save_w_path,
                   np.concatenate(toexport, axis=None),
                   delimiter=',')
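
As with the earlier examples, a sketch of the variant dictionary this extended run_experiment reads. Only keys that appear in the function body are listed, and all values are hypothetical placeholders.

# Hypothetical variant for the run_experiment above; every key listed is read
# somewhere in the function body, and the values are placeholders only.
example_variant = {
    'domain': 'swimmer',
    'task': 'default',
    'env_params': {},
    'policy_params': {'type': 'gaussian',
                      'reparameterize': True,
                      'action_prior': 'uniform'},
    'value_fn_params': {'layer_size': 256},
    'replay_buffer_params': {'max_replay_buffer_size': int(1e6)},
    'sampler_params': {'max_path_length': 1000,
                       'min_pool_size': 1000,
                       'batch_size': 256},
    'algorithm_params': {'base_kwargs': {'n_epochs': 1000, 'epoch_length': 1000},
                         'lr': 3e-4,
                         'scale_reward': 1.0,
                         'discount': 0.99,
                         'reparameterize': True,
                         'target_update_interval': 1},
    # Overrides: negative lr / reward_scale fall back to algorithm_params,
    # and num_hidden overrides layer_size only when it differs from 256.
    'num_hidden': 256,
    'lr': -1.0,
    'reward_scale': -1.0,
    'tau': 0.005,
    # Regularization / normalization knobs forwarded to SAC and the networks.
    'l1regpi': 0.0, 'l2regpi': 0.0, 'l1regvf': 0.0, 'l2regvf': 0.0,
    'ent_coef': 1.0, 'wclippi': 0.0, 'wclipvf': 0.0,
    'dropoutpi': 1.0,                      # keep probability; < 1.0 enables dropout
    'dropoutvf': 1.0,
    'batchnormpi': False, 'batchnormvf': False,
    # Empty strings disable the weight exports at the end of training.
    'policypath': '',
    'valuepath': '',
}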