Example #1
    def __init__(self,
                 policy2,
                 baseline2,
                 obs1_dim,
                 obs2_dim,
                 action1_dim,
                 action2_dim,
                 optimizer_args=None,
                 optimizer2_args=None,
                 transfer=True,
                 record_rewards=True,
                 rewards=None,
                 N1=1,
                 N2=1,
                 **kwargs):
        self.transfer = transfer
        sampler_cls = RARLSampler
        sampler_args = dict()
        self.policy2 = policy2
        self.baseline2 = baseline2
        if optimizer_args is None:
            optimizer_args = dict()
        if optimizer2_args is None:
            optimizer2_args = dict()
        self.optimizer2 = ConjugateGradientOptimizer(**optimizer2_args)

        self.obs1_dim = obs1_dim
        self.obs2_dim = obs2_dim
        self.action1_dim = action1_dim
        self.action2_dim = action2_dim

        self.record_rewards = record_rewards
        if self.record_rewards:
            if rewards is None:  #create empty dict
                self.rewards = {}
                self.rewards['average_discounted_return1'] = []
                self.rewards['AverageReturn1'] = []
                self.rewards['StdReturn1'] = []
                self.rewards['MaxReturn1'] = []
                self.rewards['MinReturn1'] = []

                self.rewards['average_discounted_return2'] = []
                self.rewards['AverageReturn2'] = []
                self.rewards['StdReturn2'] = []
                self.rewards['MaxReturn2'] = []
                self.rewards['MinReturn2'] = []
            else:
                self.rewards = rewards

        self.N1 = N1
        self.N2 = N2
        super(RARL, self).__init__(sampler_cls=sampler_cls,
                                   sampler_args=sampler_args,
                                   optimizer_args=optimizer_args,
                                   **kwargs)
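
A construction sketch for the RARL constructor above; every name below (the environment, the two policies and baselines, and the dimension values) is a placeholder assumption rather than something defined in this example:

# Hypothetical usage; env, the protagonist/adversary objects and the *_dim values
# are assumed to come from the surrounding RARL training script.
algo = RARL(
    env=env,
    policy=protagonist_policy,         # optimized with the base TRPO optimizer
    baseline=protagonist_baseline,
    policy2=adversary_policy,          # optimized with self.optimizer2
    baseline2=adversary_baseline,
    obs1_dim=obs1_dim, obs2_dim=obs2_dim,
    action1_dim=action1_dim, action2_dim=action2_dim,
    optimizer_args=dict(),             # forwarded to the base class optimizer
    optimizer2_args=dict(),            # kwargs for the adversary's ConjugateGradientOptimizer
    N1=1, N2=1,                        # protagonist/adversary update rounds per iteration
    batch_size=4000, max_path_length=100, n_itr=100,
)
algo.train()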
Example #2
    def __init__(self,
                 transfer=True,
                 optimizer=None,
                 optimizer_args=None,
                 record_rewards=True,
                 rewards=None,
                 **kwargs):
        self.transfer = transfer
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = ConjugateGradientOptimizer(**optimizer_args)

        self.record_rewards = record_rewards
        if self.record_rewards:
            if rewards is None:  #create empty dict
                self.rewards = {}
                self.rewards['average_discounted_return'] = []
                self.rewards['AverageReturn'] = []
                self.rewards['StdReturn'] = []
                self.rewards['MaxReturn'] = []
                self.rewards['MinReturn'] = []
            else:
                self.rewards = rewards
        super(TRPO_t, self).__init__(optimizer=optimizer, **kwargs)
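
A resume-style sketch for TRPO_t: a previously saved snapshot can supply the rewards dict so logging continues where it left off. The snapshot file name and its keys are assumptions, not shown in this example:

import joblib

# Hypothetical: reload an rllab-style snapshot and keep appending to its reward history.
snapshot = joblib.load('itr_99.pkl')          # assumed to contain policy/baseline/rewards
algo = TRPO_t(env=snapshot['env'],
              policy=snapshot['policy'],
              baseline=snapshot['baseline'],
              transfer=True,                  # keep the existing tf variables
              rewards=snapshot['rewards'])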
Example #3
def get_baseline(env, value_function, num_slices):
    if (value_function == 'zero'):
        baseline = ZeroBaseline(env.spec)
    else:
        value_network = get_value_network(env)

        if (value_function == 'conj'):
            baseline_optimizer = ConjugateGradientOptimizer(
                subsample_factor=1.0, num_slices=num_slices)
        elif (value_function == 'adam'):
            baseline_optimizer = FirstOrderOptimizer(
                max_epochs=3,
                batch_size=512,
                num_slices=num_slices,
                ignore_last=True,
                #verbose=True
            )
        else:
            logger.log("Inappropirate value function")
            exit(0)

        baseline = DeterministicMLPBaseline(env.spec,
                                            num_slices=num_slices,
                                            regressor_args=dict(
                                                network=value_network,
                                                optimizer=baseline_optimizer,
                                                normalize_inputs=False))

    return baseline
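
A short call-site sketch for the helper above; env and the slice count are placeholder assumptions:

# Hypothetical: fit the value network with the conjugate-gradient optimizer,
# splitting Hessian-vector products over 4 slices.
baseline = get_baseline(env, value_function='conj', num_slices=4)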
Example #4
def generate_expert_dp():
    env = TfEnv(normalize(InvertedPendulumEnv()))
    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)),
        gae_lambda=0.97,
    )

    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)
Example #5
 def __init__(self, optimizer=None, optimizer_args=None, **kwargs):
     assert optimizer is None
     assert optimizer_args is None
     n_particles = len(kwargs['policy_list'])
     optimizer_list = []
     for n in range(n_particles):
         optimizer_list.append(ConjugateGradientOptimizer())
     super(BMAMLTRPO, self).__init__(optimizer_list=optimizer_list,
                                     **kwargs)
Example #6
def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
    import sandbox.rocky.tf.core.layers as L
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from railrl.envs.memory.high_low import HighLow
    from railrl.launchers.launcher_util import (
        set_seed, )
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1
    """
    Code for running the experiment.
    """

    # env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = HighLow(num_steps=H)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianLSTMPolicy(
        name="policy",
        env_spec=env.spec,
        lstm_layer_cls=L.LSTMLayer,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **trpo_params)

    algo.train()
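
A sketch of the variant dict this runner reads; the keys follow directly from the lookups above, while the concrete values are illustrative assumptions:

# Illustrative only: keys mirror the variant[...] accesses in run_linear_ocm_exp.
variant = dict(
    H=16,                                   # horizon passed to HighLow(num_steps=H)
    seed=0,
    num_values=1,                           # onehot_dim = num_values + 1
    optimizer_params=dict(base_eps=1e-5),   # FiniteDifferenceHvp kwargs
    trpo_params=dict(batch_size=4000, max_path_length=16, n_itr=100,
                     discount=0.99, step_size=0.01),
)
run_linear_ocm_exp(variant)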
Example #7
 def __init__(
         self,
         optimizer=None,
         optimizer_args=None,
         **kwargs):
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     super(TRPO, self).__init__(optimizer=optimizer, **kwargs)
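
The wrapper above accepts either a ready-made optimizer or optimizer_args for a default ConjugateGradientOptimizer. A minimal sketch of both call styles, assuming env, policy, and baseline already exist and that ConjugateGradientOptimizer/FiniteDifferenceHvp are imported as in the other examples; the keyword values are illustrative:

# Style 1: let the wrapper build the ConjugateGradientOptimizer from kwargs.
algo = TRPO(env=env, policy=policy, baseline=baseline,
            optimizer_args=dict(cg_iters=10, reg_coeff=1e-5))

# Style 2: pass a pre-built optimizer, e.g. with a finite-difference HVP.
algo = TRPO(env=env, policy=policy, baseline=baseline,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))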
Example #8
def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from rlkit.envs.flattened_product_box import FlattenedProductBox
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from rlkit.launchers.launcher_util import (
        set_seed, )
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1
    """
    Code for running the experiment.
    """

    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **trpo_params)

    algo.train()
Example #9
 def __init__(self,
              optimizer=None,
              optimizer_args=None,
              step_size=0.01,
              **kwargs):
     if optimizer is None:
         if optimizer_args is None:
             optimizer_args = dict()
         optimizer = ConjugateGradientOptimizer(**optimizer_args)
     self.optimizer = optimizer
     self.step_size = step_size
     super(RCTRPO, self).__init__(**kwargs)
Example #10
    def __init__(
            self,
            transfer=True,
            optimizer=None,
            optimizer_args=None,
            record_env=True,
            **kwargs):
        self.transfer = transfer
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = ConjugateGradientOptimizer(**optimizer_args)

        self.record_env = record_env
        super(TRPO_t, self).__init__(optimizer=optimizer,
                                     sampler_cls=QMDPSampler,
                                     sampler_args=dict(),
                                     **kwargs)
Example #11
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv("walled_chain",
                                   max_traj_length=DEFAULTS["max_path_length"],
                                   goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world,
                                 match_reward=params["match_reward"])
    env = normalize(
        SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "feature_network", env.observation_space.flat_dim,
            params["feature_dim"], params["feature_hidden_dims"], tf.tanh,
            tf.tanh, agent.vocab_size, params["embedding_dim"]),
        state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=params["max_path_length"],
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=15,
        snapshot_mode="last",
        exp_prefix="grid_world_sweep3",
        variant=params,
    )
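
A call sketch for the sweep runner above: DEFAULTS (defined elsewhere in the script) supplies the base settings and any keyword argument overrides them; the two values below are illustrative assumptions:

# Hypothetical override: every other parameter falls back to DEFAULTS.
run_experiment(goal_reward=1.0, match_reward=0.5)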
Example #12
def run_experiment(params):
    params_base = copy.copy(DEFAULTS)
    params_base.update(params)
    params = params_base

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings("embeddings",
                                                 len(VOCAB),
                                                 params["feature_dim"],
                                                 params["feature_hidden_dims"],
                                                 tf.tanh,
                                                 tf.tanh,
                                                 len(VOCAB),
                                                 params["embedding_dim"],
                                                 has_other_input=False),
        state_include_action=False,
    )

    baseline = LinearFeatureBaseline(env.spec)

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=LENGTH,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="autoenc_unnorm_reward",
        variant=params,
    )
Example #13
    def __init__(self,
                 optimizer=None,
                 gate_optimizer=None,
                 optimizer_args=None,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = ConjugateGradientOptimizer(**optimizer_args)

        ## separate optimizer required for the gate
        if gate_optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            gate_optimizer = ConjugateGradientOptimizer_Gating_Function(
                **optimizer_args)

        super(TRPO, self).__init__(optimizer=optimizer,
                                   gate_optimizer=gate_optimizer,
                                   **kwargs)
Example #14
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    env = NormalizedBoxEnv(env)
    env = ConvertEnvToTf(env)

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               **variant['policy_params'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    algo_kwargs = variant['algo_kwargs']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **algo_kwargs)
    algo.train()
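
A sketch of a matching variant dict; the keys mirror the lookups in experiment(), and SomeMultitaskEnv plus every concrete value is a placeholder assumption:

# Illustrative variant; SomeMultitaskEnv stands in for a real environment class.
variant = dict(
    env_class=SomeMultitaskEnv,
    env_kwargs=dict(),
    multitask=False,                        # wrap with MultitaskToFlatEnv when True
    policy_params=dict(hidden_sizes=(32, 32)),
    optimizer_params=dict(base_eps=1e-5),   # FiniteDifferenceHvp kwargs
    algo_kwargs=dict(batch_size=4000, max_path_length=100, n_itr=100,
                     discount=0.99, step_size=0.01),
)
experiment(variant)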
Example #15
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )
Example #16
    def _init_bnn_trpo(self, bnn_model, training_policy, time_step):

        if hasattr(self.env._wrapped_env, '_wrapped_env'):
            inner_env = self.env._wrapped_env._wrapped_env
        else:
            inner_env = self.env._wrapped_env.env.unwrapped

        cost_np_vec = inner_env.cost_np_vec

        batch_size = self.policy_opt_params["trpo"]["batch_size"]
        if bnn_model is not None:
            bnn_env = TfEnv(
                BayesNeuralNetEnv(env=self.env,
                                  inner_env=inner_env,
                                  cost_np=cost_np_vec,
                                  bnn_model=bnn_model,
                                  sam_mode=None))
        else:
            bnn_env = self.env

        baseline = LinearFeatureBaseline(env_spec=self.env.spec)

        algo = TRPO(
            env=bnn_env,
            policy=training_policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=time_step,
            discount=self.policy_opt_params["trpo"]["discount"],
            step_size=self.policy_opt_params["trpo"]["step_size"],
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
            # sampler_args=sampler_args,  # params for VectorizedSampler
        )

        return algo, cost_np_vec
Example #17
    def setup(self, env, policy, start_itr):

        if not self.args.algo == 'thddpg':
            # Baseline
            if self.args.baseline_type == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            elif self.args.baseline_type == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            else:
                raise NotImplementedError(self.args.baseline_type)

            if self.args.control == 'concurrent':
                baseline = [baseline for _ in range(len(env.agents))]
        # Logger
        default_log_dir = config.LOG_DIR
        if self.args.log_dir is None:
            log_dir = osp.join(default_log_dir, self.args.exp_name)
        else:
            log_dir = self.args.log_dir

        tabular_log_file = osp.join(log_dir, self.args.tabular_log_file)
        text_log_file = osp.join(log_dir, self.args.text_log_file)
        params_log_file = osp.join(log_dir, self.args.params_log_file)

        logger.log_parameters_lite(params_log_file, self.args)
        logger.add_text_output(text_log_file)
        logger.add_tabular_output(tabular_log_file)
        prev_snapshot_dir = logger.get_snapshot_dir()
        prev_mode = logger.get_snapshot_mode()
        logger.set_snapshot_dir(log_dir)
        logger.set_snapshot_mode(self.args.snapshot_mode)
        logger.set_log_tabular_only(self.args.log_tabular_only)
        logger.push_prefix("[%s] " % self.args.exp_name)

        if self.args.algo == 'tftrpo':
            algo = MATRPO(
                env=env,
                policy_or_policies=policy,
                baseline_or_baselines=baseline,
                batch_size=self.args.batch_size,
                start_itr=start_itr,
                max_path_length=self.args.max_path_length,
                n_itr=self.args.n_iter,
                discount=self.args.discount,
                gae_lambda=self.args.gae_lambda,
                step_size=self.args.step_size,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)) if self.args.recurrent else None,
                ma_mode=self.args.control)
        elif self.args.algo == 'thddpg':
            qfunc = thContinuousMLPQFunction(env_spec=env.spec)
            if self.args.exp_strategy == 'ou':
                es = OUStrategy(env_spec=env.spec)
            elif self.args.exp_strategy == 'gauss':
                es = GaussianStrategy(env_spec=env.spec)
            else:
                raise NotImplementedError()

            algo = thDDPG(env=env,
                          policy=policy,
                          qf=qfunc,
                          es=es,
                          batch_size=self.args.batch_size,
                          max_path_length=self.args.max_path_length,
                          epoch_length=self.args.epoch_length,
                          min_pool_size=self.args.min_pool_size,
                          replay_pool_size=self.args.replay_pool_size,
                          n_epochs=self.args.n_iter,
                          discount=self.args.discount,
                          scale_reward=0.01,
                          qf_learning_rate=self.args.qfunc_lr,
                          policy_learning_rate=self.args.policy_lr,
                          eval_samples=self.args.eval_samples,
                          mode=self.args.control)
        return algo
Example #18
    def __init__(self, env, args):
        self.args = args
        # Parallel setup
        parallel_sampler.initialize(n_parallel=args.n_parallel)
        if args.seed is not None:
            set_seed(args.seed)
            parallel_sampler.set_seed(args.seed)

        env, policy = rllab_envpolicy_parser(env, args)

        if not args.algo == 'thddpg':
            # Baseline
            if args.baseline_type == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            elif args.baseline_type == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            else:
                raise NotImplementedError(args.baseline_type)

        # Logger
        default_log_dir = config.LOG_DIR
        if args.log_dir is None:
            log_dir = osp.join(default_log_dir, args.exp_name)
        else:
            log_dir = args.log_dir

        tabular_log_file = osp.join(log_dir, args.tabular_log_file)
        text_log_file = osp.join(log_dir, args.text_log_file)
        params_log_file = osp.join(log_dir, args.params_log_file)

        logger.log_parameters_lite(params_log_file, args)
        logger.add_text_output(text_log_file)
        logger.add_tabular_output(tabular_log_file)
        prev_snapshot_dir = logger.get_snapshot_dir()
        prev_mode = logger.get_snapshot_mode()
        logger.set_snapshot_dir(log_dir)
        logger.set_snapshot_mode(args.snapshot_mode)
        logger.set_log_tabular_only(args.log_tabular_only)
        logger.push_prefix("[%s] " % args.exp_name)

        if args.algo == 'tftrpo':
            self.algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_iter,
                discount=args.discount,
                gae_lambda=args.gae_lambda,
                step_size=args.step_size,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)) if args.recurrent else None,
                mode=args.control)
        elif args.algo == 'thddpg':
            qfunc = thContinuousMLPQFunction(env_spec=env.spec)
            if args.exp_strategy == 'ou':
                es = OUStrategy(env_spec=env.spec)
            elif args.exp_strategy == 'gauss':
                es = GaussianStrategy(env_spec=env.spec)
            else:
                raise NotImplementedError()

            self.algo = thDDPG(env=env,
                               policy=policy,
                               qf=qfunc,
                               es=es,
                               batch_size=args.batch_size,
                               max_path_length=args.max_path_length,
                               epoch_length=args.epoch_length,
                               min_pool_size=args.min_pool_size,
                               replay_pool_size=args.replay_pool_size,
                               n_epochs=args.n_iter,
                               discount=args.discount,
                               scale_reward=0.01,
                               qf_learning_rate=args.qfunc_lr,
                               policy_learning_rate=args.policy_lr,
                               eval_samples=args.eval_samples,
                               mode=args.control)
Example #19
class RARL(TRPO):
    def __init__(self,
                 policy2,
                 baseline2,
                 obs1_dim,
                 obs2_dim,
                 action1_dim,
                 action2_dim,
                 policy_path,
                 optimizer_args=None,
                 optimizer2_args=None,
                 transfer=True,
                 record_rewards=True,
                 rewards=None,
                 sample_policy_1=False,
                 N1=1,
                 N2=1,
                 policy_save_interval=50,
                 **kwargs):
        self.transfer = transfer
        sampler_cls = RARLSampler
        sampler_args = dict()
        self.policy2 = policy2
        self.baseline2 = baseline2
        if optimizer_args is None:
            optimizer_args = dict()
        if optimizer2_args is None:
            optimizer2_args = dict()
        self.optimizer2 = ConjugateGradientOptimizer(**optimizer2_args)

        self.obs1_dim = obs1_dim
        self.obs2_dim = obs2_dim
        self.action1_dim = action1_dim
        self.action2_dim = action2_dim

        self.record_rewards = record_rewards
        if self.record_rewards:
            if rewards is None:  #create empty dict
                self.rewards = {}
                self.rewards['average_discounted_return1'] = []
                self.rewards['AverageReturn1'] = []
                self.rewards['StdReturn1'] = []
                self.rewards['MaxReturn1'] = []
                self.rewards['MinReturn1'] = []

                self.rewards['average_discounted_return2'] = []
                self.rewards['AverageReturn2'] = []
                self.rewards['StdReturn2'] = []
                self.rewards['MaxReturn2'] = []
                self.rewards['MinReturn2'] = []
            else:
                self.rewards = rewards

        self.N1 = N1
        self.N2 = N2

        self.policy_path = policy_path
        self.policy_save_interval = policy_save_interval
        self.sample_policy_1 = sample_policy_1

        super(RARL, self).__init__(sampler_cls=sampler_cls,
                                   sampler_args=sampler_args,
                                   optimizer_args=optimizer_args,
                                   **kwargs)

    @overrides
    def init_opt(self):
        #first policy
        is_recurrent = int(self.policy.recurrent)

        extra_dims = 1 + is_recurrent
        name = 'obs'
        obs_var = tf.placeholder(tf.float32,
                                 shape=[None] * extra_dims + [self.obs1_dim],
                                 name=name)

        name = 'action'
        action_var = tf.placeholder(tf.float32,
                                    shape=[None] * extra_dims +
                                    [self.action1_dim],
                                    name=name)

        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )

        dist = self.policy.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=k)
            for k, shape in self.policy.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = -tf.reduce_sum(
                lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(lr * advantage_var)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  leq_constraint=(mean_kl, self.step_size),
                                  inputs=input_list,
                                  constraint_name="mean_kl")

        #second policy
        is_recurrent = int(self.policy2.recurrent)

        extra_dims = 1 + is_recurrent
        name = 'obs'
        obs_var = tf.placeholder(tf.float32,
                                 shape=[None] * extra_dims + [self.obs2_dim],
                                 name=name)

        name = 'action'
        action_var = tf.placeholder(tf.float32,
                                    shape=[None] * extra_dims +
                                    [self.action2_dim],
                                    name=name)

        advantage_var = tensor_utils.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=tf.float32,
        )
        dist = self.policy2.distribution

        old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name='old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=k)
            for k, shape in self.policy2.state_info_specs
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy2.state_info_keys
        ]

        if is_recurrent:
            valid_var = tf.placeholder(tf.float32,
                                       shape=[None, None],
                                       name="valid")
        else:
            valid_var = None

        dist_info_vars = self.policy2.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
            surr_loss = -tf.reduce_sum(
                lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(lr * advantage_var)

        input_list = [
            obs_var,
            action_var,
            advantage_var,
        ] + state_info_vars_list + old_dist_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer2.update_opt(loss=surr_loss,
                                   target=self.policy2,
                                   leq_constraint=(mean_kl, self.step_size),
                                   inputs=input_list,
                                   constraint_name="mean_kl")

    @overrides
    def optimize_policy(self, itr, samples_data, policy_num):
        all_input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))
        agent_infos = samples_data["agent_infos"]
        if policy_num == 1:
            state_info_list = [
                agent_infos[k] for k in self.policy.state_info_keys
            ]
            dist_info_list = [
                agent_infos[k] for k in self.policy.distribution.dist_info_keys
            ]
        else:
            state_info_list = [
                agent_infos[k] for k in self.policy2.state_info_keys
            ]
            dist_info_list = [
                agent_infos[k]
                for k in self.policy2.distribution.dist_info_keys
            ]
        all_input_values += tuple(state_info_list) + tuple(dist_info_list)

        if policy_num == 1:
            if self.policy.recurrent:
                all_input_values += (samples_data["valids"], )
            logger.log("Computing loss before")
            loss_before = self.optimizer.loss(all_input_values)
            logger.log("Computing KL before")
            mean_kl_before = self.optimizer.constraint_val(all_input_values)
            logger.log("Optimizing")
            self.optimizer.optimize(all_input_values)
            logger.log("Computing KL after")
            mean_kl = self.optimizer.constraint_val(all_input_values)
            logger.log("Computing loss after")
            loss_after = self.optimizer.loss(all_input_values)
        else:
            if self.policy2.recurrent:
                all_input_values += (samples_data["valids"], )
            logger.log("Computing loss before")
            loss_before = self.optimizer2.loss(all_input_values)
            logger.log("Computing KL before")
            mean_kl_before = self.optimizer2.constraint_val(all_input_values)
            logger.log("Optimizing")
            self.optimizer2.optimize(all_input_values)
            logger.log("Computing KL after")
            mean_kl = self.optimizer2.constraint_val(all_input_values)
            logger.log("Computing loss after")
            loss_after = self.optimizer2.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('MeanKLBefore', mean_kl_before)
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    @overrides
    def obtain_samples(self, itr, policy_num):
        return self.sampler.obtain_samples(itr, policy_num,
                                           self.sample_policy_1)

    @overrides
    def process_samples(self, itr, paths, policy_num):
        return self.sampler.process_samples(itr, paths, policy_num)

    @overrides
    def train(self, sess=None):
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        if not self.transfer:
            sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            if itr == 0 or itr % self.policy_save_interval == 0:
                params = dict(
                    params1=self.policy.get_param_values(),
                    params2=self.policy2.get_param_values(),
                )
                joblib.dump(params,
                            self.policy_path + '/params' + str(itr) + '.pkl',
                            compress=3)

            itr_start_time = time.time()

            for n1 in range(self.N1):
                with logger.prefix('itr #%d ' % itr + 'n1 #%d |' % n1):
                    logger.log("training policy 1...")
                    logger.log("Obtaining samples...")
                    paths = self.obtain_samples(itr, 1)
                    logger.log("Processing samples...")
                    samples_data = self.process_samples(itr, paths, 1)

                    if self.record_rewards:
                        undiscounted_returns = [
                            sum(path["rewards"]) for path in paths
                        ]
                        average_discounted_return = np.mean(
                            [path["returns"][0] for path in paths])
                        AverageReturn = np.mean(undiscounted_returns)
                        StdReturn = np.std(undiscounted_returns)
                        MaxReturn = np.max(undiscounted_returns)
                        MinReturn = np.min(undiscounted_returns)
                        self.rewards['average_discounted_return1'].append(
                            average_discounted_return)
                        self.rewards['AverageReturn1'].append(AverageReturn)
                        self.rewards['StdReturn1'].append(StdReturn)
                        self.rewards['MaxReturn1'].append(MaxReturn)
                        self.rewards['MinReturn1'].append(MinReturn)

                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths, 1)
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, samples_data, 1)

                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime',
                                          time.time() - itr_start_time)
                    logger.dump_tabular(with_prefix=False)

            for n2 in range(self.N2):
                if itr != self.n_itr - 1:  # don't train the adversary on the last iteration
                    with logger.prefix('itr #%d ' % itr + 'n2 #%d |' % n2):
                        logger.log("training policy 2...")
                        logger.log("Obtaining samples...")
                        paths = self.obtain_samples(itr, 2)
                        logger.log("Processing samples...")
                        samples_data = self.process_samples(itr, paths, 2)

                        if self.record_rewards:
                            undiscounted_returns = [
                                sum(path["rewards"]) for path in paths
                            ]
                            average_discounted_return = np.mean(
                                [path["returns"][0] for path in paths])
                            AverageReturn = np.mean(undiscounted_returns)
                            StdReturn = np.std(undiscounted_returns)
                            MaxReturn = np.max(undiscounted_returns)
                            MinReturn = np.min(undiscounted_returns)
                            self.rewards['average_discounted_return2'].append(
                                average_discounted_return)
                            self.rewards['AverageReturn2'].append(
                                AverageReturn)
                            self.rewards['StdReturn2'].append(StdReturn)
                            self.rewards['MaxReturn2'].append(MaxReturn)
                            self.rewards['MinReturn2'].append(MinReturn)

                        logger.log("Logging diagnostics...")
                        self.log_diagnostics(paths, 2)
                        logger.log("Optimizing policy...")
                        self.optimize_policy(itr, samples_data, 2)

                        logger.record_tabular('Time', time.time() - start_time)
                        logger.record_tabular('ItrTime',
                                              time.time() - itr_start_time)
                        logger.dump_tabular(with_prefix=False)

            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr)  # , **kwargs)
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            # logger.record_tabular('Time', time.time() - start_time)
            # logger.record_tabular('ItrTime', time.time() - itr_start_time)
            # logger.dump_tabular(with_prefix=False)

        self.shutdown_worker()
        if created_session:
            sess.close()

    @overrides
    def get_itr_snapshot(self, itr):
        if self.record_rewards:
            return dict(
                itr=itr,
                policy=self.policy,
                policy2=self.policy2,
                baseline=self.baseline,
                baseline2=self.baseline2,
                env=self.env,
                rewards=self.rewards,
            )
        else:
            return dict(
                itr=itr,
                policy=self.policy,
                policy2=self.policy2,
                baseline=self.baseline,
                baseline2=self.baseline2,
                env=self.env,
            )

    @overrides
    def log_diagnostics(self, paths, policy_num):
        self.env.log_diagnostics(paths)
        if policy_num == 1:
            self.policy.log_diagnostics(paths)
            self.baseline.log_diagnostics(paths)
        else:
            self.policy2.log_diagnostics(paths)
            self.baseline2.log_diagnostics(paths)
Example #20
    def __init__(
            self,
            env,
            qf,
            es,
            policy=None,
            policy_batch_size=32,
            n_epochs=200,
            epoch_length=1000,
            discount=0.99,
            max_path_length=250,
            policy_weight_decay=0,
            policy_update_method='adam',
            policy_learning_rate=1e-3,
            policy_step_size=0.01,
            policy_optimizer_args=dict(),
            policy_updates_ratio=1.0,
            policy_use_target=True,
            policy_sample_last=False,
            eval_samples=10000,
            updates_ratio=1.0, # #updates/#samples
            scale_reward=1.0,
            include_horizon_terminal_transitions=False,
            save_freq=0,
            save_format='pickle',
            restore_auto=True,
            **kwargs):

        self.env = env
        self.policy = policy
        if self.policy is None: self.qf_dqn = True
        else: self.qf_dqn = False
        self.qf = qf
        self.es = es
        if self.es is None: self.es = ExplorationStrategy()
        self.n_epochs = n_epochs
        self.epoch_length = epoch_length
        self.discount = discount
        self.max_path_length = max_path_length

        self.init_critic(**kwargs)

        if not self.qf_dqn:
            self.policy_weight_decay = policy_weight_decay
            if policy_update_method == 'adam':
                self.policy_update_method = \
                    FirstOrderOptimizer(
                        update_method=policy_update_method,
                        learning_rate=policy_learning_rate,
                        **policy_optimizer_args,
                    )
                self.policy_learning_rate = policy_learning_rate
            elif policy_update_method == 'cg':
                self.policy_update_method = \
                    ConjugateGradientOptimizer(
                        **policy_optimizer_args,
                    )
                self.policy_step_size = policy_step_size
            self.policy_optimizer_args = policy_optimizer_args
            self.policy_updates_ratio = policy_updates_ratio
            self.policy_use_target = policy_use_target
            self.policy_batch_size = policy_batch_size
            self.policy_sample_last = policy_sample_last
            self.policy_surr_averages = []
            self.exec_policy = self.policy
        else:
            self.policy_batch_size = 0
            self.exec_policy = self.qf

        self.eval_samples = eval_samples
        self.updates_ratio = updates_ratio
        self.include_horizon_terminal_transitions = include_horizon_terminal_transitions

        self.paths = []
        self.es_path_returns = []
        self.paths_samples_cnt = 0

        self.scale_reward = scale_reward

        self.train_policy_itr = 0

        self.save_freq = save_freq
        self.save_format = save_format
        self.restore_auto = restore_auto
Example #21
class DDPG(RLAlgorithm, Poleval):

    def __init__(
            self,
            env,
            qf,
            es,
            policy=None,
            policy_batch_size=32,
            n_epochs=200,
            epoch_length=1000,
            discount=0.99,
            max_path_length=250,
            policy_weight_decay=0,
            policy_update_method='adam',
            policy_learning_rate=1e-3,
            policy_step_size=0.01,
            policy_optimizer_args=dict(),
            policy_updates_ratio=1.0,
            policy_use_target=True,
            policy_sample_last=False,
            eval_samples=10000,
            updates_ratio=1.0, # #updates/#samples
            scale_reward=1.0,
            include_horizon_terminal_transitions=False,
            save_freq=0,
            save_format='pickle',
            restore_auto=True,
            **kwargs):

        self.env = env
        self.policy = policy
        if self.policy is None: self.qf_dqn = True
        else: self.qf_dqn = False
        self.qf = qf
        self.es = es
        if self.es is None: self.es = ExplorationStrategy()
        self.n_epochs = n_epochs
        self.epoch_length = epoch_length
        self.discount = discount
        self.max_path_length = max_path_length

        self.init_critic(**kwargs)

        if not self.qf_dqn:
            self.policy_weight_decay = policy_weight_decay
            if policy_update_method == 'adam':
                self.policy_update_method = \
                    FirstOrderOptimizer(
                        update_method=policy_update_method,
                        learning_rate=policy_learning_rate,
                        **policy_optimizer_args,
                    )
                self.policy_learning_rate = policy_learning_rate
            elif policy_update_method == 'cg':
                self.policy_update_method = \
                    ConjugateGradientOptimizer(
                        **policy_optimizer_args,
                    )
                self.policy_step_size = policy_step_size
            self.policy_optimizer_args = policy_optimizer_args
            self.policy_updates_ratio = policy_updates_ratio
            self.policy_use_target = policy_use_target
            self.policy_batch_size = policy_batch_size
            self.policy_sample_last = policy_sample_last
            self.policy_surr_averages = []
            self.exec_policy = self.policy
        else:
            self.policy_batch_size = 0
            self.exec_policy = self.qf

        self.eval_samples = eval_samples
        self.updates_ratio = updates_ratio
        self.include_horizon_terminal_transitions = include_horizon_terminal_transitions

        self.paths = []
        self.es_path_returns = []
        self.paths_samples_cnt = 0

        self.scale_reward = scale_reward

        self.train_policy_itr = 0

        self.save_freq = save_freq
        self.save_format = save_format
        self.restore_auto = restore_auto

    def start_worker(self):
        parallel_sampler.populate_task(self.env, self.exec_policy)

    def save(self, checkpoint_dir=None):
        if checkpoint_dir is None: checkpoint_dir = logger.get_snapshot_dir()

        pool_file = os.path.join(checkpoint_dir, 'pool.chk')
        if self.save_format == 'pickle':
            pickle_dump(pool_file + '.tmp', self.pool)
        elif self.save_format == 'joblib':
            joblib.dump(self.pool, pool_file + '.tmp', compress=1, cache_size=1e9)
        else: raise NotImplementedError
        shutil.move(pool_file + '.tmp', pool_file)

        checkpoint_file = os.path.join(checkpoint_dir, 'params.chk')
        sess = tf.get_default_session()
        saver = tf.train.Saver()
        saver.save(sess, checkpoint_file)

        tabular_file = os.path.join(checkpoint_dir, 'progress.csv')
        if os.path.isfile(tabular_file):
            tabular_chk_file = os.path.join(checkpoint_dir, 'progress.csv.chk')
            shutil.copy(tabular_file, tabular_chk_file)

        logger.log('Saved to checkpoint %s'%checkpoint_file)

    def restore(self, checkpoint_dir=None):
        if checkpoint_dir is None: checkpoint_dir = logger.get_snapshot_dir()
        checkpoint_file = os.path.join(checkpoint_dir, 'params.chk')
        if os.path.isfile(checkpoint_file + '.meta'):
            sess = tf.get_default_session()
            saver = tf.train.Saver()
            saver.restore(sess, checkpoint_file)

            tabular_chk_file = os.path.join(checkpoint_dir, 'progress.csv.chk')
            if os.path.isfile(tabular_chk_file):
                tabular_file = os.path.join(checkpoint_dir, 'progress.csv')
                logger.remove_tabular_output(tabular_file)
                shutil.copy(tabular_chk_file, tabular_file)
                logger.add_tabular_output(tabular_file)

            pool_file = os.path.join(checkpoint_dir, 'pool.chk')
            if self.save_format == 'pickle':
                self.pool = pickle_load(pool_file)
            elif self.save_format == 'joblib':
                self.pool = joblib.load(pool_file)
            else: raise NotImplementedError

            logger.log('Restored from checkpoint %s'%checkpoint_file)
        else:
            logger.log('No checkpoint %s'%checkpoint_file)

    @overrides
    def train(self):
        global_itr = tf.Variable(0, name='global_itr', trainable=False, dtype=tf.int32)
        increment_global_itr_op = tf.assign(global_itr, global_itr+1)
        global_epoch = tf.Variable(0, name='global_epoch', trainable=False, dtype=tf.int32)
        increment_global_epoch_op = tf.assign(global_epoch, global_epoch+1)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # This seems like a rather sequential method
            self.pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=self.env.action_space.flat_dim,
                replacement_prob=self.replacement_prob,
                env=self.env,
            )
            self.start_worker()

            self.init_opt()
            # This initializes the optimizer parameters
            sess.run(tf.global_variables_initializer())
            path_length = 0
            path_return = 0
            terminal = False
            initial = False
            n_updates = 0
            observation = self.env.reset()

            sample_policy = Serializable.clone(self.exec_policy, name="sample_policy")

            if self.restore_auto: self.restore()
            itr = sess.run(global_itr)
            epoch = sess.run(global_epoch)
            t0 = time()
            logger.log("Critic batch size=%d, Actor batch size=%d"%(self.qf_batch_size, self.policy_batch_size))
            while epoch < self.n_epochs:
                logger.push_prefix('epoch #%d | ' % epoch)
                logger.log("Mem: %f"%memory_usage_resource())
                logger.log("Training started")
                train_qf_itr, train_policy_itr = 0, 0
                for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                    # Execute policy
                    if terminal:  # or path_length > self.max_path_length:
                        # Note that if the last time step ends an episode, the very
                        # last state and observation will be ignored and not added
                        # to the replay pool
                        observation = self.env.reset()
                        self.es.reset()
                        sample_policy.reset()
                        self.es_path_returns.append(path_return)
                        path_length = 0
                        path_return = 0
                        initial = True
                    else:
                        initial = False
                    action = self.es.get_action(itr, observation, policy=sample_policy)  # qf=qf)

                    next_observation, reward, terminal, _ = self.env.step(action)
                    path_length += 1
                    path_return += reward

                    if not terminal and path_length >= self.max_path_length:
                        terminal = True
                        # only include the terminal transition in this case if the flag was set
                        if self.include_horizon_terminal_transitions:
                            self.pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial)
                    else:
                        self.pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial)

                    observation = next_observation

                    if self.pool.size > max(self.min_pool_size, self.qf_batch_size):
                        n_updates += self.updates_ratio
                        while n_updates > 0:
                            # Train policy
                            itrs = self.do_training(itr)
                            train_qf_itr += itrs[0]
                            train_policy_itr += itrs[1]
                            n_updates -= 1
                        sample_policy.set_param_values(self.exec_policy.get_param_values())

                    itr = sess.run(increment_global_itr_op)
                    if time() - t0 > 100: gc.collect(); t0 = time()

                logger.log("Training finished")
                logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr))
                if self.pool.size >= self.min_pool_size:
                    self.evaluate(epoch, self.pool)
                    params = self.get_epoch_snapshot(epoch)
                    logger.save_itr_params(epoch, params)
                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
                epoch = sess.run(increment_global_epoch_op)
                if self.save_freq > 0 and (epoch-1) % self.save_freq == 0: self.save()
            self.env.terminate()
            self.exec_policy.terminate()

    def init_opt(self):
        self.init_opt_critic()
        self.init_opt_policy()

    def init_opt_policy(self):
        if not self.qf_dqn:
            obs = self.policy.env_spec.observation_space.new_tensor_variable(
                'pol_obs',
                extra_dims=1,
            )

            if self.policy_use_target:
                logger.log("[init_opt] using target policy.")
                target_policy = Serializable.clone(self.policy, name="target_policy")
            else:
                logger.log("[init_opt] no target policy.")
                target_policy = self.policy

            policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([tf.reduce_sum(tf.square(param))
                                        for param in self.policy.get_params(regularizable=True)])
            policy_qval = self.qf.get_e_qval_sym(
                obs, self.policy,
                deterministic=True
            )
            policy_surr = -tf.reduce_mean(policy_qval)

            policy_reg_surr = policy_surr + policy_weight_decay_term


            policy_input_list = [obs]

            if isinstance(self.policy_update_method, FirstOrderOptimizer):
                self.policy_update_method.update_opt(
                    loss=policy_reg_surr, target=self.policy, inputs=policy_input_list)

                f_train_policy = tensor_utils.compile_function(
                    inputs=policy_input_list,
                    outputs=[policy_surr, self.policy_update_method._train_op],
                )
            else:
                f_train_policy = self.policy_update_method.update_opt_trust_region(
                        loss=policy_reg_surr,
                        input_list=policy_input_list,
                        obs_var=obs,
                        target=self.policy,
                        policy=self.policy,
                        step_size=self.policy_step_size,
                )

            self.opt_info = dict(
                f_train_policy=f_train_policy,
                target_policy=target_policy,
            )

    def do_training(self, itr):
        batch = self.pool.random_batch(self.qf_batch_size)
        self.do_critic_training(itr, batch=batch)

        train_policy_itr = 0

        if not self.qf_dqn and self.pool.size > max(
                self.min_pool_size, self.policy_batch_size):
            self.train_policy_itr += self.policy_updates_ratio
            while self.train_policy_itr > 0:
                if self.policy_sample_last:
                    pol_batch = self.pool.last_batch(self.policy_batch_size)
                else:
                    pol_batch = self.pool.random_batch(self.policy_batch_size)
                self.do_policy_training(itr, batch=pol_batch)
                self.train_policy_itr -= 1
                train_policy_itr += 1

        return 1, train_policy_itr # number of itrs qf, policy are trained

    def do_policy_training(self, itr, batch):
        target_policy = self.opt_info["target_policy"]
        obs, = ext.extract(batch, "observations")
        f_train_policy = self.opt_info["f_train_policy"]
        if isinstance(self.policy_update_method, FirstOrderOptimizer):
            policy_surr, _ = f_train_policy(obs)
        else:
            agent_infos = self.policy.dist_info(obs)
            state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
            dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
            all_input_values = (obs, obs, ) + tuple(state_info_list) + tuple(dist_info_list)
            policy_results = f_train_policy(all_input_values)
            policy_surr = policy_results["loss_after"]
        if self.policy_use_target:
            target_policy.set_param_values(
                target_policy.get_param_values() * (1.0 - self.soft_target_tau) +
                self.policy.get_param_values() * self.soft_target_tau)
        self.policy_surr_averages.append(policy_surr)

    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.exec_policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean(
            [special.discount_return(path["rewards"], self.discount) for path in paths]
        )

        returns = [sum(path["rewards"]) for path in paths]

        average_action = np.mean(np.square(np.concatenate(
            [path["actions"] for path in paths]
        )))

        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True)
        )

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Iteration', epoch)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('StdReturn',
                              np.std(returns))
        logger.record_tabular('MaxReturn',
                              np.max(returns))
        logger.record_tabular('MinReturn',
                              np.min(returns))
        if len(self.es_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn',
                                  np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn',
                                  np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn',
                                  np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('QFunRegParamNorm',
                              qfun_reg_param_norm)
        self.env.log_diagnostics(paths)
        self.log_critic_training()

        self.es_path_returns = []

        if not self.qf_dqn:
            average_policy_surr = np.mean(self.policy_surr_averages)
            policy_reg_param_norm = np.linalg.norm(
                self.policy.get_param_values(regularizable=True)
            )
            logger.record_tabular('AveragePolicySurr', average_policy_surr)
            logger.record_tabular('PolicyRegParamNorm',
                              policy_reg_param_norm)
            self.policy.log_diagnostics(paths)
            self.policy_surr_averages = []

    def get_epoch_snapshot(self, epoch):
        snapshot = dict(
            env=self.env,
            epoch=epoch,
            qf=self.qf,
            target_qf=self.opt_info_critic["target_qf"],
            es=self.es,
        )
        if not self.qf_dqn:
            snapshot.update(dict(
                policy=self.policy,
                target_policy=self.opt_info["target_policy"],
            ))
        return snapshot
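
The target-policy update in do_policy_training above is plain Polyak (soft) averaging of flat parameter vectors. A minimal standalone sketch of that update rule, NumPy only, with hypothetical names independent of the class above:

import numpy as np

def soft_update(target_params, source_params, tau=0.001):
    # theta_target <- (1 - tau) * theta_target + tau * theta_source
    return target_params * (1.0 - tau) + source_params * tau

# toy usage with flat parameter vectors
target = np.zeros(4)
source = np.ones(4)
target = soft_update(target, source, tau=0.5)  # -> [0.5, 0.5, 0.5, 0.5]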
Example #22
0
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--checkpoint', type=str, default=None)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(StandardizedEnv(env,
                                     scale_reward=args.reward_scale,
                                     enable_obsnorm=False),
                     mode=args.control))

        if args.recurrent:
            if args.conv:
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=int(
                                                  args.policy_hidden_sizes),
                                              name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden_sizes),
                                               name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control,
    )

    algo.train()
Example #23
0
    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            optimizer=None,
            tr_optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            no_initial_trust_region=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):
            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.input_dim = input_shape[0]
            self.observation_space = Discrete(self.input_dim)
            self.action_space = Discrete(output_dim)


            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            if prob_network is None:
                prob_network = MLP(
                    input_shape=input_shape,
                    output_dim=output_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name="prob_network"
                )

            l_prob = prob_network.output_layer

            LayersPowered.__init__(self, [l_prob])

            xs_var = prob_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys")
            old_prob_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="old_prob")

            x_mean_var = tf.get_variable(
                name="x_mean",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(0., dtype=tf.float32)
            )
            x_std_var = tf.get_variable(
                name="x_std",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(1., dtype=tf.float32)
            )

            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

            old_info_vars = dict(prob=old_prob_var)
            info_vars = dict(prob=prob_var)

            dist = self._dist = Categorical(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1), output_dim)

            self.prob_network = prob_network
            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
            self.l_prob = l_prob

            self.optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var],
                                         inputs=[xs_var, ys_var, old_prob_var],
                                         leq_constraint=(mean_kl, step_size)
                                         )

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
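
Only the constructor is shown above, so the enclosing class name is unknown; the sketch below is a hedged instantiation example with a placeholder class name, using the constructor arguments exactly as declared and assuming an env with a spec and a batch xs of inputs already exist:

# Placeholder class name: the snippet above only shows __init__.
regressor = CategoricalOutputRegressor(
    name="label_regressor",
    input_shape=(env.spec.observation_space.flat_dim,),  # assumed flat observations
    output_dim=env.spec.action_space.n,                  # number of categories
    hidden_sizes=(32, 32),
    use_trust_region=True,
    step_size=0.01,
)
probs = regressor.f_prob(xs)      # per-class probabilities, as compiled above
onehot = regressor.f_predict(xs)  # one-hot argmax predictions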
Example #24
0
    # The neural-network policy has three hidden layers with 100, 50, and 25 hidden units.
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=0.01,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_prefix="UnifiedDDPG_" + args.env + "_trpo",
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,
)
Example #25
0
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--enable_obsnorm', action='store_true', default=False)
    parser.add_argument('--chunked', action='store_true', default=False)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--anneal_step_size', type=int, default=0)

    parser.add_argument('--n_timesteps', type=int, default=8000)

    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--radius', type=float, default=0.015)
    parser.add_argument('--n_evaders', type=int, default=10)
    parser.add_argument('--n_pursuers', type=int, default=8)
    parser.add_argument('--n_poison', type=int, default=10)
    parser.add_argument('--n_coop', type=int, default=4)
    parser.add_argument('--n_sensors', type=int, default=30)
    parser.add_argument('--sensor_range', type=str, default='0.2')
    parser.add_argument('--food_reward', type=float, default=5)
    parser.add_argument('--poison_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.05)
    parser.add_argument('--reward_mech', type=str, default='local')

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = True if args.control == 'centralized' else False

    sensor_range = np.array(list(map(float, args.sensor_range.split(','))))
    if len(sensor_range) == 1:
        sensor_range = sensor_range[0]
    else:
        assert sensor_range.shape == (args.n_pursuers, )

    env = MAWaterWorld(args.n_pursuers,
                       args.n_evaders,
                       args.n_coop,
                       args.n_poison,
                       radius=args.radius,
                       n_sensors=args.n_sensors,
                       food_reward=args.food_reward,
                       poison_reward=args.poison_reward,
                       encounter_reward=args.encounter_reward,
                       reward_mech=args.reward_mech,
                       sensor_range=sensor_range,
                       obstacle_loc=None)

    env = TfEnv(
        RLLabEnv(StandardizedEnv(env,
                                 scale_reward=args.reward_scale,
                                 enable_obsnorm=args.enable_obsnorm),
                 mode=args.control))

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        feature_network = MLP(
            name='feature_net',
            input_shape=(env.spec.observation_space.flat_dim +
                         env.spec.action_space.flat_dim, ),
            output_dim=16,
            hidden_sizes=(128, 64, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = GaussianGRUPolicy(env_spec=env.spec,
                                       feature_network=feature_network,
                                       hidden_dim=int(
                                           args.policy_hidden_sizes),
                                       name='policy')
        elif args.recurrent == 'lstm':
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        feature_network=feature_network,
                                        hidden_dim=int(
                                            args.policy_hidden_sizes),
                                        name='policy')
    else:
        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=tuple(map(int, args.policy_hidden_sizes.split(','))),
            min_std=10e-5)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif args.baseline_type == 'mlp':
        raise NotImplementedError()
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec, hidden_sizes=tuple(map(int, args.baseline_hidden_sizes.split(','))))
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        #max_path_length_limit=args.max_path_length_limit,
        update_max_path_length=args.update_curriculum,
        anneal_step_size=args.anneal_step_size,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control
        if not args.chunked else 'chunk_{}'.format(args.control),
    )

    algo.train()
def run_task(vv, log_dir=None, exp_name=None):
    global policy
    global baseline
    policy = None
    baseline = None

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Check if variant is available
    if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']:
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ['MRZR', 'RCCar']:
        raise ValueError('Unrecognized robot type')

    # Load environment
    if not vv['use_ros']:
        env = StraightEnv(
            target_velocity=vv['target_velocity'],
            dt=vv['dt'],
            model_type=vv['model_type'],
            robot_type=vv['robot_type'],
            mu_s=vv['mu_s'],
            mu_k=vv['mu_k']
        )
        env = TfEnv(env)
    else:
        from aa_simulation.envs.straight.straight_env_ros import StraightEnvROS
        env = StraightEnvROS(
            target_velocity=vv['target_velocity'],
            dt=vv['dt'],
            model_type=vv['model_type'],
            robot_type=vv['robot_type']
        )

    # Save variant information for comparison plots
    # variant_file = logger.get_snapshot_dir() + '/variant.json'
    # logger.log_variant(variant_file, vv)

    # Set variance for each action component separately for exploration
    # Note: We set the variance manually because we are not scaling our
    #       action space during training.
    init_std_speed = vv['target_velocity'] / 4
    init_std_steer = np.pi / 6
    init_std = [init_std_speed, init_std_steer]

    # Build policy and baseline networks
    # Note: Mean of policy network set to analytically computed values for
    #       faster training (rough estimates for RL to fine-tune).
    if policy is None or baseline is None:
        target_velocity = vv['target_velocity']
        target_steering = 0
        output_mean = np.array([target_velocity, target_steering])
        hidden_sizes = (32, 32)

        # In mean network, allow output b values to dominate final output
        # value by constraining the magnitude of the output W matrix. This is
        # to allow faster learning. These numbers are arbitrarily chosen.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)


        policy = GaussianLSTMPolicy(
            name="policy",
            env_spec=env.spec,
            # input_shape=(env.spec.observation_space.flat_dim,),
            # output_dim=env.spec.action_space.flat_dim,
            # gru_layer_cls=L.GRULayer,
        )
        # mean_network = MLP(
        #     input_shape=(env.spec.observation_space.flat_dim,),
        #     output_dim=env.spec.action_space.flat_dim,
        #     hidden_sizes=hidden_sizes,
        #     hidden_nonlinearity=LN.rectify,
        #     output_nonlinearity=None,
        #     output_W_init=LI.GlorotUniform(gain=W_gain),
        #     output_b_init=output_mean
        # )
        # policy = GaussianMLPPolicy(
        #     env_spec=env.spec,
        #     hidden_sizes=(32, 32),
        #     init_std=init_std,
        #     mean_network=mean_network
        # )
        baseline = LinearFeatureBaseline(
            env_spec=env.spec,
            target_key='returns'
        )

    # Reset variance to re-enable exploration when using pre-trained networks
    else:
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True
        )
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output([policy._l_mean, policy._l_log_std])
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var]
        )

    safety_baseline = LinearFeatureBaseline(
        env_spec=env.spec,
        target_key='safety_returns'
    )

    safety_constraint = StraightSafetyConstraint(
        max_value=1.0,
        baseline=safety_baseline
    )

    if vv['algo'] == 'TRPO':
        algo = Trpo(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=2000,
            discount=0.99,
            step_size=trpo_stepsize,
            plot=False,
            optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        )
    else:
        algo = CPO(
            env=env,
            policy=policy,
            baseline=baseline,
            safety_constraint=safety_constraint,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=2000,
            discount=0.99,
            step_size=trpo_stepsize,
            gae_lambda=0.95,
            safety_gae_lambda=1,
            optimizer_args={'subsample_factor': trpo_subsample_factor},
            plot=False
        )
    algo.train()
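
run_task above only reads a handful of keys from the variant dict vv; a hedged example call follows, where all numeric values are illustrative placeholders rather than recommended settings:

vv = dict(
    use_ros=False,                 # take the simulated StraightEnv branch
    target_velocity=1.0,
    dt=0.1,
    model_type='BrushTireModel',   # must be 'BrushTireModel' or 'LinearTireModel'
    robot_type='RCCar',            # must be 'MRZR' or 'RCCar'
    mu_s=0.9,
    mu_k=0.8,
    algo='TRPO',                   # anything else selects the CPO branch
)
run_task(vv, log_dir='/tmp/straight_trpo', exp_name='straight_trpo')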
Example #27
0
    def __init__(
        self,
        input_shape,
        output_dim,
        name,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.relu,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):

            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            p_network = MLP(input_shape=input_shape,
                            output_dim=output_dim,
                            hidden_sizes=hidden_sizes,
                            hidden_nonlinearity=hidden_nonlinearity,
                            output_nonlinearity=tf.nn.sigmoid,
                            name="p_network")

            l_p = p_network.output_layer

            LayersPowered.__init__(self, [l_p])

            xs_var = p_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    shape=(None, output_dim),
                                    name="ys")
            old_p_var = tf.placeholder(dtype=tf.float32,
                                       shape=(None, output_dim),
                                       name="old_p")

            x_mean_var = tf.get_variable(name="x_mean",
                                         initializer=tf.zeros_initializer,
                                         shape=(1, ) + input_shape)
            x_std_var = tf.get_variable(name="x_std",
                                        initializer=tf.ones_initializer,
                                        shape=(1, ) + input_shape)

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            p_var = L.get_output(l_p,
                                 {p_network.input_layer: normalized_xs_var})

            old_info_vars = dict(p=old_p_var)
            info_vars = dict(p=p_var)

            dist = self._dist = Bernoulli(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            predicted = p_var >= 0.5

            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_p = tensor_utils.compile_function([xs_var], p_var)
            self.l_p = l_p

            self.optimizer.update_opt(loss=loss,
                                      target=self,
                                      network_outputs=[p_var],
                                      inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss,
                                         target=self,
                                         network_outputs=[p_var],
                                         inputs=[xs_var, ys_var, old_p_var],
                                         leq_constraint=(mean_kl, step_size))

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
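
As with Example #23, only __init__ is visible here, so the class name below is a placeholder; the sketch shows how the compiled functions defined in the constructor would typically be used, assuming xs is an (N, input_dim) batch of inputs:

# Placeholder class name; input_dim and xs are assumed to exist.
clf = BernoulliOutputRegressor(
    input_shape=(input_dim,),
    output_dim=1,
    name="binary_classifier",
    hidden_sizes=(32, 32),
    use_trust_region=False,
)
p = clf.f_p(xs)            # sigmoid outputs in [0, 1]
y_hat = clf.f_predict(xs)  # boolean predictions, thresholded at p >= 0.5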
Example #28
0
validator = auto_validator.AutoValidator(
    summary_writer, 
    data['obs_mean'], 
    data['obs_std'],
    render=args.validator_render,
    render_every=args.render_every,
    flat_recurrent=args.policy_recurrent,
    validate_normalization=args.validator_validate_normalization
)

# build algo 
saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=.5)
sampler_args = dict(n_envs=args.n_envs) if args.vectorize else None
if args.policy_recurrent:
    optimizer = ConjugateGradientOptimizer(
        max_backtracks=50,
        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)
    )
else:
    optimizer = None
algo = GAIL(
    critic=critic,
    recognition=recognition_model,
    reward_handler=reward_handler,
    env=env,
    policy=policy,
    baseline=baseline,
    validator=validator,
    batch_size=args.batch_size,
    max_path_length=args.max_path_length,
    n_itr=args.n_itr,
    discount=args.discount,
Example #29
0
with tf.compat.v1.Session() as sess:
    for env_name, env in envs:

        logger.log("Training Policy on %s" % env_name)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.batch_size,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=args.step_size,
            optimizer=ConjugateGradientOptimizer(
                reg_coeff=args.reg_coeff,
                hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)))

        custom_train(algo, sess=sess)

        rollouts = algo.obtain_samples(args.num_epochs + 1)

        logger.log("Average reward for training rollouts on (%s): %f +- %f " %
                   (env_name, np.mean([np.sum(p['rewards'])
                                       for p in rollouts]),
                    np.std([np.sum(p['rewards']) for p in rollouts])))

    # Final evaluation on all environments using the learned policy

    total_rollouts = []
    for env_name, env in envs:
Example #30
0
def get_algo(env, policy, es, qf, baseline, max_path_length, batch_size,
             replay_pool_size, discount, scale_reward, learning_rate,
             replacement_prob, policy_updates_ratio, step_size, gae_lambda,
             sample_backups, kl_sample_backups, qprop_eta_option, qprop_unbias,
             qprop_nu, algo_name, n_itr, recurrent, updates_ratio,
             policy_use_target, policy_batch_size, policy_sample_last,
             ac_delta, ac_sample_backups, save_freq, restore_auto,
             qf_learning_rate, qf_use_target, qf_mc_ratio, qf_batch_size,
             qf_residual_phi, **kwargs):
    algo = None
    algo_class = None
    min_pool_size = 1000
    qf_baseline = None
    extra_kwargs = dict()

    print('Creating algo=%s with n_itr=%d, max_path_length=%d...' %
          (algo_name, n_itr, max_path_length))
    if algo_name in [
            'ddpg',
            'dspg',
            'dspgoff',
            'dqn',
            'dsqn',
            'trpg',
            'trpgoff',
    ]:
        if algo_name in [
                'trpg',
        ]:
            extra_kwargs['policy_update_method'] = 'cg'
        algo = DDPG(
            env=env,
            policy=policy,
            policy_use_target=policy_use_target,
            es=es,
            qf=qf,
            qf_use_target=qf_use_target,
            policy_batch_size=policy_batch_size,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            max_path_length=max_path_length,
            epoch_length=batch_size,  # make comparable to batchopt methods
            min_pool_size=min_pool_size,
            replay_pool_size=replay_pool_size,
            n_epochs=n_itr,
            discount=discount,
            scale_reward=scale_reward,
            qf_learning_rate=qf_learning_rate,
            policy_learning_rate=learning_rate,
            policy_step_size=step_size,
            policy_sample_last=policy_sample_last,
            replacement_prob=replacement_prob,
            policy_updates_ratio=policy_updates_ratio,
            updates_ratio=updates_ratio,
            save_freq=save_freq,
            restore_auto=restore_auto,
            **extra_kwargs,
        )
        algo_class = 'DDPG'
    elif algo_name in [
            'trpo',
            'nuqprop',
            'nuqfqprop',
            'actrpo',
            'acqftrpo',
            'qprop',
            'mqprop',
            'qfqprop',
            'nafqprop',
    ]:
        if recurrent:
            extra_kwargs['optimizer'] = \
                ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if algo_name in [
                'actrpo',
                'acqftrpo',
        ]:
            extra_kwargs['ac_delta'] = ac_delta
            extra_kwargs['qprop'] = False  # disable qprop
            if ac_delta == 0:
                qf = None
        if algo_name in [
                'mqprop',
        ]:
            extra_kwargs['mqprop'] = True
        if algo_name in [
                'nuqprop',
                'nuqfqprop',
        ]:
            extra_kwargs['qprop_nu'] = qprop_nu
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy,
                                            qf=qf)
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    max_path_length=max_path_length,
                    n_itr=n_itr,
                    discount=discount,
                    step_size=step_size,
                    gae_lambda=gae_lambda,
                    sample_backups=sample_backups,
                    kl_sample_backups=kl_sample_backups,
                    qf=qf,
                    qf_use_target=qf_use_target,
                    qf_batch_size=qf_batch_size,
                    qf_mc_ratio=qf_mc_ratio,
                    qf_residual_phi=qf_residual_phi,
                    min_pool_size=min_pool_size,
                    scale_reward=scale_reward,
                    qf_updates_ratio=updates_ratio,
                    qprop_eta_option=qprop_eta_option,
                    qprop_unbias=qprop_unbias,
                    replay_pool_size=replay_pool_size,
                    replacement_prob=replacement_prob,
                    qf_baseline=qf_baseline,
                    qf_learning_rate=qf_learning_rate,
                    ac_sample_backups=ac_sample_backups,
                    policy_sample_last=policy_sample_last,
                    save_freq=save_freq,
                    restore_auto=restore_auto,
                    **extra_kwargs)
        algo_class = 'TRPO'
    elif algo_name in [
            'vpg',
            'qvpg',
    ]:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy,
                                            qf=qf)
        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            gae_lambda=gae_lambda,
            optimizer_args=dict(
                tf_optimizer_args=dict(learning_rate=learning_rate, )),
            qf=qf,
            qf_use_target=qf_use_target,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qf_updates_ratio=updates_ratio,
            qprop_eta_option=qprop_eta_option,
            qprop_unbias=qprop_unbias,
            replay_pool_size=replay_pool_size,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            save_freq=save_freq,
            restore_auto=restore_auto,
        )
        algo_class = 'VPG'
    print('[get_algo] Instantiating %s.' % algo_class)
    return algo
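
get_algo takes every hyperparameter explicitly, so a call site has to supply them all (or forward a config dict). A hedged usage sketch, with keyword names taken from the signature above and values that are illustrative only:

# All values below are placeholders, not tuned settings.
config = dict(
    max_path_length=1000, batch_size=5000, replay_pool_size=1000000,
    discount=0.99, scale_reward=1.0, learning_rate=1e-3,
    replacement_prob=1.0, policy_updates_ratio=1.0, step_size=0.01,
    gae_lambda=0.97, sample_backups=0, kl_sample_backups=0,
    qprop_eta_option='ones', qprop_unbias=False, qprop_nu=0.0,
    algo_name='trpo', n_itr=500, recurrent=False, updates_ratio=1.0,
    policy_use_target=True, policy_batch_size=64, policy_sample_last=True,
    ac_delta=0, ac_sample_backups=0, save_freq=0, restore_auto=False,
    qf_learning_rate=1e-3, qf_use_target=True, qf_mc_ratio=0.0,
    qf_batch_size=64, qf_residual_phi=0.0,
)
algo = get_algo(env=env, policy=policy, es=es, qf=qf, baseline=baseline, **config)
algo.train()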