Example #1
    def __init__(
        self,
        env,
        f_model,
        sample_n,
        horizon_n,
        episode_n=1000,
        discount_factor=0.99,
        # sampler arguments
        update_interval=4,
        replay_size=1000,
        batch_size=32,
        # epsilon greedy arguments
        greedy_epsilon=0.3,
        network_optimizer_ctor=lambda: network.LocalOptimizer(
            tf.train.AdamOptimizer(1e-3), grad_clip=10.0)):
        self._env = env
        self._f_model = f_model
        self._episode_n = episode_n
        self._discount_factor = discount_factor
        self._update_interval = update_interval
        self._replay_size = replay_size
        self._batch_size = batch_size
        self._greedy_epsilon = greedy_epsilon
        self._network_optimizer_ctor = network_optimizer_ctor
        self._sample_n, self._horizon_n = sample_n, horizon_n

        super(MPCExperiment, self).__init__()
Example #2
    def __init__(self,
                 env=None,
                 f_create_actor=None,
                 f_create_q=None,
                 dim_noise=2,
                 target_sync_interval=100,
                 target_sync_rate=1.0,
                 alpha_exploration=utils.CappedExp(1e5, 0.2, 0.08),
                 max_gradient=10.0,
                 m_particle_svgd=16,
                 m_particle_v=16,
                 episode_n=1000,
                 discount_factor=0.9,
                 update_interval=4,
                 replay_size=100000,
                 batch_size=32,
                 network_optimizer_ctor=lambda: network.LocalOptimizer(
                     tf.train.AdamOptimizer(1e-4), grad_clip=10.0)):
        if env is None:
            env = gym.make("Pendulum-v0")
            env = envs.AugmentEnvWrapper(env,
                                         reward_decay=discount_factor,
                                         reward_scale=0.1)
        dim_state = env.observation_space.shape[0]
        dim_action = env.action_space.shape[0]
        l2 = 1e-8
        if f_create_actor is None:

            def f(inputs):
                x = tf.concat(inputs, axis=1)
                action = network.Utils.layer_fcs(x, [256, 256],
                                                 dim_action,
                                                 activation_out=tf.nn.tanh,
                                                 l2=l2,
                                                 var_scope="actor")
                return {"action": action}

            f_create_actor = f
        if f_create_q is None:

            def f(inputs):
                x = tf.concat(inputs, axis=1)
                q = network.Utils.layer_fcs(x, [256, 256],
                                            1,
                                            activation_out=None,
                                            l2=l2,
                                            var_scope="q")
                q = tf.squeeze(q, axis=1)
                return {"q": q}

            f_create_q = f

        super(SoftQPendulum,
              self).__init__(env, f_create_actor, f_create_q, dim_noise,
                             target_sync_interval, target_sync_rate,
                             alpha_exploration, max_gradient, m_particle_svgd,
                             m_particle_v, episode_n, discount_factor,
                             update_interval, replay_size, batch_size,
                             network_optimizer_ctor)
Example #3
 def __init__(
     self,
     env,
     f_create_actor,
     f_create_q,
     f_model,
     dim_noise,
     target_sync_interval,
     target_sync_rate,
     greedy_epsilon=0.2,
     sample_n=4,
     horizon_n=4,
     alpha_exploration=1.0,
     max_gradient=10.0,
     m_particle_svgd=16,
     m_particle_v=16,
     episode_n=1000,
     discount_factor=0.99,
     # sampler arguments
     update_interval=4,
     replay_size=100000,
     batch_size=32,
     # epsilon greedy arguments
     network_optimizer_ctor=lambda: network.LocalOptimizer(
         tf.train.AdamOptimizer(1e-4), grad_clip=10.0)):
     self._env = env
     self._f_create_actor = f_create_actor
     self._f_create_q = f_create_q
     self._dim_noise = dim_noise
     self._target_sync_interval = target_sync_interval
     self._target_sync_rate = target_sync_rate
     self._max_gradient = max_gradient
     self._m_particle_svgd = m_particle_svgd
     self._m_particle_v = m_particle_v
     self._episode_n = episode_n
     self._discount_factor = discount_factor
     self._update_interval = update_interval
     self._replay_size = replay_size
     self._batch_size = batch_size
     self._network_optimizer_ctor = network_optimizer_ctor
     self._alpha_exploration = alpha_exploration
     self._greedy_epsilon = greedy_epsilon
     self._sample_n, self._horizon_n = sample_n, horizon_n
     self._f_model = f_model
     super(SoftQMPCExperiment, self).__init__()
Example #4
    def __init__(self,
                 env=None,
                 f_model=None,
                 sample_n=16,
                 horizon_n=4,
                 episode_n=1000,
                 discount_factor=0.99,
                 update_interval=4,
                 replay_size=100000,
                 batch_size=32,
                 greedy_epsilon=utils.CappedExp(1e5, 2.5, 0.05),
                 network_optimizer_ctor=lambda: network.LocalOptimizer(
                     tf.train.AdamOptimizer(1e-4), grad_clip=10.0)):
        if env is None:
            env = gym.make("Pendulum-v0")
            env = envs.AugmentEnvWrapper(env,
                                         reward_decay=0.9,
                                         reward_scale=0.1)
        if f_model is None:

            dim_action = env.action_space.shape[0]
            dim_state = env.observation_space.shape[0]
            l2 = 1e-5

            def f(inputs):
                state, action = inputs[0], inputs[1]
                se = tf.concat([state, action], axis=-1)
                se = network.Utils.layer_fcs(se, [256],
                                             256,
                                             activation_out=None,
                                             l2=l2,
                                             var_scope="se")
                goal = network.Utils.layer_fcs(se, [],
                                               dim_state,
                                               activation_out=None,
                                               l2=l2,
                                               var_scope="goal")
                reward = network.Utils.layer_fcs(se, [],
                                                 1,
                                                 activation_out=None,
                                                 l2=l2,
                                                 var_scope="reward")
                reward = tf.squeeze(reward, axis=1)
                return {"goal": goal, "reward": reward}

            f_model = f

        super(MPCPendulum,
              self).__init__(env, f_model, sample_n, horizon_n, episode_n,
                             discount_factor, update_interval, replay_size,
                             batch_size, greedy_epsilon,
                             network_optimizer_ctor)
Example #5
 def __init__(
         self,
         f_model,
         sample_n,
         horizon_n,
         dim_state,
         dim_action,
         greedy_epsilon,
         # optimizer arguments
         network_optimizer=None,
         max_gradient=10.0,
         # sampler arguments
         update_interval=4,
         replay_size=1000,
         batch_size=32,
         sampler=None,
         **kwargs):
     kwargs.update({
         "sample_n": sample_n,
         "horizon_n": horizon_n,
         "update_interval": update_interval,
         "replay_size": replay_size,
         "batch_size": batch_size,
         "greedy_epsilon": greedy_epsilon
     })
     if network_optimizer is None:
         network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
     if sampler is None:
         sampler = sampling.TransitionSampler(MapPlayback(replay_size),
                                              batch_size, update_interval)
     kwargs.update({"sampler": sampler})
     self._network_optimizer = network_optimizer
     self._dim_state, self._dim_action = dim_state, dim_action
     self._f_model, self._sample_n, self._horizon_n = f_model, sample_n, horizon_n
     self._greedy_epsilon = greedy_epsilon
     super(MPCAgent, self).__init__(**kwargs)
     network_optimizer.add_updater(ModelUpdater(self.network), name="model")
     network_optimizer.compile()
     self._policy = WrapEpsilonGreedy(MPCPolicy(
         NetworkFunction({
             "goal": self.network["goal"],
             "reward": self.network["reward"]
         }),
         sample_n=self._sample_n,
         horizon_n=self._horizon_n,
     ),
                                      epsilon=greedy_epsilon,
                                      num_actions=dim_action,
                                      is_continuous=True)
Example #6
File: dqn.py Project: hobotrl/hobotrl
 def __init__(self,
              f_create_q, state_shape,
              # OneStepTD arguments
              num_actions, discount_factor, ddqn,
              # target network sync arguments
              target_sync_interval,
              target_sync_rate,
              # epsilon greedy arguments
              greedy_epsilon,
              # optimizer arguments
              network_optimizer=None, max_gradient=10.0,
              # sampler arguments
              update_interval=4, replay_size=1000, batch_size=32,
              sampler=None,
              *args, **kwargs):
     """
     :param f_create_q: function, f_create_q([state, action]) => {"q": op_q}
     :param state_shape: shape of state
     :param num_actions: action count
     :param discount_factor:
     :param ddqn: True if using double DQN
     :param target_sync_interval: interval between syncing weights from the learned network to the target network
     :param target_sync_rate: syncing rate. 1.0 for hard sync, 0 < r < 1.0 for soft sync.
     :param greedy_epsilon: epsilon for epsilon greedy policy
     :param network_optimizer: NetworkOptimizer instance, default to LocalOptimizer
     :type network_optimizer: network.NetworkOptimizer
     :param max_gradient: gradient clip value
     :param update_interval: number of Agent.step() calls between network updates
     :param replay_size: replay memory size.
     :param batch_size:
     :param sampler: Sampler, default to TransitionSampler.
             if None, a TransitionSampler is created using update_interval, replay_size, batch_size
     :param args:
     :param kwargs:
     """
     kwargs.update({
         "f_create_q": f_create_q,
         "state_shape": state_shape,
         "num_actions": num_actions,
         "discount_factor": discount_factor,
         "ddqn": ddqn,
         "target_sync_interval": target_sync_interval,
         "target_sync_rate": target_sync_rate,
         "update_interval": update_interval,
         "replay_size": replay_size,
         "batch_size": batch_size,
         "greedy_epsilon": greedy_epsilon,
         "max_gradient": max_gradient
     })
     if network_optimizer is None:
         network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
     if sampler is None:
         sampler = sampling.TransitionSampler(MapPlayback(replay_size), batch_size, update_interval)
     kwargs.update({"sampler": sampler})
     # call super.__init__
     super(DQN, self).__init__(*args, **kwargs)
     self.network_optimizer = network_optimizer
     self._ddqn, self._discount_factor = ddqn, discount_factor
     self.init_updaters_()
     self._target_sync_interval, self._target_sync_rate = target_sync_interval, target_sync_rate
     self._update_count = 0
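
A quick note on the f_create_q contract described in the docstring above: the examples in this listing build such functions with network.Utils.layer_fcs (see Example #2 and Example #4). A minimal sketch is shown below; it assumes the same tf / hobotrl imports as the snippets above, a flat state vector as the only input, and a [None, num_actions] Q output for discrete actions. The helper name make_f_q and the 256-unit layer sizes are illustrative, not taken from hobotrl.

def make_f_q(num_actions, l2=1e-8):
    """Hypothetical helper returning a builder usable as f_create_q."""
    def f(inputs):
        # Assumed convention: inputs == [state]; tf.concat is a no-op for one input.
        x = tf.concat(inputs, axis=1)
        # Two 256-unit hidden layers (illustrative) and one linear Q output per action.
        q = network.Utils.layer_fcs(x, [256, 256],
                                    num_actions,
                                    activation_out=None,
                                    l2=l2,
                                    var_scope="q")
        return {"q": q}
    return f
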
Example #7
    def __init__(self,
                 f_se, f_actor, f_critic,
                 state_shape, dim_action,
                 # ACUpdate arguments
                 discount_factor, target_estimator=None,
                 # optimizer arguments
                 network_optimizer=None, max_gradient=10.0,
                 # policy arguments
                 ou_params=(0.0, 0.2, 0.2),
                 # target network sync arguments
                 target_sync_interval=10,
                 target_sync_rate=0.01,
                 # sampler arguments
                 sampler=None,
                 batch_size=32,
                 update_interval=4,
                 replay_size=1000,
                 noise_type=OUNoise,
                 *args, **kwargs):
        """

        :param f_create_net: function, f_create_net([state, action]) => {"q": op_q, "action": op_action}
        :param state_shape: state shape
        :param dim_action: action dimension
        :param discount_factor:
        :param target_estimator: default to target_estimate.ContinuousActionEstimator
        :type target_estimator: target_estimate.TargetEstimator
        :param network_optimizer: default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient:
        :param ou_params: (mu, theta, sigma) parameters of the OU noise
        :param target_sync_interval:
        :param target_sync_rate:
        :param sampler: default to sampling.TransitionSampler
        :type sampler: sampling.Sampler
        :param batch_size:
        :param args:
        :param kwargs:
        """
        kwargs.update({
            "f_se": f_se,
            "f_actor": f_actor,
            "f_critic": f_critic,
            "state_shape": state_shape,
            "dim_action": dim_action,
            "discount_factor": discount_factor,
            "target_estimator": target_estimator,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
            "ou_params": ou_params,
        })
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TransitionSampler(hrl.playback.MapPlayback(replay_size), batch_size, update_interval)
        kwargs.update({"sampler": sampler})
        super(DPG, self).__init__(*args, **kwargs)
        self.network_optimizer = network_optimizer
        self._q_function = network.NetworkFunction(self.network["q"])
        self._actor_function = network.NetworkFunction(self.network["action"], inputs=[self.network.inputs[0]])
        net_target = self.network.target
        self._target_q_function = network.NetworkFunction(net_target["q"])
        self._target_actor_function = network.NetworkFunction(net_target["action"], inputs=[self.network.inputs[0]])
        self._target_v_function = network.NetworkFunction(net_target["v"], inputs=[self.network.inputs[0]])
        self._discount_factor = discount_factor
        self.init_updaters_()

        self._policy = OUExplorationPolicy(self._actor_function, *ou_params, noise_type=noise_type)
        self._target_sync_interval = target_sync_interval
        self._target_sync_rate = target_sync_rate
        self._update_count = 0
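
The docstring above still documents a single f_create_net, while the constructor actually takes separate f_se, f_actor and f_critic builders. The sketch below shows one hypothetical trio in the style of Example #2; the wiring (which tensors each builder receives) and the layer sizes are assumptions, not confirmed by this listing.

def make_dpg_builders(dim_action, l2=1e-8):
    """Hypothetical f_se / f_actor / f_critic builders (assumed wiring)."""
    def f_se(inputs):
        # Assumed: inputs == [state]; produce a flat state embedding.
        se = network.Utils.layer_fcs(tf.concat(inputs, axis=1), [256], 256,
                                     activation_out=tf.nn.relu,
                                     l2=l2, var_scope="se")
        return {"se": se}

    def f_actor(inputs):
        # Assumed: inputs == [se]; tanh-squashed deterministic action.
        action = network.Utils.layer_fcs(inputs[0], [256], dim_action,
                                         activation_out=tf.nn.tanh,
                                         l2=l2, var_scope="action")
        return {"action": action}

    def f_critic(inputs):
        # Assumed: inputs == [se, action]; scalar Q value.
        q = network.Utils.layer_fcs(tf.concat(inputs, axis=1), [256], 1,
                                    activation_out=None,
                                    l2=l2, var_scope="q")
        q = tf.squeeze(q, axis=1)
        return {"q": q}

    return f_se, f_actor, f_critic
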
Example #8
    def __init__(
            self,
            f_create_net,
            state_shape,
            # ACUpdate arguments
            discount_factor,
            entropy=1e-3,
            target_estimator=None,
            max_advantage=10.0,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # sampler arguments
            sampler=None,
            batch_size=32,
            *args,
            **kwargs):
        """
        :param f_create_net: function: f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
                in which {inputs} is [input_state],
                {dist_pi} is the probability distribution of the policy with shape [None, num_actions],
                {q_values} is Q values with shape [None, num_actions];
                or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
                in which {mean} and {stddev} are the mean and stddev of the normal distribution for continuous actions,
                {v} is the state value.
        :param state_shape:
        :param discount_factor:
        :param entropy: entropy regularizer weight.
        :param target_estimator: optional, default to target_estimate.NStepTD
        :type target_estimator: target_estimate.TargetEstimator
        :param max_advantage: advantage clipping: maximum advantage value in the policy gradient step
        :param network_optimizer: optional, default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: optional, gradient clip value
        :param sampler: optional, default to sampling.TrajectoryOnSampler.
                if None, a TrajectoryOnSampler will be created using batch_size.
        :type sampler: sampling.Sampler
        :param batch_size: optional, batch size used when creating the sampler
        :param args:
        :param kwargs:
        """
        kwargs.update({
            "f_create_net": f_create_net,
            "state_shape": state_shape,
            "discount_factor": discount_factor,
            "entropy": entropy,
            "target_estimator": target_estimator,
            "max_advantage": max_advantage,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
        })
        print "network_optimizer:", network_optimizer
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TrajectoryOnSampler(interval=batch_size)
            kwargs.update({"sampler": sampler})

        super(ActorCritic, self).__init__(*args, **kwargs)
        pi = self.network["pi"]
        # tf.stop_gradient(pi.op)
        if pi is not None:
            # discrete action: pi is categorical probability distribution
            self._pi_function = network.NetworkFunction(self.network["pi"])
            self._input_action = tf.placeholder(dtype=tf.uint8,
                                                shape=[None],
                                                name="input_action")

            self._pi_distribution = distribution.DiscreteDistribution(
                self._pi_function, self._input_action)
            q = self.network["q"]
            if q is not None:
                # network outputs q
                self._q_function = network.NetworkFunction(q)
                self._v_function = GreedyStateValueFunction(self._q_function)
            else:
                # network output v
                self._v_function = network.NetworkFunction(self.network["v"])
        else:
            # continuous action: mean / stddev represents normal distribution
            dim_action = self.network["mean"].op.shape.as_list()[-1]
            self._input_action = tf.placeholder(dtype=tf.float32,
                                                shape=[None, dim_action],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(
                outputs={
                    "mean": self.network["mean"],
                    "stddev": self.network["stddev"]
                },
                inputs=self.network.inputs)
            self._pi_distribution = distribution.NormalDistribution(
                self._pi_function, self._input_action)
            self._v_function = network.NetworkFunction(self.network["v"])
            # continuous action: mean / stddev for normal distribution
        if target_estimator is None:
            target_estimator = target_estimate.NStepTD(self._v_function,
                                                       discount_factor)
            # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
        self.network_optimizer = network_optimizer
        network_optimizer.add_updater(ActorCriticUpdater(
            policy_dist=self._pi_distribution,
            v_function=self._v_function,
            target_estimator=target_estimator,
            entropy=entropy),
                                      name="ac")
        # network_optimizer.add_updater(network.L2(self.network), name="l2")
        network_optimizer.compile()

        self._policy = StochasticPolicy(self._pi_distribution)
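
For the discrete-action branch described in the docstring above, an f_create_net sketch could look like the following. Only the output contract ("pi" with shape [None, num_actions], "q" with shape [None, num_actions]) comes from the docstring; the shared trunk, layer sizes, and helper name are illustrative assumptions.

def make_f_create_net(num_actions, l2=1e-8):
    """Hypothetical discrete-action builder matching the docstring contract."""
    def f(inputs):
        # Per the docstring, inputs == [input_state].
        se = network.Utils.layer_fcs(tf.concat(inputs, axis=1), [256], 256,
                                     activation_out=tf.nn.relu,
                                     l2=l2, var_scope="se")
        # Policy head: softmax over num_actions.
        logits = network.Utils.layer_fcs(se, [256], num_actions,
                                         activation_out=None,
                                         l2=l2, var_scope="pi")
        pi = tf.nn.softmax(logits)
        # Q head: one value per action.
        q = network.Utils.layer_fcs(se, [256], num_actions,
                                    activation_out=None,
                                    l2=l2, var_scope="q")
        return {"pi": pi, "q": q}
    return f
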
Example #9
    def __init__(
            self,
            num_action,
            f_se,
            f_ac,
            f_tran,
            f_decoder,
            f_rollout,
            f_encoder,
            state_shape,
            # ACUpdate arguments
            discount_factor,
            entropy=1e-3,
            target_estimator=None,
            max_advantage=10.0,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # sampler arguments
            sampler=None,
            policy_with_iaa=False,
            compute_with_diff=False,
            with_momentum=True,
            rollout_depth=3,
            rollout_lane=3,
            dynamic_rollout=None,
            dynamic_skip_step=None,
            model_train_depth=3,
            batch_size=32,
            save_image_interval=1000,
            log_dir="./log/img",
            with_ob=False,
            with_goal=True,
            *args,
            **kwargs):
        """
        :param f_create_net: function: f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
                in which {inputs} is [input_state],
                {dist_pi} is the probability distribution of the policy with shape [None, num_actions],
                {q_values} is Q values with shape [None, num_actions];
                or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
                in which {mean} and {stddev} are the mean and stddev of the normal distribution for continuous actions,
                {v} is the state value.
        :param state_shape:
        :param discount_factor:
        :param entropy: entropy regularizer weight.
        :param target_estimator: optional, default to target_estimate.NStepTD
        :type target_estimator: target_estimate.TargetEstimator
        :param max_advantage: advantage clipping: maximum advantage value in the policy gradient step
        :param network_optimizer: optional, default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: optional, gradient clip value
        :param sampler: optional, default to sampling.TrajectoryOnSampler.
                if None, a TrajectoryOnSampler will be created using batch_size.
        :type sampler: sampling.Sampler
        :param batch_size: optional, batch size used when creating the sampler
        :param max_rollout: optional, should be an odd number
        :param args:
        :param kwargs:
        """

        self.processed_state_shape = []

        def f_iaa(inputs):
            input_observation = inputs[0]
            if compute_with_diff:
                logging.warning("use diff 2333")
                diff_ob = []
                for i in range(input_observation.shape[-1] / 3 - 1):
                    diff_ob.append(input_observation[:, :, :, (i + 1) *
                                                     3:(i + 1) * 3 + 3] -
                                   input_observation[:, :, :, i * 3:i * 3 + 3])
                net_se = network.Network([tf.concat(diff_ob[:], axis=3)],
                                         f_se,
                                         var_scope="se_1")
                self.processed_state_shape = copy.copy(state_shape)
                self.processed_state_shape[-1] = state_shape[-1] - 3
            else:
                net_se = network.Network([input_observation],
                                         f_se,
                                         var_scope="se_1")
                self.processed_state_shape = state_shape
            input_action = inputs[1]
            action_dim = inputs[2]
            input_action = tf.one_hot(indices=input_action,
                                      depth=action_dim,
                                      on_value=1.0,
                                      off_value=0.0,
                                      axis=-1)

            se = net_se["se"].op

            input_reward = tf.placeholder(dtype=tf.float32,
                                          shape=[None, 1],
                                          name="input_reward")
            encode_state = tf.placeholder(dtype=tf.float32,
                                          shape=[None,
                                                 se.shape.as_list()[-1]],
                                          name="encode_states")
            input_frame = tf.placeholder(
                dtype=tf.float32,
                shape=[None, state_shape[0], state_shape[1], 3],
                name="input_frame")
            rollout = network.Network([se],
                                      f_rollout,
                                      var_scope="rollout_policy")

            if not with_ob:
                net_model = network.Network([se, input_action],
                                            f_tran,
                                            var_scope="TranModel")
                net_decoder = network.Network([
                    tf.concat(
                        (encode_state, encode_state), axis=-1), input_frame
                ],
                                              f_decoder,
                                              var_scope="Decoder")

            else:
                net_model = network.Network([input_observation, input_action],
                                            f_tran,
                                            var_scope="TranModelOB")
                net_decoder = network.Network([input_frame],
                                              f_decoder,
                                              var_scope="DecoderOB")

            rollout_encoder = network.Network(
                [tf.concat((se, se), axis=-1), input_reward],
                f_encoder,
                var_scope="rollout_encoder")

            current_state = se
            current_ob = input_observation

            for i in range(rollout_lane):
                for j in range(rollout_depth):
                    current_rollout = rollout([current_state],
                                              name_scope="rollout_%d_%d" %
                                              (i, j))

                    # rollout_action_dist = tf.contrib.distributions.Categorical(rollout_action_function.output().op)
                    # current_action = rollout_action_dist.sample()

                    if not with_ob:
                        tran_model = net_model([
                            current_state, current_rollout["rollout_action"].op
                        ],
                                               name_scope="env_model_%d_%d" %
                                               (i, j))
                    else:
                        tran_model = net_model(
                            [current_ob, current_rollout["rollout_action"].op],
                            name_scope="env_model_%d_%d" % (i, j))

                    next_goal = tran_model["next_state"].op
                    reward = tran_model["reward"].op

                    if not with_ob:
                        current_state += next_goal
                    else:
                        current_ob = tf.concat(
                            [current_ob[:, :, :, 3:], next_goal], axis=-1)
                        next_goal = tf.stop_gradient(
                            net_se([current_ob])["se"].op)

                    if j == 0:
                        encode_states = next_goal
                        rollout_reward = reward
                    else:
                        encode_states = tf.concat([next_goal, encode_states],
                                                  axis=-1)
                        rollout_reward = tf.concat([rollout_reward, reward],
                                                   axis=0)

                current_state = se
                current_ob = input_observation

                input_reward = tf.reshape(rollout_reward, [-1, rollout_depth])
                input_reward = tf.split(input_reward, rollout_depth, axis=1)
                encode_state = tf.split(encode_states, rollout_depth, axis=1)

                for m in range(rollout_depth):
                    if m == 0:
                        rollout_encoder = rollout_encoder(
                            [
                                tf.concat([
                                    encode_state[-(m + 1)],
                                    encode_state[-(m + 1)]
                                ],
                                          axis=-1), input_reward[-(m + 1)]
                            ],
                            name_scope="rollout_encoder_%d_%d" % (i, m))
                        re = rollout_encoder["re"].op

                    else:
                        rollout_encoder = rollout_encoder(
                            [
                                tf.concat([re, encode_state[-(m + 1)]],
                                          axis=-1), input_reward[-(m + 1)]
                            ],
                            name_scope="rollout_encoder_%d_%d" % (i, m))
                        re = rollout_encoder["re"].op

                if i == 0:
                    path = re
                else:
                    path = tf.concat([path, re], axis=1)
            if policy_with_iaa:
                feature = tf.concat([path, se], axis=1)
            else:
                feature = se
            ac = network.Network([feature], f_ac, var_scope='ac')
            v = ac["v"].op
            pi_dist = ac["pi"].op

            return {"v": v, "pi": pi_dist, "rollout_action": None}, \
                    {
                        "se": net_se, "transition": net_model,
                        "state_decoder": net_decoder
                    }

        self._log_dir = log_dir
        self._rollout_depth = rollout_depth
        if dynamic_rollout is None:
            self._dynamic_rollout = [1, 3, 5]
            self._dynamic_skip_step = [5000, 15000]
        else:
            self._dynamic_rollout = dynamic_rollout
            self._dynamic_skip_step = dynamic_skip_step
        kwargs.update({
            "f_iaa": f_iaa,
            "state_shape": state_shape,
            "num_action": num_action,
            "discount_factor": discount_factor,
            "entropy": entropy,
            "target_estimator": target_estimator,
            "max_advantage": max_advantage,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
        })
        logging.warning(network_optimizer)
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TrajectoryOnSampler(interval=batch_size)
            kwargs.update({"sampler": sampler})

        super(ActorCriticWithI2A, self).__init__(*args, **kwargs)
        pi = self.network["pi"]
        if pi is not None:
            # discrete action: pi is categorical probability distribution
            self._pi_function = network.NetworkFunction(self.network["pi"])
            self._input_action = tf.placeholder(dtype=tf.uint8,
                                                shape=[None],
                                                name="input_action")

            self._pi_distribution = distribution.DiscreteDistribution(
                self._pi_function, self._input_action)
            q = self.network["q"]
            if q is not None:
                # network outputs q
                self._q_function = network.NetworkFunction(q)
                self._v_function = GreedyStateValueFunction(self._q_function)
            else:
                # network output v
                self._v_function = network.NetworkFunction(self.network["v"])
        else:
            # continuous action: mean / stddev represents normal distribution
            dim_action = self.network["mean"].op.shape.as_list()[-1]
            self._input_action = tf.placeholder(dtype=tf.float32,
                                                shape=[None, dim_action],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(
                outputs={
                    "mean": self.network["mean"],
                    "stddev": self.network["stddev"]
                },
                inputs=self.network.inputs)
            self._pi_distribution = distribution.NormalDistribution(
                self._pi_function, self._input_action)
            self._v_function = network.NetworkFunction(self.network["v"])
            # continuous action: mean / stddev for normal distribution

        # self._rollout_action = network.NetworkFunction(self.network["rollout_action"])
        # self._rollout_dist = distribution.DiscreteDistribution(self._rollout_action, self._input_action)

        if target_estimator is None:
            target_estimator = target_estimate.NStepTD(self._v_function,
                                                       discount_factor)
            # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
        self.network_optimizer = network_optimizer
        network_optimizer.add_updater(ActorCriticUpdater(
            policy_dist=self._pi_distribution,
            v_function=self._v_function,
            target_estimator=target_estimator,
            entropy=entropy),
                                      name="ac")
        network_optimizer.add_updater(network.L2(self.network), name="l2")
        # network_optimizer.add_updater(
        #     PolicyNetUpdater(rollout_dist=self._rollout_dist,
        #                      rollout_action_function=self._rollout_action,
        #                      pi_function=self._pi_function),
        #     name="policy_net"
        # )
        network_optimizer.add_updater(EnvModelUpdater(
            net_se=self.network.sub_net("se"),
            net_transition=self.network.sub_net("transition"),
            net_decoder=self.network.sub_net("state_decoder"),
            curriculum=self._dynamic_rollout,
            skip_step=self._dynamic_skip_step,
            state_shape=state_shape,
            dim_action=num_action,
            transition_weight=1.0,
            with_momentum=with_momentum,
            compute_with_diff=compute_with_diff,
            save_image_interval=save_image_interval,
            with_ob=with_ob,
            with_goal=with_goal),
                                      name="env_model")
        # network_optimizer.freeze(self.network.sub_net("transition").variables)
        network_optimizer.compile()

        self._policy = StochasticPolicy(self._pi_distribution)
Example #10
    def __init__(
            self,
            f_se,
            f_actor,
            f_critic,
            f_noise,
            state_shape,
            dim_action,
            dim_noise,
            # ACUpdate arguments
            discount_factor,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # policy arguments
            ou_params=(0.0, 0.2, 0.2),
            noise_stddev=0.5,
            noise_weight=1.0,
            noise_mean_weight=1e-2,
            noise_stddev_weight=1e-4,
            # target network sync arguments
            target_sync_interval=10,
            target_sync_rate=0.01,
            # sampler arguments
            replay_size=1000,
            batch_size=32,
            disentangle_with_dpg=True,
            *args,
            **kwargs):
        """

        :param f_create_net: function, f_create_net([state, action]) => {"q": op_q, "action": op_action}
        :param state_shape: state shape
        :param dim_action: action dimension
        :param discount_factor:
        :param target_estimator: default to target_estimate.ContinuousActionEstimator
        :type target_estimator: target_estimate.TargetEstimator
        :param network_optimizer: default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient:
        :param ou_params: (mu, theta, sigma) parameters of the OU noise
        :param target_sync_interval:
        :param target_sync_rate:
        :param sampler: default to sampling.TransitionSampler
        :type sampler: sampling.Sampler
        :param batch_size:
        :param args:
        :param kwargs:
        """
        kwargs.update({
            "f_se": f_se,
            "f_actor": f_actor,
            "f_critic": f_critic,
            "f_noise": f_noise,
            "state_shape": state_shape,
            "dim_action": dim_action,
            "dim_noise": dim_noise,
            "discount_factor": discount_factor,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
            "replay_size": replay_size,
            "ou_params": ou_params,
            "noise_stddev": noise_stddev,
            "noise_weight": noise_weight
        })
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        super(NoisyDPG, self).__init__(*args, **kwargs)

        self._disentangle_with_dpg = disentangle_with_dpg

        def make_sample(state, action, reward, next_state, episode_done, noise,
                        **kwargs):
            sample = sampling.default_make_sample(state, action, reward,
                                                  next_state, episode_done)
            sample.update({"noise": noise})
            return sample

        self._sampler = sampling.TransitionSampler(
            hrl.playback.MapPlayback(replay_size),
            batch_size,
            4,
            sample_maker=make_sample)

        self._q_function = network.NetworkFunction(
            self.network["q"], inputs=[self._input_state, self._input_action])
        self._actor_function = network.NetworkFunction(
            self.network["action"],
            inputs=[self._input_state, self._input_noise])
        self._actor_mean_function = network.NetworkFunction(
            self.network["action_mean"], inputs=[self._input_state])
        self._noise_function = network.NetworkFunction(
            self.network["action_noise"],
            inputs=[self._input_state, self._input_noise])
        self._target_q_function = network.NetworkFunction(
            self.network.target["q"],
            inputs=[self._input_state, self._input_action])
        self._target_actor_function = network.NetworkFunction(
            self.network.target["action"],
            inputs=[self._input_state, self._input_noise])
        target_estimator = NoisyContinuousActionEstimator(
            self._target_actor_function, self._target_q_function,
            discount_factor)
        self.network_optimizer = network_optimizer
        if disentangle_with_dpg:
            network_optimizer.add_updater(DisentangleNoisyDPGUpdater(
                actor=self._actor_function,
                critic=self._q_function,
                f_noise=self._noise_function,
                target_estimator=target_estimator,
                discount_factor=discount_factor,
                actor_weight=0.02,
                actor_mean=self._actor_mean_function,
                zero_mean_weight=noise_mean_weight,
                stddev_weight=noise_stddev_weight),
                                          name="ac")
        else:
            network_optimizer.add_updater(NoisyDPGUpdater(
                actor=self._actor_function,
                critic=self._q_function,
                target_estimator=target_estimator,
                discount_factor=discount_factor,
                actor_weight=0.02,
                actor_mean=self._actor_mean_function),
                                          name="ac")
            network_optimizer.add_updater(DisentangleUpdater(
                self.network.sub_net("se"),
                self.network.sub_net("noise"),
                stddev=noise_stddev),
                                          weight=noise_weight,
                                          name="disentangle")
        network_optimizer.add_updater(network.L2(self.network), name="l2")
        network_optimizer.compile()

        self._act_all_function = network.NetworkFunction(
            {
                "action": self.network["action"],
                "mean": self.network["action_mean"],
                "noise": self.network["action_noise"]
            },
            inputs=[self._input_state, self._input_noise])

        self._noise_source = OUNoise([dim_noise], *ou_params)
        self._last_input_noise = None
        # self._policy = OUExplorationPolicy(self._actor_function, *ou_params)
        self._target_sync_interval = target_sync_interval
        self._target_sync_rate = target_sync_rate
        self._update_count = 0
Example #11
File: icm.py Project: hobotrl/hobotrl
    def __init__(
            self,
            env,
            f_se,
            f_ac,
            f_forward,
            f_inverse,
            state_shape,
            # ACUpdate arguments
            discount_factor,
            entropy=1e-3,
            target_estimator=None,
            max_advantage=10.0,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # sampler arguments
            sampler=None,
            batch_size=32,
            *args,
            **kwargs):
        """
        :param f_create_net: function: f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
                in which {inputs} is [input_state],
                {dist_pi} is the probability distribution of the policy with shape [None, num_actions],
                {q_values} is Q values with shape [None, num_actions];
                or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
                in which {mean} and {stddev} are the mean and stddev of the normal distribution for continuous actions,
                {v} is the state value.
        :param state_shape:
        :param discount_factor:
        :param entropy: entropy regularizer weight.
        :param target_estimator: optional, default to target_estimate.NStepTD
        :type target_estimator: target_estimate.TargetEstimator
        :param max_advantage: advantage clipping: maximum advantage value in the policy gradient step
        :param network_optimizer: optional, default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: optional, gradient clip value
        :param sampler: optional, default to sampling.TrajectoryOnSampler.
                if None, a TrajectoryOnSampler will be created using batch_size.
        :type sampler: sampling.Sampler
        :param batch_size: optional, batch size used when creating the sampler
        :param args:
        :param kwargs:
        """
        def f_icm(inputs):
            """
            :param inputs: a list, [state, next_state, action]
            :return: a dict of op
            """
            f_se1 = network.Network([inputs[0]], f_se, var_scope='learn_se1')
            f_se1 = network.NetworkFunction(f_se1["se"]).output().op
            f_se2 = network.Network([inputs[1]], f_se, var_scope='learn_se2')
            f_se2 = network.NetworkFunction(f_se2["se"]).output().op

            f_ac_out = network.Network([f_se1], f_ac, var_scope='learn_ac')
            v = network.NetworkFunction(f_ac_out["v"]).output().op
            pi_dist = network.NetworkFunction(f_ac_out["pi"]).output().op

            one_hot_action = tf.one_hot(indices=inputs[2],
                                        depth=env.action_space.n,
                                        on_value=1.0,
                                        off_value=0.0,
                                        axis=-1)
            f_forward_out = network.Network([one_hot_action, f_se1],
                                            f_forward,
                                            var_scope='learn_forward')
            phi2_hat = network.NetworkFunction(
                f_forward_out["phi2_hat"]).output().op

            f_inverse_out = network.Network([f_se1, f_se2],
                                            f_inverse,
                                            var_scope='learn_inverse')
            logits = network.NetworkFunction(
                f_inverse_out["logits"]).output().op

            bonus = 0.05 * tf.reduce_sum(tf.square(f_se2 - phi2_hat), axis=1)

            return {
                "pi": pi_dist,
                "v": v,
                "logits": logits,
                "phi1": f_se1,
                "phi2": f_se2,
                "phi2_hat": phi2_hat,
                "bonus": bonus
            }

        kwargs.update({
            "f_icm": f_icm,
            "state_shape": state_shape,
            "discount_factor": discount_factor,
            "entropy": entropy,
            "target_estimator": target_estimator,
            "max_advantage": max_advantage,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
        })
        print "network_optimizer:", network_optimizer
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TrajectoryOnSampler(interval=batch_size)
            kwargs.update({"sampler": sampler})

        super(ActorCriticWithICM, self).__init__(*args, **kwargs)

        pi = self.network["pi"]

        if pi is not None:
            # discrete action: pi is categorical probability distribution
            self._pi_function = network.NetworkFunction(self.network["pi"])
            # placeholder for sampled actions, required by the DiscreteDistribution below
            self._input_action = tf.placeholder(dtype=tf.uint8,
                                                shape=[None],
                                                name="input_action")

            self._pi_distribution = distribution.DiscreteDistribution(
                self._pi_function, self._input_action)
            q = self.network["q"]
            if q is not None:
                # network outputs q
                self._q_function = network.NetworkFunction(q)
                self._v_function = GreedyStateValueFunction(self._q_function)
            else:
                # network output v
                self._v_function = network.NetworkFunction(self.network["v"])
        else:
            # continuous action: mean / stddev represents normal distribution
            dim_action = self.network["mean"].op.shape.as_list()[-1]
            self._input_action = tf.placeholder(dtype=tf.float32,
                                                shape=[None, dim_action],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(
                outputs={
                    "mean": self.network["mean"],
                    "stddev": self.network["stddev"]
                },
                inputs=self.network.inputs)
            self._pi_distribution = distribution.NormalDistribution(
                self._pi_function, self._input_action)
            self._v_function = network.NetworkFunction(self.network["v"])
            # continuous action: mean / stddev for normal distribution
        self._phi2_hat_function = network.NetworkFunction(
            self.network["phi2_hat"])
        self._phi2_function = network.NetworkFunction(self.network["phi2"])
        self._phi1_function = network.NetworkFunction(self.network["phi1"])
        self._logits = network.NetworkFunction(self.network["logits"])
        self._bonus = network.NetworkFunction(self.network["bonus"])

        if target_estimator is None:
            target_estimator = target_estimate.NStepTD(self._v_function,
                                                       discount_factor,
                                                       bonus=self._bonus)
            # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
        self.network_optimizer = network_optimizer
        network_optimizer.add_updater(ActorCriticUpdater(
            policy_dist=self._pi_distribution,
            v_function=self._v_function,
            target_estimator=target_estimator,
            entropy=entropy),
                                      name="ac")
        network_optimizer.add_updater(network.L2(self.network), name="l2")
        network_optimizer.add_updater(ForwardUpdater(
            forward_function=self._phi2_hat_function,
            feature_function=self._phi2_function,
            policy_dist=self._pi_distribution),
                                      name="forward")
        network_optimizer.add_updater(InverseUpdater(
            inverse_function=self._logits, policy_dist=self._pi_distribution),
                                      name="inverse")
        network_optimizer.compile()

        self._policy = StochasticPolicy(self._pi_distribution)
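
The f_icm closure above fixes the contracts of f_forward and f_inverse: f_forward receives [one_hot_action, phi1] and must return {"phi2_hat": ...} with the same width as the state embedding, while f_inverse receives [phi1, phi2] and must return {"logits": ...} over env.action_space.n actions. A hedged sketch of such builders follows; the embedding width dim_se, the helper name, and the layer sizes are assumptions.

def make_icm_builders(dim_se, num_actions, l2=1e-8):
    """Hypothetical f_forward / f_inverse builders matching f_icm's usage above."""
    def f_forward(inputs):
        # inputs == [one_hot_action, phi1]; predict the next state embedding.
        x = tf.concat(inputs, axis=1)
        phi2_hat = network.Utils.layer_fcs(x, [256], dim_se,
                                           activation_out=None,
                                           l2=l2, var_scope="forward")
        return {"phi2_hat": phi2_hat}

    def f_inverse(inputs):
        # inputs == [phi1, phi2]; predict which action produced the transition.
        x = tf.concat(inputs, axis=1)
        logits = network.Utils.layer_fcs(x, [256], num_actions,
                                         activation_out=None,
                                         l2=l2, var_scope="inverse")
        return {"logits": logits}

    return f_forward, f_inverse
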
Example #12
File: model.py Project: hobotrl/hobotrl
    def __init__(
            self,
            f_se,
            f_transition,
            f_decoder,
            # optimality tightening parameters
            state_shape,
            num_actions,
            # env model parameters
            rollout_depth,
            network_optimizer=None,
            max_gradient=10.0,
            update_interval=4,
            replay_size=1000,
            batch_size=32,
            sampler=None,
            with_momentum=True,
            curriculum=[1, 3, 5],
            skip_step=[10000, 20000],
            save_image_interval=10000,
            log_dir=None,
            with_ob=False,
            with_goal=True,
            *args,
            **kwargs):
        kwargs.update({
            "f_se": f_se,
            "f_transition": f_transition,
            "f_decoder": f_decoder,
            "state_shape": state_shape,
            "num_actions": num_actions,
            "rollout_depth": rollout_depth,
            "log_dir": log_dir
        })
        self._state_shape, self._num_actions = state_shape, num_actions
        self._rollout_depth = rollout_depth
        self._with_momentum = with_momentum
        self._curriculum, self._skip_step = curriculum, skip_step
        self._save_image_interval = save_image_interval
        self._with_ob = with_ob
        self._with_goal = with_goal

        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        self.network_optimizer = network_optimizer

        if sampler is None:
            max_traj_length = 200
            sampler = sampling.TruncateTrajectorySampler2(
                None,
                replay_size / max_traj_length,
                max_traj_length,
                batch_size=1,
                trajectory_length=batch_size,
                interval=update_interval)
        self._sampler = sampler
        # BaseDeepAgent.__init__(self, *args, **kwargs)
        # kwargs.pop("global_step")
        kwargs.update({"sampler": sampler})
        # sampling.TruncateTrajectorySampler2.__init__(self, *args, **kwargs)
        super(Model, self).__init__(*args, **kwargs)
        self.init_updaters_()

        self._log_dir = log_dir
        self._update_count = 0