Example #1
    def __init__(self, actor, critic, target_estimator, discount_factor, actor_weight):
        """

        :param actor:
        :type actor network.NetworkFunction
        :param critic:
        :type critic network.NetworkFunction
        :param target_estimator:
        :type target_estimator: target_estimate.TargetEstimator
        """
        super(DPGUpdater, self).__init__()
        self._actor, self._critic, self._target_estimator = \
            actor, critic, target_estimator
        self._dim_action = actor.output().op.shape.as_list()[-1]
        op_q = critic.output().op
        with tf.name_scope("DPGUpdater"):
            with tf.name_scope("input"):
                self._input_target_q = tf.placeholder(dtype=tf.float32, shape=[None], name="input_target_q")
                self._input_action_gradient = tf.placeholder(dtype=tf.float32,
                                                             shape=[None, self._dim_action],
                                                             name="input_action_gradient")
            with tf.name_scope("critic"):
                self._critic_loss = tf.reduce_mean(network.Utils.clipped_square(
                    self._input_target_q - op_q
                ))
            with tf.name_scope("actor"):
                # critic.inputs[1] is input_action
                self._action_gradient = tf.gradients(critic.output().op, critic.inputs[1])[0]
                self._gradient_func = network.NetworkFunction(
                    outputs=network.NetworkSymbol(self._action_gradient, "gradient", critic.network),
                    inputs=critic.inputs
                )
                self._actor_loss = tf.reduce_sum(actor.output().op * self._input_action_gradient, axis=1)
                self._actor_loss = -tf.reduce_mean(self._actor_loss)

            self._op_loss = self._actor_loss * actor_weight + self._critic_loss
        self._update_operation = network.MinimizeLoss(self._op_loss,
                                                      var_list=self._actor.variables +
                                                               self._critic.variables)
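The actor update above is the deterministic policy gradient trick: the critic's action gradient dQ/da is obtained with tf.gradients, fed back through a placeholder, and dotted with the actor output so that minimizing the loss pushes the actor parameters along dQ/da. A minimal self-contained sketch of the same idea, assuming TensorFlow 1.x and using tf.stop_gradient in place of the external placeholder (layer sizes and the optimizer are illustrative only):

import tensorflow as tf

state = tf.placeholder(tf.float32, [None, 4], name="state")
with tf.variable_scope("actor"):
    action = tf.layers.dense(state, 2, activation=tf.nn.tanh)    # mu(s)
with tf.variable_scope("critic"):
    q = tf.layers.dense(tf.concat([state, action], axis=1), 1)   # Q(s, mu(s))

# dQ/da, the same quantity tf.gradients(critic.output().op, critic.inputs[1]) yields above
action_gradient = tf.gradients(q, action)[0]

# actor loss: -E[ sum_k (dQ/da_k) * mu_k(s) ]; stop_gradient keeps the critic term constant
actor_loss = -tf.reduce_mean(
    tf.reduce_sum(tf.stop_gradient(action_gradient) * action, axis=1))
actor_vars = [v for v in tf.trainable_variables() if v.name.startswith("actor")]
train_actor = tf.train.AdamOptimizer(1e-3).minimize(actor_loss, var_list=actor_vars)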
Example #2
File: icm.py  Project: hobotrl/hobotrl
        def f_icm(inputs):
            """
            :param inputs: a list, [state, next_state, action]
            :return: a dict of op
            """
            f_se1 = network.Network([inputs[0]], f_se, var_scope='learn_se1')
            f_se1 = network.NetworkFunction(f_se1["se"]).output().op
            f_se2 = network.Network([inputs[1]], f_se, var_scope='learn_se2')
            f_se2 = network.NetworkFunction(f_se2["se"]).output().op

            f_ac_out = network.Network([f_se1], f_ac, var_scope='learn_ac')
            v = network.NetworkFunction(f_ac_out["v"]).output().op
            pi_dist = network.NetworkFunction(f_ac_out["pi"]).output().op

            one_hot_action = tf.one_hot(indices=inputs[2],
                                        depth=env.action_space.n,
                                        on_value=1.0,
                                        off_value=0.0,
                                        axis=-1)
            f_forward_out = network.Network([one_hot_action, f_se1],
                                            f_forward,
                                            var_scope='learn_forward')
            phi2_hat = network.NetworkFunction(
                f_forward_out["phi2_hat"]).output().op

            f_inverse_out = network.Network([f_se1, f_se2],
                                            f_inverse,
                                            var_scope='learn_inverse')
            logits = network.NetworkFunction(
                f_inverse_out["logits"]).output().op

            bonus = 0.05 * tf.reduce_sum(tf.square(f_se2 - phi2_hat), axis=1)

            return {
                "pi": pi_dist,
                "v": v,
                "logits": logits,
                "phi1": f_se1,
                "phi2": f_se2,
                "phi2_hat": phi2_hat,
                "bonus": bonus
            }
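The "bonus" output above is the ICM intrinsic reward: the forward model's prediction error in feature space, scaled by 0.05. A minimal NumPy sketch of just that formula (the batch size and feature dimension below are arbitrary placeholders):

import numpy as np

def curiosity_bonus(phi2, phi2_hat, scale=0.05):
    # bonus_i = scale * || phi2_i - phi2_hat_i ||^2, matching the tf.reduce_sum above
    return scale * np.sum(np.square(phi2 - phi2_hat), axis=1)

phi2 = np.random.randn(8, 64).astype(np.float32)       # encoded next states
phi2_hat = np.random.randn(8, 64).astype(np.float32)   # forward-model predictions
print(curiosity_bonus(phi2, phi2_hat).shape)            # (8,)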
Example #3
File: dqn.py  Project: hobotrl/hobotrl
    def init_value_function(self, **kwargs):
        self.learn_q = network.NetworkFunction(self.network["q"])
        self.target_q = network.NetworkFunction(self.network.target["q"])
        self.target_v = GreedyStateValueFunction(self.target_q)
        return self.learn_q
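GreedyStateValueFunction derives a state value from the target Q function by taking the greedy maximum over actions, V(s) = max_a Q(s, a). An illustrative stand-in (not hobotrl's actual class) that captures that relationship:

import numpy as np

class GreedyStateValue(object):
    def __init__(self, q_function):
        self._q = q_function                 # callable: states -> [batch, num_actions]

    def __call__(self, states):
        return np.max(self._q(states), axis=1)

toy_q = lambda states: np.array([[1.0, 3.0], [0.5, 0.2]])   # toy Q(s, a) table
print(GreedyStateValue(toy_q)(None))                         # [3.0, 0.5]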
Example #4
    def __init__(self,
                 f_se, f_actor, f_critic,
                 state_shape, dim_action,
                 # ACUpdate arguments
                 discount_factor, target_estimator=None,
                 # optimizer arguments
                 network_optimizer=None, max_gradient=10.0,
                 # policy arguments
                 ou_params=(0.0, 0.2, 0.2),
                 # target network sync arguments
                 target_sync_interval=10,
                 target_sync_rate=0.01,
                 # sampler arguments
                 sampler=None,
                 batch_size=32,
                 update_interval=4,
                 replay_size=1000,
                 noise_type=OUNoise,
                 *args, **kwargs):
        """

        :param f_se: state encoder network creation function
        :param f_actor: actor network creation function
        :param f_critic: critic (Q) network creation function
        :param state_shape: state shape
        :param dim_action: action dimension
        :param discount_factor: reward discount factor
        :param target_estimator: default to target_estimate.ContinuousActionEstimator
        :type target_estimator: target_estimate.TargetEstimator
        :param network_optimizer: default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: gradient clip value
        :param ou_params: (mu, theta, sigma) of the OU exploration noise
        :param target_sync_interval: number of updates between target network syncs
        :param target_sync_rate: soft sync rate for the target network
        :param sampler: default to sampling.TransitionSampler
        :type sampler: sampling.Sampler
        :param batch_size: batch size for the sampler
        :param update_interval: environment steps between learning updates
        :param replay_size: capacity of the replay buffer
        :param noise_type: exploration noise class, default OUNoise
        :param args:
        :param kwargs:
        """
        kwargs.update({
            "f_se": f_se,
            "f_actor": f_actor,
            "f_critic": f_critic,
            "state_shape": state_shape,
            "dim_action": dim_action,
            "discount_factor": discount_factor,
            "target_estimator": target_estimator,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
            "ou_params": ou_params,
        })
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TransitionSampler(hrl.playback.MapPlayback(replay_size), batch_size, update_interval)
        kwargs.update({"sampler": sampler})
        super(DPG, self).__init__(*args, **kwargs)
        self.network_optimizer = network_optimizer
        self._q_function = network.NetworkFunction(self.network["q"])
        self._actor_function = network.NetworkFunction(self.network["action"], inputs=[self.network.inputs[0]])
        net_target = self.network.target
        self._target_q_function = network.NetworkFunction(net_target["q"])
        self._target_actor_function = network.NetworkFunction(net_target["action"], inputs=[self.network.inputs[0]])
        self._target_v_function = network.NetworkFunction(net_target["v"], inputs=[self.network.inputs[0]])
        self._discount_factor = discount_factor
        self.init_updaters_()

        self._policy = OUExplorationPolicy(self._actor_function, *ou_params, noise_type=noise_type)
        self._target_sync_interval = target_sync_interval
        self._target_sync_rate = target_sync_rate
        self._update_count = 0
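Exploration here comes from Ornstein-Uhlenbeck noise parameterized by ou_params = (mu, theta, sigma). A minimal sketch of that process (hobotrl's OUNoise class may differ in detail, e.g. in how the time step is handled):

import numpy as np

class SimpleOUNoise(object):
    def __init__(self, shape, mu=0.0, theta=0.2, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.x = np.ones(shape) * mu

    def tick(self):
        # dx = theta * (mu - x) + sigma * N(0, I): mean-reverting, temporally correlated noise
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(*self.x.shape)
        self.x = self.x + dx
        return self.x

noise = SimpleOUNoise([2])               # matches ou_params=(0.0, 0.2, 0.2) above
samples = [noise.tick() for _ in range(3)]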
Example #5
    def __init__(
            self,
            f_create_net,
            state_shape,
            # ACUpdate arguments
            discount_factor,
            entropy=1e-3,
            target_estimator=None,
            max_advantage=10.0,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # sampler arguments
            sampler=None,
            batch_size=32,
            *args,
            **kwargs):
        """
        :param f_create_net: function: f_create_net(inputs) => {"pi": dist_pi, "q": q_values},
                in which {inputs} is [input_state],
                {dist_pi} is probability distribution of policy with shape [None, num_actions],
                {q_values} is Q values with shape [None, num_actions];
                or f_create_net(inputs) => {"mean": mean, "stddev": stddev, "v": v},
                in which {mean} and {stddev} are the mean and stddev of a normal distribution for continuous actions,
                {v} is state value.
        :param state_shape:
        :param discount_factor:
        :param entropy: entropy regularization weight.
        :param target_estimator: optional, default to target_estimate.NStepTD
        :type target_estimator: target_estimate.TargetEstimator
        :param max_advantage: maximum advantage value allowed in the policy gradient step
        :param network_optimizer: optional, default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: optional, max_gradient clip value
        :param sampler: optional, default to sampling.TrajectoryOnSampler.
                if None, a TrajectoryOnSampler will be created using batch_size.
        :type sampler: sampling.Sampler
        :param batch_size: optional, batch_size when creating sampler
        :param args:
        :param kwargs:
        """
        kwargs.update({
            "f_create_net": f_create_net,
            "state_shape": state_shape,
            "discount_factor": discount_factor,
            "entropy": entropy,
            "target_estimator": target_estimator,
            "max_advantage": max_advantage,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
        })
        print "network_optimizer:", network_optimizer
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TrajectoryOnSampler(interval=batch_size)
            kwargs.update({"sampler": sampler})

        super(ActorCritic, self).__init__(*args, **kwargs)
        pi = self.network["pi"]
        # tf.stop_gradient(pi.op)
        if pi is not None:
            # discrete action: pi is categorical probability distribution
            self._pi_function = network.NetworkFunction(self.network["pi"])
            self._input_action = tf.placeholder(dtype=tf.uint8,
                                                shape=[None],
                                                name="input_action")

            self._pi_distribution = distribution.DiscreteDistribution(
                self._pi_function, self._input_action)
            q = self.network["q"]
            if q is not None:
                # network outputs q
                self._q_function = network.NetworkFunction(q)
                self._v_function = GreedyStateValueFunction(self._q_function)
            else:
                # network output v
                self._v_function = network.NetworkFunction(self.network["v"])
        else:
            # continuous action: mean / stddev represents normal distribution
            dim_action = self.network["mean"].op.shape.as_list()[-1]
            self._input_action = tf.placeholder(dtype=tf.float32,
                                                shape=[None, dim_action],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(
                outputs={
                    "mean": self.network["mean"],
                    "stddev": self.network["stddev"]
                },
                inputs=self.network.inputs)
            self._pi_distribution = distribution.NormalDistribution(
                self._pi_function, self._input_action)
            self._v_function = network.NetworkFunction(self.network["v"])
            # continuous action: mean / stddev for normal distribution
        if target_estimator is None:
            target_estimator = target_estimate.NStepTD(self._v_function,
                                                       discount_factor)
            # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
        self.network_optimizer = network_optimizer
        network_optimizer.add_updater(ActorCriticUpdater(
            policy_dist=self._pi_distribution,
            v_function=self._v_function,
            target_estimator=target_estimator,
            entropy=entropy),
                                      name="ac")
        # network_optimizer.add_updater(network.L2(self.network), name="l2")
        network_optimizer.compile()

        self._policy = StochasticPolicy(self._pi_distribution)
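When no target_estimator is given, the critic target defaults to target_estimate.NStepTD: each state's target is the discounted sum of the remaining rewards in the sampled trajectory plus a bootstrapped value for the last state. A minimal sketch of that computation (episode-termination handling in hobotrl may differ):

import numpy as np

def n_step_td_targets(rewards, bootstrap_value, discount_factor):
    # G_t = r_t + gamma * r_{t+1} + ... + gamma^{T-t} * V(s_T), computed backwards
    targets = np.zeros(len(rewards), dtype=np.float32)
    ret = bootstrap_value
    for t in reversed(range(len(rewards))):
        ret = rewards[t] + discount_factor * ret
        targets[t] = ret
    return targets

print(n_step_td_targets([1.0, 0.0, 1.0], bootstrap_value=0.5, discount_factor=0.9))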
Example #6
    def __init__(
            self,
            num_action,
            f_se,
            f_ac,
            f_tran,
            f_decoder,
            f_rollout,
            f_encoder,
            state_shape,
            # ACUpdate arguments
            discount_factor,
            entropy=1e-3,
            target_estimator=None,
            max_advantage=10.0,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # sampler arguments
            sampler=None,
            policy_with_iaa=False,
            compute_with_diff=False,
            with_momentum=True,
            rollout_depth=3,
            rollout_lane=3,
            dynamic_rollout=None,
            dynamic_skip_step=None,
            model_train_depth=3,
            batch_size=32,
            save_image_interval=1000,
            log_dir="./log/img",
            with_ob=False,
            with_goal=True,
            *args,
            **kwargs):
        """
        :param num_action: number of discrete actions
        :param f_se: state encoder network creation function
        :param f_ac: actor-critic head creation function, producing {"pi": dist_pi, "v": v}
        :param f_tran: environment transition model creation function
        :param f_decoder: state decoder creation function
        :param f_rollout: rollout policy creation function
        :param f_encoder: rollout encoder creation function
        :param state_shape:
        :param discount_factor:
        :param entropy: entropy regularization weight.
        :param target_estimator: optional, default to target_estimate.NStepTD
        :type target_estimator: target_estimate.TargetEstimator
        :param max_advantage: maximum advantage value allowed in the policy gradient step
        :param network_optimizer: optional, default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: optional, max_gradient clip value
        :param sampler: optional, default to sampling.TrajectoryOnSampler.
                if None, a TrajectoryOnSampler will be created using batch_size.
        :type sampler: sampling.Sampler
        :param batch_size: optional, batch_size when creating sampler
        :param rollout_depth: depth of each imagined rollout
        :param rollout_lane: number of parallel imagined rollouts
        :param dynamic_rollout: curriculum of rollout depths, default [1, 3, 5]
        :param dynamic_skip_step: step thresholds at which the rollout curriculum advances
        :param args:
        :param kwargs:
        """

        self.processed_state_shape = []

        def f_iaa(inputs):
            input_observation = inputs[0]
            if compute_with_diff:
                logging.warning("use diff 2333")
                diff_ob = []
                for i in range(input_observation.shape.as_list()[-1] // 3 - 1):
                    diff_ob.append(input_observation[:, :, :, (i + 1) *
                                                     3:(i + 1) * 3 + 3] -
                                   input_observation[:, :, :, i * 3:i * 3 + 3])
                net_se = network.Network([tf.concat(diff_ob[:], axis=3)],
                                         f_se,
                                         var_scope="se_1")
                self.processed_state_shape = copy.copy(state_shape)
                self.processed_state_shape[-1] = state_shape[-1] - 3
            else:
                net_se = network.Network([input_observation],
                                         f_se,
                                         var_scope="se_1")
                self.processed_state_shape = state_shape
            input_action = inputs[1]
            action_dim = inputs[2]
            input_action = tf.one_hot(indices=input_action,
                                      depth=action_dim,
                                      on_value=1.0,
                                      off_value=0.0,
                                      axis=-1)

            se = net_se["se"].op

            input_reward = tf.placeholder(dtype=tf.float32,
                                          shape=[None, 1],
                                          name="input_reward")
            encode_state = tf.placeholder(dtype=tf.float32,
                                          shape=[None,
                                                 se.shape.as_list()[-1]],
                                          name="encode_states")
            input_frame = tf.placeholder(
                dtype=tf.float32,
                shape=[None, state_shape[0], state_shape[1], 3],
                name="input_frame")
            rollout = network.Network([se],
                                      f_rollout,
                                      var_scope="rollout_policy")

            if not with_ob:
                net_model = network.Network([se, input_action],
                                            f_tran,
                                            var_scope="TranModel")
                net_decoder = network.Network([
                    tf.concat(
                        (encode_state, encode_state), axis=-1), input_frame
                ],
                                              f_decoder,
                                              var_scope="Decoder")

            else:
                net_model = network.Network([input_observation, input_action],
                                            f_tran,
                                            var_scope="TranModelOB")
                net_decoder = network.Network([input_frame],
                                              f_decoder,
                                              var_scope="DecoderOB")

            rollout_encoder = network.Network(
                [tf.concat((se, se), axis=-1), input_reward],
                f_encoder,
                var_scope="rollout_encoder")

            current_state = se
            current_ob = input_observation

            for i in range(rollout_lane):
                for j in range(rollout_depth):
                    current_rollout = rollout([current_state],
                                              name_scope="rollout_%d_%d" %
                                              (i, j))

                    # rollout_action_dist = tf.contrib.distributions.Categorical(rollout_action_function.output().op)
                    # current_action = rollout_action_dist.sample()

                    if not with_ob:
                        tran_model = net_model([
                            current_state, current_rollout["rollout_action"].op
                        ],
                                               name_scope="env_model_%d_%d" %
                                               (i, j))
                    else:
                        tran_model = net_model(
                            [current_ob, current_rollout["rollout_action"].op],
                            name_scope="env_model_%d_%d" % (i, j))

                    next_goal = tran_model["next_state"].op
                    reward = tran_model["reward"].op

                    if not with_ob:
                        current_state += next_goal
                    else:
                        current_ob = tf.concat(
                            [current_ob[:, :, :, 3:], next_goal], axis=-1)
                        next_goal = tf.stop_gradient(
                            net_se([current_ob])["se"].op)

                    if j == 0:
                        encode_states = next_goal
                        rollout_reward = reward
                    else:
                        encode_states = tf.concat([next_goal, encode_states],
                                                  axis=-1)
                        rollout_reward = tf.concat([rollout_reward, reward],
                                                   axis=0)

                current_state = se
                current_ob = input_observation

                input_reward = tf.reshape(rollout_reward, [-1, rollout_depth])
                input_reward = tf.split(input_reward, rollout_depth, axis=1)
                encode_state = tf.split(encode_states, rollout_depth, axis=1)

                for m in range(rollout_depth):
                    if m == 0:
                        rollout_encoder = rollout_encoder(
                            [
                                tf.concat([
                                    encode_state[-(m + 1)],
                                    encode_state[-(m + 1)]
                                ],
                                          axis=-1), input_reward[-(m + 1)]
                            ],
                            name_scope="rollout_encoder_%d_%d" % (i, m))
                        re = rollout_encoder["re"].op

                    else:
                        rollout_encoder = rollout_encoder(
                            [
                                tf.concat([re, encode_state[-(m + 1)]],
                                          axis=-1), input_reward[-(m + 1)]
                            ],
                            name_scope="rollout_encoder_%d_%d" % (i, m))
                        re = rollout_encoder["re"].op

                if i == 0:
                    path = re
                else:
                    path = tf.concat([path, re], axis=1)
            if policy_with_iaa:
                feature = tf.concat([path, se], axis=1)
            else:
                feature = se
            ac = network.Network([feature], f_ac, var_scope='ac')
            v = ac["v"].op
            pi_dist = ac["pi"].op

            return {"v": v, "pi": pi_dist, "rollout_action": None}, \
                    {
                        "se": net_se, "transition": net_model,
                        "state_decoder": net_decoder
                    }

        self._log_dir = log_dir
        self._rollout_depth = rollout_depth
        if dynamic_rollout is None:
            self._dynamic_rollout = [1, 3, 5]
            self._dynamic_skip_step = [5000, 15000]
        else:
            self._dynamic_rollout = dynamic_rollout
            self._dynamic_skip_step = dynamic_skip_step
        kwargs.update({
            "f_iaa": f_iaa,
            "state_shape": state_shape,
            "num_action": num_action,
            "discount_factor": discount_factor,
            "entropy": entropy,
            "target_estimator": target_estimator,
            "max_advantage": max_advantage,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
        })
        logging.warning(network_optimizer)
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TrajectoryOnSampler(interval=batch_size)
            kwargs.update({"sampler": sampler})

        super(ActorCriticWithI2A, self).__init__(*args, **kwargs)
        pi = self.network["pi"]
        if pi is not None:
            # discrete action: pi is categorical probability distribution
            self._pi_function = network.NetworkFunction(self.network["pi"])
            self._input_action = tf.placeholder(dtype=tf.uint8,
                                                shape=[None],
                                                name="input_action")

            self._pi_distribution = distribution.DiscreteDistribution(
                self._pi_function, self._input_action)
            q = self.network["q"]
            if q is not None:
                # network outputs q
                self._q_function = network.NetworkFunction(q)
                self._v_function = GreedyStateValueFunction(self._q_function)
            else:
                # network output v
                self._v_function = network.NetworkFunction(self.network["v"])
        else:
            # continuous action: mean / stddev represents normal distribution
            dim_action = self.network["mean"].op.shape.as_list()[-1]
            self._input_action = tf.placeholder(dtype=tf.float32,
                                                shape=[None, dim_action],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(
                outputs={
                    "mean": self.network["mean"],
                    "stddev": self.network["stddev"]
                },
                inputs=self.network.inputs)
            self._pi_distribution = distribution.NormalDistribution(
                self._pi_function, self._input_action)
            self._v_function = network.NetworkFunction(self.network["v"])
            # continuous action: mean / stddev for normal distribution

        # self._rollout_action = network.NetworkFunction(self.network["rollout_action"])
        # self._rollout_dist = distribution.DiscreteDistribution(self._rollout_action, self._input_action)

        if target_estimator is None:
            target_estimator = target_estimate.NStepTD(self._v_function,
                                                       discount_factor)
            # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
        self.network_optimizer = network_optimizer
        network_optimizer.add_updater(ActorCriticUpdater(
            policy_dist=self._pi_distribution,
            v_function=self._v_function,
            target_estimator=target_estimator,
            entropy=entropy),
                                      name="ac")
        network_optimizer.add_updater(network.L2(self.network), name="l2")
        # network_optimizer.add_updater(
        #     PolicyNetUpdater(rollout_dist=self._rollout_dist,
        #                      rollout_action_function=self._rollout_action,
        #                      pi_function=self._pi_function),
        #     name="policy_net"
        # )
        network_optimizer.add_updater(EnvModelUpdater(
            net_se=self.network.sub_net("se"),
            net_transition=self.network.sub_net("transition"),
            net_decoder=self.network.sub_net("state_decoder"),
            curriculum=self._dynamic_rollout,
            skip_step=self._dynamic_skip_step,
            state_shape=state_shape,
            dim_action=num_action,
            transition_weight=1.0,
            with_momentum=with_momentum,
            compute_with_diff=compute_with_diff,
            save_image_interval=save_image_interval,
            with_ob=with_ob,
            with_goal=with_goal),
                                      name="env_model")
        # network_optimizer.freeze(self.network.sub_net("transition").variables)
        network_optimizer.compile()

        self._policy = StochasticPolicy(self._pi_distribution)
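When compute_with_diff is set, f_iaa feeds the state encoder with differences of consecutive RGB frames instead of the raw frame stack, which is why processed_state_shape drops one frame's worth of channels. A minimal NumPy sketch of that preprocessing (the 96x96 frame size and the stack of 4 frames are placeholders):

import numpy as np

def frame_diffs(observation):
    # observation: [H, W, 3k] stacked RGB frames -> [H, W, 3(k-1)] frame differences
    k = observation.shape[-1] // 3
    diffs = [observation[..., (i + 1) * 3:(i + 2) * 3] - observation[..., i * 3:(i + 1) * 3]
             for i in range(k - 1)]
    return np.concatenate(diffs, axis=-1)

stacked = np.random.rand(96, 96, 12).astype(np.float32)   # 4 stacked RGB frames
print(frame_diffs(stacked).shape)                          # (96, 96, 9)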
Example #7
    def __init__(
            self,
            f_se,
            f_actor,
            f_critic,
            f_noise,
            state_shape,
            dim_action,
            dim_noise,
            # ACUpdate arguments
            discount_factor,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # policy arguments
            ou_params=(0.0, 0.2, 0.2),
            noise_stddev=0.5,
            noise_weight=1.0,
            noise_mean_weight=1e-2,
            noise_stddev_weight=1e-4,
            # target network sync arguments
            target_sync_interval=10,
            target_sync_rate=0.01,
            # sampler arguments
            replay_size=1000,
            batch_size=32,
            disentangle_with_dpg=True,
            *args,
            **kwargs):
        """

        :param f_se: state encoder network creation function
        :param f_actor: actor network creation function
        :param f_critic: critic (Q) network creation function
        :param f_noise: action noise network creation function
        :param state_shape: state shape
        :param dim_action: action dimension
        :param dim_noise: dimension of the input noise vector
        :param discount_factor: reward discount factor
        :param network_optimizer: default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: gradient clip value
        :param ou_params: (mu, theta, sigma) of the OU noise source
        :param noise_stddev: target stddev used by the disentangle updater
        :param noise_weight: weight of the disentangle updater
        :param noise_mean_weight: zero-mean regularization weight for the action noise
        :param noise_stddev_weight: stddev regularization weight for the action noise
        :param target_sync_interval: number of updates between target network syncs
        :param target_sync_rate: soft sync rate for the target network
        :param replay_size: capacity of the replay buffer
        :param batch_size: batch size for the sampler
        :param disentangle_with_dpg: if True, use DisentangleNoisyDPGUpdater;
                otherwise use NoisyDPGUpdater plus a separate DisentangleUpdater
        :param args:
        :param kwargs:
        """
        kwargs.update({
            "f_se": f_se,
            "f_actor": f_actor,
            "f_critic": f_critic,
            "f_noise": f_noise,
            "state_shape": state_shape,
            "dim_action": dim_action,
            "dim_noise": dim_noise,
            "discount_factor": discount_factor,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
            "replay_size": replay_size,
            "ou_params": ou_params,
            "noise_stddev": noise_stddev,
            "noise_weight": noise_weight
        })
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        super(NoisyDPG, self).__init__(*args, **kwargs)

        self._disentangle_with_dpg = disentangle_with_dpg

        def make_sample(state, action, reward, next_state, episode_done, noise,
                        **kwargs):
            sample = sampling.default_make_sample(state, action, reward,
                                                  next_state, episode_done)
            sample.update({"noise": noise})
            return sample

        self._sampler = sampling.TransitionSampler(
            hrl.playback.MapPlayback(replay_size),
            batch_size,
            4,
            sample_maker=make_sample)

        self._q_function = network.NetworkFunction(
            self.network["q"], inputs=[self._input_state, self._input_action])
        self._actor_function = network.NetworkFunction(
            self.network["action"],
            inputs=[self._input_state, self._input_noise])
        self._actor_mean_function = network.NetworkFunction(
            self.network["action_mean"], inputs=[self._input_state])
        self._noise_function = network.NetworkFunction(
            self.network["action_noise"],
            inputs=[self._input_state, self._input_noise])
        self._target_q_function = network.NetworkFunction(
            self.network.target["q"],
            inputs=[self._input_state, self._input_action])
        self._target_actor_function = network.NetworkFunction(
            self.network.target["action"],
            inputs=[self._input_state, self._input_noise])
        target_estimator = NoisyContinuousActionEstimator(
            self._target_actor_function, self._target_q_function,
            discount_factor)
        self.network_optimizer = network_optimizer
        if disentangle_with_dpg:
            network_optimizer.add_updater(DisentangleNoisyDPGUpdater(
                actor=self._actor_function,
                critic=self._q_function,
                f_noise=self._noise_function,
                target_estimator=target_estimator,
                discount_factor=discount_factor,
                actor_weight=0.02,
                actor_mean=self._actor_mean_function,
                zero_mean_weight=noise_mean_weight,
                stddev_weight=noise_stddev_weight),
                                          name="ac")
        else:
            network_optimizer.add_updater(NoisyDPGUpdater(
                actor=self._actor_function,
                critic=self._q_function,
                target_estimator=target_estimator,
                discount_factor=discount_factor,
                actor_weight=0.02,
                actor_mean=self._actor_mean_function),
                                          name="ac")
            network_optimizer.add_updater(DisentangleUpdater(
                self.network.sub_net("se"),
                self.network.sub_net("noise"),
                stddev=noise_stddev),
                                          weight=noise_weight,
                                          name="disentangle")
        network_optimizer.add_updater(network.L2(self.network), name="l2")
        network_optimizer.compile()

        self._act_all_function = network.NetworkFunction(
            {
                "action": self.network["action"],
                "mean": self.network["action_mean"],
                "noise": self.network["action_noise"]
            },
            inputs=[self._input_state, self._input_noise])

        self._noise_source = OUNoise([dim_noise], *ou_params)
        self._last_input_noise = None
        # self._policy = OUExplorationPolicy(self._actor_function, *ou_params)
        self._target_sync_interval = target_sync_interval
        self._target_sync_rate = target_sync_rate
        self._update_count = 0
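target_sync_interval and target_sync_rate describe the usual soft target-network update: every target_sync_interval learning steps, each target variable moves a fraction target_sync_rate toward its online counterpart. A minimal sketch, assuming TensorFlow 1.x and variable lists supplied by the caller:

import tensorflow as tf

def soft_sync_ops(online_vars, target_vars, rate=0.01):
    # target <- (1 - rate) * target + rate * online, built once and run at every sync
    return [t.assign((1.0 - rate) * t + rate * o)
            for o, t in zip(online_vars, target_vars)]

# usage sketch (names assumed): sync_ops = soft_sync_ops(net.variables, net.target.variables)
#                               session.run(sync_ops) every target_sync_interval updates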
Example #8
File: icm.py  Project: hobotrl/hobotrl
    def __init__(
            self,
            env,
            f_se,
            f_ac,
            f_forward,
            f_inverse,
            state_shape,
            # ACUpdate arguments
            discount_factor,
            entropy=1e-3,
            target_estimator=None,
            max_advantage=10.0,
            # optimizer arguments
            network_optimizer=None,
            max_gradient=10.0,
            # sampler arguments
            sampler=None,
            batch_size=32,
            *args,
            **kwargs):
        """
        :param env: gym environment, used for env.action_space.n
        :param f_se: state encoder network creation function, producing {"se": op_se}
        :param f_ac: actor-critic head creation function, producing {"pi": dist_pi, "v": v}
        :param f_forward: ICM forward model creation function, producing {"phi2_hat": op_phi2_hat}
        :param f_inverse: ICM inverse model creation function, producing {"logits": op_logits}
        :param state_shape:
        :param discount_factor:
        :param entropy: entropy regularization weight.
        :param target_estimator: optional, default to target_estimate.NStepTD
        :type target_estimator: target_estimate.TargetEstimator
        :param max_advantage: maximum advantage value allowed in the policy gradient step
        :param network_optimizer: optional, default to network.LocalOptimizer
        :type network_optimizer: network.NetworkOptimizer
        :param max_gradient: optional, max_gradient clip value
        :param sampler: optional, default to sampling.TrajectoryOnSampler.
                if None, a TrajectoryOnSampler will be created using batch_size.
        :type sampler: sampling.Sampler
        :param batch_size: optional, batch_size when creating sampler
        :param args:
        :param kwargs:
        """
        def f_icm(inputs):
            """
            :param inputs: a list, [state, next_state, action]
            :return: a dict of op
            """
            f_se1 = network.Network([inputs[0]], f_se, var_scope='learn_se1')
            f_se1 = network.NetworkFunction(f_se1["se"]).output().op
            f_se2 = network.Network([inputs[1]], f_se, var_scope='learn_se2')
            f_se2 = network.NetworkFunction(f_se2["se"]).output().op

            f_ac_out = network.Network([f_se1], f_ac, var_scope='learn_ac')
            v = network.NetworkFunction(f_ac_out["v"]).output().op
            pi_dist = network.NetworkFunction(f_ac_out["pi"]).output().op

            one_hot_action = tf.one_hot(indices=inputs[2],
                                        depth=env.action_space.n,
                                        on_value=1.0,
                                        off_value=0.0,
                                        axis=-1)
            f_forward_out = network.Network([one_hot_action, f_se1],
                                            f_forward,
                                            var_scope='learn_forward')
            phi2_hat = network.NetworkFunction(
                f_forward_out["phi2_hat"]).output().op

            f_inverse_out = network.Network([f_se1, f_se2],
                                            f_inverse,
                                            var_scope='learn_inverse')
            logits = network.NetworkFunction(
                f_inverse_out["logits"]).output().op

            bonus = 0.05 * tf.reduce_sum(tf.square(f_se2 - phi2_hat), axis=1)

            return {
                "pi": pi_dist,
                "v": v,
                "logits": logits,
                "phi1": f_se1,
                "phi2": f_se2,
                "phi2_hat": phi2_hat,
                "bonus": bonus
            }

        kwargs.update({
            "f_icm": f_icm,
            "state_shape": state_shape,
            "discount_factor": discount_factor,
            "entropy": entropy,
            "target_estimator": target_estimator,
            "max_advantage": max_advantage,
            "max_gradient": max_gradient,
            "batch_size": batch_size,
        })
        print "network_optimizer:", network_optimizer
        if network_optimizer is None:
            network_optimizer = network.LocalOptimizer(grad_clip=max_gradient)
        if sampler is None:
            sampler = sampling.TrajectoryOnSampler(interval=batch_size)
            kwargs.update({"sampler": sampler})

        super(ActorCriticWithICM, self).__init__(*args, **kwargs)

        pi = self.network["pi"]

        if pi is not None:
            # discrete action: pi is categorical probability distribution
            self._pi_function = network.NetworkFunction(self.network["pi"])
            self._input_action = tf.placeholder(dtype=tf.uint8,
                                                shape=[None],
                                                name="input_action")

            self._pi_distribution = distribution.DiscreteDistribution(
                self._pi_function, self._input_action)
            q = self.network["q"]
            if q is not None:
                # network outputs q
                self._q_function = network.NetworkFunction(q)
                self._v_function = GreedyStateValueFunction(self._q_function)
            else:
                # network output v
                self._v_function = network.NetworkFunction(self.network["v"])
        else:
            # continuous action: mean / stddev represents normal distribution
            dim_action = self.network["mean"].op.shape.as_list()[-1]
            self._input_action = tf.placeholder(dtype=tf.float32,
                                                shape=[None, dim_action],
                                                name="input_action")
            self._pi_function = network.NetworkFunction(
                outputs={
                    "mean": self.network["mean"],
                    "stddev": self.network["stddev"]
                },
                inputs=self.network.inputs)
            self._pi_distribution = distribution.NormalDistribution(
                self._pi_function, self._input_action)
            self._v_function = network.NetworkFunction(self.network["v"])
            # continuous action: mean / stddev for normal distribution
        self._phi2_hat_function = network.NetworkFunction(
            self.network["phi2_hat"])
        self._phi2_function = network.NetworkFunction(self.network["phi2"])
        self._phi1_function = network.NetworkFunction(self.network["phi1"])
        self._logits = network.NetworkFunction(self.network["logits"])
        self._bonus = network.NetworkFunction(self.network["bonus"])

        if target_estimator is None:
            target_estimator = target_estimate.NStepTD(self._v_function,
                                                       discount_factor,
                                                       bonus=self._bonus)
            # target_estimator = target_estimate.GAENStep(self._v_function, discount_factor)
        self.network_optimizer = network_optimizer
        network_optimizer.add_updater(ActorCriticUpdater(
            policy_dist=self._pi_distribution,
            v_function=self._v_function,
            target_estimator=target_estimator,
            entropy=entropy),
                                      name="ac")
        network_optimizer.add_updater(network.L2(self.network), name="l2")
        network_optimizer.add_updater(ForwardUpdater(
            forward_function=self._phi2_hat_function,
            feature_function=self._phi2_function,
            policy_dist=self._pi_distribution),
                                      name="forward")
        network_optimizer.add_updater(InverseUpdater(
            inverse_function=self._logits, policy_dist=self._pi_distribution),
                                      name="inverse")
        network_optimizer.compile()

        self._policy = StochasticPolicy(self._pi_distribution)
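The ForwardUpdater and InverseUpdater added above train the two halves of the curiosity module: the forward model is regressed onto the next-state features, and the inverse model is trained to classify which action was taken from the feature pair. A minimal sketch of those two losses, assuming TensorFlow 1.x; the placeholder names and the 6-action space are illustrative, not hobotrl's API:

import tensorflow as tf

phi2 = tf.placeholder(tf.float32, [None, 64], name="phi2")             # features of next state
phi2_hat = tf.placeholder(tf.float32, [None, 64], name="phi2_hat")     # forward-model prediction
logits = tf.placeholder(tf.float32, [None, 6], name="inverse_logits")  # inverse-model action logits
action = tf.placeholder(tf.int32, [None], name="action")               # actions actually taken

# forward loss: squared error in feature space (the same error that feeds the bonus)
forward_loss = 0.5 * tf.reduce_mean(tf.reduce_sum(tf.square(phi2 - phi2_hat), axis=1))
# inverse loss: cross-entropy of predicting the taken action from (phi1, phi2)
inverse_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=action, logits=logits))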