Example #1
File: model.py  Project: hobotrl/hobotrl
        def f(inputs):
            input_state, input_action, input_frame = inputs[0], inputs[
                1], inputs[2]
            action_onehot = tf.one_hot(indices=input_action,
                                       depth=num_actions,
                                       on_value=1.0,
                                       off_value=0.0,
                                       axis=-1)
            net_se = network.Network([input_state],
                                     f_se,
                                     var_scope="state_encoder")
            se = net_se["se"].op

            if not self._with_ob:
                net_transition = network.Network([se, action_onehot],
                                                 f_transition,
                                                 var_scope="TranModel")
                net_decoder = network.Network(
                    [tf.concat((se, se), axis=-1), input_frame],
                    f_decoder,
                    var_scope="Decoder")
            else:
                net_transition = network.Network([input_state, action_onehot],
                                                 f_transition,
                                                 var_scope="ObTranModel")
                net_decoder = network.Network([input_frame],
                                              f_decoder,
                                              var_scope="ObDecoder")
            return {}, {
                "se": net_se,
                "decoder": net_decoder,
                "transition": net_transition
            }
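In the snippet above, the only thing visible about f_se is its calling convention: it receives the input list and must publish a tensor under the "se" key, which the caller reads as net_se["se"].op. A minimal sketch of such a state-encoder builder (TensorFlow 1.x style, matching the examples; the layer sizes and body are assumptions, not the hobotrl implementation):

import tensorflow as tf

def f_se(inputs):
    # Hypothetical encoder body; only the input-list argument and the {"se": ...}
    # return convention are taken from the example above.
    state = inputs[0]
    x = tf.layers.conv2d(state, filters=32, kernel_size=3, strides=2,
                         activation=tf.nn.relu)
    x = tf.layers.flatten(x)
    se = tf.layers.dense(x, 256, activation=tf.nn.relu, name="se")
    return {"se": se}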
Example #2
 def init_network(self, f_create_net, state_shape, *args, **kwargs):
     input_state = tf.placeholder(dtype=tf.float32,
                                  shape=[None] + list(state_shape),
                                  name="input_state")
     net = network.Network([input_state], f_create_net, var_scope="learn")
     self._old_network = network.Network([input_state],
                                         f_create_net,
                                         var_scope="old")
     self._old_network_syncer = network.NetworkSyncer(
         net, self._old_network)
     return net
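This init_network builds the same graph twice over one placeholder, under the "learn" and "old" variable scopes, and wires a NetworkSyncer between the two copies; presumably the syncer copies the learned parameters into the "old" copy, the usual target-network pattern. What f_create_net looks like is not shown; a minimal sketch under the same dict-of-tensors convention used in the other examples (the body and layer sizes are assumptions):

import tensorflow as tf

def f_create_net(inputs):
    # Hypothetical value head; only the input-list argument and the dict return
    # convention are taken from these examples.
    state = inputs[0]
    hidden = tf.layers.dense(state, 64, activation=tf.nn.relu)
    v = tf.layers.dense(hidden, 1, name="v")
    return {"v": v}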
Example #3
 def f(inputs):
     state, action = input_state, input_action
     net_se = network.Network([state], f_se, var_scope="se")
     se = net_se["se"].op
     net_actor = network.Network([se], f_actor, var_scope="actor")
     net_critic = network.Network([se, input_action], f_critic, var_scope="critic")
     net_critic_for_a = net_critic([se, net_actor["action"].op], name_scope="v_critic")
     return {
         "action": net_actor["action"].op,
         "q": net_critic["q"].op,
         "v": net_critic_for_a["q"].op,
     }, {
         "se": net_se,
         "actor": net_actor,
         "critic": net_critic
     }
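Note the second use of net_critic: calling the already-built Network instance with a new input list and a name_scope appears to re-apply the critic to the actor's output action while sharing the critic's variables, so "q" is the critic evaluated at the sampled action placeholder and "v" is the critic evaluated at the actor's own action. The critic builder itself is not shown; a minimal sketch consistent with the [se, action] input order and the "q" output key above (body and sizes are assumptions):

import tensorflow as tf

def f_critic(inputs):
    # Hypothetical critic body; only the [se, action] input order and the "q" key
    # are taken from the example above.
    se, action = inputs[0], inputs[1]
    x = tf.concat([se, action], axis=-1)
    x = tf.layers.dense(x, 64, activation=tf.nn.relu)
    q = tf.squeeze(tf.layers.dense(x, 1), axis=-1, name="q")
    return {"q": q}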
Example #4
 def init_network(self, f_iaa, state_shape, num_action, *args, **kwargs):
     input_state = tf.placeholder(dtype=tf.float32,
                                  shape=[None] + list(state_shape),
                                  name="input_state")
     input_action = tf.placeholder(dtype=tf.uint8,
                                   shape=[None],
                                   name="input_action")
     return network.Network([input_state, input_action, num_action],
                            f_iaa,
                            var_scope="learn")
Example #5
File: icm.py  Project: hobotrl/hobotrl
        def f_icm(inputs):
            """
            :param inputs: a list, [state, next_state, action]
            :return: a dict of op
            """
            f_se1 = network.Network([inputs[0]], f_se, var_scope='learn_se1')
            f_se1 = network.NetworkFunction(f_se1["se"]).output().op
            f_se2 = network.Network([inputs[1]], f_se, var_scope='learn_se2')
            f_se2 = network.NetworkFunction(f_se2["se"]).output().op

            f_ac_out = network.Network([f_se1], f_ac, var_scope='learn_ac')
            v = network.NetworkFunction(f_ac_out["v"]).output().op
            pi_dist = network.NetworkFunction(f_ac_out["pi"]).output().op

            one_hot_action = tf.one_hot(indices=inputs[2],
                                        depth=env.action_space.n,
                                        on_value=1.0,
                                        off_value=0.0,
                                        axis=-1)
            f_forward_out = network.Network([one_hot_action, f_se1],
                                            f_forward,
                                            var_scope='learn_forward')
            phi2_hat = network.NetworkFunction(
                f_forward_out["phi2_hat"]).output().op

            f_inverse_out = network.Network([f_se1, f_se2],
                                            f_inverse,
                                            var_scope='learn_inverse')
            logits = network.NetworkFunction(
                f_inverse_out["logits"]).output().op

            bonus = 0.05 * tf.reduce_sum(tf.square(f_se2 - phi2_hat), axis=1)

            return {
                "pi": pi_dist,
                "v": v,
                "logits": logits,
                "phi1": f_se1,
                "phi2": f_se2,
                "phi2_hat": phi2_hat,
                "bonus": bonus
            }
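The intrinsic reward here is the standard ICM curiosity bonus: 0.05 times the squared error between the forward model's predicted next-state embedding (phi2_hat) and the actual next-state embedding (f_se2). The forward-model builder is not shown; a minimal sketch consistent with the [one_hot_action, f_se1] input order and the "phi2_hat" output key (layer sizes and body are assumptions):

import tensorflow as tf

def f_forward(inputs):
    # Hypothetical forward model; predicts the next-state embedding from the current
    # embedding and a one-hot action. Only the input order and the "phi2_hat" key are
    # taken from the example above.
    action_onehot, phi1 = inputs[0], inputs[1]
    x = tf.concat([phi1, action_onehot], axis=-1)
    x = tf.layers.dense(x, 256, activation=tf.nn.relu)
    phi2_hat = tf.layers.dense(x, phi1.shape.as_list()[-1], name="phi2_hat")
    return {"phi2_hat": phi2_hat}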
Example #6
File: model.py  Project: hobotrl/hobotrl
    def init_network(self, f_se, f_transition, f_decoder, state_shape,
                     num_actions, *args, **kwargs):
        def f(inputs):
            input_state, input_action, input_frame = inputs[0], inputs[
                1], inputs[2]
            action_onehot = tf.one_hot(indices=input_action,
                                       depth=num_actions,
                                       on_value=1.0,
                                       off_value=0.0,
                                       axis=-1)
            net_se = network.Network([input_state],
                                     f_se,
                                     var_scope="state_encoder")
            se = net_se["se"].op

            if not self._with_ob:
                net_transition = network.Network([se, action_onehot],
                                                 f_transition,
                                                 var_scope="TranModel")
                net_decoder = network.Network(
                    [tf.concat((se, se), axis=-1), input_frame],
                    f_decoder,
                    var_scope="Decoder")
            else:
                net_transition = network.Network([input_state, action_onehot],
                                                 f_transition,
                                                 var_scope="ObTranModel")
                net_decoder = network.Network([input_frame],
                                              f_decoder,
                                              var_scope="ObDecoder")
            return {}, {
                "se": net_se,
                "decoder": net_decoder,
                "transition": net_transition
            }

        input_frame = tf.placeholder(
            dtype=tf.float32,
            shape=[None, state_shape[0], state_shape[1], 3],
            name="input_frame")
        input_state = tf.placeholder(dtype=tf.float32,
                                     shape=[None] + list(state_shape),
                                     name="input_state")
        input_action = tf.placeholder(dtype=tf.uint8,
                                      shape=[None],
                                      name="input_action")
        return network.Network([input_state, input_action, input_frame],
                               f,
                               var_scope="learn")
Example #7
 def f(inputs):
     state, action, noise = inputs
     se_net = network.Network([state], f_se, var_scope="se")
     se = se_net["se"].op
     q_net = network.Network([se, action], f_critic, var_scope="critic")
     pi_net = network.Network([se], f_actor, var_scope="actor")
     noise_net = network.Network([tf.stop_gradient(se), noise],
                                 f_noise,
                                 var_scope="noise")
     a_out, n_out = pi_net["action"].op, noise_net["noise"].op
     action_out = a_out + tf.abs(tf.sign(n_out) -
                                 a_out) * tf.tanh(n_out)
     return {
         "se": se,
         "action": action_out,
         "action_mean": a_out,
         "action_noise": n_out,
         "q": q_net["q"].op
     }, {
         "se": se_net,
         "actor": pi_net,
         "critic": q_net,
         "noise": noise_net
     }
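The action_out expression blends the actor mean with the noise network's output: assuming the actor's action lies in [-1, 1], a positive n_out gives |sign(n_out) - a_out| = 1 - a_out, so the action is pushed toward +1 by a fraction tanh(n_out); a negative n_out pushes it toward -1. Either way the result stays inside (-1, 1). A quick plain-NumPy check of that formula (hypothetical values, not part of the project):

import numpy as np

a = np.array([0.2, 0.2, -0.5])   # hypothetical deterministic actor outputs
n = np.array([1.5, -1.5, 0.3])   # hypothetical noise-network outputs
out = a + np.abs(np.sign(n) - a) * np.tanh(n)
print(out)                       # every entry remains within (-1, 1)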
Example #8
        def f_iaa(inputs):
            input_observation = inputs[0]
            if compute_with_diff:
                logging.warning("use diff 2333")
                diff_ob = []
                # integer division on the channel count so this runs under Python 3
                for i in range(input_observation.shape.as_list()[-1] // 3 - 1):
                    diff_ob.append(input_observation[:, :, :, (i + 1) *
                                                     3:(i + 1) * 3 + 3] -
                                   input_observation[:, :, :, i * 3:i * 3 + 3])
                net_se = network.Network([tf.concat(diff_ob[:], axis=3)],
                                         f_se,
                                         var_scope="se_1")
                self.processed_state_shape = copy.copy(state_shape)
                self.processed_state_shape[-1] = state_shape[-1] - 3
            else:
                net_se = network.Network([input_observation],
                                         f_se,
                                         var_scope="se_1")
                self.processed_state_shape = state_shape
            input_action = inputs[1]
            action_dim = inputs[2]
            input_action = tf.one_hot(indices=input_action,
                                      depth=action_dim,
                                      on_value=1.0,
                                      off_value=0.0,
                                      axis=-1)

            se = net_se["se"].op

            input_reward = tf.placeholder(dtype=tf.float32,
                                          shape=[None, 1],
                                          name="input_reward")
            encode_state = tf.placeholder(dtype=tf.float32,
                                          shape=[None,
                                                 se.shape.as_list()[-1]],
                                          name="encode_states")
            input_frame = tf.placeholder(
                dtype=tf.float32,
                shape=[None, state_shape[0], state_shape[1], 3],
                name="input_frame")
            rollout = network.Network([se],
                                      f_rollout,
                                      var_scope="rollout_policy")

            if not with_ob:
                net_model = network.Network([se, input_action],
                                            f_tran,
                                            var_scope="TranModel")
                net_decoder = network.Network([
                    tf.concat(
                        (encode_state, encode_state), axis=-1), input_frame
                ],
                                              f_decoder,
                                              var_scope="Decoder")

            else:
                net_model = network.Network([input_observation, input_action],
                                            f_tran,
                                            var_scope="TranModelOB")
                net_decoder = network.Network([input_frame],
                                              f_decoder,
                                              var_scope="DecoderOB")

            rollout_encoder = network.Network(
                [tf.concat((se, se), axis=-1), input_reward],
                f_encoder,
                var_scope="rollout_encoder")

            current_state = se
            current_ob = input_observation

            for i in range(rollout_lane):
                for j in range(rollout_depth):
                    current_rollout = rollout([current_state],
                                              name_scope="rollout_%d_%d" %
                                              (i, j))

                    # rollout_action_dist = tf.contrib.distributions.Categorical(rollout_action_function.output().op)
                    # current_action = rollout_action_dist.sample()

                    if not with_ob:
                        tran_model = net_model([
                            current_state, current_rollout["rollout_action"].op
                        ],
                                               name_scope="env_model_%d_%d" %
                                               (i, j))
                    else:
                        tran_model = net_model(
                            [current_ob, current_rollout["rollout_action"].op],
                            name_scope="env_model_%d_%d" % (i, j))

                    next_goal = tran_model["next_state"].op
                    reward = tran_model["reward"].op

                    if not with_ob:
                        current_state += next_goal
                    else:
                        current_ob = tf.concat(
                            [current_ob[:, :, :, 3:], next_goal], axis=-1)
                        next_goal = tf.stop_gradient(
                            net_se([current_ob])["se"].op)

                    if j == 0:
                        encode_states = next_goal
                        rollout_reward = reward
                    else:
                        encode_states = tf.concat([next_goal, encode_states],
                                                  axis=-1)
                        rollout_reward = tf.concat([rollout_reward, reward],
                                                   axis=0)

                current_state = se
                current_ob = input_observation

                input_reward = tf.reshape(rollout_reward, [-1, rollout_depth])
                input_reward = tf.split(input_reward, rollout_depth, axis=1)
                encode_state = tf.split(encode_states, rollout_depth, axis=1)

                for m in range(rollout_depth):
                    if m == 0:
                        rollout_encoder = rollout_encoder(
                            [
                                tf.concat([
                                    encode_state[-(m + 1)],
                                    encode_state[-(m + 1)]
                                ],
                                          axis=-1), input_reward[-(m + 1)]
                            ],
                            name_scope="rollout_encoder_%d_%d" % (i, m))
                        re = rollout_encoder["re"].op

                    else:
                        rollout_encoder = rollout_encoder(
                            [
                                tf.concat([re, encode_state[-(m + 1)]],
                                          axis=-1), input_reward[-(m + 1)]
                            ],
                            name_scope="rollout_encoder_%d_%d" % (i, m))
                        re = rollout_encoder["re"].op

                if i == 0:
                    path = re
                else:
                    path = tf.concat([path, re], axis=1)
            if policy_with_iaa:
                feature = tf.concat([path, se], axis=1)
            else:
                feature = se
            ac = network.Network([feature], f_ac, var_scope='ac')
            v = ac["v"].op
            pi_dist = ac["pi"].op

            return {"v": v, "pi": pi_dist, "rollout_action": None}, \
                    {
                        "se": net_se, "transition": net_model,
                        "state_decoder": net_decoder
                    }
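The rollout policy's contract is only partly visible here: it is built from the state embedding alone and must publish a "rollout_action" tensor that the transition model accepts in place of the one-hot input_action. A minimal sketch of such an f_rollout (TensorFlow 1.x style; the action count, layer sizes, and the softmax head are assumptions, not the hobotrl implementation):

import tensorflow as tf

action_dim = 6  # hypothetical action count; in Example #8 this value arrives as inputs[2]

def f_rollout(inputs):
    # Hypothetical rollout policy; only the single-state input and the "rollout_action"
    # output key are taken from the example above. A softmax head matches the one-hot
    # action the transition model otherwise receives.
    se = inputs[0]
    x = tf.layers.dense(se, 64, activation=tf.nn.relu)
    rollout_action = tf.nn.softmax(tf.layers.dense(x, action_dim), name="rollout_action")
    return {"rollout_action": rollout_action}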