Example #1
    def create_model(self, model_info):
        """Create keras model."""
        state_input = Input(shape=self.state_dim, name='state_input')
        advantage = Input(shape=(1, ), name='adv')

        denselayer = Dense(HIDDEN_SIZE, activation='relu')(state_input)
        for _ in range(NUM_LAYERS - 1):
            denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer)

        out_actions = Dense(self.action_dim,
                            activation='softmax',
                            name='output_actions')(denselayer)  # y_pred
        out_value = Dense(1, name='output_value')(denselayer)
        model = Model(inputs=[state_input, advantage],
                      outputs=[out_actions, out_value])
        losses = {
            "output_actions": impala_loss(advantage),
            "output_value": 'mse'
        }
        lossweights = {"output_actions": 1.0, "output_value": .5}

        model.compile(optimizer=Adam(lr=LR),
                      loss=losses,
                      loss_weights=lossweights)

        self.infer_state = tf.placeholder(tf.float32,
                                          name="infer_state",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1))
        self.infer_p, self.infer_v = model([self.infer_state, self.adv])
        self.actor_var = TFVariables([self.infer_p, self.infer_v], self.sess)
        self.sess.run(tf.initialize_all_variables())

        return model
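
Keras losses only receive `(y_true, y_pred)`, which is why the advantage is wired in as a second model input: `impala_loss(advantage)` has to be a factory that closes over that tensor and returns a Keras-compatible loss. A minimal sketch of such a factory, assuming a plain advantage-weighted policy-gradient term; the project's actual `impala_loss` is not shown here and may differ:

import tensorflow as tf

def policy_gradient_loss(advantage):  # hypothetical stand-in for impala_loss
    """Return a Keras-compatible loss closed over the advantage tensor."""
    def loss(y_true, y_pred):
        # y_true: one-hot behaviour actions, y_pred: softmax policy output
        action_prob = tf.reduce_sum(y_true * y_pred, axis=-1, keepdims=True)
        log_prob = tf.log(action_prob + 1e-10)
        # the advantage is treated as a constant for the policy gradient
        return -log_prob * tf.stop_gradient(advantage)
    return loss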
Example #2
    def create_model(self, model_info):
        """Create Deep-Q network."""
        state = Input(shape=self.state_dim)
        denselayer = Dense(HIDDEN_SIZE, activation='relu')(state)
        for _ in range(NUM_LAYERS - 1):
            denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer)

        value = Dense(self.action_dim, activation='linear')(denselayer)
        if self.dueling:
            adv = Dense(1, activation='linear')(denselayer)
            mean = Lambda(layer_normalize)(value)
            value = Lambda(layer_add)([adv, mean])

        model = Model(inputs=state, outputs=value)
        adam = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=adam)

        self.infer_state = tf.placeholder(tf.float32,
                                          name="infer_input",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.infer_v = model(self.infer_state)
        self.actor_var = TFVariables([self.infer_v], self.sess)

        self.sess.run(tf.initialize_all_variables())
        return model
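
`layer_normalize` and `layer_add` are helpers defined elsewhere in the project. Assuming they implement the standard dueling aggregation Q = V + (A - mean(A)) (note that in this snippet the branch named `value` carries the per-action outputs and `adv` is the scalar branch), they could look roughly like this sketch:

import tensorflow as tf

def layer_normalize(x):
    # subtract the per-sample mean over the action dimension (sketch)
    return x - tf.reduce_mean(x, axis=-1, keepdims=True)

def layer_add(inputs):
    # element-wise sum; broadcasting expands the scalar branch
    # across the action dimension (sketch)
    return inputs[0] + inputs[1]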
Example #3
File: ppo.py  Project: shishouyuan/xingtian
    def build_graph(self, input_type, model):
        # pylint: disable=W0201
        self.state_ph = tf.placeholder(input_type,
                                       name='state',
                                       shape=(None, *self.state_dim))
        self.old_logp_ph = tf.placeholder(tf.float32,
                                          name='old_log_p',
                                          shape=(None, 1))
        self.adv_ph = tf.placeholder(tf.float32,
                                     name='advantage',
                                     shape=(None, 1))
        self.old_v_ph = tf.placeholder(tf.float32,
                                       name='old_v',
                                       shape=(None, 1))
        self.target_v_ph = tf.placeholder(tf.float32,
                                          name='target_value',
                                          shape=(None, 1))

        pi_latent, self.out_v = model(self.state_ph)

        if self.action_type == 'Categorical':
            self.behavior_action_ph = tf.placeholder(tf.int32,
                                                     name='behavior_action',
                                                     shape=(None, ))
            dist_param = pi_latent
        elif self.action_type == 'DiagGaussian':
            # fixme: add input-dependent log_std logic
            self.behavior_action_ph = tf.placeholder(tf.float32,
                                                     name='real_action',
                                                     shape=(None,
                                                            self.action_dim))
            log_std = tf.get_variable('pi_logstd',
                                      shape=(1, self.action_dim),
                                      initializer=tf.zeros_initializer())
            dist_param = tf.concat([pi_latent, pi_latent * 0.0 + log_std],
                                   axis=-1)
        else:
            raise NotImplementedError(
                'action type: {} not match any implemented distributions.'.
                format(self.action_type))

        self.dist.init_by_param(dist_param)
        self.action = self.dist.sample()
        self.action_log_prob = self.dist.log_prob(self.action)
        self.actor_var = TFVariables([self.action_log_prob, self.out_v],
                                     self.sess)

        self.actor_loss = actor_loss_with_entropy(self.dist, self.adv_ph,
                                                  self.old_logp_ph,
                                                  self.behavior_action_ph,
                                                  self.clip_ratio,
                                                  self.ent_coef)
        self.critic_loss = critic_loss(self.target_v_ph, self.out_v,
                                       self.old_v_ph, self.vf_clip)
        self.loss = self.actor_loss + self.critic_loss_coef * self.critic_loss
        self.train_op = self.build_train_op(self.loss)

        self.sess.run(tf.initialize_all_variables())
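
`actor_loss_with_entropy` and `critic_loss` come from the project's loss module and are not part of this excerpt. For orientation, a sketch of the clipped-surrogate objective PPO typically uses, with an entropy bonus; the real implementation (reductions, signs, value clipping) may differ, and `dist.entropy()` is assumed to exist on the distribution wrapper:

import tensorflow as tf

def clipped_surrogate_loss(dist, adv, old_logp, behavior_action,
                           clip_ratio, ent_coef):
    """PPO clipped surrogate actor loss with entropy bonus (sketch)."""
    logp = tf.reshape(dist.log_prob(behavior_action), (-1, 1))
    ratio = tf.exp(logp - old_logp)
    surr1 = ratio * adv
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    pg_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
    # entropy bonus encourages exploration
    return pg_loss - ent_coef * tf.reduce_mean(dist.entropy())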
Example #4
    def build_infer_graph(self):
        self.infer_obs = tf.placeholder(tf.float32, name="infer_obs",
                                        shape=(None, ) + tuple(self.state_dim))
        init_infer_h = self.representation_network(self.obs)
        init_infer_p, init_infer_v = self.policy_network(init_infer_h)
        self.init_infer = [init_infer_p, init_infer_v, init_infer_h]

        self.conditioned_hidden = self.dynamic_network.inputs[0]
        rec_infer_h, rec_infer_r = self.dynamic_network(self.conditioned_hidden)
        rec_infer_p, rec_infer_v = self.policy_network(rec_infer_h)
        self.rec_infer = [rec_infer_h, rec_infer_r, rec_infer_p, rec_infer_v]
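
A hedged usage sketch of this inference graph: the initial step maps an observation batch to policy, value, and hidden state, and the recurrent step feeds the hidden state concatenated with a one-hot action back through the dynamics network. Here `sess`, `obs_batch`, and `one_hot_action` are assumed to exist, `model` is an instance of the class above, and `model.obs` is the observation placeholder built in the training graph (Example #7):

import numpy as np

# initial inference: observation -> (policy, value, hidden_state)
policy, value, hidden = sess.run(model.init_infer,
                                 feed_dict={model.obs: obs_batch})

# recurrent inference: (hidden_state ++ one-hot action) -> next step
conditioned = np.concatenate([hidden, one_hot_action], axis=-1)
next_hidden, reward, policy, value = sess.run(
    model.rec_infer, feed_dict={model.conditioned_hidden: conditioned})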
Example #5
    def __init__(self, output_op, session):
        """Extract variables, makeup the TFVariables class."""
        self.session = session
        if not isinstance(output_op, (list, tuple)):
            output_op = [output_op]

        track_explored_ops = set(output_op)
        to_process_queue = deque(output_op)
        to_handle_node_list = list()

        # Find the variables that the outputs depend on via BFS over op inputs.
        while len(to_process_queue) != 0:
            tf_object = to_process_queue.popleft()
            if tf_object is None:
                continue

            if hasattr(tf_object, "op"):
                tf_object = tf_object.op
            for input_op in tf_object.inputs:
                if input_op not in track_explored_ops:
                    to_process_queue.append(input_op)
                    track_explored_ops.add(input_op)

            # also traverse control inputs, keeping track of explored operations
            for control in tf_object.control_inputs:
                if control not in track_explored_ops:
                    to_process_queue.append(control)
                    track_explored_ops.add(control)

            # record ops whose type contains 'Variable' or 'VarHandle'
            if "VarHandle" in tf_object.node_def.op or "Variable" in tf_object.node_def.op:
                to_handle_node_list.append(tf_object.node_def.name)

        self.node_hub_with_order = OrderedDict()
        # go through all global variables
        for _val in tf.global_variables():
            if _val.op.node_def.name in to_handle_node_list:
                self.node_hub_with_order[_val.op.node_def.name] = _val

        self._ph, self._to_assign_node_dict = dict(), dict()

        for node_name, variable in self.node_hub_with_order.items():
            self._ph[node_name] = tf.placeholder(
                variable.value().dtype,
                variable.get_shape().as_list(),
                name="ph_{}".format(node_name))
            self._to_assign_node_dict[node_name] = variable.assign(
                self._ph[node_name])

        logging.debug("node_hub_with_order: \n{}".format(
            self.node_hub_with_order.keys()))
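
The per-variable placeholder and assign-op pairs built above make it possible to overwrite weights later without adding new ops to the graph. A hypothetical helper using them (the class's real accessor methods are not part of this excerpt) might look like:

    def set_weights(self, weight_dict):
        """Assign new values to the tracked variables (hypothetical sketch)."""
        feed_dict, assign_ops = {}, []
        for node_name, value in weight_dict.items():
            # each variable receives its value through its dedicated placeholder
            feed_dict[self._ph[node_name]] = value
            assign_ops.append(self._to_assign_node_dict[node_name])
        self.session.run(assign_ops, feed_dict=feed_dict)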
Example #6
    def create_model(self, model_info):
        """Create Deep-Q CNN network."""
        state = Input(shape=self.state_dim, dtype="int8")
        state1 = Lambda(lambda x: K.cast(x, dtype='float32') / 255.)(state)
        convlayer = Conv2D(32, (8, 8),
                           strides=(4, 4),
                           activation='relu',
                           padding='valid')(state1)
        convlayer = Conv2D(64, (4, 4),
                           strides=(2, 2),
                           activation='relu',
                           padding='valid')(convlayer)
        convlayer = Conv2D(64, (3, 3),
                           strides=(1, 1),
                           activation='relu',
                           padding='valid')(convlayer)
        flattenlayer = Flatten()(convlayer)
        denselayer = Dense(256, activation='relu')(flattenlayer)
        value = Dense(self.action_dim, activation='linear')(denselayer)
        if self.dueling:
            adv = Dense(1, activation='linear')(denselayer)
            mean = Lambda(layer_normalize)(value)
            value = Lambda(layer_add)([adv, mean])
        model = Model(inputs=state, outputs=value)
        adam = Adam(lr=self.learning_rate, clipnorm=10.)
        model.compile(loss='mse', optimizer=adam)
        if model_info.get("summary"):
            model.summary()

        self.infer_state = tf.placeholder(tf.int8,
                                          name="infer_input",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.infer_v = model(self.infer_state)
        self.actor_var = TFVariables([self.infer_v], self.sess)

        self.sess.run(tf.initialize_all_variables())
        return model
Example #7
    def build_train_graph(self):
        self.obs = tf.placeholder(self.obs_type, name="obs",
                                  shape=(None, ) + tuple(self.state_dim))
        self.action = tf.placeholder(tf.int32, name="action",
                                     shape=(None, self.td_step))
        target_value_shape = (None, ) + (1 + self.td_step, self.value_support_size)
        self.target_value = tf.placeholder(tf.float32, name="value",
                                           shape=target_value_shape)
        self.target_reward = tf.placeholder(tf.float32, name="reward",
                                            shape=(None, ) + (1 + self.td_step, self.reward_support_size))
        self.target_policy = tf.placeholder(tf.float32, name="policy",
                                            shape=(None, ) + (1 + self.td_step, self.action_dim))
        self.loss_weights = tf.placeholder(tf.float32, name="loss_weights", shape=(None, 1))

        hidden_state = self.representation_network(self.obs)
        policy_logits, value = self.policy_network(hidden_state)

        loss = cross_entropy(policy_logits, self.target_policy[:, 0], self.loss_weights)
        loss += cross_entropy(value, self.target_value[:, 0], self.loss_weights)

        gradient_scale = 1.0 / self.td_step
        for i in range(self.td_step):
            action = tf.one_hot(self.action[:, i], self.action_dim)
            action = tf.reshape(action, (-1, self.action_dim,))
            conditioned_state = tf.concat((hidden_state, action), axis=-1)
            hidden_state, reward = self.dynamic_network(conditioned_state)
            policy_logits, value = self.policy_network(hidden_state)
            hidden_state = scale_gradient(hidden_state, 0.5)

            l = cross_entropy(reward, self.target_reward[:, i], self.loss_weights)
            l += cross_entropy(policy_logits, self.target_policy[:, i + 1], self.loss_weights)
            l += cross_entropy(value, self.target_value[:, i + 1], self.loss_weights)
            loss += scale_gradient(l, gradient_scale)

        for weights in self.full_model.get_weights():
            loss += self.weight_decay * tf.nn.l2_loss(weights)
        self.loss = loss
        self.train_op = self.optimizer.minimize(loss)
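
`scale_gradient` leaves the forward pass untouched and scales only the backward pass, the usual MuZero trick for damping gradients through the unrolled dynamics. A common implementation is shown below; it is not part of this excerpt, so treat it as an assumption about the project's helper:

import tensorflow as tf

def scale_gradient(tensor, scale):
    """Forward value unchanged; gradient multiplied by `scale` (sketch)."""
    return tensor * scale + tf.stop_gradient(tensor) * (1.0 - scale)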
Example #8
    def __init__(self, model_info):
        """
        Update default model parameters with model_info.

        The full graph contains five sub-graphs, but an explorer only
        needs the explore graph. Following the least-cost principle,
        the explorer initializes just the explore graph, while the
        train process initializes the train graph.
        """
        logging.debug("init qmix model with:\n{}".format(model_info))
        model_config = model_info.get("model_config", None)

        self.model_config = model_config

        self.graph = tf.Graph()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config, graph=self.graph)
        self.sess = sess

        # start to fetch parameters
        self.gamma = model_config.get("gamma", 0.99)
        self.lr = model_config.get("lr", 0.0005)
        self.grad_norm_clip = model_config.get("grad_norm_clip", 10)

        self.n_agents = model_config["n_agents"]
        self.obs_shape = model_config["obs_shape"]
        self.rnn_hidden_dim = model_config["rnn_hidden_dim"]

        seq_limit = model_config["episode_limit"]
        self.fix_seq_length = seq_limit  # use the episode limit as the fixed sequence length

        self.n_actions = model_config["n_actions"]

        self.batch_size = model_config["batch_size"]
        self.avail_action_num = model_config["n_actions"]
        self.state_dim = int(np.prod(model_config["state_shape"]))
        self.embed_dim = model_config["mixing_embed_dim"]

        self.use_double_q = model_config.get("use_double_q", True)
        # done fetching parameters from the config

        with self.graph.as_default():
            # placeholders used with tf.Session.run
            # buffer for exploration
            # note: keep the 4-D shape consistent with the train operation
            self.ph_obs = tf.placeholder(
                tf.float32, shape=(1, 1, self.n_agents, self.obs_shape), name="obs")

            self.ph_hidden_states_in = tf.placeholder(
                tf.float32, shape=(None, self.rnn_hidden_dim), name="hidden_in")
            self.agent_outs, self.hidden_outs = None, None
            self._explore_paras = None
            self.gru_cell = None
            self.hi_out_val = None

            # placeholder for train
            self.ph_avail_action = tf.placeholder(
                tf.float32,
                shape=[
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.avail_action_num,
                ],
                name="avail_action",
            )

            self.ph_actions = tf.placeholder(
                tf.float32,
                shape=[self.batch_size, self.fix_seq_length, self.n_agents, 1],
                name="actions",
            )

            self.ph_train_obs = tf.placeholder(
                tf.float32,
                shape=(
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.obs_shape,
                ),
                name="train_obs",
            )
            self.ph_train_obs_len = tf.placeholder(
                tf.float32, shape=(None, ), name="train_obs_len")

            # eval mixer ---------------
            self.ph_train_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_stats",
            )
            # target mixer -------------------
            self.ph_train_target_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_target_stats",
            )

            self.q_tot, self.target_q_tot = None, None

            self.ph_rewards = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="rewards",
            )
            self.ph_terminated = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="terminated",
            )
            self.ph_mask = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="mask",
            )

            self.loss, self.grad_update = None, None

            # graph weights update
            self.agent_train_replace_op = None
            self.agent_explore_replace_op = None
            self.mix_train_replace_op = None

        # init graph
        self.g_type = model_info.get("scene", "explore")

        self.build_actor_graph()  # NOTE: build actor always
        if self.g_type == "train":
            self.build_train_graph()

        # note: it is important to run the initialization only once
        with self.graph.as_default():
            self.actor_var = TFVariables([self.agent_outs, self.hidden_outs], self.sess)

            self.sess.run(tf.global_variables_initializer())
            self.hi_out_val_default = self.sess.run(
                self.gru_cell.zero_state(self.n_agents, dtype=tf.float32))

            # max_to_keep defaults to 5; older checkpoints may be removed before evaluation
            self.explore_saver = tf.train.Saver({
                t.name: t for t in self._explore_paras}, max_to_keep=100,)
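
For orientation, the `model_info` consumed above needs at least the keys read from `model_config`, plus the `scene` flag that selects which sub-graph to build. A hypothetical minimal configuration; the values are illustrative placeholders, not recommendations:

model_info = {
    "scene": "train",            # "explore" builds only the actor graph
    "model_config": {
        "n_agents": 3,
        "obs_shape": 30,
        "rnn_hidden_dim": 64,
        "episode_limit": 60,     # used as the fixed sequence length
        "n_actions": 9,
        "batch_size": 32,
        "state_shape": 90,       # flattened via np.prod in __init__
        "mixing_embed_dim": 32,
        # optional keys with the defaults used in __init__
        "gamma": 0.99,
        "lr": 0.0005,
        "grad_norm_clip": 10,
        "use_double_q": True,
    },
}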
Example #9
    def create_model(self, model_info):
        """Create Deep-Q network."""

        user_input = Input(shape=(self.user_dim,), name="user_input", dtype=self.input_type)
        history_click_input = Input(
            shape=(self.n_history_click * self.item_dim), name="history_click",
            dtype=self.input_type
        )
        history_no_click_input = Input(
            shape=(self.n_history_no_click * self.item_dim), name="history_no_click",
            dtype=self.input_type
        )
        item_input = Input(shape=(self.item_dim,), name="item_input", dtype=self.input_type)
        shared_embedding = Embedding(
            self.vocab_size,
            self.emb_dim,
            name="Emb",
            mask_zero=True,
            embeddings_initializer=self.embedding_initializer,
            trainable=False,
        )  # un-trainable
        gru_click = GRU(self.item_dim * self.emb_dim)
        gru_no_click = GRU(self.item_dim * self.emb_dim)

        user_feature = Flatten()(shared_embedding(user_input))
        item_feature = Flatten()(shared_embedding(item_input))

        history_click_feature = Reshape(
            (self.n_history_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_click_input))
        history_click_feature = gru_click(history_click_feature)

        history_no_click_feature = Reshape(
            (self.n_history_no_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_no_click_input))
        history_no_click_feature = gru_no_click(history_no_click_feature)

        x = concatenate(
            [
                user_feature,
                history_click_feature,
                history_no_click_feature,
                item_feature,
            ]
        )
        x_dense1 = Dense(128, activation="relu")(x)
        x_dense2 = Dense(128, activation="relu")(x_dense1)
        # ctr_pred = Dense(1, activation="linear", name="q_value")(x_dense2)
        ctr_pred = Dense(1, activation=self.last_act, name="q_value")(x_dense2)
        model = Model(
            inputs=[
                user_input,
                history_click_input,
                history_no_click_input,
                item_input,
            ],
            outputs=ctr_pred,
        )
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        if self._summary:
            model.summary()

        self.user_input = tf.placeholder(
            dtype=self.input_type, name="user_input", shape=(None, self.user_dim)
        )
        self.history_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_click_input",
            shape=(None, self.n_history_click * self.item_dim),
        )
        self.history_no_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_no_click_input",
            shape=(None, self.n_history_no_click * self.item_dim),
        )
        self.item_input = tf.placeholder(
            dtype=self.input_type, name="item_input", shape=(None, self.item_dim)
        )

        self.ctr_predict = model(
            [
                self.user_input,
                self.history_click_input,
                self.history_no_click_input,
                self.item_input,
            ]
        )
        self.actor_var = TFVariables([self.ctr_predict], self.sess)

        self.sess.run(tf.initialize_all_variables())
        return model
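
A hedged inference sketch for the graph above: the four placeholders are fed together and the wired-up model output is evaluated through the session. The `predict` helper below is hypothetical and not part of the excerpt:

    def predict(self, user_batch, click_batch, no_click_batch, item_batch):
        """Evaluate Q-values (CTR predictions) for a batch (hypothetical)."""
        return self.sess.run(
            self.ctr_predict,
            feed_dict={
                self.user_input: user_batch,
                self.history_click_input: click_batch,
                self.history_no_click_input: no_click_batch,
                self.item_input: item_batch,
            },
        )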
Example #10
    def create_model(self, model_info):
        self.ph_state = tf.placeholder(self.input_dtype,
                                       shape=(None, *self.state_dim),
                                       name="state_input")

        with tf.variable_scope("explore_agent"):
            state_input = Lambda(self._transform)(self.ph_state)
            last_layer = state_input

            for (out_size, kernel, stride) in self.filter_arch[:-1]:
                last_layer = Conv2D(
                    out_size,
                    (kernel, kernel),
                    strides=(stride, stride),
                    activation="relu",
                    padding="same",
                )(last_layer)

            # last convolution
            (out_size, kernel, stride) = self.filter_arch[-1]
            convolution_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="valid",
            )(last_layer)

            self.pi_logic_outs = tf.squeeze(
                Conv2D(self.action_dim, (1, 1),
                       padding="same")(convolution_layer),
                axis=[1, 2],
            )

            baseline_flat = Flatten()(convolution_layer)
            self.baseline = tf.squeeze(
                tf.layers.dense(
                    inputs=baseline_flat,
                    units=1,
                    activation=None,
                    kernel_initializer=custom_norm_initializer(0.01),
                ),
                1,
            )
            self.out_actions = tf.squeeze(
                tf.multinomial(self.pi_logic_outs,
                               num_samples=1,
                               output_dtype=tf.int32),
                1,
                name="out_action",
            )

        # create learner
        self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                               shape=(None, self.action_dim),
                                               name="ph_b_logits")

        self.ph_actions = tf.placeholder(tf.int32,
                                         shape=(None, ),
                                         name="ph_action")
        self.ph_dones = tf.placeholder(tf.bool,
                                       shape=(None, ),
                                       name="ph_dones")
        self.ph_rewards = tf.placeholder(self.dtype,
                                         shape=(None, ),
                                         name="ph_rewards")

        # Split the tensor into batches at known episode cut boundaries.
        # [batch_count * batch_step] -> [batch_step, batch_count]
        batch_step = self.sample_batch_steps

        def split_batches(tensor, drop_last=False):
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step],
                           tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap B and T axes
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res

        self.loss = vtrace_loss(
            bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
            tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
            actions=split_batches(self.ph_actions, drop_last=True),
            discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) *
                                    GAMMA,
                                    drop_last=True),
            rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                                  drop_last=True),
            values=split_batches(self.baseline, drop_last=True),
            bootstrap_value=split_batches(self.baseline)[-1],
        )

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
        if self.opt_type == "adam":
            if self.lr_schedule:
                learning_rate = self._get_lr(global_step)
            else:
                learning_rate = LR
            optimizer = AdamOptimizer(learning_rate)
        elif self.opt_type == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(LR,
                                                  decay=0.99,
                                                  epsilon=0.1,
                                                  centered=True)
        else:
            raise KeyError("invalid opt_type: {}".format(self.opt_type))

        grads_and_vars = optimizer.compute_gradients(self.loss)

        # clip gradients by global norm
        grads, var = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
        clipped_gvs = list(zip(grads, var))

        self.train_op = optimizer.apply_gradients(clipped_gvs,
                                                  global_step=global_step)

        # fixme: expose the learning rate during training
        self.lr = optimizer._lr

        self.actor_var = TFVariables(self.out_actions, self.sess)

        self.sess.run(global_variables_initializer())

        self.explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

        self.saver = Saver({t.name: t
                            for t in self.explore_paras},
                           max_to_keep=self.max_to_keep)

        return True
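
`split_batches` regroups flat rollout tensors of shape [batch_count * batch_step, ...] into [batch_step, batch_count, ...] so V-trace can scan along the time axis. A small standalone check of the same reshape-then-transpose, written against plain TF1 ops with illustrative names:

import numpy as np
import tensorflow as tf

batch_step = 4                       # steps per rollout fragment
flat = tf.placeholder(tf.float32, shape=(None, ), name="flat")

# [batch_count * batch_step] -> [batch_count, batch_step] -> [batch_step, batch_count]
by_time = tf.transpose(tf.reshape(flat, (-1, batch_step)))

with tf.Session() as sess:
    out = sess.run(by_time, feed_dict={flat: np.arange(8, dtype=np.float32)})
    # two rollouts of four steps each -> shape (4, 2); columns are rollouts
    print(out.shape)  # (4, 2)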