def create_model(self, model_info): """Create Deep-Q network.""" state = Input(shape=self.state_dim) denselayer = Dense(HIDDEN_SIZE, activation='relu')(state) for _ in range(NUM_LAYERS - 1): denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer) value = Dense(self.action_dim, activation='linear')(denselayer) if self.dueling: adv = Dense(1, activation='linear')(denselayer) mean = Lambda(layer_normalize)(value) value = Lambda(layer_add)([adv, mean]) model = Model(inputs=state, outputs=value) adam = Adam(lr=self.learning_rate) model.compile(loss='mse', optimizer=adam) self.infer_state = tf.placeholder(tf.float32, name="infer_input", shape=(None, ) + tuple(self.state_dim)) self.infer_v = model(self.infer_state) self.actor_var = TFVariables([self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def create_model(self, model_info): """Create keras model.""" state_input = Input(shape=self.state_dim, name='state_input') advantage = Input(shape=(1, ), name='adv') denselayer = Dense(HIDDEN_SIZE, activation='relu')(state_input) for _ in range(NUM_LAYERS - 1): denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer) out_actions = Dense(self.action_dim, activation='softmax', name='output_actions')(denselayer) # y_pred out_value = Dense(1, name='output_value')(denselayer) model = Model(inputs=[state_input, advantage], outputs=[out_actions, out_value]) losses = { "output_actions": impala_loss(advantage), "output_value": 'mse' } lossweights = {"output_actions": 1.0, "output_value": .5} model.compile(optimizer=Adam(lr=LR), loss=losses, loss_weights=lossweights) self.infer_state = tf.placeholder(tf.float32, name="infer_state", shape=(None, ) + tuple(self.state_dim)) self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1)) self.infer_p, self.infer_v = model([self.infer_state, self.adv]) self.actor_var = TFVariables([self.infer_p, self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def build_graph(self, input_type, model):
    # pylint: disable=W0201
    self.state_ph = tf.placeholder(input_type, name='state',
                                   shape=(None, *self.state_dim))
    self.old_logp_ph = tf.placeholder(tf.float32, name='old_log_p',
                                      shape=(None, 1))
    self.adv_ph = tf.placeholder(tf.float32, name='advantage',
                                 shape=(None, 1))
    self.old_v_ph = tf.placeholder(tf.float32, name='old_v',
                                   shape=(None, 1))
    self.target_v_ph = tf.placeholder(tf.float32, name='target_value',
                                      shape=(None, 1))
    pi_latent, self.out_v = model(self.state_ph)

    if self.action_type == 'Categorical':
        self.behavior_action_ph = tf.placeholder(tf.int32, name='behavior_action',
                                                 shape=(None, ))
        dist_param = pi_latent
    elif self.action_type == 'DiagGaussian':
        # fixme: add input-dependent log_std logic
        self.behavior_action_ph = tf.placeholder(tf.float32, name='real_action',
                                                 shape=(None, self.action_dim))
        log_std = tf.get_variable('pi_logstd', shape=(1, self.action_dim),
                                  initializer=tf.zeros_initializer())
        # broadcast the shared log_std to the batch dimension of pi_latent
        dist_param = tf.concat([pi_latent, pi_latent * 0.0 + log_std], axis=-1)
    else:
        raise NotImplementedError(
            'action type: {} does not match any implemented distribution.'.format(
                self.action_type))

    self.dist.init_by_param(dist_param)
    self.action = self.dist.sample()
    self.action_log_prob = self.dist.log_prob(self.action)
    self.actor_var = TFVariables([self.action_log_prob, self.out_v], self.sess)

    self.actor_loss = actor_loss_with_entropy(self.dist, self.adv_ph,
                                              self.old_logp_ph,
                                              self.behavior_action_ph,
                                              self.clip_ratio, self.ent_coef)
    self.critic_loss = critic_loss(self.target_v_ph, self.out_v,
                                   self.old_v_ph, self.vf_clip)
    self.loss = self.actor_loss + self.critic_loss_coef * self.critic_loss
    self.train_op = self.build_train_op(self.loss)

    self.sess.run(tf.global_variables_initializer())
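# `actor_loss_with_entropy` and `critic_loss` are repo helpers not shown in this
# section. As a rough illustration only, a PPO-style clipped surrogate of the kind
# such an actor loss typically computes is sketched below; the function name, the
# dist.entropy() call, and the tensor shapes are assumptions, not the repo's code.
def clipped_actor_loss_sketch(dist, adv_ph, old_logp_ph, behavior_action_ph,
                              clip_ratio, ent_coef):
    """Hypothetical PPO actor loss: clipped surrogate minus an entropy bonus."""
    logp = dist.log_prob(behavior_action_ph)   # log pi(a|s) under the current policy
    ratio = tf.exp(logp - old_logp_ph)         # importance ratio pi / pi_old
    surr1 = ratio * adv_ph
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv_ph
    pg_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
    return pg_loss - ent_coef * tf.reduce_mean(dist.entropy())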
def create_model(self, model_info): """Create Deep-Q CNN network.""" state = Input(shape=self.state_dim, dtype="int8") state1 = Lambda(lambda x: K.cast(x, dtype='float32') / 255.)(state) convlayer = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', padding='valid')(state1) convlayer = Conv2D(64, (4, 4), strides=(2, 2), activation='relu', padding='valid')(convlayer) convlayer = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='valid')(convlayer) flattenlayer = Flatten()(convlayer) denselayer = Dense(256, activation='relu')(flattenlayer) value = Dense(self.action_dim, activation='linear')(denselayer) if self.dueling: adv = Dense(1, activation='linear')(denselayer) mean = Lambda(layer_normalize)(value) value = Lambda(layer_add)([adv, mean]) model = Model(inputs=state, outputs=value) adam = Adam(lr=self.learning_rate, clipnorm=10.) model.compile(loss='mse', optimizer=adam) if model_info.get("summary"): model.summary() self.infer_state = tf.placeholder(tf.int8, name="infer_input", shape=(None, ) + tuple(self.state_dim)) self.infer_v = model(self.infer_state) self.actor_var = TFVariables([self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def create_model(self, model_info): """Create Deep-Q network.""" user_input = Input(shape=(self.user_dim,), name="user_input", dtype=self.input_type) history_click_input = Input( shape=(self.n_history_click * self.item_dim), name="history_click", dtype=self.input_type ) history_no_click_input = Input( shape=(self.n_history_no_click * self.item_dim), name="history_no_click", dtype=self.input_type ) item_input = Input(shape=(self.item_dim,), name="item_input", dtype=self.input_type) shared_embedding = Embedding( self.vocab_size, self.emb_dim, name="Emb", mask_zero=True, embeddings_initializer=self.embedding_initializer, trainable=False, ) # un-trainable gru_click = GRU(self.item_dim * self.emb_dim) gru_no_click = GRU(self.item_dim * self.emb_dim) user_feature = Flatten()(shared_embedding(user_input)) item_feature = Flatten()(shared_embedding(item_input)) history_click_feature = Reshape( (self.n_history_click, self.item_dim * self.emb_dim) )(shared_embedding(history_click_input)) history_click_feature = gru_click(history_click_feature) history_no_click_feature = Reshape( (self.n_history_no_click, self.item_dim * self.emb_dim) )(shared_embedding(history_no_click_input)) history_no_click_feature = gru_no_click(history_no_click_feature) x = concatenate( [ user_feature, history_click_feature, history_no_click_feature, item_feature, ] ) x_dense1 = Dense(128, activation="relu")(x) x_dense2 = Dense(128, activation="relu")(x_dense1) # ctr_pred = Dense(1, activation="linear", name="q_value")(x_dense2) ctr_pred = Dense(1, activation=self.last_act, name="q_value")(x_dense2) model = Model( inputs=[ user_input, history_click_input, history_no_click_input, item_input, ], outputs=ctr_pred, ) model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate)) if self._summary: model.summary() self.user_input = tf.placeholder( dtype=self.input_type, name="user_input", shape=(None, self.user_dim) ) self.history_click_input = tf.placeholder( dtype=self.input_type, name="history_click_input", shape=(None, self.n_history_click * self.item_dim), ) self.history_no_click_input = tf.placeholder( dtype=self.input_type, name="history_no_click_input", shape=(None, self.n_history_no_click * self.item_dim), ) self.item_input = tf.placeholder( dtype=self.input_type, name="item_input", shape=(None, self.item_dim) ) self.ctr_predict = model( [ self.user_input, self.history_click_input, self.history_no_click_input, self.item_input, ] ) self.actor_var = TFVariables([self.ctr_predict], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def build_graph(self):
    self.build_train_graph()
    self.build_infer_graph()
    self.sess.run(tf.global_variables_initializer())