def create_model(self, model_info): """Create keras model.""" state_input = Input(shape=self.state_dim, name='state_input') advantage = Input(shape=(1, ), name='adv') denselayer = Dense(HIDDEN_SIZE, activation='relu')(state_input) for _ in range(NUM_LAYERS - 1): denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer) out_actions = Dense(self.action_dim, activation='softmax', name='output_actions')(denselayer) # y_pred out_value = Dense(1, name='output_value')(denselayer) model = Model(inputs=[state_input, advantage], outputs=[out_actions, out_value]) losses = { "output_actions": impala_loss(advantage), "output_value": 'mse' } lossweights = {"output_actions": 1.0, "output_value": .5} model.compile(optimizer=Adam(lr=LR), loss=losses, loss_weights=lossweights) self.infer_state = tf.placeholder(tf.float32, name="infer_state", shape=(None, ) + tuple(self.state_dim)) self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1)) self.infer_p, self.infer_v = model([self.infer_state, self.adv]) self.actor_var = TFVariables([self.infer_p, self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def create_model(self, model_info): """Create Deep-Q network.""" state = Input(shape=self.state_dim) denselayer = Dense(HIDDEN_SIZE, activation='relu')(state) for _ in range(NUM_LAYERS - 1): denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer) value = Dense(self.action_dim, activation='linear')(denselayer) if self.dueling: adv = Dense(1, activation='linear')(denselayer) mean = Lambda(layer_normalize)(value) value = Lambda(layer_add)([adv, mean]) model = Model(inputs=state, outputs=value) adam = Adam(lr=self.learning_rate) model.compile(loss='mse', optimizer=adam) self.infer_state = tf.placeholder(tf.float32, name="infer_input", shape=(None, ) + tuple(self.state_dim)) self.infer_v = model(self.infer_state) self.actor_var = TFVariables([self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def build_graph(self, input_type, model):
    # pylint: disable=W0201
    self.state_ph = tf.placeholder(input_type, name='state',
                                   shape=(None, *self.state_dim))
    self.old_logp_ph = tf.placeholder(tf.float32, name='old_log_p',
                                      shape=(None, 1))
    self.adv_ph = tf.placeholder(tf.float32, name='advantage',
                                 shape=(None, 1))
    self.old_v_ph = tf.placeholder(tf.float32, name='old_v',
                                   shape=(None, 1))
    self.target_v_ph = tf.placeholder(tf.float32, name='target_value',
                                      shape=(None, 1))

    pi_latent, self.out_v = model(self.state_ph)

    if self.action_type == 'Categorical':
        self.behavior_action_ph = tf.placeholder(tf.int32,
                                                 name='behavior_action',
                                                 shape=(None, ))
        dist_param = pi_latent
    elif self.action_type == 'DiagGaussian':
        # fixme: add input-dependent log_std logic
        self.behavior_action_ph = tf.placeholder(tf.float32,
                                                 name='real_action',
                                                 shape=(None, self.action_dim))
        log_std = tf.get_variable('pi_logstd',
                                  shape=(1, self.action_dim),
                                  initializer=tf.zeros_initializer())
        dist_param = tf.concat([pi_latent, pi_latent * 0.0 + log_std], axis=-1)
    else:
        raise NotImplementedError(
            'action type: {} does not match any implemented '
            'distribution.'.format(self.action_type))

    self.dist.init_by_param(dist_param)
    self.action = self.dist.sample()
    self.action_log_prob = self.dist.log_prob(self.action)
    self.actor_var = TFVariables([self.action_log_prob, self.out_v], self.sess)

    self.actor_loss = actor_loss_with_entropy(self.dist, self.adv_ph,
                                              self.old_logp_ph,
                                              self.behavior_action_ph,
                                              self.clip_ratio, self.ent_coef)
    self.critic_loss = critic_loss(self.target_v_ph, self.out_v,
                                   self.old_v_ph, self.vf_clip)
    self.loss = self.actor_loss + self.critic_loss_coef * self.critic_loss
    self.train_op = self.build_train_op(self.loss)

    self.sess.run(tf.initialize_all_variables())
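# In the DiagGaussian branch above, `pi_latent * 0.0 + log_std` is a broadcast
# trick: it tiles the (1, action_dim) log_std variable to pi_latent's batch
# dimension, so the concatenated distribution parameters have shape
# (batch, 2 * action_dim) = [mean | log_std]. A small numpy analogue of that
# step, for illustration only (the names below are not part of the model):
def example_diag_gaussian_param_shape():
    """Numpy analogue of the log_std broadcast used for DiagGaussian."""
    import numpy as np
    pi_latent = np.random.randn(4, 2).astype(np.float32)  # batch=4, action_dim=2 means
    log_std = np.zeros((1, 2), dtype=np.float32)           # one shared log-std per action dim
    dist_param = np.concatenate([pi_latent, pi_latent * 0.0 + log_std], axis=-1)
    assert dist_param.shape == (4, 4)  # [mean | log_std]
    return dist_param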
def build_infer_graph(self):
    self.infer_obs = tf.placeholder(tf.float32, name="infer_obs",
                                    shape=(None, ) + tuple(self.state_dim))

    init_infer_h = self.representation_network(self.infer_obs)
    init_infer_p, init_infer_v = self.policy_network(init_infer_h)
    self.init_infer = [init_infer_p, init_infer_v, init_infer_h]

    # reuse the dynamic network's own input tensor as the conditioned hidden state
    self.conditioned_hidden = self.dynamic_network.inputs[0]
    rec_infer_h, rec_infer_r = self.dynamic_network(self.conditioned_hidden)
    rec_infer_p, rec_infer_v = self.policy_network(rec_infer_h)
    self.rec_infer = [rec_infer_h, rec_infer_r, rec_infer_p, rec_infer_v]
def __init__(self, output_op, session):
    """Extract the variables that the given output ops depend on."""
    self.session = session
    if not isinstance(output_op, (list, tuple)):
        output_op = [output_op]

    track_explored_ops = set(output_op)
    to_process_queue = deque(output_op)
    to_handle_node_list = list()

    # Find the dependency variables with a BFS that walks backwards from
    # the output ops through their inputs.
    while len(to_process_queue) != 0:
        tf_object = to_process_queue.popleft()
        if tf_object is None:
            continue
        if hasattr(tf_object, "op"):
            tf_object = tf_object.op
        for input_op in tf_object.inputs:
            if input_op not in track_explored_ops:
                to_process_queue.append(input_op)
                track_explored_ops.add(input_op)

        # keep track of explored operations, including control dependencies
        for control in tf_object.control_inputs:
            if control not in track_explored_ops:
                to_process_queue.append(control)
                track_explored_ops.add(control)

        # collect the ops whose type contains 'Variable' or 'VarHandle'
        if "VarHandle" in tf_object.node_def.op or \
                "Variable" in tf_object.node_def.op:
            to_handle_node_list.append(tf_object.node_def.name)

    self.node_hub_with_order = OrderedDict()
    # go through the whole list of global variables
    for _val in tf.global_variables():
        if _val.op.node_def.name in to_handle_node_list:
            self.node_hub_with_order[_val.op.node_def.name] = _val

    self._ph, self._to_assign_node_dict = dict(), dict()
    for node_name, variable in self.node_hub_with_order.items():
        self._ph[node_name] = tf.placeholder(
            variable.value().dtype,
            variable.get_shape().as_list(),
            name="ph_{}".format(node_name))
        self._to_assign_node_dict[node_name] = variable.assign(
            self._ph[node_name])

    logging.debug("node_hub_with_order: \n{}".format(
        self.node_hub_with_order.keys()))
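# A tiny, self-contained demonstration of the same backwards BFS, assuming
# TF 1.x and a throwaway graph. It walks op.inputs and op.control_inputs from
# an output tensor and reports which variable ops feed it, mirroring how the
# constructor above discovers the variables behind `output_op`. The names
# here are illustrative only.
def example_find_feeding_variables():
    """Return the variable op names that a small output tensor depends on."""
    graph = tf.Graph()
    with graph.as_default():
        x = tf.placeholder(tf.float32, shape=(None, 3), name="x")
        w = tf.get_variable("w", shape=(3, 2))
        b = tf.get_variable("b", shape=(2, ))
        y = tf.matmul(x, w) + b

        queue, seen, found = deque([y]), {y}, []
        while queue:
            obj = queue.popleft()
            op = obj.op if hasattr(obj, "op") else obj
            if "Variable" in op.node_def.op or "VarHandle" in op.node_def.op:
                found.append(op.node_def.name)
            for upstream in list(op.inputs) + list(op.control_inputs):
                if upstream not in seen:
                    seen.add(upstream)
                    queue.append(upstream)
    return found  # expected to contain 'w' and 'b'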
def create_model(self, model_info): """Create Deep-Q CNN network.""" state = Input(shape=self.state_dim, dtype="int8") state1 = Lambda(lambda x: K.cast(x, dtype='float32') / 255.)(state) convlayer = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', padding='valid')(state1) convlayer = Conv2D(64, (4, 4), strides=(2, 2), activation='relu', padding='valid')(convlayer) convlayer = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='valid')(convlayer) flattenlayer = Flatten()(convlayer) denselayer = Dense(256, activation='relu')(flattenlayer) value = Dense(self.action_dim, activation='linear')(denselayer) if self.dueling: adv = Dense(1, activation='linear')(denselayer) mean = Lambda(layer_normalize)(value) value = Lambda(layer_add)([adv, mean]) model = Model(inputs=state, outputs=value) adam = Adam(lr=self.learning_rate, clipnorm=10.) model.compile(loss='mse', optimizer=adam) if model_info.get("summary"): model.summary() self.infer_state = tf.placeholder(tf.int8, name="infer_input", shape=(None, ) + tuple(self.state_dim)) self.infer_v = model(self.infer_state) self.actor_var = TFVariables([self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def build_train_graph(self):
    self.obs = tf.placeholder(self.obs_type, name="obs",
                              shape=(None, ) + tuple(self.state_dim))
    self.action = tf.placeholder(tf.int32, name="action",
                                 shape=(None, self.td_step))
    target_value_shape = (None, ) + (1 + self.td_step,
                                     self.value_support_size)
    self.target_value = tf.placeholder(tf.float32, name="value",
                                       shape=target_value_shape)
    self.target_reward = tf.placeholder(
        tf.float32, name="reward",
        shape=(None, ) + (1 + self.td_step, self.reward_support_size))
    self.target_policy = tf.placeholder(
        tf.float32, name="policy",
        shape=(None, ) + (1 + self.td_step, self.action_dim))
    self.loss_weights = tf.placeholder(tf.float32, name="loss_weights",
                                       shape=(None, 1))

    hidden_state = self.representation_network(self.obs)
    policy_logits, value = self.policy_network(hidden_state)

    # loss of the initial (k = 0) inference
    loss = cross_entropy(policy_logits, self.target_policy[:, 0],
                         self.loss_weights)
    loss += cross_entropy(value, self.target_value[:, 0], self.loss_weights)

    # unroll the dynamics for td_step steps and accumulate the losses
    gradient_scale = 1.0 / self.td_step
    for i in range(self.td_step):
        action = tf.one_hot(self.action[:, i], self.action_dim)
        action = tf.reshape(action, (-1, self.action_dim, ))
        conditioned_state = tf.concat((hidden_state, action), axis=-1)
        hidden_state, reward = self.dynamic_network(conditioned_state)
        policy_logits, value = self.policy_network(hidden_state)
        # halve the gradient flowing back through the recurrent hidden state
        hidden_state = scale_gradient(hidden_state, 0.5)

        step_loss = cross_entropy(reward, self.target_reward[:, i],
                                  self.loss_weights)
        step_loss += cross_entropy(policy_logits,
                                   self.target_policy[:, i + 1],
                                   self.loss_weights)
        step_loss += cross_entropy(value, self.target_value[:, i + 1],
                                   self.loss_weights)
        loss += scale_gradient(step_loss, gradient_scale)

    # l2 weight decay over the model's trainable weights
    for weights in self.full_model.trainable_weights:
        loss += self.weight_decay * tf.nn.l2_loss(weights)

    self.loss = loss
    self.train_op = self.optimizer.minimize(loss)
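# scale_gradient is used above both to halve the gradient flowing through the
# recurrent hidden state and to scale each unroll step's loss by 1 / td_step.
# Its definition lives elsewhere in the repo; a common way to implement such a
# helper (an assumption here, not necessarily the repo's exact code) is to mix
# the tensor with a stop_gradient copy, which leaves the forward value
# unchanged while multiplying the backward gradient by `scale`:
def example_scale_gradient(tensor, scale):
    """Forward pass returns `tensor` unchanged; the gradient is scaled by `scale`."""
    return tensor * scale + tf.stop_gradient(tensor) * (1.0 - scale)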
def __init__(self, model_info):
    """Update the default model parameters with model_info.

    The full graph contains five sub-graphs, but the explorer only needs
    the explore graph. Following the least-cost principle, the explorer
    initializes just the explore graph, while the train process also
    initializes the train graph.
    """
    logging.debug("init qmix model with:\n{}".format(model_info))
    model_config = model_info.get("model_config", None)
    self.model_config = model_config

    self.graph = tf.Graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config, graph=self.graph)
    self.sess = sess

    # fetch parameters from the configuration
    self.gamma = model_config.get("gamma", 0.99)
    self.lr = model_config.get("lr", 0.0005)
    self.grad_norm_clip = model_config.get("grad_norm_clip", 10)

    self.n_agents = model_config["n_agents"]
    self.obs_shape = model_config["obs_shape"]
    self.rnn_hidden_dim = model_config["rnn_hidden_dim"]

    seq_limit = model_config["episode_limit"]
    self.fix_seq_length = seq_limit  # use the episode limit as the fixed length

    self.n_actions = model_config["n_actions"]
    self.batch_size = model_config["batch_size"]
    self.avail_action_num = model_config["n_actions"]
    self.state_dim = int(np.prod(model_config["state_shape"]))
    self.embed_dim = model_config["mixing_embed_dim"]
    self.use_double_q = model_config.get("use_double_q", True)

    with self.graph.as_default():
        # placeholders used with tf.sess.run

        # buffer for explore
        # note: the 4-D shape keeps the same meaning as in the train operation!
        self.ph_obs = tf.placeholder(
            tf.float32, shape=(1, 1, self.n_agents, self.obs_shape),
            name="obs")
        self.ph_hidden_states_in = tf.placeholder(
            tf.float32, shape=(None, self.rnn_hidden_dim), name="hidden_in")

        self.agent_outs, self.hidden_outs = None, None
        self._explore_paras = None
        self.gru_cell = None
        self.hi_out_val = None

        # placeholders for train
        self.ph_avail_action = tf.placeholder(
            tf.float32,
            shape=[
                self.batch_size,
                self.fix_seq_length + 1,
                self.n_agents,
                self.avail_action_num,
            ],
            name="avail_action",
        )
        self.ph_actions = tf.placeholder(
            tf.float32,
            shape=[self.batch_size, self.fix_seq_length, self.n_agents, 1],
            name="actions",
        )
        self.ph_train_obs = tf.placeholder(
            tf.float32,
            shape=(
                self.batch_size,
                self.fix_seq_length + 1,
                self.n_agents,
                self.obs_shape,
            ),
            name="train_obs",
        )
        self.ph_train_obs_len = tf.placeholder(
            tf.float32, shape=(None, ), name="train_obs_len")

        # eval mixer ---------------
        self.ph_train_states = tf.placeholder(
            tf.float32,
            shape=(self.batch_size, self.fix_seq_length, self.state_dim),
            name="train_stats",
        )
        # target mixer -------------------
        self.ph_train_target_states = tf.placeholder(
            tf.float32,
            shape=(self.batch_size, self.fix_seq_length, self.state_dim),
            name="train_target_stats",
        )

        self.q_tot, self.target_q_tot = None, None

        self.ph_rewards = tf.placeholder(
            tf.float32,
            shape=(self.batch_size, self.fix_seq_length, 1),
            name="rewards",
        )
        self.ph_terminated = tf.placeholder(
            tf.float32,
            shape=(self.batch_size, self.fix_seq_length, 1),
            name="terminated",
        )
        self.ph_mask = tf.placeholder(
            tf.float32,
            shape=(self.batch_size, self.fix_seq_length, 1),
            name="mask",
        )

        self.loss, self.grad_update = None, None

        # graph weight update ops
        self.agent_train_replace_op = None
        self.agent_explore_replace_op = None
        self.mix_train_replace_op = None

    # init graph
    self.g_type = model_info.get("scene", "explore")
    self.build_actor_graph()  # NOTE: always build the actor graph
    if self.g_type == "train":
        self.build_train_graph()

    # note: it is important to run the initialization below only once!
    with self.graph.as_default():
        self.actor_var = TFVariables([self.agent_outs, self.hidden_outs],
                                     self.sess)
        self.sess.run(tf.global_variables_initializer())

        self.hi_out_val_default = self.sess.run(
            self.gru_cell.zero_state(self.n_agents, dtype=tf.float32))

        # the default max_to_keep is 5, which could drop checkpoints that
        # are still needed for evaluation, so keep more of them
        self.explore_saver = tf.train.Saver(
            {t.name: t for t in self._explore_paras}, max_to_keep=100, )
def create_model(self, model_info): """Create Deep-Q network.""" user_input = Input(shape=(self.user_dim,), name="user_input", dtype=self.input_type) history_click_input = Input( shape=(self.n_history_click * self.item_dim), name="history_click", dtype=self.input_type ) history_no_click_input = Input( shape=(self.n_history_no_click * self.item_dim), name="history_no_click", dtype=self.input_type ) item_input = Input(shape=(self.item_dim,), name="item_input", dtype=self.input_type) shared_embedding = Embedding( self.vocab_size, self.emb_dim, name="Emb", mask_zero=True, embeddings_initializer=self.embedding_initializer, trainable=False, ) # un-trainable gru_click = GRU(self.item_dim * self.emb_dim) gru_no_click = GRU(self.item_dim * self.emb_dim) user_feature = Flatten()(shared_embedding(user_input)) item_feature = Flatten()(shared_embedding(item_input)) history_click_feature = Reshape( (self.n_history_click, self.item_dim * self.emb_dim) )(shared_embedding(history_click_input)) history_click_feature = gru_click(history_click_feature) history_no_click_feature = Reshape( (self.n_history_no_click, self.item_dim * self.emb_dim) )(shared_embedding(history_no_click_input)) history_no_click_feature = gru_no_click(history_no_click_feature) x = concatenate( [ user_feature, history_click_feature, history_no_click_feature, item_feature, ] ) x_dense1 = Dense(128, activation="relu")(x) x_dense2 = Dense(128, activation="relu")(x_dense1) # ctr_pred = Dense(1, activation="linear", name="q_value")(x_dense2) ctr_pred = Dense(1, activation=self.last_act, name="q_value")(x_dense2) model = Model( inputs=[ user_input, history_click_input, history_no_click_input, item_input, ], outputs=ctr_pred, ) model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate)) if self._summary: model.summary() self.user_input = tf.placeholder( dtype=self.input_type, name="user_input", shape=(None, self.user_dim) ) self.history_click_input = tf.placeholder( dtype=self.input_type, name="history_click_input", shape=(None, self.n_history_click * self.item_dim), ) self.history_no_click_input = tf.placeholder( dtype=self.input_type, name="history_no_click_input", shape=(None, self.n_history_no_click * self.item_dim), ) self.item_input = tf.placeholder( dtype=self.input_type, name="item_input", shape=(None, self.item_dim) ) self.ctr_predict = model( [ self.user_input, self.history_click_input, self.history_no_click_input, self.item_input, ] ) self.actor_var = TFVariables([self.ctr_predict], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def create_model(self, model_info):
    self.ph_state = tf.placeholder(self.input_dtype,
                                   shape=(None, *self.state_dim),
                                   name="state_input")

    with tf.variable_scope("explore_agent"):
        state_input = Lambda(self._transform)(self.ph_state)
        last_layer = state_input

        for (out_size, kernel, stride) in self.filter_arch[:-1]:
            last_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="same",
            )(last_layer)

        # last convolution
        (out_size, kernel, stride) = self.filter_arch[-1]
        convolution_layer = Conv2D(
            out_size,
            (kernel, kernel),
            strides=(stride, stride),
            activation="relu",
            padding="valid",
        )(last_layer)

        self.pi_logic_outs = tf.squeeze(
            Conv2D(self.action_dim, (1, 1), padding="same")(convolution_layer),
            axis=[1, 2],
        )

        baseline_flat = Flatten()(convolution_layer)
        self.baseline = tf.squeeze(
            tf.layers.dense(
                inputs=baseline_flat,
                units=1,
                activation=None,
                kernel_initializer=custom_norm_initializer(0.01),
            ),
            1,
        )
        self.out_actions = tf.squeeze(
            tf.multinomial(self.pi_logic_outs, num_samples=1,
                           output_dtype=tf.int32),
            1,
            name="out_action",
        )

    # create learner
    self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                           shape=(None, self.action_dim),
                                           name="ph_b_logits")
    self.ph_actions = tf.placeholder(tf.int32, shape=(None, ),
                                     name="ph_action")
    self.ph_dones = tf.placeholder(tf.bool, shape=(None, ), name="ph_dones")
    self.ph_rewards = tf.placeholder(self.dtype, shape=(None, ),
                                     name="ph_rewards")

    # Split the tensor into batches at known episode cut boundaries.
    # [batch_count * batch_step] -> [batch_step, batch_count]
    batch_step = self.sample_batch_steps

    def split_batches(tensor, drop_last=False):
        batch_count = tf.shape(tensor)[0] // batch_step
        reshape_tensor = tf.reshape(
            tensor,
            tf.concat([[batch_count, batch_step], tf.shape(tensor)[1:]],
                      axis=0),
        )

        # swap B and T axes
        res = tf.transpose(
            reshape_tensor,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
        )

        if drop_last:
            return res[:-1]
        return res

    self.loss = vtrace_loss(
        bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
        tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
        actions=split_batches(self.ph_actions, drop_last=True),
        discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) * GAMMA,
                                drop_last=True),
        rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                              drop_last=True),
        values=split_batches(self.baseline, drop_last=True),
        bootstrap_value=split_batches(self.baseline)[-1],
    )

    global_step = tf.Variable(0, trainable=False, dtype=tf.int32)

    if self.opt_type == "adam":
        if self.lr_schedule:
            learning_rate = self._get_lr(global_step)
        else:
            learning_rate = LR
        optimizer = AdamOptimizer(learning_rate)
    elif self.opt_type == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(LR, decay=0.99, epsilon=0.1,
                                              centered=True)
    else:
        raise KeyError("invalid opt_type: {}".format(self.opt_type))

    grads_and_vars = optimizer.compute_gradients(self.loss)

    # clip gradients by global norm
    grads, var = zip(*grads_and_vars)
    grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
    clipped_gvs = list(zip(grads, var))
    self.train_op = optimizer.apply_gradients(clipped_gvs,
                                              global_step=global_step)

    # fixme: expose the learning rate during training
    # (relies on the Adam optimizer's private `_lr` attribute)
    self.lr = optimizer._lr

    self.actor_var = TFVariables(self.out_actions, self.sess)

    self.sess.run(global_variables_initializer())

    self.explore_paras = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

    self.saver = Saver({t.name: t for t in self.explore_paras},
                       max_to_keep=self.max_to_keep)
    return True
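# split_batches above regroups a flat rollout laid out as
# [batch_count * batch_step, ...] into the time-major layout
# [batch_step, batch_count, ...] that the v-trace loss expects, and
# drop_last=True then removes the final time step. A small numpy analogue of
# that reshape-then-transpose, for illustration only (names are not part of
# the model):
def example_split_batches_numpy():
    """Show the [B*T, ...] -> [T, B, ...] regrouping with concrete shapes."""
    import numpy as np
    batch_count, batch_step, feat = 2, 3, 4
    flat = np.arange(batch_count * batch_step * feat, dtype=np.float32)
    flat = flat.reshape(batch_count * batch_step, feat)      # [B*T, feat]
    stacked = flat.reshape(batch_count, batch_step, feat)    # [B, T, feat]
    time_major = stacked.transpose(1, 0, 2)                  # [T, B, feat]
    assert time_major.shape == (batch_step, batch_count, feat)
    return time_major[:-1]  # what drop_last=True would return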