def create_model(self, model_info): """Create keras model.""" state_input = Input(shape=self.state_dim, name='state_input') advantage = Input(shape=(1, ), name='adv') denselayer = Dense(HIDDEN_SIZE, activation='relu')(state_input) for _ in range(NUM_LAYERS - 1): denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer) out_actions = Dense(self.action_dim, activation='softmax', name='output_actions')(denselayer) # y_pred out_value = Dense(1, name='output_value')(denselayer) model = Model(inputs=[state_input, advantage], outputs=[out_actions, out_value]) losses = { "output_actions": impala_loss(advantage), "output_value": 'mse' } lossweights = {"output_actions": 1.0, "output_value": .5} model.compile(optimizer=Adam(lr=LR), loss=losses, loss_weights=lossweights) self.infer_state = tf.placeholder(tf.float32, name="infer_state", shape=(None, ) + tuple(self.state_dim)) self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1)) self.infer_p, self.infer_v = model([self.infer_state, self.adv]) self.actor_var = TFVariables([self.infer_p, self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def create_model(self, model_info): """Create Deep-Q network.""" state = Input(shape=self.state_dim) denselayer = Dense(HIDDEN_SIZE, activation='relu')(state) for _ in range(NUM_LAYERS - 1): denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer) value = Dense(self.action_dim, activation='linear')(denselayer) if self.dueling: adv = Dense(1, activation='linear')(denselayer) mean = Lambda(layer_normalize)(value) value = Lambda(layer_add)([adv, mean]) model = Model(inputs=state, outputs=value) adam = Adam(lr=self.learning_rate) model.compile(loss='mse', optimizer=adam) self.infer_state = tf.placeholder(tf.float32, name="infer_input", shape=(None, ) + tuple(self.state_dim)) self.infer_v = model(self.infer_state) self.actor_var = TFVariables([self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def create_model(self, model_info): """Create Deep-Q CNN network.""" state = Input(shape=self.state_dim, dtype="uint8") state1 = Lambda(lambda x: K.cast(x, dtype='float32') / 255.)(state) convlayer = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', padding='valid')(state1) convlayer = Conv2D(64, (4, 4), strides=(2, 2), activation='relu', padding='valid')(convlayer) convlayer = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='valid')(convlayer) flattenlayer = Flatten()(convlayer) denselayer = Dense(256, activation='relu')(flattenlayer) value = Dense(self.action_dim, activation='linear')(denselayer) model = Model(inputs=state, outputs=value) adam = Adam(lr=self.learning_rate, clipnorm=10.) model.compile(loss='mse', optimizer=adam) if model_info.get("summary"): model.summary() self.infer_state = tf.placeholder(tf.uint8, name="infer_input", shape=(None, ) + tuple(self.state_dim)) self.infer_v = model(self.infer_state) self.actor_var = TFVariables([self.infer_v], self.sess) self.sess.run(tf.initialize_all_variables()) return model
def build_graph(self, input_type, model):
    # pylint: disable=W0201
    self.state_ph = tf.placeholder(input_type, name='state',
                                   shape=(None, *self.state_dim))
    self.old_logp_ph = tf.placeholder(tf.float32, name='old_log_p',
                                      shape=(None, 1))
    self.adv_ph = tf.placeholder(tf.float32, name='advantage',
                                 shape=(None, 1))
    self.old_v_ph = tf.placeholder(tf.float32, name='old_v',
                                   shape=(None, 1))
    self.target_v_ph = tf.placeholder(tf.float32, name='target_value',
                                      shape=(None, 1))

    pi_latent, self.out_v = model(self.state_ph)

    if self.action_type == 'Categorical':
        self.behavior_action_ph = tf.placeholder(tf.int32,
                                                 name='behavior_action',
                                                 shape=(None, ))
        dist_param = pi_latent
    elif self.action_type == 'DiagGaussian':
        # fixme: add input-dependent log_std logic
        self.behavior_action_ph = tf.placeholder(tf.float32,
                                                 name='real_action',
                                                 shape=(None, self.action_dim))
        log_std = tf.get_variable('pi_logstd',
                                  shape=(1, self.action_dim),
                                  initializer=tf.zeros_initializer())
        dist_param = tf.concat([pi_latent, pi_latent * 0.0 + log_std], axis=-1)
    else:
        raise NotImplementedError(
            'action type: {} does not match any implemented '
            'distribution.'.format(self.action_type))

    self.dist.init_by_param(dist_param)
    self.action = self.dist.sample()
    self.action_log_prob = self.dist.log_prob(self.action)
    self.actor_var = TFVariables([self.action_log_prob, self.out_v],
                                 self.sess)

    self.actor_loss = actor_loss_with_entropy(self.dist,
                                              self.adv_ph,
                                              self.old_logp_ph,
                                              self.behavior_action_ph,
                                              self.clip_ratio,
                                              self.ent_coef)
    self.critic_loss = critic_loss(self.target_v_ph, self.out_v,
                                   self.old_v_ph, self.vf_clip)
    self.loss = self.actor_loss + self.critic_loss_coef * self.critic_loss
    self.train_op = self.build_train_op(self.loss)

    self.sess.run(tf.global_variables_initializer())
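
# A minimal sketch of the two loss helpers assumed by build_graph above:
# actor_loss_with_entropy as the standard PPO clipped surrogate plus an
# entropy bonus, critic_loss as a clipped value loss. The real helpers live
# elsewhere in the repo; the signatures here only follow the call sites,
# and `dist.entropy()` is an assumed method of the distribution wrapper.
def actor_loss_with_entropy(dist, adv_ph, old_logp_ph, behavior_action_ph,
                            clip_ratio, ent_coef):
    """Clipped-surrogate policy loss minus an entropy bonus (sketch)."""
    action_log_prob = dist.log_prob(behavior_action_ph)
    ratio = tf.exp(action_log_prob - old_logp_ph)
    surr_1 = ratio * adv_ph
    surr_2 = tf.clip_by_value(ratio,
                              1.0 - clip_ratio, 1.0 + clip_ratio) * adv_ph
    surr_loss = -tf.reduce_mean(tf.minimum(surr_1, surr_2))
    ent = tf.reduce_mean(dist.entropy())
    return surr_loss - ent_coef * ent


def critic_loss(target_v_ph, out_v, old_v_ph, vf_clip):
    """Clipped value loss in the PPO style (sketch)."""
    v_clipped = old_v_ph + tf.clip_by_value(out_v - old_v_ph,
                                            -vf_clip, vf_clip)
    loss_unclipped = tf.square(out_v - target_v_ph)
    loss_clipped = tf.square(v_clipped - target_v_ph)
    return 0.5 * tf.reduce_mean(tf.maximum(loss_unclipped, loss_clipped))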
def create_model(self, model_info):
    state_input = Input(shape=self.state_dim, name='state_input',
                        dtype='uint8')
    state_input_1 = Lambda(layer_function)(state_input)
    advantage = Input(shape=(1, ), name='adv')

    convlayer = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                       padding='valid')(state_input_1)
    convlayer = Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                       padding='valid')(convlayer)
    convlayer = Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                       padding='valid')(convlayer)
    flattenlayer = Flatten()(convlayer)
    denselayer = Dense(256, activation='relu')(flattenlayer)

    out_actions = Dense(self.action_dim, activation='softmax',
                        name='output_actions')(denselayer)
    out_value = Dense(1, name='output_value')(denselayer)
    model = Model(inputs=[state_input, advantage],
                  outputs=[out_actions, out_value])

    losses = {"output_actions": impala_loss(advantage), "output_value": 'mse'}
    lossweights = {"output_actions": 1.0, "output_value": .5}
    decay_value = 0.00000000512

    model.compile(optimizer=Adam(lr=LR, clipnorm=40., decay=decay_value),
                  loss=losses, loss_weights=lossweights)

    self.infer_state = tf.placeholder(tf.uint8, name="infer_state",
                                      shape=(None, ) + tuple(self.state_dim))
    self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1))
    self.infer_p, self.infer_v = model([self.infer_state, self.adv])

    self.actor_var = TFVariables([self.infer_p, self.infer_v], self.sess)

    self.sess.run(tf.global_variables_initializer())
    return model
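
# Presumably `layer_function` casts the uint8 frames to float and rescales,
# matching the inline Lambda used in the CNN DQN above; sketch only:
def layer_function(x):
    """Cast uint8 pixels to float32 in [0, 1]."""
    return K.cast(x, dtype='float32') / 255.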
def __init__(self, model_info): """ Update default model.parameters with model info. owing to the big graph contains five sub-graph, while, explorer could work well with the explore.graph, Based on the least-cost principle, explorer could init the explore.graph; and, train process init the train.graph. """ logging.debug("init qmix model with:\n{}".format(model_info)) model_config = model_info.get("model_config", None) self.model_config = model_config self.graph = tf.Graph() config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config, graph=self.graph) self.sess = sess # start to fetch parameters self.gamma = model_config.get("gamma", 0.99) self.lr = model_config.get("lr", 0.0005) self.grad_norm_clip = model_config.get("grad_norm_clip", 10) self.n_agents = model_config["n_agents"] self.obs_shape = model_config["obs_shape"] self.rnn_hidden_dim = model_config["rnn_hidden_dim"] seq_limit = model_config["episode_limit"] self.fix_seq_length = seq_limit # use the episode limit as fix shape. self.n_actions = model_config["n_actions"] self.batch_size = model_config["batch_size"] self.avail_action_num = model_config["n_actions"] self.state_dim = int(np.prod(model_config["state_shape"])) self.embed_dim = model_config["mixing_embed_dim"] self.use_double_q = model_config.get("use_double_q", True) # fetch parameters from configure ready with self.graph.as_default(): # placeholder work with tf.sess.run # buffer for explore # note: 4-d make same significance with train operation ! self.ph_obs = tf.placeholder( tf.float32, shape=(1, 1, self.n_agents, self.obs_shape), name="obs") self.ph_hidden_states_in = tf.placeholder( tf.float32, shape=(None, self.rnn_hidden_dim), name="hidden_in") self.agent_outs, self.hidden_outs = None, None self._explore_paras = None self.gru_cell = None self.hi_out_val = None # placeholder for train self.ph_avail_action = tf.placeholder( tf.float32, shape=[ self.batch_size, self.fix_seq_length + 1, self.n_agents, self.avail_action_num, ], name="avail_action", ) self.ph_actions = tf.placeholder( tf.float32, shape=[self.batch_size, self.fix_seq_length, self.n_agents, 1], name="actions", ) self.ph_train_obs = tf.placeholder( tf.float32, shape=( self.batch_size, self.fix_seq_length + 1, self.n_agents, self.obs_shape, ), name="train_obs", ) self.ph_train_obs_len = tf.placeholder( tf.float32, shape=(None, ), name="train_obs_len") # eval mixer --------------- self.ph_train_states = tf.placeholder( tf.float32, shape=(self.batch_size, self.fix_seq_length, self.state_dim), name="train_stats", ) # target mixer ------------------- self.ph_train_target_states = tf.placeholder( tf.float32, shape=(self.batch_size, self.fix_seq_length, self.state_dim), name="train_target_stats", ) self.q_tot, self.target_q_tot = None, None self.ph_rewards = tf.placeholder( tf.float32, shape=(self.batch_size, self.fix_seq_length, 1), name="rewards", ) self.ph_terminated = tf.placeholder( tf.float32, shape=(self.batch_size, self.fix_seq_length, 1), name="terminated", ) self.ph_mask = tf.placeholder( tf.float32, shape=(self.batch_size, self.fix_seq_length, 1), name="mask", ) self.loss, self.grad_update = None, None # graph weights update self.agent_train_replace_op = None self.agent_explore_replace_op = None self.mix_train_replace_op = None # init graph self.g_type = model_info.get("scene", "explore") self.build_actor_graph() # NOTE: build actor always if self.g_type == "train": self.build_train_graph() # note: init with only once are importance! 
with self.graph.as_default(): self.actor_var = TFVariables([self.agent_outs, self.hidden_outs], self.sess) self.sess.run(tf.global_variables_initializer()) self.hi_out_val_default = self.sess.run( self.gru_cell.zero_state(self.n_agents, dtype=tf.float32)) # max_to_keep = 5 default, may been remove when to evaluate self.explore_saver = tf.train.Saver({ t.name: t for t in self._explore_paras}, max_to_keep=100,)
class QMixModel(object):
    """Define QMix model with tensorflow.graph."""

    def __init__(self, model_info):
        """
        Update default model parameters with model info.

        The full graph contains five sub-graphs, but an explorer only needs
        the explore sub-graph. Following the least-cost principle, an
        explorer initializes only the explore graph, while the train process
        initializes the train graph.
        """
        logging.debug("init qmix model with:\n{}".format(model_info))
        model_config = model_info.get("model_config", None)
        self.model_config = model_config

        self.graph = tf.Graph()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config, graph=self.graph)
        self.sess = sess

        # start to fetch parameters
        self.gamma = model_config.get("gamma", 0.99)
        self.lr = model_config.get("lr", 0.0005)
        self.grad_norm_clip = model_config.get("grad_norm_clip", 10)

        self.n_agents = model_config["n_agents"]
        self.obs_shape = model_config["obs_shape"]
        self.rnn_hidden_dim = model_config["rnn_hidden_dim"]

        seq_limit = model_config["episode_limit"]
        self.fix_seq_length = seq_limit  # use the episode limit as the fixed shape

        self.n_actions = model_config["n_actions"]
        self.batch_size = model_config["batch_size"]
        self.avail_action_num = model_config["n_actions"]
        self.state_dim = int(np.prod(model_config["state_shape"]))
        self.embed_dim = model_config["mixing_embed_dim"]
        self.use_double_q = model_config.get("use_double_q", True)
        # parameters fetched from the configuration

        with self.graph.as_default():
            # placeholders used with tf.sess.run
            # buffer for explore
            # note: the 4-d shape keeps the same layout as the train operation!
            self.ph_obs = tf.placeholder(
                tf.float32, shape=(1, 1, self.n_agents, self.obs_shape),
                name="obs")
            self.ph_hidden_states_in = tf.placeholder(
                tf.float32, shape=(None, self.rnn_hidden_dim),
                name="hidden_in")

            self.agent_outs, self.hidden_outs = None, None
            self._explore_paras = None
            self.gru_cell = None
            self.hi_out_val = None

            # placeholders for train
            self.ph_avail_action = tf.placeholder(
                tf.float32,
                shape=[
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.avail_action_num,
                ],
                name="avail_action",
            )
            self.ph_actions = tf.placeholder(
                tf.float32,
                shape=[self.batch_size, self.fix_seq_length, self.n_agents, 1],
                name="actions",
            )
            self.ph_train_obs = tf.placeholder(
                tf.float32,
                shape=(
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.obs_shape,
                ),
                name="train_obs",
            )
            self.ph_train_obs_len = tf.placeholder(
                tf.float32, shape=(None, ), name="train_obs_len")

            # eval mixer ---------------
            self.ph_train_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_stats",
            )
            # target mixer -------------------
            self.ph_train_target_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_target_stats",
            )

            self.q_tot, self.target_q_tot = None, None

            self.ph_rewards = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="rewards",
            )
            self.ph_terminated = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="terminated",
            )
            self.ph_mask = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="mask",
            )

            self.loss, self.grad_update = None, None

            # graph weights update
            self.agent_train_replace_op = None
            self.agent_explore_replace_op = None
            self.mix_train_replace_op = None

        # init graph
        self.g_type = model_info.get("scene", "explore")
        self.build_actor_graph()  # NOTE: always build the actor graph
        if self.g_type == "train":
            self.build_train_graph()

        # note: initialize only once!
        with self.graph.as_default():
            self.actor_var = TFVariables(
                [self.agent_outs, self.hidden_outs], self.sess)

            self.sess.run(tf.global_variables_initializer())

            self.hi_out_val_default = self.sess.run(
                self.gru_cell.zero_state(self.n_agents, dtype=tf.float32))

            # max_to_keep defaults to 5; may be removed for evaluation
            self.explore_saver = tf.train.Saver(
                {t.name: t for t in self._explore_paras},
                max_to_keep=100,
            )

    def build_actor_graph(self):
        """Build the explorer graph with the minimum principle."""
        with self.graph.as_default():
            with tf.variable_scope("explore_agent"):
                self.agent_outs, self.hidden_outs = self.build_agent_net(
                    inputs_obs=self.ph_obs,
                    seq_max=1,  # 1 is important for inference
                    obs_lengths=[1 for _ in range(self.n_agents)],
                    hidden_state_in=self.ph_hidden_states_in,
                )

            self._explore_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

    def build_agent_net(self, inputs_obs, seq_max, obs_lengths,
                        hidden_state_in):
        """
        Build the agent architecture.

        Shared by the explorer and the trainer, which feed sequences of
        different lengths.
        """
        fc1 = tf.layers.dense(
            inputs=inputs_obs,
            units=self.rnn_hidden_dim,
            activation=tf.nn.relu,
        )

        fc1 = tf.transpose(fc1, perm=[0, 2, 1, 3])
        logging.debug("fc1 before reshape: {}".format(fc1))
        fc1 = tf.reshape(fc1, [-1, seq_max, self.rnn_hidden_dim])
        logging.debug("fc1 after reshape: {}".format(fc1))

        gru_cell = tf.nn.rnn_cell.GRUCell(
            num_units=self.rnn_hidden_dim,
            # dtype=self.dtype
        )
        # record the gru cell only once, to init the hidden value
        if self.gru_cell is None:
            self.gru_cell = gru_cell

        # tf.nn.dynamic_rnn handles variable-length sequences
        rnn_output, hidden_state_out = tf.nn.dynamic_rnn(
            gru_cell,
            fc1,
            dtype=tf.float32,
            initial_state=hidden_state_in,
            sequence_length=obs_lengths,
        )

        logging.debug("rnn raw out: {} ".format(rnn_output))
        rnn_output = tf.reshape(
            rnn_output, [-1, self.n_agents, seq_max, self.rnn_hidden_dim])
        rnn_output = tf.transpose(rnn_output, perm=[0, 2, 1, 3])
        rnn_output = tf.reshape(rnn_output, [-1, self.rnn_hidden_dim])

        fc2_outputs = tf.layers.dense(
            inputs=rnn_output,
            units=self.n_actions,
            activation=None,
        )

        out_actions = tf.reshape(
            fc2_outputs, (-1, self.n_agents, self.avail_action_num))
        logging.debug("out action: {}".format(out_actions))

        return out_actions, hidden_state_out

    def reset_hidden_state(self):
        """Reset the hidden state with a value assignment."""
        self.hi_out_val = self.hi_out_val_default

    def infer_actions(self, agent_inputs):
        """Unify the inference api."""
        out_val, self.hi_out_val = self.sess.run(
            [self.agent_outs, self.hidden_outs],
            feed_dict={
                self.ph_obs: agent_inputs,
                self.ph_hidden_states_in: self.hi_out_val,
            },
        )
        return out_val

    def gather_custom(self, inputs, indices):
        """Gather the q values of the chosen actions with a one-hot mask."""
        indices = tf.cast(indices, tf.uint8)
        one_hot = tf.squeeze(
            tf.one_hot(indices=indices,
                       depth=self.n_actions,
                       on_value=1.,
                       off_value=0.,
                       axis=-1,
                       dtype=tf.float32),
            axis=-2)
        mul_test = tf.multiply(inputs, one_hot)
        # reduce_sum_val = tf.reduce_sum(mul_test, axis=-1, keep_dims=True)
        reduce_sum_val = tf.reduce_sum(mul_test, axis=-1)
        return reduce_sum_val

    def _build_mix_net2(self, agent_qs, states):
        """Build the mixing network with hypernetwork-generated weights."""
        hypernet_embed = self.model_config["hypernet_embed"]

        def hyper_w1(hyper_w1_input):
            """
            Create hyper_w1.

            input shape (none, state_dim)
            """
            with tf.variable_scope("hyper_w1"):
                hw0 = tf.layers.dense(inputs=hyper_w1_input,
                                      units=hypernet_embed,
                                      activation=tf.nn.relu)
                hw1 = tf.layers.dense(inputs=hw0,
                                      units=self.embed_dim * self.n_agents,
                                      activation=None)
                return hw1

        def hyper_w_final(hyper_w_final_input):
            """
            Create hyper_w_final.

            input shape (none, state_dim)
            """
            with tf.variable_scope("hyper_w_final"):
                hw_f0 = tf.layers.dense(
                    inputs=hyper_w_final_input,
                    units=hypernet_embed,
                    activation=tf.nn.relu,
                )
                hw_f1 = tf.layers.dense(inputs=hw_f0,
                                        units=self.embed_dim,
                                        activation=None)
                return hw_f1

        def hyper_b1(state_input):
            """State-dependent bias for the hidden layer."""
            with tf.variable_scope("hyper_b1"):
                return tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=None)

        def val(state_input):
            """V(s) instead of a bias for the last layer."""
            with tf.variable_scope("val_for_bias"):
                val0 = tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=tf.nn.relu)
                val2 = tf.layers.dense(inputs=val0,
                                       units=1,
                                       activation=None)
                return val2

        bs = agent_qs.get_shape().as_list()[0]
        states_reshaped = tf.reshape(states, (-1, self.state_dim))
        agent_qs_reshaped = tf.reshape(agent_qs, (-1, 1, self.n_agents))

        # first layer
        w1 = tf.math.abs(hyper_w1(states_reshaped))
        b1 = hyper_b1(states_reshaped)
        w1_reshaped = tf.reshape(w1, (-1, self.n_agents, self.embed_dim))
        b1_reshaped = tf.reshape(b1, (-1, 1, self.embed_dim))
        to_hidden_val = tf.math.add(
            tf.matmul(agent_qs_reshaped, w1_reshaped), b1_reshaped)
        hidden = tf.nn.elu(to_hidden_val)

        # second layer
        w_final = tf.math.abs(hyper_w_final(states_reshaped))
        w_final_reshaped = tf.reshape(w_final, (-1, self.embed_dim, 1))

        # state-dependent bias
        v = tf.reshape(val(states_reshaped), (-1, 1, 1))

        # compute final output
        y = tf.math.add(tf.matmul(hidden, w_final_reshaped), v)

        # reshape and return
        q_tot = tf.reshape(y, (bs, -1, 1))
        return q_tot

    @staticmethod
    def _print_trainable_var_name(**kwargs):
        """Print trainable variable names."""
        for k, v in kwargs.items():
            logging.info("{}: \n {}".format(k, list([t.name for t in v])))

    def build_train_graph(self):
        """
        Build the train graph.

        Because seq_max differs (1 vs the episode limit), the train graph
        cannot connect to the actor graph directly. Hence we build separate
        explore and train sub-graphs, synced with tf.assign between the two
        variable collections.

        :return:
        """
        with self.graph.as_default():
            with tf.variable_scope("eval_agent"):
                trajectory_agent_outs, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    seq_max=self.fix_seq_length + 1,  # important
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,  # whole trajectory, no hidden state needed
                )

            with tf.variable_scope("target_agent"):
                tar_agent_outs_tmp, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    # fixed value, different between explore and train
                    seq_max=self.fix_seq_length + 1,
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,
                )
                target_trajectory_agent_outs = tf.stop_gradient(
                    tar_agent_outs_tmp)

            _eval_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
            _target_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

            with tf.variable_scope("soft_replacement"):
                self.agent_train_replace_op = [
                    tf.assign(t, e)
                    for t, e in zip(_target_agent_paras, _eval_agent_paras)]
                self.agent_explore_replace_op = [
                    tf.assign(t, e)
                    for t, e in zip(self._explore_paras, _eval_agent_paras)
                ]

            self._print_trainable_var_name(
                _eval_agent_paras=_eval_agent_paras,
                _target_agent_paras=_target_agent_paras,
                _explore_paras=self._explore_paras,
            )

            # agent out to max q values
            # Calculate estimated Q-Values ----------------
            mac_out = tf.reshape(
                trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            logging.debug("mac_out: {}".format(mac_out))
            chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                     self.ph_actions)

            # Calculate the Q-Values necessary for the target -----------
            target_mac_out = tf.reshape(
                target_trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            target_mac_out = target_mac_out[:, 1:]

            # Mask out unavailable actions
            # target_mac_out[avail_actions[:, 1:] == 0] = -9999999
            indices = tf.equal(self.ph_avail_action[:, 1:], 0)
            mask_val = tf.tile(
                [[[[-999999.0]]]],
                [
                    self.batch_size,
                    self.fix_seq_length,
                    self.n_agents,
                    self.avail_action_num,
                ],
            )
            logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
                indices, mask_val, target_mac_out))

            target_mac_out = tf.where(indices, mask_val, target_mac_out)

            if self.use_double_q:
                # Get actions that maximise live Q (for double q-learning)
                mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
                mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
                cur_max_actions = tf.expand_dims(
                    tf.argmax(mac_out_detach, axis=-1), -1)
                target_max_qvals = self.gather_custom(target_mac_out,
                                                      cur_max_actions)
            else:
                target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

            # eval mixer ---------------
            with tf.variable_scope("eval_mixer"):
                self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                                  self.ph_train_states)

            with tf.variable_scope("target_mixer"):
                q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                                 self.ph_train_target_states)
                self.target_q_tot = tf.stop_gradient(q_tot_tmp)

            _eval_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
            _target_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

            with tf.variable_scope("soft_replacement"):
                self.mix_train_replace_op = [
                    tf.assign(t, e)
                    for t, e in zip(_target_mix_paras, _eval_mix_paras)]

            self._print_trainable_var_name(
                _eval_mix_paras=_eval_mix_paras,
                _target_mix_paras=_target_mix_paras)

            # Calculate 1-step Q-Learning targets
            targets = (self.ph_rewards +
                       self.gamma * (1.0 - self.ph_terminated) *
                       self.target_q_tot)

            # Td-error
            td_error = self.q_tot - tf.stop_gradient(targets)
            # mask = mask.expand_as(td_error)  # fixme: assumes same shape by default!

            # 0-out the targets that came from padded data
            masked_td_error = tf.multiply(td_error, self.ph_mask)

            self.loss = tf.reduce_sum(
                masked_td_error ** 2) / tf.reduce_sum(self.ph_mask)

            # Optimise
            optimizer = tf.train.RMSPropOptimizer(
                self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
            grads_and_vars = optimizer.compute_gradients(self.loss)
            capped_gvs = [(
                grad if grad is None else tf.clip_by_norm(
                    grad, clip_norm=self.grad_norm_clip),
                var,
            ) for grad, var in grads_and_vars]
            self.grad_update = optimizer.apply_gradients(capped_gvs)

    def assign_targets(self):
        """
        Update target weights periodically.

        1. from eval agent to target agent
        2. from eval mixer to target mixer
        :return:
        """
        _a, _m = self.sess.run([self.agent_train_replace_op,
                                self.mix_train_replace_op])

    def assign_explore_agent(self):
        """
        Update the explore agent after each train process.

        :return:
        """
        _ = self.sess.run(self.agent_explore_replace_op)

    def save_explore_agent_weights(self, save_path):
        """Save the explore agent weights for the explorer."""
        # explore_saver = tf.train.Saver({t.name: t for t in self._explore_paras})
        self.explore_saver.save(
            self.sess, save_path=save_path, write_meta_graph=False)
        # tf.train.list_variables(tf.train.latest_checkpoint(wp))

    def set_weights(self, weights):
        """Set weights from the in-memory tensors."""
        with self.graph.as_default():
            self.actor_var.set_weights(weights)

    def get_weights(self):
        """Get the weights."""
        with self.graph.as_default():
            return self.actor_var.get_weights()

    def restore_explorer_variable(self, model_name):
        """Restore explorer variables from a tf.train checkpoint."""
        reader = tf.train.NewCheckpointReader(model_name)
        var_names = reader.get_variable_to_shape_map().keys()
        result = {}
        for n in var_names:
            result[n] = reader.get_tensor(n)
            logging.debug("read variable-{} from file:{}".format(
                n, model_name))
        with self.sess.as_default():  # must run within the session
            for var_key in self._explore_paras:
                try:
                    var_key.load(result[var_key.name])
                    logging.debug("load {} success".format(var_key.name))
                except BaseException as err:
                    raise KeyError("update {} error:{}".format(
                        var_key.name, err))

    def train(self,
              batch_trajectories,
              train_obs_len,
              avail_actions,
              actions,
              cur_stats,
              target_stats,
              rewards,
              terminated,
              mask):
        """
        Train with the whole graph.

        Update the explorer graph after each train process, and the target
        as required.

        :param batch_trajectories:
        :param train_obs_len: list([max_ep for _ in range(batch.batch_size * n_agents)])
        :param avail_actions: available actions from the environment
        :param actions: actual actions within the trajectory
        :param cur_stats: batch["state"][:, :-1]
        :param target_stats: batch["state"][:, 1:]
        :param rewards:
        :param terminated:
        :param mask:
        :return:
        """
        _, loss_val = self.sess.run(
            [self.grad_update, self.loss],
            feed_dict={
                self.ph_train_obs: batch_trajectories,
                # Note: split trajectory with each agent.
                self.ph_train_obs_len: train_obs_len,
                self.ph_avail_action: avail_actions,
                self.ph_actions: actions,
                self.ph_train_states: cur_stats,
                self.ph_train_target_states: target_stats,
                self.ph_rewards: rewards,
                self.ph_terminated: terminated,
                self.ph_mask: mask,
            },
        )
        logging.debug("train_loss: {}".format(loss_val))
        return loss_val
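
# Usage sketch (hypothetical config values): a trainer builds the full graph
# and periodically syncs the explore and target networks.
#
#   model = QMixModel({"scene": "train", "model_config": {...}})
#   loss = model.train(batch_trajectories, train_obs_len, avail_actions,
#                      actions, cur_stats, target_stats, rewards,
#                      terminated, mask)
#   model.assign_explore_agent()  # push new weights to the explorer graph
#   model.assign_targets()        # refresh target agent/mixer as scheduled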
def create_model(self, model_info): """Create Deep-Q network.""" user_input = Input(shape=(self.user_dim,), name="user_input", dtype=self.input_type) history_click_input = Input( shape=(self.n_history_click * self.item_dim), name="history_click", dtype=self.input_type ) history_no_click_input = Input( shape=(self.n_history_no_click * self.item_dim), name="history_no_click", dtype=self.input_type ) item_input = Input(shape=(self.item_dim,), name="item_input", dtype=self.input_type) shared_embedding = Embedding( self.vocab_size, self.emb_dim, name="Emb", mask_zero=True, embeddings_initializer=self.embedding_initializer, trainable=False, ) # un-trainable gru_click = GRU(self.item_dim * self.emb_dim) gru_no_click = GRU(self.item_dim * self.emb_dim) user_feature = Flatten()(shared_embedding(user_input)) item_feature = Flatten()(shared_embedding(item_input)) history_click_feature = Reshape( (self.n_history_click, self.item_dim * self.emb_dim) )(shared_embedding(history_click_input)) history_click_feature = gru_click(history_click_feature) history_no_click_feature = Reshape( (self.n_history_no_click, self.item_dim * self.emb_dim) )(shared_embedding(history_no_click_input)) history_no_click_feature = gru_no_click(history_no_click_feature) x = concatenate( [ user_feature, history_click_feature, history_no_click_feature, item_feature, ] ) x_dense1 = Dense(128, activation="relu")(x) x_dense2 = Dense(128, activation="relu")(x_dense1) # ctr_pred = Dense(1, activation="linear", name="q_value")(x_dense2) ctr_pred = Dense(1, activation=self.last_act, name="q_value")(x_dense2) model = Model( inputs=[ user_input, history_click_input, history_no_click_input, item_input, ], outputs=ctr_pred, ) model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate)) if self._summary: model.summary() self.user_input = tf.placeholder( dtype=self.input_type, name="user_input", shape=(None, self.user_dim) ) self.history_click_input = tf.placeholder( dtype=self.input_type, name="history_click_input", shape=(None, self.n_history_click * self.item_dim), ) self.history_no_click_input = tf.placeholder( dtype=self.input_type, name="history_no_click_input", shape=(None, self.n_history_no_click * self.item_dim), ) self.item_input = tf.placeholder( dtype=self.input_type, name="item_input", shape=(None, self.item_dim) ) self.ctr_predict = model( [ self.user_input, self.history_click_input, self.history_no_click_input, self.item_input, ] ) self.actor_var = TFVariables([self.ctr_predict], self.sess) self.sess.run(tf.initialize_all_variables()) return model
class DqnInfoFlowModel(XTModel):
    """DQN class for information flow."""

    def __init__(self, model_info):
        """Init DQN model for information flow."""
        model_config = model_info.get("model_config", None)
        import_config(globals(), model_config)

        self.state_dim = model_info["state_dim"]
        self.action_dim = model_info["action_dim"]

        self.tau = 0.01
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001

        self.vocab_size = model_info["vocab_size"]
        self.emb_dim = model_info["emb_dim"]
        self.user_dim = model_info["user_dim"]
        self.item_dim = model_info["item_dim"]
        self.input_type = model_info["input_type"]
        # logging.info("set input type: {}".format(self.input_type))
        self.embeddings = model_info["embeddings"]
        self.last_act = model_info["last_activate"]

        embedding_weights = np.loadtxt(self.embeddings, delimiter=",",
                                       dtype=float)
        self.embedding_initializer = tf.constant_initializer(embedding_weights)

        self.n_history_click = 5
        self.n_history_no_click = 5

        super().__init__(model_info)

    def create_model(self, model_info):
        """Create Deep-Q network."""
        user_input = Input(shape=(self.user_dim, ), name="user_input",
                           dtype=self.input_type)
        history_click_input = Input(
            shape=(self.n_history_click * self.item_dim, ),
            name="history_click",
            dtype=self.input_type)
        history_no_click_input = Input(
            shape=(self.n_history_no_click * self.item_dim, ),
            name="history_no_click",
            dtype=self.input_type)
        item_input = Input(shape=(self.item_dim, ), name="item_input",
                           dtype=self.input_type)

        shared_embedding = Embedding(
            self.vocab_size,
            self.emb_dim,
            name="Emb",
            mask_zero=True,
            embeddings_initializer=self.embedding_initializer,
            trainable=False,
        )  # non-trainable

        gru_click = GRU(self.item_dim * self.emb_dim)
        gru_no_click = GRU(self.item_dim * self.emb_dim)

        user_feature = Flatten()(shared_embedding(user_input))
        item_feature = Flatten()(shared_embedding(item_input))

        history_click_feature = Reshape(
            (self.n_history_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_click_input))
        history_click_feature = gru_click(history_click_feature)

        history_no_click_feature = Reshape(
            (self.n_history_no_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_no_click_input))
        history_no_click_feature = gru_no_click(history_no_click_feature)

        x = concatenate([
            user_feature,
            history_click_feature,
            history_no_click_feature,
            item_feature,
        ])
        x_dense1 = Dense(128, activation="relu")(x)
        x_dense2 = Dense(128, activation="relu")(x_dense1)
        # ctr_pred = Dense(1, activation="linear", name="q_value")(x_dense2)
        ctr_pred = Dense(1, activation=self.last_act, name="q_value")(x_dense2)

        model = Model(
            inputs=[
                user_input,
                history_click_input,
                history_no_click_input,
                item_input,
            ],
            outputs=ctr_pred,
        )
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        if self._summary:
            model.summary()

        self.user_input = tf.placeholder(
            dtype=self.input_type, name="user_input",
            shape=(None, self.user_dim))
        self.history_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_click_input",
            shape=(None, self.n_history_click * self.item_dim),
        )
        self.history_no_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_no_click_input",
            shape=(None, self.n_history_no_click * self.item_dim),
        )
        self.item_input = tf.placeholder(
            dtype=self.input_type, name="item_input",
            shape=(None, self.item_dim))
        self.ctr_predict = model([
            self.user_input,
            self.history_click_input,
            self.history_no_click_input,
            self.item_input,
        ])

        self.actor_var = TFVariables([self.ctr_predict], self.sess)
        self.sess.run(tf.global_variables_initializer())
        return model

    def train(self, state, label, batch_size, verbose=False):
        """Train the model."""
        with self.graph.as_default():
            K.set_session(self.sess)
            history = self.model.fit(
                state, label, batch_size=batch_size, verbose=verbose)
            return history.history["loss"][0]

    def predict(self, state):
        """
        Predict with the latest model.

        :param state:
        :return:
        """
        with self.graph.as_default():
            # K.set_session(self.sess)
            # return np.array(self.model.predict_on_batch(state)).reshape(-1)
            feed_dict = {
                self.user_input: state["user_input"],
                self.history_click_input: state["history_click"],
                self.history_no_click_input: state["history_no_click"],
                self.item_input: state["item_input"],
            }
            return np.array(
                self.sess.run(self.ctr_predict, feed_dict)).reshape(-1)

    def set_weights(self, weights):
        """Set weights from the memory tensors."""
        # split keras and xingtian npz
        with self.graph.as_default():
            K.set_session(self.sess)
            if isinstance(weights, dict) and self.actor_var:
                self.actor_var.set_weights(weights)
            else:  # keras
                self.model.set_weights(weights)

    def get_weights(self):
        """Get the weights."""
        with self.graph.as_default():
            K.set_session(self.sess)
            return self.model.get_weights()

    def load_model(self, model_name):
        """Load weights from either xingtian npz or keras format."""
        if self.actor_var and str(model_name).endswith(".npz"):
            self.actor_var.set_weights_with_npz(model_name)
        else:
            with self.graph.as_default():
                K.set_session(self.sess)
                self.model.load_weights(model_name)
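
# Usage sketch: predict expects a dict keyed by the input names used above
# (the zero arrays are hypothetical, shaped to match the placeholders):
#
#   state = {
#       "user_input": np.zeros((1, model.user_dim)),
#       "history_click": np.zeros((1, model.n_history_click * model.item_dim)),
#       "history_no_click": np.zeros(
#           (1, model.n_history_no_click * model.item_dim)),
#       "item_input": np.zeros((1, model.item_dim)),
#   }
#   q_value = model.predict(state)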
class ImpalaCnnOpt(XTModel):
    """IMPALA actor-learner network with a CNN backbone."""

    def __init__(self, model_info):
        model_config = model_info.get("model_config", dict())
        import_config(globals(), model_config)
        self.dtype = DTYPE_MAP.get(model_info.get("default_dtype", "float32"))
        self.input_dtype = model_info.get("input_dtype", "float32")
        self.sta_mean = model_info.get("state_mean", 0.)
        self.sta_std = model_info.get("state_std", 255.)

        self._transform = partial(state_transform,
                                  mean=self.sta_mean,
                                  std=self.sta_std,
                                  input_dtype=self.input_dtype)

        self.state_dim = model_info["state_dim"]
        self.action_dim = model_info["action_dim"]
        self.filter_arch = get_atari_filter(self.state_dim)

        # lr schedule with linear_cosine_decay
        self.lr_schedule = model_config.get("lr_schedule", None)
        self.opt_type = model_config.get("opt_type", "adam")
        self.lr = None

        self.ph_state = None
        self.ph_adv = None
        self.out_actions = None
        self.pi_logic_outs, self.baseline = None, None

        # placeholder for behavior policy logit outputs
        self.ph_bp_logic_outs = None
        self.ph_actions = None
        self.ph_dones = None
        self.ph_rewards = None
        self.loss, self.optimizer, self.train_op = None, None, None

        self.grad_norm_clip = model_config.get("grad_norm_clip", 40.0)
        self.sample_batch_steps = model_config.get("sample_batch_step", 50)

        self.saver = None
        self.explore_paras = None
        self.actor_var = None  # store weights for agent

        super().__init__(model_info)

    def create_model(self, model_info):
        self.ph_state = tf.placeholder(self.input_dtype,
                                       shape=(None, *self.state_dim),
                                       name="state_input")

        with tf.variable_scope("explore_agent"):
            state_input = Lambda(self._transform)(self.ph_state)
            last_layer = state_input

            for (out_size, kernel, stride) in self.filter_arch[:-1]:
                last_layer = Conv2D(
                    out_size,
                    (kernel, kernel),
                    strides=(stride, stride),
                    activation="relu",
                    padding="same",
                )(last_layer)

            # last convolution
            (out_size, kernel, stride) = self.filter_arch[-1]
            convolution_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="valid",
            )(last_layer)

            self.pi_logic_outs = tf.squeeze(
                Conv2D(self.action_dim, (1, 1),
                       padding="same")(convolution_layer),
                axis=[1, 2],
            )

            baseline_flat = Flatten()(convolution_layer)
            self.baseline = tf.squeeze(
                tf.layers.dense(
                    inputs=baseline_flat,
                    units=1,
                    activation=None,
                    kernel_initializer=custom_norm_initializer(0.01),
                ),
                1,
            )
            self.out_actions = tf.squeeze(
                tf.multinomial(self.pi_logic_outs,
                               num_samples=1,
                               output_dtype=tf.int32),
                1,
                name="out_action",
            )

        # create learner
        self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                               shape=(None, self.action_dim),
                                               name="ph_b_logits")
        self.ph_actions = tf.placeholder(tf.int32, shape=(None, ),
                                         name="ph_action")
        self.ph_dones = tf.placeholder(tf.bool, shape=(None, ),
                                       name="ph_dones")
        self.ph_rewards = tf.placeholder(self.dtype, shape=(None, ),
                                         name="ph_rewards")

        # Split the tensor into batches at known episode cut boundaries.
        # [batch_count * batch_step] -> [batch_step, batch_count]
        batch_step = self.sample_batch_steps

        def split_batches(tensor, drop_last=False):
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step], tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap B and T axes
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res

        self.loss = vtrace_loss(
            bp_logic_outs=split_batches(self.ph_bp_logic_outs,
                                        drop_last=True),
            tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
            actions=split_batches(self.ph_actions, drop_last=True),
            discounts=split_batches(
                tf.cast(~self.ph_dones, tf.float32) * GAMMA, drop_last=True),
            rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                                  drop_last=True),
            values=split_batches(self.baseline, drop_last=True),
            bootstrap_value=split_batches(self.baseline)[-1],
        )

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32)

        if self.opt_type == "adam":
            if self.lr_schedule:
                learning_rate = self._get_lr(global_step)
            else:
                learning_rate = LR
            optimizer = AdamOptimizer(learning_rate)
        elif self.opt_type == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(LR, decay=0.99, epsilon=0.1,
                                                  centered=True)
        else:
            raise KeyError("invalid opt_type: {}".format(self.opt_type))

        grads_and_vars = optimizer.compute_gradients(self.loss)

        # global norm
        grads, var = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
        clipped_gvs = list(zip(grads, var))
        self.train_op = optimizer.apply_gradients(clipped_gvs,
                                                  global_step=global_step)

        # fixme: expose the learning rate during training
        self.lr = optimizer._lr

        self.actor_var = TFVariables(self.out_actions, self.sess)

        self.sess.run(global_variables_initializer())

        self.explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

        self.saver = Saver({t.name: t for t in self.explore_paras},
                           max_to_keep=self.max_to_keep)

        return True

    def _get_lr(self, global_step, decay_step=20000.):
        """Make a decayed learning rate."""
        lr_schedule = self.lr_schedule
        if len(lr_schedule) != 2:
            logging.warning("lr_schedule needs exactly 2 elements, "
                            "e.g. [[0, 0.01], [20000, 0.000001]]")
            logging.fatal("lr_schedule invalid: {}".format(lr_schedule))
        if lr_schedule[0][0] != 0:
            logging.info("lr_schedule[0][1] is used as the initial "
                         "learning rate")

        learning_rate = linear_cosine_decay(
            lr_schedule[0][1], global_step, decay_step,
            beta=lr_schedule[1][1] / float(decay_step))

        return learning_rate

    def train(self, state, label):
        """Train with sess.run."""
        bp_logic_outs, actions, dones, rewards = label
        with self.graph.as_default():
            _, loss = self.sess.run(
                [self.train_op, self.loss],
                feed_dict={
                    self.ph_state: state,
                    self.ph_bp_logic_outs: bp_logic_outs,
                    self.ph_actions: actions,
                    self.ph_dones: dones,
                    self.ph_rewards: rewards,
                },
            )
        return loss

    def predict(self, state):
        """
        Predict with the latest model.

        :param state:
        :return: action logits, baseline value, sampled action
        """
        with self.graph.as_default():
            feed_dict = {self.ph_state: state}
            return self.sess.run(
                [self.pi_logic_outs, self.baseline, self.out_actions],
                feed_dict)

    def save_model(self, file_name):
        """Save model without the meta graph."""
        ck_name = self.saver.save(self.sess,
                                  save_path=file_name,
                                  write_meta_graph=False)
        return ck_name

    def load_model(self, model_name, by_name=False):
        """Load model with inference variables."""
        restore_tf_variable(self.sess, self.explore_paras, model_name)

    def set_weights(self, weights):
        """Set weights from the memory tensors."""
        with self.graph.as_default():
            self.actor_var.set_weights(weights)

    def get_weights(self):
        """Get weights."""
        with self.graph.as_default():
            return self.actor_var.get_weights()
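
# A plausible sketch of the state_transform helper partially applied in
# __init__ above -- assumed to rescale raw integer pixel inputs with the
# configured mean/std and pass float inputs through unchanged; the real
# helper is defined elsewhere in the repo.
def state_transform(x, mean=0., std=255., input_dtype="float32"):
    """Normalize state input: (x - mean) / std for integer pixel inputs."""
    if input_dtype in ("float16", "float32", "float64"):
        # float inputs are assumed pre-processed upstream
        return x
    return (tf.cast(x, dtype="float32") - mean) / std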