def get_loss(model, placeholder_dict):
    a = placeholder_dict["A"]
    adv = placeholder_dict["ADV"]
    r = placeholder_dict["R"]
    oldneglogpac = placeholder_dict["OLDNEGLOGPAC"]
    oldvpred = placeholder_dict["OLDVPRED"]
    clip_range = placeholder_dict["CLIPRANGE"]

    neglogpac = model.pd.neglogp(a)
    entropy = tf.reduce_mean(cat_entropy(model.pi_logit))

    # Clipped value-function loss
    vpred = model.vf
    vpredclipped = oldvpred + tf.clip_by_value(model.vf - oldvpred, -clip_range, clip_range)
    vf_losses1 = tf.square(vpred - r)
    vf_losses2 = tf.square(vpredclipped - r)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Clipped surrogate policy-gradient loss
    ratio = tf.exp(oldneglogpac - neglogpac)
    pg_losses = -adv * ratio
    pg_losses2 = -adv * tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range)
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))

    # Diagnostics: approximate KL divergence and fraction of clipped ratios
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - oldneglogpac))
    clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), clip_range)))
    return pg_loss, entropy, vf_loss, vpred, neglogpac, approxkl, clipfrac
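# A minimal usage sketch, not part of the training code above: it assumes a `model` built by one of
# the policy classes below, the module-level `tf` import, and hypothetical coefficients
# `ent_coef`/`vf_coef`. It only illustrates how the terms returned by this PPO-style `get_loss` are
# typically combined into a single objective, mirroring the loss combinations used by the Model
# classes later in this file.
def _example_build_ppo_objective(model, placeholder_dict, ent_coef=0.01, vf_coef=0.5):
    pg_loss, entropy, vf_loss, _, _, approxkl, clipfrac = get_loss(model, placeholder_dict)
    # Minimize the clipped surrogate and value losses while maximizing entropy.
    total_loss = pg_loss - ent_coef * entropy + vf_coef * vf_loss
    return total_loss, approxkl, clipfrac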
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    # X, processed_x = observation_input(ob_space, nbatch)
    X, processed_x = observation_input(ob_space, None)
    print('X:', X.shape)
    print('processed_X:', processed_x.shape)
    print('ac_space:', ac_space.n)

    with tf.variable_scope("model", reuse=reuse):
        h = cnn_grid(processed_x, **conv_kwargs)
        actor_l1 = fc(h, 'actor', nh=64, init_scale=np.sqrt(2))
        self.phi = actor_l2 = tf.nn.tanh(actor_l1)
        # actor_l3 = fc(actor_l2, 'actor2', nh=action_space.n, init_scale=np.sqrt(2))
        critic_l1 = fc(h, 'critic', nh=64, init_scale=np.sqrt(2))
        critic_l2 = tf.nn.tanh(critic_l1)
        critic_l3 = fc(critic_l2, 'critic2', nh=1, init_scale=np.sqrt(2))
        vf0 = critic_l3
        vf = vf0[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(actor_l2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None
    self.entropy = cat_entropy(self.pi)

    with tf.variable_scope("planning", reuse=reuse):
        # predict next action
        a0_onehot = tf.stop_gradient(tf.one_hot(a0, ac_space.n, axis=-1))
        f = tf.concat([self.phi, a0_onehot], axis=1)
        self.pd_p, self.pi_p = self.pdtype.pdfromlatent(f, init_scale=0.01)
        self.ap = self.pd_p.sample()

    def step(ob, *_args, **_kwargs):
        a, v, neglogp, ap = sess.run([a0, vf, neglogp0, self.ap], {X: ob})
        return a, v, ap, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    def neg_log_prob(actions):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.pi, labels=actions)

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
    self.neg_log_prob = neg_log_prob
def get_loss(model, placeholder_dict):
    a = placeholder_dict["A"]
    adv = placeholder_dict["ADV"]
    r = placeholder_dict["R"]

    # Cross entropy between the policy's action distribution and the actions actually chosen,
    # i.e. the negative log-probability of the chosen actions.
    chosen_action_log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=model.pi_logit, labels=a)
    pg_loss = tf.reduce_mean(adv * chosen_action_log_probs)  # minimize
    vf_loss = tf.reduce_mean(mse(tf.squeeze(model.vf), r))  # minimize
    entropy = -tf.reduce_mean(cat_entropy(model.pi_logit))  # maximize
    return pg_loss, entropy, vf_loss, model.vf, chosen_action_log_probs, None, None
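# A minimal sketch of the placeholder dictionary this A2C-style `get_loss` expects. The shapes
# follow the `A`/`ADV`/`R` placeholders used by the Model classes below; `nbatch` is assumed to be
# nenvs * nsteps as elsewhere in this code, and `tf` is the module-level import.
def _example_make_placeholder_dict(nbatch):
    return {
        "A": tf.placeholder(tf.int32, [nbatch]),      # chosen action indices
        "ADV": tf.placeholder(tf.float32, [nbatch]),  # advantages (returns - value estimates)
        "R": tf.placeholder(tf.float32, [nbatch]),    # discounted returns
    }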
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    # X, processed_x = observation_input(ob_space, nbatch)
    X, processed_x = observation_input(ob_space, None)
    print('X:', X.shape)
    print('processed_X:', processed_x.shape)

    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None
    self.entropy = cat_entropy(self.pi)

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    def neg_log_prob(actions):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.pi, labels=actions)

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
    self.neg_log_prob = neg_log_prob
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, None)
    # X = tf.placeholder(shape=input_shape, dtype=tf.float32, name='ob')

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        processed_x = tf.layers.flatten(processed_x)
        pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None
    self.entropy = cat_entropy(self.pi)

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    def neg_log_prob(actions):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.pi, labels=actions)

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
    self.neg_log_prob = neg_log_prob
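# A rollout sketch, for illustration only: `env` is assumed to be a (vectorized) Gym-style
# environment and `policy` an instance of one of the CNN/MLP policy classes above whose `step`
# returns (action, value, state, neglogp). It shows how the `step` interface is consumed; it is not
# the project's runner.
def _example_rollout(policy, env, nsteps):
    obs = env.reset()
    trajectory = []
    for _ in range(nsteps):
        actions, values, _state, neglogps = policy.step(obs)
        next_obs, rewards, dones, _infos = env.step(actions)
        trajectory.append((obs, actions, rewards, values, neglogps, dones))
        obs = next_obs
    return trajectory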
def __init__(self, state_dim, n_actions, n_steps, vf_coeff=0.5, entropy_coeff=0.001,
             lr=0.0007, lr_decay=0.99, fuzz_factor=0.00005, total_timesteps=800000,
             max_grad_norm=0.5, scope='actor_critic'):
    # fuzz_factor was called epsilon
    sess = tf.Session()  # TODO add CPU config information

    # Targets in loss computation
    advantage = tf.placeholder(dtype=tf.float32, shape=[None], name='advantage')  # advantage of the chosen action
    discounted_reward = tf.placeholder(dtype=tf.float32, shape=[None], name='reward')  # value function target
    action = tf.placeholder(dtype=tf.int32, shape=[None], name='action_in')  # action index
    LR = tf.placeholder(dtype=tf.float32, shape=[])  # learning rate

    # target_model = SharedMLP(sess, state_dim, n_actions)  # used to predict action probs and state values
    # train_model = SharedMLP(sess, state_dim, n_actions)  # , reuse=True)
    # target_model = LSTM(sess, state_dim, n_actions, n_steps=n_steps)  # used to predict action probs and state values
    train_model = LSTM_SM(sess, state_dim, n_actions, n_steps=n_steps)

    action_onehot = tf.one_hot(action, n_actions, dtype=tf.float32)
    chosen_action_prob = tf.reduce_sum(train_model.ap_out * action_onehot, 1)

    # Compute losses: the policy-gradient loss (log-probability of the chosen action weighted by its
    # advantage), the value-function loss (mean squared error of the 1-step TD target), and an entropy
    # regularizer that encourages exploration.
    #   pg_loss = -sum(log(chosen_action_prob) * advantage)
    #   vf_loss = mean((estimated_value - discounted_reward)^2 / 2)
    pg_loss = -tf.reduce_sum(tf.log(chosen_action_prob) * advantage)
    # action_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.ap_out, labels=action)
    # pg_loss = tf.reduce_mean(action_log_prob * advantage)
    vf_loss = tf.reduce_mean(tf.squared_difference(train_model.vf_out, discounted_reward) / 2.)
    entropy = tf.reduce_mean(cat_entropy(train_model.ap_out))
    loss = pg_loss + vf_coeff * vf_loss - entropy_coeff * entropy

    # Compute gradient of the expected reward w.r.t. the policy parameters
    with tf.variable_scope("model"):
        params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    # clip gradients eventually
    if max_grad_norm is not None:
        # correct way of clipping but slower than clip_by_norm
        grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = zip(grads, params)
    # grads = list(zip(grads, params))
    optimizer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=lr_decay, epsilon=fuzz_factor)
    train_step = optimizer.apply_gradients(grads)

    _lr = LrDecay(v_init=lr, decay=lr_decay, n_step=total_timesteps)

    # training of the train model
    def trainActorCritic(obs, actions, discounted_rewards, values, dones, states):
        adv = discounted_rewards - values
        # enter advantage as one-hot vector
        # adv = [tf.one_hot(a, 1)*adv for a in actions]
        for i in range(len(obs)):
            lr_cur = _lr.value()
        if states is not None:
            # LSTM network
            train_dict = {
                train_model.obs_in: obs,
                train_model.D: dones,
                train_model.LS: states,
                advantage: adv,
                action: actions,
                discounted_reward: discounted_rewards,
                LR: lr_cur
            }
        else:
            # MLP network
            train_dict = {
                train_model.obs_in: obs,
                advantage: adv,
                action: actions,
                discounted_reward: discounted_rewards,
                LR: lr_cur
            }
        # policy_loss, value_loss, policy_entropy, _, ap, a = sess.run(
        #     [pg_loss, vf_loss, entropy, train_step, train_model.ap_out, train_model.a0], train_dict)
        policy_loss, value_loss, policy_entropy, _, aprob = sess.run(
            [pg_loss, vf_loss, entropy, train_step, train_model.ap_out], train_dict)
        return policy_loss, value_loss, policy_entropy, aprob

    # def save_params():
    #
    # def load_params():

    self.train = trainActorCritic
    self.train_model = train_model
    # self.target_model = target_model
    # self.step = target_model.step
    # self.value = target_model.value
    # self.initial_states = target_model.initial_states
    self.target_model = train_model
    self.step = train_model.step
    self.value = train_model.value
    self.initial_states = train_model.initial_states
    # self.save = save_params
    # self.load = load_params

    tf.global_variables_initializer().run(session=sess)
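# A sketch of how the `discounted_rewards` target fed to `trainActorCritic` can be computed from a
# rollout. The names `rewards`, `dones`, `last_value`, and `gamma` are assumptions; the recursion
# itself is the standard discounted-return computation R_t = r_t + gamma * R_{t+1}, reset at
# episode ends and bootstrapped from the value estimate of the last observation.
def _example_discounted_rewards(rewards, dones, last_value, gamma=0.99):
    returns = []
    running_return = last_value
    for reward, done in zip(reversed(rewards), reversed(dones)):
        running_return = reward + gamma * running_return * (1.0 - done)
        returns.append(running_return)
    return list(reversed(returns))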
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5,
             mf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear'):
    sess = tf_util.make_session()
    nact = ac_space.n
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    ADV_MOMENT = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    R2 = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])
    ENT_COEF = tf.placeholder(tf.float32, [])

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    mf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.mf), R2))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    ent_coef = Scheduler(v=ent_coef, nvalues=total_timesteps / 10, schedule='step')
    mf_coef = 0.01
    loss = pg_loss - entropy * ENT_COEF + vf_loss * vf_coef + mf_loss * mf_coef
    # loss = pg_loss + vf_loss * vf_coef + mf_loss * mf_coef
    # loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, rewards_square, masks, actions, values, moments):
        # Sample a value estimate from a normal distribution whose variance is the (clipped)
        # second moment minus the squared first moment.
        values_random = np.random.normal(
            loc=values, scale=np.sqrt(np.maximum(moments - values**2, 0)))
        # values_random = values - np.sqrt(np.maximum(moments - values ** 2, 0))
        advs = rewards - values_random
        # advs = (1 - 2 * rewards) * rewards - values + 2 * values * values
        advs_moment = rewards_square - moments
        # advs = (1 + 2 * rewards) * (rewards)
        # advs_moment = rewards_square
        for step in range(len(obs)):
            cur_lr = lr.value()
            cur_ent_coef = ent_coef.value()
        td_map = {
            train_model.X: obs, A: actions, ADV: advs, ADV_MOMENT: advs_moment,
            R: rewards, R2: rewards_square, LR: cur_lr, ENT_COEF: cur_ent_coef
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, moment_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, mf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, moment_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5,
             max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6),
             lrschedule='linear', summary_dir=None):
    sess = tf_util.make_session()
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # storing summaries
    episode_reward = tf.placeholder("float")
    tf.summary.scalar("policy_loss", pg_loss)
    tf.summary.scalar("entropy", entropy)
    tf.summary.scalar("value_loss", vf_loss)
    tf.summary.scalar("episode_reward", episode_reward)
    summary_op = tf.summary.merge_all()

    def train(obs, states, mean_reward, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {
            train_model.X: obs, A: actions, ADV: advs, R: rewards,
            LR: cur_lr, episode_reward: mean_reward
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, summary, _ = sess.run(
            [pg_loss, vf_loss, entropy, summary_op, _train], td_map)
        return policy_loss, value_loss, policy_entropy, summary

    def save(save_path):
        ps = sess.run(params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
    self.train_writer = tf.summary.FileWriter(summary_dir, sess.graph)
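# Sketch of consuming the summary returned by `train` above; `model` is assumed to be an instance of
# this Model class and `update` a hypothetical global step counter supplied by the caller.
# `add_summary`/`flush` are the standard tf.summary.FileWriter methods.
def _example_log_summary(model, update, obs, states, mean_reward, rewards, masks, actions, values):
    _, _, _, summary = model.train(obs, states, mean_reward, rewards, masks, actions, values)
    model.train_writer.add_summary(summary, update)
    model.train_writer.flush()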
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01,
             vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear', optimizer='adam'):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_procs,
                            inter_op_parallelism_threads=num_procs)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)
    step_model = train_model

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    loss = pg_loss + vf_loss * vf_coef - entropy * ent_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    if optimizer == 'adam':
        # Adam runs with its default learning rate; the LR placeholder only drives RMSProp.
        trainer = tf.train.AdamOptimizer()
    else:
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
        if states != []:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        total_loss, policy_loss, value_loss, policy_entropy, _ = sess.run(
            [loss, pg_loss, vf_loss, entropy, _train], td_map)
        return total_loss, policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        make_path(save_path)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
def create_train_graph(self):
    opt = tf.train.AdamOptimizer(learning_rate=POLICY_LR)

    if IS_TRAINING:
        in_shape = [None, N_FRAMES * C_IN, HEIGHT, WIDTH]
    else:
        in_shape = [1, N_FRAMES * C_IN, HEIGHT, WIDTH]
    self.x = tf.placeholder(tf.float32, in_shape, "x_fake")
    self.adv_vec = tf.placeholder(tf.float32, [None, NUM_ACTIONS], "advantage_vector")
    self.y = tf.placeholder(tf.int32, [None, ])
    # self.a_idx = tf.placeholder(tf.int32, [None, ], "action_index")

    self.model = DIN(num_actions=NUM_ACTIONS, is_training=True)

    # Discriminator training graph: expert label = [1, 0], fake label = [0, 1]
    self.expert_sequence = self.read_data(N_EPISODES)
    self.x_expert = tf.reshape(self.expert_sequence, [N_STEPS, N_FRAMES * C_IN, HEIGHT, WIDTH])

    self.a_logits, self.d_logits = self.model.forward(self.x, reuse=False)
    _, self.d_logits_ex = self.model.forward(self.x_expert, reuse=True)

    d_expert_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.zeros(N_STEPS, tf.int32), logits=self.d_logits_ex))
    d_fake_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.y, logits=self.d_logits))

    neglogpac = tf.nn.softmax_cross_entropy_with_logits(logits=self.a_logits, labels=self.adv_vec)
    self.pg_loss = BETA_PG * tf.reduce_mean(neglogpac)
    self.entropy = BETA_ENT * tf.reduce_mean(utils.cat_entropy(self.a_logits))
    self.d_loss = BETA_DISC * (d_fake_loss + d_expert_loss)

    train_vars = tf.trainable_variables()
    self.grad_norm_d = utils.gradient_norm(self.d_loss, train_vars)
    self.grad_norm_p = utils.gradient_norm(self.pg_loss, train_vars)
    self.grad_norm_ent = utils.gradient_norm(self.entropy, train_vars)

    loss = self.pg_loss - self.entropy + self.d_loss

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        gvs = opt.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_value(grad, -MAX_GRAD, MAX_GRAD), var) for grad, var in gvs]
        self.g_norm, self.w_norm = utils.compute_mean_abs_norm(capped_gvs)
        self.grad_op = opt.apply_gradients(capped_gvs)
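# Illustrative training-step sketch for the graph built above: `model` is assumed to be an instance of
# the surrounding class after `create_train_graph` has run, `sess` a tf.Session, and the feed values
# are assumed to match the placeholder shapes ([batch, N_FRAMES * C_IN, HEIGHT, WIDTH] for frames,
# [batch, NUM_ACTIONS] for the advantage vectors, [batch] for the fake labels). It only shows how the
# loss terms and `grad_op` defined above would be fetched together.
def _example_din_train_step(model, sess, frames, adv_vectors, fake_labels):
    fetches = [model.pg_loss, model.d_loss, model.entropy, model.grad_op]
    feed = {model.x: frames, model.adv_vec: adv_vectors, model.y: fake_labels}
    pg_loss, d_loss, entropy, _ = sess.run(fetches, feed)
    return pg_loss, d_loss, entropy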