def test_vtrace_from_logits(self, batch_size=2):
    """Tests V-trace calculated from logits.

    Runs `vtrace.from_logits` on synthetic inputs and checks that its outputs
    match `vtrace.from_importance_weights` fed with log importance weights
    derived separately through `vtrace.action_log_probs`.

    Args:
        batch_size: Size of the batch dimension of the synthetic inputs.
    """
    seq_len = 5
    num_actions = 3
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    values = {
        "behavior_policy_logits": _shaped_arange(seq_len, batch_size, num_actions),
        "target_policy_logits": _shaped_arange(seq_len, batch_size, num_actions),
        # `randint`'s upper bound is exclusive, so `num_actions` (not
        # `num_actions - 1`) is needed for the last action to be sampled too.
        "actions": np.random.randint(0, num_actions, size=(seq_len, batch_size)),
        "discounts": np.array(  # T, B where B_i: [0.9 / (i+1)] * T
            [[0.9 / (b + 1) for b in range(batch_size)] for _ in range(seq_len)],
            dtype=np.float32,
        ),
        "rewards": _shaped_arange(seq_len, batch_size),
        "values": _shaped_arange(seq_len, batch_size) / batch_size,
        "bootstrap_value": _shaped_arange(batch_size) + 1.0,  # B
    }
    values = {k: torch.from_numpy(v) for k, v in values.items()}

    from_logits_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **values,
    )

    # Reference log-probabilities and log importance weights, computed
    # outside of `from_logits`.
    target_log_probs = vtrace.action_log_probs(
        values["target_policy_logits"], values["actions"])
    behavior_log_probs = vtrace.action_log_probs(
        values["behavior_policy_logits"], values["actions"])
    log_rhos = target_log_probs - behavior_log_probs

    # Calculate V-trace using the ground truth logits.
    from_iw = vtrace.from_importance_weights(
        log_rhos=log_rhos,
        discounts=values["discounts"],
        rewards=values["rewards"],
        values=values["values"],
        bootstrap_value=values["bootstrap_value"],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
    )

    assert_allclose(from_iw.vs, from_logits_output.vs)
    assert_allclose(from_iw.pg_advantages, from_logits_output.pg_advantages)
    assert_allclose(behavior_log_probs,
                    from_logits_output.behavior_action_log_probs)
    assert_allclose(target_log_probs,
                    from_logits_output.target_action_log_probs)
    assert_allclose(log_rhos, from_logits_output.log_rhos)
def learner(model, data, ps, args):
    """Learner to get trajectories from Actors.

    Gets trajectories from actors and trains the learner. Trajectories are
    pulled from `data` until `args.batch_size` of them are collected; the
    batch is then used for one V-trace actor-critic update, after which the
    new weights are pushed to `ps`. The loop never terminates on its own.

    Args:
        model: Network called as `model(obs, actions, rewards, dones, hx=hx)`
            and returning `(logits, values)`.
        data: Source of trajectories; `data.get()` is called once per
            iteration.
        ps: Parameter store; receives `model.state_dict()` via `ps.push`.
        args: Hyper-parameters. Reads `lr`, `epsilon`, `decay`, `momentum`,
            `batch_size`, `baseline_cost`, `entropy_cost`, `gamma` and
            `save_path`.
    """
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr,
                              eps=args.epsilon,
                              weight_decay=args.decay,
                              momentum=args.momentum)
    batch_size = args.batch_size
    baseline_cost = args.baseline_cost
    entropy_cost = args.entropy_cost
    gamma = args.gamma
    save_path = args.save_path

    batch = []
    best = 0.
    while True:
        trajectory = data.get()
        batch.append(trajectory)
        if torch.cuda.is_available():
            # NOTE(review): the return value is discarded, so this only has
            # an effect if `cuda()` moves the trajectory in place -- confirm.
            trajectory.cuda()
        if len(batch) < batch_size:
            continue

        behaviour_logits, obs, actions, rewards, dones, hx = make_time_major(
            batch)
        optimizer.zero_grad()
        logits, values = model(obs, actions, rewards, dones, hx=hx)

        # The value of the final step bootstraps the V-trace targets.
        bootstrap_value = values[-1]
        # Shift so that actor quantities at step t line up with the learner
        # outputs that preceded them.
        actions, behaviour_logits, dones, rewards = actions[
            1:], behaviour_logits[1:], dones[1:], rewards[1:]
        logits, values = logits[:-1], values[:-1]
        discounts = (~dones).float() * gamma

        vs, pg_advantages = vtrace.from_logits(
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=logits,
            actions=actions,
            discounts=discounts,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value)

        # policy gradient loss
        cross_entropy = F.cross_entropy(logits, actions, reduction='none')
        loss = (cross_entropy * pg_advantages.detach()).sum()
        # baseline_loss
        loss += baseline_cost * .5 * (vs - values).pow(2).sum()
        # entropy_loss
        # NOTE(review): softmax/log_softmax normalise over dim 1 while the
        # inner sum runs over the last dim -- confirm this matches the
        # layout of `logits`.
        loss += entropy_cost * -(-F.softmax(logits, 1) *
                                 F.log_softmax(logits, 1)).sum(-1).sum()
        loss.backward()
        optimizer.step()

        # Publish CPU weights to the parameter store.
        model.cpu()
        ps.push(model.state_dict())

        # Checkpoint only when the batch mean reward beats the best seen so
        # far. `best` must be updated, otherwise every positive-reward batch
        # would overwrite the checkpoint.
        mean_reward = rewards.mean().item()
        if mean_reward > best:
            best = mean_reward
            torch.save(model.state_dict(), save_path)

        if torch.cuda.is_available():
            model.cuda()
        batch = []
def build_learner(agent, agent_state, env_outputs, agent_outputs):
    """Builds the learner loop.

    Args:
      agent: A snt.RNNCore module outputting `AgentOutput` named tuples, with
        an `unroll` call for computing the outputs for a whole trajectory.
      agent_state: The initial agent state for each sequence in the batch.
      env_outputs: A `StepOutput` namedtuple where each field is of shape
        [T+1, ...].
      agent_outputs: An `AgentOutput` namedtuple where each field is of shape
        [T+1, ...].

    Returns:
      A tuple of (done, infos, and environment frames) where the environment
      frames tensor causes an update.

    Raises:
      ValueError: If `FLAGS.reward_clipping` is not 'abs_one' or
        'soft_asymmetric'.
    """
    learner_outputs, _ = agent.unroll(agent_outputs.action, env_outputs,
                                      agent_state)

    # Use last baseline value (from the value function) to bootstrap.
    bootstrap_value = learner_outputs.baseline[-1]

    # At this point, the environment outputs at time step `t` are the inputs
    # that lead to the learner_outputs at time step `t`. After the following
    # shifting, the actions in agent_outputs and learner_outputs at time step
    # `t` is what leads to the environment outputs at time step `t`.
    agent_outputs = nest.map_structure(lambda t: t[1:], agent_outputs)
    rewards, infos, done, _ = nest.map_structure(
        lambda t: t[1:], env_outputs)
    learner_outputs = nest.map_structure(lambda t: t[:-1], learner_outputs)

    if FLAGS.reward_clipping == 'abs_one':
        clipped_rewards = tf.clip_by_value(rewards, -1, 1)
    elif FLAGS.reward_clipping == 'soft_asymmetric':
        squeezed = tf.tanh(rewards / 5.0)
        # Negative rewards are given less weight than positive rewards.
        clipped_rewards = tf.where(rewards < 0, .3 * squeezed, squeezed) * 5.
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(
            'Unsupported reward_clipping mode: {!r}'.format(
                FLAGS.reward_clipping))

    discounts = tf.to_float(~done) * FLAGS.discounting

    # Compute V-trace returns and weights.
    # Note, this is put on the CPU because it's faster than on GPU. It can be
    # improved further with XLA-compilation or with a custom TensorFlow
    # operation.
    with tf.device('/cpu'):
        vtrace_returns = vtrace.from_logits(
            behaviour_policy_logits=agent_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=agent_outputs.action,
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value)

    # Compute loss as a weighted sum of the baseline loss, the policy gradient
    # loss and an entropy regularization term.
    total_loss = compute_policy_gradient_loss(
        learner_outputs.policy_logits, agent_outputs.action,
        vtrace_returns.pg_advantages)
    total_loss += FLAGS.baseline_cost * compute_baseline_loss(
        vtrace_returns.vs - learner_outputs.baseline)
    total_loss += FLAGS.entropy_cost * compute_entropy_loss(
        learner_outputs.policy_logits)

    # Optimization
    num_env_frames = tf.train.get_global_step()
    learning_rate = tf.train.polynomial_decay(FLAGS.learning_rate,
                                              num_env_frames,
                                              FLAGS.total_environment_frames,
                                              0)
    optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                          FLAGS.momentum, FLAGS.epsilon)
    train_op = optimizer.minimize(total_loss)

    # Merge updating the network and environment frames into a single tensor.
    with tf.control_dependencies([train_op]):
        num_env_frames_and_train = num_env_frames.assign_add(
            FLAGS.batch_size * FLAGS.unroll_length * FLAGS.num_action_repeats)

    # Adding a few summaries.
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.histogram('action', agent_outputs.action)

    return done, infos, num_env_frames_and_train
def test_vtrace_from_logits(self, batch_size):
    """Tests V-trace calculated from logits.

    Builds `vtrace.from_logits` on placeholders with unspecified shapes,
    evaluates it on synthetic inputs, and checks the result against
    `vtrace.from_importance_weights` fed with log importance weights obtained
    through `vtrace.log_probs_from_logits_and_actions`.

    Args:
        batch_size: Size of the batch dimension of the synthetic inputs.
    """
    seq_len = 5
    num_actions = 3
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.
    dummy_config = {"model": None}

    # Intentionally leaving shapes unspecified to test if V-trace can
    # deal with that.
    placeholders = {
        # T, B, NUM_ACTIONS
        "behaviour_policy_logits": tf.placeholder(
            dtype=tf.float32, shape=[None, None, None]),
        # T, B, NUM_ACTIONS
        "target_policy_logits": tf.placeholder(
            dtype=tf.float32, shape=[None, None, None]),
        "actions": tf.placeholder(dtype=tf.int32, shape=[None, None]),
        "discounts": tf.placeholder(dtype=tf.float32, shape=[None, None]),
        "rewards": tf.placeholder(dtype=tf.float32, shape=[None, None]),
        "values": tf.placeholder(dtype=tf.float32, shape=[None, None]),
        "bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None]),
    }

    from_logits_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        config=dummy_config,
        **placeholders)

    target_log_probs = vtrace.log_probs_from_logits_and_actions(
        placeholders["target_policy_logits"], placeholders["actions"],
        dummy_config)
    behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
        placeholders["behaviour_policy_logits"], placeholders["actions"],
        dummy_config)
    log_rhos = target_log_probs - behaviour_log_probs
    ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)

    values = {
        "behaviour_policy_logits": _shaped_arange(seq_len, batch_size,
                                                  num_actions),
        "target_policy_logits": _shaped_arange(seq_len, batch_size,
                                               num_actions),
        # `randint`'s upper bound is exclusive, so `num_actions` (not
        # `num_actions - 1`) is needed for the last action to be sampled too.
        "actions": np.random.randint(
            0, num_actions, size=(seq_len, batch_size)),
        "discounts": np.array(  # T, B where B_i: [0.9 / (i+1)] * T
            [[0.9 / (b + 1) for b in range(batch_size)]
             for _ in range(seq_len)]),
        "rewards": _shaped_arange(seq_len, batch_size),
        "values": _shaped_arange(seq_len, batch_size) / batch_size,
        "bootstrap_value": _shaped_arange(batch_size) + 1.0,  # B
    }

    feed_dict = {placeholders[k]: v for k, v in values.items()}
    with self.test_session() as session:
        from_logits_output_v = session.run(
            from_logits_output, feed_dict=feed_dict)
        (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
         ground_truth_target_action_log_probs) = session.run(
             ground_truth, feed_dict=feed_dict)

    # Calculate V-trace using the ground truth logits.
    from_iw = vtrace.from_importance_weights(
        log_rhos=ground_truth_log_rhos,
        discounts=values["discounts"],
        rewards=values["rewards"],
        values=values["values"],
        bootstrap_value=values["bootstrap_value"],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    with self.test_session() as session:
        from_iw_v = session.run(from_iw)

    self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
    self.assertAllClose(from_iw_v.pg_advantages,
                        from_logits_output_v.pg_advantages)
    self.assertAllClose(ground_truth_behaviour_action_log_probs,
                        from_logits_output_v.behaviour_action_log_probs)
    self.assertAllClose(ground_truth_target_action_log_probs,
                        from_logits_output_v.target_action_log_probs)
    self.assertAllClose(ground_truth_log_rhos, from_logits_output_v.log_rhos)
def build_learner(agent, env_outputs, agent_outputs, env_id):
    """Builds the learner loop with per-game normalized value functions.

    Args:
      agent: Module with an `unroll` call for computing the outputs for a
        whole trajectory. Its outputs expose `policy_logits`,
        `un_normalized_vf` and `normalized_vf`; the agent also provides
        `_mean`, `_std` and `update_moments`.
      env_outputs: A `StepOutput` namedtuple where each field is of shape
        [T+1, ...].
      agent_outputs: An `AgentOutput` namedtuple where each field is of shape
        [T+1, ...].
      env_id: Integer game index for each sequence in the batch; used to pick
        the game-specific value output and the game-specific mean and std.

    Returns:
      The tuple (done, infos, num_env_frames_and_train) concatenated with the
      tuple returned by `agent.update_moments(vtrace_returns.vs, env_id)`.
      Evaluating the environment frames tensor causes an update.

    Raises:
      ValueError: If `FLAGS.reward_clipping` is not 'abs_one' or
        'soft_asymmetric'.
    """
    # Need to map the game name, e.g 'BreakoutNoFrameSkip-v4' to an integer.
    def get_single_game_info(_tuple):
        single_env_id, game_info = _tuple
        return game_info[single_env_id]

    # Retrieve the specific games in the current batch.
    def get_batch_value(batch):
        return tf.map_fn(get_single_game_info, (env_id, batch),
                         dtype=tf.float32)

    learner_outputs = agent.unroll(agent_outputs.action, env_outputs)

    un_normalized_vf = learner_outputs.un_normalized_vf
    normalized_vf = learner_outputs.normalized_vf

    game_specific_un_normalized_vf = tf.map_fn(get_batch_value,
                                               un_normalized_vf,
                                               dtype=tf.float32)
    game_specific_normalized_vf = tf.map_fn(get_batch_value, normalized_vf,
                                            dtype=tf.float32)

    # Ensure the learner separates the value functions for each game.
    # According to equation (10) in (Hessel et al., 2018).
    learner_outputs = learner_outputs._replace(
        un_normalized_vf=game_specific_un_normalized_vf,
        normalized_vf=game_specific_normalized_vf)

    # Use last baseline value (from the value function) to bootstrap.
    bootstrap_value = learner_outputs.un_normalized_vf[-1]

    # At this point, the environment outputs at time step `t` are the inputs
    # that lead to the learner_outputs at time step `t`. After the following
    # shifting, the actions in agent_outputs and learner_outputs at time step
    # `t` is what leads to the environment outputs at time step `t`.
    agent_outputs = nest.map_structure(lambda t: t[1:], agent_outputs)
    rewards, infos, done, _ = nest.map_structure(
        lambda t: t[1:], env_outputs)
    learner_outputs = nest.map_structure(lambda t: t[:-1], learner_outputs)

    if FLAGS.reward_clipping == 'abs_one':
        clipped_rewards = tf.clip_by_value(rewards, -1, 1)
    elif FLAGS.reward_clipping == 'soft_asymmetric':
        squeezed = tf.tanh(rewards / 5.0)
        # Negative rewards are given less weight than positive rewards.
        clipped_rewards = tf.where(rewards < 0, .3 * squeezed, squeezed) * 5.
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(
            'Unsupported reward_clipping mode: {!r}'.format(
                FLAGS.reward_clipping))

    discounts = tf.to_float(~done) * FLAGS.discounting
    game_specific_mean = tf.gather(agent._mean, env_id)
    game_specific_std = tf.gather(agent._std, env_id)

    # Compute V-trace returns and weights.
    # Note, this is put on the CPU because it's faster than on GPU. It can be
    # improved further with XLA-compilation or with a custom TensorFlow
    # operation.
    with tf.device('/cpu'):
        vtrace_returns = vtrace.from_logits(
            behaviour_policy_logits=agent_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=agent_outputs.action,
            discounts=discounts,
            rewards=clipped_rewards,
            un_normalized_values=learner_outputs.un_normalized_vf,
            normalized_values=learner_outputs.normalized_vf,
            mean=game_specific_mean,
            std=game_specific_std,
            bootstrap_value=bootstrap_value)

    # First term of equation (7) in (Hessel et al., 2018)
    normalized_vtrace = (
        vtrace_returns.vs - game_specific_mean) / game_specific_std
    normalized_vtrace = nest.map_structure(tf.stop_gradient,
                                           normalized_vtrace)

    # Compute loss as a weighted sum of the baseline loss, the policy gradient
    # loss and an entropy regularization term.
    total_loss = compute_policy_gradient_loss(
        learner_outputs.policy_logits, agent_outputs.action,
        vtrace_returns.pg_advantages)

    baseline_loss = compute_baseline_loss(
        normalized_vtrace - learner_outputs.normalized_vf)

    # Using the average GvT
    baseline_loss = tf.divide(baseline_loss, FLAGS.unroll_length)

    total_loss += FLAGS.baseline_cost * baseline_loss
    total_loss += FLAGS.entropy_cost * compute_entropy_loss(
        learner_outputs.policy_logits)

    # Optimization
    num_env_frames = tf.train.get_global_step()
    learning_rate = tf.train.polynomial_decay(FLAGS.learning_rate,
                                              num_env_frames,
                                              FLAGS.total_environment_frames,
                                              0)
    optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                          FLAGS.momentum, FLAGS.epsilon)

    # Clip gradients by global norm only when a positive threshold is set;
    # otherwise let the optimizer compute and apply them directly.
    if FLAGS.gradient_clipping > 0.0:
        variables = tf.trainable_variables()
        gradients = tf.gradients(total_loss, variables)
        gradients, _ = tf.clip_by_global_norm(gradients,
                                              FLAGS.gradient_clipping)
        train_op = optimizer.apply_gradients(zip(gradients, variables))
    else:
        train_op = optimizer.minimize(total_loss)

    # Merge updating the network and environment frames into a single tensor.
    with tf.control_dependencies([train_op]):
        num_env_frames_and_train = num_env_frames.assign_add(
            FLAGS.batch_size * FLAGS.unroll_length)

    # Adding a few summaries.
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.histogram('action', agent_outputs.action)

    # NOTE(review): the parentheses around `update_moments(...)` do not build
    # a tuple, so that call itself must return one for the `+` to work.
    return (done, infos, num_env_frames_and_train) + (
        agent.update_moments(vtrace_returns.vs, env_id))
def test_vtrace_from_logits(self, batch_size):
    """Tests V-trace calculated from logits.

    Builds `vtrace.from_logits` on placeholders with unspecified shapes,
    evaluates it on synthetic inputs, and checks the result against
    `vtrace.from_importance_weights` fed with log importance weights obtained
    through `vtrace.log_probs_from_logits_and_actions`.

    Args:
        batch_size: Size of the batch dimension of the synthetic inputs.
    """
    seq_len = 5
    num_actions = 3
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    # Intentionally leaving shapes unspecified to test if V-trace can
    # deal with that.
    placeholders = {
        # T, B, NUM_ACTIONS
        'behaviour_policy_logits':
            tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
        # T, B, NUM_ACTIONS
        'target_policy_logits':
            tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
        'actions':
            tf.placeholder(dtype=tf.int32, shape=[None, None]),
        'discounts':
            tf.placeholder(dtype=tf.float32, shape=[None, None]),
        'rewards':
            tf.placeholder(dtype=tf.float32, shape=[None, None]),
        'values':
            tf.placeholder(dtype=tf.float32, shape=[None, None]),
        'bootstrap_value':
            tf.placeholder(dtype=tf.float32, shape=[None]),
    }

    from_logits_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **placeholders)

    target_log_probs = vtrace.log_probs_from_logits_and_actions(
        placeholders['target_policy_logits'], placeholders['actions'])
    behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
        placeholders['behaviour_policy_logits'], placeholders['actions'])
    log_rhos = target_log_probs - behaviour_log_probs
    ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)

    values = {
        'behaviour_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'target_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        # `randint`'s upper bound is exclusive, so `num_actions` (not
        # `num_actions - 1`) is needed for the last action to be sampled too.
        'actions':
            np.random.randint(0, num_actions, size=(seq_len, batch_size)),
        'discounts':
            np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)]),
        'rewards':
            _shaped_arange(seq_len, batch_size),
        'values':
            _shaped_arange(seq_len, batch_size) / batch_size,
        'bootstrap_value':
            _shaped_arange(batch_size) + 1.0,  # B
    }

    feed_dict = {placeholders[k]: v for k, v in values.items()}
    with self.test_session() as session:
        from_logits_output_v = session.run(
            from_logits_output, feed_dict=feed_dict)
        (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
         ground_truth_target_action_log_probs) = session.run(
             ground_truth, feed_dict=feed_dict)

    # Calculate V-trace using the ground truth logits.
    from_iw = vtrace.from_importance_weights(
        log_rhos=ground_truth_log_rhos,
        discounts=values['discounts'],
        rewards=values['rewards'],
        values=values['values'],
        bootstrap_value=values['bootstrap_value'],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    with self.test_session() as session:
        from_iw_v = session.run(from_iw)

    self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
    self.assertAllClose(from_iw_v.pg_advantages,
                        from_logits_output_v.pg_advantages)
    self.assertAllClose(ground_truth_behaviour_action_log_probs,
                        from_logits_output_v.behaviour_action_log_probs)
    self.assertAllClose(ground_truth_target_action_log_probs,
                        from_logits_output_v.target_action_log_probs)
    self.assertAllClose(ground_truth_log_rhos, from_logits_output_v.log_rhos)
def training_process(self):
    """Run one learner update on trajectories sampled from replay memory.

    Samples a batch from `self.replay_memory`, converts it to time-major
    layout, runs `self.agent` on it, computes V-trace targets, and takes one
    optimizer step on the combined policy-gradient, baseline and entropy
    loss. The scalar loss is appended to `self.loss_dict`.
    """
    # Sample trajectories and regroup them field by field.
    # Sampled layout: (batch, seq_len, -1).
    sampled = self.replay_memory.sample()
    fields = Transition(*zip(*sampled))

    # Stack every field over the batch, then swap to time-major
    # (seq_len, batch, -1) for the computations below.
    state_tm = torch.transpose(torch.stack(fields.state, dim=0), 0, 1)
    action_tm = torch.transpose(torch.stack(fields.action, dim=0), 0, 1)
    reward_tm = torch.transpose(torch.stack(fields.reward, dim=0), 0, 1)
    done_tm = torch.transpose(torch.stack(fields.done, dim=0), 0, 1)

    behavior_logits_tm = torch.stack(fields.logits, dim=0)
    if behavior_logits_tm.dim() == 4:
        # Logits stored as (batch, seq_len, 1, #num_action): drop the
        # singleton axis before the permute to (seq, batch, #num_action).
        behavior_logits_tm = behavior_logits_tm.squeeze(2)
    behavior_logits_tm = behavior_logits_tm.permute(1, 0, 2)

    # Learner forward pass.
    target_logits, baseline = self.agent(
        x=state_tm,
        action=action_tm,
        reward=reward_tm,
        dones=done_tm,
        core_state=None,
        isactor=False)

    # Bring the learner outputs to time-major as well.
    target_logits = target_logits.permute(1, 0, 2)
    baseline = torch.transpose(baseline, 0, 1)

    # The last baseline value bootstraps the V-trace targets.
    bootstrap_value = baseline[-1]

    # Before the shift, environment outputs at step `t` are the inputs that
    # produced the learner outputs at step `t`. After it, the actions at step
    # `t` are what led to the environment outputs at step `t`.
    actions = action_tm.view(action_tm.shape[0], -1).type(torch.long)[1:]
    behaviour_logits = behavior_logits_tm[1:]
    rewards = reward_tm.view(reward_tm.shape[0], -1)[1:]
    dones = done_tm.view(done_tm.shape[0], -1)[1:]
    target_logits = target_logits[:-1]
    baseline = baseline[:-1]

    discounts = (~dones).float() * self.gamma

    vs, pg_advantages = vtrace.from_logits(
        behaviour_policy_logits=behaviour_logits,
        target_policy_logits=target_logits,
        actions=actions,
        discounts=discounts,
        rewards=rewards,
        values=baseline,
        bootstrap_value=bootstrap_value)

    self.optimizer.zero_grad()

    criterion = agent.MyLoss()
    # policy_gradient_loss
    loss = criterion.compute_policy_gradient_loss(
        target_logits, actions, pg_advantages)
    # baseline_loss
    baseline_term = self.baseline_cost * criterion.compute_baseline_loss(
        vs=vs, baseline=baseline)
    loss += baseline_term
    # entropy regularization
    entropy_term = self.entropy_cost * criterion.compute_entropy_loss(
        target_logits)
    loss += entropy_term
    # Loss in RL is not like loss in traditional ML: its value only conveys
    # the amplitude and direction (reward or punishment) of the update.

    # For comparing vtrace and loss in the IMPALA paper with our
    # implementation:
    # vtrace_tf.from_logits(
    #     behaviour_policy_logits=tf.convert_to_tensor(behaviour_logits.detach().numpy()),
    #     target_policy_logits=tf.convert_to_tensor(target_logits.detach().numpy()),
    #     actions=tf.convert_to_tensor(actions.int().detach().numpy()),
    #     discounts=tf.convert_to_tensor(discounts.detach().numpy()),
    #     rewards=tf.convert_to_tensor(rewards.detach().numpy()),
    #     values1=tf.convert_to_tensor(baseline.detach().numpy()),
    #     bootstrap_value=tf.convert_to_tensor(bootstrap_value.detach().numpy()))
    # # tf vs, tf pg_advantages will be printed in vtrace_tf.py
    # print('torch vs', vs)
    # print('torch pg_advantages', pg_advantages)
    # tf_loss = vtrace_tf.compute_policy_gradient_loss(tf.convert_to_tensor(target_logits.detach().numpy()),
    #                                                  tf.convert_to_tensor(actions.detach().numpy()),
    #                                                  tf.convert_to_tensor(pg_advantages.detach().numpy())) \
    #     + self.baseline_cost * vtrace_tf.compute_baseline_loss(tf.convert_to_tensor(vs.detach().numpy()),
    #                                                            tf.convert_to_tensor(baseline.detach().numpy())) \
    #     + self.entropy_cost * vtrace_tf.compute_entropy_loss(tf.convert_to_tensor(target_logits.detach().numpy()))
    # print('torch loss', loss)
    # print('tf loss', tf_loss)

    loss.backward()
    self.optimizer.step()
    self.loss_dict.append(loss.item())
def test_vtrace_from_logit():
    """Test V-trace computed from logits.

    Checks that `vtrace.from_logits` agrees with
    `vtrace.from_importance_weights` when the latter receives log importance
    weights derived through `vtrace.log_probs_from_logits_and_actions`, then
    runs `calc_loss` on the resulting advantages.
    """
    seq_len = 5  # n-step
    num_actions = 3
    batch_size = 2
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    np.random.seed(0)
    values = {
        'behavior_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'target_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        # `randint`'s upper bound is exclusive, so `num_actions` (not
        # `num_actions - 1`) is needed for the last action to be sampled too.
        'actions':
            np.random.randint(0, num_actions, size=(seq_len, batch_size)),
        'discounts':
            np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)]),
        'rewards':
            _shaped_arange(seq_len, batch_size),
        'values':
            _shaped_arange(seq_len, batch_size) / batch_size,
        'bootstrap_value':
            _shaped_arange(batch_size) + 1.0,  # B
    }

    from_logit_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **values)

    ground_truth_target_log_probs = vtrace.log_probs_from_logits_and_actions(
        values['target_policy_logits'], values['actions'])
    ground_truth_behavior_log_probs = vtrace.log_probs_from_logits_and_actions(
        values['behavior_policy_logits'], values['actions'])
    ground_truth_log_rhos = (ground_truth_target_log_probs -
                             ground_truth_behavior_log_probs)

    from_iw = vtrace.from_importance_weights(
        log_rhos=ground_truth_log_rhos,
        discounts=values['discounts'],
        rewards=values['rewards'],
        values=values['values'],
        bootstrap_value=values['bootstrap_value'],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    # Importance-weight result == logits result == ground truth.
    for g, o in zip(from_iw.vs, from_logit_output.vs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(from_iw.pg_advantages, from_logit_output.pg_advantages):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_behavior_log_probs,
                    from_logit_output.behavior_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_target_log_probs,
                    from_logit_output.target_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_log_rhos, from_logit_output.log_rhos):
        assert np.allclose(g, o.data.tolist())

    logits = torch.Tensor(values['behavior_policy_logits'])
    actions = torch.LongTensor(values['actions'])
    advantages = from_iw.pg_advantages
    # The interactive `pdb.set_trace()` breakpoint that used to sit here was
    # removed: it blocks any unattended test run.
    # NOTE(review): the result is not asserted on; this only checks that
    # `calc_loss` runs on these inputs.
    loss = calc_loss(logits, actions, advantages)
def build_learner(agent, agent_state, env_outputs, agent_outputs):
    """Builds the learner loop.

    Args:
      agent: A snt.RNNCore module outputting `AgentOutput` named tuples, with
        an `unroll` call for computing the outputs for a whole trajectory.
      agent_state: The initial agent state for each sequence in the batch.
      env_outputs: A `StepOutput` namedtuple where each field is of shape
        [T+1, ...].
      agent_outputs: An `AgentOutput` namedtuple where each field is of shape
        [T+1, ...].

    Returns:
      A tuple of (done, infos, and environment frames) where the environment
      frames tensor causes an update.

    Raises:
      ValueError: If `FLAGS.reward_clipping` is not 'abs_one' or
        'soft_asymmetric'.
    """
    learner_outputs, _ = agent.unroll(agent_outputs.action, env_outputs,
                                      agent_state)

    # Use last baseline value (from the value function) to bootstrap.
    bootstrap_value = learner_outputs.baseline[-1]

    # At this point, the environment outputs at time step `t` are the inputs
    # that lead to the learner_outputs at time step `t`. After the following
    # shifting, the actions in agent_outputs and learner_outputs at time step
    # `t` is what leads to the environment outputs at time step `t`.
    agent_outputs = nest.map_structure(lambda t: t[1:], agent_outputs)
    rewards, infos, done, _ = nest.map_structure(
        lambda t: t[1:], env_outputs)
    learner_outputs = nest.map_structure(lambda t: t[:-1], learner_outputs)

    if FLAGS.reward_clipping == 'abs_one':
        clipped_rewards = tf.clip_by_value(rewards, -1, 1)
    elif FLAGS.reward_clipping == 'soft_asymmetric':
        squeezed = tf.tanh(rewards / 5.0)
        # Negative rewards are given less weight than positive rewards.
        clipped_rewards = tf.where(rewards < 0, .3 * squeezed, squeezed) * 5.
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(
            'Unsupported reward_clipping mode: {!r}'.format(
                FLAGS.reward_clipping))

    discounts = tf.to_float(~done) * FLAGS.discounting

    # Compute V-trace returns and weights.
    # Note, this is put on the CPU because it's faster than on GPU. It can be
    # improved further with XLA-compilation or with a custom TensorFlow
    # operation.
    with tf.device('/cpu'):
        vtrace_returns = vtrace.from_logits(
            behaviour_policy_logits=agent_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=agent_outputs.action,
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value)

    # Compute loss as a weighted sum of the baseline loss, the policy gradient
    # loss and an entropy regularization term.
    total_loss = compute_policy_gradient_loss(
        learner_outputs.policy_logits, agent_outputs.action,
        vtrace_returns.pg_advantages)
    total_loss += FLAGS.baseline_cost * compute_baseline_loss(
        vtrace_returns.vs - learner_outputs.baseline)
    total_loss += FLAGS.entropy_cost * compute_entropy_loss(
        learner_outputs.policy_logits)

    # Optimization
    num_env_frames = tf.train.get_global_step()
    learning_rate = tf.train.polynomial_decay(FLAGS.learning_rate,
                                              num_env_frames,
                                              FLAGS.total_environment_frames,
                                              0)
    optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                          FLAGS.momentum, FLAGS.epsilon)
    train_op = optimizer.minimize(total_loss)

    # Merge updating the network and environment frames into a single tensor.
    with tf.control_dependencies([train_op]):
        num_env_frames_and_train = num_env_frames.assign_add(
            FLAGS.batch_size * FLAGS.unroll_length * FLAGS.num_action_repeats)

    # Adding a few summaries.
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.histogram('action', agent_outputs.action)

    return done, infos, num_env_frames_and_train
def build_learner(agent, agent_state, env_outputs, agent_outputs,
                  teacher_task_ph):
    """Builds the learner loop and the Teacher's progress signal.

    Args:
      agent: A snt.RNNCore module outputting `AgentOutput` named tuples, with
        an `unroll` call for computing the outputs for a whole trajectory.
      agent_state: The initial agent state for each sequence in the batch.
      env_outputs: A `StepOutput` namedtuple where each field is of shape
        [T+1, ...].
      agent_outputs: An `AgentOutput` namedtuple where each field is of shape
        [T+1, ...].
      teacher_task_ph: Tensor holding the task the Teacher selected; it is
        compared against `infos.task_name` to keep only matching episodes.

    Returns:
      A tuple of (done, infos, environment frames, progress signal) where the
      environment frames tensor causes an update.

    Raises:
      ValueError: If `FLAGS.reward_clipping` is not 'abs_one' or
        'soft_asymmetric'.
    """
    learner_outputs, _ = agent.unroll(agent_outputs.action, env_outputs,
                                      agent_state)

    teacher_selected_task = tf.identity(teacher_task_ph)

    # Use last baseline value (from the value function) to bootstrap.
    bootstrap_value = learner_outputs.baseline[-1]

    # At this point, the environment outputs at time step `t` are the inputs
    # that lead to the learner_outputs at time step `t`. After the following
    # shifting, the actions in agent_outputs and learner_outputs at time step
    # `t` is what leads to the environment outputs at time step `t`.
    agent_outputs = nest.map_structure(lambda t: t[1:], agent_outputs)
    rewards, infos, done, _ = nest.map_structure(
        lambda t: t[1:], env_outputs)
    learner_outputs = nest.map_structure(lambda t: t[:-1], learner_outputs)

    if FLAGS.reward_clipping == 'abs_one':
        clipped_rewards = tf.clip_by_value(rewards, -1, 1)
    elif FLAGS.reward_clipping == 'soft_asymmetric':
        squeezed = tf.tanh(rewards / 5.0)
        # Negative rewards are given less weight than positive rewards.
        # we don't have negative rewards so this is redundant
        clipped_rewards = tf.where(rewards < 0, .3 * squeezed, squeezed) * 5.
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(
            'Unsupported reward_clipping mode: {!r}'.format(
                FLAGS.reward_clipping))

    discounts = tf.to_float(~done) * FLAGS.discounting

    # Compute V-trace returns and weights.
    # Note, this is put on the CPU because it's faster than on GPU. It can be
    # improved further with XLA-compilation or with a custom TensorFlow
    # operation.
    with tf.device('/cpu'):
        vtrace_returns = vtrace.from_logits(
            behaviour_policy_logits=agent_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=agent_outputs.action,
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value)

    # Compute loss as a weighted sum of the baseline loss, the policy gradient
    # loss and an entropy regularization term.
    total_loss = compute_policy_gradient_loss(
        learner_outputs.policy_logits, agent_outputs.action,
        vtrace_returns.pg_advantages)
    total_loss += FLAGS.baseline_cost * compute_baseline_loss(
        vtrace_returns.vs - learner_outputs.baseline)
    total_loss += FLAGS.entropy_cost * compute_entropy_loss(
        learner_outputs.policy_logits)

    # Optimization
    num_env_frames = tf.train.get_global_step()
    learning_rate = tf.train.polynomial_decay(FLAGS.learning_rate,
                                              num_env_frames,
                                              FLAGS.total_environment_frames,
                                              0)
    optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                          FLAGS.momentum, FLAGS.epsilon)
    train_op = optimizer.minimize(total_loss)

    # Compute progress signal
    if FLAGS.progress_signal in ('reward', 'advantage'):
        # Keep returns at end of episodes.
        # Discard parts of the minibatch using other tasks than what the
        # Teacher expects.
        # For 'advantage', returns[t] - returns[t-k] is computed later, when
        # preparing to update the Teacher, so reward[t] is returned here too.
        episode_returns_correct_task = tf.boolean_mask(
            rewards,
            tf.logical_and(done,
                           tf.equal(infos.task_name, teacher_selected_task)))
        # Fall back to 0 when no matching episode ended in this batch.
        progress_signal = tf.where(
            tf.size(episode_returns_correct_task) > 0,
            x=tf.reduce_mean(episode_returns_correct_task,
                             name='progress_reward'),
            y=0)
    elif FLAGS.progress_signal == 'gradient_norm':
        # compute norm of gradients as the progress signal
        params = tf.trainable_variables()
        gradients = tf.gradients(total_loss, params)
        gradient_norm = tf.global_norm(gradients)
        # TODO renormalize gradients hack, should be done adaptively...
        progress_signal = tf.divide(
            gradient_norm, 500., name='progress_gradient_norm')
    else:
        progress_signal = tf.constant(0.)

    # Merge updating the network and environment frames into a single tensor.
    with tf.control_dependencies([train_op]):
        num_env_frames_and_train = num_env_frames.assign_add(
            FLAGS.batch_size * FLAGS.unroll_length * FLAGS.num_action_repeats)

    # Adding a few summaries.
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.histogram('action', agent_outputs.action)
    tf.summary.scalar('progress_signal', progress_signal)

    return done, infos, num_env_frames_and_train, progress_signal