def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=True): states, actions, rewards, dones, next_states = common.unpack_batch(batch) states_v = torch.tensor(states).to(device) next_states_v = torch.tensor(next_states).to(device) actions_v = torch.tensor(actions).to(device) rewards_v = torch.tensor(rewards).to(device) done_mask = torch.tensor(dones, dtype=torch.bool).to(device) state_action_values = net(states_v).gather( 1, actions_v.unsqueeze(-1)).squeeze(-1) # If double # DQN is enabled, we calculate the best action to take in the next state using # our main trained network, but values corresponding to this action come from # the target network. if double: next_state_actions = net(next_states_v).max(1)[1] next_state_values = tgt_net(next_states_v).gather( 1, next_state_actions.unsqueeze(-1)).squeeze(-1) else: next_state_values = tgt_net(next_states_v).max(1)[0] next_state_values[done_mask] = 0.0 expected_state_action_values = next_state_values.detach( ) * gamma + rewards_v return nn.MSELoss()(state_action_values, expected_state_action_values)
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None): """ :param batch: :param net: :param tgt_net: :param gamma: :param device: :type device Union[str, torch.device] :param save_prefix: :return: """ states, actions, rewards, dones, next_states = Common.unpack_batch(batch) batch_size = len(batch) state_v = torch.tensor(states).to(device) actions_v = torch.tensor(actions, dtype=torch.long).to(device) next_states_v = torch.tensor(next_states).to(device) # need both probability distributions and Q-values for the next # states, so we use the both() call to the network, obtain the best actions to # take in the next state, apply softmax to the distribution, and convert it to the # array. next_distribute_v, next_qvals_v = tgt_net.both(next_states_v) next_actions = next_qvals_v.max(1)[1].data.cpu().numpy() next_distribute = tgt_net.apply_softmax( next_distribute_v).data.cpu().numpy() # extract distributions of the best actions and perform their projection # using the Bellman operator. The result of the projection will be target # distribution about what we want our network output next_best_distribute = next_distribute[range(batch_size), next_actions] dones = dones.astype(torch.bool) proj_distribute = Common.distribute_projection(next_best_distribute, rewards, dones, v_min, v_max, N_ATOMS, gamma) # compute the output of the network and # calculate KL-divergence between projected distribution and the network's # output for the taken actions. KL-divergence shows how much two # distributions differ distribute_v = net(state_v) state_action_values = distribute_v[range(batch_size), actions_v.data] state_log_sm_v = F.log_softmax(state_action_values, dim=1) proj_distribute_v = torch.tensor(proj_distribute).to(device) loss_v = -state_log_sm_v * proj_distribute_v return loss_v.sum(dim=1).mean()
    writer = SummaryWriter(comment="-" + params['run_name'] + "-noisy-net")
    net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    agent = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(),
                                device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch,
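The NoisyDQN network used above replaces the fully connected layers with noisy variants, which is why the agent can use a plain argmax selector instead of epsilon-greedy. Its definition is not part of this listing; as a rough sketch, an independent-Gaussian noisy layer could look like the following (the class name and the sigma_init value are assumptions for illustration, not the exact code behind NoisyDQN):

import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyLinear(nn.Linear):
    """Sketch of a noisy linear layer: trainable mu and sigma parameters,
    with Gaussian noise resampled on every forward pass."""
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super().__init__(in_features, out_features, bias=bias)
        self.sigma_weight = nn.Parameter(
            torch.full((out_features, in_features), sigma_init))
        self.register_buffer("epsilon_weight",
                             torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(
                torch.full((out_features,), sigma_init))
            self.register_buffer("epsilon_bias", torch.zeros(out_features))

    def forward(self, x):
        # Resample the noise, then apply the perturbed weights and bias.
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self.sigma_bias * self.epsilon_bias
        weight = self.weight + self.sigma_weight * self.epsilon_weight
        return F.linear(x, weight, bias)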
help="Enable double DQN") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params['env_name']) env = ptan.common.wrappers.wrap_dqn(env) writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double)) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params['epsilon_start']) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast( env, agent, gamma=params['gamma'], steps_count=1) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params['replay_size']) optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) frame_idx = 0 eval_states = None with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: while True: frame_idx += 1 buffer.populate(1)
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): """ Here we use a small trick to speed up our calculations a bit. As the double DQN method requires us to use our main network to select actions but use the target network to obtain values (in our case, value distributions) for those actions, we need to pass to our main network both the current states and the next states. Earlier, we calculated the network output in two calls, which is not very efficient on GPU. Now, we concatenate both current states and next states into one tensor and obtain the result in one network pass, splitting the result later. We need to calculate both Q-values and raw values' distributions, as our action selection policy is still greedy: we choose the action with the largest Q-value. :param batch: :param batch_weights: :param net: :param tgt_net: :param gamma: :param Union[str, torch.device] device: :return: """ states, actions, rewards, dones, next_states = common.unpack_batch(batch) batch_size = len(batch) states_v = torch.tensor(states).to(device) actions_v = torch.tensor(actions, dtype=torch.long).to(device) next_states_v = torch.tensor(next_states).to(device) batch_weights_v = torch.tensor(batch_weights).to(device) distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) next_qvals_v = qvals_v[batch_size:] distr_v = distr_v[:batch_size] # we decide on actions to take in the next state and # obtain the distribution of those actions using our target network. So, the # above net/tgt_net shuffling implements the double DQN method. Then we # apply softmax to distribution for those best actions and copy the data into # CPU to perform the Bellman projection. next_actions_v = next_qvals_v.max(1)[1] next_distr_v = tgt_net(next_states_v) next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) next_best_distr = next_best_distr_v.data.cpu().numpy() dones = dones.astype(np.bool) proj_distr = common.distribute_projection(next_best_distr, rewards, dones, v_min=Vmin, v_max=Vmax, n_atoms=N_ATOMS, gamma=gamma) # Here we obtain the distributions for taken actions and apply log_softmax to # calculate the loss state_action_values = distr_v[range(batch_size), actions_v.data] state_log_sm_v = F.log_softmax(state_action_values, dim=1) # calculate the KL-divergence loss, multiply # it by weights and return two quantities: combined loss to be used in the # optimizer step and individual loss values for batch, which will be used as # priorities in the replay buffer proj_distr_v = torch.tensor(proj_distr) loss_v = -state_log_sm_v * proj_distr_v loss_v = batch_weights_v * loss_v.sum(dim=1) return loss_v.mean(), loss_v + 1e-5
    agent = ptan.agent.DQNAgent(lambda x: net.qvals(x),
                                ptan.actions.ArgmaxActionSelector(),
                                device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS)
    buffer = ptan.experience.PrioritizedReplayBuffer(
        exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            # The call to buffer.populate(1) starts the following chain of
            # actions:
            # ExperienceReplayBuffer asks the experience source for the next
            # transition.
            # The experience source feeds the current observation to the agent
            # to obtain the action.
            # The agent applies the NN to the observation to calculate
            # Q-values, then asks the action selector to choose the action to
            # take.
            # The action selector decides which action to take; here it is an
            # argmax selector, as exploration comes from the noisy layers
            # rather than from epsilon-greedy randomness.
            # The action is returned to the experience source, which feeds it
            # into the environment to obtain the reward and the next
            # observation. All
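The beta value initialized from BETA_START controls how strongly the importance-sampling weights compensate for the non-uniform sampling of the prioritized buffer; it is usually annealed towards 1.0 over the course of training. A sketch of a linear schedule (the starting value 0.4 and the BETA_FRAMES horizon are assumed hyperparameters):

BETA_START = 0.4
BETA_FRAMES = 100000  # assumed annealing horizon

def update_beta(frame_idx):
    # Linearly increase beta from BETA_START to 1.0 over BETA_FRAMES frames.
    return min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)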
help="Count of steps to unroll Bellman") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params['env_name']) env = ptan.common.wrappers.wrap_dqn(env) writer = SummaryWriter(comment="-" + params['run_name'] + "-%d-step" % args.n) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params['epsilon_start']) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast( env, agent, gamma=params['gamma'], steps_count=args.n) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params['replay_size']) optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) frame_idx = 0 with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: while True: frame_idx += 1 buffer.populate(1) epsilon_tracker.frame(frame_idx)