def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    """Distributional (C51) loss with prioritized-replay importance weights.

    Args:
        batch: list of transitions, unpacked by common.unpack_batch into
            (states, actions, rewards, dones, next_states) arrays.
        batch_weights: per-sample importance-sampling weights from the
            prioritized replay buffer.
        net: online network; its .both() returns (distribution logits,
            Q-values) for a batch of states (dueling architecture).
        tgt_net: target network; callable on states and exposing
            apply_softmax() to normalize distribution logits.
        gamma: discount factor used in the Bellman projection.
        device: torch device string for the created tensors.

    Returns:
        (mean_loss, per_sample_loss + 1e-5); the small epsilon keeps the
        values strictly positive so they can be used as replay priorities.
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    # Single forward pass over current+next states; next-state actions come
    # from the online net (double-DQN), distribution from the target net.
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    # FIX: np.bool was deprecated and removed in NumPy 1.24; use builtin bool.
    dones = dones.astype(bool)

    # Project the next-state distribution through the Bellman update.
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    # FIX: the projected distribution must be on the same device as the
    # network output, otherwise the elementwise product fails on CUDA.
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None):
    """Distributional (C51) cross-entropy loss, optionally saving debug images.

    Args:
        batch: list of transitions, unpacked by common.unpack_batch.
        net: online network returning per-action distribution logits.
        tgt_net: target network; .both() returns (distribution logits,
            Q-values), apply_softmax() normalizes the logits.
        gamma: discount factor for the Bellman projection.
        device: torch device string for the created tensors.
        save_prefix: if not None, save_transition_images() is called with
            the predicted and projected distributions for visualization.

    Returns:
        Scalar mean cross-entropy loss between the predicted and the
        Bellman-projected distributions.
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution: greedy actions from target Q-values
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_actions = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy()
    next_best_distr = next_distr[range(batch_size), next_actions]
    # FIX: np.bool was deprecated and removed in NumPy 1.24; use builtin bool.
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    distr_v = net(states_v)
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    if save_prefix is not None:
        pred = F.softmax(state_action_values, dim=1).data.cpu().numpy()
        save_transition_images(batch_size, pred, proj_distr,
                               next_best_distr, dones, rewards, save_prefix)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    """Distributional (C51) loss with priority weights (legacy Variable API).

    Args:
        batch: list of transitions, unpacked by common.unpack_batch.
        batch_weights: per-sample importance-sampling weights from the
            prioritized replay buffer.
        net: online network; .both() returns (distribution logits, Q-values).
        tgt_net: target network with apply_softmax().
        gamma: discount factor for the Bellman projection.
        cuda: if True, move all tensors to the default CUDA device.

    Returns:
        (mean_loss, per_sample_loss + 1e-5); the epsilon keeps values
        strictly positive for use as replay priorities.
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = Variable(torch.from_numpy(states))
    actions_v = Variable(torch.from_numpy(actions))
    next_states_v = Variable(torch.from_numpy(next_states))
    batch_weights_v = Variable(torch.from_numpy(batch_weights))
    if cuda:
        states_v = states_v.cuda()
        actions_v = actions_v.cuda()
        next_states_v = next_states_v.cuda()
        batch_weights_v = batch_weights_v.cuda()

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net
    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    # FIX: np.bool was deprecated and removed in NumPy 1.24; use builtin bool.
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    # FIX: pass dim explicitly -- the implicit-dim form is deprecated; for a
    # 2-D input the historical default was dim=1, so behavior is unchanged.
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = Variable(torch.from_numpy(proj_distr))
    if cuda:
        proj_distr_v = proj_distr_v.cuda()

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"):
    """Distributional (C51) loss with prioritized-replay importance weights.

    Args:
        batch: list of transitions, unpacked by common.unpack_batch.
        batch_weights: per-sample importance-sampling weights from the
            prioritized replay buffer.
        net: online network; .both() returns (distribution logits, Q-values)
            for a batch of states (dueling architecture).
        tgt_net: target network; callable on states, with apply_softmax().
        gamma: discount factor for the Bellman projection.
        device: torch device string for the created tensors.

    Returns:
        (mean_loss, per_sample_loss + 1e-5); the epsilon keeps values
        strictly positive for use as replay priorities.
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    batch_weights_v = torch.tensor(batch_weights).to(device)

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net
    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    # FIX: np.bool was deprecated and removed in NumPy 1.24; use builtin bool.
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = batch_weights_v * loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    """Distributional (C51) loss for the dueling architecture (no priorities).

    Args:
        batch: list of transitions, unpacked by common.unpack_batch.
        net: online network; .both() returns (distribution logits, Q-values)
            for a batch of states.
        tgt_net: target network; callable on states, with apply_softmax().
        gamma: discount factor for the Bellman projection.
        device: torch device string for the created tensors.

    Returns:
        (mean_loss, per_sample_loss + 1e-5); the epsilon keeps the
        per-sample values strictly positive.
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    # dueling arch -- actions from main net, distr from tgt_net
    # calc at once both next and cur states
    distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v)))
    next_qvals_v = qvals_v[batch_size:]
    distr_v = distr_v[:batch_size]

    next_actions_v = next_qvals_v.max(1)[1]
    next_distr_v = tgt_net(next_states_v)
    next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data]
    next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v)
    next_best_distr = next_best_distr_v.data.cpu().numpy()

    # FIX: np.bool was deprecated and removed in NumPy 1.24; use builtin bool.
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones,
                                         Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    loss_v = -state_log_sm_v * proj_distr_v
    loss_v = loss_v.sum(dim=1)
    return loss_v.mean(), loss_v + 1e-5
plt.bar(p, src, width=0.5) plt.title("Source") plt.subplot(2, 1, 2) plt.bar(p, proj, width=0.5) plt.title("Projected") plt.savefig(name + ".png") if __name__ == "__main__": np.random.seed(123) atoms = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) # single peak distribution src_hist = np.zeros(shape=(1, N_ATOMS), dtype=np.float32) src_hist[0, N_ATOMS//2+1] = 1.0 proj_hist = common.distr_projection(src_hist, np.array([2], dtype=np.float32), np.array([False]), Vmin, Vmax, N_ATOMS, gamma=0.9) save_distr(src_hist[0], proj_hist[0], "peak-r=2") # normal distribution data = np.random.normal(size=1000, scale=3) hist = np.histogram(data, normed=True, bins=np.arange(Vmin - DELTA_Z/2, Vmax + DELTA_Z*3/2, DELTA_Z)) src_hist = hist[0] proj_hist = common.distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([False]), Vmin, Vmax, N_ATOMS, gamma=0.9) save_distr(hist[0], proj_hist[0], "normal-r=2") # normal distribution, but done episode proj_hist = common.distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([True]), Vmin, Vmax, N_ATOMS, gamma=0.9) save_distr(hist[0], proj_hist[0], "normal-done-r=2")