def __init__(self, action_str, model, gpu_id=-1, hidden_state_sz=512):
    self.actions = action_str
    self.gpu_id = gpu_id
    self.model = model
    self.hidden_state_sz = hidden_state_sz
    # Has the agent signalled 'Done'? Stays False forever when 'Done' is not among the actions.
    self.done = False
    if self.gpu_id >= 0:
        with torch.cuda.device(self.gpu_id):
            self.model = self.model.cuda()
    # Recurrent hidden state (h, c) and the last action distribution.
    self.hidden = [
        gpuify(torch.zeros(1, self.hidden_state_sz), gpu_id),
        gpuify(torch.zeros(1, self.hidden_state_sz), gpu_id)
    ]
    self.probs = gpuify(torch.zeros((1, len(self.actions))), gpu_id)
    # Per-rollout buffers filled while acting.
    self.log_pi_batch = []
    self.v_batch = []
    self.entropies = []
    self.embedding = None
    self.i_act = None
    self.learned_input = None
def reset_hidden(self):
    # Zero the most recent (h, c) entries and action distribution in the batch buffers.
    self.hidden_batch[0][-1] = gpuify(torch.zeros(1, self.hidden_state_sz), self.gpu_id)
    self.hidden_batch[1][-1] = gpuify(torch.zeros(1, self.hidden_state_sz), self.gpu_id)
    self.probs_batch[-1] = gpuify(torch.zeros((1, len(self.actions))), self.gpu_id)
def value_loss(batch_out,
               last_v,
               exps,
               gpu_id=-1,
               gamma=0.99,
               nsteps=100,
               on_off=1):
    """Simple masked value loss over a batch of experience; no entropy term, no GAE."""
    value_loss = 0
    v_batch = batch_out['value']
    td_target = list()
    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)
    threads = len(exps['action_idxs'][0])
    index_ = len(exps['rewards']) - 1
    if on_off:
        # On-policy: bootstrap from the value of the action actually taken.
        v2c = v_batch.gather(1, a_batch)
        last_a = torch.tensor(exps['last_action']).reshape(-1, 1)
        last_v = torch.tensor(last_v).reshape(threads, -1)
        R = last_v.gather(1, last_a).numpy().reshape(-1)
    else:
        # Off-policy: bootstrap from the greedy (max) value.
        v2c = v_batch.max(1)[0]
        R = last_v.reshape(threads, -1).max(1)
    v_batch = v_batch.gather(1, a_batch)
    v2c = v2c.detach().cpu().numpy().reshape(index_ + 1, -1)
    flag = 0
    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        index_ -= 1
        if flag < nsteps:
            # Within nsteps of the rollout end: plain discounted bootstrap from last_v.
            R = r + gamma * R * mask
        else:
            # Otherwise rebuild an n-step return from the value estimate nsteps ahead.
            tau = index_ + nsteps
            R = v2c[tau]
            for i in range(nsteps):
                R = exps['rewards'][tau - i - 1] + gamma * R * mask
        td_target.append(R)
        flag += 1
    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)
    value_loss = F.smooth_l1_loss(v_batch, td_target.detach(), reduction='sum')
    total_loss = value_loss
    return dict(total_loss=total_loss, value_loss=value_loss)
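# Shape note (an assumption read off the code above, not stated in the original
# source): value_loss treats batch_out['value'] as an action-indexed value head
# of shape (T * threads, n_actions); exps['rewards'] and exps['masks'] are
# per-step arrays of shape (threads,); exps['action_idxs'] is a (T, threads)
# list of taken actions; last_v and exps['last_action'] supply the bootstrap
# value for the step just after the rollout.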
def a2c_loss(
        batch_out,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  # discount factor for exps['rewards']
        nsteps=1,
):
    """Simple masked A2C loss over a batch of experience; no entropy term, no GAE."""
    policy_loss = 0
    value_loss = 0
    pi_batch, v_batch = batch_out['policy'], batch_out['value']
    R = last_v
    td_target = list()
    index_ = len(exps['rewards']) - 1
    flag = 0
    v2c = v_batch.detach().cpu().numpy()
    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        index_ -= 1
        if flag < nsteps:
            # Within nsteps of the rollout end: plain discounted bootstrap from last_v.
            R = r + gamma * R * mask
        else:
            # Otherwise rebuild an n-step return from the value estimate nsteps ahead.
            tau = index_ + nsteps
            R = v2c[tau]
            for i in range(nsteps):
                R = exps['rewards'][tau - i - 1] + gamma * R * mask
        td_target.append(R)
        flag += 1
    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)
    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)
    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)
    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    policy_loss = (-torch.log(pi_a) * advantage.detach()).sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss
    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
def loss_with_entro(
        batch_out,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  # discount factor for exps['rewards']
        beta=1e-2,  # entropy regularization term
):
    """Simple masked loss over a batch of experience, with an action-entropy bonus."""
    policy_loss = 0
    value_loss = 0
    R = last_v
    td_target = list()
    pi_batch, v_batch = batch_out['policy'], batch_out['value']
    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        R = r + gamma * R * mask
        td_target.append(R)
    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)
    a_batch = torch.tensor(exps['action_idxs'])
    threads = a_batch.shape[1]
    a_batch = gpuify(a_batch.reshape(-1, 1), gpu_id)
    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)
    prob = F.softmax(pi_batch, dim=1)
    log_prob = torch.log(prob)  # F.log_softmax(pi_batch, dim=1)
    log_pi_a = log_prob.gather(1, a_batch)
    # Element-wise -p * log p; its sum is the total policy entropy.
    entropies = -(log_prob * prob)
    policy_loss = (-log_pi_a * advantage.detach()).sum() - beta * entropies.sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss
    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
def basic_loss(
        batch_out,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  # discount factor for exps['rewards']
):
    """Simple masked loss over a batch of experience; no entropy term, no GAE."""
    policy_loss = 0
    value_loss = 0
    pi_batch, v_batch = batch_out['policy'], batch_out['value']
    R = last_v
    td_target = list()
    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        R = r + gamma * R * mask
        td_target.append(R)
    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)
    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)
    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)
    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    policy_loss = (-torch.log(pi_a) * advantage.detach()).sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss
    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
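# Minimal usage sketch for basic_loss (an assumption drawn from the shapes the
# code implies, not part of the original repository): rewards and masks are
# per-step arrays of shape (threads,), action_idxs is a (T, threads) list of
# taken actions, and the network outputs are flattened to (T * threads, ...).
# The helper name _basic_loss_demo and all synthetic values are hypothetical.
import numpy as np

def _basic_loss_demo(T=5, threads=2, n_actions=6):
    batch_out = {
        'policy': torch.randn(T * threads, n_actions, requires_grad=True),
        'value': torch.randn(T * threads, 1, requires_grad=True),
    }
    exps = {
        'rewards': [np.random.rand(threads) for _ in range(T)],
        'masks': [np.ones(threads) for _ in range(T)],  # 0.0 where an episode ended
        'action_idxs': [[np.random.randint(n_actions) for _ in range(threads)]
                        for _ in range(T)],
    }
    last_v = np.zeros(threads)  # bootstrap value for the step after the rollout
    losses = basic_loss(batch_out, last_v, exps, gpu_id=-1)
    losses['total_loss'].backward()
    return losses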
def a3c_loss(
        done,
        v_batch,
        pi_batch,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  # discount factor for exps['rewards']
        #tau = 1.00,  # parameter for GAE
        #beta = 1e-2,  # entropy regularization term
):
    """A3C-style loss without masks; bootstraps from last_v unless the episode is done."""
    R = 0.0 if done else last_v
    policy_loss = 0
    value_loss = 0
    td_target_lst = []
    for reward in exps['rewards'][::-1]:
        R = gamma * R + reward
        td_target_lst.append([R])
    td_target_lst.reverse()
    a_batch = torch.tensor(exps['action_idxs'])
    a_batch = gpuify(a_batch, gpu_id)
    td_target = torch.FloatTensor(td_target_lst)
    td_target = gpuify(td_target, gpu_id)
    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)
    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    policy_loss = -torch.log(pi_a) * advantage.detach()
    value_loss = 0.5 * F.smooth_l1_loss(v_batch, td_target.detach())
    total_loss = policy_loss + value_loss
    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
def basic_loss_no_mask(
        v_batch,
        pi_batch,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  # discount factor for exps['rewards']
):
    """Simple loss over a batch of experience without masks; no entropy term, no GAE."""
    R = last_v
    policy_loss = 0
    value_loss = 0
    td_target_lst = []
    for reward in exps['rewards'][::-1]:
        R = gamma * R + reward
        td_target_lst.append([R])
    td_target_lst.reverse()
    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)
    td_target = torch.FloatTensor(td_target_lst).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)
    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)
    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    policy_loss = (-torch.log(pi_a) * advantage.detach()).sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss
    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
def savn_loss(
        v_batch,
        log_pi_batch,
        entropies,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  # discount factor for exps['rewards']
        tau=1.00,  # parameter for GAE
        beta=1e-2,  # entropy regularization term
):
    """Loss used by SAVN; the experience is stored as Python lists, with entropy and GAE."""
    R = last_v
    v_batch.append(gpuify(torch.tensor(R), gpu_id))
    policy_loss = 0
    value_loss = 0
    gae = torch.zeros(1, 1)
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            gae = gae.cuda()
    for i in reversed(range(len(exps['rewards']))):
        R = gamma * R + exps['rewards'][i]
        advantage = R - v_batch[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        # Generalized Advantage Estimation: accumulate discounted TD errors.
        delta_t = (exps['rewards'][i] + gamma * v_batch[i + 1].detach() -
                   v_batch[i].detach())
        gae = gae * gamma * tau + delta_t
        policy_loss = (policy_loss - log_pi_batch[i] * gae -
                       beta * entropies[i])
    return dict(total_loss=policy_loss + 0.5 * value_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
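# The GAE recursion used in savn_loss, shown in isolation on plain Python
# floats as a minimal reference sketch (illustration only, not part of the
# original code): with delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), the
# advantage is A_t = sum_l (gamma * tau)^l * delta_{t+l}, computed backwards.
def gae_reference(rewards, values, last_v, gamma=0.99, tau=1.00):
    """Return per-step GAE advantages for a single un-masked rollout."""
    values = list(values) + [last_v]
    advantages = []
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = gamma * tau * gae + delta
        advantages.append(gae)
    return advantages[::-1]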
def reset_hidden(self):
    self.hidden = [
        gpuify(torch.zeros(1, self.hidden_state_sz), self.gpu_id),
        gpuify(torch.zeros(1, self.hidden_state_sz), self.gpu_id)
    ]
    self.probs = gpuify(torch.zeros((1, len(self.actions))), self.gpu_id)