class Agent(object):
    def __init__(self, state_size, action_size, access_size=ACCESS_SIZE):
        self.state_size = state_size
        self.action_size = action_size
        self.noise = Noise(action_size)            # exploration noise process
        self.access = Access(access_size)          # replay buffer

        # actor (policy) network and its target copy
        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LR_ACTOR)

        # critic (Q-value) network and its target copy
        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LR_CRITIC)

        if torch.cuda.is_available():
            self.actor.cuda()
            self.target_actor.cuda()
            self.critic.cuda()
            self.target_critic.cuda()

    @staticmethod
    def _soft_update(target, source, tau=1e-3):
        # Polyak averaging: target <- (1 - tau) * target + tau * source
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(t.data * (1.0 - tau) + s.data * tau)

    @staticmethod
    def _hard_update(target, source):
        # copy source parameters into target verbatim
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(s.data)

    def __call__(self, *args, **kwargs):
        return self.get_policy(*args)

    def append(self, *args):
        self.access.append(*args)

    def sample(self, *args):
        return self.access.sample(*args)

    def get_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state))).cuda()
        action = self.actor(state).detach()
        return action.data.cpu().numpy()

    def get_noise(self):
        return self.noise()

    def optimize(self, batch_size=64):
        batch = self.sample(batch_size)
        state, action, reward, _, next_state = \
            [Variable(torch.from_numpy(np.float32(i))).cuda() for i in batch]

        # critic update: minimize the MSE between Q(s, a) and the TD target
        next_action = self.target_actor(next_state).detach()
        next_value = torch.squeeze(
            self.target_critic(next_state, next_action).detach())
        target_value = reward + GAMMA * next_value
        value = torch.squeeze(self.critic(state, action))
        loss_critic = nf.mse_loss(value, target_value)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # actor update: maximize Q(s, actor(s)), i.e. minimize its negative
        policy_action = self.actor(state)
        loss_actor = -1 * torch.sum(self.critic(state, policy_action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # let the target networks slowly track the learned networks
        self._soft_update(self.target_actor, self.actor, TAU)
        self._soft_update(self.target_critic, self.critic, TAU)

    def restore_models(self, num_episode):
        self.actor.load_state_dict(
            torch.load("actor_{}.pkl".format(num_episode)))
        self.critic.load_state_dict(
            torch.load("critic_{}.pkl".format(num_episode)))
        self._hard_update(self.target_actor, self.actor)
        self._hard_update(self.target_critic, self.critic)

    def save_models(self, num_episode):
        torch.save(self.target_actor.state_dict(),
                   "actor_{}.pkl".format(num_episode))
        torch.save(self.target_critic.state_dict(),
                   "critic_{}.pkl".format(num_episode))
        print('Models saved successfully')
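# Usage sketch (not part of the original code): one possible training loop for the
# DDPG-style Agent above. It assumes a Gym-style environment ("Pendulum-v0" is an
# illustrative choice), that Access stores (state, action, reward, done, next_state)
# transitions and defines __len__, and that the constants used by the class
# (ACCESS_SIZE, LR_ACTOR, LR_CRITIC, GAMMA, TAU) are defined elsewhere in the project.
import gym

def train_ddpg(num_episodes=100, max_steps=200, batch_size=64):
    env = gym.make("Pendulum-v0")                      # hypothetical task
    agent = Agent(env.observation_space.shape[0], env.action_space.shape[0])
    for episode in range(num_episodes):
        state = env.reset()
        for step in range(max_steps):
            # act greedily, then add exploration noise outside the network
            action = agent.get_policy(state) + agent.get_noise()
            next_state, reward, done, _ = env.step(action)
            agent.append(state, action, reward, float(done), next_state)
            state = next_state
            if len(agent.access) >= batch_size:        # wait until the buffer can fill a batch
                agent.optimize(batch_size)
            if done:
                break
        if episode % 10 == 0:
            agent.save_models(episode)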
class Agent(object):
    def __init__(self, image_shape, output_size,
                 capacity=int(1e6), learning_rate=1e-3):
        self.output_size = output_size
        self.access = Access(capacity)                 # replay buffer

        # online value network and its target copy
        self.value_net = DQN(image_shape, output_size)
        self.target_net = deepcopy(self.value_net)

        # use the GPU automatically when one is available
        self.gpu = torch.cuda.is_available()
        if self.gpu:
            self.value_net.cuda()
            self.target_net.cuda()

        self.optimizer = torch.optim.Adam(self.value_net.parameters(),
                                          lr=learning_rate)
        self.loss_func = nn.MSELoss()

    def get_deterministic_policy(self, x):
        # greedy action: argmax over the predicted Q-values
        x = Variable(torch.from_numpy(x.astype(np.float32)))
        if not self.gpu:
            out = self.value_net(x).data.numpy()
            return np.argmax(out, axis=1)
        else:
            x = x.cuda()
            out = self.value_net(x)
            out = out.cpu().data.numpy()
            return np.argmax(out, axis=1)

    def get_stochastic_policy(self, x):
        # sample an action from the softmax over Q-values
        x = Variable(torch.from_numpy(x.astype(np.float32)))
        if not self.gpu:
            out = softmax(self.value_net(x), 1)
            out = out.data.numpy()
            return np.random.choice(self.output_size, 1, p=out[0])[0]
        else:
            x = x.cuda()
            out = softmax(self.value_net(x), 1)
            out = out.cpu().data.numpy()
            return np.random.choice(self.output_size, 1, p=out[0])[0]

    def get_epsilon_policy(self, x, epsilon=0.9):
        # with probability (1 - epsilon) act uniformly at random,
        # otherwise follow the stochastic (softmax) policy
        if np.random.uniform() > epsilon:
            return np.random.randint(self.output_size)
        else:
            return self.get_stochastic_policy(x)

    def optimize(self, batch_size=64, gamma=.9):
        batch = self.sample(batch_size)
        if self.gpu:
            state, action, reward, done, next_state = \
                [Variable(torch.from_numpy(np.float32(i))).cuda() for i in batch]
            action = action.type(torch.LongTensor).cuda()
        else:
            state, action, reward, done, next_state = \
                [Variable(torch.from_numpy(np.float32(i))) for i in batch]
            action = action.type(torch.LongTensor)

        # Q(s, a) for the actions actually taken
        value = self.value_net(state).gather(1, action.unsqueeze(1))
        # max_a' Q_target(s', a'), detached so no gradient flows into the target net
        next_value = self.target_net(next_state).detach()
        next_value = next_value.max(1)[0].view([-1, 1])
        value = value.squeeze(1)
        next_value = next_value.squeeze(1)

        # TD target; terminal transitions bootstrap from the reward only
        target = done * reward + (1 - done) * (reward + gamma * next_value)
        loss = self.loss_func(value, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def _update_target(self):
        # hard update: copy the online network parameters into the target network
        for t, s in zip(self.target_net.parameters(),
                        self.value_net.parameters()):
            t.data.copy_(s.data)

    def append(self, *args):
        self.access.append(*args)

    def sample(self, batch_size=128):
        return self.access.sample(batch_size)
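# Usage sketch (not part of the original code): a minimal training loop for the
# DQN-style Agent above. The environment name and loop parameters are illustrative
# assumptions. Note that the class never calls _update_target itself, so the caller
# is responsible for refreshing the target network periodically, as shown here.
import gym
import numpy as np

def train_dqn(num_episodes=500, batch_size=64, update_every=100):
    env = gym.make("Breakout-v0")                      # hypothetical image-based task
    agent = Agent(env.observation_space.shape, env.action_space.n)
    step_count = 0
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            # behaviour policy defined by the class (random with prob. 1 - epsilon)
            action = agent.get_epsilon_policy(state[np.newaxis])
            next_state, reward, done, _ = env.step(action)
            agent.append(state, action, reward, float(done), next_state)
            state = next_state
            step_count += 1
            if step_count >= batch_size:               # wait until the buffer can fill a batch
                agent.optimize(batch_size)
            if step_count % update_every == 0:
                agent._update_target()                 # hard-copy online weights into the target net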
class Agent(object):
    def __init__(self, state_size, action_size, access_size=1024):
        self.state_size = state_size
        self.action_size = action_size
        self.noise = Noise(action_size)            # exploration noise process
        self.access = Access(access_size)          # replay buffer

        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LEARNING_RATE)

        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LEARNING_RATE)

        if torch.cuda.is_available():
            self.actor.cuda()
            self.target_actor.cuda()
            self.critic.cuda()
            self.target_critic.cuda()

    def __call__(self, *args, **kwargs):
        return self.get_exploration_policy(*args)

    def append(self, *args):
        self.access.append(*args)

    def sample(self, *args):
        return self.access.sample(*args)

    def get_exploitation_policy(self, state):
        # greedy action from the target actor, without exploration noise
        state = Variable(torch.from_numpy(np.float32(state))).cuda()
        action = self.target_actor(state).detach()
        return action.data.cpu().numpy()

    def get_exploration_policy(self, state):
        # action from the online actor, perturbed by exploration noise
        state = Variable(torch.from_numpy(np.float32(state))).cuda()
        action = self.actor(state).detach()
        return action.data.cpu().numpy() + self.noise()

    def optimize(self, batch_size=64):
        batch = self.sample(batch_size)
        state, action, reward, _, next_state = \
            [Variable(torch.from_numpy(np.float32(i))).cuda() for i in batch]

        # critic update: smooth L1 (Huber) loss against the TD target
        next_action = self.target_actor(next_state).detach()
        next_value = torch.squeeze(
            self.target_critic(next_state, next_action).detach())
        target_value = reward + GAMMA * next_value
        value = torch.squeeze(self.critic(state, action))
        loss_critic = nf.smooth_l1_loss(value, target_value)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # actor update: ascend the critic's estimate of Q(s, actor(s))
        policy_action = self.actor(state)
        loss_actor = -1 * torch.sum(self.critic(state, policy_action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        soft_update(self.target_actor, self.actor, TAU)
        soft_update(self.target_critic, self.critic, TAU)

    def restore_models(self, num_episode):
        self.actor.load_state_dict(
            torch.load("./Models/{}_actor.pkl".format(num_episode)))
        self.critic.load_state_dict(
            torch.load("./Models/{}_critic.pkl".format(num_episode)))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def save_models(self, num_episode):
        torch.save(self.target_actor.state_dict(),
                   "actor_{}.pkl".format(num_episode))
        torch.save(self.target_critic.state_dict(),
                   "critic_{}.pkl".format(num_episode))
        print('Models saved successfully')
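# This variant calls module-level soft_update / hard_update helpers rather than the
# static methods of the first Agent class, and their definitions are not shown in
# this section. A minimal sketch consistent with the earlier _soft_update and
# _hard_update methods would look like this (assumed, not taken from the original):
def soft_update(target, source, tau=1e-3):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)

def hard_update(target, source):
    # copy source parameters into target verbatim
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)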