def main():
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    torch.manual_seed(args.seed)

    # Fetch some data: MNIST train/test loaders with the standard normalization
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
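# main() reads its hyperparameters from a module-level `args` namespace that is
# defined elsewhere. A minimal argparse sketch consistent with the fields used
# above might look like the following; the default values are assumptions for
# illustration, not values taken from the original.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='PyTorch MNIST example')
    parser.add_argument('--batch-size', type=int, default=64)        # training batch size (assumed default)
    parser.add_argument('--test-batch-size', type=int, default=1000) # evaluation batch size (assumed default)
    parser.add_argument('--epochs', type=int, default=14)            # number of training epochs (assumed default)
    parser.add_argument('--lr', type=float, default=1.0)             # Adadelta learning rate (assumed default)
    parser.add_argument('--gamma', type=float, default=0.7)          # StepLR decay factor (assumed default)
    parser.add_argument('--no-cuda', action='store_true')            # disable CUDA even if available
    parser.add_argument('--seed', type=int, default=1)               # seed passed to torch.manual_seed
    parser.add_argument('--save-model', action='store_true')         # save mnist_cnn.pt after training
    return parser.parse_args()

# Typical usage, assuming Net, train, and test are defined elsewhere in the file:
#     args = parse_args()
#     main()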
class PolicyGradient:
    def __init__(
            self,
            s_dim,
            a_num,
            device,
            hidden,
            lr,
            gamma,
    ):
        # parameter initialization
        self.s_dim = s_dim
        self.a_num = a_num
        self.device = device
        self.hidden = hidden
        self.lr = lr
        self.gamma = gamma
        # network initialization
        self.net = Net(s_dim, hidden, a_num).to(self.device)
        self.opt = torch.optim.Adam(self.net.parameters(), lr=lr)
        # the memory only needs to store a single trajectory
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def get_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        prob_weights = self.net(s)
        # sample an action from the policy's action-probability distribution
        dist = Categorical(prob_weights)
        action = dist.sample().detach().item()
        return action

    def store_transition(self, s, a, r):
        self.memory_s.append(s)
        self.memory_a.append(a)
        self.memory_r.append(r)

    def learn(self):
        discounted_r = self._discounted_r(self.memory_r)

        s = torch.FloatTensor(self.memory_s).to(self.device)
        a = torch.LongTensor(self.memory_a).to(self.device)
        r = torch.FloatTensor(discounted_r).to(self.device)

        # REINFORCE loss: negative log-probability of the taken actions,
        # weighted by the (normalized) discounted returns
        prob = self.net(s)
        dist = Categorical(prob)
        loss = -torch.sum(dist.log_prob(a) * r)

        # train on the episode
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        # empty episode data
        self.memory_s = []
        self.memory_a = []
        self.memory_r = []

    def _discounted_r(self, r):
        # compute discounted returns backwards through the episode
        length = len(r)
        discounted_r = np.zeros(length)
        running_add = 0
        for t in range(length - 1, -1, -1):
            running_add = r[t] + running_add * self.gamma
            discounted_r[t] = running_add
        # normalize episode returns to zero mean and unit variance
        discounted_r -= np.mean(discounted_r)
        discounted_r /= np.std(discounted_r)
        return discounted_r
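# A minimal sketch of how PolicyGradient is typically driven with a Gym-style
# environment. The environment name, hidden width, learning rate, discount
# factor, and episode budget below are assumptions for illustration, not values
# taken from the original.
import gym
import torch

env = gym.make('CartPole-v1')                      # assumed environment
agent = PolicyGradient(
    s_dim=env.observation_space.shape[0],          # state dimension
    a_num=env.action_space.n,                      # number of discrete actions
    device=torch.device('cpu'),
    hidden=32,                                     # assumed hidden-layer width
    lr=1e-3,                                       # assumed learning rate
    gamma=0.99,                                    # assumed discount factor
)

for episode in range(500):                         # assumed episode budget
    s = env.reset()
    done = False
    while not done:
        a = agent.get_action(s)                    # sample an action from the policy
        s_next, r, done, info = env.step(a)        # classic Gym API: 4-tuple return
        agent.store_transition(s, a, r)            # buffer the transition for this episode
        s = s_next
    agent.learn()                                  # one REINFORCE update per finished episode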