vpg.py
# import dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import gym
from collections import deque
# define policy network
class policy_net(nn.Module):
    def __init__(self, nS, nH, nA):  # nS: state space size, nH: number of neurons in the hidden layer, nA: action space size
        super(policy_net, self).__init__()
        self.h = nn.Linear(nS, nH)
        self.out = nn.Linear(nH, nA)

    # forward pass: one hidden layer with ReLU activation and a softmax over the output layer
    def forward(self, x):
        x = F.relu(self.h(x))
        x = F.softmax(self.out(x), dim=1)
        return x
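
# the network maps a state to a categorical distribution over actions;
# for CartPole-v1 this is a 4-dimensional observation mapped to 2 action probabilities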
# create environment
env = gym.make("CartPole-v1")
# instantiate the policy
policy = policy_net(env.observation_space.shape[0], 20, env.action_space.n)
# create an optimizer
optimizer = torch.optim.Adam(policy.parameters())
# initialize gamma and stats
gamma = 0.99
n_episode = 1
returns = deque(maxlen=100)
render_rate = 100 # render every render_rate episodes
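
# REINFORCE training loop: collect one full episode with the current policy,
# compute the discounted returns, then take a single gradient step on the policy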
while True:
    rewards = []
    actions = []
    states = []
    # reset environment
    state = env.reset()
    while True:
        # render episode every render_rate episodes
        if n_episode % render_rate == 0:
            env.render()
        # calculate probabilities of taking each action
        probs = policy(torch.tensor(state).unsqueeze(0).float())
        # sample an action from that set of probabilities
        sampler = Categorical(probs)
        action = sampler.sample()
        # use that action in the environment
        new_state, reward, done, info = env.step(action.item())
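        # note: the 4-tuple return assumes the classic gym API; newer gymnasium releases
        # return (obs, reward, terminated, truncated, info) instead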
        # store state, action and reward
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = new_state
        if done:
            break
    # preprocess rewards
    rewards = np.array(rewards)
    # calculate rewards-to-go for lower variance
    R = torch.tensor([np.sum(rewards[i:] * (gamma ** np.array(range(i, len(rewards))))) for i in range(len(rewards))])
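    # i.e. R[t] = sum_{k=t}^{T-1} gamma**k * rewards[k] (the exponent is k, not k-t,
    # so each return also carries the gamma**t discount factor of its starting step)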
    # or uncomment the following line to use the undiscounted full-episode return instead
    # R = torch.sum(torch.tensor(rewards))
    # preprocess states and actions
    states = torch.tensor(states).float()
    actions = torch.tensor(actions)
    # calculate gradient
    probs = policy(states)
    sampler = Categorical(probs)
    log_probs = -sampler.log_prob(actions)  # "-" because the optimizer performs gradient descent, but we want gradient ascent
    pseudo_loss = torch.sum(log_probs * R)  # loss that, when differentiated with autograd, gives the gradient of J(θ)
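    # minimizing this pseudo-loss with gradient descent follows the REINFORCE estimate
    # grad J(θ) ≈ sum_t grad log π_θ(a_t|s_t) * R[t], i.e. gradient ascent on J(θ)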
    # update policy weights
    optimizer.zero_grad()
    pseudo_loss.backward()
    optimizer.step()
    # calculate average return and print it out
    returns.append(np.sum(rewards))
    print("Episode: {:6d}\tAvg. Return: {:6.2f}".format(n_episode, np.mean(returns)))
    n_episode += 1
# close environment
env.close()