# agent.py

import math
import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable
from replay_memory import ReplayMemory
from model import RainbowDQN
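
# This module implements a distributional (C51-style) DQN agent: the network
# ("brain") outputs, for every action, a categorical distribution over
# ATOM_SIZE return atoms z_i spaced evenly in [V_MIN, V_MAX].  Actions are
# chosen greedily by expected return, learning minimizes the cross-entropy to
# a projected Bellman target, and a target network plus a replay memory keyed
# by TD error round out the Rainbow-style components.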

EPSILON = 0.01   # epsilon value kept on the agent (not otherwise used in this file)
V_MIN = -1000    # lower bound of the return-distribution support
V_MAX = 10       # upper bound of the return-distribution support
ATOM_SIZE = 51   # number of atoms in the categorical distribution
gamma = 0.90     # discount factor


class Agent:
    def __init__(self, max_memory, batch_size, action_size, atom_size, input_size, kernel_size):
        # Fixed support z_1..z_N of the categorical value distribution.
        self.z = np.linspace(V_MIN, V_MAX, ATOM_SIZE)
        self.action_size = action_size
        self.epsilon = EPSILON
        self.batch_size = batch_size
        self.atom_size = atom_size
        self.memory = ReplayMemory(max_memory)
        # Online network and target network; the target starts as a copy of the
        # online weights and is re-synced at the end of each learn() call.
        self.brain = RainbowDQN(action_size=action_size, atom_size=atom_size,
                                input_size=input_size, kernel_size=kernel_size)
        self.target_brain = RainbowDQN(action_size=action_size, atom_size=atom_size,
                                       input_size=input_size, kernel_size=kernel_size)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.optim = optim.Adam(self.brain.parameters(), lr=0.001)

    def step(self, state_input):
        # Greedy action for the current state: run the online network and pick
        # the action with the highest expected return.
        probs = self.brain(state_input)
        best_action = self.select_best_action(probs)
        return best_action

    def select_best_action(self, probs):
        # Expected return per action, Q(s, a) = sum_i z_i * p_i(s, a),
        # then take the argmax over actions.
        numpy_probs = self.variable_to_numpy(probs)
        z_probs = np.multiply(numpy_probs, self.z)
        best_action = np.sum(z_probs, axis=1).argmax()
        # best_action = np.argmax(numpy_probs, axis=1)
        return best_action
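
    # New transitions are stored together with their one-step TD error, which
    # the replay memory uses as the sampling priority (prioritized replay).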
    def store_states(self, states, best_action, reward, done, next_states):
        td = self.calculate_td(states, best_action, reward, done, next_states)
        self.memory.add_memory(states, best_action, reward, done, next_states, td=td)

    def variable_to_numpy(self, probs):
        # Convert a Variable of softmax probabilities to a plain numpy array.
        numpy_probs = probs.data.numpy()
        return numpy_probs

    #TODO find out why td does not get -100 reward
    def calculate_td(self, states, best_action, reward, done, next_states):
        probs = self.brain(states)
        numpy_probs = self.variable_to_numpy(probs)
        # states_prob = np.multiply(numpy_probs, self.z)
        # states_q_value = np.sum(states_prob, axis=1)[best_action]
        states_q_value = numpy_probs[0][best_action]
        next_probs = self.brain(next_states)
        numpy_next_probs = self.variable_to_numpy(next_probs)
        # next_states_prob = np.multiply(numpy_next_probs, self.z)
        # max_next_states_q_value = np.sum(next_states_prob, axis=1).max()
        max_next_states_q_value = np.max(numpy_next_probs, axis=1)[0]
        if done:
            td = reward - states_q_value
        else:
            td = (reward + gamma * max_next_states_q_value) - states_q_value
        return abs(td)
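
    # learn() samples a prioritized batch, builds a target distribution by
    # projecting the Bellman-updated support r + gamma * z_i (clipped to
    # [V_MIN, V_MAX]) onto the fixed atoms, and minimizes the cross-entropy
    # between that target and the online network's output.  Priorities are
    # then refreshed and the target network is re-synced.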
    def learn(self):
        # Make sure there are at least batch_size transitions before training.
        if self.memory.count < self.batch_size:
            return
        tree_indexes, tds, batches = self.memory.get_memory(self.batch_size)
        total_loss = None
        for index, batch in enumerate(batches):
            # fixme fix this None type
            if batch is None:
                continue
            state_input = batch[0]
            best_action = batch[1]
            reward = batch[2]
            done = batch[3]
            next_state_input = batch[4]
            current_q = self.brain(state_input)
            next_best_action = self.step(next_state_input)
            # max_current_q = torch.max(current_q)
            next_z_prob = self.target_brain(next_state_input)
            next_z_prob = self.variable_to_numpy(next_z_prob)
            # target = reward + (1 - done) * gamma * next_z_prob.data[0][next_best_action]
            # target = Variable(torch.FloatTensor([target]))
            #TODO finish single dqn with per
            target_z_prob = np.zeros([self.action_size, ATOM_SIZE], dtype=np.float32)
            if done:
                # Terminal transition: the target distribution collapses onto the
                # atom(s) closest to the (clipped) reward.
                Tz = min(V_MAX, max(V_MIN, reward))
                b = (Tz - V_MIN) / (self.z[1] - self.z[0])
                m_l = math.floor(b)
                m_u = math.ceil(b)
                if m_l == m_u:
                    # b lands exactly on an atom; give it the full mass.
                    target_z_prob[best_action][m_l] += 1.0
                else:
                    target_z_prob[best_action][m_l] += (m_u - b)
                    target_z_prob[best_action][m_u] += (b - m_l)
            else:
                # Project each atom of the target network's distribution for the
                # greedy next action onto the fixed support.
                for z_index in range(self.atom_size):
                    Tz = min(V_MAX, max(V_MIN, reward + gamma * self.z[z_index]))
                    b = (Tz - V_MIN) / (self.z[1] - self.z[0])
                    m_l = math.floor(b)
                    m_u = math.ceil(b)
                    if m_l == m_u:
                        target_z_prob[best_action][m_l] += next_z_prob[next_best_action][z_index]
                    else:
                        target_z_prob[best_action][m_l] += next_z_prob[next_best_action][z_index] * (m_u - b)
                        target_z_prob[best_action][m_u] += next_z_prob[next_best_action][z_index] * (b - m_l)
            target_z_prob = Variable(torch.from_numpy(target_z_prob))
            # Backward propagate: cross-entropy between the projected target
            # distribution and the online network's output distribution.
            output_prob = self.brain(state_input)[0]
            loss = -torch.sum(target_z_prob * torch.log(output_prob + 1e-8))
            # loss = F.mse_loss(max_current_q, target)
            total_loss = loss if total_loss is None else total_loss + loss
            # Recompute the TD error so the replay memory can refresh this sample's priority.
            td = self.calculate_td(state_input, best_action, reward, done, next_state_input)
            tds[index] = td
        if total_loss is None:
            # Every sampled slot was empty; nothing to optimize this round.
            return
        self.optim.zero_grad()
        total_loss.backward()
        self.optim.step()
        # Sync the target network with the online network.
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.memory.update_memory(tree_indexes, tds)
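

# --------------------------------------------------------------------------
# Usage sketch (illustrative only).  The environment object, the preprocess()
# helper, and the constructor arguments below are assumptions, not something
# defined in this file; adapt them to model.py and the environment in use.
#
#   agent = Agent(max_memory=10000, batch_size=32, action_size=n_actions,
#                 atom_size=ATOM_SIZE, input_size=input_size, kernel_size=3)
#   state = preprocess(env.reset())            # torch tensor shaped for RainbowDQN
#   for _ in range(num_steps):
#       action = agent.step(state)             # greedy action from the online net
#       obs, reward, done, info = env.step(int(action))
#       next_state = preprocess(obs)
#       agent.store_states(state, action, reward, done, next_state)
#       agent.learn()
#       state = preprocess(env.reset()) if done else next_state
# --------------------------------------------------------------------------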