import collections
import random

import numpy as np


class ActorCritic:
    def __init__(self, env, discount):
        self.discount = discount
        # Replay buffer of (s, a, r, s_, done) transitions.
        self.memory = collections.deque(maxlen=MEMORY_SIZE)
        # Policy, Value and the MEMORY_SIZE / HIDDEN_UNITS / BATCH_SIZE
        # constants are defined elsewhere.
        self.policy = Policy(env, HIDDEN_UNITS)
        self.value = Value(env, HIDDEN_UNITS)

    def get_action(self, s, sess):
        return self.policy.get_action(s, sess)

    def get_value(self, s, sess):
        return self.value.get_value(s, sess)

    def on_reward(self, s, a, r, s_, done):
        self.memory.append((s, a, r, s_, done))

    def train(self, lr_policy, lr_value, sess):
        batch_size = min(len(self.memory), BATCH_SIZE)
        samples = random.sample(self.memory, batch_size)

        # Unpack the sampled transitions into batched arrays
        # (the state here is 2-dimensional).
        ss = np.zeros(shape=[batch_size, 2])
        ss_ = np.zeros(shape=[batch_size, 2])
        acts = np.zeros(shape=[batch_size, 1])
        for i, (s, a, r, s_, done) in enumerate(samples):
            ss[i] = s
            ss_[i] = s_
            acts[i] = a

        # Current value estimates V(s) and bootstrap values V(s_).
        pvs = self.value.get_value(ss, sess)
        vs_ = self.value.get_value(ss_, sess)

        # One-step TD targets: r + discount * V(s_), truncated at episode end.
        vs = np.zeros(shape=[batch_size, 1])
        for i, (s, a, r, s_, done) in enumerate(samples):
            vs[i] = r
            if not done:
                vs[i] += self.discount * vs_[i]

        # Advantage = TD target - V(s); it weights the policy gradient.
        advantages = vs - pvs

        self.value.train(ss, vs, lr_value, sess)
        self.policy.train(ss, acts, advantages, lr_policy, sess)
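For context, here is a minimal sketch of how the class might be driven, assuming a TensorFlow 1.x session, the Policy and Value networks built earlier, and a Gym environment with a 2-dimensional state such as MountainCarContinuous-v0; the loop structure and the NUM_EPISODES, LR_POLICY, and LR_VALUE names are illustrative, not constants from the original code.

# A minimal usage sketch, not part of the original listing: the environment
# choice, episode count, and learning rates below are illustrative.
import gym
import tensorflow as tf

env = gym.make('MountainCarContinuous-v0')  # 2-dim state, 1-dim action
agent = ActorCritic(env, discount=0.99)

NUM_EPISODES = 500                 # hypothetical training length
LR_POLICY, LR_VALUE = 1e-4, 1e-3   # hypothetical learning rates

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for episode in range(NUM_EPISODES):
        s = env.reset()
        done = False
        while not done:
            a = agent.get_action(s, sess)
            s_, r, done, _ = env.step(a)
            agent.on_reward(s, a, r, s_, done)
            s = s_
        # Fit the critic to TD targets and step the actor along the
        # advantage-weighted policy gradient once per episode.
        agent.train(LR_POLICY, LR_VALUE, sess)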