import numpy as np
import chainer.functions as F
from chainer import Variable, optimizers, serializers
# NN (the network definition) and ou_process (the Ornstein-Uhlenbeck noise
# generator) are defined elsewhere in this code.


class Actor(object):
    """Deterministic policy network mu(s) with OU exploration noise."""

    def __init__(self, n_st, n_act):
        super(Actor, self).__init__()
        self.n_st = n_st
        self.n_act = n_act
        self.model = NN(n_st, n_act)
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)
        self.noise = ou_process(np.zeros((n_act), dtype=np.float32))

    def action(self, st, noise=False):
        # Deterministic action; optionally add OU noise and clip to [-1, 1].
        a = self.model(st, norm=True)
        if noise:
            n = next(self.noise)
            a = np.clip(a.data + n, -1, 1)
            return a
        else:
            return a.data

    def update(self, st, dqda):
        # Deterministic policy gradient: feed -dQ/da as the gradient of mu(s)
        # so that the Adam step performs gradient ascent on Q.
        mu = self.model(st, norm=True)
        self.model.cleargrads()
        mu.grad = -dqda
        mu.backward()
        self.optimizer.update()

    def update_target(self, tau, current_NN):
        # Soft update of the target weights towards the current network.
        self.model.weight_update(tau, current_NN)

    def save_model(self, outputfile):
        serializers.save_npz(outputfile, self.model)

    def load_model(self, inputfile):
        serializers.load_npz(inputfile, self.model)
class Critic(object):
    """Action-value network Q(s, a) taking the concatenated state-action vector."""

    def __init__(self, n_st, n_act):
        super(Critic, self).__init__()
        self.n_st = n_st
        self.n_act = n_act
        self.model = NN(n_st + n_act, 1)
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model)
        self.log = []

    def Q_value(self, st, act):
        state_action_vector = np.concatenate((st, act), axis=1)
        Q = self.model(state_action_vector).data
        return Q

    def return_dqda(self, st, act):
        # Gradient of Q with respect to the action part of the input,
        # obtained by backpropagating a gradient of ones through Q.
        state_action_vector = Variable(np.concatenate((st, act), axis=1))
        self.model.cleargrads()
        Q = self.model(state_action_vector)
        Q.grad = np.ones((state_action_vector.shape[0], 1), dtype=np.float32)
        Q.backward()
        grad = state_action_vector.grad[:, self.n_st:]
        return grad

    def update(self, y, st, act):
        # Minimize the mean squared error between Q(s, a) and the target y.
        self.model.cleargrads()
        state_action_vector = np.concatenate((st, act), axis=1)
        Q = self.model(state_action_vector)
        loss = F.mean_squared_error(Q, Variable(y))
        loss.backward()
        self.optimizer.update()
        self.log.append('Q:{0},y:{1}\n'.format(Q.data.T, y.T))
        return loss.data

    def update_target(self, tau, current_NN):
        # Soft update of the target weights towards the current network.
        self.model.weight_update(tau, current_NN)

    def save_model(self, outputfile):
        serializers.save_npz(outputfile, self.model)

    def load_model(self, inputfile):
        serializers.load_npz(inputfile, self.model)
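# --- Illustrative sketch, not part of the original classes ---
# Shows how the Actor/Critic methods above might be wired together in a
# single DDPG update step. The target networks (actor_target, critic_target),
# the minibatch arrays (st, act, rew, st_next, done; float32, batch-first),
# and the constants gamma and tau are assumptions made for this example only.
def ddpg_update_step(actor, critic, actor_target, critic_target,
                     st, act, rew, st_next, done, gamma=0.99, tau=0.001):
    # Bootstrapped critic target: y = r + gamma * Q'(s', mu'(s')) for
    # non-terminal transitions (done is 0/1 with shape (N, 1)).
    act_next = actor_target.action(st_next)
    Q_next = critic_target.Q_value(st_next, act_next)
    y = (rew + gamma * (1.0 - done) * Q_next).astype(np.float32)

    # Fit the critic to the target.
    critic.update(y, st, act)

    # Actor update: ascend dQ/da evaluated at a = mu(s).
    mu = actor.action(st)
    dqda = critic.return_dqda(st, mu)
    actor.update(st, dqda)

    # Soft updates of the target networks towards the current ones.
    actor_target.update_target(tau, actor.model)
    critic_target.update_target(tau, critic.model)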