import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
from keras import backend as K

# Actor, Critic, ReplayMemory and build_summaries are defined elsewhere in the repo.


class DDPGAgent:
    def __init__(self, state_size=28, action_size=2, gamma=0.9,
                 learning_rate_actor=0.0001, learning_rate_critic=0.01,
                 tau=0.001, action_max=[1000, 2], batch_size=32):
        self.state_size = state_size
        self.action_size = action_size
        self.action_max = action_max
        self.batch_size = batch_size
        self.memory = deque(maxlen=5000)  # replay buffer
        self.gamma = gamma  # discount rate
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.tau = tau  # target-network transfer factor

        # One TF session shared by actor and critic; GPU memory grows on demand.
        self.gpu_options = tf.GPUOptions()
        self.config = tf.ConfigProto(gpu_options=self.gpu_options)
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.config)
        K.set_session(self.sess)

        self.actor = Actor(state_size=self.state_size, action_size=self.action_size,
                           learning_rate=self.learning_rate_actor, tau=self.tau,
                           sess=self.sess, batch_size=self.batch_size,
                           action_max=self.action_max)
        self.critic = Critic(state_size=self.state_size, action_size=self.action_size,
                             learning_rate=self.learning_rate_critic, gamma=self.gamma,
                             tau=self.tau, sess=self.sess, batch_size=self.batch_size)

        # Diagnostics: running gradient statistics and critic loss history.
        self.grad_avg = 0
        self.grad_a = []
        self.critic_loss_a = []

    def policy_action(self, state):
        '''Actor predicts a new action for the given state.'''
        return self.actor.predict(state)[0]

    def replay(self, batch_size):
        '''Sample a minibatch from memory and update both networks.'''
        minibatch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in minibatch]).reshape(batch_size, self.state_size)
        actions = np.asarray([e[1] for e in minibatch]).reshape(batch_size, self.action_size)
        rewards = np.asarray([e[2] for e in minibatch]).reshape(batch_size, 1)
        next_states = np.asarray([e[3] for e in minibatch]).reshape(batch_size, self.state_size)

        # Bellman targets from the target networks. Note: terminal transitions
        # are not masked, so the critic bootstraps through episode ends.
        tar_pre = self.actor.target_predict(next_states)
        Qvals = self.critic.target_predict(next_states, tar_pre)
        Q_primes = rewards + self.gamma * Qvals
        self.update_models(states, actions, Q_primes)

    def update_models(self, states, actions, critic_target):
        '''Update actor and critic networks from sampled experience.'''
        # Train the critic against the Bellman targets.
        loss = self.critic.train_on_batch(states, actions, critic_target)
        self.critic_loss_a.append(loss)

        # dQ/da: gradient of the critic w.r.t. actions under the current policy.
        act = self.actor.predict(states)
        grads = self.critic.gradients(states, act)
        self.grad_avg += np.sum(np.log10(np.absolute(grads)), axis=0) / self.batch_size
        self.grad_a = np.append(self.grad_a,
                                np.sum(np.absolute(grads), axis=0) / self.batch_size,
                                axis=0)

        # Train the actor, then soft-update both target networks at rate tau.
        self.actor.train_2(states, grads.reshape((-1, self.action_size)))
        self.actor.transfer_to_actor_model()
        self.critic.transfer_to_critic_model()

    def remember(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def save_weights(self, directory, params):
        path_actor = directory + 'Weights' + params + '_LR{}'.format(self.learning_rate_actor)
        path_critic = directory + 'Weights' + params + '_LR{}'.format(self.learning_rate_critic)
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load_weights(path_actor)
        self.critic.load_weights(path_critic)

    def load_model(self, path_actor, path_critic):
        self.actor.model.load_model(path_actor)
        self.critic.model.load_model(path_critic)
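
# Usage sketch (illustrative, not part of the original file): how the agent is
# meant to be driven. `make_env` is a hypothetical helper, and the Gaussian
# exploration noise is an assumption; DDPG is off-policy, so other noise schemes
# (e.g. Ornstein-Uhlenbeck, as in the original DDPG paper) work equally well.
def run_agent_sketch(num_episodes=100):
    env = make_env()  # hypothetical: any Gym-style env with 28-dim states, 2-dim actions
    agent = DDPGAgent(state_size=28, action_size=2)
    for _ in range(num_episodes):
        state = env.reset().reshape(1, -1)
        done = False
        while not done:
            # Deterministic policy plus exploration noise (clipping to
            # action_max is omitted here for brevity).
            action = agent.policy_action(state) + np.random.normal(0, 0.1, size=2)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state.reshape(1, -1)
            agent.remember(state, action, reward, next_state)
            state = next_state
            if len(agent.memory) >= agent.batch_size:
                agent.replay(agent.batch_size)
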
def train():
    env = gym.make('LunarLander-v2')
    state = env.reset()
    actor = Actor(env.action_space, env.observation_space)
    critic = Critic(env.action_space, env.observation_space)
    actor.load()
    critic.load()
    replayMemory = ReplayMemory()

    # One session serves both the summary writer and the summary ops
    # (the original created a fresh tf.Session() at each use).
    sess = tf.Session()
    summary_ops, summary_vars = build_summaries()
    writer = tf.summary.FileWriter("./log", sess.graph)

    episode_reward = 0
    step = 1
    while True:
        # env.render()
        state1 = state[np.newaxis, :]
        action, action_matrix, prob = actor.predict(state1)
        next_state, reward, done, info = env.step(action)
        replayMemory.add(state, action_matrix, reward, done, next_state, prob)
        state = next_state
        episode_reward += reward

        # Train every 128 stored transitions and at episode end.
        if replayMemory.size() % 128 == 0 or done:
            state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniAll()
            reward_b = reward_b[:, np.newaxis]
            # One-step TD target with a hard-coded discount of 0.7. Note:
            # done_b is not used to mask terminal states, so values bootstrap
            # across episode ends.
            c_pre = critic.predict(next_state_b)
            state_pre_value = reward_b + c_pre * 0.7
            state_value = critic.predict(state_b)

            # A step-dependent schedule (5000 // step, clamped to [1, 500])
            # was tried here but overridden; the effective epoch count is 10.
            count = 10
            for _ in range(count):
                critic.train(state_b, state_pre_value)
            for _ in range(count):
                actor.train(state_b, state_value, state_pre_value,
                            action_matrix_b, prob_b)
            replayMemory.clear()

        if done:
            summary_str = sess.run(summary_ops,
                                   feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()
            # print("step =", step, "episode_reward =", episode_reward)
            state = env.reset()
            episode_reward = 0
            step += 1
            if step % 25 == 0:
                actor.save()
                critic.save()
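
# build_summaries() is called in train() but not defined in this file. A minimal
# TF1-style sketch of what it needs to provide: one feedable scalar for the
# episode reward. The "episode_reward" name and tag are assumptions; only the
# returned pair (merged summary op, list of placeholders) matters to train().
def build_summaries():
    episode_reward = tf.placeholder(tf.float32, shape=(), name="episode_reward")
    tf.summary.scalar("episode_reward", episode_reward)
    summary_ops = tf.summary.merge_all()  # merged op, run once per episode
    summary_vars = [episode_reward]       # placeholders fed from train()
    return summary_ops, summary_vars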