def play():
    print("play")
    env = gym.make('LunarLander-v2')
    state = env.reset()
    actor = Actor(env.action_space, env.observation_space)
    actor.load()
    #critic = Critic(env.action_space, env.observation_space)
    #replayMemory = ReplayMemory()
    #summary_ops, summary_vars = build_summaries()
    #writer = tf.summary.FileWriter("./log", tf.Session().graph)
    #episode_reward = 0
    #step = 1
    while True:
        env.render()
        state1 = state[np.newaxis, :]
        action, action_matrix, prob = actor.predict(state1)
        next_state, reward, done, info = env.step(action)
        #replayMemory.add(state, action_matrix, reward, done, next_state, prob)
        state = next_state
        if done:
            #summary_str = tf.Session().run(summary_ops, feed_dict={summary_vars[0]: episode_reward})
            #writer.add_summary(summary_str, step)
            #writer.flush()
            state = env.reset()
    return 0
actor = Actor(sess, state_size, action_size)
critic = Critic(sess, state_size, action_size)
buffer = ReplayBuffer(BUFFER_SIZE)
env.monitor.start('experiments/' + 'Pendulum-v0', force=True)

for ep in range(10000):
    state = env.reset()
    total_reward = 0
    # what if the action is beyond the scope?
    for iteration in range(100):
        # select the action with the actor model, plus decaying exploration noise
        env.render()
        action = actor.predict([state])[0] + (np.random.randn(1) / (ep + iteration + 1))
        newState, reward, terminated, _ = env.step(action)
        total_reward += reward
        buffer.add(state, action, reward, newState, terminated)  # state, action, reward, new_state, done

        # update critic
        batch = buffer.getBatch(batch_size=BATCH_SIZE)
        states = np.array([e[0] for e in batch])
        actions = np.array([e[1] for e in batch])
        rewards = np.array([e[2] for e in batch])
        newStates = np.array([e[3] for e in batch])
        notTerminated = np.array([1. - e[4] for e in batch])
        newStatesScores = critic.target_predict_method(newStates, actor.target_predict_method(newStates))
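# Sketch only (not from the original source): a minimal, self-contained helper
# illustrating how the critic's TD targets could be formed from the arrays built
# above. The function name and the discount factor `gamma` are assumptions.
import numpy as np

def ddpg_targets(rewards, not_terminated, new_states_scores, gamma=0.99):
    """Return y_i = r_i + gamma * (1 - done_i) * Q'(s'_i, mu'(s'_i))."""
    rewards = rewards.reshape(-1, 1)
    not_terminated = not_terminated.reshape(-1, 1)
    new_states_scores = new_states_scores.reshape(-1, 1)
    return rewards + gamma * not_terminated * new_states_scores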
replayMemory = ReplayMemory()
summary_ops, summary_vars = build_summaries()
writer = tf.summary.FileWriter("./log", tf.Session().graph)
episode_reward = 0
step = 0
while True:
    #env.render()
    state1 = state[np.newaxis, :]
    action, action_matrix = actor.predict(state1)
    next_state, reward, done, info = env.step(action)
    replayMemory.add(state, action_matrix, reward, done, next_state)
    state = next_state
    episode_reward += reward
    # train
    if replayMemory.size() % 128 == 0 or done:
        state_b, action_matrix_b, reward_b, done_b, next_state_b = replayMemory.miniAll()
        reward_b = reward_b[:, np.newaxis]
state1 = state[np.newaxis, :]
action, action_matrix, prob = actor.act(state1)
next_state, reward, done, info = env.step(action)
replayMemory.add(state, action, reward, done, next_state, prob)
state = next_state
episode_reward += reward
############################## train ######################
if replayMemory.size() >= 128:
    state_b, action_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniBatch(int(64))
    next_state_b_value = actor.predict(next_state_b)
    state_b_value = actor.predict(state_b)
    length = state_b.shape[0]
    for i in range(length):
        # one-step TD target: r + gamma * max_a Q(s', a), with gamma = 0.7
        target_next = reward_b[i]
        if not done_b[i]:
            action_values = next_state_b_value[i]
            target_next = reward_b[i] + 0.7 * np.amax(action_values)
        state_b_value[i][action_b[i]] = target_next
    actor.train(state_b, state_b_value)
if done:
    summary_str = tf.Session().run(
        summary_ops, feed_dict={summary_vars[0]: episode_reward})
    writer.add_summary(summary_str, step)
class Agent:
    def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC,
                 NET_SIZE, MEMORY_LEN, REWARD_DISCOUNT, BATCH_SIZE, TAU,
                 EXPLORATION_STEPS, VERBOSE, LOG_DIR_TF):
        self.env = env
        self.sess = sess
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]
        self.REWARD_DISCOUNT = REWARD_DISCOUNT
        self.TAU = TAU
        self.BATCH_SIZE = BATCH_SIZE
        self.noise_state = np.zeros(self.action_space)
        self.EXPLORATION_STEPS = EXPLORATION_STEPS
        self.VERBOSE = VERBOSE
        self.LOG_DIR_TF = LOG_DIR_TF
        # check if the action space is symmetric
        if all(env.action_space.high == abs(env.action_space.low)):
            action_scale = env.action_space.high
        else:
            raise ActionSpaceNotSymmetricException
        self.actor = Actor(self.sess, self.observation_space, self.action_space,
                           LEARNING_RATE_ACTOR, NET_SIZE, TAU, action_scale)
        self.critic = Critic(self.sess, self.observation_space, self.action_space,
                             LEARNING_RATE_CRITIC, NET_SIZE, TAU)
        actor_network_variables = self.actor.network.get_variables()
        critic_q_net_variables = self.critic.q_net.get_variables()
        self.actor_target_update = self.actor.target_network.update_variables(
            actor_network_variables)
        self.critic_target_update = self.critic.target_q_net.update_variables(
            critic_q_net_variables)
        self.reward_pl = tf.placeholder(tf.float32, [None, 1], name='Reward_PL')
        self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
        # label is the plain reward for terminal states, otherwise the bootstrapped target
        self.labels = tf.where(
            self.done_pl, self.reward_pl,
            self.reward_pl + tf.multiply(self.REWARD_DISCOUNT,
                                         self.critic.target_prediction))
        #self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
        self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                          self.observation_space,
                                          self.action_space)
        self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
        self.reward_f = tf.add(0.0, self.log_reward_pl)
        tf.summary.scalar('reward', self.reward_f)
        init = tf.global_variables_initializer()
        self.sess.run(init)
        self.sess.run(self.actor.network.copy_to(self.actor.target_network))
        self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))
        self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
        self.merged = tf.summary.merge_all()

    def select_action(self, observation, current_step):
        action = self.actor.predict(observation, self.actor.prediction)
        if current_step <= self.EXPLORATION_STEPS:
            noise = self.noise()
        else:
            noise = 0
        return action + noise

    def noise(self):
        # Ornstein-Uhlenbeck-style exploration noise
        x = self.noise_state
        dx = 0.15 * (0 - x) + 0.2 * np.random.randn(len(x))
        self.noise_state = x + dx
        return self.noise_state

    def calcError(self, observation, new_observation, reward, action):
        """
        Calculates the error that determines the usefulness of a memory.
        High errors are better for training.

        Args:
            observation: the old state
            new_observation: the current state
            reward: the reward received
            action: the action that was taken
        Returns:
            error: the difference between prediction and label
        """
        prediction = self.critic.predict(observation, action, self.critic.prediction)
        label = reward + self.REWARD_DISCOUNT * self.critic.predict(
            new_observation, action, self.critic.target_prediction)
        error = abs(label - prediction)
        return error

    def summarize(self, episode, episode_reward, observation, new_observation,
                  reward, done):
        next_action = self.actor.predict(new_observation,
                                         self.actor.target_prediction)
        feed_dict = {
            self.critic.input_pl: new_observation,
            self.critic.actions_pl: next_action,
            self.reward_pl: [[reward]],
            self.done_pl: [[done]]
        }
        label = self.sess.run(self.labels, feed_dict=feed_dict)
        feed_dict[self.critic.labels_pl] = label
        # sometimes the reward is an array and sometimes a scalar
        if isinstance(episode_reward, np.ndarray):
            episode_reward = max(episode_reward)
        feed_dict[self.log_reward_pl] = episode_reward
        summary = self.sess.run(self.merged, feed_dict=feed_dict)
        self.writer.add_summary(summary, episode)

    def train_with_batch(self, current_step):
        """
        Run one training step on a sampled batch from the replay memory.

        Args:
            current_step: the current global step, used to decide when to
                print debug information
        """
        observations, actions, rewards, new_observations, dones = self.replay_memory.sample()
        # all of this requires ~3 seconds of computational time
        # improve the Q-network
        next_actions = self.actor.predict(new_observations, self.actor.prediction)
        feed_dict = {
            self.critic.input_pl: new_observations,
            self.critic.actions_pl: next_actions,
            self.reward_pl: rewards,
            self.done_pl: dones
        }
        labels = self.sess.run(self.labels, feed_dict=feed_dict)
        self.critic.train(observations, actions, labels)
        actions = self.actor.predict(observations, self.actor.prediction)
        gradients = self.critic.get_gradients(observations, actions)
        # improve the policy with the calculated gradients
        self.actor.train(observations, gradients)
        # update both target networks (requires ~1 second of time)
        self.sess.run(self.actor_target_update)
        self.sess.run(self.critic_target_update)
        # print debug information if verbose
        if current_step % 500 == 0 and self.VERBOSE:
            print("Observations: ", observations)
            print("Predicted Best-Actions: ", actions)
            print("Labels: ", labels)
            print("Gradients: ", gradients)
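# Sketch only (not from the original source): a minimal episode loop showing how
# the Agent methods above could be wired together. Only select_action, calcError,
# train_with_batch, summarize and the replay_memory attribute come from the class;
# the replay_memory.add signature, the observation reshaping and the loop limits
# are assumptions. numpy is assumed to be imported as np, as elsewhere in this file.
def run_episodes(agent, env, num_episodes=100, max_steps=1000):
    current_step = 0
    for episode in range(num_episodes):
        observation = env.reset()[np.newaxis, :]
        episode_reward = 0
        for _ in range(max_steps):
            action = agent.select_action(observation, current_step)
            new_observation, reward, done, _ = env.step(action[0])
            new_observation = new_observation[np.newaxis, :]
            episode_reward += reward
            # prioritise memories by their TD error (assumed add() signature)
            error = agent.calcError(observation, new_observation, reward, action)
            agent.replay_memory.add(observation, action, reward,
                                    new_observation, done, error)
            agent.train_with_batch(current_step)
            observation = new_observation
            current_step += 1
            if done:
                agent.summarize(episode, episode_reward, observation,
                                new_observation, reward, done)
                break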
class DDPGAgent:
    def __init__(self, state_size=28, action_size=2, gamma=0.9,
                 learning_rate_actor=0.0001, learning_rate_critic=0.01,
                 tau=0.001, action_max=[1000, 2], batch_size=32):
        self.state_size = state_size
        self.action_size = action_size
        self.action_max = action_max
        self.batch_size = batch_size
        self.memory = deque(maxlen=5000)
        self.gamma = gamma  # discount rate
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.tau = tau  # target transfer factor
        self.gpu_options = tf.GPUOptions()
        self.config = tf.ConfigProto(gpu_options=self.gpu_options)
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.config)
        K.set_session(self.sess)
        self.actor = Actor(state_size=self.state_size,
                           action_size=self.action_size,
                           learning_rate=self.learning_rate_actor,
                           tau=self.tau,
                           sess=self.sess,
                           batch_size=self.batch_size,
                           action_max=self.action_max)
        self.critic = Critic(state_size=self.state_size,
                             action_size=self.action_size,
                             learning_rate=self.learning_rate_critic,
                             gamma=self.gamma,
                             tau=self.tau,
                             sess=self.sess,
                             batch_size=self.batch_size)
        self.grad_avg = 0
        self.grad_a = []
        self.critic_loss_a = []
        #self.critic_2 = Critic_2(self.state_size, self.action_size, self.learning_rate_critic, self.gamma, self.tau, self.sess)

    def policy_action(self, state):
        '''
        Actor predicts a new action
        :param state:
        :return: action
        '''
        return self.actor.predict(state)[0]

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in minibatch])
        actions = np.asarray([e[1] for e in minibatch])
        rewards = np.asarray([e[2] for e in minibatch])
        next_states = np.asarray([e[3] for e in minibatch])
        states = np.asarray(states).reshape(batch_size, self.state_size)
        actions = np.asarray(actions).reshape(batch_size, self.action_size)
        rewards = np.asarray(rewards).reshape(batch_size, 1)
        next_states = np.asarray(next_states).reshape(batch_size, self.state_size)
        tar_pre = self.actor.target_predict(next_states)
        Qvals = self.critic.target_predict(next_states, tar_pre)
        Q_primes = rewards + (self.gamma * Qvals)  # Bellman equation
        self.update_models(states, actions, Q_primes)

    def update_models(self, states, actions, critic_target):
        '''
        Update actor and critic networks from sampled experience
        :param states:
        :param actions:
        :param critic_target:
        :return:
        '''
        loss = self.critic.train_on_batch(states, actions, critic_target)  # train critic
        self.critic_loss_a.append(loss)
        # loss = np.sum(-np.log10(loss), axis=0)
        act = self.actor.predict(states)  # Q-value gradient under current policy
        grads = self.critic.gradients(states, act)  # actor loss
        self.grad_avg += np.sum(np.log10(np.absolute(grads)), axis=0) / self.batch_size
        self.grad_a = np.append(self.grad_a,
                                np.sum(np.absolute(grads), axis=0) / self.batch_size,
                                axis=0)
        # print('grad_a:', self.grad_a)
        self.actor.train_2(states, grads.reshape((-1, self.action_size)))  # train actor
        self.actor.transfer_to_actor_model()  # transfer weights to target networks at rate tau
        self.critic.transfer_to_critic_model()

    def remember(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def save_weights(self, directory, params):
        path_actor = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_actor)
        path_critic = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_critic)
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load_weigths(path_actor)
        self.critic.load_weights(path_critic)

    def load_model(self, path_actor, path_critic):
        self.actor.model.load_model(path_actor)
        self.critic.model.load_model(path_critic)
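# Sketch only (not from the original source): a minimal interaction loop showing
# how DDPGAgent's public methods could fit together. The environment is assumed
# to be a gym-style env with Box observation and action spaces; the reshaping,
# episode limits and warm-up condition are assumptions.
def run_ddpg(env, episodes=500, max_steps=200):
    agent = DDPGAgent(state_size=env.observation_space.shape[0],
                      action_size=env.action_space.shape[0])
    for episode in range(episodes):
        state = env.reset().reshape(1, agent.state_size)
        episode_reward = 0
        for _ in range(max_steps):
            action = agent.policy_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state.reshape(1, agent.state_size)
            agent.remember(state, action, reward, next_state)
            # only train once enough transitions have been collected
            if len(agent.memory) >= agent.batch_size:
                agent.replay(agent.batch_size)
            state = next_state
            episode_reward += reward
            if done:
                break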
def train():
    env = gym.make('LunarLander-v2')
    state = env.reset()
    actor = Actor(env.action_space, env.observation_space)
    critic = Critic(env.action_space, env.observation_space)
    actor.load()
    critic.load()
    replayMemory = ReplayMemory()
    summary_ops, summary_vars = build_summaries()
    writer = tf.summary.FileWriter("./log", tf.Session().graph)
    episode_reward = 0
    step = 1
    while True:
        #env.render()
        state1 = state[np.newaxis, :]
        action, action_matrix, prob = actor.predict(state1)
        next_state, reward, done, info = env.step(action)
        replayMemory.add(state, action_matrix, reward, done, next_state, prob)
        state = next_state
        episode_reward += reward
        # train
        if replayMemory.size() % 128 == 0 or done:
            state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniAll()
            reward_b = reward_b[:, np.newaxis]
            c_pre = critic.predict(next_state_b)
            # bootstrapped value target with a discount factor of 0.7
            state_pre_value = reward_b + c_pre * 0.7
            state_value = critic.predict(state_b)
            count = 5000 // step
            if count > 500:
                count = 500
            if count < 1:
                count = 1
            # the schedule above is overridden with a fixed count of 10
            count = 10
            for _ in range(count):
                critic.train(state_b, state_pre_value)
            for _ in range(count):
                actor.train(state_b, state_value, state_pre_value, action_matrix_b, prob_b)
            replayMemory.clear()
        ########################
        if done:
            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()
            ##print("step = ", step, "episode_reward = ", episode_reward)
            state = env.reset()
            episode_reward = 0
            step += 1
            if step % 25 == 0:
                actor.save()
                critic.save()
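# Sketch only (not from the original source): a simple entry point that ties the
# train() and play() functions above together; the command-line handling is an
# assumption and not part of the original code.
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == "play":
        play()
    else:
        train()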