def simple_replay_train(DQN, train_batch):
    """Fit ``DQN`` on a batch of replayed transitions.

    For every transition the network's current prediction for ``state`` is
    taken as the regression target, with the entry of the taken ``action``
    overwritten by the one-step Q-learning target: ``reward`` for terminal
    transitions, otherwise ``reward + dis * max(DQN.predict(next_state))``.

    - DQN : Object exposing ``input_size``, ``output_size``,
      ``predict(state)`` and ``update(x_stack, y_stack)``
    - train_batch : Iterable of ``(state, action, reward, next_state, done)``

    Returns whatever ``DQN.update`` returns.

    The discount factor ``dis`` is read from module scope.
    """
    # Seed both row lists with an empty (0, width) array so that an empty
    # batch still produces correctly shaped inputs for DQN.update.
    state_rows = [np.empty((0, DQN.input_size))]
    target_rows = [np.empty((0, DQN.output_size))]

    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))
        # Snapshot the target: the rows are only stacked after the loop, so a
        # predictor that reuses its output buffer must not alias older rows.
        target_rows.append(np.array(Q))
        state_rows.append(state)

    # Stack once instead of re-copying the growing arrays on every iteration.
    return DQN.update(np.vstack(state_rows), np.vstack(target_rows))
class Agent:
    """
    Class representing a learning agent acting in an environment.
    """

    def __init__(self, buffer_size, batch_size, alpha, gamma, epsilon,
                 epsilon_min, epsilon_decay, lr, game="CartPole-v1",
                 mean_bound=5, reward_bound=495.0, sync_model=1000,
                 save_model=10):
        """
        Constructor of the agent class.
            - game="CartPole-v1" : Name of the game environment
            - mean_bound=5 : Number of last acquired rewards considered for mean reward
            - reward_bound=495.0 : Reward acquired for completing an episode properly
            - sync_model=1000 : Interval for synchronizing model and target model
            - save_model=10 : Interval for saving model

            - buffer_size : Replay buffer size of the DQN model
            - batch_size : Batch size of the DQN model
            - alpha : Learning rate for Q-Learning
            - gamma : Discount factor for Q-Learning
            - epsilon : Threshold for taking a random action
            - epsilon_min : Minimal value allowed for epsilon
            - epsilon_decay : Decay rate for epsilon
            - lr : Learning rate for the DQN model
        """
        # Environment variables
        self.game = game
        self.env = gym.make(self.game)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n

        # Agent variables
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.mean_bound = mean_bound
        self.reward_bound = reward_bound

        # DQN variables
        self.lr = lr
        self.model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model.update(self.model)
        self.sync_model = sync_model
        self.save_model = save_model

        # File paths
        dirname = os.path.dirname(__file__)
        self.path_model = os.path.join(dirname, "../models/dqn.h5")
        self.path_plot = os.path.join(dirname, "../plots/dqn.png")

        # Load model, if it already exists. Loading is best effort, but
        # KeyboardInterrupt / SystemExit must still propagate, hence
        # `Exception` rather than a bare `except`.
        try:
            self.model.load(self.path_model)
            self.target_model.update(self.model)
        except Exception:
            print("Model does not exist! Create new model...")

    def reduce_epsilon(self):
        """
        Reduces the parameter epsilon up to a given minimal value where the
        speed of decay is controlled by some given parameter.
        """
        decayed = self.epsilon * self.epsilon_decay
        # Never let epsilon drop below the configured floor.
        self.epsilon = decayed if decayed >= self.epsilon_min else self.epsilon_min

    def get_action(self, state):
        """
        Returns an action for a given state, based on the current policy.
            - state : Current state of the agent
        """
        # Explore with probability epsilon, otherwise act greedily on the
        # online model's prediction.
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state))

    def train(self, num_episodes, report_interval):
        """
        Trains the DQN model for a given number of episodes. Outputting report
        information is controlled by a given time interval.
            - num_episodes : Number of episodes to train
            - report_interval : Interval for outputting report information of training

        Training returns early (after saving the model) as soon as the mean
        of the last `mean_bound` episode rewards exceeds `reward_bound`.
        """
        step = 0
        total_rewards = []

        for episode in range(1, num_episodes + 1):
            if episode % self.save_model == 0:
                self.model.save(self.path_model)

            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0

            while True:
                step += 1
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = next_state.reshape((1, self.num_states))

                # Penalize agent if pole could not be balanced until end of episode
                # NOTE(review): this compares the single-step reward (not the
                # episode total) against 499.0 -- confirm that this is the
                # intended condition. Left unchanged on purpose.
                if done and reward < 499.0:
                    reward = -100.0

                self.buffer.remember(state, action, reward, next_state, done)
                self.replay()
                self.reduce_epsilon()

                state = next_state
                total_reward += reward

                if step % self.sync_model == 0:
                    self.target_model.update(self.model)

                if done:
                    # Offsets the -100 terminal penalty in the reported total.
                    total_reward += 100.0
                    total_rewards.append(total_reward)
                    mean_reward = np.mean(total_rewards[-self.mean_bound:])

                    if episode % report_interval == 0:
                        # The replay buffer lives in `self.buffer`; there is
                        # no `self.memory` attribute on this class.
                        # Assumes ReplayBuffer implements __len__ -- TODO confirm.
                        print(f"Episode: {episode}/{num_episodes}"
                              f"\tStep: {step}"
                              f"\tMemory Size: {len(self.buffer)}"
                              f"\tEpsilon: {self.epsilon : .3f}"
                              f"\tReward: {total_reward}"
                              f"\tLast {self.mean_bound} Mean: {mean_reward : .2f}")
                        self.plot_rewards(total_rewards)

                    if mean_reward > self.reward_bound:
                        self.model.save(self.path_model)
                        return
                    break

        self.model.save(self.path_model)

    def replay(self):
        """
        Samples training data from the replay buffer and fits the DQN model.
        """
        # The replay buffer is stored as `self.buffer` by the constructor.
        sample_size, states, actions, rewards, next_states, dones = \
            self.buffer.sample()

        q_values = self.model.predict(states)
        next_q_values = self.target_model.predict(next_states)

        for i in range(sample_size):
            action = actions[i]
            if dones[i]:
                q_target = rewards[i]
            else:
                q_target = rewards[i] + self.gamma * np.max(next_q_values[i])
            # Soft Q-learning update: blend old estimate and new target by alpha.
            q_values[i][action] = (1 - self.alpha) * q_values[i][action] \
                + self.alpha * q_target

        self.model.fit(states, q_values)

    def play(self, num_episodes):
        """
        Renders the trained agent for a given number of episodes.
            - num_episodes : Number of episodes to render
        """
        # Act (almost) greedily while rendering.
        self.epsilon = self.epsilon_min

        for episode in range(1, num_episodes + 1):
            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0

            while True:
                self.env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state.reshape((1, self.num_states))
                total_reward += reward

                if done:
                    print(f"Episode: {episode}/{num_episodes}"
                          f"\tTotal Reward: {total_reward : .2f}")
                    break

    def plot_rewards(self, total_rewards):
        """
        Plots the rewards the agent has acquired during training.
            - total_rewards : Rewards the agent has gained per episode

        The plot is written to `self.path_plot`. A linear trend line is
        added once at least two episodes are available.
        """
        episodes = np.arange(len(total_rewards))

        # This method is called repeatedly during training; start from a
        # clean figure so earlier curves are not drawn over again.
        plt.clf()
        plt.plot(episodes, total_rewards, linewidth=0.8)

        # A regression line needs at least two distinct x values.
        if len(total_rewards) >= 2:
            slope, intercept, _, _, _ = linregress(episodes, total_rewards)
            plt.plot(episodes, slope * episodes + intercept,
                     color="red", linestyle="-.")

        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title("DQN-Learning")
        plt.savefig(self.path_plot)
# -abs term代表鼓勵agent不要去移動車子, # 一直維持在中間才能獲得很高的獎賞! # 2. r2得到的是角度的資訊, # 儘量讓棒子跟垂直線的角度愈小愈好(就是讓棒子立正), # 角度愈小獎賞愈高 # 最後扣0.5是讓獎勵區間分布於[-1~1]之間 # ---------------------------------------------------- x, x_dot, theta, theta_dot = s_ r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8 r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5 r = r1 + r2 # Sotre the transition and update the network net.store_path(s, a, r, s_) net.update() # Judge if finish an episode (and decay the epsilon) if finish: print("Episode: %d \t Total reward: %d \t Eps: %f" % (i, total_reward, net.epsilon)) reward_list.append(total_reward) if net.epsilon > 0.01: net.episodeDecay() break # Update the current state as the future state of previous state s = s_ net.save() plt.plot(range(len(reward_list)), reward_list, '-') plt.show()
class Agent:
    """DQN agent that owns the replay memory, the estimator and the
    TensorFlow bookkeeping (episode / step / epsilon variables, summaries,
    checkpoints) and drives training, evaluation and greedy play."""

    def __init__(self, sess, config, environment, evaluation_enviroment):
        """Build the agent's graph pieces and optionally restore a checkpoint.

        - sess : TensorFlow session used for every graph evaluation
        - config : Configuration object; the attributes read here are
          `prm`, `state_shape`, `rep_max_size`, `is_train`,
          `cont_training` and `is_play`
        - environment : Environment used for training and play
        - evaluation_enviroment : Environment used by `evaluate`

        Raises `Exception` when the config selects neither training nor
        playing.
        """
        # Get the session, config, environment, and create a replay memory
        self.sess = sess
        self.config = config
        self.environment = environment
        self.evaluation_enviroment = evaluation_enviroment

        if config.prm:
            self.memory = PrioritizedExperienceReplay(sess, config)
        else:
            self.memory = ReplayMemory(config.state_shape, config.rep_max_size)

        self.init_dirs()
        self.init_cur_epsiode()
        self.init_global_step()
        self.init_epsilon()
        self.init_summaries()

        # Initialize the DQN graph which contains 2 networks: Target and Q
        self.estimator = DQN(sess, config, self.environment.n_actions)

        # To initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)

        if config.is_train and not config.cont_training:
            # Fresh training run: nothing to restore.
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please Set proper mode for training or playing")

    def load(self):
        """Restore the latest checkpoint from `checkpoint_dir`, if any."""
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)
        else:
            # Make the fallback visible instead of silently keeping the
            # freshly initialized variables.
            print("No checkpoint found in {}; using initialized variables".format(
                self.checkpoint_dir))

    def save(self):
        """Write a checkpoint tagged with the current global step."""
        self.saver.save(self.sess, self.checkpoint_dir, self.global_step_tensor)

    def init_dirs(self):
        """Set and create the checkpoint and summary directories."""
        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.config.experiment_dir,
                                           "checkpoints/")
        self.summary_dir = os.path.join(self.config.experiment_dir,
                                        "summaries/")
        for directory in (self.checkpoint_dir, self.summary_dir):
            os.makedirs(directory, exist_ok=True)

    def init_cur_epsiode(self):
        """Create cur episode tensor to totally save the process of the training"""
        with tf.variable_scope('cur_episode'):
            self.cur_episode_tensor = tf.Variable(-1, trainable=False,
                                                  name='cur_epsiode')
            self.cur_epsiode_input = tf.placeholder('int32', None,
                                                    name='cur_episode_input')
            self.cur_episode_assign_op = self.cur_episode_tensor.assign(
                self.cur_epsiode_input)

    def init_global_step(self):
        """Create a global step variable to be a reference to the number of iterations"""
        with tf.variable_scope('step'):
            self.global_step_tensor = tf.Variable(0, trainable=False,
                                                  name='global_step')
            self.global_step_input = tf.placeholder('int32', None,
                                                    name='global_step_input')
            self.global_step_assign_op = self.global_step_tensor.assign(
                self.global_step_input)

    def init_epsilon(self):
        """Create an epsilon variable"""
        with tf.variable_scope('epsilon'):
            self.epsilon_tensor = tf.Variable(self.config.initial_epsilon,
                                              trainable=False, name='epsilon')
            self.epsilon_input = tf.placeholder('float32', None,
                                                name='epsilon_input')
            self.epsilon_assign_op = self.epsilon_tensor.assign(
                self.epsilon_input)

    def init_summaries(self):
        """Create the summary part of the graph"""
        with tf.variable_scope('summary'):
            self.summary_placeholders = {}
            self.summary_ops = {}
            self.scalar_summary_tags = [
                'episode.total_reward',
                'episode.length',
                'evaluation.total_reward',
                'evaluation.length',
                'epsilon',
            ]
            # One placeholder + scalar summary op per tag.
            for tag in self.scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    'float32', None, name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    tag, self.summary_placeholders[tag])

    def init_replay_memory(self):
        """Fill the replay memory by acting with the current policy.

        Stops at the first episode end reached after the configured number
        of initial steps (`prm_init_size` or `replay_memory_init_size`).
        """
        # Populate the replay memory with initial experience
        print("initializing replay memory...")

        state = self.environment.reset()
        for i in itertools.count():
            action = self.take_action(state)
            next_state, reward, done = self.observe_and_save(
                state, self.environment.valid_actions[action])
            if not done:
                state = next_state
                continue
            if self.config.prm:
                init_size = self.config.prm_init_size
            else:
                init_size = self.config.replay_memory_init_size
            if i >= init_size:
                break
            state = self.environment.reset()

        print("finished initializing replay memory")

    def policy_fn(self, fn_type, estimator, n_actions):
        """Function that contain definitions to various number of policy functions and choose between them

        - fn_type : 'epsilon_greedy' or 'greedy'
        - estimator : Object whose `predict` returns Q-values for a batch
        - n_actions : Number of available actions

        Returns the selected policy callable; raises `Exception` for an
        unknown `fn_type`.
        """

        def epsilon_greedy(sess, observation, epsilon):
            # Spread epsilon uniformly, put the remaining mass on the argmax.
            actions = np.ones(n_actions, dtype=float) * epsilon / n_actions
            q_values = estimator.predict(np.expand_dims(observation, 0))[0]
            best_action = np.argmax(q_values)
            actions[best_action] += (1.0 - epsilon)
            return actions

        def greedy(sess, observation):
            # Greedy action according to the "target" predictions.
            q_values = estimator.predict(np.expand_dims(observation, 0),
                                         type="target")[0]
            best_action = np.argmax(q_values)
            return best_action

        if fn_type == 'epsilon_greedy':
            return epsilon_greedy
        elif fn_type == 'greedy':
            return greedy
        else:
            raise Exception("Please Select a proper policy function")

    def take_action(self, state):
        """Take the action based on the policy function"""
        action_probs = self.policy(self.sess, state,
                                   self.epsilon_tensor.eval(self.sess))
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

    def observe_and_save(self, state, action):
        """Function that observe the new state , reward and save it in the memory"""
        next_state, reward, done = self.environment.step(action)
        self.memory.push(state, next_state, action, reward, done)
        return next_state, reward, done

    def update_target_network(self):
        """Update Target network By copying paramter between the two networks in DQN"""
        self.estimator.update_target_network()

    def add_summary(self, summaries_dict, step):
        """Add the summaries to tensorboard

        - summaries_dict : Mapping from summary tag to scalar value
        - step : Step under which the summaries are recorded
        """
        summary_list = self.sess.run(
            [self.summary_ops[tag] for tag in summaries_dict.keys()],
            {self.summary_placeholders[tag]: value
             for tag, value in summaries_dict.items()})
        for summary in summary_list:
            self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def train_episodic(self):
        """Train the agent in episodic techniques"""
        # Initialize the epsilon step, the policy function, the replay memory
        self.epsilon_step = (
            self.config.initial_epsilon - self.config.final_epsilon
        ) / self.config.exploration_steps
        self.policy = self.policy_fn(self.config.policy_fn, self.estimator,
                                     self.environment.n_actions)
        self.init_replay_memory()

        first_episode = self.cur_episode_tensor.eval(self.sess) + 1
        for cur_episode in range(first_episode, self.config.num_episodes, 1):
            # Save the current checkpoint
            self.save()

            # Update the Cur Episode tensor
            self.cur_episode_assign_op.eval(
                session=self.sess,
                feed_dict={
                    self.cur_epsiode_input:
                        self.cur_episode_tensor.eval(self.sess) + 1
                })

            # Evaluate Now to see how it behave
            if cur_episode % self.config.evaluate_every == 0:
                # Integer division: the quotient is exact here and is used
                # as a summary step index, so it must stay an int.
                self.evaluate(cur_episode // self.config.evaluate_every)

            state = self.environment.reset()
            total_reward = 0

            # Take steps in the environment until terminal state of episode
            for t in itertools.count():
                # Update the Global step
                self.global_step_assign_op.eval(
                    session=self.sess,
                    feed_dict={
                        self.global_step_input:
                            self.global_step_tensor.eval(self.sess) + 1
                    })

                # time to update the target estimator
                if (self.global_step_tensor.eval(self.sess)
                        % self.config.update_target_estimator_every == 0):
                    self.update_target_network()

                # Calculate the Epsilon for this time step
                # Take an action ..Then observe and save
                self.epsilon_assign_op.eval(
                    {
                        self.epsilon_input: max(
                            self.config.final_epsilon,
                            self.epsilon_tensor.eval(self.sess)
                            - self.epsilon_step)
                    },
                    self.sess)
                action = self.take_action(state)
                next_state, reward, done = self.observe_and_save(
                    state, self.environment.valid_actions[action])

                # Sample a minibatch from the replay memory
                if self.config.prm:
                    (indices_batch, weights_batch, state_batch,
                     next_state_batch, action_batch, reward_batch,
                     done_batch) = self.memory.sample()
                else:
                    (state_batch, next_state_batch, action_batch,
                     reward_batch, done_batch) = self.memory.get_batch(
                        self.config.batch_size)

                # Calculate targets Then Compute the loss
                q_values_next = self.estimator.predict(next_state_batch,
                                                       type="target")
                # logical_not (not bitwise invert) so that both boolean and
                # 0/1 integer done flags yield a 0/1 "not terminal" mask.
                not_done = np.logical_not(done_batch).astype(np.float32)
                targets_batch = reward_batch + not_done \
                    * self.config.discount_factor \
                    * np.amax(q_values_next, axis=1)

                if self.config.prm:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch, weights_batch)
                else:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch)

                total_reward += reward

                if done:  # IF terminal state so exit the episode
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'episode.total_reward': total_reward,
                        'episode.length': t,
                        'epsilon': self.epsilon_tensor.eval(self.sess),
                    }
                    self.add_summary(summaries_dict,
                                     self.global_step_tensor.eval(self.sess))
                    break

                state = next_state

        print("Training Finished")

    def train_continous(self):
        """Placeholder for step-based (non-episodic) training."""
        # TODO implement on global step only
        pass

    def play(self, n_episode=10):
        """Function that play greedily on the policy learnt

        - n_episode=10 : Number of episodes to play
        """
        # Play Greedily
        self.policy = self.policy_fn('greedy', self.estimator,
                                     self.environment.n_actions)

        for cur_episode in range(n_episode):
            state = self.environment.reset()
            total_reward = 0

            for t in itertools.count():
                best_action = self.policy(self.sess, state)
                next_state, reward, done = self.environment.step(
                    self.environment.valid_actions[best_action])
                total_reward += reward

                if done:
                    print("Total Reward in Epsiode " + str(cur_episode)
                          + " = " + str(total_reward))
                    print("Total Length in Epsiode " + str(cur_episode)
                          + " = " + str(t))
                    break

                state = next_state

    def evaluate(self, local_step):
        """Run greedy evaluation episodes and log their summaries.

        - local_step : Index of this evaluation round; together with the
          episode index it determines the summary step
        """
        print('evaluation #{0}'.format(local_step))

        policy = self.policy_fn('greedy', self.estimator,
                                self.evaluation_enviroment.n_actions)
        n_episodes = self.config.evaluation_episodes

        for cur_episode in range(n_episodes):
            state = self.evaluation_enviroment.reset()
            total_reward = 0

            for t in itertools.count():
                best_action = policy(self.sess, state)
                next_state, reward, done = self.evaluation_enviroment.step(
                    self.evaluation_enviroment.valid_actions[best_action])
                total_reward += reward

                if done:
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'evaluation.total_reward': total_reward,
                        'evaluation.length': t,
                    }
                    # Stride by the configured episode count so that steps
                    # of consecutive evaluation rounds never overlap.
                    self.add_summary(summaries_dict,
                                     local_step * n_episodes + cur_episode)
                    break

                state = next_state

        print('Finished evaluation #{0}'.format(local_step))