def train(): """Contains the training and evaluation loops""" my_replay_memory = ReplayMemory(size=Trainer.MEMORY_SIZE, batch_size=Trainer.BATCH_SIZE) # (★) eps_sched = EpsScheduler( replay_memory_start_size=Trainer.REPLAY_MEMORY_START_SIZE, max_frames=Trainer.MAX_FRAMES) init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) self.global_frame_index = 0 self.rewards = [] self.loss_list = [] while self.global_frame_index < Trainer.MAX_FRAMES: self.epoch_frame_index = 0 while self.epoch_frame_index < Trainer.EPOCH_FRAME_COUNT: episode_done = self.game.reset(sess) episode_reward_sum = 0 for _ in range(Trainer.MAX_EPISODE_LENGTH): action = networks.main_dqn.get_action( sess, global_frame_index, game.state, evaluation=False) processed_new_frame, reward, terminal, episode_done, _ = game.step( sess, action) global_frame_index += 1 epoch_frame_index += 1 episode_reward_sum += reward # Clip the reward clipped_reward = Trainer.clip_reward(reward) # (7★) Store transition in the replay memory my_replay_memory.add_experience( action=action, frame=processed_new_frame[:, :, 0], reward=clipped_reward, terminal=episode_done) if global_frame_index % Trainer.UPDATE_FREQ == 0 and global_frame_index > Trainer.REPLAY_MEMORY_START_SIZE: loss = Trainer.learn( sess, my_replay_memory, networks, Trainer.BATCH_SIZE, gamma=Trainer.DISCOUNT_FACTOR) # (8★) loss_list.append(loss) if global_frame_index % Trainer.NETW_UPDATE_FREQ == 0 and global_frame_index > Trainer.REPLAY_MEMORY_START_SIZE: networks.update_target_network(sess) # (9★) if terminal: terminal = False break rewards.append(episode_reward_sum) # Output the progress: if len(rewards) % 10 == 0: # Scalar summaries for tensorboard if global_frame_index > REPLAY_MEMORY_START_SIZE: summ = sess.run(performance_summaries, feed_dict={ loss_ph: np.mean(loss_list), reward_ph: np.mean(rewards[-100:]) }) SUMM_WRITER.add_summary(summ, global_frame_index) loss_list = [] # Histogramm summaries for tensorboard summ_param = sess.run(param_summaries) SUMM_WRITER.add_summary(summ_param, global_frame_index) print(len(rewards), global_frame_index, np.mean(rewards[-100:])) with open('rewards.dat', 'a') as reward_file: print(len(rewards), global_frame_index, np.mean(rewards[-100:]), file=reward_file) ######################## ###### Evaluation ###### ######################## terminal = True gif = True frames_for_gif = [] eval_rewards = [] evaluate_frame_number = 0 for _ in range(EVAL_STEPS): if terminal: episode_done = game.reset(sess, evaluation=True) episode_reward_sum = 0 terminal = False # Fire (action 1), when a life was lost or the game just started, # so that the agent does not stand around doing nothing. When playing # with other environments, you might want to change this... 
action = 1 if episode_done else \ networks.main_dqn.get_action(sess, global_frame_index, game.state, evaluation=True) processed_new_frame, reward, terminal, episode_done, new_frame = game.step( sess, action) evaluate_frame_number += 1 episode_reward_sum += reward if gif: frames_for_gif.append(new_frame) if terminal: eval_rewards.append(episode_reward_sum) gif = False # Save only the first game of the evaluation as a gif print("Evaluation score:\n", np.mean(eval_rewards)) try: generate_gif(global_frame_index, frames_for_gif, eval_rewards[0], PATH) except IndexError: print("No evaluation game finished") #Save the network parameters saver.save(sess, PATH + '/my_model', global_step=global_frame_index) frames_for_gif = [] # Show the evaluation score in tensorboard summ = sess.run( eval_scope_summary, feed_dict={eval_scope_ph: np.mean(eval_rewards)}) SUMM_WRITER.add_summary(summ, global_frame_index) with open('rewardsEval.dat', 'a') as eval_reward_file: print(global_frame_index, np.mean(eval_rewards), file=eval_reward_file)
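# Trainer.clip_reward is referenced above but its body is not shown in this
# snippet. A minimal sketch of the usual DQN reward clipping (an assumption,
# not necessarily the exact helper used here) maps every reward to {-1, 0, +1}:
import numpy as np

def clip_reward(reward):
    # +1 for any positive reward, -1 for any negative reward, 0 otherwise
    return int(np.sign(reward))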
class QAgent:
    def __init__(self, session):
        self.time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.session = session
        self.action_size = 4
        self.gamma = 0.99
        self.epsilon = INITIAL_EPSILON
        self.batch_size = 32
        self.learning_rate = 0.00001
        self.replay_mem = ReplayMemory(size=1000000,
                                       frame_height=84,
                                       frame_width=84,
                                       agent_history_length=4,
                                       batch_size=32)
        self.tick = 0
        self.episode = 0
        self.total_reward = 0
        self.last_n_game_reward = deque(maxlen=100)

        # create q network
        self.state_input, self.q_values, self.best_action = self.create_model('main')
        # create target q network
        self.state_input_t, self.q_values_t, self.best_action_t = self.create_model('target')

        # ops that copy the main network's variables into the target network
        self.main_dqn_vars = tf.trainable_variables(scope='main')
        self.target_dqn_vars = tf.trainable_variables(scope='target')
        self.update_ops = []
        for i, var in enumerate(self.main_dqn_vars):
            copy_op = self.target_dqn_vars[i].assign(var.value())
            self.update_ops.append(copy_op)

        # create training ops
        self.create_training()

        # init
        self.session.run(tf.global_variables_initializer())
        # self.update_target_network()

        # saver
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print(f'Successfully loaded: {checkpoint.model_checkpoint_path}')
        else:
            print('Could not find old network weights')

    def create_training(self):
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        # Q-value of the action that was actually taken
        self.Q = tf.reduce_sum(
            tf.multiply(self.q_values,
                        tf.one_hot(self.action, self.action_size, dtype=tf.float32)),
            axis=1)

        # loss
        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.losses.huber_loss(labels=self.target_q, predictions=self.Q))

        # train
        with tf.variable_scope("training"):
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

    def train(self):
        states, actions, rewards, new_states, terminal_flags = self.replay_mem.get_minibatch()

        # main dqn: predict the best action for the new states
        arg_q_max = self.session.run(self.best_action,
                                     feed_dict={self.state_input: new_states})
        # target dqn: predict Q(s', a)
        q_vals = self.session.run(self.q_values_t,
                                  feed_dict={self.state_input_t: new_states})
        # double q: evaluate the main network's action choice with the target network
        double_q = q_vals[range(self.batch_size), arg_q_max]
        # Bellman equation
        target_q = rewards + (self.gamma * double_q * (1 - terminal_flags))
        # train the main dqn
        loss, _ = self.session.run(
            [self.loss, self.train_op],
            feed_dict={
                self.state_input: states,
                self.target_q: target_q,
                self.action: actions
            })
        return loss

    def process(self, next_frame, action, reward, done, terminal_life_lost):
        self.replay_mem.add_experience(action=action,
                                       frame=next_frame[:, :, 0],
                                       reward=reward,
                                       terminal=terminal_life_lost)

        if self.tick > OBSERVE and self.tick % TRAIN_FREQ == 0:
            self.train()

        # update target network
        if self.tick > OBSERVE and self.tick % UPDATE_TIME == 0:
            print('update target network')
            self.update_target_network()

        # save the network every 100000 iterations
        if self.tick > 0 and self.tick % 100000 == 0:
            self.saver.save(self.session,
                            f'saved_networks/dqn-{self.time}',
                            global_step=self.tick)

        self.total_reward += reward

        if done:
            self.last_n_game_reward.append(self.total_reward)
            print(f'Episode: {self.episode}, Reward: {self.total_reward}, '
                  f'Avg. Reward: {np.mean(self.last_n_game_reward)}, '
                  f'Epsilon: {self.epsilon}, Step: {self.tick}')
            self.episode += 1
            self.total_reward = 0

        self.tick += 1

    def get_action(self, state, training=True):
        if training:
            # epsilon-greedy exploration
            if random.random() < self.epsilon:
                action = np.random.randint(0, 4)
            else:
                action = self.session.run(self.best_action,
                                          feed_dict={self.state_input: [state]})[0]
            # anneal epsilon linearly once the observation phase is over
            if self.epsilon > FINAL_EPSILON and self.tick > OBSERVE:
                self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
        else:
            action = self.session.run(self.best_action,
                                      feed_dict={self.state_input: [state]})[0]
        return action

    def update_target_network(self):
        for copy_op in self.update_ops:
            self.session.run(copy_op)

    def create_model(self, name='main'):
        with tf.variable_scope(name):
            stateInput = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.float32)
            # Normalizing the input
            inputscaled = stateInput / 255

            # Convolutional layers
            conv1 = tf.layers.conv2d(
                inputs=inputscaled, filters=32, kernel_size=[8, 8], strides=4,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid", activation=tf.nn.relu, use_bias=False)
            conv2 = tf.layers.conv2d(
                inputs=conv1, filters=64, kernel_size=[4, 4], strides=2,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid", activation=tf.nn.relu, use_bias=False)
            conv3 = tf.layers.conv2d(
                inputs=conv2, filters=64, kernel_size=[3, 3], strides=1,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid", activation=tf.nn.relu, use_bias=False)
            conv4 = tf.layers.conv2d(
                inputs=conv3, filters=1024, kernel_size=[7, 7], strides=1,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid", activation=tf.nn.relu, use_bias=False)

            # Splitting into value and advantage stream (dueling architecture)
            valuestream, advantagestream = tf.split(conv4, 2, 3)
            valuestream = tf.layers.flatten(valuestream)
            advantagestream = tf.layers.flatten(advantagestream)
            advantage = tf.layers.dense(
                inputs=advantagestream, units=4,
                kernel_initializer=tf.variance_scaling_initializer(scale=2))
            value = tf.layers.dense(
                inputs=valuestream, units=1,
                kernel_initializer=tf.variance_scaling_initializer(scale=2))

            # Combining value and advantage into Q-values
            q_values = value + tf.subtract(
                advantage, tf.reduce_mean(advantage, axis=1, keepdims=True))
            best_action = tf.argmax(q_values, 1)

        return stateInput, q_values, best_action
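# QAgent.train() above implements the Double DQN target
#     y = r + gamma * Q_target(s', argmax_a Q_main(s', a)) * (1 - terminal).
# A small numpy-only illustration with made-up values (hypothetical batch of 3
# transitions, 2 actions), independent of the TensorFlow graph:
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, -1.0])
terminal_flags = np.array([0.0, 0.0, 1.0])             # third transition ends the episode
q_main_next = np.array([[0.2, 0.7],                    # main net Q(s', .)
                        [0.5, 0.1],
                        [0.3, 0.4]])
q_target_next = np.array([[0.25, 0.6],                 # target net Q(s', .)
                          [0.45, 0.2],
                          [0.1, 0.5]])

best_actions = np.argmax(q_main_next, axis=1)          # action selection: main network
double_q = q_target_next[np.arange(3), best_actions]   # action evaluation: target network
target_q = rewards + gamma * double_q * (1 - terminal_flags)
print(target_q)  # [1.594, 0.4455, -1.0]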
def train():
    environment = AtariEnvironment(env_name=ENV_NAME,
                                   frame_stack_length=FRAME_STACK_LENGTH)
    main_dqn = DQN(num_actions=environment.action_number,
                   frame_height=FRAME_HEIGHT,
                   frame_width=FRAME_WIDTH,
                   frame_stack_length=FRAME_STACK_LENGTH,
                   hidden=HIDDEN,
                   batch_size=BATCH_SIZE,
                   path=PATH_READ,
                   path2=PATH_WRITE)
    target_dqn = DQN(num_actions=environment.action_number,
                     frame_height=FRAME_HEIGHT,
                     frame_width=FRAME_WIDTH,
                     frame_stack_length=FRAME_STACK_LENGTH,
                     hidden=HIDDEN,
                     batch_size=BATCH_SIZE,
                     path=PATH_READ,
                     path2=PATH_WRITE)
    replay_memory = ReplayMemory(size=MEMORY_SIZE,
                                 frame_height=FRAME_HEIGHT,
                                 frame_width=FRAME_WIDTH,
                                 frame_stack_length=FRAME_STACK_LENGTH,
                                 batch_size=BATCH_SIZE)
    action_selector = ActionSelector(
        dqn=main_dqn,
        num_actions=environment.action_number,
        initial_epsilon=EPSILON_INITIAL,
        middle_epsilon=EPSILON_SECOND,
        finish_epsilon=EPSILON_FINAL,
        minimum_replay_size=REPLAY_MEMORY_START_SIZE,
        maximum_replay_size=MEMORY_SIZE,
        final_frame_number=MAX_FRAMES)
    target_dqn_updater = TargetDqnUpdater(main_dqn=main_dqn, target_dqn=target_dqn)

    total_frame_number = 0
    rewards_per_episode = {}
    frames_per_episode = {}
    episode = 0
    average_last_100_frames = 0
    average_last_100_reward = 0
    best_score = 0

    # truncate the score files
    open('scores/best_scores.txt', 'w').close()
    open('scores/averages.txt', 'w').close()

    # main_dqn.load_model(400)
    # target_dqn.load_model(400)

    while total_frame_number < MAX_FRAMES:
        episode += 1
        rewards_per_episode[episode] = 0
        frames_per_episode[episode] = 0
        terminal_life_lost = environment.reset_environment(hard_reset=True)

        while frames_per_episode[episode] < MAX_EPISODE_LENGTH:
            action = 1 if terminal_life_lost else action_selector.act(
                environment.current_state, total_frame_number)
            processed_next_frame, reward, terminal, terminal_life_lost = \
                environment.commit_action(action)
            replay_memory.add_experience(action, processed_next_frame[:, :, 0],
                                         clip(reward), terminal_life_lost)

            if REPLAY_MEMORY_START_SIZE < total_frame_number:
                if total_frame_number % UPDATE_FREQ == 0:
                    states, actions, rewards, next_states, terminals = \
                        replay_memory.sample_minibatch()

                    # calculate the best actions in the next states with the main dqn
                    best_actions = main_dqn.get_best_actions_batch(next_states)

                    # calculate the q-values of these actions in the next states with
                    # the target network; first, one-hot encode the chosen actions
                    ohe_best_actions_next_states = to_categorical(
                        best_actions, num_classes=environment.action_number)
                    ohe_best_actions_current_states = to_categorical(
                        actions, num_classes=environment.action_number)
                    next_states_q_values = target_dqn.predict_batch(
                        next_states, ohe_best_actions_next_states)
                    next_states_best_q_value = np.sum(next_states_q_values, axis=1)

                    # the Bellman update
                    target_q_values = rewards + \
                        (1 - terminals) * DISCOUNT_FACTOR * next_states_best_q_value

                    # gradient descent; the targets are masked with the one-hot actions
                    # so that only the taken action's output contributes to the loss
                    main_dqn.fit_batch(
                        states, ohe_best_actions_current_states,
                        ohe_best_actions_current_states *
                        np.expand_dims(target_q_values, axis=1))

                if total_frame_number % NETW_UPDATE_FREQ == 0:
                    target_dqn_updater.update_target_network()

            total_frame_number += 1
            frames_per_episode[episode] += 1
            rewards_per_episode[episode] += reward

            if terminal:
                break

            if terminal_life_lost:
                # After every lost life, fire randomly so that the agent spawns
                # into a new situation.
                terminal_life_lost = environment.reset_environment(hard_reset=False)

        print("\nEpisode %d ended." % episode)
        print("Reward: %d" % rewards_per_episode[episode])
        print("Frames: %d" % frames_per_episode[episode])
        print("Replay memory size: %d" % replay_memory.get_size())
        print("Current epsilon: %5f\n" % action_selector.eps_debug)

        average_last_100_reward += rewards_per_episode[episode]
        average_last_100_frames += frames_per_episode[episode]

        if best_score < rewards_per_episode[episode]:
            best_score = rewards_per_episode[episode]
            with open("scores/best_scores.txt", 'a') as score_file:
                score_file.write("Episode: " + str(episode) +
                                 " | New best score: " + str(best_score) + "\n")

        if episode % 100 == 0:
            average_last_100_reward /= 100
            average_last_100_frames /= 100
            with open("scores/averages.txt", 'a') as averages_file:
                averages_file.write("\nEpisodes %d - %d results:" %
                                    (episode - 100, episode))
                averages_file.write("\nAverage reward per episode: %.5f" %
                                    average_last_100_reward)
                averages_file.write("\nAverage frames per episode: %.2f\n" %
                                    average_last_100_frames)
            average_last_100_reward = 0
            average_last_100_frames = 0

            if total_frame_number > REPLAY_MEMORY_START_SIZE:
                main_dqn.save_model(episode)
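# TargetDqnUpdater is not shown in this snippet. For a Keras-style DQN wrapper
# like the one used here, a minimal sketch of the hard target-network copy it
# performs could look like the following (assuming, hypothetically, that each
# DQN wrapper exposes its underlying Keras model as a `.model` attribute):
class TargetDqnUpdater:
    def __init__(self, main_dqn, target_dqn):
        self.main_dqn = main_dqn
        self.target_dqn = target_dqn

    def update_target_network(self):
        # Copy all weights from the online (main) network into the target network.
        self.target_dqn.model.set_weights(self.main_dqn.model.get_weights())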