def main(config, max_num_of_steps, max_num_of_episodes, load_model, save_model,
         load_memory, save_memory, log_path):
    """Train the DQN agent, logging to TensorBoard and checkpointing periodically."""
    # `env`, `screen_log` and helpers such as reset_random_env, preprocess_observation,
    # get_epsilon and MemoryItem are expected to be defined at module level.
    agent = DQNAgent(config)
    with agent.graph.as_default():
        if load_model:
            step = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))
            screen_log.info("Start from step {}".format(step))
        else:
            step = 0
        if load_memory:
            agent.load_memory(load_memory)
            n_frames = len(agent.memory)
            screen_log.info("Load memory: {}".format(load_memory))
            screen_log.info("Memory size: {}".format(n_frames))

        # TensorBoard writer; the run directory is named after the current time.
        log_name = '{:02}{:02}{:02}{:02}{:02}'.format(*time.localtime()[1:6])
        summary_writer = tf.summary.FileWriter(
            logdir=os.path.join(log_path, log_name), graph=agent.graph)

        episode = 0
        rewards_per_episode = []
        sum_Qs = 0.0
        sum_losses = 0.0
        try:
            while step < max_num_of_steps and episode < max_num_of_episodes:
                episode += 1
                episode_done = False
                next_observation = reset_random_env()
                next_observation = preprocess_observation(next_observation)
                rewards_per_episode.append(0)

                while not episode_done:
                    observation = next_observation

                    if len(agent.memory) < config['replay_start_size']:
                        # Fill the replay memory with random-policy transitions
                        # before training starts.
                        action = env.action_space.sample()
                        next_observation, reward, episode_done, info = env.step(action)
                        next_observation = preprocess_observation(next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward, episode_done, info))
                        continue

                    state = agent.get_recent_state(observation)
                    Qs = agent.get_Q_values(state)
                    Qs = Qs[0]

                    # epsilon-greedy action selection
                    epsilon = get_epsilon(config, step)
                    if np.random.RandomState().rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)

                    next_observation, reward, episode_done, info = env.step(action)
                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward, episode_done, info))

                    step += 1
                    rewards_per_episode[-1] += reward
                    sum_Qs += Qs[action]

                    # train step
                    loss, loss_summary_str = agent.optimize_Q()
                    summary_writer.add_summary(loss_summary_str, step)
                    sum_losses += loss

                    if step % 1000 == 0:
                        # Periodic progress logging and summaries.
                        ave_loss = sum_losses / step
                        ave_reward = np.mean(rewards_per_episode)
                        ave_Q = sum_Qs / step
                        Q_summary_str, reward_summary_str = agent.evaluate(
                            ave_reward, ave_Q)
                        summary_writer.add_summary(Q_summary_str, step)
                        summary_writer.add_summary(reward_summary_str, step)
                        screen_log.info(
                            'step: {}, ave. loss: {:g}, '
                            'ave. reward: {:g}, ave. Q: {:g}'.format(
                                step, ave_loss, ave_reward, ave_Q))
                    if step % 10000 == 0:
                        agent.save_model(save_model, step)
                    if step % 1000000 == 0:
                        agent.save_memory(save_memory, step)
        except KeyboardInterrupt:
            print("\nUser interrupted training...")
        finally:
            summary_writer.close()
            agent.save_model(save_model, step)
            agent.save_memory(save_memory, step)
            screen_log.info(
                'Finished: the number of steps {}, the number of episodes {}.'.format(
                    step, episode))
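# --- Hypothetical sketch (not part of the original code) ---------------------
# The training loop above anneals exploration via get_epsilon(config, step).
# A common choice is a linear schedule: decay epsilon from an initial to a
# final value over a fixed number of steps, then hold it.  The function name
# and the config keys 'initial_exploration', 'final_exploration' and
# 'final_exploration_frame' below are assumptions; the project's actual
# helper may differ.
def linear_epsilon_schedule(config, step):
    eps_start = config.get('initial_exploration', 1.0)
    eps_end = config.get('final_exploration', 0.1)
    anneal_steps = config.get('final_exploration_frame', 1000000)
    # Fraction of the annealing period completed, clipped to [0, 1].
    fraction = min(float(step) / anneal_steps, 1.0)
    return eps_start + fraction * (eps_end - eps_start)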
def main(config, screen_log, frame_output, max_episodes, load_model):
    """Play Breakout with a trained DQN agent, render it live, and save the frames as a GIF."""
    game_name = 'BreakoutDeterministic-v4'
    env = gym.make(game_name)
    agent = DQNAgent(config)
    with agent.graph.as_default():
        if load_model:
            _ = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))

        rewards_per_episode = []
        play_images = []
        try:
            for episode in range(max_episodes):
                init_frame = env.reset()
                play_images.append(Image.fromarray(init_frame))
                next_observation = preprocess_observation(init_frame)
                env.render()
                episode_done = False
                rewards_per_episode.append(0)

                while not episode_done:
                    # sleep for the duration of the frame so we can see what happens
                    sleep(1. / 30)
                    observation = next_observation

                    if len(agent.memory) < config['agent_history_length']:
                        # Not enough frames in memory to form a state yet; act randomly.
                        action = env.action_space.sample()
                        next_observation, reward, episode_done, info = env.step(action)
                        next_observation = preprocess_observation(next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward, episode_done, info))
                        continue

                    state = agent.get_recent_state(observation)
                    Qs = agent.get_Q_values(state)
                    Qs = Qs[0]

                    # epsilon-greedy action selection
                    if np.random.RandomState().rand() < config['evaluation_exploration']:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)

                    next_observation, reward, episode_done, info = env.step(action)
                    play_images.append(Image.fromarray(next_observation))
                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward, episode_done, info))
                    rewards_per_episode[-1] += reward
                    env.render()

                screen_log.info(
                    'episode: {}, reward: {:g}, ave. reward: {:g}'.format(
                        episode + 1,
                        rewards_per_episode[-1],
                        np.mean(rewards_per_episode)))

            # Save the collected frames as an animated GIF (~30 ms per frame).
            play_images[0].save(
                frame_output,
                save_all=True,
                append_images=play_images[1:],
                duration=30,
            )
        except KeyboardInterrupt:
            print("\nUser interrupted playing...")
        finally:
            env.close()
            screen_log.info(
                'Finished: the best reward {:g}, the ave. reward {:g}.'.format(
                    np.max(rewards_per_episode),
                    np.mean(rewards_per_episode)))
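# --- Hypothetical usage sketch (not part of the original code) ---------------
# One way the play script above might be invoked.  The config values are
# assumptions (standard DQN hyper-parameters); the real DQNAgent likely needs
# additional keys, and 'model/checkpoint' / 'play.gif' are placeholder paths.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    screen_log = logging.getLogger('play')

    config = {
        'agent_history_length': 4,       # frames stacked into one state
        'evaluation_exploration': 0.05,  # epsilon used while playing
    }
    main(
        config,
        screen_log,
        frame_output='play.gif',
        max_episodes=5,
        load_model='model/checkpoint',
    )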