def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if FLAGS.cpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.save_weight:
      agent.save_weight_to_pkl()
    if FLAGS.load_weight:
      agent.load_weight_from_pkl(cpu_mode=FLAGS.cpu)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
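calc_gpu_fraction() is called here and in several later variants but is not defined in this section. A minimal sketch, assuming FLAGS.gpu_fraction uses the upstream "idx/n" string format (e.g. "1/2"); the helper in any given fork may differ.

# Sketch only: parse an "idx/n" string into a per-process GPU memory fraction.
def calc_gpu_fraction(fraction_string):
  idx, num = fraction_string.split('/')
  idx, num = float(idx), float(num)

  fraction = 1 / (num - idx + 1)
  print(" [*] GPU : %.4f" % fraction)
  return fraction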
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    ACPconfig = ACPConfig(env)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    # These steps are order-dependent and must stay in this exact sequence.
    acpAgent = acp.acp(sess, ACPconfig)
    agentDQN = Agent(config, env, acpAgent, sess)
    acpAgent.setdir(agentDQN.model_dir)

    sess.run(tf.initializers.global_variables())

    # Load both models if any checkpoint exists.
    acpAgent.load()
    agentDQN.load()

    if FLAGS.is_train:
      agentDQN.train()
    else:
      raise Exception('agentDQN.play() is not implemented')
      agentDQN.play()  # unreachable until play() is implemented
def main(_):
  sess_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
  sess_config.gpu_options.allow_growth = True
  # sess = tf.Session(config=sess_config)

  with tf.Session(config=sess_config) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.mode == "train":
      agent.train()
    elif FLAGS.mode == "test":
      agent.play()
    elif FLAGS.mode == "ale":
      agent.play2()
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    # Load the configuration options via get_config() in config.py.
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    # Use the GPU if one is detected; raise an error if use_gpu is set
    # but no GPU is available.
    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      # NHWC input layout: [batch, in_height, in_width, in_channels].
      # The alternative NCHW layout is [batch, in_channels, in_height, in_width].
      config.cnn_format = 'NHWC'

    # Create the DQN agent.
    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()  # No training; just play with the current weights.
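The comments above describe the NHWC and NCHW layouts; as a standalone illustration (not part of the variant above), converting a batch of screens between the two is a single transpose. The array names and shapes below are hypothetical.

import numpy as np

# [batch, channels, height, width] -> [batch, height, width, channels]
batch_nchw = np.zeros((32, 4, 84, 84), dtype=np.float32)
batch_nhwc = np.transpose(batch_nchw, (0, 2, 3, 1))
assert batch_nhwc.shape == (32, 84, 84, 4)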
def main(_):
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    K.set_session(sess)
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    # Create a single instance of Agent to be shared by multiple threads.
    agent = Agent(config, env, sess, threading.Lock())

    if FLAGS.is_train:
      init_threads(agent, config)
    else:
      agent.play(env)
def main(_):
  with tf.Session() as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    config.cnn_format = 'NHWC'

    agent = MyAgent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
def main(_):
  if FLAGS.gpu_fraction == "1/1":
    FLAGS.gpu_fraction = "0.999/1.0"

  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    # Set the GPU usage ratio explicitly, otherwise TensorFlow may report an error.
    # Alternative: let GPU memory grow on demand:
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # with tf.Session(config=config) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if FLAGS.poison:
      # NOTE: input() returns a string; cast downstream if an int is expected.
      config.poison_line = input("Enter the poisoned line number: ")

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      if FLAGS.poison:
        agent.train_poison()
      else:
        agent.train()
    else:
      if FLAGS.poison:
        agent.play_poison()
      else:
        agent.play()
def main(_):
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--seed', help='RNG seed', type=int, default=123)
  parser.add_argument('--test', action="store_true")
  parser.add_argument("--use-gpu", action="store_true")
  parser.add_argument("--mode", help="Bonus mode", default="pixelcnn")
  args = parser.parse_args()

  sess_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
  sess_config.gpu_options.allow_growth = True

  with tf.Session(config=sess_config) as sess:
    config = get_config(args)

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and args.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    # Pick the agent implementation that matches the requested bonus mode.
    if args.mode == "pixelcnn":
      from dqn.agent import Agent
      agent = Agent(config, env, sess)
    elif args.mode == "autoencoder":
      from dqn.agent_model import Agent
      agent = Agent(config, env, sess)
    elif args.mode == "top-pixelcnn":
      from dqn.agent_top import Agent
      agent = Agent(config, env, sess)
    else:
      raise ValueError("No such mode")

    print("CNN format", config.cnn_format)

    if not args.test:
      print("training ...")
      agent.train()
    else:
      print("testing ...")
      agent.play()
def main(_):
  # Set the fraction of GPU memory that each process is allowed to use.
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
def main(_):
  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                        allow_soft_placement=True,
                                        log_device_placement=True)) as sess:
    # with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    # Pin the agent's graph to the third GPU; allow_soft_placement above
    # lets TensorFlow fall back if that device is unavailable.
    with tf.device('/gpu:2'):
      agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
def main(_):
  gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list='0')

  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      agent.train()
    else:
      agent.play()
def main(_):
  with tf.Session() as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    # Set up the two-player Pong ROM and a pygame window for display.
    roms = 'roms/Pong2PlayerVS.bin'
    ale = ALEInterface(roms.encode('utf-8'))
    width = ale.ale_getScreenWidth()
    height = ale.ale_getScreenHeight()
    game_screen = GameScreen()
    ale.ale_resetGame()

    (display_width, display_height) = (width * 2, height * 2)
    pygame.init()
    screen_ale = pygame.display.set_mode((display_width, display_height))
    pygame.display.set_caption("Arcade Learning Environment Random Agent Display")
    pygame.display.flip()
    game_surface = pygame.Surface((width, height), depth=8)
    clock = pygame.time.Clock()

    # Clear screen
    screen_ale.fill((0, 0, 0))

    agent = Agent(config, env, sess, 'A')
    agent2 = Agent2(config, env, sess, 'B')

    if FLAGS.is_train:
      start_epoch = agent.epoch_op.eval()
      start_step = agent.step_op.eval()
      start_time = time.time()

      # Loop over epochs
      for agent.epoch in range(start_epoch, agent.max_epoch):
        agent2.epoch = agent.epoch

        # Initialize gameplay statistics
        num_game, agent.update_count, agent2.update_count, ep_rewardA, ep_rewardB = 0, 0, 0, 0., 0.
        total_rewardA, total_rewardB, agent.total_loss, agent2.total_loss, agent.total_q, agent2.total_q = 0., 0., 0., 0., 0., 0.
        max_avg_ep_rewardA, max_avg_ep_rewardB = 0, 0
        ep_rewardsA, ep_rewardsB, actionsA, actionsB = [], [], [], []

        # Grab the first frame of gameplay
        numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
        rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
        del numpy_surface
        game_screen.paint(rgb)
        pooled_screen = game_screen.grab()
        scaled_pooled_screen = scale_image(pooled_screen)

        # Seed both agents' frame histories with the first frame
        for _ in range(agent.history_length):
          agent.history.add(scaled_pooled_screen)
          agent2.history.add(scaled_pooled_screen)

        # Loop over training iterations
        for agent.step in tqdm(range(start_step, agent.max_step), ncols=70, initial=start_step):
          agent2.step = agent.step

          # End of the burn-in period: start learning from frames
          if agent.step == agent.learn_start:
            num_game, agent.update_count, agent2.update_count, ep_rewardA, ep_rewardB = 0, 0, 0, 0., 0.
            total_rewardA, total_rewardB, agent.total_loss, agent2.total_loss, agent.total_q, agent2.total_q = 0., 0., 0., 0., 0., 0.
            max_avg_ep_rewardA, max_avg_ep_rewardB = 0, 0
            ep_rewardsA, ep_rewardsB, actionsA, actionsB = [], [], [], []

          # 1. predict
          action1 = agent.predict(agent.history.get())
          action2 = agent2.predict(agent2.history.get())

          # 2. act
          ale.ale_act2(action1, action2)
          terminal = ale.ale_isGameOver()

          # At the last step of the epoch, force a terminal state so game
          # statistics can be collected without corrupting the training data.
          if agent.step == agent.max_step - 1:
            terminal = True

          rewardA = ale.ale_getRewardA()
          rewardB = ale.ale_getRewardB()

          # Fill the game-screen buffer with the current frame
          numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
          rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
          del numpy_surface
          game_screen.paint(rgb)
          pooled_screen = game_screen.grab()
          scaled_pooled_screen = scale_image(pooled_screen)

          agent.observe(scaled_pooled_screen, rewardA, action1, terminal)
          agent2.observe(scaled_pooled_screen, rewardB, action2, terminal)

          # Draw the frame onto the display and refresh it
          screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))
          pygame.display.flip()

          # Check whether the current episode ended
          if terminal:
            ale.ale_resetGame()
            terminal = ale.ale_isGameOver()
            rewardA = ale.ale_getRewardA()
            rewardB = ale.ale_getRewardB()

            numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
            rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
            del numpy_surface
            game_screen.paint(rgb)
            pooled_screen = game_screen.grab()
            scaled_pooled_screen = scale_image(pooled_screen)

            # End of an episode
            num_game += 1
            ep_rewardsA.append(ep_rewardA)
            ep_rewardsB.append(ep_rewardB)
            ep_rewardA = 0.
            ep_rewardB = 0.
          else:
            ep_rewardA += rewardA
            ep_rewardB += rewardB

          actionsA.append(action1)
          actionsB.append(action2)
          total_rewardA += rewardA
          total_rewardB += rewardB

          # Periodically evaluate the statistics gathered so far
          if agent.step >= agent.learn_start:
            if agent.step % agent.test_step == agent.test_step - 1:
              avg_rewardA = total_rewardA / agent.test_step
              avg_rewardB = total_rewardB / agent2.test_step
              avg_lossA = agent.total_loss / agent.update_count
              avg_lossB = agent2.total_loss / agent2.update_count
              avg_qA = agent.total_q / agent.update_count
              avg_qB = agent2.total_q / agent2.update_count

              try:
                max_ep_rewardA = np.max(ep_rewardsA)
                min_ep_rewardA = np.min(ep_rewardsA)
                avg_ep_rewardA = np.mean(ep_rewardsA)
                max_ep_rewardB = np.max(ep_rewardsB)
                min_ep_rewardB = np.min(ep_rewardsB)
                avg_ep_rewardB = np.mean(ep_rewardsB)
              except:
                max_ep_rewardA, min_ep_rewardA, avg_ep_rewardA, max_ep_rewardB, min_ep_rewardB, avg_ep_rewardB = 0, 0, 0, 0, 0, 0

              print('\nFor Agent A at Epoch %d: avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                  % (agent.epoch, avg_rewardA, avg_lossA, avg_qA, avg_ep_rewardA, max_ep_rewardA, min_ep_rewardA, num_game))
              print('\nFor Agent B at Epoch %d: avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                  % (agent2.epoch, avg_rewardB, avg_lossB, avg_qB, avg_ep_rewardB, max_ep_rewardB, min_ep_rewardB, num_game))

              if max_avg_ep_rewardA * 0.9 <= avg_ep_rewardA:
                agent.step_assign_op.eval({agent.step_input: agent.step + 1})
                agent.save_model(agent.step + 1)
                max_avg_ep_rewardA = max(max_avg_ep_rewardA, avg_ep_rewardA)

              if max_avg_ep_rewardB * 0.9 <= avg_ep_rewardB:
                agent2.step_assign_op.eval({agent2.step_input: agent2.step + 1})
                agent2.save_model(agent2.step + 1)
                max_avg_ep_rewardB = max(max_avg_ep_rewardB, avg_ep_rewardB)

              if agent.step > 180:
                agent.inject_summary({
                    'average.reward': avg_rewardA,
                    'average.loss': avg_lossA,
                    'average.q': avg_qA,
                    'episode.max reward': max_ep_rewardA,
                    'episode.min reward': min_ep_rewardA,
                    'episode.avg reward': avg_ep_rewardA,
                    'episode.num of game': num_game,
                    'episode.rewards': ep_rewardsA,
                    'episode.actions': actionsA,
                    'training.learning_rate': agent.learning_rate_op.eval({agent.learning_rate_step: agent.step}),
                  }, agent.step)

              if agent2.step > 180:
                agent2.inject_summary({
                    'average.reward': avg_rewardB,
                    'average.loss': avg_lossB,
                    'average.q': avg_qB,
                    'episode.max reward': max_ep_rewardB,
                    'episode.min reward': min_ep_rewardB,
                    'episode.avg reward': avg_ep_rewardB,
                    'episode.num of game': num_game,
                    'episode.rewards': ep_rewardsB,
                    'episode.actions': actionsB,
                    'training.learning_rate': agent2.learning_rate_op.eval({agent2.learning_rate_step: agent2.step}),
                  }, agent2.step)

              # Reset statistics
              num_game = 0
              total_rewardA, total_rewardB = 0., 0.
              agent.total_loss, agent2.total_loss = 0., 0.
              agent.total_q, agent2.total_q = 0., 0.
              agent.update_count, agent2.update_count = 0, 0
              ep_rewardA, ep_rewardB = 0., 0.
              ep_rewardsA, ep_rewardsB = [], []
              actionsA, actionsB = [], []

        # Play 10 games at the end of the epoch to gather game statistics
        total_points, paddle_bounce, wall_bounce, serving_time = [], [], [], []
        for _ in range(10):
          cur_total_points, cur_paddle_bounce, cur_wall_bounce, cur_serving_time = 0, 0, 0, 0

          # Restart the game
          ale.ale_resetGame()

          # Grab the first frame of gameplay
          numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
          rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
          del numpy_surface
          game_screen.paint(rgb)
          pooled_screen = game_screen.grab()
          scaled_pooled_screen = scale_image(pooled_screen)

          # Create a separate history for testing and fill its first
          # history_length slots with the initial screen.
          # NOTE: the predictions below still read from agent.history /
          # agent2.history rather than test_history.
          test_history = History(config)
          for _ in range(agent.history_length):
            test_history.add(scaled_pooled_screen)

          while not ale.ale_isGameOver():
            # 1. predict
            action1 = agent.predict(agent.history.get())
            action2 = agent2.predict(agent2.history.get())

            # 2. act
            ale.ale_act2(action1, action2)
            terminal = ale.ale_isGameOver()
            rewardA = ale.ale_getRewardA()
            rewardB = ale.ale_getRewardB()

            # Record game statistics for the current episode
            cur_total_points = ale.ale_getPoints()
            cur_paddle_bounce = ale.ale_getSideBouncing()
            if ale.ale_getWallBouncing():
              cur_wall_bounce += 1
            if ale.ale_getServing():
              cur_serving_time += 1

            # Fill the game-screen buffer with the current frame
            numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
            rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
            del numpy_surface
            game_screen.paint(rgb)
            pooled_screen = game_screen.grab()
            scaled_pooled_screen = scale_image(pooled_screen)

            agent.observe(scaled_pooled_screen, rewardA, action1, terminal)
            agent2.observe(scaled_pooled_screen, rewardB, action2, terminal)

            # Draw the frame onto the display and refresh it
            screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))
            pygame.display.flip()

          # Append the current episode's statistics
          total_points.append(cur_total_points)
          paddle_bounce.append(cur_paddle_bounce / cur_total_points)
          if cur_paddle_bounce == 0:
            wall_bounce.append(cur_wall_bounce / (cur_paddle_bounce + 1))
          else:
            wall_bounce.append(cur_wall_bounce / cur_paddle_bounce)
          serving_time.append(cur_serving_time / cur_total_points)

        # Save the test results gathered for the current epoch
        cur_paddle_op = agent.paddle_op.eval()
        cur_paddle_op[agent.epoch] = sum(paddle_bounce) / len(paddle_bounce)
        agent.paddle_assign_op.eval({agent.paddle_input: cur_paddle_op})

        cur_wall_op = agent.wall_op.eval()
        cur_wall_op[agent.epoch] = sum(wall_bounce) / len(wall_bounce)
        agent.wall_assign_op.eval({agent.wall_input: cur_wall_op})

        cur_serving_op = agent.serving_op.eval()
        cur_serving_op[agent.epoch] = sum(serving_time) / len(serving_time)
        agent.serving_assign_op.eval({agent.serving_input: cur_serving_op})

        agent.save_model(agent.step + 1)
    else:
      agent.play()
      agent2.play()
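History(config) is instantiated above but not defined in this section. A minimal sketch, assuming the upstream DQN-tensorflow ring buffer over the last config.history_length screens; the class in this fork may differ.

import numpy as np

class History:
  def __init__(self, config):
    self.cnn_format = config.cnn_format
    self.history = np.zeros(
        [config.history_length, config.screen_height, config.screen_width],
        dtype=np.float32)

  def add(self, screen):
    # Drop the oldest frame and append the newest one.
    self.history[:-1] = self.history[1:]
    self.history[-1] = screen

  def reset(self):
    self.history *= 0

  def get(self):
    # Return channels-last when the network expects NHWC input.
    if self.cnn_format == 'NHWC':
      return np.transpose(self.history, (1, 2, 0))
    else:
      return self.history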
def main(_):
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--seed', help='RNG seed', type=int, default=123)
  parser.add_argument("--use-gpu", action="store_true")
  parser.add_argument("--mode", help="Bonus mode", default="autoencoder")
  parser.add_argument("--model-dir", help="the path of the model", default="ae_model/model.p")
  parser.add_argument("--img-dir", help="the path to save images", default="imgs/")
  parser.add_argument("--n", help="the number of episodes", type=int, default=10)
  args = parser.parse_args()

  sess_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
  sess_config.gpu_options.allow_growth = True

  with tf.Session(config=sess_config) as sess:
    config = get_config(args)

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and args.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    # Build the density model
    density_model = AutoEncoder("ae", sess, config)
    loadFromFlat(density_model.get_variables(), args.model_dir)

    na = config.n_action
    last_screen, reward, action, terminal = env.new_random_game()
    last_screen42x42 = imresize(last_screen, (42, 42), order=1)
    pi = RandomPolicy(na)

    if not os.path.exists(args.img_dir):
      os.mkdir(args.img_dir)

    # At first, use a random action taker.
    for i in tqdm(range(args.n)):
      ep_steps = 0
      prefix = args.img_dir + "ep%i/" % i
      if not os.path.exists(prefix):
        os.mkdir(prefix)
      prefix = prefix + 'img'

      while True:
        action = pi.action(last_screen)
        screen, reward, terminal = env.act(action)
        screen42x42 = imresize(screen, (42, 42), order=1)

        oh_action = np.zeros(na)
        oh_action[action] = 1

        density_model.memory.add_sample(last_screen42x42, action, terminal)
        ep_steps += 1

        if ep_steps >= 4:
          pscreen42x42 = density_model.predict().reshape(42, 42)
          img = concat2imgs(screen42x42, pscreen42x42)
          saveimg(img, ep_steps, prefix)

        # Update
        last_screen42x42 = screen42x42
        last_screen = screen

        if terminal:
          last_screen, reward, action, terminal = env.new_random_game()
          last_screen42x42 = imresize(last_screen, (42, 42), order=1)
          break
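RandomPolicy is used above without a definition. A minimal sketch, assuming it samples a uniformly random action from the discrete action space and ignores the observation; the actual class in this fork may do more.

import numpy as np

class RandomPolicy:
  def __init__(self, n_action, seed=None):
    self.n_action = n_action
    self.rng = np.random.RandomState(seed)

  def action(self, observation):
    # The observation is ignored; every action is equally likely.
    return self.rng.randint(self.n_action)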
def main(_):
  with tf.Session() as sess:
    config = get_config(FLAGS) or FLAGS

    if config.env_type == 'simple':
      env = SimpleGymEnvironment(config)
    else:
      env = GymEnvironment(config)

    if not tf.test.is_gpu_available() and FLAGS.use_gpu:
      raise Exception("use_gpu flag is true when no GPUs are available")

    if not FLAGS.use_gpu:
      config.cnn_format = 'NHWC'

    # Set up the two-player Pong ROM and a pygame window for display.
    roms = 'roms/Pong2PlayerVS.bin'
    ale = ALEInterface(roms.encode('utf-8'))
    width = ale.ale_getScreenWidth()
    height = ale.ale_getScreenHeight()
    game_screen = GameScreen()
    ale.ale_resetGame()

    (display_width, display_height) = (width * 2, height * 2)
    pygame.init()
    screen_ale = pygame.display.set_mode((display_width, display_height))
    pygame.display.set_caption(
        "Arcade Learning Environment Random Agent Display")
    pygame.display.flip()
    game_surface = pygame.Surface((width, height), depth=8)
    clock = pygame.time.Clock()

    # Clear screen
    screen_ale.fill((0, 0, 0))

    agent = Agent(config, env, sess)

    if FLAGS.is_train:
      start_step = agent.step_op.eval()
      start_time = time.time()

      num_game, agent.update_count, ep_reward = 0, 0, 0.
      total_reward, agent.total_loss, agent.total_q = 0., 0., 0.
      max_avg_ep_reward = 0
      ep_rewards, actions = [], []

      # Grab the first frame of gameplay
      numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
      rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
      del numpy_surface
      game_screen.paint(rgb)
      pooled_screen = game_screen.grab()
      scaled_pooled_screen = scale_image(pooled_screen)

      for _ in range(agent.history_length):
        agent.history.add(scaled_pooled_screen)

      for agent.step in tqdm(range(start_step, agent.max_step), ncols=70, initial=start_step):
        if agent.step == agent.learn_start:
          num_game, agent.update_count, ep_reward = 0, 0, 0.
          total_reward, agent.total_loss, agent.total_q = 0., 0., 0.
          ep_rewards, actions = [], []

        # 1. predict
        action = agent.predict(agent.history.get())

        # 2. act
        # Agent A plays the predicted action; agent B plays a random action
        # from [NOOP, FIRE, RIGHT, LEFT] (player-B action ids 20, 21, 23, 24).
        ale.ale_act2(action, np.random.choice([20, 21, 23, 24]))
        terminal = ale.ale_isGameOver()
        reward = ale.ale_getRewardA()
        # screen, reward, terminal = agent.env.act(action, is_training=True)

        # 3. observe
        # Fill the game-screen buffer with the current frame
        numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
        rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
        del numpy_surface
        game_screen.paint(rgb)
        pooled_screen = game_screen.grab()
        scaled_pooled_screen = scale_image(pooled_screen)

        agent.observe(scaled_pooled_screen, reward, action, terminal)

        # Draw the frame onto the display and refresh it
        screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))
        pygame.display.flip()

        if terminal:
          ale.ale_resetGame()
          terminal = ale.ale_isGameOver()
          reward = ale.ale_getRewardA()

          numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
          rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
          del numpy_surface
          game_screen.paint(rgb)
          pooled_screen = game_screen.grab()
          scaled_pooled_screen = scale_image(pooled_screen)

          num_game += 1
          ep_rewards.append(ep_reward)
          ep_reward = 0.
        else:
          ep_reward += reward

        actions.append(action)
        total_reward += reward

        if agent.step >= agent.learn_start:
          if agent.step % agent.test_step == agent.test_step - 1:
            avg_reward = total_reward / agent.test_step
            avg_loss = agent.total_loss / agent.update_count
            avg_q = agent.total_q / agent.update_count

            try:
              max_ep_reward = np.max(ep_rewards)
              min_ep_reward = np.min(ep_rewards)
              avg_ep_reward = np.mean(ep_rewards)
            except:
              max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

            print('\navg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game))

            if max_avg_ep_reward * 0.9 <= avg_ep_reward:
              agent.step_assign_op.eval({agent.step_input: agent.step + 1})
              agent.save_model(agent.step + 1)
              max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

            if agent.step > 180:
              agent.inject_summary({
                  'average.reward': avg_reward,
                  'average.loss': avg_loss,
                  'average.q': avg_q,
                  'episode.max reward': max_ep_reward,
                  'episode.min reward': min_ep_reward,
                  'episode.avg reward': avg_ep_reward,
                  'episode.num of game': num_game,
                  'episode.rewards': ep_rewards,
                  'episode.actions': actions,
                  'training.learning_rate': agent.learning_rate_op.eval({agent.learning_rate_step: agent.step}),
                }, agent.step)

            # Reset statistics
            num_game = 0
            total_reward = 0.
            agent.total_loss = 0.
            agent.total_q = 0.
            agent.update_count = 0
            ep_reward = 0.
            ep_rewards = []
            actions = []
    else:
      while not ale.ale_isGameOver():
        # Fill the game-screen buffer with the current frame
        numpy_surface = np.frombuffer(game_surface.get_buffer(), dtype=np.uint8)
        rgb = getRgbFromPalette(ale, game_surface, numpy_surface)
        del numpy_surface
        game_screen.paint(rgb)
        pooled_screen = game_screen.grab()
        scaled_pooled_screen = scale_image(pooled_screen)

        # Agent A plays its predicted action; agent B acts randomly.
        ale.ale_act2(agent.predict(pooled_screen), np.random.choice([20, 21, 23, 24]))
        print(ale.ale_getRewardA())

        # Draw the frame onto the display and refresh it
        screen_ale.blit(pygame.transform.scale2x(game_surface), (0, 0))
        pygame.display.flip()

        # Cap the display at 60 fps
        clock.tick(60.)