import gym
import numpy as np

# Hyperparameters (LEARNING_RATE, DISCOUNT_FACTOR, EPISODES, MAX_STEPS) are
# assumed to be module-level constants.
def main():
    env = gym.make('PongDeterministic-v4')
    network_input_shape = (4, 210, 160)  # Dimension ordering: 'th' (channels first)
    actions = env.action_space.n
    agent = DQN_CNN(actions,
                    network_input_shape,
                    learning_rate=LEARNING_RATE,
                    discount_factor=DISCOUNT_FACTOR)
    frame_counter = 0

    for ep in range(1, EPISODES + 1):
        # Start episode
        score = 0

        # Observe reward and initialize first state
        obs = preprocess_observation(env.reset())

        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        frame_counter += 1

        for t in range(MAX_STEPS):
            # Select an action using the DQA
            action = agent.act(np.asarray([current_state]))

            # Observe reward and next state
            obs, reward, done, info = env.step(action)
            obs = preprocess_observation(obs)
            next_state = get_next_state(current_state, obs)
            frame_counter += 1

            # Store transition in replay memory
            clipped_reward = np.clip(reward, -1, 1)  # Clip the reward to [-1, 1]
            agent.add_experience(np.asarray([current_state]),
                                 action,
                                 clipped_reward,
                                 np.asarray([next_state]),
                                 done)
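# preprocess_observation and get_next_state are used above but not defined in
# this snippet. A minimal sketch of what they plausibly do, assuming grayscale
# conversion at the full 210x160 resolution (matching the (4, 210, 160) input
# shape above) and a rolling stack of the 4 most recent frames; this is an
# illustration, not the project's actual implementation:
def preprocess_observation(obs):
    # Collapse the RGB frame (210, 160, 3) to grayscale (210, 160)
    return np.mean(obs, axis=2).astype(np.uint8)

def get_next_state(current_state, obs):
    # Drop the oldest frame and append the newest one, keeping shape (4, 210, 160)
    return np.append(current_state[1:], [obs], axis=0)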
# Inner training loop: act, store the transition, and periodically train.
from cv2 import imshow, waitKey  # OpenCV display, used for --video rendering

while t < args.max_episode_length:
    # Stop the episode if it takes too long
    if frame_counter > args.max_frames_number:
        DQA.quit()

    # Select an action using the DQA
    action = DQA.get_action(np.asarray([current_state]))

    # Observe reward and next state
    obs, reward, done, info = env.step(action)

    # Render the game
    if args.video:
        imshow('state', obs)
        waitKey(1)

    obs = utils.preprocess_observation(obs)
    next_state = utils.get_next_state(current_state, obs)
    frame_counter += 1
    t += 1

    # Store transition in replay memory
    clipped_reward = np.clip(reward, -1, 1)  # Clip the reward to [-1, 1]
    DQA.add_experience(np.asarray([current_state]),
                       action,
                       clipped_reward,
                       np.asarray([next_state]),
                       done)

    # Train the agent
    if t % args.update_freq == 0 \
            and len(DQA.experiences) >= args.replay_start_size:
        DQA.train()
        # Every C DQN updates, update DQN_target
        if DQA.training_count % args.target_network_update_freq == 0 \
                and DQA.training_count >= args.target_network_update_freq:
            DQA.reset_target_network()

    # Advance to the next state
    current_state = next_state
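# reset_target_network above syncs the target network with the online one.
# A minimal sketch, assuming DQN and DQN_target wrap Keras models (the
# attribute names are illustrative):
def reset_target_network(self):
    # Copy the online network's weights into the frozen target network
    self.DQN_target.model.set_weights(self.DQN.model.get_weights())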
def evaluate(DQA, args, logger, env):
    global max_mean_score
    evaluation_csv = 'evaluation.csv'
    logger.to_csv(evaluation_csv, 'length,score')
    scores = list()
    frame_counter = 0
    episode = 0  # Initialized once, so the episode count survives across episodes
    while frame_counter < args.validation_frames:
        remaining_random_actions = args.initial_random_actions
        obs = utils.preprocess_observation(env.reset())
        frame_counter += 1
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        t = 0
        score = 0
        # Start episode
        while True:
            # Render the game if video output is not suppressed
            if args.video:
                env.render()
            action = DQA.get_action(np.asarray([current_state]),
                                    testing=True,
                                    force_random=remaining_random_actions > 0)
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            current_state = utils.get_next_state(current_state, obs)
            if remaining_random_actions > 0:
                remaining_random_actions -= 1
            score += reward
            t += 1
            frame_counter += 1
            # End episode
            if done or t > args.max_episode_length:
                episode += 1
                print('Episode %d end\n---------------\nFrame counter: %d\n' %
                      (episode, frame_counter))
                print('Length: %d, Score: %.1f\n\n' % (t, score))
                # Save episode data in the evaluation csv
                logger.to_csv(evaluation_csv, [t, score])
                break
        scores.append([t, score])

    scores = np.asarray(scores)
    max_indices = np.argwhere(scores[:, 1] == np.max(scores[:, 1])).ravel()
    max_idx = np.random.choice(max_indices)

    # Save best model (mean is taken over the score column only,
    # not over the [length, score] pairs)
    if max_mean_score < np.mean(scores[:, 1]):
        max_mean_score = np.mean(scores[:, 1])
        DQA.DQN.save(append='_best')

    return scores[max_idx, :].ravel()
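# The testing and force_random flags passed to DQA.get_action above suggest an
# epsilon-greedy policy with a separate (lower) epsilon during evaluation. A
# minimal sketch under that assumption; epsilon, test_epsilon, actions and
# DQN.predict are illustrative names, not the project's actual API:
def get_action(self, state, testing=False, force_random=False):
    epsilon = self.test_epsilon if testing else self.epsilon
    if force_random or np.random.random() < epsilon:
        # Explore: pick a uniformly random action
        return np.random.randint(self.actions)
    # Exploit: pick the action with the highest predicted Q-value
    return np.argmax(self.DQN.predict(state))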
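# DQA.train() in the training loop above performs one DQN update. A minimal
# sketch of such a step, assuming experiences are stored as dicts and that
# DQN/DQN_target expose Keras-style predict/train_on_batch; all names here
# (source, dest, action, reward, final, minibatch_size) are illustrative:
import random

def train(self):
    self.training_count += 1
    batch = random.sample(self.experiences, self.minibatch_size)
    states = np.concatenate([e['source'] for e in batch])
    next_states = np.concatenate([e['dest'] for e in batch])
    # Start from the online network's predictions so that only the taken
    # action's Q-value is pushed toward the Bellman target
    targets = self.DQN.predict(states)
    next_q = self.DQN_target.predict(next_states)
    for i, e in enumerate(batch):
        # r for terminal transitions, r + gamma * max_a' Q_target otherwise
        targets[i, e['action']] = e['reward']
        if not e['final']:
            targets[i, e['action']] += self.discount_factor * np.max(next_q[i])
    self.DQN.train_on_batch(states, targets)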