def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0
    # Running maximum reward, used to normalize clipped rewards; start at 1 so the
    # first samples are only clipped and we never divide by zero.
    maxReward = 1.0

    while environment.getStepNumber() - stepStart < minEpochSteps:
        startTime = lastLogTime = time.time()
        state = None

        while not environment.isGameOver():
            # Choose next action: anneal epsilon linearly from 1.0 to 0.1 over the
            # first 1M steps during training, or use the fixed evaluation epsilon.
            if evalWithEpsilon is None:
                epsilon = max(.1, 1.0 - 0.9 * environment.getStepNumber() / 1e6)
            else:
                epsilon = evalWithEpsilon

            # Epsilon-greedy selection; always act randomly before the first observation.
            if state is None or random.random() < epsilon:
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(state.getScreens(), (1, 84, 84, 4))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)

            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                maxReward = reward if reward > maxReward else maxReward
                clippedReward = min(1, max(-1, reward)) / maxReward
                replayMemory.addSample(
                    replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and \
                        environment.getEpisodeStepNumber() % 4 == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())

            if time.time() - lastLogTime > 60:
                print(' ...frame %d' % environment.getEpisodeFrameNumber())
                lastLogTime = time.time()

            if isTerminal:
                state = None

        episodeTime = time.time() - startTime
        print('%s %d ended with score: %d (%d frames in %fs for %d fps)' %
              ('Episode' if isTraining else 'Eval',
               environment.getGameNumber(), environment.getGameScore(),
               environment.getEpisodeFrameNumber(), episodeTime,
               environment.getEpisodeFrameNumber() / episodeTime))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()

    # Return the average score over the games played this epoch
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)

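# A minimal sketch (not from the original source) of how an epoch runner like runEpoch is
# typically driven: alternate a long epsilon-greedy training epoch with a shorter evaluation
# epoch at a fixed small epsilon. The flag names epochs / train_epoch_steps / eval_epoch_steps
# and the 0.05 evaluation epsilon are assumptions for illustration only.
def main_loop():
    for _ in range(args.epochs):
        avg_train = runEpoch(args.train_epoch_steps)                      # explore and learn
        avg_eval = runEpoch(args.eval_epoch_steps, evalWithEpsilon=0.05)  # near-greedy policy check
        print('epoch done: train avg %.2f, eval avg %.2f' % (avg_train, avg_eval))
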
def run_epoch(min_epoch_steps, eval_with_epsilon=None):
    global train_epsilon
    global train_episodes
    global eval_episodes
    global episode_train_reward_list
    global episode_eval_reward_list

    is_training = True if eval_with_epsilon is None else False
    step_start = environment.get_step_number()
    start_game_number = environment.get_game_number()
    epoch_total_score = 0
    stuck_count = 0
    time_list = []

    while environment.get_step_number() - step_start < min_epoch_steps and not stop:
        state = None
        episode_losses = []
        save_net = False
        start_time = datetime.datetime.now()  # episode timer used for logging below

        while not environment.is_game_over() and not stop:
            # Epsilon selection; during training, decay it multiplicatively down to epsilon_min
            if is_training:
                epsilon = train_epsilon
                if train_epsilon > args.epsilon_min:
                    train_epsilon = train_epsilon * args.epsilon_decay
                    if train_epsilon < args.epsilon_min:
                        train_epsilon = args.epsilon_min
            else:
                epsilon = eval_with_epsilon

            # Epsilon-greedy action selection
            if state is None or random.random() < epsilon:
                action = random.randrange(environment.get_num_actions())
            else:
                action = dqn.inference(state.get_data())

            # Unlike a game, we cannot skip frames: we have to wait for the environment to
            # evolve, but we do not want to waste GPU time, so we run a training sweep
            # (which itself takes time) instead of simply sleeping.
            old_state = state
            for i in range(0, args.history_length * (args.repeat_action + 1)):
                if environment.get_step_number() % args.save_model_freq == 0:
                    save_net = True

                # Make the move
                reward, state, is_terminal = environment.step(action)

                # Train
                if is_training and old_state is not None:
                    if environment.get_step_number() > args.observation_steps:
                        if args.show_gpu_time:
                            start_time_train = datetime.datetime.now()

                        batch = replay_memory.draw_batch(args.batch_size)
                        loss = dqn.train(batch, environment.get_step_number())
                        episode_losses.append(loss)

                        if args.show_gpu_time:
                            training_time = (datetime.datetime.now() - start_time_train).total_seconds()
                            time_list.insert(0, training_time)
                            if len(time_list) > 100:
                                time_list = time_list[:-1]
                            print("Training time: %fs, Avg time: %fs" % (training_time, np.mean(time_list)))

                        if args.slowdown_cycle:
                            time.sleep(args.gpu_time)
                    else:
                        # Still observing: just wait for the environment
                        time.sleep(args.gpu_time)
                else:
                    # Not training: just wait for the environment
                    time.sleep(args.gpu_time)

                if is_terminal:
                    break

            # Record experience in replay memory
            if is_training and old_state is not None:
                replay_memory.add_sample(
                    replay.Sample(old_state, action, reward, state, is_terminal))

            if is_terminal:
                state = None

            if args.simulator:
                # Repeated penalty rewards mean the car is probably stuck: reset its position
                if reward == -1:
                    stuck_count = stuck_count + 1
                else:
                    stuck_count = 0
                if stuck_count > 2:
                    print("Car stuck, resetting simulator position...")
                    environment.control.reset_simulator()
                    stuck_count = 0

        if save_net:
            dqn.save_network()

        #################################
        # logging
        #################################
        episode_time = datetime.datetime.now() - start_time

        if is_training:
            train_episodes += 1
            episode_train_reward_list.insert(0, environment.get_game_score())
            if len(episode_train_reward_list) > 100:
                episode_train_reward_list = episode_train_reward_list[:-1]
            avg_rewards = np.mean(episode_train_reward_list)

            episode_avg_loss = 0
            if episode_losses:
                episode_avg_loss = np.mean(episode_losses)

            log = ('Episode %d ended with score: %.2f (%s elapsed) (step: %d). '
                   'Avg score: %.2f Avg loss: %.5f' %
                   (environment.get_game_number(), environment.get_game_score(),
                    str(episode_time), environment.get_step_number(),
                    avg_rewards, episode_avg_loss))
            print(log)
            print(" epsilon " + str(train_epsilon))

            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('train episode reward', environment.get_game_score(), step=train_episodes)
                    tf.summary.scalar('train avg reward(100)', avg_rewards, step=train_episodes)
                    tf.summary.scalar('average loss', episode_avg_loss, step=train_episodes)
                    tf.summary.scalar('epsilon', train_epsilon, step=train_episodes)
                    tf.summary.scalar('steps', environment.get_step_number(), step=train_episodes)
        else:
            eval_episodes += 1
            episode_eval_reward_list.insert(0, environment.get_game_score())
            if len(episode_eval_reward_list) > 100:
                episode_eval_reward_list = episode_eval_reward_list[:-1]
            avg_rewards = np.mean(episode_eval_reward_list)

            log = ('Eval %d ended with score: %.2f (%s elapsed) (step: %d). Avg score: %.2f' %
                   (environment.get_game_number(), environment.get_game_score(),
                    str(episode_time), environment.get_step_number(), avg_rewards))
            print(log)

            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('eval episode reward', environment.get_game_score(), step=eval_episodes)
                    tf.summary.scalar('eval avg reward(100)', avg_rewards, step=eval_episodes)

        epoch_total_score += environment.get_game_score()
        environment.reset_game()

        while pause and not stop:
            time.sleep(1)

    if environment.get_game_number() - start_game_number == 0:
        return 0
    return epoch_total_score / (environment.get_game_number() - start_game_number)

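# A small illustration (an assumption, not part of the original code) of the multiplicative
# schedule run_epoch applies once per action selection:
#     epsilon_t = max(epsilon_min, epsilon_0 * epsilon_decay ** t)
# The helper below computes how many decay steps are needed to reach the floor; with, for
# example, epsilon_0 = 1.0, epsilon_decay = 0.9999 and epsilon_min = 0.1, that is roughly
# 23,000 action selections.
import math

def steps_to_reach_epsilon_min(epsilon_start, epsilon_min, epsilon_decay):
    """Number of decay steps before train_epsilon hits epsilon_min."""
    return math.ceil(math.log(epsilon_min / epsilon_start) / math.log(epsilon_decay))
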
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    global train_epsilon

    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps and not stop:
        startTime = datetime.datetime.now()  # episode timer used for logging below
        state = None

        while not environment.isGameOver() and not stop:
            # Choose next action: the decayed training epsilon or the fixed evaluation epsilon
            if evalWithEpsilon is None:
                epsilon = train_epsilon
            else:
                epsilon = evalWithEpsilon

            # Decay the training epsilon multiplicatively down to epsilon_min
            if train_epsilon > args.epsilon_min:
                train_epsilon = train_epsilon * args.epsilon_decay
                if train_epsilon < args.epsilon_min:
                    train_epsilon = args.epsilon_min

            # Epsilon-greedy selection; always act randomly before the first observation
            if state is None or random.random() < epsilon:
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(
                    state.getScreens(),
                    (1, State.IMAGE_SIZE, State.IMAGE_SIZE, args.frame))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)

            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                clippedReward = min(1, max(-1, reward))
                replayMemory.addSample(
                    replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and \
                        environment.getEpisodeStepNumber() % args.frame == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())

            if isTerminal:
                state = None

        episodeTime = datetime.datetime.now() - startTime
        print('%s %d ended with score: %d (%s elapsed)' %
              ('Episode' if isTraining else 'Eval',
               environment.getGameNumber(), environment.getGameScore(), str(episodeTime)))
        if isTraining:
            print("epsilon " + str(train_epsilon))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()

    # Return the average score over the games played this epoch
    if environment.getGameNumber() - startGameNumber == 0:
        return 0
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)

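# A minimal sketch, under assumptions, of the frame-stacking state object that the runEpoch
# above expects: it keeps the last `numFrames` preprocessed screens and exposes them as an
# (IMAGE_SIZE, IMAGE_SIZE, numFrames) array, i.e. what dqn.inference receives after the
# np.reshape call. The class name, methods, and the 84x84 screen size are hypothetical; the
# actual State class in the repository may differ.
import collections

import numpy as np

class StackedState:
    IMAGE_SIZE = 84  # assumed preprocessed screen size

    def __init__(self, numFrames):
        self.numFrames = numFrames
        self.screens = collections.deque(maxlen=numFrames)

    def addScreen(self, screen):
        # screen: 2-D array of shape (IMAGE_SIZE, IMAGE_SIZE), already grayscaled and resized
        self.screens.append(screen)

    def getScreens(self):
        # Assumes at least one screen has been added; pad with the oldest screen until the
        # stack is full, then stack along the last (channel) axis.
        frames = list(self.screens)
        while len(frames) < self.numFrames:
            frames.insert(0, frames[0])
        return np.stack(frames, axis=-1)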