def evaluate_model(agent_opts, model_opts, best_model=True):
    agent = DQAgent(env, **agent_opts)
    agent.build_model(**model_opts)
    if best_model:
        # Drop the trailing extension (presumably '.h5') from the configured filename
        filename = agent_opts.get('BEST_MODEL_FILE')[:-3]
        agent.load_weights(filename)
    else:
        agent.load_weights(f'{root_path}sx_pur_rec')
    # results = agent.evaluate(100, render=False, verbose=False)
    # print(f'Average Reward: {sum(sum(results,[]))/len(results)}')
    results = agent.evaluate(1, render=True, verbose=True)
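# Usage sketch (illustrative only): evaluate_model expects the same option
# dictionaries used to configure the agent and its model. The exact keys depend
# on this DQAgent implementation; everything below except 'BEST_MODEL_FILE'
# (which the function reads via agent_opts.get) is a hypothetical placeholder.
example_agent_opts = {
    'BEST_MODEL_FILE': f'{root_path}best_model.h5',  # '.h5' is stripped before loading
    # ... other agent hyperparameters would go here ...
}
example_model_opts = {
    # ... layer sizes / activations expected by build_model ...
}
# evaluate_model(example_agent_opts, example_model_opts, best_model=True)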
def main(environment, file_out, weight_file, action_value, f_duration, watch, save):
    use_CNN = False
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    online_dqn = DQAgent(state_size, action_size, loss="huber_loss",
                         action=action_value, use_CNN=use_CNN)
    target_dqn = DQAgent(state_size, action_size, loss="huber_loss",
                         action=action_value, use_CNN=use_CNN)
    online_dqn.model.load_weights(weight_file)
    target_dqn.update_target_weights(online_dqn.model)

    print("Playing {} using weights {} and action {}".format(
        environment, weight_file, action_value))

    epsilon_max = .1
    online_dqn.epsilon = epsilon_max

    done = False
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cumulative_reward = 0
    global_step = 0
    if save is True:
        images = []

    while not done:
        global_step += 1
        q_values = online_dqn.model.predict(state)[0]
        action = online_dqn.action(q_values, online_dqn.epsilon)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        cumulative_reward += reward
        if watch is True:
            env.render()
        if save is True:
            images.append(env.render(mode="rgb_array"))
        state = next_state
        if done:
            print("Score {}, Total steps {}".format(cumulative_reward, global_step))
            break

    if save is True:
        imageio.mimsave(file_out, images, duration=f_duration)
    return 0
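# How main() might be wired to a command line (a sketch only; the original
# project may use click or another CLI layer, and the defaults below are
# illustrative rather than the project's actual values):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Watch or record a trained DQN agent")
    parser.add_argument("--environment", default="CartPole-v0")
    parser.add_argument("--file_out", default="play.gif")
    parser.add_argument("--weight_file", required=True)
    parser.add_argument("--action_value", default="epsilon_greedy")
    parser.add_argument("--f_duration", type=float, default=0.05)
    parser.add_argument("--watch", action="store_true")
    parser.add_argument("--save", action="store_true")
    args = parser.parse_args()

    main(args.environment, args.file_out, args.weight_file, args.action_value,
         args.f_duration, args.watch, args.save)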
def main(environment, file_out, weight_file, action_value, f_duration, watch, save):
    use_CNN = True
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # Stack group_size number of Atari frames per state
    group_size = 4

    # The following are hard-coded for now: the original frame is scaled by
    # preprocessing down to (88, 80, 1), and we stack 4 of them into one state.
    # The "1" argument is the number of environment copies to run simultaneously.
    runner = Runner(environment, 1, group_size)

    online_dqn = DQAgent(state_size, action_size, loss="huber_loss",
                         action=action_value, use_CNN=True)
    target_dqn = DQAgent(state_size, action_size, loss="huber_loss",
                         action=action_value, use_CNN=True)
    online_dqn.model.load_weights(weight_file)
    target_dqn.update_target_weights(online_dqn.model)

    print("Playing {} using weights {} and action {}".format(
        environment, weight_file, action_value))

    epsilon_max = .1
    online_dqn.epsilon = epsilon_max

    done = False
    done_flags = True
    lives = 5
    state = runner.reset_all()
    cumulative_reward = 0
    global_step = 0
    if save is True:
        images = []

    while not done:
        global_step += 1
        q_values = online_dqn.model.predict(state)[0]

        # After losing a life (or at the start of a game), take a few FIRE
        # actions to restart play instead of acting greedily.
        if done_flags is False:
            action = online_dqn.action(q_values, online_dqn.epsilon)
        else:
            random_fire_actions = np.random.randint(1, 3)
            for i in range(random_fire_actions):
                action = 1  # FIRE
                next_state, reward, done, info = runner.step([action])
                state = next_state
            done_flags = False
            continue

        next_state, reward, done, info = runner.step([action])
        if watch is True:
            runner.render()
            sleep(.05)
        if save is True:
            images.append(runner.render(mode="rgb_array"))
        cumulative_reward += reward

        # Losing a life is bad, so flag it like an episode boundary
        remaining_lives = info[0]["ale.lives"]
        life_lost_flag = bool(lives - remaining_lives)
        lives = remaining_lives
        done_flags = False
        if life_lost_flag or done:
            done_flags = True

        state = next_state
        if done:
            print("Score {}, Total steps {}".format(cumulative_reward, global_step))
            break

    if save is True:
        imageio.mimsave(file_out, images, duration=f_duration)
    return 0
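# The Runner class is not shown in this file. A minimal sketch of the frame
# preprocessing it is assumed to perform (crop/downsample the raw 210x160 Atari
# frame to 88x80 grayscale and stack group_size frames); the function names and
# exact crop are illustrative, not the project's actual implementation:
import numpy as np

def preprocess_frame(frame):
    """Downsample a raw (210, 160, 3) Atari frame to (88, 80, 1) grayscale."""
    gray = frame.mean(axis=2)          # RGB -> grayscale
    cropped = gray[25:201:2, ::2]      # skip score area, take every 2nd pixel -> (88, 80)
    return (cropped / 255.0).astype(np.float32)[..., np.newaxis]

def stack_frames(frames):
    """Concatenate group_size preprocessed frames along the channel axis."""
    return np.concatenate(frames, axis=-1)[np.newaxis, ...]  # (1, 88, 80, group_size)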
def main(environment, loss_function, action_value, use_CNN, total_games, burn_in,
         training_interval, target_update_interval, save_interval, num_epochs,
         batch_size, learning_rate, epsilon_max, epsilon_min, epsilon_decay_steps,
         gamma, memory_size, log_interval):

    # Set up logging
    start_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    log_dir, parameter_file, score_file = setup_logs(environment, start_time)

    ################################
    # Save our training parameters #
    line = ("loss_function: {}\naction_value: {}\ntotal_games: {}\n"
            "training_interval: {}\ntarget_update_interval: {}\nsave_interval: {}\n"
            "num_epochs: {}\nbatch_size: {}\nlearning_rate: {}\nepsilon_max: {}\n"
            "epsilon_min: {}\nepsilon_decay_steps: {}\ngamma: {}\nmemory_size: {}\n"
            "log_interval: {}\n").format(
                loss_function, action_value, total_games, training_interval,
                target_update_interval, save_interval, num_epochs, batch_size,
                learning_rate, epsilon_max, epsilon_min, epsilon_decay_steps,
                gamma, memory_size, log_interval)
    # setup_logs returns raw file descriptors, so write bytes via os.write
    os.write(parameter_file, line.encode())
    ################################

    # Set up our environment
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # Stack group_size number of Atari frames per state
    group_size = 4

    # The following are hard-coded for now: the original frame is scaled by
    # preprocessing down to (88, 80, 1) and we stack 4 of them into one state.
    # The "1" argument is the number of environment copies to train simultaneously.
    runner = Runner(environment, 1, group_size)

    # Note that if use_CNN = True, then state_size is ignored!
    online_dqn = DQAgent(state_size, action_size, loss=loss_function,
                         action=action_value, learning_rate=learning_rate,
                         epsilon=epsilon_max, gamma=gamma,
                         memory_size=memory_size, use_CNN=use_CNN)
    target_dqn = DQAgent(state_size, action_size, loss=loss_function,
                         action=action_value, learning_rate=learning_rate,
                         epsilon=epsilon_max, gamma=gamma,
                         memory_size=memory_size, use_CNN=use_CNN)
    target_dqn.update_target_weights(online_dqn.model)

    # Include a threshold value to stop training
    solved_thresh = 500

    print("Playing {} using loss {} and action {}".format(
        environment, loss_function, action_value))

    done = False
    score_history = deque([], maxlen=log_interval)
    max_score = 0
    global_step = 0
    game_num = 1
    state = runner.reset_all()
    cumulative_reward = 0
    lives = 5
    done_flags = True

    while game_num < total_games:
        # online_dqn computes Q-values, then takes an epsilon-greedy action
        global_step += 1
        q_values = online_dqn.model.predict(state)[0]

        # If we lose a life, start with a few FIRE actions to get going again.
        # The number is random to avoid learning a fixed sequence of actions.
        if done_flags is False:
            action = online_dqn.action(q_values, online_dqn.epsilon)
        else:
            random_fire_actions = np.random.randint(1, 3)
            for i in range(random_fire_actions):
                action = FIRE_ACTION_NUMBER
                next_state, reward, done, info = runner.step([action])
                state = next_state
            done_flags = False
            continue

        next_state, reward, done, info = runner.step([action])
        cumulative_reward += reward[0]

        # Losing a life is bad, so flag it like an episode boundary
        remaining_lives = info[0]["ale.lives"]
        life_lost_flag = bool(lives - remaining_lives)
        lives = remaining_lives
        done_flags = False
        if life_lost_flag or done:
            done_flags = True

        # Store the result in memory so we can replay it later
        online_dqn.remember(state, action, reward, next_state, done_flags)
        state = next_state

        if done:
            score_history.append(cumulative_reward)
            if cumulative_reward > max_score:
                max_score = cumulative_reward
            if game_num % log_interval == 0:
                os.write(score_file, (str(list(score_history)) + '\n').encode())
                print("Completed game {}/{}, global step {}, last {} games average: {:.3f}, "
                      "max: {}, min: {}. Best so far {}. Epsilon: {:.3f}".format(
                          game_num, total_games, global_step, log_interval,
                          np.average(score_history), np.max(score_history),
                          np.min(score_history), max_score, online_dqn.epsilon))
            game_num += 1
            cumulative_reward = 0
            lives = 5
            state = runner.reset_all()

            # Once more than 100 games have been played, check whether the recent
            # average clears the solved threshold.
            if game_num > 100:
                avg_last_100 = np.average(score_history)
                if avg_last_100 > solved_thresh:
                    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
                    print("Congratulations! {} has been solved after {} games.".format(
                        environment, game_num))
                    online_dqn.model.save(os.path.join(
                        log_dir, "online_dqn_{}_solved.h5".format(environment)))
                    line = "Training start: {}\nTraining ends: {}\n".format(
                        start_time, stop_time)
                    os.write(parameter_file, line.encode())
                    os.write(score_file, (str(list(score_history)) + '\n').encode())
                    os.close(parameter_file)
                    os.close(score_file)
                    return 0

        # For the first burn_in number of steps, just populate memory
        if global_step < burn_in:
            continue

        # Once past the burn-in exploration period, we start to train.
        # Epsilon decays linearly from epsilon_max to epsilon_min over
        # epsilon_decay_steps steps.
        online_dqn.epsilon = max(
            epsilon_max + ((global_step - burn_in) / float(epsilon_decay_steps)) *
            (epsilon_min - epsilon_max),
            epsilon_min)

        if global_step % training_interval == 0:
            replay_from_memory(online_dqn, target_dqn, batch_size, num_epochs)

        if global_step % target_update_interval == 0:
            target_dqn.update_target_weights(online_dqn.model)

        if global_step % save_interval == 0:
            online_dqn.model.save(os.path.join(log_dir, "online_dqn" + ".h5"))

    ##################################################################
    # If we're here, then we finished training without a solution.  #
    # Save the most recent model and log the timing anyway.         #
    ##################################################################
    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    online_dqn.model.save(os.path.join(log_dir, "online_dqn_" + str(global_step) + ".h5"))
    print("Done! Completed game {}/{}, global_step {}".format(
        game_num, total_games, global_step))
    line = "\n \nTraining start: {}\nTraining ends: {}\n \n".format(start_time, stop_time)
    os.write(parameter_file, line.encode())
    if game_num % log_interval != 0:
        os.write(score_file,
                 (str(list(score_history)[:game_num % log_interval]) + '\n').encode())
    os.close(parameter_file)
    os.close(score_file)
    return 0
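# replay_from_memory() is called above but not defined in this file. Below is a
# minimal sketch of a standard DQN replay step, assuming DQAgent keeps
# transitions in a deque named `memory` and exposes `gamma` and a Keras `model`
# (all assumptions about this particular implementation, not its actual code):
import random
import numpy as np

def replay_from_memory(online_dqn, target_dqn, batch_size, num_epochs):
    if len(online_dqn.memory) < batch_size:
        return
    minibatch = random.sample(online_dqn.memory, batch_size)

    states = np.concatenate([s for s, a, r, s2, d in minibatch])
    next_states = np.concatenate([s2 for s, a, r, s2, d in minibatch])

    # Bootstrap targets from the (frozen) target network
    q_current = online_dqn.model.predict(states)
    q_next = target_dqn.model.predict(next_states)

    for i, (state, action, reward, next_state, done) in enumerate(minibatch):
        target = reward if done else reward + online_dqn.gamma * np.max(q_next[i])
        q_current[i][action] = target

    # Fit the online network toward the bootstrapped targets
    online_dqn.model.fit(states, q_current, epochs=num_epochs, verbose=0)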
# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
from breakout_env import Breakout
env = Breakout({})
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.actions,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)

# Initial logging
logger.log({
    'Action space': env.actions,        # env.action_space.n
    'Observation space': env.obs_base   # env.observation_space.shape
})
logger.log(vars(args))
training_csv = 'training_info.csv'
eval_csv = 'evaluation_info.csv'
test_csv = 'test_score_mean_q_info.csv'
"buckets": buckets, "input_layer_mult": input_layer_mult, "learning_rate": learning_rate, "epsilon": epsilon, "gamma": gamma, "replay_step_size": replay_step_size, "epsilon_decay": epsilon_decay, "epsilon_min": epsilon_min, "batch_size": batch_size, "memory_size": memory_size, "name": nameDQAgent }) config = wandb.config agent1 = DQAgent(env, config) # agent2 = QAgent(env, config) # New change agent1_run_config = { "training_episodes": training_episodes, "steps": steps, "render": render, "early_stop": early_stop, "episode_time_limit": episode_time_limit } Train(agent1, agent1_run_config, goal, min_reward) gamma_experiment_config = { "experiment_episodes": experiment_episodes,
def train_model(agent_opts, model_opts):
    agent = DQAgent(env, **agent_opts)
    agent.build_model(**model_opts)
    agent.train(n_epochs=75, render=False)
    agent.save_weights(f'{root_path}sx_pur_rec')
    agent.show_plots()
    agent.show_plots('loss')
    env.close()
                  minibatch_size=args.minibatch_size,
                  learning_rate=args.learning_rate,
                  discount_factor=args.discount_factor,
                  dropout_prob=args.dropout,
                  epsilon=args.epsilon,
                  epsilon_decrease_rate=args.epsilon_decrease,
                  min_epsilon=args.min_epsilon,
                  load_path=args.load,
                  logger=logger)
else:
    DQA = DQAgent(env.action_space.n,
                  network_input_shape,
                  replay_memory_size=args.replay_memory_size,
                  minibatch_size=args.minibatch_size,
                  learning_rate=args.learning_rate,
                  discount_factor=args.discount_factor,
                  dropout_prob=args.dropout,
                  epsilon=args.epsilon,
                  epsilon_decrease_rate=args.epsilon_decrease,
                  min_epsilon=args.min_epsilon,
                  load_path=args.load,
                  logger=logger)

# Initial logging
logger.log({
    'Action space': env.action_space.n,
    'Observation space': env.observation_space.shape
})
logger.log(vars(args))
training_csv = 'training_info.csv'
eval_csv = 'evaluation_info.csv'
atexit.register(exit_handler)  # Make sure to always save the model when exiting

# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
env = gym.make(args.environment)
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.action_space.n,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)

# Initial logging
logger.log({
    'Action space': env.action_space.n,
    'Observation space': env.observation_space.shape
})
logger.log(vars(args))
training_csv = 'training_info.csv'
eval_csv = 'evaluation_info.csv'
test_csv = 'test_score_mean_q_info.csv'
def main(environment, loss_function, action_value, use_CNN, total_games,
         max_time_per_game, burn_in, training_interval, target_update_interval,
         save_interval, num_epochs, batch_size, learning_rate, epsilon_max,
         epsilon_min, epsilon_decay_steps, gamma, memory_size, log_interval):

    # Set up logging
    start_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    log_dir, parameter_file, score_file = setup_logs(environment, start_time)

    ################################
    # Save our training parameters #
    line = ("loss_function: {}\naction_value: {}\ntotal_games: {}\n"
            "max_time_per_game: {}\ntraining_interval: {}\ntarget_update_interval: {}\n"
            "save_interval: {}\nnum_epochs: {}\nbatch_size: {}\nlearning_rate: {}\n"
            "epsilon_max: {}\nepsilon_min: {}\nepsilon_decay_steps: {}\ngamma: {}\n"
            "memory_size: {}\nlog_interval: {}\n").format(
                loss_function, action_value, total_games, max_time_per_game,
                training_interval, target_update_interval, save_interval,
                num_epochs, batch_size, learning_rate, epsilon_max, epsilon_min,
                epsilon_decay_steps, gamma, memory_size, log_interval)
    # setup_logs returns raw file descriptors, so write bytes via os.write
    os.write(parameter_file, line.encode())
    ################################

    # Set up our environment
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # The following are hard-coded for now: the original image is scaled by
    # preprocessing down to (88, 80, 1) and we combine 4 of them into one state.
    # The "1" argument is the number of environment copies to train simultaneously.
    # runner = Runner(environment, 1, group_size)

    # Note that if use_CNN = True, then state_size is ignored!
    online_dqn = DQAgent(state_size, action_size, loss=loss_function,
                         action=action_value, learning_rate=learning_rate,
                         epsilon=epsilon_max, gamma=gamma,
                         memory_size=memory_size, use_CNN=use_CNN)
    target_dqn = DQAgent(state_size, action_size, loss=loss_function,
                         action=action_value, learning_rate=learning_rate,
                         epsilon=epsilon_max, gamma=gamma,
                         memory_size=memory_size, use_CNN=use_CNN)
    target_dqn.update_target_weights(online_dqn.model)

    # Solved criteria for CartPole, LunarLander, etc.
    if environment == "CartPole-v0":
        solved_thresh = 195.0
        max_time_per_game = 200
    elif environment == "LunarLander-v2":
        solved_thresh = 200.0
        max_time_per_game = 1000
    else:
        solved_thresh = 500.0
        print("No known solved condition for {}; using average of 100 rounds > {}".format(
            environment, solved_thresh))

    print("Playing {} using loss {} and action {}".format(
        environment, loss_function, action_value))

    done = False
    score_history = deque([], maxlen=log_interval)
    max_score = 0
    global_step = 0
    game_num = 1
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cumulative_reward = 0

    while game_num < total_games:
        # online_dqn computes Q-values, then takes an epsilon-greedy action
        global_step += 1
        q_values = online_dqn.model.predict(state)[0]
        action = online_dqn.action(q_values, online_dqn.epsilon)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        cumulative_reward += reward

        # Store the result in memory so we can replay it later
        online_dqn.remember(state, action, reward, next_state, done)
        state = next_state

        if done:
            score_history.append(cumulative_reward)
            if cumulative_reward > max_score:
                max_score = cumulative_reward
            if game_num % log_interval == 0:
                os.write(score_file, (str(list(score_history)) + '\n').encode())
                print("Completed game {}/{}, global step {}, last {} games average: {:.3f}, "
                      "max: {}, min: {}. Best so far {}. Epsilon: {:.3f}".format(
                          game_num, total_games, global_step, log_interval,
                          np.average(score_history), np.max(score_history),
                          np.min(score_history), max_score, online_dqn.epsilon))
            game_num += 1
            cumulative_reward = 0
            state = env.reset()
            state = np.reshape(state, [1, state_size])

            # Once more than 100 games have been played, check whether the recent
            # average clears the solved threshold (e.g. > 195.0 for CartPole).
            if game_num > 100:
                avg_last_100 = np.average(score_history)
                if avg_last_100 > solved_thresh:
                    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
                    print("Congratulations! {} has been solved after {} games.".format(
                        environment, game_num))
                    online_dqn.model.save(os.path.join(
                        log_dir, "{}_online_dqn_solved.h5".format(environment)))
                    line = "Training start: {}\nTraining ends: {}\n".format(
                        start_time, stop_time)
                    os.write(parameter_file, line.encode())
                    os.write(score_file, (str(list(score_history)) + '\n').encode())
                    os.close(parameter_file)
                    os.close(score_file)
                    return 0

        # For the first burn_in number of steps, just populate memory
        if global_step < burn_in:
            continue

        # Once past the burn-in exploration period, we start to train.
        # Epsilon decays linearly from epsilon_max to epsilon_min over
        # epsilon_decay_steps steps.
        online_dqn.epsilon = max(
            epsilon_max + ((global_step - burn_in) / float(epsilon_decay_steps)) *
            (epsilon_min - epsilon_max),
            epsilon_min)

        if global_step % training_interval == 0:
            replay_from_memory(online_dqn, target_dqn, batch_size, num_epochs)

        if global_step % target_update_interval == 0:
            target_dqn.update_target_weights(online_dqn.model)

        if global_step % save_interval == 0:
            online_dqn.model.save(os.path.join(log_dir, "online_dqn" + ".h5"))

    ##################################################################
    # If we're here, then we finished training without a solution.  #
    # Save the most recent model and log the timing anyway.         #
    ##################################################################
    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    online_dqn.model.save(os.path.join(log_dir, "online_dqn_" + str(global_step) + ".h5"))
    print("Done! Completed game {}/{}, global_step {}".format(
        game_num, total_games, global_step))
    line = "\n \nTraining start: {}\nTraining ends: {}\n \n".format(start_time, stop_time)
    os.write(parameter_file, line.encode())
    if game_num % log_interval != 0:
        os.write(score_file,
                 (str(list(score_history)[:game_num % log_interval]) + '\n').encode())
    os.close(parameter_file)
    os.close(score_file)
    return 0
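# setup_logs() is not shown here. Because the training scripts above pass its
# return values straight to os.write/os.close, it presumably opens raw file
# descriptors with os.open. A minimal sketch under that assumption (directory
# layout and file names are guesses, not the project's actual ones):
import os

def setup_logs(environment, start_time):
    log_dir = os.path.join("logs", "{}_{}".format(environment, start_time))
    os.makedirs(log_dir, exist_ok=True)
    flags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
    parameter_file = os.open(os.path.join(log_dir, "parameters.txt"), flags, 0o644)
    score_file = os.open(os.path.join(log_dir, "scores.txt"), flags, 0o644)
    return log_dir, parameter_file, score_file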
logger = Logger(debug=args.debug)
logger.log({
    'Action space': ACTIONS,
    'Reward apple': 'snake length' if APPLE_REWARD is None else APPLE_REWARD,
    'Reward death': DEATH_REWARD,
    'Reward life': LIFE_REWARD
})
logger.to_csv('test_data.csv', ['score,episode_length,episode_reward'])
logger.to_csv('train_data.csv', ['score,episode_length,episode_reward'])
logger.to_csv('loss_history.csv', ['loss'])

# Agent
DQA = DQAgent(
    ACTIONS,
    gamma=args.gamma,
    dropout_prob=args.dropout,
    load_path=args.load,
    logger=logger
)

experience_buffer = []  # This will store the SARS tuples at each episode

# Stats
score = 0
episode_length = 0
episode_reward = 0
episode_nb = 0
exp_backup_counter = 0
global_episode_counter = 0  # Tracks how many episodes passed between training iterations
must_test = False

# Initialize the game variables
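# The Logger class used throughout these scripts is external. A minimal
# stand-in consistent with how it is called here (Logger(debug=...),
# logger.log(dict), logger.to_csv(filename, rows)); the real implementation
# surely does more, e.g. writing into a timestamped output directory:
import csv
import os

class Logger(object):
    def __init__(self, debug=False, out_dir="output"):
        self.debug = debug
        self.out_dir = out_dir
        os.makedirs(out_dir, exist_ok=True)

    def log(self, data):
        # Print a dict (or any object) of run information
        print(data)

    def to_csv(self, filename, rows):
        # Append a list of comma-separated strings as rows of a CSV file
        with open(os.path.join(self.out_dir, filename), "a", newline="") as f:
            writer = csv.writer(f)
            for row in rows:
                writer.writerow(row.split(","))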
# Initialize environment
print("Loading environment from", PATH_TO_ENV)
env = UnityEnvironment(file_name=PATH_TO_ENV)

# Get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# Get dimensions of action space and state space
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Init agent
agent = DQAgent(state_size=state_size, action_size=action_size,
                hidden_layers=[64, 64], double_ql=True)

if len(sys.argv) >= 2 and sys.argv[1] == "retrain":
    print("Retraining agent")
    scores = train_agent(agent, env, brain_name, max_score=16.1)
    np.save("train_scores.npy", np.array(scores))
else:
    # Run the agent with pretrained weights
    print("Running agent with pretrained weights")
    # Load the weights from file
    agent.qnetwork_local.load_state_dict(torch.load('banana_weights.pth'))
    scores = test_agent(agent, env, brain_name)
    np.save("test_scores.npy", np.array(scores))

# Create plot of scores
plt.figure()
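# test_agent() is not defined in this file. A minimal sketch of an evaluation
# loop for this (older, unityagents-style) API, assuming the agent exposes an
# act(state) method returning an integer action; that method name is an
# assumption about this DQAgent, not confirmed by the code above:
def test_agent(agent, env, brain_name, n_episodes=100):
    scores = []
    for _ in range(n_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        done = False
        while not done:
            action = agent.act(state)                # hypothetical method name
            env_info = env.step(action)[brain_name]  # advance the Unity environment
            state = env_info.vector_observations[0]
            score += env_info.rewards[0]
            done = env_info.local_done[0]
        scores.append(score)
    return scores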