Example #1
def evaluate_model(agent_opts, model_opts, best_model=True):
    agent = DQAgent(env, **agent_opts)
    agent.build_model(**model_opts)
    if best_model:
      filename = agent_opts.get('BEST_MODEL_FILE')[:-3]
      agent.load_weights(filename)
    else:
      agent.load_weights(f'{root_path}sx_pur_rec')
#    results = agent.evaluate(100, render=False, verbose=False)
#    print(f'Average Reward: {sum(sum(results,[]))/len(results)}')
    results = agent.evaluate(1, render=True, verbose=True)
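For context, a minimal usage sketch of the helper above; the option values, the model_opts contents, and the globals env and root_path are assumptions, not part of the original example:

agent_opts = {'BEST_MODEL_FILE': f'{root_path}best_model.h5'}  # key used above; value is hypothetical
model_opts = {'layers': [64, 64]}                              # hypothetical architecture spec
evaluate_model(agent_opts, model_opts, best_model=True)        # loads the saved weights and renders one episode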
Example #2
def main(environment, file_out, weight_file, action_value, f_duration, watch,
         save):
    use_CNN = False
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]

    action_size = env.action_space.n

    online_dqn = DQAgent(state_size,
                         action_size,
                         loss="huber_loss",
                         action=action_value,
                         use_CNN=use_CNN)
    target_dqn = DQAgent(state_size,
                         action_size,
                         loss="huber_loss",
                         action=action_value,
                         use_CNN=use_CNN)
    online_dqn.model.load_weights(weight_file)
    target_dqn.update_target_weights(online_dqn.model)

    print("Playing {} using weights {} and action {}").format(
        environment, weight_file, action_value)

    epsilon_max = .1
    online_dqn.epsilon = epsilon_max
    done = False

    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cumulative_reward = 0
    global_step = 0

    if save is True:
        images = []
    while not done:
        global_step += 1

        q_values = online_dqn.model.predict(state)[0]

        action = online_dqn.action(q_values, online_dqn.epsilon)

        next_state, reward, done, info = env.step(action)

        next_state = np.reshape(next_state, [1, state_size])
        cumulative_reward += reward

        if watch is True:
            env.render()
        if save is True:
            images.append(env.render(mode="rgb_array"))

        state = next_state

        if done:
            print("Score {}, Total steps {}").format(cumulative_reward,
                                                     global_step)
            break
    if save is True:
        imageio.mimsave(file_out, images, duration=f_duration)
    return 0
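The loop above delegates exploration to online_dqn.action(q_values, online_dqn.epsilon), which is not shown here; the following is only a sketch of what such an epsilon-greedy helper typically looks like (name and signature are assumptions):

import numpy as np

def epsilon_greedy_action(q_values, epsilon):
    # With probability epsilon take a random action, otherwise the greedy one
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))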
Example #3
def main(environment, file_out, weight_file, action_value, f_duration, watch,
         save):
    use_CNN = True
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]

    action_size = env.action_space.n

    # Stack group_size number of atari images
    group_size = 4

    # The following are hard-coded for now, but original image
    # is scaled by preprocessing down to 88, 80, 1 and we combine
    # 4 of them to get a batch of images
    # Note that the "1" argument is the number of copies of environment to train simultaneously
    runner = Runner(environment, 1, group_size)

    online_dqn = DQAgent(state_size,
                         action_size,
                         loss="huber_loss",
                         action=action_value,
                         use_CNN=True)
    target_dqn = DQAgent(state_size,
                         action_size,
                         loss="huber_loss",
                         action=action_value,
                         use_CNN=True)
    online_dqn.model.load_weights(weight_file)
    target_dqn.update_target_weights(online_dqn.model)

    print("Playing {} using weights {} and action {}").format(
        environment, weight_file, action_value)

    epsilon_max = .1
    online_dqn.epsilon = epsilon_max
    done = False

    done_flags = True
    lives = 5

    state = runner.reset_all()
    cumulative_reward = 0
    global_step = 0
    if save is True:
        images = []
    while not done:
        global_step += 1

        q_values = online_dqn.model.predict(state)[0]

        if done_flags is False:
            action = online_dqn.action(q_values, online_dqn.epsilon)
        else:
            random_fire_actions = np.random.randint(1, 3)
            for i in range(random_fire_actions):
                action = 1
                next_state, reward, done, info = runner.step([action])
            state = next_state
            done_flags = False
            continue

        next_state, reward, done, info = runner.step([action])
        if watch is True:
            runner.render()
            sleep(.05)
        if save is True:
            images.append(runner.render(mode="rgb_array"))
        cumulative_reward += reward

        # Losing a life is bad, so say so
        remaining_lives = info[0]["ale.lives"]
        life_lost_flag = bool(lives - remaining_lives)
        lives = remaining_lives

        done_flags = False
        if life_lost_flag or done:
            done_flags = True

        state = next_state

        if done:
            print("Score {}, Total steps {}").format(cumulative_reward,
                                                     global_step)
            break
    if save is True:
        imageio.mimsave(file_out, images, duration=f_duration)
    return 0
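The Runner comment above describes stacking group_size preprocessed frames into a single network input; the Runner class itself is not shown, so here is only a rough sketch of that stacking idea (shapes and padding behaviour are assumptions):

from collections import deque
import numpy as np

def make_frame_stacker(group_size=4):
    frames = deque(maxlen=group_size)

    def push(frame):
        # At the start of an episode the deque is empty, so pad with copies of the first frame
        if not frames:
            frames.extend([frame] * group_size)
        else:
            frames.append(frame)
        # Concatenate along the channel axis: four (88, 80, 1) frames -> (88, 80, 4)
        return np.concatenate(list(frames), axis=-1)

    return push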
Example #4
def main(environment, loss_function, action_value, use_CNN, total_games,
         burn_in, training_interval, target_update_interval, save_interval,
         num_epochs, batch_size, learning_rate, epsilon_max, epsilon_min,
         epsilon_decay_steps, gamma, memory_size, log_interval):
    # Set up logging
    start_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    log_dir, parameter_file, score_file = setup_logs(environment, start_time)

    ################################
    # Save our training parameters #
    line = "loss_function: {}\nactionvalue: {}\ntotal_games: {}\ntraining_interval: {}\ntarget_update_interval: {}\nsave_interval: {}\nnum_epochs: {}\nbatch_size: {}\nlearning_rate: {}\nepsilon_max: {}\nepsilon_min: {}\nepsilon_decay_steps: {}\ngamma: {}\nmemory_size: {}\nlog_interval: {}\n".format(
        loss_function, action_value, total_games, training_interval,
        target_update_interval, save_interval, num_epochs, batch_size,
        learning_rate, epsilon_max, epsilon_min, epsilon_decay_steps, gamma,
        memory_size, log_interval)
    os.write(parameter_file, line)
    ################################

    # Set up our environment
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]

    action_size = env.action_space.n

    # Stack group_size number of atari images
    group_size = 4

    # The following are hard-coded for now, but original image
    # is scaled by preprocessing down to 88, 80, 1 and we combine
    # 4 of them to get a batch of images
    # Note that the "1" argument is the number of copies of environment to train simultaneously
    runner = Runner(environment, 1, group_size)

    # Note that if use_CNN = True, then the state_size is ignored!
    online_dqn = DQAgent(state_size,
                         action_size,
                         loss=loss_function,
                         action=action_value,
                         learning_rate=learning_rate,
                         epsilon=epsilon_max,
                         gamma=gamma,
                         memory_size=memory_size,
                         use_CNN=use_CNN)
    target_dqn = DQAgent(state_size,
                         action_size,
                         loss=loss_function,
                         action=action_value,
                         learning_rate=learning_rate,
                         epsilon=epsilon_max,
                         gamma=gamma,
                         memory_size=memory_size,
                         use_CNN=use_CNN)

    target_dqn.update_target_weights(online_dqn.model)

    # Include a threshold value to stop training
    solved_thresh = 500

    print("Playing {} using loss {} and action {}").format(
        environment, loss_function, action_value)

    done = False
    score_history = deque([], maxlen=log_interval)
    max_score = 0
    global_step = 0
    game_num = 1

    state = runner.reset_all()
    cumulative_reward = 0
    lives = 5
    done_flags = True

    while game_num < total_games:
        # Use target_dqn to make Q-values
        # online_dqn then takes epsilon-greedy action
        global_step += 1

        q_values = online_dqn.model.predict(state)[0]

        # If we lose a life, start with a few FIRE actions
        # to get started again. Random to avoid learning
        # fixed sequence of actions
        if done_flags is False:
            action = online_dqn.action(q_values, online_dqn.epsilon)
        else:
            random_fire_actions = np.random.randint(1, 3)
            for i in range(random_fire_actions):
                action = FIRE_ACTION_NUMBER
                next_state, reward, done, info = runner.step([action])
            state = next_state
            done_flags = False
            continue

        next_state, reward, done, info = runner.step([action])
        cumulative_reward += reward[0]

        # Losing a life is bad, so say so
        remaining_lives = info[0]["ale.lives"]
        life_lost_flag = bool(lives - remaining_lives)
        lives = remaining_lives

        done_flags = False
        if life_lost_flag or done:
            done_flags = True

        # Store the result in memory so we can replay later
        online_dqn.remember(state, action, reward, next_state, done_flags)
        state = next_state

        if done:
            score_history.append(cumulative_reward)

            if cumulative_reward > max_score:
                max_score = cumulative_reward

            if game_num % log_interval == 0:
                os.write(score_file, str(list(score_history)) + '\n')
                print(
                    "Completed game {}/{}, global step {}, last {} games average: {:.3f}, max: {}, min: {}. Best so far {}. Epsilon: {:.3f}"
                    .format(game_num, total_games, global_step, log_interval,
                            np.average(score_history), np.max(score_history),
                            np.min(score_history), max_score,
                            online_dqn.epsilon))

            game_num += 1
            cumulative_reward = 0
            lives = 5
            state = runner.reset_all()

            # Once the recent average score exceeds solved_thresh, we consider the environment solved
            if game_num > 100:
                avg_last_100 = np.average(score_history)

                if avg_last_100 > solved_thresh:
                    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
                    print("Congratulations! {} has been solved after {} games."
                          ).format(environment, game_num)
                    online_dqn.model.save(
                        os.path.join(
                            log_dir,
                            "online_dqn_{}_solved.h5".format(environment)))
                    line = "Training start: {}\nTraining ends:  {}\n".format(
                        start_time, stop_time)
                    os.write(parameter_file, line)
                    os.write(score_file, str(list(score_history)) + '\n')
                    os.close(parameter_file)
                    os.close(score_file)
                    return 0

        # For the first burn_in number of rounds, just populate memory
        if global_step < burn_in:
            continue
        # Once we are past the burn_in exploration period, we start to train
        # This is a linear decay that goes from epsilon_max to epsilon_min in epsilon_decay_steps
        online_dqn.epsilon = max(
            epsilon_max +
            ((global_step - burn_in) / float(epsilon_decay_steps)) *
            (epsilon_min - epsilon_max), epsilon_min)

        if (global_step % training_interval == 0):
            replay_from_memory(online_dqn, target_dqn, batch_size, num_epochs)

        if (global_step % target_update_interval == 0):
            target_dqn.update_target_weights(online_dqn.model)

        if global_step % save_interval == 0:
            online_dqn.model.save(os.path.join(log_dir, "online_dqn" + ".h5"))

    ###############################################################
    # If we're here, then we finished our training without a solution  #
    # Let's save the most recent models and make the plots anyway #
    ###############################################################
    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    online_dqn.model.save(
        os.path.join(log_dir, "online_dqn_" + str(global_step) + ".h5"))

    print("Done! Completed game {}/{}, global_step {}".format(
        game_num, total_games, global_step))
    line = "\n \nTraining start: {}\nTraining ends:  {}\n \n".format(
        start_time, stop_time)
    os.write(parameter_file, line)
    if game_num % log_interval != 0:
        os.write(score_file,
                 str(list(score_history)[:game_num % log_interval]) + '\n')
    os.close(parameter_file)
    os.close(score_file)
    return 0
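A small illustration of the linear epsilon schedule used above, pulled out as a standalone function so it can be checked with example numbers; this is a sketch, not part of the original code:

def linear_epsilon(global_step, burn_in, epsilon_max, epsilon_min, epsilon_decay_steps):
    # Decays linearly from epsilon_max to epsilon_min over epsilon_decay_steps steps after burn_in
    progress = (global_step - burn_in) / float(epsilon_decay_steps)
    return max(epsilon_max + progress * (epsilon_min - epsilon_max), epsilon_min)

# e.g. with epsilon_max=1.0, epsilon_min=0.1, epsilon_decay_steps=100000:
#   halfway through the decay (global_step - burn_in == 50000) epsilon is 0.55,
#   and after the decay window it stays clamped at 0.1.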
Example #5
# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
from breakout_env import Breakout
env = Breakout({})
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.actions,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)

# Initial logging
logger.log({
    'Action space': env.actions,  #env.action_space.n,
    'Observation space': env.obs_base  #env.observation_space.shape
})
logger.log(vars(args))
training_csv = 'training_info.csv'
eval_csv = 'evaluation_info.csv'
test_csv = 'test_score_mean_q_info.csv'
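network_input_shape = (4, 110, 84) above means four stacked grayscale frames in channels-first ('th') order; purely as an illustration, a preprocessing step producing that shape could look roughly like this (the PIL-based resize is an assumption, not the project's actual pipeline):

import numpy as np
from PIL import Image

def preprocess_frame(rgb_frame):
    # Grayscale and downscale a raw Atari frame to 110x84 (rows x cols)
    img = Image.fromarray(rgb_frame).convert('L').resize((84, 110))  # PIL takes (width, height)
    return np.asarray(img, dtype=np.uint8)

def to_network_input(last_four_frames):
    # Stack four preprocessed frames channels-first -> shape (4, 110, 84)
    return np.stack(last_four_frames, axis=0)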
Example #6
                         "buckets": buckets,
                         "input_layer_mult": input_layer_mult,
                         "learning_rate": learning_rate,
                         "epsilon": epsilon,
                         "gamma": gamma,
                         "replay_step_size": replay_step_size,
                         "epsilon_decay": epsilon_decay,
                         "epsilon_min": epsilon_min,
                         "batch_size": batch_size,
                         "memory_size": memory_size,
                         "name": nameDQAgent
                     })

    config = wandb.config

    agent1 = DQAgent(env, config)

    # agent2 = QAgent(env, config)
    # New change

    agent1_run_config = {
        "training_episodes": training_episodes,
        "steps": steps,
        "render": render,
        "early_stop": early_stop,
        "episode_time_limit": episode_time_limit
    }
    Train(agent1, agent1_run_config, goal, min_reward)

    gamma_experiment_config = {
        "experiment_episodes": experiment_episodes,
Example #7
def train_model(agent_opts, model_opts):
    agent = DQAgent(env, **agent_opts)
    agent.build_model(**model_opts)
    agent.train(n_epochs=75, render=False)
    agent.save_weights(f'{root_path}sx_pur_rec')
    agent.show_plots()
    agent.show_plots('loss')
    env.close()
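agent.show_plots() and agent.show_plots('loss') are not defined in this snippet; purely as a sketch of what such a plotting helper might do (matplotlib-based, names and data layout assumed):

import matplotlib.pyplot as plt

def show_plots(history, metric='reward'):
    # history is assumed to be a dict of per-epoch lists, e.g. {'reward': [...], 'loss': [...]}
    plt.figure()
    plt.plot(history[metric])
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.title('Training {}'.format(metric))
    plt.show()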
Example #8
                minibatch_size=args.minibatch_size,
                learning_rate=args.learning_rate,
                discount_factor=args.discount_factor,
                dropout_prob=args.dropout,
                epsilon=args.epsilon,
                epsilon_decrease_rate=args.epsilon_decrease,
                min_epsilon=args.min_epsilon,
                load_path=args.load,
                logger=logger)
else:
  DQA = DQAgent(env.action_space.n,
                network_input_shape,
                replay_memory_size=args.replay_memory_size,
                minibatch_size=args.minibatch_size,
                learning_rate=args.learning_rate,
                discount_factor=args.discount_factor,
                dropout_prob=args.dropout,
                epsilon=args.epsilon,
                epsilon_decrease_rate=args.epsilon_decrease,
                min_epsilon=args.min_epsilon,
                load_path=args.load,
                logger=logger)


# Initial logging
logger.log({
    'Action space': env.action_space.n,
    'Observation space': env.observation_space.shape
})
logger.log(vars(args))
training_csv = 'training_info.csv'
eval_csv = 'evaluation_info.csv'
Example #9
atexit.register(exit_handler)  # Make sure to always save the model when exiting

# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
env = gym.make(args.environment)
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.action_space.n,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)

# Initial logging
logger.log({
    'Action space': env.action_space.n,
    'Observation space': env.observation_space.shape
})
logger.log(vars(args))
training_csv = 'training_info.csv'
eval_csv = 'evaluation_info.csv'
test_csv = 'test_score_mean_q_info.csv'
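atexit.register(exit_handler) above registers a save-on-exit hook, but exit_handler itself is not shown; a rough sketch of what it might do (the DQA save method and file name are assumptions):

import atexit

def exit_handler():
    # Hypothetical: persist the agent's network weights when the process terminates
    try:
        DQA.save('checkpoint_on_exit.h5')  # assumed DQAgent method
    except Exception as exc:
        logger.log({'exit_save_failed': str(exc)})

atexit.register(exit_handler)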
Example #10
def main(environment, loss_function, action_value, use_CNN,
         total_games, max_time_per_game, burn_in,
         training_interval, target_update_interval, save_interval,
         num_epochs, batch_size, learning_rate,
         epsilon_max, epsilon_min, epsilon_decay_steps, gamma, memory_size, log_interval):
    # Set up logging
    start_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    log_dir, parameter_file, score_file = setup_logs(environment, start_time)

    ################################
    # Save our training parameters #
    line = "loss_function: {}\nactionvalue: {}\ntotal_games: {}\nmax_time_per_game: {}\ntraining_interval: {}\ntarget_update_interval: {}\nsave_interval: {}\nnum_epochs: {}\nbatch_size: {}\nlearning_rate: {}\nepsilon_max: {}\nepsilon_min: {}\nepsilon_decay_steps: {}\ngamma: {}\nmemory_size: {}\nlog_interval: {}\n".format(loss_function, action_value, total_games, max_time_per_game, training_interval, target_update_interval,
            save_interval, num_epochs, batch_size, learning_rate, epsilon_max, epsilon_min, epsilon_decay_steps, gamma, memory_size, log_interval)
    os.write(parameter_file, line)
    ################################

    # Set up our environment
    env = gym.make(environment)
    if use_CNN is True:
        state_size = (88, 80, 1)
    else:
        state_size = env.observation_space.shape[0]

    action_size = env.action_space.n

    # The following are hard-coded for now, but original image
    # is scaled by preprocessing down to 88, 80, 1 and we combine
    # 4 of them to get a batch of images
    # Note that the "1" argument is the number of copies of environment to train simultaneously
    #runner = Runner(environment, 1, group_size)

    # Note that if use_CNN = True, then the state_size is ignored!
    online_dqn = DQAgent(state_size, action_size, loss=loss_function, action=action_value, learning_rate=learning_rate,
                         epsilon=epsilon_max, gamma=gamma, memory_size=memory_size, use_CNN=use_CNN)
    target_dqn = DQAgent(state_size, action_size, loss=loss_function, action=action_value, learning_rate=learning_rate,
                         epsilon=epsilon_max, gamma=gamma, memory_size=memory_size, use_CNN=use_CNN)

    target_dqn.update_target_weights(online_dqn.model)

    # Solved criterion for CartPole, LunarLander, etc
    if environment == "CartPole-v0":
        solved_thresh = 195.0
        max_time_per_game = 200
    elif environment == "LunarLander-v2":
        solved_thresh = 200.0
        max_time_per_game = 1000
    else:
        solved_thresh = 500.0
        print("Not sure solution condition for {}; using average of 100 rounds > {}".format(environment, solved_thresh))

    print("Playing {} using loss {} and action {}").format(environment, loss_function, action_value)

    done = False
    score_history = deque([], maxlen=log_interval)
    max_score = 0
    global_step = 0
    game_num = 1

    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cumulative_reward = 0
    done = False

    while game_num < total_games:
        # Use target_dqn to make Q-values
        # online_dqn then takes epsilon-greedy action
        global_step += 1

        q_values = online_dqn.model.predict(state)[0]

        action = online_dqn.action(q_values, online_dqn.epsilon)

        next_state, reward, done, info = env.step(action)

        next_state = np.reshape(next_state, [1, state_size])
        cumulative_reward += reward

        # Store the result in memory so we can replay later
        online_dqn.remember(state, action, reward, next_state, done)
        state = next_state

        if done:
            score_history.append(cumulative_reward)

            if cumulative_reward > max_score:
                max_score = cumulative_reward

            if game_num % log_interval == 0:
                os.write(score_file, str(list(score_history))+'\n')
                print("Completed game {}/{}, global step {}, last {} games average: {:.3f}, max: {}, min: {}. Best so far {}. Epsilon: {:.3f}".format(game_num, total_games, global_step, log_interval, np.average(score_history), np.max(score_history), np.min(score_history), max_score, online_dqn.epsilon))

            game_num += 1
            cumulative_reward = 0
            state = env.reset()
            state = np.reshape(state, [1, state_size])
            # Once the recent average score exceeds solved_thresh (e.g. 195.0 over 100 rounds for CartPole), we consider the environment solved
            if game_num > 100:
                avg_last_100 = np.average(score_history)

                if avg_last_100 > solved_thresh:
                    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
                    print("Congratulations! {} has been solved after {} games.").format(environment, game_num)
                    online_dqn.model.save(os.path.join(log_dir, "{}_online_dqn_solved.h5".format(environment)))
                    line = "Training start: {}\nTraining ends:  {}\n".format(start_time, stop_time)
                    os.write(parameter_file, line)
                    os.write(score_file, str(list(score_history))+'\n')
                    os.close(parameter_file)
                    os.close(score_file)
                    return 0

        # For the first burn_in number of rounds, just populate memory
        if global_step < burn_in:
            continue
        # Once we are past the burn_in exploration period, we start to train
        # This is a linear decay that goes from epsilon_max to epsilon_min in epsilon_decay_steps
        online_dqn.epsilon = max(epsilon_max + ((global_step-burn_in)/float(epsilon_decay_steps))*(epsilon_min-epsilon_max), epsilon_min)

        if (global_step % training_interval == 0):
            replay_from_memory(online_dqn, target_dqn, batch_size, num_epochs)

        if (global_step % target_update_interval == 0):
            target_dqn.update_target_weights(online_dqn.model)

        if global_step % save_interval == 0:
            online_dqn.model.save(os.path.join(log_dir, "online_dqn" + ".h5"))

    ###############################################################
    # If we're here, then we finished our training without a solution  #
    # Let's save the most recent models and make the plots anyway #
    ###############################################################
    stop_time = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    online_dqn.model.save(os.path.join(log_dir, "online_dqn_" + str(global_step) + ".h5"))

    print("Done! Completed game {}/{}, global_step {}".format(game_num, total_games, global_step))
    line = "\n \nTraining start: {}\nTraining ends:  {}\n \n".format(start_time, stop_time)
    os.write(parameter_file, line)
    if game_num % log_interval != 0:
        os.write(score_file, str(list(score_history)[:game_num % log_interval])+'\n')
    os.close(parameter_file)
    os.close(score_file)
    return 0
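replay_from_memory(online_dqn, target_dqn, batch_size, num_epochs) is called in both training examples but not defined here; the sketch below shows the standard DQN replay step it presumably performs (the memory format, gamma attribute, and Keras-style model API are all assumptions):

import random
import numpy as np

def replay_from_memory(online_dqn, target_dqn, batch_size, num_epochs):
    # Sample a minibatch of (state, action, reward, next_state, done) transitions
    batch = random.sample(online_dqn.memory, batch_size)
    states = np.vstack([s for s, a, r, s2, d in batch])
    next_states = np.vstack([s2 for s, a, r, s2, d in batch])

    q_current = online_dqn.model.predict(states)      # Q(s, .) from the online network
    q_next = target_dqn.model.predict(next_states)    # Q(s', .) from the target network

    for i, (s, a, r, s2, done) in enumerate(batch):
        # Bellman target: r for terminal transitions, r + gamma * max_a' Q_target(s', a') otherwise
        q_current[i][a] = r if done else r + online_dqn.gamma * np.max(q_next[i])

    online_dqn.model.fit(states, q_current, epochs=num_epochs, verbose=0)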
Example #11
logger = Logger(debug=args.debug)
logger.log({
    'Action space': ACTIONS,
    'Reward apple': 'snake length' if APPLE_REWARD is None else APPLE_REWARD,
    'Reward death': DEATH_REWARD,
    'Reward life': LIFE_REWARD
})
logger.to_csv('test_data.csv', ['score,episode_length,episode_reward'])
logger.to_csv('train_data.csv', ['score,episode_length,episode_reward'])
logger.to_csv('loss_history.csv', ['loss'])

# Agent
DQA = DQAgent(
    ACTIONS,
    gamma=args.gamma,
    dropout_prob=args.dropout,
    load_path=args.load,
    logger=logger
)
experience_buffer = []  # This will store the SARS tuples at each episode

# Stats
score = 0
episode_length = 0
episode_reward = 0
episode_nb = 0
exp_backup_counter = 0
global_episode_counter = 0  # Keeps track of how many episodes there were between training iterations
must_test = False

# Initialize the game variables
Example #12
    # Initialize environment
    print("Loading environment from", PATH_TO_ENV)
    env = UnityEnvironment(file_name=PATH_TO_ENV)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    
    # get dimensions of action space and state space
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)

    # Init agent
    agent = DQAgent(state_size=state_size, action_size=action_size, hidden_layers=[64, 64], double_ql=True)

    if len(sys.argv) >= 2 and sys.argv[1] == "retrain":
        print("Retraining agent")
        scores = train_agent(agent, env, brain_name, max_score=16.1)
        np.save("train_scores.npy", np.array(scores))

    else: # Run the agent with pretrained weights
        print("Running agent with pretrained weights")

        # load the weights from file
        agent.qnetwork_local.load_state_dict(torch.load('banana_weights.pth'))
        scores = test_agent(agent, env, brain_name)
        np.save("test_scores.npy",np.array(scores))
        # Create Plot of Scores
        plt.figure()
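test_agent(agent, env, brain_name) is not shown in this snippet; the following is only a sketch of such a loop against the legacy UnityEnvironment brain API used above (agent.act and the episode count are assumptions):

def test_agent(agent, env, brain_name, n_episodes=100):
    scores = []
    for _ in range(n_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        score, done = 0.0, False
        while not done:
            action = agent.act(state)              # assumed greedy action from the trained network
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            score += env_info.rewards[0]
            done = env_info.local_done[0]
        scores.append(score)
    return scores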