Example #1
def run_q_agent(policy='ε–greedy', save=False):
    """Runs a q agent according to a policy"""
    agent = Q_Agent()
    all_iterations, all_rewards, step_count = agent.train(env,
                                                          iter_n=1000,
                                                          policy=policy)
    plot_reward(all_iterations, all_rewards)
    plot_steps(all_iterations, step_count)
Example #2
def random_search():
    """Random search to determine starting point for the model and best params"""
    gamma = 0.7
    alpha = 0.3
    epsilon = 1
    exploration_rate_decay = 0.87

    max_tries = 10
    best_score = -1000
    # Start the "best so far" values at the initial guesses so the first perturbation is defined
    best_gamma, best_alpha, best_epsilon, best_decay = gamma, alpha, epsilon, exploration_rate_decay
    scores = {}

    for attempt in range(max_tries):

        agent = Q_Agent(epsilon=1,
                        alpha=alpha,
                        gamma=gamma,
                        exploration_rate_decay=exploration_rate_decay)
        _, rewards, steps = agent.train(env,
                                        iter_n=300,
                                        policy='ε–greedy',
                                        print_results=False)
        scores[attempt] = np.mean(rewards)

        print(
            "Score: {}, gamma: {}, alpha: {}, epsilon: {}, e_decay_rate: {}".format(
                scores[attempt], gamma, alpha, epsilon,
                exploration_rate_decay))

        if scores[attempt] > best_score:
            best_score = scores[attempt]
            print("New best score:", best_score)
            best_gamma = gamma
            best_alpha = alpha
            best_epsilon = epsilon
            best_decay = exploration_rate_decay

        gamma = best_gamma + (np.random.randint(-1, 2) / 10)
        gamma = min(1, gamma)
        gamma = max(0, gamma)
        alpha = best_alpha + (np.random.randint(-1, 2) / 10)
        alpha = min(1, alpha)
        alpha = max(0, alpha)
        epsilon = 1
        exploration_rate_decay = best_decay + np.random.randint(-1, 2) / 100
        exploration_rate_decay = min(0.99, exploration_rate_decay)
        exploration_rate_decay = max(0.7, exploration_rate_decay)

    print("Best validation_accuracy:", best_score)
    print("Best settings:")
    print("best gamma:", best_gamma)
    print("best alpha:", best_alpha)
    print("best epsilon:", best_epsilon)
    print("best decay:", best_decay)
Example #3
def grid_search_param(environment, policy='ε–greedy', parameter='alpha'):
    """Grid search for alpha or gamma adjustable via the parameter field"""

    parameter_values = []
    avg_scores = []
    avg_steps = []

    count = 1

    for param_num in np.linspace(0.2, 1, 9):
        if parameter == 'alpha':
            agent = Q_Agent(alpha=param_num)
        elif parameter == 'gamma':
            agent = Q_Agent(gamma=param_num)

        all_iterations, all_rewards, step_count = agent.train(
            environment, print_results=True, iter_n=1000, policy=policy)
        avg_scores.append(np.mean(all_rewards))
        avg_steps.append(np.mean(step_count))
        parameter_values.append(param_num)
        rewards_data = np.array([all_iterations, all_rewards])
        step_data = np.array([all_iterations, step_count])

        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' +
                   parameter + '_inv/' + parameter + '_rewards_' +
                   str(param_num) + '.csv',
                   rewards_data.transpose(),
                   delimiter=",")
        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' +
                   parameter + '_inv/' + parameter + '_steps_' +
                   str(param_num) + '.csv',
                   step_data.transpose(),
                   delimiter=",")
        print('iteration {} of 9'.format(count))

        count += 1
    results = {
        'param_values': parameter_values,
        'avg_scores': avg_scores,
        'avg_steps': avg_steps,
    }
    print(results)
    return pd.DataFrame(results)
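A sketch of how the returned DataFrame might be plotted afterwards (assuming matplotlib is installed; env stands for whatever environment instance is in scope):

import matplotlib.pyplot as plt

results_df = grid_search_param(env, parameter='alpha')
plt.plot(results_df['param_values'], results_df['avg_scores'], marker='o')
plt.xlabel('alpha')
plt.ylabel('average reward over training')
plt.show()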
Example #4
def grid_search_epsilon(environment, policy='ε–greedy', parameter='epsilon'):
    """Grid search for epsilon values"""
    parameter_values = []
    avg_scores = []
    avg_steps = []

    count = 1
    decay_search = [0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for param_num in decay_search:

        agent = Q_Agent(exploration_rate_decay=param_num, epsilon=1)
        all_iterations, all_rewards, step_count = agent.train(
            environment, print_results=True, iter_n=1000, policy=policy)
        avg_scores.append(np.mean(all_rewards))
        avg_steps.append(np.mean(step_count))
        parameter_values.append(param_num)
        rewards_data = np.array([all_iterations, all_rewards])
        step_data = np.array([all_iterations, step_count])

        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' +
                   parameter + '_inv/' + parameter + '_rewards_' +
                   str(param_num) + '.csv',
                   rewards_data.transpose(),
                   delimiter=",")
        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' +
                   parameter + '_inv/' + parameter + '_steps_' +
                   str(param_num) + '.csv',
                   step_data.transpose(),
                   delimiter=",")
        print('iteration {} of {}'.format(count, len(decay_search)))

        count += 1
    results = {
        'param_values': parameter_values,
        'avg_scores': avg_scores,
        'avg_steps': avg_steps,
    }
    print(results)
    return pd.DataFrame(results)
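The CSVs written above can be read back later for plotting; a sketch of one way to do that (the file path follows the naming pattern built in the function and is only an example of the local layout):

import numpy as np

# Each file holds two columns: iteration number and the per-iteration value
data = np.loadtxt(
    '/Users/matthewgalloway/Documents/RF/q_learning/epsilon_inv/epsilon_rewards_0.9.csv',
    delimiter=',')
iterations, rewards = data[:, 0], data[:, 1]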
from pybricks import ev3brick as brick  # provides brick.sound.beep() used below
from pybricks.tools import print
import utils_motor
import random
import time

# Play a beep sound
brick.sound.beep()
print('Should display in Visual Studio Code')
seedling = int(round(time.time()))
random.seed(seedling)

# Initialize environment
# If we invert the reward during training the robot should change direction
env = CrawlingRobotEnv(step_angle=45, invert_reward=True)
current_state = env.reset()
agent = Q_Agent(env, gamma=0.9, alpha=0.2)

# Execute a hard-coded action sequence (repeat 1, 5, 2, 4; end with 0, 3 to return to neutral)
# 0: LEG NEUTRAL
# 1: LEG UP
# 2: LEG DOWN
# 3: FEET NEUTRAL
# 4: FEET UP
# 5: FEET DOWN
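# The table above as a dict, handy for readable logging (labels copied from the comments; illustrative only)
ACTION_NAMES = {
    0: 'LEG NEUTRAL', 1: 'LEG UP', 2: 'LEG DOWN',
    3: 'FEET NEUTRAL', 4: 'FEET UP', 5: 'FEET DOWN',
}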
print('Distance:', env.read_sensor())
# Backward
#list_actions = [1, 4, 2, 5, 1, 4, 2, 5, 1, 4, 2, 5, 1, 4, 2, 5,1, 4, 2, 5, 1, 4, 2, 5, 0, 3]
# Forward
list_actions = [
    1, 5, 2, 4, 1, 5, 2, 4, 1, 5, 2, 4, 1, 5, 2, 4, 1, 5, 2, 4, 1, 5, 2, 4, 0,
    3
]

# Total number of training iterations (assumed value)
num_iterations_train = 1000
# Bigger values decay faster
e_greedy_decay = 1.0 / num_iterations_train
# Initial agent action probability (just try things at random)
initial_e_greedy_prob = 1.0
# Number of iterations between checks of the reward statistics
num_steps_eval = num_iterations_train // 10
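
# Worked example of the settings above (with the assumed num_iterations_train = 1000):
# e_greedy_decay = 1.0 / 1000 = 0.001, so the exploration probability decays from 1.0
# towards 0 over the run, and num_steps_eval = 1000 // 10 = 100 means the reward
# statistics are checked every 100 steps.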

if __name__ == '__main__':
    # Initialize environment
    env = CrawlingRobotEnv(invert_reward=True,
                           run_on_lego=running_on_lego,
                           step_angle=40)
    current_state = env.reset()
    agent = Q_Agent(env,
                    gamma=0.9,
                    alpha=0.2,
                    e_greedy_prob=initial_e_greedy_prob,
                    e_greedy_decay=e_greedy_decay)
    print(agent.q_val_table)

    # Train
    sum_rewards = 0
    sum_rewards_vec = []
    for steps in range(num_iterations_train):
        action = agent.choose_action(current_state)
        current_state_str = env.state_idx_to_str(current_state)
        next_state, reward, done, info = env.step(action)
        next_state_str = env.state_idx_to_str(next_state)
        action_str = env.action_idx_to_str(action)
        agent.update_q_table(current_state, action, reward, next_state)
        print('steps:', steps, '\n\tcurrent_state:', current_state_str,