示例#1
0
def perfect_features_test():
    """Train a linear predictor with Adam on the Amatrix task.

    Uses ``env.Amatrix`` as the feature matrix (labelled "perfect features"
    by the original author), draws random rows, and regresses noisy targets
    onto them. Progress is printed every 10000 samples; at the end the
    learned weights are compared against ``env.theta_star`` via the L2 norm
    of their difference.
    """
    from src.env.Amatrix_task import Amatrix

    dim = 20
    approx_dim = 3
    env = Amatrix(dim, approx_dim)

    feature_matrix = env.Amatrix  # perfect features
    theta = np.random.rand(dim)

    cfg = Config()
    cfg.parameter_size = dim
    cfg.init_alpha = 0.001
    optimizer = Adam(cfg)

    total_samples = 100000
    report_every = 10000
    for step in range(1, total_samples + 1):
        row = np.random.randint(dim)
        target = env.sample_target(row, noisy=True)

        row_features = feature_matrix[row, :]
        error = target - np.dot(row_features, theta)
        # The optimizer returns a 3-tuple; only the updated weights are used.
        _, _, theta = optimizer.update_weight_vector(error, row_features,
                                                     theta)
        if step % report_every == 0:
            print("Sample number: {0}".format(step))
            print("\tPrediction error:{0}".format(error))

    print("Theta star:\n{0}".format(env.theta_star))
    print("Estimated theta:\n{0}".format(theta))
    l2_gap = np.sqrt(np.sum(np.square(env.theta_star - theta)))
    print("L2 norm of difference:\n{0}".format(l2_gap))
示例#2
0
def imperfect_features_test():
    """Train a linear predictor with Adam on the Amatrix task's approximate features.

    The feature matrix comes from ``env.get_approx_A()`` and the weight
    vector has ``m`` entries. Random rows are sampled, a noisy target is
    drawn for each, and the weights are updated from the prediction error.

    Progress is reported every ``report_interval`` samples rather than on
    every sample, matching the other tests in this file and avoiding
    100000 lines of console output per run.
    """
    from src.env.Amatrix_task import Amatrix

    n = 4
    m = 2
    env = Amatrix(n, m)

    features = env.get_approx_A()  # first m features
    weights = np.random.rand(m)

    config = Config()
    config.parameter_size = m
    config.init_alpha = 0.001
    adam = Adam(config)

    sample_size = 50000
    report_interval = 10000
    for i in range(sample_size):
        rand_row = np.random.randint(n)
        target = env.sample_target(rand_row, noisy=True)

        pred_features = features[rand_row, :]
        prediction = np.dot(pred_features, weights)
        error = target - prediction
        # Only the updated weight vector is needed from the returned 3-tuple.
        _, _, weights = adam.update_weight_vector(error, pred_features,
                                                  weights)
        if (i + 1) % report_interval == 0:
            print("Sample number: {0}".format(i + 1))
            print("\tPrediction error:{0}".format(error))

    print("Theta star:\n{0}".format(env.theta_star))
    print("Estimated theta:\n{0}".format(weights))
示例#3
0
def adding_bad_features_test():
    """Train with SIDBD on the Amatrix task while repeatedly appending new features.

    Training runs in phases of 50000 samples. After every phase except the
    last, one column obtained from ``env.get_new_bad_features(1)`` is
    appended to the feature matrix, the optimizer is grown by one
    parameter, and the weight vector is extended with a zero entry.
    Progress is printed every 25000 samples, and the learned weights are
    printed next to ``env.theta_star`` at the end of each phase.
    """
    from src.env.Amatrix_task import Amatrix

    num_rows = 10
    num_feats = 5
    env = Amatrix(num_rows, num_feats)

    feature_matrix = env.get_approx_A()  # first m features
    theta = np.zeros(num_feats)

    cfg = Config()
    cfg.parameter_size = num_feats
    cfg.theta = 0.1
    cfg.init_beta = np.log(0.0001)
    optimizer = SIDBD(cfg)

    samples_per_phase = 50000
    report_every = 25000
    extra_features = 30
    for phase in range(extra_features + 1):
        print("Number of features in the representation: {0}".format(
            optimizer.parameter_size))
        for step in range(1, samples_per_phase + 1):
            row = np.random.randint(num_rows)
            target = env.sample_target(row, noisy=True)

            row_features = feature_matrix[row, :]
            error = target - np.dot(row_features, theta)
            _, _, theta = optimizer.update_weight_vector(
                error, row_features, theta)
            if step % report_every == 0:
                print("\tSample number: {0}".format(step))
                print("\t\tPrediction error: {0}".format(error))

        print("Theta star:\n{0}".format(env.theta_star))
        print("Estimated theta:\n{0}".format(theta))

        # Nothing is appended after the final phase.
        if phase == extra_features:
            break

        print("Adding new feature...")
        extra_column = env.get_new_bad_features(1)
        feature_matrix = np.hstack((feature_matrix, extra_column))
        optimizer.increase_size(1)

        # Extend the weight vector with a zero weight for the new feature.
        grown = np.zeros(num_feats + 1)
        grown[:num_feats] = theta
        theta = grown
        num_feats += 1
def test_function_approximator(num_features=20,
                               initial_features=20,
                               num_iterations=10000,
                               chkpt=100,
                               plot_mse=True,
                               noisy=True,
                               add_features=False,
                               add_true_features=True,
                               feature_add_interval=100,
                               mixed_features=False):
    """Train a LinearFunctionApproximator with SGD on the RandomFeatures task.

    Args:
        num_features: stored as ``config.num_true_features`` for the task.
        initial_features: stored as ``config.num_obs_features`` and used as
            the optimizer's initial ``parameter_size``.
        num_iterations: number of training samples to draw.
        chkpt: number of iterations averaged into each reported MSE value.
        plot_mse: if True, plot the per-checkpoint MSE with matplotlib.
        noisy: forwarded to ``task.sample_observation``.
        add_features: if True, add one feature every
            ``feature_add_interval`` iterations.
        add_true_features: forwarded as ``true_feature`` when a feature is
            added.
        feature_add_interval: number of iterations between feature additions.
        mixed_features: if True, flip ``add_true_features`` after every
            addition.
    """
    from src.step_size_methods import SGD

    cfg = Config()
    # task setup
    cfg.num_true_features = num_features
    cfg.num_obs_features = initial_features  # same as function approximator
    cfg.max_num_features = 20000  # same as function approximator
    task = RandomFeatures(cfg)

    # function approximator setup
    approximator = LinearFunctionApproximator(cfg)

    # optimizer setup
    cfg.parameter_size = initial_features
    cfg.alpha = 0.001
    optimizer = SGD(cfg)

    # bookkeeping for the MSE curve
    num_checkpoints = num_iterations // chkpt
    mse_per_chpt = np.zeros(num_checkpoints, dtype=np.float64)
    running_mse = 0
    checkpoint_idx = 0

    # training loop (step is the 1-based iteration number)
    for step in range(1, num_iterations + 1):
        target, obs_features, _ = task.sample_observation(noisy=noisy)
        prediction = approximator.get_prediction(obs_features)
        error = target - prediction
        current_weights = approximator.get_weight_vector()
        _, _, updated_weights = optimizer.update_weight_vector(
            error, obs_features, current_weights)
        approximator.update_weight_vector(updated_weights)

        running_mse += np.square(error) / chkpt
        if step % chkpt == 0:
            # reporting and saving
            print("Iteration number: {0}".format(step))
            print("\tTarget: {0:.4f}".format(target))
            print("\tPrediction: {0:.4f}".format(prediction))
            print("\tMean Squared Error: {0:.4f}".format(running_mse))
            mse_per_chpt[checkpoint_idx] += running_mse
            running_mse *= 0
            checkpoint_idx += 1

        if add_features and step % feature_add_interval == 0:
            # grow the task, the approximator and the optimizer together
            task.add_new_feature(k=1, true_feature=add_true_features)
            approximator.increase_num_features(k=1)
            optimizer.increase_size(k=1)
            if mixed_features:
                add_true_features = not add_true_features

    if not plot_mse:
        return

    # plots
    import matplotlib.pyplot as plt
    checkpoints_axis = np.arange(num_iterations // chkpt)
    plt.plot(checkpoints_axis, mse_per_chpt)
    plt.show()
    plt.close()
示例#5
0
def boyan_chain_test(steps=50000):
    """Run TD learning with the AutoTIDBD optimizer on the BoyanChain environment.

    At every step the squared difference between the approximator's
    prediction and ``env.compute_true_value()`` is accumulated; the mean
    over each block of 100 steps is stored and plotted at the end. When
    step ``steps // 2`` is reached, four features are added to the
    environment, the approximator and the optimizer.

    Args:
        steps: total number of environment steps to run.
    """
    from src.env.BoyanChain import BoyanChain
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.util import Config
    import matplotlib.pyplot as plt

    cfg = Config()
    block_size = 100
    # Environment setup
    cfg.init_noise_var = 0.1
    cfg.num_obs_features = 4
    cfg.max_num_features = 9
    # AutoTIDBD setup
    cfg.parameter_size = 4
    cfg.theta = 0.001
    cfg.tau = 10000
    cfg.init_stepsize = 0.001
    # learning-progress bookkeeping
    msve_curve = np.zeros(steps // block_size, dtype=np.float64)
    block_idx = 0
    block_msve = 0

    env = BoyanChain(cfg)
    approximator = LinearFunctionApproximator(cfg)
    optimizer = AutoTIDBD(cfg)

    # Start of learning
    feats = env.get_observable_features()
    midpoint = steps // 2
    for step in range(1, steps + 1):
        value = approximator.get_prediction(feats)
        true_value = env.compute_true_value()
        # step in the environment
        _, reward, next_feats, done = env.step()
        next_value = approximator.get_prediction(next_feats)
        # TD error; the bootstrap term is dropped on termination
        td_error = reward + (1 - done) * next_value - value
        # update weights
        _, _, updated_weights = optimizer.update_weight_vector(
            td_error,
            features=feats,
            weights=approximator.get_weight_vector(),
            discounted_next_features=next_feats)
        approximator.update_weight_vector(updated_weights)
        # move on to the next observation
        feats = next_feats
        # keep track of progress
        block_msve += np.square(value - true_value) / block_size
        # restart the episode on termination
        if done:
            env.reset()
            feats = env.get_observable_features()
        # store learning progress so far
        if step % block_size == 0:
            msve_curve[block_idx] += block_msve
            block_msve *= 0
            block_idx += 1

        if step == midpoint:
            # grow the representation and refresh the current observation
            env.add_feature(k=4, noise=0.0, fake_feature=False)
            approximator.increase_num_features(4)
            optimizer.increase_size(4)
            feats = env.get_observable_features()

    print("The average MSVE is: {0:0.4f}".format(np.average(msve_curve)))

    block_axis = np.arange(msve_curve.size) + 1
    plt.plot(block_axis, msve_curve)
    plt.show()
    plt.close()
示例#6
0
def sarsa_zero_test(steps=10000,
                    add_new_centers=False,
                    number_of_irrelevant_features=0):
    """Run Sarsa(0) with an epsilon-greedy policy on MountainCar and plot returns.

    One LinearFunctionApproximator and one SGD optimizer are kept per
    action, all fed by a shared RadialBasisFunction feature map. When step
    ``steps // 2`` is reached the feature map can be extended, and every
    approximator and optimizer is grown to match. The return of every
    completed episode is printed and plotted, with a vertical line at the
    episode count reached at the midpoint.

    Args:
        steps: total number of environment steps to run.
        add_new_centers: if True, five additional centers are added to the
            feature function at the midpoint.
        number_of_irrelevant_features: number of features added at the
            midpoint with ``fake_feature=True``; 0 disables this.
    """
    import matplotlib.pyplot as plt
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.step_size_methods.sgd import SGD

    # epsilon greedy policy
    def choose_action(av_array: np.ndarray, epsilon):
        """Return a greedy action (ties broken at random) or, with
        probability ``epsilon``, a uniformly random one."""
        p = np.random.rand()
        if p > epsilon:
            argmax_av = np.random.choice(
                np.flatnonzero(av_array == av_array.max()))
            return argmax_av
        else:
            return np.random.randint(av_array.size)

    # for computing action values
    def get_action_values(n, features, approximator_list):
        """Return the predictions of the first ``n`` approximators for
        ``features`` as a float64 array."""
        action_values = np.zeros(n, dtype=np.float64)
        for k in range(n):
            action_values[k] += approximator_list[k].get_prediction(features)
        return action_values

    completed_episodes_per_run = []
    for _ in range(1):
        print("==== Results for Sarsa(0) with Epsilon Greedy Policy ====")
        config = Config()

        # setting up feature function
        config.state_dims = 2
        config.state_lims = np.array(((-1, 1), (-1, 1)), dtype=np.float64)
        # alternative center layout kept from the original author:
        # config.initial_centers = np.array(((0.0,0.0), (-1.8,0), (1.8,0), (0.0,-1.8), (0.0,1.8)), dtype=np.float64)
        config.initial_centers = np.array(
            ((0.0, 0.0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25),
             (-0.25, 0.25)),
            dtype=np.float64)
        config.sigma = 0.5
        config.init_noise_mean = 0.0
        config.init_noise_var = 0.01
        feature_function = RadialBasisFunction(config)

        # setting up environment
        config.norm_state = True
        env = MountainCar(config)

        # function approximator and optimizer parameters
        num_actions = 3
        random_action_prob = 0.1
        gamma = 0.99
        config.num_obs_features = feature_function.num_features
        config.max_num_features = 200  # as long as this is more than 12
        config.num_actions = num_actions
        config.alpha = 0.005
        config.rescale = False
        config.parameter_size = feature_function.num_features
        function_approximator = []
        optimizer = []
        # one instance for each action
        for _action in range(num_actions):
            function_approximator.append(LinearFunctionApproximator(config))
            optimizer.append(SGD(config))

        # setting up summaries
        all_episodes_return = []
        episode_return = 0

        # setting up initial state, action, features, and action values
        curr_s = env.get_current_state()
        curr_features = feature_function.get_observable_features(curr_s)
        curr_avs = get_action_values(num_actions, curr_features,
                                     function_approximator)
        curr_a = choose_action(curr_avs, random_action_prob)
        midpoint_episode = 0
        for i in range(steps):
            # get current action values
            curr_avs = get_action_values(num_actions, curr_features,
                                         function_approximator)
            # execute current action
            next_s, r, terminal = env.step(curr_a)
            next_features = feature_function.get_observable_features(next_s)
            # get next action values and action
            next_action_values = get_action_values(num_actions, next_features,
                                                   function_approximator)
            next_action = choose_action(next_action_values, random_action_prob)
            # compute TD error for Sarsa(0); no bootstrapping on termination
            td_error = r + gamma * (
                1 -
                terminal) * next_action_values[next_action] - curr_avs[curr_a]
            # update weight vector of the action that was taken
            _, _, new_weights = optimizer[curr_a].update_weight_vector(
                td_error, curr_features,
                function_approximator[curr_a].get_weight_vector())
            function_approximator[curr_a].update_weight_vector(new_weights)
            # set current state, features and action.
            # curr_s must follow the environment on every step: the midpoint
            # branch below recomputes the features from it, and a stale value
            # would roll the features back to the episode's start state.
            curr_s = next_s
            curr_features = next_features
            curr_a = next_action
            # keep track of sum of rewards
            episode_return += r
            # if terminal state
            if terminal:
                env.reset()
                all_episodes_return.append(episode_return)
                episode_return *= 0
                curr_s = env.get_current_state()
                curr_features = feature_function.get_observable_features(
                    curr_s)
                curr_avs = get_action_values(num_actions, curr_features,
                                             function_approximator)
                curr_a = choose_action(curr_avs, random_action_prob)
            # if midpoint of training
            if (i + 1) == (steps // 2):
                if add_new_centers:
                    new_centers = np.array(
                        ((0, 0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25),
                         (-0.25, 0.25)),
                        dtype=np.float64)
                    feature_function.add_centers(new_centers,
                                                 noise_var=0,
                                                 noise_mean=0)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(
                            new_centers.shape[0])
                        optimizer[k].increase_size(new_centers.shape[0],
                                                   init_stepsize=0.25)
                if number_of_irrelevant_features > 0:
                    new_feature_mean = 0.0
                    new_feature_var = 0.05
                    fake_features = True
                    feature_function.add_feature(number_of_irrelevant_features,
                                                 noise_mean=new_feature_mean,
                                                 noise_var=new_feature_var,
                                                 fake_feature=fake_features)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(
                            number_of_irrelevant_features)
                        optimizer[k].increase_size(
                            number_of_irrelevant_features)
                # recompute the features of the state the agent is in so that
                # they match the (possibly enlarged) representation
                curr_features = feature_function.get_observable_features(
                    curr_s)
                midpoint_episode = len(all_episodes_return)
        completed_episodes_per_run.append(len(all_episodes_return))
        print("Number of episodes completed: {0}".format(
            len(all_episodes_return)))
    print("Average episodes completed: {0:0.4f}".format(
        np.average(completed_episodes_per_run)))

    # The summaries below refer to the last (here: only) run.
    print("Return per episode:\n", all_episodes_return)
    plt.plot(np.arange(len(all_episodes_return)) + 1, all_episodes_return)
    plt.vlines(x=midpoint_episode, ymin=-800, ymax=0)
    plt.ylim((-800, 0))
    plt.show()
    plt.close()