示例#1
0
文件: Q4.py 项目: ohad-se/RL3
def run_Q_learning(seed,
                   epsilon_current=0.1,
                   max_episodes=10000,
                   epsilon_decrease=1.,
                   start_at_bottom=False):
    """Train a Q-learning solver on the mountain-car environment.

    :param seed: seed applied to both NumPy and the environment
    :param epsilon_current: initial exploration rate used for training episodes
    :param max_episodes: upper bound on the number of training episodes
    :param epsilon_decrease: multiplicative factor applied to epsilon after
        every episode (epsilon never drops below 0.05)
    :param start_at_bottom: forwarded to ``run_episode`` for training episodes
    :return: tuple ``(episodes_gain, success_rates, bottom_state_val,
        episodes_bellman_err)``; ``success_rates`` gets one entry per test
        round, the other lists one entry per training episode
    """
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)

    epsilon_floor = 0.05
    solver = Solver(
        # learning parameters
        gamma=0.999,
        learning_rate=0.05,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )

    # Reference state whose greedy Q-value is tracked after every episode.
    reference_state = np.asarray([-0.5, 0])

    gains = []
    bellman_errors = []
    reference_values = []
    success_rates = []

    for episode_index in range(1, max_episodes + 1):
        gain, mean_delta = run_episode(env,
                                       solver,
                                       is_train=True,
                                       epsilon=epsilon_current,
                                       start_at_bottom=start_at_bottom)
        gains.append(gain)

        # decay the exploration rate, clipped from below
        epsilon_current = max(epsilon_current * epsilon_decrease,
                              epsilon_floor)
        bellman_errors.append(mean_delta)

        reference_features = solver.get_features(reference_state)
        greedy_action = solver.get_max_action(reference_state)
        reference_values.append(
            solver.get_q_val(reference_features, greedy_action))

        # Only every tenth episode (9, 19, 29, ...) is followed by a test round.
        if episode_index % 10 != 9:
            continue

        test_gains = []
        for _ in range(10):
            test_gain, _ = run_episode(env, solver, is_train=False, epsilon=0.)[:2]
            test_gains.append(test_gain)
        mean_test_gain = np.mean(test_gains)
        success_rates.append(np.mean(np.asarray(test_gains) > -200))
        print(f'tested 10 episodes: mean gain is {mean_test_gain}')

        # termination condition
        if mean_test_gain >= -75.:
            print(f'solved in {episode_index} episodes')
            break

    return gains, success_rates, reference_values, bellman_errors
示例#2
0
    def __init__(self):
        """Create the environment, initialise the weights and the scaling constants."""
        self.game = MountainCarWithResetEnv()
        self.reset_theta()

        # Standardization constants: mean and standard deviation of a uniform
        # distribution over the position / speed range (std = width / sqrt(12)).
        game = self.game
        uniform_std_divisor = np.sqrt(12)
        self.pos_mu = (game.min_position + game.max_position) / 2
        self.pos_sigma = (game.max_position - game.min_position) / uniform_std_divisor
        self.speed_mu = 0
        self.speed_sigma = 2 * game.max_speed / uniform_std_divisor

        # Lazily filled cache of samples used when visualizing the policy.
        self.vis_samples = None
示例#3
0
def mean_std_of_states():
    """Collect transitions and print the mean and std of all visited states.

    Both the source states and the successor states of the collected
    transitions are pooled before the statistics are taken along axis 0.
    """
    collector = DataCollector(MountainCarWithResetEnv())
    states, _actions, _rewards, next_states, _done_flags = collector.collect_data(100000)

    pooled_states = np.concatenate((states, next_states))
    states_mean = np.mean(pooled_states, axis=0)
    states_std = np.std(pooled_states, axis=0)
    print("states_mean: {}, states_std: {}".format(states_mean, states_std))
示例#4
0
def __discretize_state_space(n: int = 100):
    """Build an ``n`` x ``n`` grid over the environment's observation space.

    :param n: number of grid points along each of the two state dimensions
    :return: ``(position_grid, velocity_grid, n)`` where both grids use
        matrix ('ij') indexing, i.e. the first axis follows the position
    """
    space = MountainCarWithResetEnv().observation_space
    lower, upper = space.low, space.high

    position_axis = np.linspace(lower[0], upper[0], n)
    velocity_axis = np.linspace(lower[1], upper[1], n)

    position_grid, velocity_grid = np.meshgrid(
        position_axis, velocity_axis, sparse=False, indexing='ij')
    return position_grid, velocity_grid, n
示例#5
0
def training_the_model(samples_to_collect=100000, seed=100):
    """Run LSPI on freshly collected data and track the evaluation success rate.

    :param samples_to_collect: number of transitions requested from the data collector
    :param seed: seed for NumPy's global random generator
    :return: list with the success rate measured after each LSPI iteration
    """
    kernels_per_dim = [10, 8]
    discount = 0.999
    max_lspi_iterations = 20
    games_per_evaluation = 50
    max_steps_per_game = 300
    convergence_threshold = 0.00001
    np.random.seed(seed)

    env = MountainCarWithResetEnv()

    # collect the transition data set
    collector = DataCollector(env)
    states, actions, rewards, next_states, done_flags = collector.collect_data(
        samples_to_collect)

    # fraction of collected samples carrying reward
    data_success_rate = np.sum(rewards) / len(rewards)
    print(f'Data Success Rate {data_success_rate}')

    # standardize states with statistics fitted on states and successors together
    data_transformer = DataTransformer()
    all_states = np.concatenate((states, next_states), axis=0)
    data_transformer.set_using_states(all_states)
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)

    # encode the standardized states with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(kernels_per_dim)
    encode = feature_extractor.encode_states_with_radial_basis_functions
    encoded_states = encode(states)
    encoded_next_states = encode(next_states)

    # linear policy that starts from random weights
    n_features = feature_extractor.get_number_of_features()
    linear_policy = LinearPolicy(n_features, 3, True)
    random_w = np.random.uniform(size=linear_policy.w.shape)
    linear_policy.set_w(random_w)

    # evaluator that plays games with the current policy
    evaluator = GamePlayer(env, data_transformer, feature_extractor,
                           linear_policy)

    success_rate_vs_iteration = []
    for lspi_iteration in range(max_lspi_iterations):
        print(f'Starting LSPI iteration {lspi_iteration}')

        updated_w = compute_lspi_iteration(encoded_states, encoded_next_states,
                                           actions, rewards, done_flags,
                                           linear_policy, discount)
        norm_diff = linear_policy.set_w(updated_w)

        success_rate_vs_iteration.append(
            evaluator.play_games(games_per_evaluation, max_steps_per_game))

        # stop once the weights barely move between iterations
        if norm_diff < convergence_threshold:
            break

    print('LSPI Done')
    return success_rate_vs_iteration
示例#6
0
def lspi_data_sample(N=3000):
    """Draw ``N`` random one-step transitions from the mountain-car environment.

    Each sample picks a uniformly random position, speed and action. States at
    or beyond the goal position are treated as absorbing: reward 1 and the
    successor equals the state itself. Otherwise the environment is reset to
    the sampled state and stepped once.

    :param N: number of transitions to sample
    :return: ``(data, states, actions, rewards, next_states)``; ``data`` is
        always an empty list, ``states``/``next_states`` have shape (N, 2),
        ``actions`` shape (N,) and ``rewards`` shape (N, 1)
    """
    env = MountainCarWithResetEnv()
    goal_pos = 0.5
    pos_low, pos_high = -1.2, 0.6
    speed_low, speed_high = -0.07, 0.07

    data = []
    states = np.zeros([N, 2])
    actions = np.zeros(N)
    rewards = np.zeros([N, 1])
    next_states = np.zeros([N, 2])

    for row in range(N):
        # draw order (position, speed, action) is kept for reproducibility
        pos = (pos_high - pos_low) * np.random.sample() + pos_low
        speed = (speed_high - speed_low) * np.random.sample() + speed_low
        action = np.random.choice(3)

        sampled_state = np.array([pos, speed])
        states[row, :] = sampled_state
        actions[row] = action

        if pos < goal_pos:
            env.reset_specific(pos, speed)
            successor, reward, _, _ = env.step(action)
        else:
            # already at the goal: absorbing state with unit reward
            successor, reward = sampled_state, 1

        rewards[row, 0] = reward
        next_states[row, :] = successor

    return data, states, actions, rewards, next_states
示例#7
0
def _pad_with_last(rows):
    """Extend each list in ``rows`` in place with its last value up to the longest length."""
    target_len = max(len(row) for row in rows)
    for row in rows:
        missing = target_len - len(row)
        if missing > 0:
            row.extend([row[-1]] * missing)


def test_lspi(N_list=None):
    """Train LSPI for several sample sizes and plot the average success rate.

    For every sample size the experiment is repeated 5 times (seeds 0..4).
    Each intermediate theta returned by ``train_lspi`` is evaluated by playing
    one game from each of 10 initial states (position drawn in [-0.6, -0.4),
    zero speed). Runs that produced fewer thetas are padded with their last
    success rate so the curves can be averaged.

    :param N_list: sample sizes to test; defaults to ``[3000]``
    :return: the averaged success-rate curve (a list) of the last sample size
    :raises ValueError: if ``N_list`` is empty
    """
    # A mutable list as the default value would be shared between calls.
    if N_list is None:
        N_list = [3000]
    if len(N_list) == 0:
        raise ValueError('N_list must contain at least one sample size')

    n_runs = 5
    n_init_states = 10
    max_iter = 1000
    low, high = -0.6, -0.4

    results = []
    fig, ax = plt.subplots()
    for N in N_list:
        N = int(N)
        env = MountainCarWithResetEnv()
        # Drawn before the per-run seeding below, so they depend on the RNG
        # state at this point.
        init_states = [((high - low) * np.random.sample() + low, 0)
                       for _ in range(n_init_states)]

        # One independent list per run (avoids the aliasing of `5 * [[]]`).
        total_success = []
        for run in range(n_runs):
            np.random.seed(seed=run)
            data, states, actions, rewards, next_states = lspi_data_sample(N)
            # presumably one theta per LSPI iteration; verify against train_lspi
            theta_n = list(
                train_lspi(data, states, actions, rewards, next_states))

            success_theta = []
            for theta in theta_n:
                successes = 0
                for init_s in init_states:
                    env.reset_specific(*init_s)
                    a = next_a(np.array(init_s).reshape([1, 2]),
                               theta)  # first step
                    for _ in range(max_iter):
                        next_s, _, is_done, _ = env.step(int(a))
                        a = next_a(next_s.reshape([1, 2]), theta)
                        if is_done:
                            successes += 1.0
                            break
                success_theta.append(successes / len(init_states))
            total_success.append(success_theta)

        _pad_with_last(total_success)
        run_mean = np.mean(np.array(total_success), axis=0)
        results.append(list(run_mean))

    # Bring the curves of all sample sizes to a common length.
    _pad_with_last(results)

    for N, res_mean in zip(N_list, results):
        it = list(range(1, len(res_mean) + 1))
        ax.plot(it, res_mean, label='N = ' + str(int(N)))
    ax.grid(True)
    if len(N_list) > 1:
        plt.title(
            'Average success rate per iteration for different amounts of samples'
        )
    else:
        plt.title('Average success rate per iteration')
    plt.xlabel('Iteration')
    plt.ylabel('Success rate')
    plt.legend(loc='lower left')
    plt.ylim(-0.2, 1.2)
    plt.show()

    # Explicit instead of relying on the variable leaked by the plotting loop.
    return results[-1]
示例#8
0
    plt.xlabel('Position', fontsize=fsize)
    plt.ylabel('Velocity', fontsize=fsize)

    plt.show()


#if __name__ == '__main__':
#    import sys
#    if len(sys.argv) > 1:
#        N = int(sys.argv[1])
#        print(test_lspi(N))
#    else:
#        print(test_lspi())
# %%
# Script entry point: only instantiates the environment. The commented-out
# snippets below are manual demos that render an episode while always applying
# action 1 ("no force") or while sampling random actions.
if __name__ == '__main__':
    env = MountainCarWithResetEnv()
    # # run no force
    # env.reset()
    # env.render()
    # is_done = False
    # while not is_done:
    #     _, r, is_done, _ = env.step(1)
    #     env.render()
    #     print(r)
    # # run random forces
    # env.reset()
    # env.render()
    # is_done = False
    # while not is_done:
    #     _, r, is_done, _ = env.step(env.action_space.sample())  # take a random action
    #     env.render()

# Script entry point: collects transitions, standardizes the states and builds
# the RBF feature extractor.
# NOTE(review): the script is cut off after the last comment below; the
# hyper-parameters that are unused here (gamma, w_updates, evaluation_*) are
# presumably consumed by the part that is not visible - confirm.
if __name__ == '__main__':
    samples_to_collect = 100000
    # samples_to_collect = 150000
    # samples_to_collect = 10000
    number_of_kernels_per_dim = [10, 8]
    gamma = 0.99
    w_updates = 100
    evaluation_number_of_games = 10
    evaluation_max_steps_per_game = 1000

    # fixed seed for NumPy's global random generator
    np.random.seed(123)
    # np.random.seed(234)

    env = MountainCarWithResetEnv()
    # collect data
    states, actions, rewards, next_states, done_flags = DataCollector(
        env).collect_data(samples_to_collect)
    # get data success rate (sum of rewards divided by the number of samples)
    data_success_rate = np.sum(rewards) / len(rewards)
    print(f'success rate {data_success_rate}')
    # standardize data, using states and successor states together
    data_transformer = DataTransformer()
    data_transformer.set_using_states(
        np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)
    # process with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
    # encode all states:
示例#10
0
# Script entry point: for every seed a fresh environment and solver are built,
# then every initial epsilon is iterated over.
# NOTE(review): the body of the epsilon loop is cut off after the list
# initialisation; epsilon_decrease, epsilon_min and max_episodes are presumably
# used in the missing part - confirm.
if __name__ == "__main__":

    seeds = [123]
    epsilons = [1]

    gamma = 0.999
    learning_rate = 0.05
    epsilon_decrease = 0.99
    epsilon_min = 0.01

    max_episodes = 10000

    # accumulators across seeds
    seed_rewards, seed_performance, seed_bottom_val, seed_bellman_err_avg = [], [], [], []
    for seed in seeds:
        env = MountainCarWithResetEnv()
        # seed both NumPy and the environment
        np.random.seed(seed)
        env.seed(seed)

        solver = Solver(
            # learning parameters
            gamma=gamma,
            learning_rate=learning_rate,
            # feature extraction parameters
            number_of_kernels_per_dim=[5, 7],
            # env dependencies (DO NOT CHANGE):
            number_of_actions=env.action_space.n,
        )

        for epsilon_current in epsilons:
            # per-epsilon statistics, reset for every initial epsilon
            rewards, performance, bottom_val, bellman_err_avg, bellman_err = [], [], [], [], []
                # action = int(np.sign(state[1]) + 1)
                # w=np.array([-0.1,0,0,0.1,0.1,0])
                action = self.greedy_policy(state, [False])[0]
                state, reward, done, _ = env.step(action)
                rewards += reward
                if done or itr > 10e2:
                    break
                itr += 1

            rew.append(rewards)

        return np.mean(rew), np.std(rew)


# Script entry point: only instantiates the environment. The commented-out
# snippets below are manual demos that render an episode while always applying
# action 1 ("no force") or while sampling random actions.
if __name__ == '__main__':
    env = MountainCarWithResetEnv()
    # # run no force
    # env.reset()
    # env.render()
    # is_done = False
    # while not is_done:
    #     _, r, is_done, _ = env.step(1)
    #     env.render()
    #     print(r)
    # # run random forces
    # env.reset()
    # env.render()
    # is_done = False
    # while not is_done:
    #     _, r, is_done, _ = env.step(env.action_space.sample())  # take a random action
    #     env.render()
示例#12
0
class QLearningAgent:
    """Q-learning agent for mountain car with a linear action-value model.

    ``Q(s, a)`` is modelled as ``theta . phi(s, a)``. ``phi`` places the RBF
    encoding of the state (25 kernels plus a constant bias term) into the
    slot belonging to action ``a``; with 3 actions this gives 78 weights.
    """

    # Number of discrete actions the feature layout is built for.
    _N_ACTIONS = 3

    def __init__(self):
        """Create the environment, draw random weights and set the scaling constants."""
        self.game = MountainCarWithResetEnv()
        self.reset_theta()

        # Constants used for data standardization: mean and standard deviation
        # of a uniform distribution over the position / speed range
        # (std = width / sqrt(12)).
        self.pos_mu = (self.game.min_position + self.game.max_position)/2
        self.pos_sigma = (self.game.max_position - self.game.min_position)/np.sqrt(12)
        self.speed_mu = 0
        self.speed_sigma = 2*self.game.max_speed/np.sqrt(12)

        # Cache of samples used for visualizing the policy
        self.vis_samples = None

    def reset_theta(self):
        """Re-draw the (1, 78) weight vector from a standard normal distribution."""
        self.theta = np.random.normal(size=(1, 78))

    def reset(self, state=None):
        """Reset the environment, optionally to a specific ``(position, speed)`` state."""
        if state is None:
            return self.game.reset()
        return self.game.reset_specific(*state)

    def _q_values(self, state):
        """Return the (N, 3) table of Q estimates for the N rows of ``state``."""
        n_states = np.shape(state)[0]
        q_est = np.zeros([n_states, self._N_ACTIONS])
        for action in range(self._N_ACTIONS):
            feats = self.extract_features(state, action * np.ones(n_states))
            q_est[:, action] = self.theta.dot(feats.T)
        return q_est

    def next_a(self, state):
        """Return the action chosen by the current weights for every row of ``state``.

        :param state: array of shape (N, 2) holding (position, speed) rows
        :return: integer array of shape (N,)
        """
        # NOTE(review): the greedy index is mirrored (2 - argmax) before it is
        # returned; kept as in the original, confirm that this is intended.
        return 2 - np.argmax(self._q_values(state), axis=1)

    def q_max(self, state):
        """Return ``max_a Q(s, a)`` for every row of ``state`` as an (N,) array."""
        return np.max(self._q_values(state), axis=1)

    def q(self, state, action):
        """Return the Q estimate of a single state-action pair.

        :param state: array of shape (1, 2)
        :param action: scalar or one-element array with the action index
        """
        return self.theta.dot(self.extract_features(state, action * np.ones(1)).T)

    def extract_features(self, s, actions):
        """Build state-action features: the RBF encoding placed in the action's slot.

        :param s: array of shape (N, 2)
        :param actions: iterable with one action index per row of ``s``
        :return: array of shape (N, 3 * n_rbf_features), zero outside the
            slot of the selected action
        """
        e_s = self.rbf(s)
        n_samples, n_feats = np.shape(e_s)
        feats = np.zeros([n_samples, self._N_ACTIONS * n_feats])
        for i, a in enumerate(actions):
            lo, hi = int(a) * n_feats, int(a + 1) * n_feats
            feats[i, lo:hi] = e_s[i, :]
        return feats

    def rbf(self, s):
        """Encode states with 25 radial kernels on a 5 x 5 grid plus a bias column.

        States and kernel centers are standardized with the constants set in
        ``__init__``. The kernel is ``exp(-||x - c||)`` (distance not squared).

        :param s: array of shape (N, 2)
        :return: array of shape (N, 26); the last column is the constant 1
        """
        n_s = np.zeros(s.shape)
        n_s[:, 0] = (s[:, 0] - self.pos_mu) / self.pos_sigma
        n_s[:, 1] = (s[:, 1] - self.speed_mu) / self.speed_sigma

        centers = [(pos, speed)
                   for pos in (-1.2, -0.6, 0, 0.6, 1.2)
                   for speed in (-0.07, -0.03, 0, 0.03, 0.07)]
        n_centers = np.array([(
            (c[0] - self.pos_mu)/self.pos_sigma,
            (c[1] - self.speed_mu)/self.speed_sigma) for c in centers])
        scales = [1 for _ in centers]

        # Start from ones so the extra last column acts as the bias feature.
        feats = np.ones([n_s.shape[0], len(scales) + 1])
        for i, n_c in enumerate(n_centers):
            feats[:, i] = np.exp(-scales[i] * np.linalg.norm(n_s - n_c, axis=1))

        return feats

    def visualize(self):
        """Scatter-plot the action chosen by ``next_a`` over a cached set of sampled states."""
        if self.vis_samples is None:
            # sample once and reuse the same states for every later plot
            ret = lspi_data_sample(10000)
            self.vis_samples = ret[1]

        N = self.vis_samples.shape[0]
        opt_a = self.next_a(self.vis_samples)

        plt.clf()
        ac = [0, 1, 2]
        for a, color, label in zip(ac, ['tab:blue', 'tab:orange', 'tab:green'], ['LEFT', 'STAY', 'RIGHT']):
            xy = self.vis_samples[a == opt_a, :]
            plt.scatter(xy[:, 0], xy[:, 1], c=color, label=label, edgecolors='none')

        plt.legend()
        plt.grid(True)
        plt.title('Sample size - {}'.format(N))
        plt.xlabel('Position')
        plt.ylabel('Velocity')

        plt.pause(0.1)

    def gather_data(self, epsilon, iterations_per_game=1000, games=5):
        """Play epsilon-greedy games and record the observed transitions.

        :param epsilon: probability of taking a uniformly random action
        :param iterations_per_game: maximum number of steps per game
        :param games: number of games to play
        :return: ``(data, success_rate, data_index)`` where ``data`` is the
            tuple ``(states, actions, next_states, rewards)`` of pre-allocated
            arrays, ``success_rate`` is the summed reward divided by ``games``
            and ``data_index`` is the number of rows actually filled
        """
        capacity = iterations_per_game*games
        states = np.zeros((capacity, 2))
        actions = np.zeros((capacity, 1))
        next_states = np.zeros((capacity, 2))
        rewards = np.zeros((capacity, 1))
        data = (states, actions, next_states, rewards)

        success_count = 0
        data_index = 0
        for _ in range(games):
            state = self.reset()
            state = state.reshape((1, 2))
            for _ in range(iterations_per_game):
                if np.random.uniform() > epsilon:
                    action = self.next_a(state)[0]
                else:
                    action = np.random.choice(3)
                next_state, reward, is_done, _ = self.game.step(action)
                success_count += np.sum(reward)
                states[data_index, :] = state
                actions[data_index, :] = action
                next_states[data_index, :] = next_state
                rewards[data_index, :] = reward
                data_index += 1
                state = np.array(next_state).reshape((1, 2))
                if is_done:
                    break
        success_rate = success_count / games
        return data, success_rate, data_index

    def train_step(self, alpha, data, batch_size=100, gamma=0.999):
        """Compute updated weights from one batch; ``self.theta`` is not modified.

        The batch consists of every sample with reward 1 plus randomly drawn
        samples (with replacement, duplicates collapse) filling up to
        ``batch_size``. The summed update is scaled by its largest absolute
        entry before the step of size ``alpha`` is taken.

        :param alpha: step size
        :param data: tuple ``(states, actions, next_states, rewards)``
        :param batch_size: target number of samples in the batch
        :param gamma: discount factor
        :return: the new weight array
        """
        data_length = data[0].shape[0]
        reward_indices = (data[3] == 1).reshape(data_length)
        reward_count = reward_indices.sum()
        # More rewarded samples than batch_size would make the requested size
        # negative, which np.random.randint rejects.
        n_random = max(batch_size - reward_count, 0)
        batch_indices = np.random.randint(0, data_length, n_random)
        batch_marker = np.zeros(data_length, dtype=bool)
        batch_marker[batch_indices] = True
        batch_marker[reward_indices] = True
        batch_size = batch_marker.sum()

        states = data[0][batch_marker]
        actions = data[1][batch_marker]
        next_states = data[2][batch_marker]
        rewards = data[3][batch_marker]

        update_step = 0
        for i in range(batch_size):
            state = states[i].reshape((1, 2))
            next_state = next_states[i].reshape((1, 2))
            # temporal-difference error of the sample
            coeff = rewards[i] + gamma * self.q_max(next_state) - self.q(state, actions[i])
            update_step += self.extract_features(state, actions[i]) * coeff
        max_element = np.max(np.abs(update_step))
        # `or 1` avoids dividing by zero when the whole update vanishes
        return self.theta + alpha * update_step / (max_element or 1)

    def reset_random(self):
        """Reset the environment to a uniformly random position and speed."""
        init_state = (np.random.uniform(-1.2, 0.6), np.random.uniform(-0.07, 0.07))
        return self.reset(init_state)

    def train(self, init_epsilon=1, init_alpha=1, max_iterations=30, visualise=True, test_states=()):
        """Alternate data gathering and weight updates, testing after each iteration.

        Every iteration gathers fresh epsilon-greedy data, applies 10 training
        steps on it, evaluates the policy from ``test_states`` and then decays
        epsilon (x0.9) and alpha (x0.8).

        :param init_epsilon: initial exploration rate
        :param init_alpha: initial step size
        :param max_iterations: number of gather/train iterations
        :param visualise: plot the policy after every iteration
        :param test_states: initial states used for evaluation (immutable
            empty default instead of a shared mutable list)
        :return: array of shape (len(test_states), max_iterations) holding 1
            where the test game finished and 0 otherwise
        """
        alpha = init_alpha
        epsilon = init_epsilon
        success_rates = np.zeros((len(test_states), max_iterations))
        for i in range(max_iterations):
            data, win_pct, max_ind = self.gather_data(epsilon)

            # keep only the rows that were actually filled
            data = (data[0][:max_ind, :],
                    data[1][:max_ind, :],
                    data[2][:max_ind, :],
                    data[3][:max_ind, :])
            old_theta = self.theta
            for _ in range(10):
                self.theta = self.train_step(alpha, data)
            theta_diff = self.theta - old_theta

            diff_max = np.max(np.abs(theta_diff))
            theta_max = np.max(np.abs(self.theta))
            success_rates[:, i] = self.test_train_iteration(test_states)
            avg_rate = np.average(success_rates[:, i])
            print("Iter", i, "train_iters", max_ind, "train_win_pct", win_pct, "test_win_pct", avg_rate, "alpha", alpha, "ep", epsilon, "theta_new - theta (max) =", diff_max, "theta_max", theta_max)
            epsilon = 0.9 * epsilon
            alpha = 0.8 * alpha
            if visualise:
                self.visualize()
        return success_rates

    def play(self, init_state=None, render=True, max_iterations=1000):
        """Play one game with the current policy.

        :param init_state: optional ``(position, speed)`` start state
        :param render: render the environment after every step
        :param max_iterations: maximum number of steps
        :return: True if the environment reported done within the step limit
        """
        state = self.reset(init_state).reshape((1,2))
        done = False
        for _ in range(max_iterations):
            # next_a returns a one-element array; index it explicitly, since
            # int() on a 1-d array is deprecated in NumPy.
            action = int(self.next_a(state)[0])
            next_state, reward, is_done, _ = self.game.step(action)
            if render:
                self.game.render()
            state = np.array(next_state).reshape((1,2))
            if is_done:
                done = True
                break
        self.game.close()
        return done

    def test_train_iteration(self, test_init_states):
        """Play one unrendered game per initial state; return an array of 1/0 outcomes."""
        results = np.zeros(len(test_init_states))
        for state_idx, init_state in enumerate(test_init_states):
            results[state_idx] = int(self.play(init_state, render=False))

        return results

    def get_test_states(self, count=10):
        """Return ``count`` start states with position in [-0.6, -0.4) and zero speed."""
        return [(np.random.uniform(low=-0.6, high=-0.4), 0) for _ in range(count)]

    def test_model(self, training_cycles=5, test_states=None, **training_args):
        """Train from scratch several times and average the success rates.

        :param training_cycles: number of independent training runs
        :param test_states: evaluation start states; drawn with
            ``get_test_states`` when omitted
        :param training_args: forwarded to ``train``
        :return: ``(test_states, success_rates)`` with the rates averaged
            over the training cycles
        """
        success_rates = None
        if test_states is None:
            test_states = self.get_test_states()
        for t in range(training_cycles):
            self.reset_theta()
            print("*** TRAINING EXPERIMENT {} ***".format(t))
            rates = self.train(test_states=test_states, **training_args)
            print("*** RESULT ***")
            print(rates)
            if success_rates is None:
                success_rates = rates
            else:
                success_rates += rates
        success_rates /= training_cycles

        return test_states, success_rates

    def plot_success_rates(self, success_rates):
        """Plot the success rate per iteration, averaged over the test states (axis 0)."""
        plt.figure()
        avg = np.average(success_rates, axis=0)
        plt.plot(avg)
        plt.title('Average success rate per iteration')
        plt.xlabel('Iteration')
        plt.ylabel('Success rate')
        plt.show()
示例#13
0
def run_q_learning_training(seed, epsilon=0.1, max_episodes=1000):
    """Train a Q-learning solver with a fixed epsilon and collect statistics.

    :param seed: seed applied to both NumPy and the environment
    :param epsilon: exploration rate used for all training episodes
    :param max_episodes: upper bound on the number of training episodes
    :return: ``defaultdict(list)`` with the keys ``init_state``, ``reward``
        (one entry per episode), ``bellman_error`` / ``bellman_error_index``
        (one entry per block of episodes) and ``performance`` (one entry per
        test round)
    """
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)

    solver = Solver(
        # learning parameters
        gamma=0.999,
        learning_rate=0.01,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )
    train_statistics = defaultdict(list)

    pending_errors = []
    error_block_label = 100
    for episode_index in range(1, max_episodes + 1):
        episode_gain, mean_delta = run_episode(env,
                                               solver,
                                               is_train=True,
                                               epsilon=epsilon)
        pending_errors.append(mean_delta)

        print(
            f'After {episode_index}, reward = {episode_gain}, epsilon {epsilon}, average error {mean_delta}'
        )

        # Greedy value estimate of a freshly reset initial state.
        env.reset()
        init_state = env.state
        action_features = [
            solver.get_state_action_features(init_state, action)
            for action in (0, 1, 2)
        ]
        action_values = [phi.transpose() @ solver.theta
                         for phi in action_features]

        train_statistics["init_state"].append(max(action_values))
        train_statistics["reward"].append(episode_gain)

        # Close a Bellman-error block at episodes 99, 199, ...
        if episode_index % 100 == 99:
            train_statistics["bellman_error"].append(np.mean(pending_errors))
            train_statistics["bellman_error_index"].append(error_block_label)
            error_block_label += 100
            pending_errors = []

        # Only episodes 9, 19, 29, ... are followed by a test round.
        if episode_index % 10 != 9:
            continue

        test_gains = []
        for _ in range(10):
            test_gains.append(
                run_episode(env, solver, is_train=False, epsilon=0.)[0])
        mean_test_gain = np.mean(test_gains)
        train_statistics["performance"].append(mean_test_gain)

        print(f'tested 10 episodes: mean gain is {mean_test_gain}')
        if mean_test_gain >= -75.:
            print(f'solved in {episode_index} episodes')
            break

    return train_statistics
示例#14
0
def run_lspi(seed,
             w_updates=20,
             samples_to_collect=100000,
             evaluation_number_of_games=1,
             evaluation_max_steps_per_game=200,
             thresh=0.00001,
             only_final=False):
    """
    Main LSPI routine: collect data, iterate the weights, store the results.

    Weights are pickled to ``./Results/weight.pickle`` after every iteration.
    The performance history (or, with ``only_final``, the final score) is
    pickled as well, and one rendered game is played at the end.

    :param seed: random seed for the run
    :param w_updates: maximum number of weight updates
    :param samples_to_collect: how many samples to collect
    :param evaluation_number_of_games: number of games per evaluation
    :param evaluation_max_steps_per_game: step limit of an evaluation game
    :param thresh: stop once the weight change reported by ``set_w`` is below this
    :param only_final: evaluate only once, after the last update
    :return: None
    """
    res_dir = './Results/'
    np.random.seed(seed)
    kernels_per_dim = [12, 10]
    discount = 0.999
    track_history = not only_final

    env = MountainCarWithResetEnv()

    # collect the transition data set
    collector = DataCollector(env)
    states, actions, rewards, next_states, done_flags = collector.collect_data(
        samples_to_collect)

    # fraction of collected samples carrying reward
    data_success_rate = np.sum(rewards) / len(rewards)
    print('success rate: {}'.format(data_success_rate))

    # standardize states with statistics fitted on states and successors together
    data_transformer = DataTransformer()
    data_transformer.set_using_states(
        np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)

    # encode the standardized states with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(kernels_per_dim)
    encoded_states = feature_extractor.encode_states_with_radial_basis_functions(
        states)
    encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions(
        next_states)

    # linear policy that starts from random weights
    linear_policy = LinearPolicy(feature_extractor.get_number_of_features(), 3,
                                 True)
    linear_policy.set_w(np.random.uniform(size=linear_policy.w.shape))

    # evaluator that plays games with the current policy
    evaluator = GamePlayer(env, data_transformer, feature_extractor,
                           linear_policy)

    def evaluate():
        return evaluator.play_games(evaluation_number_of_games,
                                    evaluation_max_steps_per_game)

    performances = []
    if track_history:
        performances.append(evaluate())

    # Developer toggle: resume from previously stored weights.
    read = False
    if read:
        with open(res_dir + 'weight.pickle', 'rb') as handle:
            new_w = pickle.load(handle)
            linear_policy.set_w(np.expand_dims(new_w, 1))

    for lspi_iteration in range(w_updates):
        print('starting lspi iteration {}'.format(lspi_iteration))

        new_w = compute_lspi_iteration(encoded_states, encoded_next_states,
                                       actions, rewards, done_flags,
                                       linear_policy, discount)
        # checkpoint the weights before they are installed
        with open(res_dir + 'weight.pickle', 'wb') as handle:
            pickle.dump(new_w, handle, protocol=pickle.HIGHEST_PROTOCOL)

        norm_diff = linear_policy.set_w(new_w)
        if track_history:
            performances.append(evaluate())
        if norm_diff < thresh:
            break
    print('done lspi')

    if track_history:
        result_path = res_dir + 'perf' + str(seed) + '.pickle'
        payload = performances
    else:
        payload = evaluate()
        result_path = res_dir + 'final_perf' + str(samples_to_collect) + '.pickle'
    with open(result_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

    evaluator.play_game(evaluation_max_steps_per_game, render=True)