Example #1

import numpy as np
# MountainCarWithResetEnv, DataCollector, DataTransformer, RadialBasisFunctionExtractor,
# LinearPolicy, GamePlayer and compute_lspi_iteration are assumed to be provided by the
# surrounding project and imported there.

def training_the_model(samples_to_collect=100000, seed=100):
    number_of_kernels_per_dim = [10, 8]
    gamma = 0.999
    w_updates = 20
    evaluation_number_of_games = 50
    evaluation_max_steps_per_game = 300
    np.random.seed(seed)

    env = MountainCarWithResetEnv()
    # collect data
    states, actions, rewards, next_states, done_flags = DataCollector(
        env).collect_data(samples_to_collect)
    # get data success rate
    data_success_rate = np.sum(rewards) / len(rewards)
    print(f'Data Success Rate {data_success_rate}')
    # standardize data
    data_transformer = DataTransformer()
    data_transformer.set_using_states(
        np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)
    # process with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
    # encode all states:
    encoded_states = feature_extractor.encode_states_with_radial_basis_functions(
        states)
    encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions(
        next_states)
    # set a new linear policy
    linear_policy = LinearPolicy(feature_extractor.get_number_of_features(), 3,
                                 True)
    # but set the weights as random
    linear_policy.set_w(np.random.uniform(size=linear_policy.w.shape))
    # start an object that evaluates the success rate over time
    evaluator = GamePlayer(env, data_transformer, feature_extractor,
                           linear_policy)

    success_rate_vs_iteration = list()

    for lspi_iteration in range(w_updates):
        print(f'Starting LSPI iteration {lspi_iteration}')

        new_w = compute_lspi_iteration(encoded_states, encoded_next_states,
                                       actions, rewards, done_flags,
                                       linear_policy, gamma)
        norm_diff = linear_policy.set_w(new_w)

        success_rate = evaluator.play_games(evaluation_number_of_games,
                                            evaluation_max_steps_per_game)

        success_rate_vs_iteration.append(success_rate)

        if norm_diff < 0.00001:
            break

    print('LSPI Done')
    return success_rate_vs_iteration
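
# compute_lspi_iteration is called above but not shown in this example. The sketch
# below is one possible LSTDQ-style implementation under assumptions that are NOT
# taken from the original code: the policy weights are a flat vector of length
# num_actions * n_features, the bias term used by LinearPolicy is ignored, and
# phi(s, a) places the RBF state features in the block belonging to action a.
def lstdq_iteration_sketch(encoded_states, encoded_next_states, actions,
                           rewards, done_flags, current_w, gamma, num_actions=3):
    n_samples, n_features = encoded_states.shape
    w = current_w.reshape(num_actions, n_features)
    k = num_actions * n_features

    def state_action_features(phi_s, a):
        # block layout: the state features occupy the slice belonging to action a
        full = np.zeros(k)
        full[a * n_features:(a + 1) * n_features] = phi_s
        return full

    # greedy next actions under the current weights
    greedy_next_actions = np.argmax(encoded_next_states @ w.T, axis=1)

    # accumulate the LSTDQ linear system  A @ w_new = b
    A = np.zeros((k, k))
    b = np.zeros(k)
    for i in range(n_samples):
        phi_sa = state_action_features(encoded_states[i], actions[i])
        if done_flags[i]:
            next_phi = np.zeros(k)  # terminal transition: no bootstrapping
        else:
            next_phi = state_action_features(encoded_next_states[i],
                                             greedy_next_actions[i])
        A += np.outer(phi_sa, phi_sa - gamma * next_phi)
        b += rewards[i] * phi_sa
    # least-squares solve is more forgiving than np.linalg.solve if A is singular
    return np.linalg.lstsq(A, b, rcond=None)[0]
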
class Solver:
    def __init__(self, number_of_kernels_per_dim, number_of_actions, gamma, learning_rate):
        # Set max value for normalization of inputs
        self._max_normal = 1
        # get state / action information
        self.data_transformer = DataTransformer()
        # fixed state normalization statistics (position, velocity), presumably
        # precomputed from previously collected MountainCar data
        state_mean = [-3.00283763e-01,  5.61618575e-05]
        state_std = [0.51981243, 0.04024895]
        self.data_transformer.set(state_mean, state_std)
        self._actions = number_of_actions
        # create RBF features:
        self.feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
        self.number_of_features = self.feature_extractor.get_number_of_features()
        # the weights of the q learner
        self.theta = np.random.uniform(-0.001, 0, size=number_of_actions * self.number_of_features)
        # discount factor for the solver
        self.gamma = gamma
        self.learning_rate = learning_rate

    def _normalize_state(self, s):
        return self.data_transformer.transform_states(np.array([s]))[0]

    def get_features(self, state):
        normalized_state = self._normalize_state(state)
        features = self.feature_extractor.encode_states_with_radial_basis_functions([normalized_state])[0]
        return features

    def get_q_val(self, features, action):
        theta_ = self.theta[action*self.number_of_features: (1 + action)*self.number_of_features]
        return np.dot(features, theta_)

    def get_all_q_vals(self, features):
        all_vals = np.zeros(self._actions)
        for a in range(self._actions):
            all_vals[a] = self.get_q_val(features, a)
        return all_vals

    def get_max_action(self, state):
        sparse_features = self.get_features(state)
        q_vals = self.get_all_q_vals(sparse_features)
        return np.argmax(q_vals)

    def get_state_action_features(self, state, action):
        state_features = self.get_features(state)
        all_features = np.zeros(len(state_features) * self._actions)
        all_features[action * len(state_features): (1 + action) * len(state_features)] = state_features
        return all_features

    def update_theta(self, state, action, reward, next_state, done):
        # compute the new weights and set in self.theta. also return the bellman error (for tracking).
        assert False, "implement update_theta"
        return 0.0
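
# update_theta above is intentionally left as a stub. The helper below is a minimal
# sketch of the semi-gradient Q-learning update it could perform; the helper name and
# its free-standing signature are assumptions, not part of the original class.
def update_theta_sketch(solver, state, action, reward, next_state, done):
    state_features = solver.get_features(state)
    q_sa = solver.get_q_val(state_features, action)
    if done:
        target = reward  # terminal transition: no bootstrapping
    else:
        next_features = solver.get_features(next_state)
        target = reward + solver.gamma * np.max(solver.get_all_q_vals(next_features))
    bellman_error = target - q_sa
    # for a linear Q function the gradient w.r.t. theta is the state-action feature vector
    solver.theta += solver.learning_rate * bellman_error * solver.get_state_action_features(state, action)
    return bellman_error
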
Example #4

import pickle

import numpy as np

def run_lspi(seed,
             w_updates=20,
             samples_to_collect=100000,
             evaluation_number_of_games=1,
             evaluation_max_steps_per_game=200,
             thresh=0.00001,
             only_final=False):
    """
    This is the main lspi function
    :param seed: random seed for the run
    :param w_updates: how many w updates to do
    :param samples_to_collect: how many samples to collect
    :param evaluation_number_of_games: how many game evaluations to do
    :param evaluation_max_steps_per_game: how many steps to allow the evaluation game to run
    :param thresh: the threshold for the stopping condition
    :param only_final: run evaluation only at the end of the run
    :return: None
    """
    res_dir = './Results/'
    np.random.seed(seed)
    number_of_kernels_per_dim = [12, 10]
    gamma = 0.999
    env = MountainCarWithResetEnv()
    # collect data
    states, actions, rewards, next_states, done_flags = DataCollector(
        env).collect_data(samples_to_collect)
    # get data success rate
    data_success_rate = np.sum(rewards) / len(rewards)
    print('success rate: {}'.format(data_success_rate))
    # standardize data
    data_transformer = DataTransformer()
    data_transformer.set_using_states(
        np.concatenate((states, next_states), axis=0))
    states = data_transformer.transform_states(states)
    next_states = data_transformer.transform_states(next_states)
    # process with radial basis functions
    feature_extractor = RadialBasisFunctionExtractor(number_of_kernels_per_dim)
    # encode all states:
    encoded_states = feature_extractor.encode_states_with_radial_basis_functions(
        states)
    encoded_next_states = feature_extractor.encode_states_with_radial_basis_functions(
        next_states)
    # set a new linear policy
    linear_policy = LinearPolicy(feature_extractor.get_number_of_features(), 3,
                                 True)
    # but set the weights as random
    linear_policy.set_w(np.random.uniform(size=linear_policy.w.shape))
    # start an object that evaluates the success rate over time
    evaluator = GamePlayer(env, data_transformer, feature_extractor,
                           linear_policy)

    # success_rate = evaluator.play_games(evaluation_number_of_games, evaluation_max_steps_per_game)
    # print("Initial success rate: {}".format(success_rate))
    performances = []
    if not only_final:
        performances.append(
            evaluator.play_games(evaluation_number_of_games,
                                 evaluation_max_steps_per_game))
    # optionally warm-start the policy from weights saved by a previous run
    read = False
    if read:
        with open(res_dir + 'weight.pickle', 'rb') as handle:
            new_w = pickle.load(handle)
            linear_policy.set_w(np.expand_dims(new_w, 1))
    for lspi_iteration in range(w_updates):
        print('starting lspi iteration {}'.format(lspi_iteration))

        new_w = compute_lspi_iteration(encoded_states, encoded_next_states,
                                       actions, rewards, done_flags,
                                       linear_policy, gamma)
        with open(res_dir + 'weight.pickle', 'wb') as handle:
            pickle.dump(new_w, handle, protocol=pickle.HIGHEST_PROTOCOL)

        norm_diff = linear_policy.set_w(new_w)
        if not only_final:
            performances.append(
                evaluator.play_games(evaluation_number_of_games,
                                     evaluation_max_steps_per_game))
        if norm_diff < thresh:
            break
    print('done lspi')
    if not only_final:
        with open(res_dir + 'perf' + str(seed) + '.pickle', 'wb') as handle:
            pickle.dump(performances, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if only_final:
        score = evaluator.play_games(evaluation_number_of_games,
                                     evaluation_max_steps_per_game)
        with open(res_dir + 'final_perf' + str(samples_to_collect) + '.pickle',
                  'wb') as handle:
            pickle.dump(score, handle, protocol=pickle.HIGHEST_PROTOCOL)
    evaluator.play_game(evaluation_max_steps_per_game, render=True)
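
# One possible driver for run_lspi; the seed values and the directory-creation step
# are illustrative assumptions, not part of the original script.
if __name__ == '__main__':
    import os
    os.makedirs('./Results/', exist_ok=True)  # run_lspi pickles its results here
    for run_seed in (123, 234, 345):
        run_lspi(run_seed, only_final=False)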