Example #1
    def learn_polynomial_basis(self, degree=DEGREE, discount=DISCOUNT,
                               explore=EXPLORE, max_iterations=MAX_ITERATIONS, max_steps=NUM_SAMPLES, initial_policy=None):

        if initial_policy is None:
            initial_policy = lspi.Policy(lspi.basis_functions.OneDimensionalPolynomialBasis(degree, 4), discount, explore)

        learned_policy, distances = lspi.learn(self.samples, initial_policy, self.solver,
                                               max_iterations=max_iterations)

        self.domain.reset()

        steps_to_goal = 0
        absorb = False
        samples = []

        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            steps_to_goal += 1
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
            samples.append(sample)

        return steps_to_goal, learned_policy, samples, distances
Example #2
    def learn_node2vec_basis(self, dimension=NUM_BASIS, walk_length=30, num_walks=10, window_size=10,
                             p=1, q=1, epochs=1, discount=DISCOUNT, explore=EXPLORE, max_iterations=MAX_ITERATIONS,
                             max_steps=NUM_SAMPLES, initial_policy=None, edgelist='node2vec/graph/grid6.edgelist'):

        if initial_policy is None:
            initial_policy = lspi.Policy(lspi.basis_functions.Node2vecBasis(
                edgelist, num_actions=4, transition_probabilities=self.domain.transition_probabilities,
                dimension=dimension, walk_length=walk_length, num_walks=num_walks, window_size=window_size,
                p=p, q=q, epochs=epochs), discount, explore)

        learned_policy, distances = lspi.learn(self.samples, initial_policy, self.solver,
                                               max_iterations=max_iterations)

        self.domain.reset()

        steps_to_goal = 0
        absorb = False
        samples = []

        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            steps_to_goal += 1
            if absorb:
                print('Reached the goal in %d steps' % steps_to_goal)
            samples.append(sample)

        return steps_to_goal, learned_policy, samples, distances
Example #3
    def setUp(self):
        self.domain = lspi.domains.ChainDomain()

        sampling_policy = lspi.Policy(lspi.basis_functions.FakeBasis(2), .9, 1)

        self.samples = []
        for i in range(1000):
            action = sampling_policy.select_action(self.domain.current_state())
            self.samples.append(self.domain.apply_action(action))

        self.random_policy_cum_rewards = np.sum([sample.reward
                                                 for sample in self.samples])

        self.solver = lspi.solvers.LSTDQSolver()
Example #4
    def test_chain_polynomial_basis(self):

        initial_policy = lspi.Policy(
            lspi.basis_functions.OneDimensionalPolynomialBasis(3, 2),
            .9,
            0)

        learned_policy = lspi.learn(self.samples, initial_policy, self.solver)

        self.domain.reset()
        cumulative_reward = 0
        for i in range(1000):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            cumulative_reward += sample.reward

        self.assertGreater(cumulative_reward, self.random_policy_cum_rewards)
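
The chain-domain snippets fit together into a single workflow: Example #3 collects samples with a purely exploratory policy (FakeBasis, explore=1) and Example #4 fits and evaluates a polynomial-basis policy on those samples. Below is a minimal standalone sketch of that workflow, assuming the same lspi package layout used in the snippets; note that in Examples #1 and #2 lspi.learn additionally returns per-iteration distances, so the return signature may differ between versions of the library.

import numpy as np
import lspi

# Collect samples by acting randomly in the chain domain (explore=1).
domain = lspi.domains.ChainDomain()
sampling_policy = lspi.Policy(lspi.basis_functions.FakeBasis(2), .9, 1)
samples = [domain.apply_action(sampling_policy.select_action(domain.current_state()))
           for _ in range(1000)]
random_return = np.sum([sample.reward for sample in samples])

# Fit a greedy policy (explore=0) with LSPI on a cubic polynomial basis.
initial_policy = lspi.Policy(
    lspi.basis_functions.OneDimensionalPolynomialBasis(3, 2), .9, 0)
learned_policy = lspi.learn(samples, initial_policy, lspi.solvers.LSTDQSolver())

# Roll out the learned policy and compare its return against random behaviour.
domain.reset()
learned_return = 0
for _ in range(1000):
    sample = domain.apply_action(learned_policy.select_action(domain.current_state()))
    learned_return += sample.reward
print('random return: %s, learned return: %s' % (random_return, learned_return))
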
Example #5
    def __init__(self, height, width, reward_location, walls_location,
                 obstacles_location, initial_state=None, obstacles_transition_probability=.2, num_sample=NUM_SAMPLES):

        self.domain = lspi.domains.GridMazeDomain(height, width, reward_location,
                                                  walls_location, obstacles_location, initial_state, obstacles_transition_probability)

        sampling_policy = lspi.Policy(lspi.basis_functions.FakeBasis(4), DISCOUNT, 1)

        self.samples = []

        for i in range(num_sample):
            action = sampling_policy.select_action(self.domain.current_state())
            self.samples.append(self.domain.apply_action(action))

        self.random_policy_cumulative_rewards = np.sum([sample.reward for
                                                        sample in self.samples])

        self.solver = lspi.solvers.LSTDQSolver()
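
Examples #1, #2 and #5 are methods of the same grid-maze experiment class; its name and the module-level constants (DEGREE, DISCOUNT, EXPLORE, MAX_ITERATIONS, NUM_SAMPLES, NUM_BASIS) are not shown in these snippets. A rough usage sketch follows, using the hypothetical name MazeExperiment for that class and placeholder maze-layout arguments; only the constructor and method signatures are taken from the examples above.

# MazeExperiment is a hypothetical name for the class whose __init__ is Example #5;
# the layout arguments below are placeholders, not values from the original code.
experiment = MazeExperiment(height=6, width=6, reward_location=0,
                            walls_location=[], obstacles_location=[])

# LSPI with a one-dimensional polynomial basis (Example #1).
steps, policy, rollout, distances = experiment.learn_polynomial_basis(degree=3)
print('Polynomial basis: %d rollout steps' % steps)

# LSPI with a node2vec basis built over the maze graph (Example #2).
steps, policy, rollout, distances = experiment.learn_node2vec_basis(
    dimension=10, edgelist='node2vec/graph/grid6.edgelist')
print('node2vec basis: %d rollout steps' % steps)
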
Example #6
    def test_chain_rbf_basis(self):

        initial_policy = lspi.Policy(
            lspi.basis_functions.RadialBasisFunction(
                np.array([[0], [2], [4], [6], [8]]), .5, 2),
            .9,
            0)

        learned_policy = lspi.learn(self.samples, initial_policy, self.solver)

        self.domain.reset()
        cumulative_reward = 0
        for i in range(1000):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            cumulative_reward += sample.reward

        self.assertGreater(cumulative_reward, self.random_policy_cum_rewards)
Example #7
import gym
import pcg  # importing pcg presumably registers the SapsBattiti-v0 environment with gym
import numpy as np
import lspi

solver = lspi.solvers.LSTDQSolver()
env = gym.make("SapsBattiti-v0")

init_pol = lspi.Policy(lspi.basis_functions.BattitiBasis(20, 0, 1), 0.99,
                       0.1)  # Battiti State
# init_pol = lspi.Policy(lspi.basis_functions.BattitiBasis(20, 4, 5), 0.99, 0.1) # Boosted State
max_steps_per_eps = 1200
episodes = 5
rewards = []
lengths = []

for i in range(episodes):
    obs = env.reset()
    # Extracting normalizedHam and normalizedDelta
    obs = obs[4:6]  # Battiti State
    # obs = obs # Boosted State

    samples = []  # collect observations of each episode ...[*]
    done = False
    c_reward = 0
    steps = 0
    while not done:
        act = init_pol.select_action(obs)
        nobs, r, done, info = env.step(act)
        nobs = nobs[4:6]  # Battiti State
        # nobs = nobs # Boosted State
        c_reward += r