Example #1
    def test_solver_uses_policy_and_data(self):
        """Test that the solver is passed the data and policy."""

        data = [10]
        initial_policy = Policy(FakeBasis(1))

        solver_stub = SolverParamStub(data, initial_policy)

        lspi.learn(solver_stub.data,
                   solver_stub.policy,
                   solver_stub,
                   max_iterations=1)
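
SolverParamStub is not defined in this listing. A minimal sketch of what such a parameter-checking stub could look like, assuming lspi.learn forwards the sample data and the current policy to the solver's solve(data, policy) method (which is what this test exercises), is:

import numpy as np


class SolverParamStub(object):
    """Hypothetical test double: remembers the expected data and policy and
    checks that lspi.learn passes them through to the solver."""

    def __init__(self, data, policy):
        self.data = data
        self.policy = policy

    def solve(self, data, policy):
        # lspi.learn may hand the solver a copy of the policy, so only the
        # data object itself is checked for identity.
        assert data is self.data
        assert policy.weights.shape == self.policy.weights.shape
        return np.copy(policy.weights)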
Example #2
    def test_epsilon_stopping_condition(self):
        """Test if learning stops when distance is less than epsilon."""

        with self.assertRaises(ValueError):
            lspi.learn(None, None, None, epsilon=0)

        epsilon_solver = EpsilonSolverStub(10**-21)

        lspi.learn(None,
                   Policy(FakeBasis(1)),
                   epsilon_solver,
                   epsilon=10**-20,
                   max_iterations=1000)

        self.assertEqual(epsilon_solver.num_calls, 1)
Example #3
    def test_max_iterations_stopping_condition(self):
        """Test if learning stops when max_iterations is reached."""

        with self.assertRaises(ValueError):
            lspi.learn(None, None, None, max_iterations=0)

        max_iterations_solver = MaxIterationsSolverStub()

        lspi.learn(None,
                   Policy(FakeBasis(1)),
                   max_iterations_solver,
                   epsilon=10**-200,
                   max_iterations=10)

        self.assertEqual(max_iterations_solver.num_calls, 10)
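
Neither stopping-condition stub is shown. The sketches below are hypothetical reconstructions, assuming lspi.learn treats the solver's return value as the new weight vector and measures convergence as the size of the weight change between iterations:

import numpy as np


class EpsilonSolverStub(object):
    """Hypothetical stub: moves the weights by an amount smaller than the
    learn epsilon, so the distance check stops learning after one call."""

    def __init__(self, shift):
        self.shift = shift
        self.num_calls = 0

    def solve(self, data, policy):
        self.num_calls += 1
        return policy.weights + self.shift


class MaxIterationsSolverStub(object):
    """Hypothetical stub: keeps moving the weights by a large amount, so only
    max_iterations can stop the loop."""

    def __init__(self):
        self.num_calls = 0

    def solve(self, data, policy):
        self.num_calls += 1
        return policy.weights + 1.0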
Example #4
    def learn_polynomial_basis(self, degree=DEGREE, discount=DISCOUNT,
                               explore=EXPLORE, max_iterations=MAX_ITERATIONS,
                               max_steps=NUM_SAMPLES, initial_policy=None):

        if initial_policy is None:
            initial_policy = lspi.Policy(lspi.basis_functions.OneDimensionalPolynomialBasis(degree, 4), discount, explore)

        learned_policy, distances = lspi.learn(self.samples, initial_policy, self.solver,
                                               max_iterations=max_iterations)

        self.domain.reset()

        steps_to_goal = 0
        absorb = False
        samples = []

        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            if absorb:
                print('Reached the goal in %d' % steps_to_goal)
            steps_to_goal += 1
            samples.append(sample)

        return steps_to_goal, learned_policy, samples, distances
Example #5
    def learn_node2vec_basis(self, dimension=NUM_BASIS, walk_length=30, num_walks=10, window_size=10,
                             p=1, q=1, epochs=1, discount=DISCOUNT, explore=EXPLORE, max_iterations=MAX_ITERATIONS,
                             max_steps=NUM_SAMPLES, initial_policy=None, edgelist='node2vec/graph/grid6.edgelist'):

        if initial_policy is None:
            initial_policy = lspi.Policy(lspi.basis_functions.Node2vecBasis(
                edgelist, num_actions=4, transition_probabilities=self.domain.transition_probabilities,
                dimension=dimension, walk_length=walk_length, num_walks=num_walks, window_size=window_size,
                p=p, q=q, epochs=epochs), discount, explore)

        learned_policy, distances = lspi.learn(self.samples, initial_policy, self.solver,
                                               max_iterations=max_iterations)

        self.domain.reset()

        steps_to_goal = 0
        absorb = False
        samples = []

        while (not absorb) and (steps_to_goal < max_steps):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            absorb = sample.absorb
            if absorb:
                print('Reached the goal in %d' % steps_to_goal)
            steps_to_goal += 1
            samples.append(sample)

        return steps_to_goal, learned_policy, samples, distances
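
Both helper methods return (steps_to_goal, learned_policy, samples, distances), so within the same experiment class the two basis functions can be compared directly. The call below is purely illustrative and the argument values are made up:

        poly_steps, poly_policy, _, _ = self.learn_polynomial_basis(degree=3)
        n2v_steps, n2v_policy, _, _ = self.learn_node2vec_basis(dimension=16)
        print('polynomial basis: %d steps to goal' % poly_steps)
        print('node2vec basis: %d steps to goal' % n2v_steps)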
Example #6
File: main.py Project: SHi-ON/Erwin
def chain_walk(n_samples):
    domain = ChainWalkDomain(
        num_states=4, reward_location=ChainWalkDomain.RewardLocation.Middle)

    samples = []
    init_action = np.random.randint(domain.num_actions)
    init_sample = domain.apply_action(init_action)
    samples.append(init_sample)

    for i in range(1, n_samples):
        a = samples[-1].action
        samples.append(domain.apply_action(a))

    # basis = FakeBasis(2)
    poly_basis = OneDimensionalPolynomialBasis(3, 2)
    # policy = Policy(basis)
    policy = Policy(poly_basis)
    print('initial policy weights:', policy.weights)

    solver = LSTDQSolver()

    learned_policy = learn(samples, policy, solver)
    print('final policy weights:', learned_policy.weights)

    return learned_policy
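
chain_walk returns the learned policy without evaluating it. A small rollout helper in the style of the test cases further down could measure the policy's return; this is a sketch that assumes the domain keeps the reset/current_state/apply_action interface used in those tests:

def evaluate_policy(domain, policy, n_steps=1000):
    """Sketch: accumulate the reward collected while following the policy."""
    domain.reset()
    total_reward = 0
    for _ in range(n_steps):
        action = policy.select_action(domain.current_state())
        total_reward += domain.apply_action(action).reward
    return total_reward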
Example #7
    def test_chain_polynomial_basis(self):

        initial_policy = lspi.Policy(
            lspi.basis_functions.OneDimensionalPolynomialBasis(3, 2),
            .9,
            0)

        learned_policy = lspi.learn(self.samples, initial_policy, self.solver)

        self.domain.reset()
        cumulative_reward = 0
        for i in range(1000):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            cumulative_reward += sample.reward

        self.assertGreater(cumulative_reward, self.random_policy_cum_rewards)
Example #8
    def test_returns_policy_with_new_weights(self):
        """Test if the weights in the new policy differ and are not the same underlying numpy vector."""

        initial_policy = Policy(FakeBasis(1))

        weight_solver = WeightSolverStub(initial_policy.weights)

        new_policy = lspi.learn(None,
                                initial_policy,
                                weight_solver,
                                max_iterations=1)

        self.assertEqual(weight_solver.num_calls, 1)
        self.assertFalse(
            np.may_share_memory(initial_policy.weights, new_policy.weights))
        self.assertNotEqual(id(initial_policy), id(new_policy))
        np.testing.assert_array_almost_equal(new_policy.weights,
                                             weight_solver.weights)
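
As with the other test doubles, WeightSolverStub is not shown. A plausible sketch, consistent with this test's assumption that lspi.learn copies the returned weights into a fresh Policy instead of reusing the array:

import numpy as np


class WeightSolverStub(object):
    """Hypothetical stub: always returns the same weight vector and counts
    how often it is asked to solve."""

    def __init__(self, weights):
        self.weights = np.copy(weights)
        self.num_calls = 0

    def solve(self, data, policy):
        self.num_calls += 1
        return self.weights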
Example #9
    def test_chain_rbf_basis(self):

        initial_policy = lspi.Policy(
            lspi.basis_functions.RadialBasisFunction(
                np.array([[0], [2], [4], [6], [8]]), .5, 2),
            .9,
            0)

        learned_policy = lspi.learn(self.samples, initial_policy, self.solver)

        self.domain.reset()
        cumulative_reward = 0
        for i in range(1000):
            action = learned_policy.select_action(self.domain.current_state())
            sample = self.domain.apply_action(action)
            cumulative_reward += sample.reward

        self.assertGreater(cumulative_reward, self.random_policy_cum_rewards)
Example #10
File: main.py Project: SHi-ON/Erwin
def mdps(domain, n_samples):
    samples = []
    init_action = np.random.randint(domain.num_actions)
    init_sample = domain.apply_action(init_action)
    samples.append(init_sample)

    for i in range(1, n_samples):
        a = samples[-1].action
        samples.append(domain.apply_action(a))

    # basis = FakeBasis(2)
    # basis = OneDimensionalPolynomialBasis(3, domain.num_actions)
    basis = RadialBasisFunction(np.array([np.array([i]) for i in range(4)]),
                                0.8, domain.num_actions)
    policy = Policy(basis)
    print('initial policy weights:', policy.weights)

    solver = LSTDQSolver()

    learned_policy = learn(samples, policy, solver)
    print('final policy weights:', learned_policy.weights)

    return learned_policy
Example #11
  #   Sample(np.array([0]), 0, 1, np.array([0])),
  #   Sample(np.array([1]), 0, -1, np.array([1]), True)
  # ]

  precondition_value = .3
  initial_policy = Policy(OneDimensionalPolynomialBasis(3,2), .9, 0, tie_breaking_strategy=Policy.TieBreakingStrategy.FirstWins)
  # initial_policy = Policy(lspi.basis_functions.RadialBasisFunction(np.array([[0], [2], [4], [6], [8]]), .5, 2), .9, 0)
  sampling_policy = Policy(FakeBasis(2), .9, 1)
  solver = LSTDQSolver(precondition_value)
  # weights = solver.solve(data[:-1], initial_policy)
  domain = ChainDomain()
  samples = []

  for i in range(1000):
    action = sampling_policy.select_action(domain.current_state())
    samples.append(domain.apply_action(action))

  learned_policy = lspi.learn(samples, initial_policy, solver)

  domain.reset()

  cumulative_reward = 0

  for i in range(1000):
    action = learned_policy.best_action(domain.current_state())
    sample = domain.apply_action(action)
    print(action)
    cumulative_reward += sample.reward

  print(cumulative_reward)
Example #12
    return sample_data



if __name__ == "__main__":


    quad_domain = QuadcopterDomain()
    num_actions = quad_domain.num_actions()
    #print(num_actions)
    mean_bf = [np.random.uniform(0, 1, size=(6,))]
    #print(mean_bf, '************')
    basis_func = RadialBasisFunction(mean_bf, 0.5, num_actions)
    quad_policy = QuadcopterPolicy(basis_func)
    
    sample_data = collect_samples(quad_domain, quad_policy)
    print(sample_data[0])
    solver = LSTDQSolver()
    start = time.perf_counter()
    new_policy = lspi.learn(sample_data, quad_policy, solver)
    print('Done!', (time.perf_counter() - start))


    with open('weights.pickle', 'wb') as weights_file:
        pickle.dump(new_policy.weights, weights_file)
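
    # Sketch (not part of the original script): reload the pickled weights into
    # a fresh policy. This assumes the policy's weights attribute can be
    # assigned directly, as the weight-copy test above suggests.
    with open('weights.pickle', 'rb') as weights_file:
        restored_weights = pickle.load(weights_file)
    restored_policy = QuadcopterPolicy(basis_func)
    restored_policy.weights = restored_weights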
    



Example #13
episodes = 5
rewards = []
lengths = []

for i in range(episodes):
    obs = env.reset()
    # Extracting normalizedHam and normalizedDelta
    obs = obs[4:6]  # Battiti State
    # obs = obs # Boosted State

    samples = []  # collect observations of each episode ...[*]
    done = False
    c_reward = 0
    steps = 0
    while not done:
        act = init_pol.select_action(obs)
        nobs, r, done, info = env.step(act)
        nobs = nobs[4:6]  # Battiti State
        # nobs = nobs # Boosted State
        c_reward += r
        samples.append(lspi.Sample(obs, act, r, nobs, done))
        obs = nobs
        steps += 1
        if steps >= max_steps_per_eps:
            break
    rewards.append(c_reward)
    lengths.append(steps)
    print('{:>6d}: {:>7.1f}'.format(i, c_reward))
    # [*]... to immediately learn from the trajectory of the episode
    init_pol = lspi.learn(samples, init_pol, solver)
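
# Sketch (not part of the original script): greedy evaluation of the final
# policy. Assumes the policy exposes its exploration rate as an `explore`
# attribute and that `env` keeps the reset/step interface used above.
init_pol.explore = 0.0
obs = env.reset()[4:6]  # Battiti State
done, eval_reward, steps = False, 0.0, 0
while not done and steps < max_steps_per_eps:
    nobs, r, done, info = env.step(init_pol.select_action(obs))
    obs = nobs[4:6]
    eval_reward += r
    steps += 1
print('greedy evaluation return: {:.1f}'.format(eval_reward))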
env.close()