Example #1
    def learn_proto_values_basis(self,
                                 num_basis=NUM_BASIS,
                                 walk_length=30,
                                 num_walks=10,
                                 discount=DISCOUNT,
                                 explore=EXPLORE,
                                 max_iterations=MAX_ITERATIONS,
                                 max_steps=NUM_SAMPLES,
                                 initial_policy=None,
                                 rpi_epochs=1,
                                 run_simulation=False):

        if initial_policy is None:
            initial_policy = policy.Policy(
                basis_functions.ProtoValueBasis(
                    self.domain.learn_graph(walk_length, num_walks,
                                            self.sampling_policy), 4,
                    num_basis), discount, explore)

        learned_policy, distances = lspi.learn(self.samples,
                                               initial_policy,
                                               self.solver,
                                               max_iterations=max_iterations)

        self.domain.reset()

        steps_to_goal = 0
        absorb = False
        samples = []

        if run_simulation:
            while (not absorb) and (steps_to_goal < max_steps):
                action = learned_policy.select_action(
                    self.domain.current_state())
                sample = self.domain.apply_action(action)
                absorb = sample.absorb
                if absorb:
                    print('Reached the goal in %d steps' % steps_to_goal)
                steps_to_goal += 1
                samples.append(sample)

        return steps_to_goal, learned_policy, samples, distances
    def compute_samples(self, reset_samples=True, reset_policy=True, biased_walk=False):
        if reset_policy:
            # Fresh sampling policy: trivial basis over the 4 actions, fully exploratory.
            self.sampling_policy = policy.Policy(basis_functions.FakeBasis(4), self.discount, 1)
        if reset_samples:
            self.samples = []
            self.lspi_samples = []
            self.walks = []
            self.actions = []
        for i in range(self.num_samples):
            if biased_walk:
                # generate_unique_samples does not return the walk actions.
                sample, walk, terminated, lspi_sample = self.domain.generate_unique_samples(
                    self.length_sample, self.sampling_policy)
                walk_actions = []
            else:
                sample, walk, walk_actions, terminated, lspi_sample = self.domain.generate_samples(
                    self.length_sample, self.sampling_policy)
            self.samples.extend(sample)
            self.walks.append(walk)
            self.actions.append(walk_actions)
            # if terminated: # and len(self.lspi_samples) <= NUM_SAMPLES:
            self.lspi_samples.extend(lspi_sample)
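
A minimal driver sketch (an assumption, not part of the original example): `experiment` stands for an already-constructed instance of the class these methods belong to, with its `domain`, `solver`, and sampling attributes set up, and the constants are the module-level defaults referenced above.

# Hypothetical usage; `experiment` and the constant values are assumptions.
experiment.compute_samples(reset_samples=True, reset_policy=True)
steps, learned_policy, rollout, distances = experiment.learn_proto_values_basis(
    num_basis=NUM_BASIS, walk_length=30, num_walks=10, run_simulation=True)
print('Rollout ended after %d steps' % steps)
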
Example #3
    def learn_polynomial_basis(self,
                               degree=DEGREE,
                               discount=DISCOUNT,
                               explore=EXPLORE,
                               max_iterations=MAX_ITERATIONS,
                               max_steps=NUM_SAMPLES,
                               initial_policy=None,
                               run_simulation=False):

        if initial_policy is None:
            initial_policy = policy.Policy(
                basis_functions.OneDimensionalPolynomialBasis(degree, 4),
                discount, explore)

        learned_policy, distances = lspi.learn(self.samples,
                                               initial_policy,
                                               self.solver,
                                               max_iterations=max_iterations)

        self.domain.reset()

        steps_to_goal = 0
        absorb = False
        samples = []

        if run_simulation:
            while (not absorb) and (steps_to_goal < max_steps):
                action = learned_policy.select_action(
                    self.domain.current_state())
                sample = self.domain.apply_action(action)
                absorb = sample.absorb
                if absorb:
                    print('Reached the goal in %d steps' % steps_to_goal)
                steps_to_goal += 1
                samples.append(sample)

        return steps_to_goal, learned_policy, samples, distances
    def learn_discounted_node2vec_basis(self, dimension=NUM_BASIS, walk_length=30, num_walks=10, window_size=10,
                                        gamma=0.6, p=1, q=1, epochs=1, learning_rate=0.5, explore=EXPLORE,
                                        max_iterations=MAX_ITERATIONS, max_steps=NUM_SAMPLES, initial_policy=None,
                                        edgelist='node2vec/graph/NA.edgelist', run_simulation=False, lspi_epochs=1):

        if initial_policy is None:
            initial_policy = policy.Policy(basis_functions.DiscountedNode2vecBasis(
                edgelist, num_actions=4, transition_probabilities=self.domain.transition_probabilities, discount=self.discount,
                dimension=dimension, walks=self.walks, walk_length=walk_length, num_walks=num_walks, window_size=window_size,
                p=p, q=q, epochs=epochs, learning_rate=learning_rate), gamma, explore)

        self.sampling_policy = initial_policy
        # Alternate LSPI with re-sampling: each epoch fits a policy on the current
        # lspi_samples, decays its exploration rate, and collects fresh samples with it.
        for i in range(lspi_epochs):
            learned_policy, distances = lspi.learn(self.lspi_samples, self.sampling_policy, self.solver,
                                                   max_iterations=max_iterations)
            self.sampling_policy = learned_policy
            self.sampling_policy.explore *= EPSILON_DECAY
            self.compute_samples(True)
        # self.sampling_policy.explore = 1.
        self.domain.reset()

        steps_to_goal = 0
        absorb = False
        samples = []

        if run_simulation:
            while (not absorb) and (steps_to_goal < max_steps):
                action = learned_policy.select_action(self.domain.current_state())
                sample = self.domain.apply_action(action)
                absorb = sample.absorb
                if absorb:
                    print('Reached the goal in %d steps' % steps_to_goal)
                steps_to_goal += 1
                samples.append(sample)

        return steps_to_goal, learned_policy, samples, distances
    def run_lspi(self, basis, discount, max_iter, explore):
        basis_policy = policy.Policy(basis, discount, explore)
        learned_policy, distances, iterations = lspi.learn(self.lspi_samples, basis_policy, self.solver,
                                                           max_iterations=max_iter)

        return learned_policy, distances, iterations
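
A similarly hedged sketch for the node2vec variant: learn_discounted_node2vec_basis alternates lspi.learn with fresh sample collection for lspi_epochs rounds, decaying exploration each time, so the caller only provides the embedding and walk parameters. The `experiment` object is again an assumption.

# Hypothetical usage of the discounted node2vec basis; `experiment` is assumed.
experiment.compute_samples()
steps, learned_policy, rollout, distances = experiment.learn_discounted_node2vec_basis(
    dimension=NUM_BASIS, walk_length=30, num_walks=10, window_size=10,
    lspi_epochs=5, run_simulation=True)
print('Rollout ended after %d steps' % steps)
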