Example #1
    def policyEvaluation(self, policy):
        '''
        Evaluate the given policy by applying the Bellman backup over all
        states until the change in the weight vector falls below a given
        threshold.

        Returns: convergence status as a boolean
        '''
        converged = False
        policy_evaluation_iteration = 0
        while (not converged and self.hasTime()
               and policy_evaluation_iteration < self.max_PE_iterations):
            policy_evaluation_iteration += 1

            # Sweep The State Space
            for i in range(self.representation.agg_states_num):

                # Check for solver time
                if not self.hasTime(): break

                # Map a state ID to a state
                s = self.representation.stateID2state(i)

                # Skip terminal states and states with no possible action
                possible_actions = self.domain.possibleActions(s=s)
                if (self.domain.isTerminal(s) or len(possible_actions) == 0):
                    continue

                # Apply Bellman Backup
                self.BellmanBackup(s, policy.pi(s, False, possible_actions),
                                   self.ns_samples, policy)

                # Update number of backups
                self.bellmanUpdates += 1

                # Check for the performance
                if self.bellmanUpdates % self.log_interval == 0:
                    performance_return = self.performanceRun()[0]
                    self.logger.info('[%s]: BellmanUpdates=%d, Return=%0.4f' %
                                     (hhmmss(deltaT(self.start_time)),
                                      self.bellmanUpdates, performance_return))

            # Check for convergence: L_infinity norm of the difference between
            # the policy's weight vector and the representation's weight vector
            weight_vec_change = l_norm(
                policy.representation.weight_vec -
                self.representation.weight_vec, np.inf)
            converged = weight_vec_change < self.convergence_threshold

            # Log Status
            self.logger.info(
                'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
                (policy_evaluation_iteration, hhmmss(deltaT(
                    self.start_time)), self.bellmanUpdates, weight_vec_change))

            # Show Plots
            if self.show:
                self.domain.showDomain(s=s, filename="policy")
                self.domain.showLearning(self.representation,
                                         filename="policy")

        return converged
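
The method above is a feature-based, sampled form of iterative policy evaluation. As a point of reference, the sketch below applies the same idea to an explicit tabular MDP; the arrays P and R and the function name are illustrative assumptions, not part of RLPy.

import numpy as np

def evaluate_policy(P, R, pi, discount=0.9, threshold=1e-6, max_iterations=1000):
    """Iterative policy evaluation on a tabular MDP (illustrative sketch).

    P:  (A, S, S) transition probabilities, P[a, s, s'] = Pr(s' | s, a)
    R:  (A, S)    expected immediate rewards
    pi: (S,)      deterministic policy, pi[s] = chosen action
    """
    num_states = P.shape[1]
    V = np.zeros(num_states)
    for _ in range(max_iterations):
        # One sweep of Bellman backups under the fixed policy
        V_new = np.array([R[pi[s], s] + discount * P[pi[s], s] @ V
                          for s in range(num_states)])
        # L-infinity norm of the change, mirroring the convergence test above
        if np.max(np.abs(V_new - V)) < threshold:
            return V_new, True
        V = V_new
    return V, False
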
Example #2
    def solve(self):
        """Solve the domain MDP."""

        self.start_time = clock()  # Used to track the total time taken by the solver
        bellmanUpdates = 0  # Counts Bellman backups; used for periodic performance logging
        converged = False
        iteration = 0

        # Check for Tabular Representation
        if not self.IsTabularRepresentation():
            self.logger.error("Value Iteration works only with a tabular representation.")
            return 0

        no_of_states = self.representation.agg_states_num

        while self.hasTime() and not converged:

            iteration += 1

            # Store the weight vector for comparison
            prev_weight_vec = self.representation.weight_vec.copy()

            # Sweep The State Space
            for i in range(no_of_states):

                s = self.representation.stateID2state(i)

                # Sweep through possible actions
                for a in self.domain.possibleActions(s):

                    # Check for available planning time
                    if not self.hasTime(): break

                    self.BellmanBackup(s, a, ns_samples=self.ns_samples)
                    bellmanUpdates += 1

                    # Create Log
                    if bellmanUpdates % self.log_interval == 0:
                        performance_return, _, _, _ = self.performanceRun()
                        self.logger.info(
                            '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                            (hhmmss(deltaT(self.start_time)), bellmanUpdates, performance_return))

            # check for convergence
            weight_vec_change = l_norm(prev_weight_vec - self.representation.weight_vec, np.inf)
            converged = weight_vec_change < self.convergence_threshold

            # log the stats
            performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.4f, Steps=%d' % (iteration,
                 hhmmss(deltaT(self.start_time)),
                 bellmanUpdates,
                 weight_vec_change,
                 performance_return,
                 performance_steps))

            # Show the domain and value function
            if self.show:
                self.domain.show(a, s=s, representation=self.representation)

            # store stats
            self.result["bellman_updates"].append(bellmanUpdates)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(performance_discounted_return)
            self.result["iteration"].append(iteration)

        if converged: self.logger.info('Converged!')
        super(ValueIteration, self).solve()
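
For reference, value iteration on an explicit tabular MDP reduces to repeated Bellman optimality backups with the same L-infinity convergence test. The standalone sketch below assumes the MDP is given as arrays P and R (illustrative names, not RLPy's API).

import numpy as np

def value_iteration(P, R, discount=0.9, threshold=1e-6, max_iterations=1000):
    """Tabular value iteration (illustrative sketch).

    P: (A, S, S) transition probabilities, R: (A, S) expected rewards.
    Returns the value function and a greedy policy derived from it.
    """
    V = np.zeros(P.shape[1])
    for _ in range(max_iterations):
        # Bellman optimality backup for all states at once:
        # Q[a, s] = R[a, s] + discount * sum_s' P[a, s, s'] * V[s']
        Q = R + discount * P @ V
        V_new = Q.max(axis=0)
        # L-infinity norm of the change, as in the convergence check above
        if np.max(np.abs(V_new - V)) < threshold:
            V = V_new
            break
        V = V_new
    return V, Q.argmax(axis=0)
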
Example #3
    def policyEvaluation(self, policy):
        '''
        Evaluate the given policy by applying the Bellman backup over all
        states until the change in the weight vector falls below a given
        threshold.

        Returns: convergence status as a boolean
        '''
        converged = False
        policy_evaluation_iteration = 0
        while (not converged and
                self.hasTime() and
                policy_evaluation_iteration < self.max_PE_iterations
                ):
            policy_evaluation_iteration += 1

            # Sweep The State Space
            for i in range(0, self.representation.agg_states_num):

                # Check for solver time
                if not self.hasTime(): break

                # Map a state ID to a state
                s = self.representation.stateID2state(i)

                # Skip terminal states and states with no possible action
                possible_actions = self.domain.possibleActions(s=s)
                if (self.domain.isTerminal(s) or
                    len(possible_actions) == 0):
                    continue

                # Apply Bellman Backup
                self.BellmanBackup(
                    s,
                    policy.pi(s, False, possible_actions),
                    self.ns_samples,
                    policy)

                # Update number of backups
                self.bellmanUpdates += 1

                # Check for the performance
                if self.bellmanUpdates % self.log_interval == 0:
                    performance_return = self.performanceRun()[0]
                    self.logger.info(
                        '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                        (hhmmss(deltaT(self.start_time)), self.bellmanUpdates, performance_return))

            # Check for convergence: L_infinity norm of the difference between
            # the policy's weight vector and the representation's weight vector
            weight_vec_change = l_norm(policy.representation.weight_vec - self.representation.weight_vec, np.inf)
            converged = weight_vec_change < self.convergence_threshold

            # Log Status
            self.logger.info(
                'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
                (policy_evaluation_iteration, hhmmss(deltaT(self.start_time)), self.bellmanUpdates, weight_vec_change))

            # Show Plots
            if self.show:
                self.domain.show(policy.pi(s, False, possible_actions),
                                 self.representation, s=s)
        return converged
Example #4
    def solveInMatrixFormat(self):
        # while delta_weight_vec > threshold
        #  1. Gather data following an e-greedy policy
        #  2. Calculate A and b estimates
        #  3. calculate new_weight_vec, and delta_weight_vec
        # return policy greedy w.r.t last weight_vec
        self.policy = eGreedy(self.representation, epsilon=self.epsilon)

        # Number of samples to be used for each policy evaluation phase. L1 in
        # the Geramifard et al. FTML 2012 paper
        self.samples_num = 1000

        self.start_time = clock()  # Used to track the total time for solving
        samples = 0
        converged = False
        iteration = 0
        while self.hasTime() and not converged:

            #  1. Gather samples following an e-greedy policy
            S, Actions, NS, R, T = self.collectSamples(self.samples_num)
            samples += self.samples_num

            #  2. Calculate A and b estimates
            a_num = self.domain.actions_num
            n = self.representation.features_num
            discount_factor = self.domain.discount_factor

            self.A = np.zeros((n * a_num, n * a_num))
            self.b = np.zeros((n * a_num, 1))
            for i in range(self.samples_num):
                phi_s_a = self.representation.phi_sa(
                    S[i], T[i], Actions[i, 0]).reshape((-1, 1))
                E_phi_ns_na = self.calculate_expected_phi_ns_na(
                    S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
                d = phi_s_a - discount_factor * E_phi_ns_na
                self.A += np.outer(phi_s_a, d.T)
                self.b += phi_s_a * R[i, 0]

            #  3. calculate new_weight_vec, and delta_weight_vec
            new_weight_vec, solve_time = solveLinear(regularize(self.A),
                                                     self.b)
            iteration += 1
            if solve_time > 1:
                self.logger.info(
                    '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                    (iteration, solve_time))
            delta_weight_vec = l_norm(
                new_weight_vec - self.representation.weight_vec, np.inf)
            converged = delta_weight_vec < self.convergence_threshold
            self.representation.weight_vec = new_weight_vec
            performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
            self.logger.info(
                '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f'
                % (iteration, hhmmss(deltaT(self.start_time)), samples,
                   delta_weight_vec, performance_return))
            if self.show:
                self.domain.show(S[-1], Actions[-1], self.representation)

            # store stats
            self.result["samples"].append(samples)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(
                self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(
                performance_discounted_return)
            self.result["iteration"].append(iteration)

        if converged:
            self.logger.info('Converged!')

        super(TrajectoryBasedPolicyIteration, self).solve()
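
Step 2 of the loop above accumulates least-squares (LSTD-style) estimates A ~ sum_i phi(s_i, a_i) (phi(s_i, a_i) - gamma * E[phi(s'_i, a'_i)])^T and b ~ sum_i phi(s_i, a_i) r_i, then solves the regularized linear system for the new weight vector. Below is a compact standalone sketch of that computation using vectorized feature matrices; all names are illustrative rather than RLPy's API.

import numpy as np

def lstd_policy_evaluation(phi_sa, phi_ns_na, rewards, discount=0.9, reg=1e-6):
    """Least-squares policy evaluation step (illustrative sketch).

    phi_sa:    (N, k) features of the sampled state-action pairs
    phi_ns_na: (N, k) features of the next state and the policy's next action
    rewards:   (N,)   sampled immediate rewards
    Returns the weight vector solving (A + reg*I) w = b, where
    A = sum_i phi_i (phi_i - discount * phi'_i)^T and b = sum_i phi_i r_i.
    """
    A = phi_sa.T @ (phi_sa - discount * phi_ns_na)
    b = phi_sa.T @ rewards
    # The small ridge term plays the role of regularize(self.A) above
    return np.linalg.solve(A + reg * np.eye(phi_sa.shape[1]), b)
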
Example #5
    def solveInMatrixFormat(self):
        # while delta_weight_vec > threshold
        #  1. Gather data following an e-greedy policy
        #  2. Calculate A and b estimates
        #  3. calculate new_weight_vec, and delta_weight_vec
        # return policy greedy w.r.t last weight_vec
        self.policy = eGreedy(
            self.representation,
            epsilon=self.epsilon)

        # Number of samples to be used for each policy evaluation phase. L1 in
        # the Geramifard et al. FTML 2012 paper
        self.samples_num = 1000

        self.start_time = clock()  # Used to track the total time for solving
        samples = 0
        converged = False
        iteration = 0
        while self.hasTime() and not converged:

            #  1. Gather samples following an e-greedy policy
            S, Actions, NS, R, T = self.collectSamples(self.samples_num)
            samples += self.samples_num

            #  2. Calculate A and b estimates
            a_num = self.domain.actions_num
            n = self.representation.features_num
            discount_factor = self.domain.discount_factor

            self.A = np.zeros((n * a_num, n * a_num))
            self.b = np.zeros((n * a_num, 1))
            for i in range(self.samples_num):
                phi_s_a = self.representation.phi_sa(
                    S[i], T[i], Actions[i, 0]).reshape((-1, 1))
                E_phi_ns_na = self.calculate_expected_phi_ns_na(
                    S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
                d = phi_s_a - discount_factor * E_phi_ns_na
                self.A += np.outer(phi_s_a, d.T)
                self.b += phi_s_a * R[i, 0]

            #  3. calculate new_weight_vec, and delta_weight_vec
            new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
            iteration += 1
            if solve_time > 1:
                self.logger.info(
                    '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                    (iteration, solve_time))
            delta_weight_vec = l_norm(new_weight_vec - self.representation.weight_vec, np.inf)
            converged = delta_weight_vec < self.convergence_threshold
            self.representation.weight_vec = new_weight_vec
            performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
            self.logger.info(
                '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f' %
                (iteration, hhmmss(deltaT(self.start_time)), samples, delta_weight_vec, performance_return))
            if self.show:
                self.domain.show(S[-1], Actions[-1], self.representation)

            # store stats
            self.result["samples"].append(samples)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(performance_discounted_return)
            self.result["iteration"].append(iteration)

        if converged:
            self.logger.info('Converged!')

        super(TrajectoryBasedPolicyIteration, self).solve()