Example #1
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
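        # Monte-Carlo prediction/control with a Q-value function approximator:
        # polf=None means control (the soft policy is re-derived from the current
        # Q-value approximation after every episode), otherwise polf is evaluated.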
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()
        episodes = 0

        while episodes < self.num_episodes:
            start_state, start_action = self.mdp_rep.init_state_action_gen()
            mc_path = self.get_mc_path(this_polf, start_state, start_action)
            rew_arr = np.array([x for _, _, x, _ in mc_path])
            if self.mdp_rep.terminal_state_func(mc_path[-1][0]):
                returns = get_returns_from_rewards_terminating(
                    rew_arr, self.mdp_rep.gamma)
            else:
                returns = get_returns_from_rewards_non_terminating(
                    rew_arr, self.mdp_rep.gamma, self.nt_return_eval_steps)

            sgd_pts = [((mc_path[i][0], mc_path[i][1]), r)
                       for i, r in enumerate(returns)
                       if not self.first_visit or mc_path[i][3]]
            # MC is an offline update, so the policy improves after each episode
            self.qvf_fa.update_params(*zip(*sgd_pts))

            if control:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval, self.state_action_func,
                    self.softmax, self.epsilon_func(episodes))
            episodes += 1

        return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval(
            (st, act))
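
The returned QFType is curried: it maps a state to a function over that state's actions. A minimal usage sketch, assuming an already-constructed Monte-Carlo learner named mc_learner and a (state, action) pair from its MDP representation (all three names are hypothetical):

    qf = mc_learner.get_qv_func_fa(None)  # polf=None runs MC control
    q_value = qf(state)(action)           # evaluate the learned Q(s, a)
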
Example #2
    def get_optimal_policy_func_pi(self) -> Callable[[S], A]:
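        # Approximate policy iteration: alternate policy evaluation (fitting the
        # value and action-value function approximations under this_polf) with
        # soft policy improvement, until the largest change in the approximator's
        # parameters falls below self.tol.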
        this_polf = self.get_init_policy_func()
        eps = self.tol * 1e4
        iters = 0
        params = deepcopy(self.fa.params)
        while eps >= self.tol:
            self.get_value_func_fa(this_polf, True)
            qvf = self.get_act_value_func_fa(this_polf, False)

            def q_func(sa: Tuple[S, A], qvf=qvf) -> float:
                return qvf(sa[0])(sa[1])

            this_polf = get_soft_policy_func_from_qf(
                qf=q_func,
                state_action_func=self.state_action_func,
                softmax=self.softmax,
                epsilon=self.epsilon_func(iters)
            )
            new_params = deepcopy(self.fa.params)
            eps = ADP.get_gradient_max(
                [new_params[i] - p for i, p in enumerate(params)]
            )
            params = new_params
            iters += 1

        # noinspection PyShadowingNames
        def det_pol(s: S, this_polf=this_polf) -> A:
            return max(this_polf(s).items(), key=itemgetter(1))[0]

        return det_pol
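
The returned function is a deterministic policy: for each state it picks the action with the highest probability under the final soft policy. A minimal usage sketch, assuming an already-constructed ADP solver named adp_solver and a state s (both names are hypothetical):

    optimal_policy = adp_solver.get_optimal_policy_func_pi()
    best_action = optimal_policy(s)
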
Example #3
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
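        # Least-squares TD for Q-values (LSTD-Q): accumulate the A matrix and
        # b vector from sampled transitions and solve the linear weights in batch
        # as w = A^{-1} b. With polf=None this becomes LSPI-style control, with
        # the soft policy refreshed every self.batch_size episodes.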
        ffs = self.qvf_fa.feature_funcs
        features = len(ffs)
        a_mat = np.zeros((features, features))
        b_vec = np.zeros(features)
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()

        for episode in range(self.num_episodes):
            if self.exploring_start:
                state, action = self.mdp_rep.init_state_action_gen()
            else:
                state = self.mdp_rep.init_state_gen()
                action = get_rv_gen_func_single(this_polf(state))()

            # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in
            #        self.mdp_rep.state_action_func(state))))
            # print(self.qvf_fa.params)

            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                phi_s = np.array([f((state, action)) for f in ffs])
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                if control:
                    next_act = max(
                        [(a, self.qvf_fa.get_func_eval((next_state, a)))
                         for a in self.state_action_func(next_state)],
                        key=itemgetter(1))[0]
                else:
                    next_act = next_action
                phi_sp = np.array([f((next_state, next_act)) for f in ffs])
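                # LSTD accumulation: A += phi(s,a) (phi(s,a) - gamma * phi(s',a'))^T
                # and b += r * phi(s,a); the batch solve below uses w = A^{-1} b.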
                a_mat += np.outer(phi_s, phi_s - self.mdp_rep.gamma * phi_sp)
                b_vec += reward * phi_s

                steps += 1
                terminate = steps >= self.max_steps or \
                    self.mdp_rep.terminal_state_func(state)
                state = next_state
                action = next_action

            if control and (episode + 1) % self.batch_size == 0:
                self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)]
                # print(self.qvf_fa.params)
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval, self.state_action_func,
                    self.softmax, self.epsilon_func(episode))
                a_mat = np.zeros((features, features))
                b_vec = np.zeros(features)

        if not control:
            self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)]

        return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval(
            (st, act))
Example #4
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
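        # TD(0) prediction/control with a Q-value function approximator. The
        # bootstrap target depends on self.algorithm: Q-Learning (max over next
        # actions), Expected SARSA (expectation under the soft policy), or SARSA
        # (sampled next action); parameters are updated online at every step.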
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()
        episodes = 0

        while episodes < self.num_episodes:
            if self.exploring_start:
                state, action = self.mdp_rep.init_state_action_gen()
            else:
                state = self.mdp_rep.init_state_gen()
                action = get_rv_gen_func_single(this_polf(state))()

            # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in
            #        self.mdp_rep.state_action_func(state))))
            # print(self.qvf_fa.params)

            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(
                        self.qvf_fa.get_func_eval((next_state, a))
                        for a in self.state_action_func(next_state))
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_polf(next_state).get(a, 0.) *
                    #               self.qvf_fa.get_func_eval((next_state, a))
                    #               for a in self.state_action_func(next_state))
                    next_qv = get_expected_action_value(
                        {
                            a: self.qvf_fa.get_func_eval((next_state, a))
                            for a in self.state_action_func(next_state)
                        }, self.softmax, self.epsilon_func(episodes))
                else:
                    next_qv = self.qvf_fa.get_func_eval(
                        (next_state, next_action))

                target = reward + self.mdp_rep.gamma * next_qv
                # TD is an online update, so the policy improves at every time step
                self.qvf_fa.update_params([(state, action)], [target])
                if control:
                    this_polf = get_soft_policy_func_from_qf(
                        self.qvf_fa.get_func_eval, self.state_action_func,
                        self.softmax, self.epsilon_func(episodes))
                steps += 1
                terminate = steps >= self.max_steps or \
                    self.mdp_rep.terminal_state_func(state)
                state = next_state
                action = next_action

            episodes += 1

        return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval(
            (st, act))
Example #5
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
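        # TD(lambda) with a linear Q-value approximator: maintains an eligibility
        # trace over the feature vector and updates the weight vector self.qvf_w
        # in place at every step.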
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()
        episodes = 0
        updates = 0

        while episodes < self.num_episodes:
            et = np.zeros(self.qvf_fa.num_features)
            if self.exploring_start:
                state, action = self.mdp_rep.init_state_action_gen()
            else:
                state = self.mdp_rep.init_state_gen()
                action = get_rv_gen_func_single(this_polf(state))()
            features = self.qvf_fa.get_feature_vals((state, action))

            # print((episodes, max(self.qvf_fa.get_feature_vals((state, a)).dot(self.qvf_w)
            #                      for a in self.mdp_rep.state_action_func(state))))
            # print(self.qvf_w)

            old_qvf_fa = 0.
            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                next_features = self.qvf_fa.get_feature_vals(
                    (next_state, next_action))
                qvf_fa = features.dot(self.qvf_w)
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qvf_fa = max(
                        self.qvf_fa.get_feature_vals((next_state,
                                                      a)).dot(self.qvf_w)
                        for a in self.state_action_func(next_state))
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qvf_fa = sum(this_polf(next_state).get(a, 0.) *
                    #               self.qvf_fa.get_feature_vals((next_state, a)).dot(self.qvf_w)
                    #               for a in self.state_action_func(next_state))
                    next_qvf_fa = get_expected_action_value(
                        {
                            a: self.qvf_fa.get_feature_vals(
                                (next_state, a)).dot(self.qvf_w)
                            for a in self.state_action_func(next_state)
                        }, self.softmax, self.epsilon_func(episodes))
                else:
                    next_qvf_fa = next_features.dot(self.qvf_w)

                target = reward + self.mdp_rep.gamma * next_qvf_fa
                delta = target - qvf_fa
                alpha = self.vf_fa.learning_rate * \
                    (updates / self.learning_rate_decay + 1) ** -0.5
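                # Dutch-style eligibility trace and weight update; the previous
                # step's Q estimate (old_qvf_fa) is carried forward as in the
                # true-online TD(lambda) formulation.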
                et = et * self.gamma_lambda + features * \
                    (1 - alpha * self.gamma_lambda * et.dot(features))
                self.qvf_w += alpha * (et * (delta + qvf_fa - old_qvf_fa) -
                                       features * (qvf_fa - old_qvf_fa))

                if control and self.batch_size == 0:
                    this_polf = get_soft_policy_func_from_qf(
                        lambda sa: self.qvf_fa.get_feature_vals(sa).dot(
                            self.qvf_w), self.state_action_func, self.softmax,
                        self.epsilon_func(episodes))
                updates += 1
                steps += 1
                terminate = steps >= self.max_steps or \
                    self.mdp_rep.terminal_state_func(state)
                old_qvf_fa = next_qvf_fa
                state = next_state
                action = next_action
                features = next_features

            episodes += 1

            if control and self.batch_size != 0 and \
                    episodes % self.batch_size == 0:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval, self.state_action_func,
                    self.softmax, self.epsilon_func(episodes - 1))

        return lambda st: lambda act, st=st: self.qvf_fa.get_feature_vals(
            (st, act)).dot(self.qvf_w)
Example #6
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
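        # TD(lambda) with eligibility traces over the gradients of the function
        # approximator's parameters; self.offline selects end-of-episode batch
        # updates versus online per-step updates.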
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()
        episodes = 0

        while episodes < self.num_episodes:
            et = [np.zeros_like(p) for p in self.qvf_fa.params]
            if self.exploring_start:
                state, action = self.mdp_rep.init_state_action_gen()
            else:
                state = self.mdp_rep.init_state_gen()
                action = get_rv_gen_func_single(this_polf(state))()

            # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in
            #        self.mdp_rep.state_action_func(state))))
            # print(self.qvf_fa.params)

            steps = 0
            terminate = False

            states_actions = []
            targets = []
            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(self.qvf_fa.get_func_eval((next_state, a)) for a in
                                  self.state_action_func(next_state))
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_polf(next_state).get(a, 0.) *
                    #               self.qvf_fa.get_func_eval((next_state, a))
                    #               for a in self.state_action_func(next_state))
                    next_qv = get_expected_action_value(
                        {a: self.qvf_fa.get_func_eval((next_state, a)) for a in
                         self.state_action_func(next_state)},
                        self.softmax,
                        self.epsilon_func(episodes)
                    )
                else:
                    next_qv = self.qvf_fa.get_func_eval((next_state, next_action))

                target = reward + self.mdp_rep.gamma * next_qv
                delta = target - self.qvf_fa.get_func_eval((state, action))

                if self.offline:
                    states_actions.append((state, action))
                    targets.append(target)
                else:
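                    # Online update: decay each parameter's trace by gamma*lambda,
                    # add the gradient of Q(s, a) w.r.t. that parameter, and step
                    # the parameters by trace * TD-error.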
                    et = [et[i] * self.gamma_lambda + g for i, g in
                          enumerate(self.qvf_fa.get_sum_objective_gradient(
                              [(state, action)],
                              np.ones(1)
                          ))]
                    self.qvf_fa.update_params_from_gradient(
                        [-e * delta for e in et]
                    )
                if control and self.batch_size == 0:
                    this_polf = get_soft_policy_func_from_qf(
                        self.qvf_fa.get_func_eval,
                        self.state_action_func,
                        self.softmax,
                        self.epsilon_func(episodes)
                    )
                steps += 1
                terminate = steps >= self.max_steps or \
                    self.mdp_rep.terminal_state_func(state)

                state = next_state
                action = next_action

            if self.offline:
                avg_grad = [g / len(states_actions) for g in
                            self.qvf_fa.get_el_tr_sum_loss_gradient(
                                states_actions,
                                targets,
                                self.gamma_lambda
                            )]
                self.qvf_fa.update_params_from_gradient(avg_grad)

            episodes += 1

            if control and self.batch_size != 0 and \
                    episodes % self.batch_size == 0:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval,
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes - 1)
                )

        return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act))