Example #1
                else:
                    Q.append(r + self.gamma * self.v[s])
            # Greedy improvement: update pi(state) with the highest-value action
            self.pi[state] = self.actions[Q.index(max(Q))]
            # self.pi[state] = Q.index(max(Q))

    def policy_iterate(self):
        for _ in range(100):
            # Policy evaluation: updates v
            self.policy_evaluate()
            # Policy improvement: updates pi
            self.policy_improve()


if __name__ == "__main__":
    yuanyang = YuanYangEnv()
    policy_value = Policy_Value(yuanyang)
    policy_value.policy_iterate()
    flag = 1
    s = 0
    # print(policy_value.pi)
    step_num = 0
    # Print out the optimal path
    while flag:
        a = policy_value.pi[s]
        print('%d->%s\t' % (s, a))
        yuanyang.bird_male_position = yuanyang.state_to_position(s)
        yuanyang.render()
        time.sleep(0.2)
        step_num += 1
        yuanyang.state = s
        # Assumption: YuanYangEnv.step(a) advances from yuanyang.state and
        # returns (next_state, reward, done); stop at termination or after
        # 200 steps so the loop cannot run forever
        s_next, r, done = yuanyang.step(a)
        if done or step_num > 200:
            flag = 0
        s = s_next
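
Note: policy_evaluate and policy_improve are not shown in this excerpt. As a minimal self-contained sketch, the function below illustrates the kind of policy-evaluation sweep that policy_iterate relies on; the step_fn hook standing in for YuanYangEnv's transition (step_fn(s, a) -> (next_state, reward, done)) and the integer state indexing are assumptions, not the environment's confirmed API.

import numpy as np

def policy_evaluate(states, step_fn, pi, gamma, tol=1e-6, max_sweeps=1000):
    # Iterative policy evaluation for a deterministic policy pi: repeat
    # Bellman backups V(s) <- r + gamma * V(s') until the values converge.
    v = np.zeros(len(states))
    for _ in range(max_sweeps):
        delta = 0.0
        for s in states:
            s_next, r, done = step_fn(s, pi[s])
            # Terminal transitions contribute only the immediate reward
            new_v = r if done else r + gamma * v[s_next]
            delta = max(delta, abs(new_v - v[s]))
            v[s] = new_v
        if delta < tol:  # converged under the current policy
            break
    return v
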
Example #2
    def e_greedy_policy(self, s):
        # epsilon-greedy: exploit the best-known action with probability
        # 1 - epsilon, otherwise explore a uniformly random action
        a_max = np.argmax(self.Q[s])
        if np.random.uniform() < 1 - self.epislon:
            return self.env.actions[a_max]
        else:
            return self.env.actions[int(random.random() *
                                        len(self.env.actions))]

    # After training, run trials with the purely greedy policy
    def greedy_policy(self, s):
        a_max = np.argmax(self.Q[s])
        return self.env.actions[a_max]


if __name__ == '__main__':
    env = YuanYangEnv()
    agent = Qlearning(env)
    agent.td_learning()
    # Quick sanity check: total mass of the learned Q table
    print(np.sum(agent.Q))
    flag = 1
    s = 0
    step_num = 0
    # Print out the optimal path
    while flag:
        a = agent.greedy_policy(s)
        print('%d->%s\t' % (s, a))
        env.bird_male_position = env.state_to_position(s)
        env.render()
        time.sleep(0.2)
        step_num += 1
        env.state = s
        # Assumption: env.step(a) advances from env.state and returns
        # (next_state, reward, done), as used in Example #3 below
        s_next, r, done = env.step(a)
        if done or step_num > 200:
            flag = 0
        s = s_next
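
Note: agent.td_learning() is not shown in this excerpt either. The sketch below shows the tabular Q-learning update such a routine typically performs; reset_fn, step_fn, and choose_action are hypothetical hooks (reset_fn() -> s0, step_fn(s, a) -> (next_state, reward, done), choose_action(s) -> action index, e.g. the e_greedy_policy above), not YuanYangEnv's confirmed interface.

import numpy as np

def td_learning(Q, episodes, reset_fn, step_fn, choose_action, alpha, gamma):
    # Tabular Q-learning: behave with an exploring policy, but bootstrap
    # the TD target from the greedy action in the next state (off-policy).
    for _ in range(episodes):
        s = reset_fn()
        done = False
        while not done:
            a = choose_action(s)
            s_next, r, done = step_fn(s, a)
            target = r if done else r + gamma * np.max(Q[s_next])
            Q[s, a] += alpha * (target - Q[s, a])  # move Q toward the target
            s = s_next
    return Q
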
Example #3
            self.pi = self.policy_improvment()

    def choose_action(self, s):
        # Sample an action from the stochastic policy: pi[s][i] holds the
        # probability of taking action i in state s (inverse-CDF sampling)
        a_select = 0
        rd = random.random()
        prob = 0
        for i in range(len(self.env.actions)):
            prob += self.pi[s][i]
            if rd <= prob:
                a_select = self.env.actions[i]
                break
        return a_select


if __name__ == '__main__':
    env = YuanYangEnv()
    agent = DP_soft(env, 0)
    agent.policy_iteration()
    Q = np.zeros((len(env.states), len(env.actions)), dtype=np.float32)
    for state in env.states:
        if env.is_terminal(state):
            continue
        for action in env.actions:
            # env.step advances from the environment's internal state, so
            # point it at the state being evaluated before each transition
            env.state = state
            next_state, r, done = env.step(action)
            if done:
                Q[state, action] = r
            else:
                Q[state, action] = r + agent.gamma * agent.V[next_state]
    print("动作值函数总和为:", np.sum(Q))