Example No. 1
 def sarsa(self, YuanYangEnv, num_iter, alpha, epsilon):
     for iter in range(num_iter):
         # randomly initialize the state
         s = YuanYangEnv.reset()
         # randomly choose the initial action
         a = YuanYangEnv.actions[int(random.random() *
                                     len(YuanYangEnv.actions))]
         t = False
         count = 0
         while not t and count < 200:
             # interact with the environment: get the next state, reward, and done flag
             s_next, r, t = YuanYangEnv.transform(s, a)
             a_num = self.find_anum(YuanYangEnv, a)
             if t:
                 q_target = r
             else:
                 # choose the action at the next state epsilon-greedily (on-policy)
                 a1 = self.epsilon_greedy_policy(YuanYangEnv, self.qvalue,
                                                 s_next, epsilon)
                 a1_num = self.find_anum(YuanYangEnv, a1)
                 # SARSA update target (bootstraps on the epsilon-greedy action, not the max)
                 q_target = r + YuanYangEnv.gamma * self.qvalue[s_next,
                                                                a1_num]
             # update the action-value function with the TD update
             self.qvalue[s, a_num] = self.qvalue[
                 s, a_num] + alpha * (q_target - self.qvalue[s, a_num])
             # YuanYangEnv2.bird_male_position = YuanYangEnv2.state_to_position(s)
             # YuanYangEnv2.render()
             # time.sleep(1)
             # move on to the next state
             s = s_next
             if not t:
                 # SARSA is on-policy: the action chosen for the target is the action taken next
                 a = a1
             count += 1
     return self.qvalue
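The loop above relies on two helpers, find_anum and epsilon_greedy_policy, that are not part of the excerpt. Below is a minimal sketch of what they might look like, assuming self.qvalue is a NumPy array of shape (num_states, num_actions) and YuanYangEnv.actions is a plain list; the bodies are inferred from how the helpers are called, not taken from the original source.

import random
import numpy as np

# Both helpers would live on the same agent class as the sarsa method above.
def find_anum(self, YuanYangEnv, a):
    # map an action object to its index in the environment's action list
    return YuanYangEnv.actions.index(a)

def epsilon_greedy_policy(self, YuanYangEnv, qvalue, state, epsilon):
    # with probability epsilon explore with a random action,
    # otherwise exploit the action with the largest Q value at this state
    if random.random() < epsilon:
        return YuanYangEnv.actions[int(random.random() * len(YuanYangEnv.actions))]
    return YuanYangEnv.actions[int(np.argmax(qvalue[state, :]))]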
Example No. 2
                # policy evaluation: one-step backup for the current action
                v1 = r + self.gamma * self.v[s]
                # policy improvement: keep the action with the largest backup value
                for action in self.actions:
                    s, r, t = yuanyang.transform(state, action)
                    if v1 < r + self.gamma * self.v[s]:
                        a1 = action
                        v1 = r + self.gamma * self.v[s]
                delta += abs(v1 - self.v[state])
                self.pi[state] = a1
                self.v[state] = v1
            if delta < 1e-6:
                print("迭代次数为",i)
                break
if __name__ == "__main__":
    yuanyang     = YuanYangEnv()
    policy_value = DP_Value_Iter(yuanyang)
    policy_value.value_iteration()
    # write the V values into the environment grid for display
    s = 0
    path = []
    for state in range(100):
        i = int(state / 10)
        j = state % 10
        yuanyang.value[j, i] = policy_value.v[state]
    flag = 1
    step_num = 0
    # print out the optimal path
    while flag:
        # record the current state on the path
        path.append(s)
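The excerpt is cut off inside the path-printing loop. A plausible continuation, patterned on the analogous loop in Example No. 3 below (and assuming time is imported at module level), is sketched here; the 200-step safety cap and the way the done flag ends the loop are assumptions rather than part of the original code.

        a = policy_value.pi[s]
        print('%d->%s\t' % (s, a))
        yuanyang.bird_male_position = yuanyang.state_to_position(s)
        yuanyang.render()
        time.sleep(0.2)
        step_num += 1
        # take the greedy action and move to the next state
        s_, r, t = yuanyang.transform(s, a)
        if t or step_num > 200:  # stop at a terminal state or after the assumed step cap
            flag = 0
        s = s_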
Example No. 3
                action = self.pi[state]
                s, r, t = yuanyang.transform(state, action)
                new_v = r + self.gamma * self.v[s]
                delta += abs(self.v[state] - new_v)
                self.v[state] = new_v
            if delta < 1e-6:
                break

    def policy_iterate(self):
        for i in range(100):
            self.policy_evaluate()
            self.policy_improve()


if __name__ == "__main__":
    yuanyang = YuanYangEnv()
    policy_value = Policy_Value(yuanyang)
    policy_value.policy_iterate()
    flag = 1
    s = 0
    # print(policy_value.pi)
    step_num = 0
    # print out the optimal path
    while flag:
        a = policy_value.pi[s]
        print('%d->%s\t' % (s, a))
        yuanyang.bird_male_position = yuanyang.state_to_position(s)
        yuanyang.render()
        time.sleep(0.2)
        step_num += 1
        s_, r, t = yuanyang.transform(s, a)
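Example No. 3 is also truncated, and the policy_improve method called from policy_iterate is not shown. Below is a minimal sketch of a greedy improvement step, modeled on the improvement loop in Example No. 2; it assumes the class keeps a self.states list alongside self.actions, self.v, self.pi, and self.gamma, and it refers to the module-level yuanyang exactly as the policy_evaluate fragment does.

    def policy_improve(self):
        # greedy policy improvement: for every state, pick the action whose
        # one-step backup r + gamma * V(s') is largest under the current V
        for state in self.states:
            a1 = self.actions[0]
            s, r, t = yuanyang.transform(state, a1)
            v1 = r + self.gamma * self.v[s]
            for action in self.actions[1:]:
                s, r, t = yuanyang.transform(state, action)
                if v1 < r + self.gamma * self.v[s]:
                    a1 = action
                    v1 = r + self.gamma * self.v[s]
            self.pi[state] = a1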