-
Notifications
You must be signed in to change notification settings - Fork 0
/
RL_easy_21.py
85 lines (55 loc) · 1.48 KB
/
RL_easy_21.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 28 15:24:31 2019
@author: gbcfr
"""
import numpy as np
import random as rd
from step import step
# Question 2
# Applying Monte-Carlo control
# No discounting
#
# NOTE(review): the listing this was restored from had lost its indentation.
# The policy-improvement sweep is placed once per episode (after the episode's
# Q updates) -- confirm this matches the intended nesting.
# NOTE(review): `step` comes from the project-local `step` module; it is
# presumably `step(state, action, next_state) -> (state, next_state, reward,
# terminal)` -- confirm against that module.

# Initialization
terminal = False
# Policy table, indexed [s[0] - 1, s[1] - 1].  An explicit 5-character string
# dtype is required: np.full(..., 'hit') infers '<U3', which silently
# truncates every later 'stick' assignment to 'sti'.
pi = np.full((21, 10), 'hit', dtype='<U5')
pi[17:] = 'stick'
n0 = 100
# Visit counters: per state, and per (state, action) with 0 = hit, 1 = stick.
ns = np.zeros((21, 10), dtype=int)
nsa = np.zeros((21, 10, 2), dtype=int)
alpha = 0
# Action values must be floating point: with an integer array every
# incremental update `Q += alpha * (G - Q)` is truncated on store.
Q = np.zeros((21, 10, 2), dtype=float)
V = np.zeros((21, 10), dtype=float)  # not used in this section
G = 0
r = 0

# Start
episodes = 1000
for episode in range(episodes):
    sum_p = rd.randint(1, 10)
    sum_d = rd.randint(1, 10)
    s = [sum_p, sum_d]
    sp = [sum_p, sum_d]
    terminal = False

    # Play one full episode under the current policy, recording every
    # (state row, state column, action index, reward) that was visited.
    visited = []
    while not terminal:
        # Capture the indices of the state the action is taken FROM before
        # calling step(), so the bookkeeping never uses a successor state.
        row, col = s[0] - 1, s[1] - 1
        a = str(pi[row, col])
        ns[row, col] += 1
        s, sp, r, terminal = step(s, a, sp)
        visited.append((row, col, 0 if a == 'hit' else 1, r))
        s = sp[:]

    # Every-visit Monte-Carlo update.  Walking the episode backwards makes G
    # the undiscounted return that FOLLOWS each visited (state, action),
    # rather than the reward accumulated before it.
    G = 0
    for v_row, v_col, v_act, v_reward in reversed(visited):
        G += v_reward
        nsa[v_row, v_col, v_act] += 1
        alpha = 1 / nsa[v_row, v_col, v_act]
        Q[v_row, v_col, v_act] += alpha * (G - Q[v_row, v_col, v_act])

    # Policy improvement: for each state keep the greedy action with
    # probability 1 - epsilon, otherwise a uniformly random action, where
    # epsilon = n0 / (n0 + ns[state]).  Ties between the two action values
    # resolve to 'hit'.
    for idx in np.ndindex(pi.shape):
        epsilon = n0 / (n0 + ns[idx])
        greedy = 'hit' if Q[idx[0], idx[1], 0] >= Q[idx[0], idx[1], 1] else 'stick'
        pi_new = rd.choices(
            population=[rd.choice(['hit', 'stick']), greedy],
            weights=[epsilon, 1 - epsilon],
        )
        pi[idx] = pi_new[0]