-
Notifications
You must be signed in to change notification settings - Fork 0
/
CS533-4.py
181 lines (147 loc) · 5.78 KB
/
CS533-4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
__author__ = 'Austin'
import random
from mdp import MDP
from sim import Sim
from plan import *
def random_policy(sim):
    """Baseline policy: take a uniformly random action every step.

    Runs the simulator until the trial ends and returns the total
    (undiscounted) reward collected.
    """
    total = 0
    while not sim.is_trial_over():
        chosen = random.choice(range(sim.mdp.num_actions))
        total += sim.do_action(chosen)
    return total
def safe_policy(sim, p_park):
    """Drive past occupied spots; park in an empty spot with probability p_park.

    Bug fix: the original compared ``random.random() > p_park`` before
    parking, which parked with probability ``1 - p_park`` -- the inverse of
    what the parameter name promises.  The sole visible caller passes 0.5, so
    the inversion was masked.  The test is now ``< p_park`` so that
    ``p_park`` is the actual probability of parking when the spot is empty.

    Returns the total reward accumulated over the trial.
    """
    rewards = 0
    while not sim.is_trial_over():
        if sim.is_occupied():
            # Occupied spot: the only safe move is to keep driving.
            rewards += sim.do_action(sim.drive())
        elif random.random() < p_park:
            # Empty spot: park with probability p_park ...
            rewards += sim.do_action(sim.park())
        else:
            # ... otherwise keep driving.
            rewards += sim.do_action(sim.drive())
    return rewards
def range_policy(sim, near, far):
    """Park only in an empty spot whose location lies in [near, far].

    Lets the agent hold out for the best spots, or steer clear of ones it
    should avoid (e.g. handicapped).  Returns the total trial reward.
    """
    total = 0
    while not sim.is_trial_over():
        # Short-circuit: get_location() is consulted only for empty spots.
        if not sim.is_occupied() and near <= sim.get_location() <= far:
            total += sim.do_action(sim.park())
        else:
            total += sim.do_action(sim.drive())
    return total
def run_policy(sim, policy):
    """Execute a fixed state-indexed policy on sim.

    Steps until the trial ends or 200 time units elapse, always taking
    ``policy[current_state]``.  Returns the total reward collected.
    """
    total = 0
    while not sim.is_trial_over() and sim.time < 200:
        action = policy[sim.current]
        total += sim.do_action(action)
    return total
def adp_rl(mdp, sim, transition_count):
    """One 200-step episode of adaptive-dynamic-programming RL.

    Each step: replan on the current learned model, act epsilon-greedily
    (epsilon = 0.1), then fold the observed reward and transition back into
    the model.  Returns the updated ``(mdp, transition_count)`` pair.
    """
    epsilon = 0.1
    while sim.time < 200:
        state = sim.current
        # Solve the current model; only the greedy policy is used below.
        val, policy, iterations = plan(mdp, 0.95, 0.05)
        if random.random() < epsilon:
            # Explore: uniformly random action.
            action = random.choice(range(sim.get_actions()))
        else:
            # Exploit: follow the planned policy.
            action = policy[state]
        reward = sim.do_action(action)
        successor = sim.current
        # Model update: record the reward observed in the successor state and
        # re-estimate the whole transition matrix from the new counts.
        mdp.rewards[successor] = reward
        transition_count[action][state][successor] += 1
        mdp = update_transitions(mdp, transition_count)
    return mdp, transition_count
def update_transitions(mdp, transition_count):
    """Re-estimate mdp.transitions as normalized visit counts.

    Each row ``transitions[a][s_old]`` becomes the maximum-likelihood
    distribution ``count / row_total`` (the caller seeds the counts with a
    small prior, so rows are never all-zero -- TODO confirm for other callers).

    Fix: the row total was recomputed by ``sum`` inside the innermost loop,
    i.e. O(num_states) work per cell; it is loop-invariant, so it is hoisted
    out.  Values produced are identical.
    """
    for a in range(mdp.num_actions):
        for s_old in range(mdp.num_states):
            row = transition_count[a][s_old]
            total = float(sum(row))  # invariant over s_new; hoisted
            for s_new in range(mdp.num_states):
                mdp.transitions[a][s_old][s_new] = float(row[s_new]) / total
    return mdp
def average(values):
    """Return the arithmetic mean of a non-empty sequence as a float."""
    return float(sum(values)) / len(values)
def generate_blank_mdp(n_actions, n_states):
    """Write an uninformative "blank" MDP to blank_A_actions_S_states_mdp.txt.

    File layout: n_states, n_actions, a zero reward vector, then one
    row-stochastic transition matrix per action with every entry equal to
    1/n_states (uniform prior over successors).
    """
    uniform = str(1.0 / float(n_states))
    reward_line = " ".join([str(0) for _ in range(n_states)])
    row = " ".join([uniform for _ in range(n_states)])
    matrix = "\n".join([row for _ in range(n_states)])
    name = "blank_" + str(n_actions) + "_actions_" + str(n_states) + "_states_mdp"
    with open(name + ".txt", "w") as out_file:
        out_file.write(str(n_states) + "\n")
        out_file.write(str(n_actions) + "\n")
        out_file.write(reward_line + "\n")
        for _ in range(n_actions):
            out_file.write(matrix + "\n")
def part_ii_evaluation():
    """Monte-Carlo comparison of the three hand-written policies (part II).

    Runs random, safe (p_park = 0.5), and range policies 1000 times each on
    both the linear- and quadratic-reward 10-spot parking MDPs, then prints
    the six mean total rewards in the same order as the result lists below.
    """
    linear = "parking_mdp_linear_rewards_n_10.txt"
    quad = "parking_mdp_quad_rewards_n_10.txt"
    random_results_1, random_results_2 = [], []
    safe_results_1, safe_results_2 = [], []
    range_results_1, range_results_2 = [], []
    for trial in range(1000):
        print(trial)
        # Keep this exact call order: each run consumes the shared RNG stream.
        random_results_1.append(random_policy(Sim(MDP(linear))))
        random_results_2.append(random_policy(Sim(MDP(quad))))
        safe_results_1.append(safe_policy(Sim(MDP(linear)), 0.5))
        safe_results_2.append(safe_policy(Sim(MDP(quad)), 0.5))
        range_results_1.append(range_policy(Sim(MDP(linear)), 2, 8))
        range_results_2.append(range_policy(Sim(MDP(quad)), 2, 6))
    print(average(random_results_1))
    print(average(safe_results_1))
    print(average(range_results_1))
    print(average(random_results_2))
    print(average(safe_results_2))
    print(average(range_results_2))
def part_iii_evaluation(sim_filename):
    """ADP reinforcement-learning evaluation (part III) for one parking MDP.

    Ten rounds of: (a) 100 episodes of model-based RL refining the learned
    MDP, then (b) replanning and scoring the greedy policy over 100 fresh
    simulations.  Prints the value function, policy, and per-round average
    reward, then all round averages.

    NOTE(review): assumes blank_2_actions_81_states_mdp.txt already exists on
    disk (see generate_blank_mdp) -- confirm before running.
    """
    print(sim_filename)
    learned_mdp = MDP("blank_2_actions_81_states_mdp.txt")
    # Prior: every transition pre-counted 0.1 times, so the normalization in
    # update_transitions never divides by zero.
    counts = [[[0.1 for _ in range(81)] for _ in range(81)] for _ in range(2)]
    round_rewards = []
    for big_loop in range(10):
        print("Big loop " + str(big_loop))
        round_rewards.append([])
        for _ in range(100):
            learned_mdp, counts = adp_rl(learned_mdp, Sim(MDP(sim_filename)), counts)
        value_fn, policy, iterations = plan(learned_mdp, 0.99, 0.01)
        print("Value: " + str(value_fn))
        print("Policy: " + str(policy))
        #print "Reward: " + str(mdp.rewards)
        #print "Transitions: " + str(mdp.transitions)
        for _ in range(100):
            round_rewards[big_loop].append(run_policy(Sim(MDP(sim_filename)), policy))
        print("Average reward of policy: " + str(average(round_rewards[big_loop])))
    for per_round in round_rewards:
        print(average(per_round))
# Script entry: run the part III ADP-RL evaluation on both reward variants.
# part_ii_evaluation is disabled pending a rerun (trials now start randomly);
# the six commented numbers below are its previously recorded averages, in
# the order it prints them (random/safe/range for linear, then quadratic).
#part_ii_evaluation() # need to redo, since random start now
# -71.391
# 3.539
# 12.004
# -71.498
# -0.154
# 8.024
part_iii_evaluation("parking_mdp_linear_rewards_n_10.txt")
part_iii_evaluation("parking_mdp_quad_rewards_n_10.txt")