/
player.py
162 lines (121 loc) · 4.29 KB
/
player.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# List potential players
# ===========================
from random import randint, random
import numpy as np
from numpy.random import beta
from random import random
class Random:
    """
    Player which plays arms uniformly at random.

    Args:
        nb_arms: number of arms available (assumed > 0).
    """
    def __init__(self, nb_arms):
        self.nb_arms = nb_arms
    def choose_next_arm(self):
        """Return an arm index drawn uniformly from {0, ..., nb_arms - 1}."""
        return randint(0, self.nb_arms - 1)
    def update(self, arm, reward):
        """No-op: a random player keeps no statistics."""
        pass
    def restart(self):
        """No-op, added for interface consistency: every other player in this
        file exposes restart(), so a generic experiment loop can call it."""
        pass
class Oracle:
    """
    Player that always pulls a single fixed arm, assumed to be the best one.

    Args:
        best_arm: index of the arm this player will pull forever.
    """
    def __init__(self, best_arm):
        self.best_arm = best_arm
    def choose_next_arm(self):
        """Always return the stored best arm."""
        return self.best_arm
    def update(self, arm, reward):
        """No-op: the oracle ignores observed rewards."""
        pass
    def restart(self):
        """No-op: there is no internal state to reset."""
        pass
class ExploreThenCommit:
    """
    Round-robin exploration for the first n0 pulls, then commit to the
    empirically best arm for the rest of the horizon.

    Args:
        nb_arms: number of arms.
        n0: number of exploration pulls before committing.

    Attributes:
        cum_reward: per-arm sum of observed rewards.
        nb_trials: per-arm pull counts.
        winner: arm committed to after exploration (-1 while still exploring).
    """
    def __init__(self, nb_arms, n0):
        self.cum_reward = np.zeros(nb_arms)
        self.nb_trials = np.zeros(nb_arms, dtype=np.uint)
        self.n0 = n0
        self.winner = -1
    def choose_next_arm(self):
        """Return the next arm: round-robin while t < n0, else the winner."""
        t = np.sum(self.nb_trials)
        if t < self.n0:
            # explore: cycle through the arms in order
            return (np.uint(t % self.cum_reward.shape[0]))
        else:
            # exploit the committed arm
            return self.winner
    def update(self, arm, reward):
        """Record the reward; commit to the best empirical mean at t == n0."""
        self.cum_reward[arm] += reward
        self.nb_trials[arm] += 1
        T = np.sum(self.nb_trials)
        if T == self.n0:
            # Bug fix: when n0 is not a multiple of nb_arms some arms have
            # zero trials here, so cum_reward / nb_trials produced nan (0/0);
            # np.argmax propagates nan and could commit to an arm that was
            # never pulled. Restrict the argmax to arms played at least once.
            played = self.nb_trials > 0
            means = np.full(self.cum_reward.shape[0], -np.inf)
            means[played] = self.cum_reward[played] / self.nb_trials[played]
            self.winner = np.argmax(means)
    def restart(self):
        """Reset all statistics so the player can start a fresh run."""
        nb_arms = self.cum_reward.shape[0]
        self.cum_reward = np.zeros(nb_arms)
        self.nb_trials = np.zeros(nb_arms, dtype=np.uint)
        self.winner = -1
class EpsilonNGreedy:
    """
    Epsilon-greedy player with a decaying exploration rate: at step t it
    explores uniformly at random with probability c/t, and otherwise pulls
    the arm with the best empirical mean so far.
    """
    def __init__(self, nb_arms, c):
        # Per-arm reward sums and pull counts.
        self.cum_reward = np.zeros(nb_arms)
        self.nb_trials = np.zeros(nb_arms, dtype=np.uint)
        self.c = c
    def choose_next_arm(self, epsilon=10 ** (-5)):
        """Pick an arm; epsilon only guards the divisions at t = 0 and for
        arms that were never pulled."""
        step = sum(self.nb_trials) + epsilon
        if random() >= self.c / step:
            # exploit: arm with the highest empirical mean so far
            empirical_means = self.cum_reward / (self.nb_trials + epsilon)
            return np.argmax(empirical_means)
        # explore uniformly at random
        return randint(0, self.cum_reward.shape[0] - 1)
    def update(self, arm, reward):
        """Accumulate the observed reward for the pulled arm."""
        self.nb_trials[arm] += 1
        self.cum_reward[arm] += reward
    def restart(self):
        """Clear all statistics for a fresh run."""
        n = self.cum_reward.shape[0]
        self.cum_reward = np.zeros(n)
        self.nb_trials = np.zeros(n, dtype=np.uint)
class UCB1:
    """
    UCB1 player: pulls the arm maximizing the empirical mean plus an upper
    confidence bonus sqrt(c * log(t) / n_i).

    Args:
        nb_arms: number of arms.
        c: exploration coefficient (c = 2 gives the classical UCB1 index).
    """
    def __init__(self, nb_arms, c=2):
        self.c = c
        self.nb_arms = nb_arms
        self.cum_reward = np.zeros(nb_arms)
        self.nb_trials = np.zeros(nb_arms, dtype=np.uint)
    def choose_next_arm(self, epsilon=10 ** (-5)):
        """Return the arm with the highest upper confidence bound.

        epsilon avoids division by zero for arms never pulled; such arms get
        a very large bonus and are therefore tried first.
        """
        # Current time step (1-based so log(t) is defined on the first call).
        t = sum(self.nb_trials) + 1
        mu = self.cum_reward / (self.nb_trials + epsilon)
        bonus = np.sqrt((self.c * np.log(t)) / (self.nb_trials + epsilon))
        return np.argmax(mu + bonus)
    def update(self, arm, reward):
        """Record the reward observed for the pulled arm."""
        self.cum_reward[arm] += reward
        self.nb_trials[arm] += 1
    def restart(self):
        """Reset all statistics so the player can start a fresh run."""
        nb_arms = self.cum_reward.shape[0]
        self.cum_reward = np.zeros(nb_arms)
        self.nb_trials = np.zeros(nb_arms, dtype=np.uint)
class ThompsonSamplingBernoulli:
    """
    Thompson sampling with Beta posteriors for rewards in [0, 1].

    Each arm keeps Beta(success, failure) pseudo-counts; a reward is first
    binarized by a Bernoulli draw (success with probability `reward`) before
    updating the posterior, so non-binary rewards in [0, 1] are supported.
    """
    def __init__(self, nb_arms, prior_s=0.5, prior_f=0.5):
        # Prior pseudo-counts, kept so restart() can rebuild the posteriors.
        self.prior_s = prior_s
        self.prior_f = prior_f
        self.success, self.failure = self._fresh_counts(nb_arms)
    def _fresh_counts(self, nb_arms):
        """Return new (success, failure) pseudo-count arrays from the priors."""
        successes = np.ones(nb_arms, dtype=np.uint) * self.prior_s
        failures = np.ones(nb_arms, dtype=np.uint) * self.prior_f
        return successes, failures
    def choose_next_arm(self):
        """Sample one mean per arm from its Beta posterior, play the best."""
        sampled_means = beta(self.success, self.failure)
        return np.argmax(sampled_means)
    def update(self, arm, reward):
        """Binarize the reward via a Bernoulli draw, then update the counts."""
        win = int(random() < reward)
        self.success[arm] += win
        self.failure[arm] += 1 - win
    def restart(self):
        """Reset the posteriors to the prior pseudo-counts."""
        self.success, self.failure = self._fresh_counts(self.success.shape[0])