def update(self, arm, reward):
    """Fold the reward observed for the chosen arm into that arm's statistics.

    Delegates to ``Policy.update`` first, then updates ``self.b[arm]`` and
    ``self.A[arm]`` from the context stored for ``arm``.

    :param arm: index of the arm that was selected
    :param reward: reward observed for that arm
    """
    Policy.update(self, arm, reward)

    # Accumulate the reward-scaled context *before* the reshape below, so the
    # sum uses the context in whatever shape it currently has.
    scaled_context = reward * self.context[arm]
    self.b[arm] = self.b[arm] + scaled_context

    # Reshape the stored context in place to (d, 1). The context is looked up
    # again on every access instead of being cached in a local, so the result
    # does not depend on whether indexing returns the same object each time.
    self.context[arm].shape = (self.d, 1)

    # Context times its transpose, added onto the arm's A matrix.
    context_product = self.context[arm].dot(np.transpose(self.context[arm]))
    self.A[arm] = self.A[arm] + context_product
def update(self, arm, reward):
    """Scale the chosen arm's weight by an exponential of its reward.

    Delegates to ``Policy.update`` first, then multiplies
    ``self._weights[arm]`` by
    ``exp(gamma * reward / (n_bandits * probs[arm]))``.

    :param arm: index of the arm that was selected
    :param reward: reward observed for that arm
    """
    Policy.update(self, arm, reward)

    # Keep numerator and denominator separate but in the original operation
    # order, so the floating-point result is unchanged.
    exponent_num = self.gamma * reward
    exponent_den = self.n_bandits * self._probs[arm]
    growth = math.exp(exponent_num / exponent_den)

    # TODO: can the weights keep growing without bound?
    self._weights[arm] *= growth