Example #1
    def __init__(self, exp, idx):
        super().__init__(exp, idx)
        self.exp = exp
        self.idx = idx

        self.states = self.params['states']

        # behavior policy: probability of taking the left action vs. the right action
        mu_pl = self.params['behavior']
        self.behavior = Policy(lambda s: [mu_pl, 1 - mu_pl])

        # target policy: probability of taking the left action vs. the right action
        pi_pl = self.params['target']
        self.target = Policy(lambda s: [pi_pl, 1 - pi_pl])

        # random-walk environment over the configured number of states
        self.env = RWEnv(self.states)

        # build representation
        representation = self.params['representation']
        self.rep = self._buildRepresentation(representation)

        # build agent; the random walk has two actions (left and right)
        self.agent = self.Agent(self.rep.features(), 2, self.params)
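Both constructors wrap the raw action probabilities in a Policy object built from a function of the state, and Example #2 later calls target.probs(0). The real Policy class lives elsewhere in this codebase; the following is only a minimal sketch of such a wrapper, where selectAction is a hypothetical convenience method the examples above do not rely on.

import numpy as np

class Policy:
    """Minimal sketch of a policy wrapper: maps a state to action probabilities."""
    def __init__(self, probs_fn):
        self.probs_fn = probs_fn

    def probs(self, s):
        # action probabilities for state s, ordered [p(left), p(right)] in these examples
        return np.asarray(self.probs_fn(s))

    def selectAction(self, s):
        # hypothetical helper: sample an action index according to probs(s)
        p = self.probs(s)
        return np.random.choice(len(p), p=p)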
Example #2
    def __init__(self, exp, idx):
        super().__init__(exp, idx)
        self.exp = exp
        self.idx = idx

        N = self._getSize()

        self.reward_scale = self.metaParameters.get('reward_scale', 1)
        self.env = Chain(N, self.reward_scale)

        # build target policy (each experiment subclass picks the left/right probabilities)
        self.target = self._getTarget()

        # behavior policy: uniform random over the two actions
        self.behavior = Policy(lambda s: [0.5, 0.5])

        # true state values under the target policy
        self.v_star = self.compute_v(N, self.target, self.reward_scale)

        # build representation
        self.rep = self._getRepresentation(N)

        # build agent
        self.agent = self.Agent(self.rep.features(), self.metaParameters)

        # compute the feature representation (observation) of each state once
        self.X = np.array([self.rep.encode(i) for i in range(N + 1)])

        # db is the stationary state distribution under the behavior policy:
        # (1/(n+1)) * sum_{k=0}^{n} P_mu^k converges to a matrix with db in each row,
        # where P_mu is the transition matrix of the Markov chain induced by the behavior policy
        self.db = self._getdb()

        # build transition probability matrix under the target policy;
        # state N is the absorbing terminal state
        self.P = np.zeros((N + 1, N + 1))
        pl, pr = self.target.probs(0)
        self.P[0, 1] = pr          # right from the leftmost state
        self.P[0, N] = pl          # left from the leftmost state terminates
        self.P[N - 1, N - 2] = pl  # left from the rightmost state
        self.P[N - 1, N] = pr      # right from the rightmost state terminates
        for i in range(1, N - 1):
            self.P[i, i - 1] = pl
            self.P[i, i + 1] = pr

        # expected one-step reward under the target policy:
        # -reward_scale for terminating on the left, +reward_scale for terminating on the right
        self.R = np.zeros(N + 1)
        self.R[0] = pl * -self.reward_scale
        self.R[N - 1] = pr * self.reward_scale

        self.setupIdealH()
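The constructor builds the transition matrix P and expected rewards R under the target policy, and compute_v (called above to produce v_star) presumably solves the resulting Bellman equation. Below is a sketch of that linear solve under the assumption of an episodic chain with gamma = 1, where the all-zero row for the absorbing terminal state N keeps the system well-posed; the function name and signature are illustrative, not the codebase's actual compute_v.

import numpy as np

def compute_v_from_model(P, R, gamma=1.0):
    # Bellman equation for state values: v = R + gamma * P v, i.e. (I - gamma * P) v = R.
    # The terminal state's row of P is all zeros, so (I - gamma * P) stays invertible
    # and the terminal value comes out as R[terminal] = 0.
    n = P.shape[0]
    return np.linalg.solve(np.eye(n) - gamma * P, R)

Under these assumptions, compute_v_from_model(self.P, self.R) would reproduce the target-policy state values for the chain built above.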
Example #3
    def _getTarget(self):
        # target policy: 10% left, 90% right
        return Policy(lambda s: [.1, .9])
Example #4
    def _getTarget(self):
        # target policy: 25% left, 75% right
        return Policy(lambda s: [.25, .75])
Example #5
    def _getTarget(self):
        # target policy: 40% left, 60% right
        return Policy(lambda s: [.4, .6])
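The three _getTarget variants differ only in how strongly the target policy favors the right action relative to the uniform behavior policy from Example #2. A small usage sketch mirroring how Example #2 queries a policy; the standalone variable names are illustrative.

target = Policy(lambda s: [.1, .9])       # as in Example #3
behavior = Policy(lambda s: [0.5, 0.5])   # uniform behavior policy from Example #2

pl, pr = target.probs(0)   # p(left), p(right) for state 0, as used when building P
print(pl, pr)              # 0.1 0.9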