Example #1
                # Accumulate the k'th TD error, weighted by the running factor E
                G += E * self.delta[k_id]
                k1 = (k + 1) % (n + 1)
                # Interpolate between expectation (sigma=0) and pure sampling (sigma=1) for the next step
                E = self.gamma * E * ((1 - self.sigma[k_id]) * self.target_policy(self.S[k1])['probs'][self.A[k1]]
                                      + self.sigma[k1])
                # Accumulate the per-decision importance-sampling correction
                rho *= (1 - self.sigma[k_id] + self.sigma[k_id] * self.rho[k_id])

            # Off-policy correction and update of the state-action pair visited at time tau
            delta = rho * (G - self.Q[S_tau][A_tau])
            self.Q[S_tau][A_tau] += self.alpha * delta

        self.t += 1
        if done:
            self.t = 0

    def get_policy(self, s, epsilon):
        # Epsilon-greedy distribution over the nA actions w.r.t. the current Q-values
        a = np.argmax(self.Q[s])
        pi_probs = np.ones(self.env.nA) * epsilon / self.env.nA
        pi_probs[a] += (1 - epsilon)
        return {'action': np.random.choice(range(self.env.nA), p=pi_probs), 'probs': pi_probs}

    def get_sigma(self, a):
        # Draw sigma uniformly from {0, 1}: each backup step is randomly treated as
        # full sampling (Sarsa-like, sigma=1) or full expectation (tree-backup-like, sigma=0)
        return np.random.randint(2, size=self.env.nA)[a]


if __name__ == "__main__":
    envn = 'CliffWalking-v0'
    env = gym.make(envn)
    agent = Qsigma(env, n=3, gamma=0.9, epsilon=0.1, alpha=0.5)
    exp = f"experiments/{envn}_{agent}"
    train(env, agent, exp, num_episodes=50, max_runs=5)
    main_plot(exp, smoothing_window=10)
    plt.ylim([-100, 0])
    plt.show()
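
For reference, the get_policy helper above constructs the standard epsilon-greedy distribution: every action receives the exploration mass ε/nA and the greedy action the remaining 1-ε,

$$\pi(a \mid s) = \frac{\epsilon}{|\mathcal{A}|} + (1-\epsilon)\,\mathbf{1}\!\left[a = \arg\max_b Q(s,b)\right].$$

get_sigma then draws σ uniformly from {0, 1}, so each step of the n-step backup is randomly treated as a full-sampling (Sarsa-style) or full-expectation (tree-backup-style) step.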
Example #2
    # Repeat the comparison so results are averaged over multiple independent training runs
    for _ in range(50):
        agents = [
            SarsaAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma),
            ExpSarsaAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma),
            QslAgent(env,
                     epsilon=epsilon,
                     alpha=alpha,
                     gamma=gamma,
                     sigma_strat='static',
                     sigma=0.5,
                     lamb=1),
            QslAgent(env, epsilon=epsilon, alpha=alpha, gamma=gamma, lamb=0.8)
        ]

        experiments = []
        for agent in agents:
            expn = f"experiments/{str(agent)}"
            train(env, agent, expn, num_episodes=num_episodes, max_runs=100)
            experiments.append(expn)
    return experiments


if __name__ == "__main__":
    envn = 'StochWindyGridWorld-v0'
    env = gym.make(envn)
    experiments = run_exp(env, num_episodes=200)
    main_plot(experiments, smoothing_window=15)
    plt.ylim([-100, -30])
    plt.savefig('plot.png')
    plt.show()
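
The agents compared above differ only in their backup target. The one-step Q(σ) target, which QslAgent presumably generalises with a λ-weighted trace (its internals are not shown in this snippet), interpolates between the other two agents:

$$G_t = R_{t+1} + \gamma\Big(\sigma_{t+1}\,Q(S_{t+1},A_{t+1}) + (1-\sigma_{t+1})\sum_a \pi(a\mid S_{t+1})\,Q(S_{t+1},a)\Big),$$

where σ = 1 recovers the Sarsa target and σ = 0 the Expected Sarsa target; the 'static' strategy above fixes σ = 0.5, halfway between the two.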
Example #3
        """
        # Select the next action A' epsilon-greedily (no action is needed in a terminal state)
        self.a = self.pi_eps(sp) if not done else -1
        # Expected Sarsa target: the expectation of Q(sp, .) under the current policy
        pi_probs = self.pi_probs(sp)
        exp_sarsa_target = np.dot(pi_probs, self.Q[sp])
        # TD update of Q[s][a] towards r + gamma * E_pi[Q(sp, .)]; do not bootstrap from terminal states
        delta = r + (self.gamma * exp_sarsa_target if not done else 0) - self.Q[s][a]
        self.Q[s][a] += self.alpha * delta
        self.t = 0 if done else self.t + 1  # update current iteration number

    def __str__(self):
        return f"ExpSarsa($\\gamma={self.gamma},\\epsilon={self.epsilon},\\alpha={self.alpha}$)"


def experiment():
    envn = 'StochWindyGridWorld-v0'
    env = gym.make(envn)
    agent = ExpSarsaAgent(env, epsilon=0.1, alpha=0.5)
    exp = f"experiments/{str(agent)}"
    train(env, agent, exp, num_episodes=200, max_runs=10)
    return env, exp


if __name__ == "__main__":
    env, q_experiment = q_agent_exp()  # get results from Q-learning
    env, sarsa_exp = experiment()
    main_plot([q_experiment, sarsa_exp], smoothing_window=10)
    plt.ylim([-100, 0])
    plt.title("Q and Sarsa learning on " + env.spec._env_name)
    plt.show()
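
Written out, the update performed by the ExpSarsaAgent fragment above is the standard Expected Sarsa rule

$$Q(s,a) \leftarrow Q(s,a) + \alpha\Big(r + \gamma \sum_{a'} \pi(a'\mid s')\,Q(s',a') - Q(s,a)\Big),$$

where the bootstrap term is dropped when s' is terminal (the "if not done else 0" branch) and π is, presumably, the same ε-greedy policy used to select actions, which makes this an on-policy method.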