def learn(self, env, episode_count=1000, gamma=0.9,
              learning_rate=0.1, render=False, report_interval=50):
        """QLearningAgent.learn
        env:環境のデータが格納された変数です。
        episode_count:エピソード回数を指定します。default:1000
        gamma:割引率を指定します。default:0.9
        render:画面に様子を表示するかどうか設定します。default:False
        report_interval:ログを保存する間隔を設定します。default:50"""
        self.init_log()
        actions = list(range(env.action_space.n))
        self.Q = defaultdict(lambda: [0] * len(actions))
        for e in tqdm(range(episode_count)):
            s = env.reset()
            done = False
            while not done:
                if render:
                    cuitools.reset()
                    env.render()
                    time.sleep(0.01)
                a = self.policy(s, actions)
                n_state, reward, done, info = env.step(a)

                gain = reward + gamma * max(self.Q[n_state])
                estimated = self.Q[s][a]
                self.Q[s][a] += learning_rate * (gain - estimated)
                s = n_state

            else:
                # The loop ended normally; log the reward from the final step.
                self.log(reward)

            if e != 0 and e % report_interval == 0:
                pass
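The learn method above also relies on policy, init_log, and log being defined on the agent, and on tqdm, time and defaultdict being imported at module level (plus cuitools when render=True). A minimal sketch of those supporting pieces and a typical call, assuming an epsilon-greedy policy and the classic Gym reset/step signature used in the snippet (the epsilon value and the environment name are illustrative assumptions), might look like this:

import random
import time                            # time, tqdm and defaultdict are used by learn()
from collections import defaultdict
from tqdm import tqdm

class QLearningAgent:

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon   # exploration rate (assumed default)
        self.Q = {}              # replaced by a defaultdict inside learn()
        self.reward_log = []

    def init_log(self):
        self.reward_log = []

    def log(self, reward):
        self.reward_log.append(reward)

    def policy(self, s, actions):
        # Epsilon-greedy: explore with probability epsilon, otherwise
        # take the action with the highest estimated value in state s.
        if random.random() < self.epsilon:
            return random.choice(actions)
        q_values = self.Q[s]
        if not any(q_values):
            return random.choice(actions)
        return q_values.index(max(q_values))

    # learn(...) as defined above goes here.

# Typical call (environment name is just an example):
# agent = QLearningAgent()
# agent.learn(gym.make("FrozenLake-v1"), episode_count=500)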
    def learn(self,
              env,
              episode_count=1000,
              gamma=0.9,
              render=False,
              report_interval=50):
        """MonteCarloAgent.learn
env:環境のデータが格納された変数です。
episode_count:エピソード回数を指定します。default:1000
gamma:割引率を指定します。default:0.9
render:画面に様子を表示するかどうか設定します。default:False
report_interval:ログを保存する間隔を設定します。default:50"""
        self.init_log()
        actions = list(range(env.action_space.n))
        self.Q = defaultdict(lambda: [0] * len(actions))
        N = defaultdict(lambda: [0] * len(actions))

        for e in tqdm(range(episode_count)):
            s = env.reset()
            done = False
            # Play until the end of episode.
            experience = []
            while not done:
                if render:
                    cuitools.reset()
                    env.render()
                    time.sleep(0.01)
                a = self.policy(s, actions)
                n_state, reward, done, info = env.step(a)
                experience.append({"state": s, "action": a, "reward": reward})
                s = n_state
            else:
                self.log(reward)

            # Evaluate each state, action.
            for i, x in enumerate(experience):
                s, a = x["state"], x["action"]

                # Calculate discounted future reward of s.
                G, t = 0, 0
                for j in range(i, len(experience)):
                    G += math.pow(gamma, t) * experience[j]["reward"]
                    t += 1

                N[s][a] += 1  # count of s, a pair
                alpha = 1 / N[s][a]
                self.Q[s][a] += alpha * (G - self.Q[s][a])

            if e != 0 and e % report_interval == 0:
                pass
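Note that the evaluation loop above recomputes the discounted return G from scratch at every step, which is quadratic in the episode length. A single backward pass (a sketch, not part of the original code) yields the same per-step returns:

def discounted_returns(experience, gamma):
    # returns[i] == sum(gamma**t * experience[i + t]["reward"] for t = 0..end),
    # computed in one backward sweep via G_i = r_i + gamma * G_{i+1}.
    returns = [0.0] * len(experience)
    G = 0.0
    for i in reversed(range(len(experience))):
        G = experience[i]["reward"] + gamma * G
        returns[i] = G
    return returns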
Example #3
    def train(self,
              env,
              episode_count=1000,
              gamma=0.9,
              learning_rate=0.1,
              render=False,
              report_interval=50):
        """actor_critic.train
                env:環境のデータが格納された変数です。
                episode_count:エピソード回数を指定します。default:1000
                gamma:割引率を指定します。default:0.9
                render:画面に様子を表示するかどうか設定します。default:False
                report_interval:ログを保存する間隔を設定します。default:50"""
        actor = self.actor_class(env)
        critic = self.critic_class(env)

        actor.init_log()
        for e in tqdm(range(episode_count)):
            s = env.reset()
            done = False
            while not done:
                if render:
                    cuitools.reset()
                    env.render()
                    time.sleep(0.01)
                a = actor.policy(s)
                n_state, reward, done, info = env.step(a)

                gain = reward + gamma * critic.V[n_state]
                estimated = critic.V[s]
                td = gain - estimated
                actor.Q[s][a] += learning_rate * td
                critic.V[s] += learning_rate * td
                s = n_state

            else:
                actor.log(reward)

            if e != 0 and e % report_interval == 0:
                pass
                # actor.show_reward_log(episode=e)

        return actor, critic
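train builds its actor and critic from self.actor_class and self.critic_class, so those classes must accept env and expose policy, Q, V, init_log and log. A minimal pair of compatible classes, assuming a discrete action space and a softmax policy over the actor's preferences (both are assumptions, not taken from the original), could look like this:

import numpy as np
from collections import defaultdict

class Actor:

    def __init__(self, env):
        n_actions = env.action_space.n
        # One vector of action preferences per state, all starting at zero.
        self.Q = defaultdict(lambda: np.zeros(n_actions))
        self.reward_log = []

    def init_log(self):
        self.reward_log = []

    def log(self, reward):
        self.reward_log.append(reward)

    def policy(self, s):
        # Sample an action from a softmax over the preferences for state s.
        prefs = self.Q[s]
        probs = np.exp(prefs - prefs.max())
        probs = probs / probs.sum()
        return np.random.choice(len(probs), p=probs)

class Critic:

    def __init__(self, env):
        self.V = defaultdict(float)   # state-value estimates, default 0.0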
Example #4
    # Timing for the medium-sized list; T, ref and debut come from earlier in the script.
    _ = plus_proche(T, ref, diss)
    fin = time()
    duree = fin - debut
    print("Duration (medium):", duree)

    n = 100000  # large list
    T = []
    for _ in range(n):
        T.append(randint(0, n))
    ref = randint(0, n)

    debut = time()
    _ = plus_proche(T, ref, diss)
    fin = time()
    duree = fin - debut
    print("Duree (grand): ", duree)

    print(f"Duree (grand): {tps_plus_proche(n)}")

    N = []
    Tps = []
    for n in tqdm(range(10, 10**5, 200)):
        N.append(n)
        # measure the runtime for this list size
        Tps.append(tps_plus_proche(n))

    # plot the runtime Tps as a function of the list size N
    plt.figure()
    plt.plot(N, Tps)
    plt.show()
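The timing script above calls plus_proche, diss and tps_plus_proche, whose definitions are not part of the fragment. A plausible sketch of those helpers, assumed rather than taken from the original, is:

from random import randint
from time import time

def diss(a, b):
    # Dissimilarity between two values: absolute difference (assumption).
    return abs(a - b)

def plus_proche(T, ref, diss):
    # Linear scan for the element of T closest to ref under diss.
    meilleur = T[0]
    for x in T:
        if diss(x, ref) < diss(meilleur, ref):
            meilleur = x
    return meilleur

def tps_plus_proche(n):
    # Build a random list of size n and time a single plus_proche call.
    T = [randint(0, n) for _ in range(n)]
    ref = randint(0, n)
    debut = time()
    plus_proche(T, ref, diss)
    return time() - debut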
# tqdm must be installed
#from tqdm import tqdm                   # plain Python script
#from tqdm.notebook import tqdm           # inside a notebook
from tqdm.gui import tqdm  # for Thonny
from time import sleep
from random import randint

for i in tqdm(range(5), desc='Loop over i'):
    for j in tqdm(range(3), desc='Loop over j', leave=False):
        duree_sommeil = randint(1, 2)
        sleep(duree_sommeil)