Example #1
import os, sys
sys.path.append(os.getcwd())
import random
import numpy as np
import matplotlib.pyplot as plt
from agents.simple_rl_agent import QLearningAgent
from agents.policy import EpsGreedyQPolicy
from envs.grid_env import GridEnv

if __name__ == '__main__':

    grid_env = GridEnv()  # initialize the grid environment
    ini_state = grid_env.start_pos
    is_goal = False
    policy = EpsGreedyQPolicy(epsilon=1.0)
    agent = QLearningAgent(actions=np.arange(5),
                           observation=ini_state,
                           policy=policy,
                           epsilon_decay_rate=0.99)
    nb_episode = 10000  # number of episodes
    rewards = []  # total reward of each episode, for evaluation
    for episode in range(nb_episode):
        episode_reward = []  # rewards collected during this episode
        while not is_goal:  # loop until the agent reaches the goal
            action = agent.act()
            state, reward, is_goal = grid_env.step(action)
            # print("action:{}, state:{}, reward:{}".format(action, state, reward))
            agent.observe(state)
            agent.get_reward(reward)
            agent.decay_alpha()
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # record the total reward of this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_goal = False

    # plot the learning curve
    plt.plot(np.arange(nb_episode), rewards)
    plt.show()
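
The tabular machinery itself is not shown in this excerpt. As a reference, a minimal sketch of what EpsGreedyQPolicy and QLearningAgent are presumably doing internally (the names below are illustrative, not the repo's API):

import numpy as np

def eps_greedy(q_values, epsilon, rng=np.random):
    # with probability epsilon explore uniformly; otherwise exploit the greedy action
    if rng.random() < epsilon:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))

def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
    # off-policy TD update: bootstrap from the greedy action in the next state
    td_target = r + gamma * np.max(Q[s_next])
    Q[s][a] += alpha * (td_target - Q[s][a])
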
Example #2

import os, sys
sys.path.append(os.getcwd())
import random
import gym
import numpy as np
import matplotlib.pyplot as plt
from agents.dqn import DQNAgent
from agents.policy import EpsGreedyQPolicy
from agents.memory import Memory

if __name__ == '__main__':
    env = gym.make('CartPole-v0')  # load the specified environment
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    actions = np.arange(nb_actions)
    policy = EpsGreedyQPolicy(eps=1.0, eps_decay_rate=0.99, min_eps=0.01)
    memory = Memory(limit=50000, maxlen=1)
    # initial observation
    obs = env.reset()
    # initialize the agent
    agent = DQNAgent(actions=actions,
                     memory=memory,
                     update_interval=200,
                     train_interval=1,
                     batch_size=32,
                     memory_interval=1,
                     observation=obs,
                     input_shape=[len(obs)],
                     id=1,
                     name=None,
                     policy=policy,  # pass the ε-greedy policy defined above
                     training=True)
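
The excerpt ends before the CartPole training loop. Below is a minimal sketch of how such a loop could drive this agent, assuming DQNAgent exposes act() and an observe(observation, reward, is_terminal) hook that stores the transition and triggers training; these method names mirror the tabular agents above and are an assumption, not the repo's confirmed API:

    nb_episode = 300
    episode_rewards = []
    for episode in range(nb_episode):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action = agent.act()
            obs, reward, done, info = env.step(action)
            agent.observe(obs, reward, done)  # assumed: store transition, train, sync target net
            total_reward += reward
        episode_rewards.append(total_reward)
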

Example #3

import gym
import numpy as np
import tensorflow as tf
from agents.dqn import DQNAgent
from agents.policy import EpsGreedyQPolicy
from agents.memory import RandomMemory  # assumed to live in agents.memory alongside Memory


def build_q_network(input_shape, nb_output):
    # simple fully connected Q-network: observation in, one Q-value per action out
    input_layer = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Flatten()(input_layer)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    output_layer = tf.keras.layers.Dense(nb_output, activation='linear')(x)
    model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)

    return model


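The RandomMemory used below is not included in this excerpt; as a reference, a minimal deque-based sketch of a uniform-sampling replay buffer (illustrative, not the repo's class):

import random
from collections import deque

class RandomReplayBuffer:
    def __init__(self, limit):
        self.buffer = deque(maxlen=limit)  # oldest transitions are evicted automatically

    def append(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform random minibatch, which breaks the temporal correlation of transitions
        return random.sample(self.buffer, batch_size)
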
env = gym.make('MountainCar-v0')
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
actions = np.arange(nb_actions)
policy = EpsGreedyQPolicy(eps=1., eps_decay_rate=.9999, min_eps=.01)
memory = RandomMemory(limit=10000)
# memory = SequentialMemory(limit=50000, maxlen=1)
ini_observation = env.reset()
loss_fn = tf.keras.losses.Huber()
optimizer = tf.keras.optimizers.Adam()
model = build_q_network(input_shape=[len(ini_observation)],
                        nb_output=len(actions))
target_model = build_q_network(input_shape=[len(ini_observation)],
                               nb_output=len(actions))
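
The gradient step itself is presumably inside DQNAgent; as a reference, a sketch of the standard double-network TD update that model, target_model, loss_fn, and optimizer are typically combined into:

def train_step(model, target_model, loss_fn, optimizer, batch, gamma=0.99):
    states, actions, rewards, next_states, dones = batch  # tensors for one minibatch
    # TD target: bootstrap from the frozen target network
    next_q = tf.reduce_max(target_model(next_states), axis=1)
    targets = rewards + gamma * (1.0 - dones) * next_q
    with tf.GradientTape() as tape:
        q_values = model(states)
        # pick out Q(s, a) for the actions actually taken
        q_taken = tf.reduce_sum(q_values * tf.one_hot(actions, q_values.shape[1]), axis=1)
        loss = loss_fn(targets, q_taken)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss
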

agent = DQNAgent(actions=actions, memory=memory, observation=ini_observation,
                 model=model, target_model=target_model,
                 loss_fn=loss_fn, optimizer=optimizer, policy=policy,
                 training=True)  # argument list truncated in the source; completed by analogy
                                 # with the CartPole example, so the exact signature is an assumption


Example #4

import os, sys
sys.path.append(os.getcwd())
import random
import numpy as np
import matplotlib.pyplot as plt
from agents.sarsa_agent import SARSAAgent
from agents.policy import EpsGreedyQPolicy
from envs.grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    policy = EpsGreedyQPolicy(epsilon=0.01)  # initialize the policy; here, ε-greedy
    agent = SARSAAgent(actions=np.arange(4),
                       observation=ini_state,
                       policy=policy)  # initialize the SARSA agent
    nb_episode = 100  # number of episodes
    rewards = []  # total reward of each episode, for evaluation
    is_goal = False  # has the agent reached the goal?
    for episode in range(nb_episode):
        episode_reward = []  # rewards collected during this episode
        while not is_goal:  # loop until the agent reaches the goal
            action = agent.act()  # select an action
            state, reward, is_goal = grid_env.step(action)
            agent.observe_state_and_reward(state, reward)  # observe the next state and the reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # record the total reward of this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_goal = False

    # plot the results
    plt.plot(np.arange(nb_episode), rewards)
    plt.xlabel("episode")
    plt.ylabel("total reward")
    plt.show()
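
For comparison with Example #1, a minimal sketch of the on-policy SARSA update the agent presumably applies internally (illustrative; the actual SARSAAgent code is not shown here). Unlike Q-learning, it bootstraps from the action a' actually chosen by the ε-greedy policy in the next state:

def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.1, gamma=0.99):
    # on-policy TD update: use Q(s', a') for the action the policy actually took
    td_target = r + gamma * Q[s_next][a_next]
    Q[s][a] += alpha * (td_target - Q[s][a])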