import os, sys
sys.path.append(os.getcwd())
import random
import numpy as np
import matplotlib.pyplot as plt
from agents.simple_rl_agent import QLearningAgent
from agents.policy import EpsGreedyQPolicy, UCB
from envs.grid_env import GridEnv

if __name__ == '__main__':
    grid_env = GridEnv()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    is_goal = False
    policy = EpsGreedyQPolicy(epsilon=1.0)
    agent = QLearningAgent(actions=np.arange(5), observation=ini_state,
                           policy=policy, epsilon_decay_rate=0.99)
    nb_episode = 10000  # number of episodes
    rewards = []
    for episode in range(nb_episode):
        episode_reward = []
        while (is_goal == False):
            action = agent.act()
            state, reward, is_goal = grid_env.step(action)
            # print("action:{}, state:{}, reward:{}".format(action, state, reward))
            agent.observe(state)
            agent.get_reward(reward)
            agent.decay_alpha()
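# For reference, the observe()/get_reward() calls above are expected to drive the
# standard tabular Q-learning update. Below is a minimal standalone sketch of that
# update; the q_table dict and the alpha/gamma defaults are illustrative assumptions,
# not the repo's actual QLearningAgent API.
from collections import defaultdict
import numpy as np

def q_learning_update(q_table, state, action, reward, next_state,
                      alpha=0.1, gamma=0.99):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    td_target = reward + gamma * np.max(q_table[next_state])
    td_error = td_target - q_table[state][action]
    q_table[state][action] += alpha * td_error
    return q_table

# usage sketch: q_table maps a hashable state to an array of 5 action values,
# matching actions=np.arange(5) in the script above
q_table = defaultdict(lambda: np.zeros(5))
q_table = q_learning_update(q_table, state=(0, 0), action=1,
                            reward=-1.0, next_state=(0, 1))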
import gym
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
from abc import ABCMeta, abstractmethod
from collections import deque, namedtuple
from agents.dqn import DQNAgent
from agents.policy import EpsGreedyQPolicy
from agents.memory import Memory
import random

if __name__ == '__main__':
    env = gym.make('CartPole-v0')  # load the specified gym environment
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    actions = np.arange(nb_actions)
    policy = EpsGreedyQPolicy(eps=1.0, eps_decay_rate=0.99, min_eps=0.01)
    memory = Memory(limit=50000, maxlen=1)
    # initial observation
    obs = env.reset()
    # initialize the agent
    agent = DQNAgent(actions=actions, memory=memory, update_interval=200,
                     train_interval=1, batch_size=32, memory_interval=1,
                     observation=obs, input_shape=[len(obs)],
                     id=1, name=None, training=True,
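# For reference, EpsGreedyQPolicy above is an epsilon-greedy selector with a decaying
# epsilon. A minimal standalone sketch follows; the function names and defaults are
# illustrative assumptions, not necessarily the repo's actual implementation.
import numpy as np

def eps_greedy_select(q_values, eps, rng=np.random):
    # with probability eps pick a uniformly random action, otherwise the greedy one
    if rng.random() < eps:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))

def decay_eps(eps, eps_decay_rate=0.99, min_eps=0.01):
    # multiplicative decay, clipped at min_eps (matching the constructor args above)
    return max(min_eps, eps * eps_decay_rate)

# usage sketch
eps = 1.0
action = eps_greedy_select(np.array([0.1, 0.5]), eps)
eps = decay_eps(eps)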
import gym
import numpy as np
import tensorflow as tf
from agents.dqn import DQNAgent
from agents.policy import EpsGreedyQPolicy
from agents.memory import RandomMemory

def build_q_network(input_shape, nb_output):
    input_layer = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.Flatten()(input_layer)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    output_layer = tf.keras.layers.Dense(nb_output, activation='linear')(x)
    model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)
    return model

env = gym.make('MountainCar-v0')
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
actions = np.arange(nb_actions)
policy = EpsGreedyQPolicy(eps=1., eps_decay_rate=.9999, min_eps=.01)
memory = RandomMemory(limit=10000)
# memory = SequentialMemory(limit=50000, maxlen=1)
ini_observation = env.reset()
loss_fn = tf.keras.losses.Huber()
optimizer = tf.keras.optimizers.Adam()
model = build_q_network(input_shape=[len(ini_observation)],
                        nb_output=len(actions))
target_model = build_q_network(input_shape=[len(ini_observation)],
                               nb_output=len(actions))
agent = DQNAgent(actions=actions,
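# For reference, a DQN training step on a sampled batch typically looks like the
# sketch below: Bellman targets come from target_model and the Huber loss is
# minimized on the online model. This is a generic illustration under assumed
# batch shapes, not the DQNAgent's actual internal code.
import tensorflow as tf

def dqn_train_step(model, target_model, loss_fn, optimizer, batch, gamma=0.99):
    states, actions, rewards, next_states, dones = batch
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    dones = tf.convert_to_tensor(dones, dtype=tf.float32)
    # Bellman target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal states
    next_q = tf.reduce_max(target_model(next_states), axis=1)
    targets = rewards + gamma * (1.0 - dones) * next_q
    with tf.GradientTape() as tape:
        q_values = model(states)
        # keep only the Q-value of the action actually taken in each transition
        action_mask = tf.one_hot(actions, q_values.shape[1])
        q_taken = tf.reduce_sum(q_values * action_mask, axis=1)
        loss = loss_fn(targets, q_taken)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss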
import os, sys
import random
import numpy as np
import matplotlib.pyplot as plt
from agents.sarsa_agent import SARSAAgent
from agents.policy import EpsGreedyQPolicy
from envs.grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    policy = EpsGreedyQPolicy(epsilon=0.01)  # initialize the policy; here, epsilon-greedy
    agent = SARSAAgent(actions=np.arange(4), observation=ini_state, policy=policy)  # initialize the SARSA agent
    nb_episode = 100  # number of episodes
    rewards = []  # store rewards for evaluation
    is_goal = False  # whether the agent has reached the goal
    for episode in range(nb_episode):
        episode_reward = []  # rewards collected during one episode
        while (is_goal == False):  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_goal = grid_env.step(action)
            agent.observe_state_and_reward(state, reward)  # observe the next state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # record the cumulative reward of this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_goal = False

    # plot the results
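# For reference, SARSA differs from Q-learning in using the action actually selected
# in the next state (on-policy) instead of the greedy max. A minimal standalone
# sketch follows; the q_table dict and alpha/gamma defaults are illustrative
# assumptions, not the repo's actual SARSAAgent API.
from collections import defaultdict
import numpy as np

def sarsa_update(q_table, state, action, reward, next_state, next_action,
                 alpha=0.1, gamma=0.99):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    td_target = reward + gamma * q_table[next_state][next_action]
    q_table[state][action] += alpha * (td_target - q_table[state][action])
    return q_table

# usage sketch: 4 actions, matching SARSAAgent(actions=np.arange(4), ...) above
q_table = defaultdict(lambda: np.zeros(4))
q_table = sarsa_update(q_table, state=(0, 0), action=2, reward=-1.0,
                       next_state=(0, 1), next_action=1)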