def __init__(self, env, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=25,
             eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.lr = lr
    self.y = y                      # discount factor (gamma)
    self.step_cost = step_cost
    self.living_cost = living_cost
    self.s0 = env.field.index('s')  # start state: the 's' cell of the grid
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.state_len = env.width * env.height
    # two input features, one Q-value output per action
    self.nn = Model(in_features=2,
                    hidden=[self.state_len, self.state_len],
                    out_features=len(Agent.actions))
    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
    self.memory = ReplayMemory(memory_capacity)
    self.batch_size = batch_size
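The Model class constructed above is not part of this excerpt. A minimal sketch of a fully connected network matching the constructor signature used here (in_features, hidden, out_features) might look like the following; the ReLU activations and nn.Sequential layout are assumptions, not the original implementation.

import torch
import torch.nn as nn

class Model(nn.Module):
    # Hypothetical MLP matching the calls above: `hidden` is a list of
    # hidden-layer widths (possibly empty, giving a single linear layer).
    def __init__(self, in_features, hidden, out_features):
        super().__init__()
        layers, size = [], in_features
        for h in hidden:
            layers += [nn.Linear(size, h), nn.ReLU()]
            size = h
        layers.append(nn.Linear(size, out_features))  # one Q-value per action
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)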
def run_episode(self):
    AbstractAgent.run_episode(self)
    s = self.s0
    self.rewards.append(.0)
    for j in range(self.episode_length):
        q_predicted = self.predict_q(s)
        a = np.argmax(q_predicted)
        a = self.select_action(a)               # epsilon-greedy exploration
        s1, r, over = self.step(s, Agent.actions[a])
        if s != s1:
            r -= self.step_cost
        r -= self.living_cost
        # copy so the target does not alias the prediction buffer
        q_target = q_predicted.copy()
        q_target[a] = r + self.y * self.predict_q(s1).max()
        history = self.model.fit(x=self._encode_state(s),
                                 y=np.array([q_target]),
                                 epochs=1, verbose=False)
        self.losses.append(history.history["loss"][-1])
        s = s1
        self.rewards[-1] += r
        if over:
            break
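The _encode_state helper used in the fit call is referenced but not defined in this excerpt. A plausible sketch, assuming a one-hot encoding of the cell index over all width * height cells; the actual encoding may well differ (for instance, two coordinate features, as the in_features=2 constructor above suggests for the PyTorch variant).

import numpy as np

def _encode_state(self, s):
    # Hypothetical one-hot encoding of cell index s; shape (1, state_len)
    # matches what model.fit/model.predict expect for a single sample.
    x = np.zeros((1, self.env.width * self.env.height), dtype=np.float32)
    x[0, s] = 1.0
    return x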
def __init__(self, env, model, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=10,
             eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.model = model
    self.lr = lr
    self.y = y
    self.step_cost = step_cost
    self.living_cost = living_cost
    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.memory = ReplayMemory(memory_capacity)
    self.batch_size = batch_size
def run_episode(self):
    AbstractAgent.run_episode(self)
    s = self.s0
    self.rewards.append(.0)
    for j in range(self.episode_length):
        a = np.argmax(self.Q[s, :])         # greedy action from the Q-table
        a = self.select_action(a)           # epsilon-greedy exploration
        s1, r, over = self.step(s, AbstractAgent.actions[a])
        if s != s1:
            r -= self.step_cost
        r -= self.living_cost
        # tabular Q-learning update:
        # Q(s,a) <- Q(s,a) + lr * (r + y * max_a' Q(s',a') - Q(s,a))
        self.Q[s, a] = self.Q[s, a] + self.lr * (
            r + self.y * np.max(self.Q[s1, :]) - self.Q[s, a])
        s = s1
        self.rewards[-1] += r
        if over:
            break
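select_action comes from AbstractAgent and is not shown in this excerpt. A minimal epsilon-greedy sketch, assuming eps decays by eps_decay on each call (the original may instead anneal once per episode inside AbstractAgent.run_episode); names follow the constructor arguments above.

import random

def select_action(self, a):
    # Hypothetical epsilon-greedy: keep the greedy action with
    # probability 1 - eps, otherwise pick a uniformly random action.
    if random.random() < self.eps:
        a = random.randrange(len(AbstractAgent.actions))
    self.eps *= self.eps_decay   # anneal exploration over time
    return a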
def run_episode(self):
    AbstractAgent.run_episode(self)
    s = self.s0
    self.rewards.append(.0)
    for j in range(self.episode_length):
        q_predicted = self._predict_q(s)
        a = torch.argmax(q_predicted, 0).item()
        a = self.select_action(a)           # epsilon-greedy exploration
        s1, r, over = self.step(s, Agent.actions[a])
        if s != s1:
            r -= self.step_cost
        r -= self.living_cost
        self.memory.push(s, a, s1, r)       # store transition for replay
        s = s1
        self.optimize()                     # train on a sampled mini-batch
        self.rewards[-1] += r
        if over:
            break
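ReplayMemory is referenced throughout but not defined in this excerpt. A minimal sketch, assuming a fixed-capacity ring buffer with uniform random sampling; the push signature mirrors the memory.push(s, a, s1, r) calls above.

import random
from collections import deque

class ReplayMemory:
    # Hypothetical fixed-capacity transition buffer; the deque evicts the
    # oldest transitions automatically once capacity is reached.
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, s, a, s1, r):
        self.buffer.append((s, a, s1, r))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)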
def __init__(self, env, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.lr = lr
    self.y = y
    self.step_cost = step_cost
    self.living_cost = living_cost
    # one row per grid cell, one column per action
    self.Q = np.zeros((env.width * env.height, len(Agent.actions)))
    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
def __init__(self, env, model, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.lr = lr
    self.y = y
    self.step_cost = step_cost
    self.living_cost = living_cost
    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.model = model
def run_episode(self):
    AbstractAgent.run_episode(self)
    s = self.s0
    episode_number = len(self.rewards)
    self.rewards.append(.0)
    for j in range(self.episode_length):
        q_predicted = self._predict_q_policy(s)   # policy-network estimate
        a = torch.argmax(q_predicted, 0).item()
        a = self.select_action(a)
        s1, r, over = self.step(s, Agent.actions[a])
        if s != s1:
            r -= self.step_cost
        r -= self.living_cost
        self.memory.push(s, a, s1, r)
        s = s1
        self.optimize()
        self.rewards[-1] += r
        if over:
            break
    # periodically sync the frozen target network with the policy network
    if episode_number % self.target_update == 0:
        self.target_nn.load_state_dict(self.nn.state_dict())
def __init__(self, env, p=1.0, lr=0.8, y=0.95, step_cost=.0, living_cost=.0,
             episode_length=100, memory_capacity=100, batch_size=10,
             target_update=10, eps=0.5, eps_decay=0.999):
    AbstractAgent.__init__(self, eps, eps_decay)
    self.env = env
    self.lr = lr
    self.y = y
    self.step_cost = step_cost
    self.living_cost = living_cost
    # with probability p the intended action is taken; otherwise the agent
    # slips to one of the two perpendicular actions with probability q each
    q = (1.0 - p) / 2
    self.stochastic_actions = {
        '←': [[0, 2, 3], [p, q, q]],
        '→': [[1, 2, 3], [p, q, q]],
        '↑': [[2, 0, 1], [p, q, q]],
        '↓': [[3, 0, 1], [p, q, q]],
    }
    self.s0 = env.field.index('s')
    self.episode_length = episode_length
    self.rewards = []
    self.losses = []
    self.state_len = env.width * env.height
    # policy network plus a separate target network initialized to the
    # same weights; the target network is never trained directly
    self.nn = Model(in_features=self.state_len, hidden=[],
                    out_features=len(Agent.actions))
    self.target_nn = Model(in_features=self.state_len, hidden=[],
                           out_features=len(Agent.actions))
    self.target_nn.load_state_dict(self.nn.state_dict())
    self.target_nn.eval()
    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
    self.memory = ReplayMemory(memory_capacity)
    self.batch_size = batch_size
    self.target_update = target_update
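The optimize method called from run_episode is not part of this excerpt. A sketch of what a mini-batch update might look like for this target-network variant, assuming _encode_state returns a 1-D float tensor of length state_len and the ReplayMemory interface sketched earlier; this is an illustration, not the original code.

def optimize(self):
    # Hypothetical mini-batch DQN step: regress the policy network's
    # Q(s, a) toward r + y * max_a' Q_target(s', a').
    if len(self.memory) < self.batch_size:
        return
    batch = self.memory.sample(self.batch_size)
    s_batch = torch.stack([self._encode_state(s) for s, _, _, _ in batch])
    a_batch = torch.tensor([a for _, a, _, _ in batch])
    s1_batch = torch.stack([self._encode_state(s1) for _, _, s1, _ in batch])
    r_batch = torch.tensor([r for _, _, _, r in batch], dtype=torch.float32)

    q = self.nn(s_batch).gather(1, a_batch.unsqueeze(1)).squeeze(1)
    with torch.no_grad():                   # targets come from the frozen net
        q_next = self.target_nn(s1_batch).max(1).values
    loss = self.criterion(q, r_batch + self.y * q_next)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.losses.append(loss.item())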