def doAction(self, chaser: "Agent", target: "Agent", grid: "GridMap", state: "State"):
    """Choose and execute one epsilon-greedy action for the target, then
    apply a Q-learning update based on a simulated next state.

    Args:
        chaser: the pursuing agent (used to build the state and to
            predict its next move).
        target: the evading agent this policy controls; moved in place
            via target.Walk(action).
        grid:   the map, used for movability checks.
        state:  state helper providing getState / canMoveDirection /
            nextDirection.

    Side effects: appends to self.st and self.act, moves `target`, and
    updates one entry of self.q.
    """
    # Record the current state s = (relative position encoding, 3 indices).
    now_st = state.getState(chaser, target)
    self.st.append(now_st)

    # --- Action selection -------------------------------------------------
    # Explore with probability epsilon, or when this state's Q row is all
    # zeros (no information learned yet); otherwise act greedily over the
    # movable directions.
    action = -1
    p = random.random()
    if p < epsilon or (not np.any(self.q[now_st[0]][now_st[1]][now_st[2]])):
        while True:
            action = random.randint(0, 3)
            if state.canMoveDirection(target, grid, action):
                break
    else:
        max_q = -100000000
        for i in range(4):
            if max_q < self.q[now_st[0]][now_st[1]][now_st[2]][i] and state.canMoveDirection(target, grid, i):
                max_q = self.q[now_st[0]][now_st[1]][now_st[2]][i]
                action = i

    # Execute and record the chosen action.
    target.Walk(action)
    self.act.append(action)

    # --- Predict the next state on deep copies ----------------------------
    # Deep copies keep the real agents/state untouched while simulating.
    # NOTE(review): `target` has already taken `action` above, so
    # tmp_target.Walk(action) applies the same action a second time —
    # presumably a one-step look-ahead of the target; confirm intent.
    tmp_chaser = copy.deepcopy(chaser)
    tmp_target = copy.deepcopy(target)
    tmp_state = copy.deepcopy(state)
    tmp_target.Walk(action)
    tmp_chaser.Walk(tmp_state.nextDirection(chaser, target, grid))
    next_st = state.getState(tmp_chaser, tmp_target)

    # --- Estimate max_a' Q(s', a') with the same epsilon-greedy scheme ----
    # NOTE(review): this branch checks movability with grid.canMove(...)
    # while the selection above uses state.canMoveDirection(...) —
    # presumably equivalent; verify against the rest of the project.
    nextMax_q = -100000000
    p = random.random()
    if p < epsilon or (not np.any(self.q[next_st[0]][next_st[1]][next_st[2]])):
        while True:
            next_action = random.randint(0, 3)
            if state.canMoveDirection(target, grid, next_action):
                # BUGFIX: was `nextMax_q = nextMax_q = ...` (duplicated
                # assignment); same behavior, cleaned up.
                nextMax_q = self.q[next_st[0]][next_st[1]][next_st[2]][next_action]
                break
    else:
        for i in range(4):
            if nextMax_q < self.q[next_st[0]][next_st[1]][next_st[2]][i] and grid.canMove(next_st[0] + dx[i], next_st[1] + dy[i]):
                nextMax_q = self.q[next_st[0]][next_st[1]][next_st[2]][i]
        # BUGFIX: if no direction was movable the -100000000 sentinel
        # leaked into the update below and poisoned Q(s, a); treat a
        # dead-end next state as having zero future value instead.
        if nextMax_q == -100000000:
            nextMax_q = 0

    # --- Q-learning update ------------------------------------------------
    # Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + ganma * max_a' Q(s',a'))
    self.q[now_st[0]][now_st[1]][now_st[2]][action] = (
        (1 - alpha) * self.q[now_st[0]][now_st[1]][now_st[2]][action]
        + alpha * (self.getReward(tmp_chaser, tmp_target, tmp_state) + ganma * nextMax_q)
    )
def greedy_doAction(self, chaser: "Agent", target: "Agent", grid: "GridMap", state: "State"):
    """Move the target one step using the purely greedy (no exploration) policy.

    Picks the movable direction with the highest Q-value for the current
    state and executes it via target.Walk. On strict ties the lowest
    direction index wins; if no direction is movable, direction 0 is used.
    """
    now = state.getState(chaser, target)
    q_row = self.q[now[0]][now[1]][now[2]]
    best_action = 0
    best_q = -100000000
    for direction in range(4):
        if not state.canMoveDirection(target, grid, direction):
            continue
        if q_row[direction] > best_q:
            best_q = q_row[direction]
            best_action = direction
    target.Walk(best_action)
def doAction(self, chaser: "Agent", target: "Agent", grid: "GridMap", state: "State"):
    """Epsilon-greedy step for the target (no learning update here).

    Records the current state in self.st, picks an action — random when
    exploring (probability epsilon, or when this state's Q row is all
    zeros), greedy over movable directions otherwise — then executes it
    via target.Walk and records it in self.act.
    """
    # Record the current state.
    now = state.getState(chaser, target)
    self.st.append(now)

    q_row = self.q[now[0]][now[1]][now[2]]
    if random.random() < epsilon or not np.any(q_row):
        # Explore: keep drawing until a movable direction comes up.
        chosen = random.randint(0, 3)
        while not state.canMoveDirection(target, grid, chosen):
            chosen = random.randint(0, 3)
    else:
        # Exploit: movable direction with the highest Q-value.
        chosen = -1
        best = -100000000
        for direction in range(4):
            if state.canMoveDirection(target, grid, direction) and q_row[direction] > best:
                best = q_row[direction]
                chosen = direction

    # Execute and record the action.
    target.Walk(chosen)
    self.act.append(chosen)
def chaseTarget(self, chaser: "Agent", target: "Agent", grid: "GridMap", state: "State"):
    """Advance the chaser one step toward the target.

    The direction comes from state.nextDirection (the pathing helper);
    the chaser is moved in place via chaser.Walk.
    """
    chaser.Walk(state.nextDirection(chaser, target, grid))