class SimpleQBot(qbot.QBot):
    """A simple Q-bot

    Attributes:
        entity_iden (int): the entity we are controlling
        model (FeedforwardComplex): the model that does the evaluating
        teacher (FFTeacher): the teacher for the model
        optimizer (torch.optim.Optimizer): the optimizer for the network
        criterion (callable): the evaluator for the network
        offline (OfflineLearner): the offline learner
        encoder (Encoder): the encoder
    """

    def __init__(self, entity_iden):
        self.entity_iden = entity_iden
        self.model = gen.init_or_load_model(_init_model, MODELFILE)
        self.teacher = FFTeacher()
        self.optimizer = torch.optim.Adam(
            [p for p in self.model.parameters() if p.requires_grad], lr=0.003)
        self.criterion = torch.nn.MSELoss()
        self.encoder = _init_encoder(entity_iden)
        self.offline = OfflineLearner(self._learn, heap_size=10)

    def __call__(self, entity_iden):
        self.entity_iden = entity_iden
        self.encoder = _init_encoder(entity_iden)

    @property
    def cutoff(self):
        return 3

    @property
    def alpha(self):
        return 0.3

    def evaluate(self, game_state: GameState, move: Move) -> float:
        result = torch.tensor([0.0], dtype=torch.float)
        self.teacher.classify(self.model,
                              self.encoder.encode(game_state, move),
                              result)
        return float(result.item())

    def learn(self, game_state: GameState, move: Move, reward: float) -> None:
        self.offline(game_state, move, reward)

    def think(self, max_time: float):
        self.offline.think(max_time)

    def _learn(self, game_state: GameState, move: Move, reward: float) -> float:
        self.teacher.teach(self.model, self.optimizer, self.criterion,
                           self.encoder.encode(game_state, move),
                           torch.tensor([reward], dtype=torch.float32))
        # report the magnitude of the reward back to the offline learner
        return abs(reward)

    def save(self) -> None:
        gen.save_model(self.model, MODELFILE)
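
# OfflineLearner is project-internal, so only its call pattern is visible above:
# SimpleQBot queues samples via self.offline(...), later spends a time budget in
# think(max_time), and _learn returns abs(reward), which presumably feeds the
# small priority heap (heap_size=10). The class below is a minimal, hypothetical
# sketch of that pattern under those assumptions; it is not the project's actual
# OfflineLearner implementation.
import heapq
import itertools
import time


class _SketchOfflineLearner:
    """Hypothetical illustration only; not this project's OfflineLearner."""

    def __init__(self, learner, heap_size=10):
        self.learner = learner      # callback that trains on a sample and returns its priority
        self.heap_size = heap_size  # maximum number of samples retained for replay
        self._heap = []             # min-heap of (priority, tiebreak, sample args)
        self._counter = itertools.count()

    def __call__(self, *args):
        # Train on the sample once and keep it, evicting the lowest-priority
        # sample whenever the heap grows beyond its capacity.
        priority = self.learner(*args)
        heapq.heappush(self._heap, (priority, next(self._counter), args))
        if len(self._heap) > self.heap_size:
            heapq.heappop(self._heap)

    def think(self, max_time):
        # Re-train on the retained samples, highest priority first, until the
        # time budget is exhausted.
        deadline = time.perf_counter() + max_time
        for _priority, _tiebreak, args in sorted(self._heap, reverse=True):
            if time.perf_counter() >= deadline:
                break
            self.learner(*args)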
class SimpleBot(Bot):
    """Simple pathfinding bot

    Attributes:
        history (deque[tuple[GameState, Move]]): recent (game state, chosen move)
            pairs, where the left corresponds to len(history) ticks ago and the
            right corresponds to the last tick
        model (FeedforwardComplex): the model that predicts q-values
        teacher (FFTeacher): the teacher for the model
        optimizer (torch.optim.Optimizer): the optimizer
        criterion (callable): the loss criterion
    """

    def __init__(self, entity_iden: int):
        super().__init__(entity_iden)
        self.model = _init_or_load_model()
        self.history = deque()
        self.teacher = FFTeacher()
        self.optimizer = torch.optim.Adam(
            [p for p in self.model.parameters() if p.requires_grad], lr=0.003)
        self.criterion = torch.nn.MSELoss()
        self.spam_loss = False
        self.spam_moves = False
        self.print_loss_improves = True
        self.random_perc = 0.2
        self.best_loss = float('inf')
        self.next_save = 50

    def move(self, game_state: GameState):
        gs_copy = ser.deserialize(ser.serialize(game_state))
        self.history.append((gs_copy, None))
        if len(self.history) == CUTOFF + 1:
            self.teach()

        move = self.eval(game_state)
        if np.random.uniform(0, 1) < self.random_perc:
            move = random.choice(MOVE_MAP)

        # replace the placeholder entry with the move we actually chose
        self.history.pop()
        self.history.append((gs_copy, move))

        self.next_save -= 1
        if self.next_save <= 0:
            self.save()
            self.next_save = 50
        return move

    def finished(self, game_state: GameState, result):
        self.save()

    def save(self):
        """saves the model"""
        print(f'[simplebot] {time.ctime()} saving')
        sys.stdout.flush()
        _save_model(self.model)

    def teach(self):
        """Must be called when we have CUTOFF+1 history. Takes the oldest
        history item, calculates the value for the finite series of diminished
        rewards, and then trains the network on that"""
        original, og_move = self.history.popleft()
        previous = original
        penalty = 1
        reward = 0
        for i in range(CUTOFF):
            reward += penalty * _reward(previous, self.history[i][0], self.entity_iden)
            previous = self.history[i][0]
            penalty *= ALPHA

        loss = self.teacher.teach(self.model, self.optimizer, self.criterion,
                                  _encode(original, self.entity_iden, og_move),
                                  torch.tensor([reward], dtype=torch.float32))
        if self.spam_loss:
            print(f'[simplebot] loss={loss}')
            sys.stdout.flush()
        if self.print_loss_improves:
            if loss < self.best_loss:
                self.best_loss = loss
                print(f'[simplebot] loss improved to {loss} for move '
                      + f'{og_move.name} reward {reward}')
                sys.stdout.flush()

    def eval(self, game_state: GameState) -> Move:
        """Chooses the best move according to our model for the given state"""
        scores = []
        out = torch.tensor([0.0])
        for move in MOVE_MAP:
            self.teacher.classify(self.model,
                                  _encode(game_state, self.entity_iden, move),
                                  out)
            scores.append(out.item())

        if self.spam_moves:
            toprint = []
            for ind, move in enumerate(MOVE_MAP):
                toprint.extend((str(move), ': ', f'{scores[ind]:.3f}'))
            print('{' + ', '.join(toprint) + '}')
            sys.stdout.flush()

        return MOVE_MAP[int(np.argmax(scores))]
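
# SimpleBot.teach regresses the network toward a finite, discounted sum of
# tick-to-tick rewards: with per-tick rewards r_0..r_{CUTOFF-1} and discount
# ALPHA, the target is r_0 + ALPHA*r_1 + ALPHA**2*r_2 + ... . The helper below
# is a standalone sketch of that arithmetic; the function name and the example
# numbers are illustrative only, not part of the project.
def _discounted_target(rewards, alpha):
    """Return sum(alpha**i * rewards[i]), accumulated the same way teach() does."""
    total = 0.0
    penalty = 1.0
    for r in rewards:
        total += penalty * r
        penalty *= alpha
    return total


# e.g. with a discount of 0.3 and per-tick rewards [1.0, 0.0, 1.0]:
# 1.0 + 0.3*0.0 + 0.09*1.0 = 1.09
assert abs(_discounted_target([1.0, 0.0, 1.0], 0.3) - 1.09) < 1e-9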
class DeepQBot(qbot.QBot):
    """The Q-bot implementation

    Attributes:
        entity_iden (int): the entity we are controlling
        model (FeedforwardComplex): the model that does the evaluating
        teacher (FFTeacher): the teacher for the model
        evaluation (bool): True to skip storing experiences, False to store them
        replay (WritableReplayBuffer, optional): the buffer for replays
        encoder (Encoder): the encoder
    """

    def __init__(self, entity_iden: int, replay_path=REPLAY_FOLDER, evaluation=False):
        self.entity_iden = entity_iden
        if not os.path.exists(EVAL_MODELFILE):
            _init_model()
        self.model = Deep1ModelEval.load(EVAL_MODELFILE)
        self.teacher = FFTeacher()
        self.evaluation = evaluation
        self.encoder = init_encoder(entity_iden)
        if not evaluation:
            self.replay = replay_buffer.FileWritableReplayBuffer(replay_path, exist_ok=True)
        else:
            self.replay = None

    def __call__(self, entity_iden):
        self.entity_iden = entity_iden
        self.encoder = init_encoder(entity_iden)

    @property
    def cutoff(self):
        return CUTOFF

    @property
    def alpha(self):
        return ALPHA

    def evaluate(self, game_state: GameState, move: Move):
        result = torch.tensor([0.0], dtype=torch.float)
        self.teacher.classify(self.model,
                              self.encoder.encode(game_state, move),
                              result)
        return float(result.item())

    def learn(self, game_state: GameState, move: Move, new_state: GameState,
              reward_raw: float, reward_pred: float) -> None:
        if self.evaluation:
            print(
                f'predicted reward: {self.evaluate(game_state, move):.2f} vs actual reward '
                + f'{reward_raw:.2f} + {reward_pred:.2f} = {reward_raw + reward_pred:.2f}'
            )
            return
        player_id = 1 if self.entity_iden == game_state.player_1_iden else 2
        self.replay.add(
            replay_buffer.Experience(
                game_state, move, self.cutoff, new_state, reward_raw, player_id,
                None,
                self.encoder.encode(game_state, move).numpy(),
                self.encoder.encode(new_state, move).numpy()))

    def save(self) -> None:
        pass
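
# The experiences DeepQBot writes are meant to be consumed by a separate offline
# trainer. The evaluation-mode print above ("reward_raw + reward_pred") suggests
# targets of the form: discounted raw rewards over `cutoff` steps plus a
# bootstrapped estimate of what follows. The function below is a hedged sketch
# of building such an n-step bootstrapped target from one stored experience; the
# function name, the direct model call, and the assumption that the remainder is
# alpha**cutoff times the best predicted follow-up value are illustrative, not
# the project's actual trainer. Bootstrapping after `cutoff` steps trades some
# bias for lower variance compared to waiting for full-episode returns.
import torch


def _bootstrapped_target(model: torch.nn.Module,
                         encoded_next_moves: torch.Tensor,
                         reward_raw: float,
                         alpha: float,
                         cutoff: int) -> torch.Tensor:
    """encoded_next_moves holds one encoded (new_state, move) row per candidate move."""
    with torch.no_grad():
        q_next = model(encoded_next_moves).squeeze(-1)  # predicted value per candidate move
    # n-step target: observed discounted rewards plus the discounted best estimate
    return torch.tensor([reward_raw + (alpha ** cutoff) * float(q_next.max())])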