def run(self):
    """Main training loop.

    Plays every configured game to completion, pushing each transition
    into replay memory and running one optimisation step per move.
    The target network is re-synced with the policy network every
    ``self.target_update`` epochs.
    """
    pytorch_utils.add_model_indicators(self.policy)

    for epoch, (game, arrange) in enumerate(self.games):
        board = Board(arrange)  # TODO change this
        state = board.get_current_board()

        for iteration in count():
            logger.log('epoch : {}, iteration : {}'.format(epoch, iteration), Color.cyan)

            # Select and execute one move.
            action = self.get_action(state)
            next_state, reward, done = self.step(board, action.item())
            if done:
                # Terminal transitions are stored with a None successor.
                next_state = None

            self.memory.push(state, action, next_state, reward)
            state = next_state
            self.train()

            if done:
                tracker.add(iterations=iteration)
                tracker.save()
                break

        # Periodic hard sync of the target network with the policy network.
        if epoch % self.target_update == 0:
            self.target.load_state_dict(self.policy.state_dict())

        if self.is_log_parameters:
            pytorch_utils.store_model_indicators(self.policy)
def step(self, board: Board, action: int):
    """Apply one flattened-index action to *board*.

    The action index is unravelled into (number, letter) board
    coordinates, played, and the raw play result is escalated to
    SUNK_SHIP or WON when the board reports those events (WON takes
    precedence and also terminates the episode).

    Returns a tuple of (next board state, reward tensor on
    ``self.device``, done flag).
    """
    num, let = unravel_index(action, [BOARD_SIZE, BOARD_SIZE])
    res = board.play(num, let)

    # Escalate the per-shot result when a larger event occurred.
    if board.is_sunk_ship():
        res = SUNK_SHIP
    done = board.is_won()
    if done:
        res = WON

    reward = torch.tensor([get_reward(res)], device=self.device)
    return board.get_current_board(), reward, done