def __init__(self, discount: float):
    """Create the CartPole-v1 environment and initialise episode state.

    Args:
        discount: Forwarded unchanged to the parent class initialiser.
    """
    super().__init__(discount)

    # Per-dimension bounds handed to ScalingObservationWrapper
    # (presumably used to rescale observations; confirm in the wrapper).
    lower_bounds = [-2.4, -2.0, -0.42, -3.5]
    upper_bounds = [2.4, 2.0, 0.42, 3.5]

    self.env = gym.make('CartPole-v1')
    self.env = ScalingObservationWrapper(
        self.env, low=lower_bounds, high=upper_bounds)

    # One Action per discrete index of the environment's action space.
    self.actions = [
        Action(index) for index in range(self.env.action_space.n)
    ]

    # The history starts with whatever env.reset() returns.
    # NOTE(review): the shape of that return value depends on the installed
    # gym version; confirm it is the bare observation.
    self.observations = [self.env.reset()]
    self.done = False
def legal_actions(self) -> List[Action]:
    """Return one Action for each entry in ``self.board.legal_moves``.

    The actions are numbered 0..n-1 by position in the legal-move
    collection; the index is not derived from the move itself.
    """
    # Materialise the moves before counting them, as the type of
    # ``legal_moves`` is not visible here and may not support len().
    move_count = len(list(self.board.legal_moves))
    return [Action(index) for index in range(move_count)]
def recurrent_inference(self, hidden_state, action) -> NetworkOutput:
    """Return a placeholder output with a uniform policy over all actions.

    Both ``hidden_state`` and ``action`` are ignored. The result is
    ``NetworkOutput(0, 0, policy, None)`` where ``policy`` maps every
    ``Action(i)`` for ``i < self.action_size`` to ``1 / self.action_size``.
    """
    action_count = self.action_size
    uniform_policy = {}
    for index in range(action_count):
        # Divide inside the loop so an action_size of 0 yields an empty
        # policy rather than raising ZeroDivisionError.
        uniform_policy[Action(index)] = 1 / action_count
    return NetworkOutput(0, 0, uniform_policy, None)
def initial_inference(self, image) -> NetworkOutput:
    """Return a placeholder output with a uniform policy over all actions.

    ``image`` is ignored. The result is ``NetworkOutput(0, 0, policy, None)``
    where ``policy`` maps every ``Action(i)`` for ``i < self.action_size``
    to ``1 / self.action_size``.
    """
    action_count = self.action_size
    uniform_policy = {}
    for index in range(action_count):
        # Divide inside the loop so an action_size of 0 yields an empty
        # policy rather than raising ZeroDivisionError.
        uniform_policy[Action(index)] = 1 / action_count
    return NetworkOutput(0, 0, uniform_policy, None)
def build_policy_logits(policy_logits):
    """Map each value in ``policy_logits[0]`` to an ``Action`` key.

    Only the first element of ``policy_logits`` is used (presumably the
    single batch entry; confirm against callers). The value at position
    ``i`` of that element is stored under ``Action(i)``.
    """
    first_row = policy_logits[0]
    logits_by_action = {}
    for index, value in enumerate(first_row):
        logits_by_action[Action(index)] = value
    return logits_by_action