Example No. 1
def play_game(config: MuZeroConfig,
              network: AbstractNetwork,
              train: bool = True) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until the end
    of the game is reached.
    """
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        root = run_mcts(config, game.action_history(), network, game, train,
                        Node(1))
        action = select_action(config,
                               len(game.history),
                               root,
                               network,
                               mode=mode_action_select)
        action = Action(int(action))
        game.apply(action)
        game.store_search_statistics(root)
    return game
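
All of the examples on this page revolve around a small Action wrapper that turns an integer index into a hashable object, so that it can be used as a dictionary key in the policy dictionaries of Examples No. 13 and No. 15-17. Its definition is not shown here; the snippet below is only a minimal sketch of such a class, modeled on the published MuZero pseudocode, and the actual class in each repository may differ (Example No. 3, for instance, clearly uses an enum instead).

class Action:
    """Hashable wrapper around an integer action index (illustrative sketch)."""

    def __init__(self, index: int):
        self.index = index

    def __hash__(self):
        return self.index

    def __eq__(self, other):
        return self.index == other.index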
Example No. 2
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('LunarLander-v2')
    self.env = ScalingObservationWrapper(self.env,
                                         low=[-1, -1, -1, -1, -1, -1, -1, -1],
                                         high=[1, 1, 1, 1, 1, 1, 1, 1])
    self.actions = list(
        map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
Example No. 3
def rulebase_actions(vision):
    objs = list(map(lambda a: a != (0, 0, 0), vision))
    length = len(objs)
    middle_idx_l = length // 2 - length // 20
    middle_idx_r = length // 2 + length // 20
    middle_objs = objs[middle_idx_l:middle_idx_r]
    left_objs = objs[:middle_idx_l]
    right_objs = objs[middle_idx_r:length]

    actions = set()

    # If there is something in the middle of the view, move forward
    if sum(middle_objs) > 0:
        actions.add(Action.F)

    # Turn toward whichever side has more objects in view
    if sum(left_objs) > sum(right_objs):
        actions.add(Action.L)
    elif sum(right_objs) > sum(left_objs):
        actions.add(Action.R)

    if len(actions) > 0:
        return actions

    # When no action could be chosen, move backward and add one random action
    actions.add(Action.B)
    actions.add(Action(random.randint(0, len(Action) - 1)))
    return actions
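
Example No. 3 evidently assumes a different Action type: an enumeration with members F, L, R and B (forward, left, right, backward), since it uses Action.F and len(Action). A minimal sketch of that assumption, together with a call on a toy vision strip, might look like this (the enum members are guesses based on the code above):

import random
from enum import Enum


class Action(Enum):  # assumed members; the real project may define more
    F = 0
    L = 1
    R = 2
    B = 3


# A 20-pixel vision strip: background everywhere except two pixels on the left
vision = [(0, 0, 0)] * 20
vision[2] = (255, 0, 0)
vision[3] = (255, 0, 0)

print(rulebase_actions(vision))  # {<Action.L: 1>}: more objects on the left side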
Example No. 4
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('Centipede-v0')
    self.env = DownSampleVisualObservationWrapper(self.env, factor=5)
    self.actions = list(
        map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
Example No. 5
def __init__(self, discount: float, vertices=10):
    super().__init__(discount)
    init_graph = nx.generators.random_graphs.gnp_random_graph(
        vertices, np.random.uniform(0.5, 1))
    self.actions = [Action(node) for node in list(init_graph.nodes())]
    self.env = init_graph
    self.observations = [self.env]
    self.done = nx.classes.function.is_empty(self.env)
    self.cover = []
Example No. 6
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('BreakoutDeterministic-v4')
    self.env = ResizeObservation(self.env, shape=(84, 84))
    self.env = GrayScaleObservation(self.env, keep_dim=True)
    self.actions = list(
        map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
Example No. 7
    def store_search_statistics(self, root: Node):
        """After each MCTS run, store the statistics generated by the search."""

        sum_visits = sum(child.visit_count for child in root.children.values())
        action_space = (Action(index) for index in range(self.action_space_size))
        self.child_visits.append([
            root.children[a].visit_count / sum_visits if a in root.children else 0
            for a in action_space
        ])
        self.root_values.append(root.value())
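
The normalized visit counts stored here are what typically become the policy targets during training, and root_values the value targets. A hedged sketch of reading one such target back out after a game has been played (the names game and state_index are only illustrative):

state_index = 0
policy_target = game.child_visits[state_index]  # one probability per action in the action space
value_estimate = game.root_values[state_index]  # MCTS value estimate of that root
assert abs(sum(policy_target) - 1.0) < 1e-6     # visit counts were normalized above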
Example No. 8
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('Acrobot-v1')
    self.env = ScalingObservationWrapper(self.env,
                                         low=[-2.4, -2.0, -0.42, -3.5],
                                         high=[2.4, 2.0, 0.42, 3.5])
    self.actions = list(
        map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
Example No. 9
def legal_actions(self) -> List[Action]:
    la = self.env.get_possible_actions()
    action_list = [Action(i) for i in la]
    if self.env.current_player == 0:
        pass
    elif self.env.current_player == 1:
        action_list = [self.reverse_action(i) for i in action_list]
    else:
        raise RuntimeError("error player {}".format(
            self.env.current_player))
    return action_list
Example No. 10
def step(self, action) -> int:
    """Execute one step of the game conditioned by the given action."""
    new_obs = self.env.copy()
    # Take the action by removing the node and its attached edges
    new_obs.remove_node(action.index)
    # Add the node back without edges to preserve the action space
    new_obs.add_node(action.index)
    self.actions = [Action(node) for node in list(new_obs.nodes())]
    self.done = nx.classes.function.is_empty(new_obs)
    self.env = new_obs
    self.observations += [self.env]
    return -1  # -1 reward per step, since a minimum-size cover is desired
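
Every step in this graph game removes one vertex (and its incident edges) and costs a reward of -1, so the episode return is minus the number of vertices picked before the graph has no edges left; the agent is thereby pushed toward a small vertex cover. The following stand-alone sketch illustrates that reward accounting with a greedy heuristic in place of the learned policy (it does not use the class above):

import networkx as nx

# Toy illustration of the reward structure: each removed vertex costs -1,
# so covering all edges in fewer steps yields a higher (less negative) return.
g = nx.gnp_random_graph(10, 0.7, seed=0)
total_reward = 0
while not nx.is_empty(g):
    node = max(g.nodes(), key=g.degree)  # greedy stand-in for the learned policy
    g.remove_node(node)
    total_reward += -1
print(total_reward)  # minus the size of the cover found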
Example No. 11
    def __init__(self, discount: float, **kwargs):
        super(Acrobot, self).__init__(discount)
        id = 'AcrobotModified-v1'
        entry_point = 'deer.helper.gym_env:ContinuableAcrobotEnv'

        max_steps = kwargs.get('max_steps', 200)
        gym.envs.register(
            id=id,
            entry_point=entry_point,
            max_episode_steps=max_steps,
        )
        self.env = gym.make(id)

        self.env = ScalingObservationWrapper(self.env,
                                             low=[-2.4, -2.0, -0.42, -3.5],
                                             high=[2.4, 2.0, 0.42, 3.5])
        self.actions = list(
            map(lambda i: Action(i), range(self.env.action_space.n)))
        self.observations = [self.env.reset()]
        self.done = False
Example No. 12
def reverse_action(self, action: Action):
    action_ind = action.index
    from_act, to_act = divmod(action_ind, 90)
    from_act = self.__reverse_pos(from_act)
    to_act = self.__reverse_pos(to_act)
    return Action(from_act * 90 + to_act)
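
An action index here apparently encodes a (from-square, to-square) pair on a board with 90 squares, which is why divmod(index, 90) splits it and each square is mirrored before re-encoding (90 squares would be consistent with a 9x10 Xiangqi board, but that is a guess). A quick arithmetic check of the encoding, with a hypothetical reverse_pos standing in for the private __reverse_pos:

def reverse_pos(pos):  # hypothetical mirror of a square index on an assumed 10x9 board
    row, col = divmod(pos, 9)
    return (9 - row) * 9 + (8 - col)


index = 12 * 90 + 77                # a move from square 12 to square 77
from_sq, to_sq = divmod(index, 90)  # -> (12, 77)
mirrored = reverse_pos(from_sq) * 90 + reverse_pos(to_sq)
print(from_sq, to_sq, mirrored)     # the mirrored index encodes the same move as seen by the other player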
Example No. 13
def build_policy_logits(policy_logits):
    return {
        Action(i): logit
        for i, logit in enumerate(policy_logits.reshape(-1))
    }
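
build_policy_logits just keys a flat logits tensor by Action; callers presumably pass the output of the policy head. A small usage sketch with a NumPy array (assuming the hashable Action wrapper sketched under Example No. 1):

import numpy as np

policy_logits = np.array([[0.1, 0.5, -0.2, 0.3]])  # e.g. a (1, num_actions) policy head output
policy = build_policy_logits(policy_logits)
print(policy[Action(1)])  # 0.5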
Example No. 14
def random_select_action():
    return set([Action(random.randint(0, len(Action) - 1))])
Example No. 15
def recurrent_inference(self, hidden_state, action) -> NetworkOutput:
    return NetworkOutput(
        0, 0,
        {Action(i): 1 / self.action_size
         for i in range(self.action_size)}, None)
Example No. 16
def initial_inference(self, image) -> NetworkOutput:
    return NetworkOutput(
        0, 0,
        {Action(i): 1 / self.action_size
         for i in range(self.action_size)}, None)
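
Examples No. 15 and No. 16 are stubs of a uniform network that always predicts value 0, reward 0 and an even policy over all actions, e.g. for a random baseline or untrained inference. They assume a NetworkOutput container like the one in the published MuZero pseudocode; a minimal sketch of that assumed type (hidden_state made optional because these stubs pass None):

import typing
from typing import Dict, List, Optional


class NetworkOutput(typing.NamedTuple):
    value: float
    reward: float
    policy_logits: Dict['Action', float]
    hidden_state: Optional[List[float]]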
Example No. 17
def build_policy_logits(policy_logits):
    return {Action(i): logit for i, logit in enumerate(policy_logits[0])}
Example No. 18
def idx_2_action(index):
    return Action(index)
Example No. 19
def action_space(self) -> List[Action]:
    return [Action(i) for i in range(self.action_space_size)]