def play_game(config: MuZeroConfig, network: AbstractNetwork, train: bool = True) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until
    the end of the game is reached.
    """
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        # We then run a Monte Carlo Tree Search using only action sequences and
        # the model learned by the networks.
        root = run_mcts(config, game.action_history(), network, game, train, Node(1))
        action = select_action(config, len(game.history), root, network, mode=mode_action_select)
        action = Action(int(action))
        game.apply(action)
        game.store_search_statistics(root)
    return game
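# A minimal self-play loop sketch showing how play_game above would typically be
# driven. `storage`, `replay_buffer` and their latest_network()/save_game()
# methods are assumptions about the surrounding repo, not confirmed APIs.
def run_selfplay(config: MuZeroConfig, storage, replay_buffer, num_games: int):
    for _ in range(num_games):
        network = storage.latest_network()             # most recent checkpoint (assumed API)
        game = play_game(config, network, train=True)  # generate one game with MCTS
        replay_buffer.save_game(game)                  # store it as training data (assumed API)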
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('LunarLander-v2')
    self.env = ScalingObservationWrapper(self.env,
                                         low=[-1, -1, -1, -1, -1, -1, -1, -1],
                                         high=[1, 1, 1, 1, 1, 1, 1, 1])
    self.actions = list(map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
def rulebase_actions(vision):
    # Mark which cells in the field of view contain something (non-black pixels).
    objs = list(map(lambda a: a != (0, 0, 0), vision))
    length = len(objs)
    middle_idx_l = length // 2 - length // 20
    middle_idx_r = length // 2 + length // 20
    middle_objs = objs[middle_idx_l:middle_idx_r]
    left_objs = objs[:middle_idx_l]
    right_objs = objs[middle_idx_r:length]

    actions = set()

    # Something is in the middle of the view: go forward.
    if sum(middle_objs) > 0:
        actions.add(Action.F)

    # Turn toward whichever side has more objects.
    if sum(left_objs) > sum(right_objs):
        actions.add(Action.L)
    elif sum(right_objs) > sum(left_objs):
        actions.add(Action.R)

    if len(actions) > 0:
        return actions

    # Nothing to react to: back up and add one random action.
    actions.add(Action.B)
    actions.add(Action(random.randint(0, len(Action) - 1)))
    return actions
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('Centipede-v0')
    self.env = DownSampleVisualObservationWrapper(self.env, factor=5)
    self.actions = list(map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
def __init__(self, discount: float, vertices=10):
    super().__init__(discount)
    # Start from a random G(n, p) graph; the observation is the graph itself
    # and each node is an available action.
    init_graph = nx.generators.random_graphs.gnp_random_graph(
        vertices, np.random.uniform(0.5, 1))
    self.actions = [Action(node) for node in list(init_graph.nodes())]
    self.env = init_graph
    self.observations = [self.env]
    self.done = nx.classes.function.is_empty(self.env)
    self.cover = []
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('BreakoutDeterministic-v4')
    self.env = ResizeObservation(self.env, shape=(84, 84))
    self.env = GrayScaleObservation(self.env, keep_dim=True)
    self.actions = list(map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
def store_search_statistics(self, root: Node):
    """After each MCTS run, store the statistics generated by the search."""
    sum_visits = sum(child.visit_count for child in root.children.values())
    action_space = (Action(index) for index in range(self.action_space_size))
    self.child_visits.append([
        root.children[a].visit_count / sum_visits if a in root.children else 0
        for a in action_space
    ])
    self.root_values.append(root.value())
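# Hedged sketch of how the statistics stored above are typically read back as
# training targets. `game` is assumed to expose child_visits / root_values exactly
# as filled in by store_search_statistics; the helper names are illustrative only.
def policy_target(game, state_index: int):
    # Normalised root visit counts act as the policy target for this position.
    return game.child_visits[state_index]

def value_target(game, state_index: int):
    # Search value of the root node for this position (no n-step bootstrapping here).
    return game.root_values[state_index]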
def __init__(self, discount: float):
    super().__init__(discount)
    self.env = gym.make('Acrobot-v1')
    self.env = ScalingObservationWrapper(self.env,
                                         low=[-2.4, -2.0, -0.42, -3.5],
                                         high=[2.4, 2.0, 0.42, 3.5])
    self.actions = list(map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
def legal_actions(self) -> List[Action]:
    action_list = [Action(i) for i in self.env.get_possible_actions()]
    if self.env.current_player == 1:
        # Actions are expressed from player 0's perspective, so mirror them
        # for player 1.
        action_list = [self.reverse_action(a) for a in action_list]
    elif self.env.current_player != 0:
        raise RuntimeError("unexpected player {}".format(self.env.current_player))
    return action_list
def step(self, action) -> int:
    """Execute one step of the game conditioned by the given action."""
    new_obs = self.env.copy()
    # Take the action by removing the chosen node and its attached edges.
    new_obs.remove_node(action.index)
    # Re-add an isolated node so the action space keeps the same size.
    new_obs.add_node(action.index)
    self.actions = [Action(node) for node in list(new_obs.nodes())]
    self.done = nx.classes.function.is_empty(new_obs)
    self.env = new_obs
    self.observations += [self.env]
    return -1  # -1 reward per step, since a minimum-size cover is desired
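# Hedged usage sketch for the vertex-cover style game above. The surrounding
# class name is not shown in these snippets, so `VertexCoverGame` is a placeholder.
game = VertexCoverGame(discount=0.997, vertices=10)
total_reward = 0
while not game.done:
    node = random.choice(game.actions)  # pick any remaining node
    total_reward += game.step(node)     # each removal costs -1
# The episode ends once the graph has no edges left, i.e. the removed nodes form
# a vertex cover; total_reward is minus the number of removals performed.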
def __init__(self, discount: float, **kwargs):
    super(Acrobot, self).__init__(discount)
    # Register a modified, continuable Acrobot environment with a step limit.
    id = 'AcrobotModified-v1'
    entry_point = 'deer.helper.gym_env:ContinuableAcrobotEnv'
    max_steps = kwargs.get('max_steps', 200)
    gym.envs.register(
        id=id,
        entry_point=entry_point,
        max_episode_steps=max_steps,
    )
    self.env = gym.make(id)
    self.env = ScalingObservationWrapper(self.env,
                                         low=[-2.4, -2.0, -0.42, -3.5],
                                         high=[2.4, 2.0, 0.42, 3.5])
    self.actions = list(map(lambda i: Action(i), range(self.env.action_space.n)))
    self.observations = [self.env.reset()]
    self.done = False
def reverse_action(self, action: Action):
    # Decode the flat action index into (from, to) positions, mirror each
    # position, and re-encode into a flat index.
    from_act, to_act = divmod(action.index, 90)
    from_act = self.__reverse_pos(from_act)
    to_act = self.__reverse_pos(to_act)
    return Action(from_act * 90 + to_act)
def build_policy_logits(policy_logits):
    return {
        Action(i): logit
        for i, logit in enumerate(policy_logits.reshape(-1))
    }
def random_select_action():
    return {Action(random.randint(0, len(Action) - 1))}
def recurrent_inference(self, hidden_state, action) -> NetworkOutput:
    # Uniform-policy stub: zero value and reward, equal probability for every action.
    return NetworkOutput(
        0, 0,
        {Action(i): 1 / self.action_size for i in range(self.action_size)},
        None)
def initial_inference(self, image) -> NetworkOutput:
    # Uniform-policy stub: zero value and reward, equal probability for every action.
    return NetworkOutput(
        0, 0,
        {Action(i): 1 / self.action_size for i in range(self.action_size)},
        None)
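# Hedged sketch of how the uniform NetworkOutput stubs above would usually be
# consumed when expanding a search node during MCTS. `Node` is the repo's
# search-node class; the NetworkOutput fields are assumed to be
# (value, reward, policy_logits, hidden_state) in the order used above.
def expand_node(node: Node, actions, network_output: NetworkOutput):
    node.hidden_state = network_output.hidden_state
    node.reward = network_output.reward
    policy = {a: network_output.policy_logits[a] for a in actions}
    policy_sum = sum(policy.values())
    for action, p in policy.items():
        # Children start with a normalised prior and zero visits.
        node.children[action] = Node(p / policy_sum)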
def build_policy_logits(policy_logits):
    return {Action(i): logit for i, logit in enumerate(policy_logits[0])}
def idx_2_action(index):
    return Action(index)
def action_space(self) -> List[Action]:
    return [Action(i) for i in range(self.action_space_size)]