def expand(self, state: np.ndarray) -> (list, np.ndarray, torch.tensor, tuple):
    # Initialize needed data structures
    states = cube.repeat_state(state, self.workers)
    states_oh = cube.as_oh(states)
    paths = np.empty((self.workers, self.depth), dtype=int)  # Index n contains the path for worker n
    new_states = np.empty((self.workers * self.depth, *cube.shape()), dtype=cube.dtype)
    new_states_oh = torch.empty(self.workers * self.depth, cube.get_oh_shape(), dtype=torch.float, device=gpu)

    # Expand for self.depth iterations
    for d in range(self.depth):
        # Use epsilon-greedy to decide where to use policy actions and where to use random actions
        use_random = np.random.choice(2, self.workers, p=[1-self.epsilon, self.epsilon]).astype(bool)
        use_policy = ~use_random
        actions = np.empty(self.workers, dtype=int)
        # Random actions
        actions[use_random] = np.random.randint(0, cube.action_dim, use_random.sum())
        # Policy actions
        p = self.net(states_oh[use_policy], value=False).cpu().numpy()
        actions[use_policy] = p.argmax(axis=1)
        # Update paths
        paths[:, d] = actions

        # Expand using selected actions
        faces, dirs = cube.indices_to_actions(actions)
        states = cube.multi_rotate(states, faces, dirs)
        states_oh = cube.as_oh(states)
        solved_states = cube.multi_is_solved(states)
        if np.any(solved_states):
            self._explored_states += (d + 1) * self.workers
            w = np.where(solved_states)[0][0]
            return paths, None, None, (w, d + 1)
        new_states[self._get_indices(d)] = states
        new_states_oh[self._get_indices(d)] = states_oh

    self._explored_states += len(new_states)
    return paths, new_states, new_states_oh, (-1, -1)
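# --- Illustrative sketch (not part of the agent): the epsilon-greedy split used in expand() above,
# shown standalone. The policy logits, worker count, and action count are hypothetical stand-ins for
# the network output and cube constants; the random/argmax split itself mirrors the method above.
import numpy as np

def epsilon_greedy_actions(policy_logits: np.ndarray, epsilon: float) -> np.ndarray:
    """For each worker, take a uniformly random action with probability epsilon,
    otherwise the argmax of that worker's policy logits."""
    n_workers, action_dim = policy_logits.shape
    use_random = np.random.random(n_workers) < epsilon
    actions = policy_logits.argmax(axis=1)
    actions[use_random] = np.random.randint(0, action_dim, use_random.sum())
    return actions

# Example: 5 workers, 12 actions (as on a 3x3 cube), 20 % exploration
print(epsilon_greedy_actions(np.random.randn(5, 12), epsilon=0.2))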
def onehot(self, n: int):
    self.log.section(f"Benchmarking {TickTock.thousand_seps(n)} one-hot encodings, {_repstr()}")
    states = _get_states((n,))
    pname = f"One-hot encoding single state, {_repstr()}"
    for state in states.squeeze():
        self.tt.profile(pname)
        cube.as_oh(state)
        self.tt.end_profile()
    self._log_method_results("Average state encoding time", pname)
def multi_onehot(self, n: int, n_states: int):
    self.log.section(
        f"Benchmarking {TickTock.thousand_seps(n)} one-hot encodings of "
        f"{TickTock.thousand_seps(n_states)} states each, {_repstr()}"
    )
    all_states = _get_states((n, n_states))
    pname = f"One-hot encoding {TickTock.thousand_seps(n_states)} states, {_repstr()}"
    for states in all_states:
        self.tt.profile(pname)
        cube.as_oh(states)
        self.tt.end_profile()
    self._log_method_results("Average state encoding time", pname, n_states)
def load(load_dir: str, logger=NullLogger(), load_best=False):
    """ Load a model from a configuration directory """
    model_path = os.path.join(load_dir, "model.pt" if not load_best else "model-best.pt")
    conf_path = os.path.join(load_dir, "config.json")
    with open(conf_path, encoding="utf-8") as conf:
        try:
            state_dict = torch.load(model_path, map_location=gpu)
        except FileNotFoundError:
            model_path = os.path.join(load_dir, "model.pt")
            state_dict = torch.load(model_path, map_location=gpu)
        config = ModelConfig.from_json_dict(json.load(conf))

    model = Model.create(config, logger)
    model.load_state_dict(state_dict)
    model.to(gpu)
    # A feedforward is performed once here, as the first pass after loading is slow.
    # This avoids skewing evaluation results
    with torch.no_grad():
        model.eval()
        model(cube.as_oh(cube.get_solved()))
        model.train()
    return model
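# --- Illustrative sketch (toy model, not the repository's Model class): the warm-up idea used in load()
# above. The first forward pass after loading pays one-time setup costs (e.g. CUDA/cuDNN initialization
# and allocation), so it is done up front rather than inside timed evaluation.
import time
import torch

toy_model = torch.nn.Sequential(torch.nn.Linear(480, 512), torch.nn.ReLU(), torch.nn.Linear(512, 12))
dummy_input = torch.zeros(1, 480)  # 480 = 20 x 24, the flattened one-hot cube representation

with torch.no_grad():
    toy_model.eval()
    t0 = time.perf_counter()
    toy_model(dummy_input)  # First pass: includes one-time setup
    t1 = time.perf_counter()
    toy_model(dummy_input)  # Second pass: representative of steady-state latency
    t2 = time.perf_counter()
print(f"First pass: {t1 - t0:.4f} s, second pass: {t2 - t1:.4f} s")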
def search(self, state: np.ndarray, time_limit: float=None, max_states: int=None) -> bool:
    time_limit, max_states = self.reset(time_limit, max_states)
    self.tt.tick()

    self.indices[state.tostring()] = 1
    self.states[1] = state
    if cube.is_solved(state):
        return True

    oh = cube.as_oh(state)
    p, v = self.net(oh)
    self.P[1] = p.softmax(dim=1).cpu().numpy()
    self.V[1] = v.cpu().numpy()

    indices_visited = [1]
    actions_taken = []
    while self.tt.tock() < time_limit and len(self) + cube.action_dim <= max_states:
        self.tt.profile("Expanding leaves")
        solve_leaf_index, solve_action = self.expand_leaf(indices_visited, actions_taken)
        self.tt.end_profile("Expanding leaves")

        # If a solution is found
        if solve_leaf_index != -1:
            self.action_queue = deque(actions_taken) + deque([solve_action])
            if self.search_graph:
                self._complete_graph()
                self._shorten_action_queue(solve_leaf_index)
            return True

        # Find leaves
        indices_visited, actions_taken = self.find_leaf(time_limit)

    self.action_queue = deque(actions_taken)  # Generates a best-guess action queue in case no solution is found
    return False
def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
    substates = cube.multi_rotate(cube.repeat_state(state, cube.action_dim), *cube.iter_actions())
    solutions = cube.multi_is_solved(substates)
    if np.any(solutions):
        action = np.where(solutions)[0][0]
        return action, substates[action], True
    else:
        substates_oh = cube.as_oh(substates)
        v = self.net(substates_oh, policy=False).squeeze().cpu().numpy()
        action = np.argmax(v)
        return action, substates[action], False
def cost(self, states: np.ndarray, indices: np.ndarray) -> np.ndarray:
    """The A* cost of the states using the DNN heuristic.
    Uses the value neural network; -value is used as the distance heuristic.

    It is not strictly necessary to accept both the states and their indices,
    but it speeds things up a bit to avoid recalculating the indices here.

    :param states: (batch size, *(cube_dimensions)) array of states
    :param indices: Indices in self.indices corresponding to these states
    """
    states = cube.as_oh(states)
    H = -self.net(states, value=True, policy=False)
    H = H.cpu().squeeze().detach().numpy()
    return self.lambda_ * self.G[indices] + H
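# --- Illustrative sketch (made-up numbers): the weighted A* evaluation computed by cost() above,
# f(n) = lambda * g(n) + h(n), where g is the path cost stored in self.G and h is the negated
# value estimate from the network.
import numpy as np

lambda_ = 0.1                                    # Weight on the path cost g
g = np.array([3, 5, 7])                          # Moves taken to reach each state (hypothetical)
value_estimates = np.array([-4.2, -1.3, -6.8])   # Value-head outputs (hypothetical)

h = -value_estimates                             # Higher value => closer to solved => lower heuristic
f = lambda_ * g + h                              # Roughly [4.5, 1.8, 7.5]
print(f)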
def _mcts_test(self, state: np.ndarray, search_graph: bool):
    agent = MCTS(Model.create(ModelConfig()), c=1, search_graph=search_graph)
    solved = agent.search(state, .2)

    # Indices
    assert agent.indices[state.tostring()] == 1
    for s, i in agent.indices.items():
        assert agent.states[i].tostring() == s
    assert sorted(agent.indices.values())[0] == 1
    assert np.all(np.diff(sorted(agent.indices.values())) == 1)

    used_idcs = np.array(list(agent.indices.values()))

    # States
    assert np.all(agent.states[1] == state)
    for i, s in enumerate(agent.states):
        if i not in used_idcs:
            continue
        assert s.tostring() in agent.indices
        assert agent.indices[s.tostring()] == i

    # Neighbors
    if not search_graph:
        for i, neighs in enumerate(agent.neighbors):
            if i not in used_idcs:
                continue
            state = agent.states[i]
            for j, neighbor_index in enumerate(neighs):
                assert neighbor_index == 0 or neighbor_index in agent.indices.values()
                if neighbor_index == 0:
                    continue
                substate = cube.rotate(state, *cube.action_space[j])
                assert np.all(agent.states[neighbor_index] == substate)

    # Policy and value
    with torch.no_grad():
        p, v = agent.net(cube.as_oh(agent.states[used_idcs]))
    p, v = p.softmax(dim=1).cpu().numpy(), v.squeeze().cpu().numpy()
    assert np.all(np.isclose(agent.P[used_idcs], p, atol=1e-5))
    assert np.all(np.isclose(agent.V[used_idcs], v, atol=1e-5))

    # Leaves
    if not search_graph:
        assert np.all(agent.neighbors.all(axis=1) != agent.leaves)

    # W
    assert agent.W[used_idcs].all()

    return agent, solved
def test_as_oh(self):
    state = cube.get_solved()
    oh = cube.as_oh(state)
    supposed_state = torch.zeros(20, 24, device=gpu)
    corners = [
        get_corner_pos(c, o)
        for c, o in zip(SimpleState.corners.tolist(), SimpleState.corner_orientations.tolist())
    ]
    supposed_state[torch.arange(8), corners] = 1
    sides = [
        get_side_pos(s, o)
        for s, o in zip(SimpleState.sides.tolist(), SimpleState.side_orientations.tolist())
    ]
    supposed_state[torch.arange(8, 20), sides] = 1
    assert (supposed_state.flatten() == oh).all()
def __init__(self, evaluations: np.ndarray, games: int, depth: int, extra_evals: int, reward_method: str, logger: Logger = NullLogger()):
    """Initialize containers, mostly.

    :param np.ndarray evaluations: Array of the evaluations performed on the model. Used for the more intensive analysis
    :param int depth: Rollout depth
    :param extra_evals: If != 0, extra evaluations are added for the first `extra_evals` rollouts
    """
    self.games = games
    self.depth = depth
    self.depths = np.arange(depth)
    self.extra_evals = min(evaluations[-1] if len(evaluations) else 0, extra_evals)  # Won't add evals in the future (or if no evals are needed)
    self.evaluations = np.unique(np.append(evaluations, range(self.extra_evals)))
    self.reward_method = reward_method

    self.orig_params = None
    self.params = None

    self.first_states = np.stack((
        cube.get_solved(),
        *cube.multi_rotate(cube.repeat_state(cube.get_solved(), cube.action_dim), *cube.iter_actions())
    ))
    self.first_states = cube.as_oh(self.first_states)
    self.first_state_values = list()

    self.substate_val_stds = list()

    self.avg_value_targets = list()
    self.param_changes = list()
    self.param_total_changes = list()

    self.policy_entropies = list()
    self.rollout_policy = list()

    self.log = logger
    self.log.verbose(f"Analysis of this training was enabled. Extra analysis is done for evaluations and for the first {extra_evals} rollouts")
def ADI_traindata(self, net, alpha: float):
    """ Training data generation

    Implements Autodidactic Iteration as per McAleer, Agostinelli, Shmakov, and Baldi,
    "Solving the Rubik's Cube Without Human Knowledge", section 4.1.
    Loss weighting is dependent on `self.loss_weighting`.

    :param torch.nn.Model net: The network used for generating the training data.
        According to ADI, this should be the network from the latest rollout.
    :param float alpha: Interpolation between depth-based (alpha=0) and uniform (alpha=1) loss weighting.
        Used in adaptive loss weighting.
    :return: Games * sequence_length number of observations divided into four arrays
        - states contains the Rubik's state for each data point
        - policy_targets and value_targets contain the optimal policy and value targets for each training point
        - loss_weights contains the weight for each training point (see the weighted samples subsection of the McAleer et al. paper)
    :rtype: (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)
    """
    net.eval()
    self.tt.profile("Scrambling")
    # Only include the solved state in training if using the Max Lapan convergence fix
    states, oh_states = cube.sequence_scrambler(self.rollout_games, self.rollout_depth, with_solved=self.reward_method == 'lapanfix')
    self.tt.end_profile("Scrambling")

    # Keeps track of solved states - Max Lapan's convergence fix
    solved_scrambled_states = cube.multi_is_solved(states)

    # Generates possible substates for all scrambled states. Shape: n_states*action_dim x *Cube_shape
    self.tt.profile("ADI substates")
    substates = cube.multi_rotate(np.repeat(states, cube.action_dim, axis=0), *cube.iter_actions(len(states)))
    self.tt.end_profile("ADI substates")
    self.tt.profile("One-hot encoding")
    substates_oh = cube.as_oh(substates)
    self.tt.end_profile("One-hot encoding")

    self.tt.profile("Reward")
    solved_substates = cube.multi_is_solved(substates)
    # Reward for a solved state is 1 normally, but 0 if running with reward0
    rewards = (torch.zeros if self.reward_method == 'reward0' else torch.ones)(*solved_substates.shape)
    rewards[~solved_substates] = -1
    self.tt.end_profile("Reward")

    # Generates policy and value targets
    self.tt.profile("ADI feedforward")
    while True:
        try:
            value_parts = [net(substates_oh[slice_], policy=False, value=True).squeeze() for slice_ in self._get_adi_ff_slices()]
            values = torch.cat(value_parts).cpu()
            break
        except RuntimeError as e:
            # Usually caused by running out of VRAM, in which case the batch size is reduced.
            # If not, the error is re-raised
            if "alloc" not in str(e):
                raise e
            self.log.verbose(f"Intercepted RuntimeError {e}\nIncreasing number of ADI feed forward batches from {self.adi_ff_batches} to {self.adi_ff_batches*2}")
            self.adi_ff_batches *= 2
    self.tt.end_profile("ADI feedforward")

    self.tt.profile("Calculating targets")
    values += rewards
    values = values.reshape(-1, 12)
    policy_targets = torch.argmax(values, dim=1)
    value_targets = values[np.arange(len(values)), policy_targets]
    if self.reward_method == 'lapanfix':
        # Trains on the goal state and sets its value target to 0
        value_targets[solved_scrambled_states] = 0
    elif self.reward_method == 'schultzfix':
        # Does not train on the goal state, but sets the first 12 substates to 0
        first_substates = np.zeros(len(states), dtype=bool)
        first_substates[np.arange(0, len(states), self.rollout_depth)] = True
        value_targets[first_substates] = 0
    self.tt.end_profile("Calculating targets")

    # Weighting examples according to alpha
    weighted = np.tile(1 / np.arange(1, self.rollout_depth + 1), self.rollout_games)
    unweighted = np.ones_like(weighted)
    ws, us = weighted.sum(), len(unweighted)
    loss_weights = ((1 - alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)

    if self.with_analysis:
        self.tt.profile("ADI analysis")
        self.analysis.ADI(values)
        self.tt.end_profile("ADI analysis")

    return oh_states, policy_targets, value_targets, torch.from_numpy(loss_weights).float()
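# --- Illustrative sketch (standalone copy of the weighting formula above): interpolation between
# depth-based weights 1/d (alpha = 0) and uniform weights (alpha = 1) for the ADI training examples.
# The rollout_games and rollout_depth values below are made up for demonstration.
import numpy as np

def adi_loss_weights(rollout_games: int, rollout_depth: int, alpha: float) -> np.ndarray:
    weighted = np.tile(1 / np.arange(1, rollout_depth + 1), rollout_games)  # Trust shallow scrambles more
    unweighted = np.ones_like(weighted)
    ws, us = weighted.sum(), len(unweighted)
    return ((1 - alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)

print(adi_loss_weights(rollout_games=2, rollout_depth=3, alpha=0.0))  # Pure 1/d weighting
print(adi_loss_weights(rollout_games=2, rollout_depth=3, alpha=1.0))  # Pure uniform weighting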
def expand_leaf(self, visited_states_idcs: list, actions_taken: list) -> (int, int):
    """
    Expands around the given leaf and updates V and W in all visited_states_idcs.

    :param visited_states_idcs: List of states that have been visited, including the starting node. Length n
    :param actions_taken: List of actions taken from the starting state. Length n-1
    :return: The index of the leaf that is the solution and the action that must be taken from leaf_index.
        Both are -1 if no solution is found
    """
    if len(self) + cube.action_dim > len(self.states):
        self.increase_stack_size()

    leaf_index = visited_states_idcs[-1]
    solve_leaf, solve_action = -1, -1

    self.tt.profile("Get substates")
    state = self.states[leaf_index]
    substates = cube.multi_rotate(cube.repeat_state(state), *cube.iter_actions())
    self.tt.end_profile("Get substates")

    # Check which substates have already been seen
    substate_strs = [s.tostring() for s in substates]  # Unique identifier for each substate
    get_substate_strs = lambda bools: [s for s, b in zip(substate_strs, bools) if b]  # Convenience for indexing into the list with a boolean array
    seen_substates = np.array([s in self.indices for s in substate_strs])  # Substates already in the graph
    unseen_substates = ~seen_substates  # Substates not already in the graph

    self.tt.profile("Update indices and states")
    new_states_idcs = len(self) + np.arange(unseen_substates.sum()) + 1
    new_idcs_dict = {s: i for i, s in zip(new_states_idcs, get_substate_strs(unseen_substates))}
    self.indices.update(new_idcs_dict)
    substate_idcs = np.array([self.indices[s] for s in substate_strs])
    new_substate_idcs = substate_idcs[unseen_substates]
    new_substates = substates[unseen_substates]
    self.states[new_substate_idcs] = new_substates
    self.tt.end_profile("Update indices and states")

    self.tt.profile("Update neighbors and leaf status")
    actions = np.arange(cube.action_dim)
    self.neighbors[leaf_index, actions] = substate_idcs
    self.neighbors[substate_idcs, cube.rev_actions(actions)] = leaf_index
    self.leaves[leaf_index] = False
    self.tt.end_profile("Update neighbors and leaf status")

    self.tt.profile("Check for solution")
    solved_substate = np.where(cube.multi_is_solved(substates))[0]
    if solved_substate.size:
        solve_leaf = substate_idcs[solved_substate[0]]
        solve_action = solved_substate[0]
    self.tt.end_profile("Check for solution")

    # Update policy, value, and W
    self.tt.profile("One-hot encoding")
    new_substates_oh = cube.as_oh(new_substates)
    self.tt.end_profile("One-hot encoding")
    self.tt.profile("Feedforward")
    p, v = self.net(new_substates_oh)
    p, v = p.cpu().softmax(dim=1).numpy(), v.cpu().numpy().squeeze()
    self.tt.end_profile("Feedforward")

    self.tt.profile("Update P, V, and W")
    self.P[new_substate_idcs] = p
    self.V[new_substate_idcs] = v

    best_substate_v = v.max()
    self.W[leaf_index] = self.V[self.neighbors[leaf_index]]
    self.W[new_substate_idcs] = np.tile(v, (cube.action_dim, 1)).T
    self.W[visited_states_idcs[:-1], actions_taken] = np.maximum(self.W[visited_states_idcs[:-1], actions_taken], best_substate_v)
    self.tt.end_profile("Update P, V, and W")

    # Update N and L
    self.tt.profile("Update N and L")
    if actions_taken:  # Crashes if actions_taken is empty, which happens on the first run
        self.N[visited_states_idcs[:-1], actions_taken] += 1
        self.L[visited_states_idcs[:-1], actions_taken] = 0
        self.L[visited_states_idcs[1:], cube.rev_actions(np.array(actions_taken))] = 0
    self.tt.end_profile("Update N and L")

    return solve_leaf, solve_action
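# --- Illustrative sketch (made-up indices): the W backup near the end of expand_leaf() above.
# The best value among the new substates is propagated to every (state, action) edge along the
# visited path, but only where it improves the stored value.
import numpy as np

action_dim = 12
W = np.zeros((5, action_dim))        # W[state index, action], as in the MCTS agent
visited_states_idcs = [1, 3, 4]      # Path from root to leaf (hypothetical indices)
actions_taken = [2, 7]               # Actions along that path (length n-1)
best_substate_v = 0.9                # Best value among the leaf's new substates

W[visited_states_idcs[:-1], actions_taken] = np.maximum(
    W[visited_states_idcs[:-1], actions_taken], best_substate_v
)
print(W[1, 2], W[3, 7])              # Both edges now hold 0.9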
def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
    policy = torch.nn.functional.softmax(self.net(cube.as_oh(state), value=False).cpu(), dim=1).numpy().squeeze()
    action = np.random.choice(cube.action_dim, p=policy) if self.sample_policy else policy.argmax()
    state = cube.rotate(state, *cube.action_space[action])
    return action, state, cube.is_solved(state)
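# --- Illustrative sketch (made-up distribution): the two ways _step() above can use the policy head,
# controlled by self.sample_policy: stochastic sampling versus deterministic argmax.
import numpy as np

policy = np.array([0.05, 0.60, 0.10, 0.25])  # Softmax output over four hypothetical actions

greedy_action = int(policy.argmax())                            # Always picks action 1
sampled_action = int(np.random.choice(len(policy), p=policy))   # Usually 1, but explores other actions
print(greedy_action, sampled_action)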