def test_cube_shuffle():
    random.seed(5)
    c = Cube()
    c.shuffle(4)
    shuffled_state = np.array([[[0, 1, 2], [3, 4, 5], [8, 17, 20]],
                               [[9, 10, 11], [12, 13, 14], [7, 16, 23]],
                               [[24, 21, 18], [25, 22, 19], [6, 15, 26]]])
    assert (c.state == shuffled_state).all()
def test_node_init():
    model = CNN()
    c = Cube()
    state = c.state
    node = Node(state, model, .4, .1)
    assert (node.cube_moves[2] == 'right') and (len(node.P.keys()) == 12)
def __init__(self, state, model, c, v, parent=None):
    self.parent = parent
    self.state = copy.deepcopy(state)
    self.action_taken_string = None
    self.cube_moves = [action.__name__ for action in Cube().func_list]
    self.model = model
    self.c = c
    self.v = v
    self.children = self._init_children()
    self.N = self._init_N()
    self.W = self._init_W()
    self.L = self._init_L()
    self.P = self._init_P()
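# Hedged sketch of what the per-action initializers referenced above might look
# like. The real implementations live elsewhere in the repo, so the dictionary
# layout and the softmax prior below are assumptions (consistent with
# test_node_init expecting 12 keys in node.P and with mcts_solve indexing
# children by move name).
def _init_children(self):
    # one (initially unexpanded) child slot per move
    return {move: None for move in self.cube_moves}

def _init_N(self):
    # visit counts N(s, a)
    return {move: 0 for move in self.cube_moves}

def _init_W(self):
    # value estimates W(s, a)
    return {move: 0.0 for move in self.cube_moves}

def _init_L(self):
    # virtual losses L(s, a)
    return {move: 0.0 for move in self.cube_moves}

def _init_P(self):
    # prior P(s, a) derived from the Q-network's output for this state
    # (assumes tensorflow is imported as tf in this module)
    st = tf.expand_dims(tf.convert_to_tensor(self.state), 0)  # (1, 3, 3, 3)
    priors = tf.nn.softmax(self.model(st, training=False)).numpy()[0]
    return dict(zip(self.cube_moves, priors))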
def get_validation_cubes(val_num_shuffles=1, validation_count=100):
    '''
    Get a set of validation cubes that remains consistent over the training period

    Parameters:
    ------------
    val_num_shuffles : int
        number of times each validation cube is shuffled
    validation_count : int
        number of validation cubes

    Returns:
    --------
    validation_cubes : list
        list of rubiks_cube.environment.cube.Cube() objects
    '''
    validation_cubes = []
    for _ in range(validation_count):
        val_cube = Cube()
        val_cube.shuffle(val_num_shuffles)
        validation_cubes.append(val_cube)
    return validation_cubes
def get_val_acc(model, validation_cubes, val_max_time_steps=5,
                val_solve_method='greedy', mcts_c=.1, mcts_v=.1,
                mcts_num_search=10):
    '''
    Assess training progress by the ability to solve the validation cubes

    Parameters:
    -------------
    model : tf.keras.Model
    validation_cubes : list
        list of rubiks_cube.environment.cube.Cube() objects
    val_max_time_steps : int
    val_solve_method : str
        'greedy' or 'mcts'
    mcts_c : float
    mcts_v : float
    mcts_num_search : int

    Returns:
    ----------
    val_acc : float
    '''
    assert val_solve_method in ['greedy', 'mcts']
    solve_count = 0
    for val_cube in validation_cubes:
        val_cube_trial = Cube()
        val_cube_trial.state = np.copy(val_cube.state)
        if val_solve_method == 'greedy':
            solved, _, _ = greedy_solve(model, val_cube_trial, val_max_time_steps)
            solve_count += solved
        elif val_solve_method == 'mcts':
            solved, _ = mcts_solve(model, val_cube_trial, mcts_c, mcts_v,
                                   mcts_num_search)
            solve_count += solved
    return solve_count / len(validation_cubes)
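# Hypothetical usage sketch: build the fixed validation set once, then
# re-evaluate it periodically during training. Assumes the CNN class from this
# repo; the weights here are untrained, so the accuracy is only illustrative.
model = CNN()
validation_cubes = get_validation_cubes(val_num_shuffles=1, validation_count=100)
val_acc = get_val_acc(model, validation_cubes,
                      val_max_time_steps=5, val_solve_method='greedy')
print(f"validation accuracy: {val_acc:.2f}")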
def test_greedy_solve():
    model = CNN()
    c = Cube()
    c.shuffle(5)
    solved, solved_cube, _ = greedy_solve(model, c, 5, verbose=False)
    if solved:
        assert solved_cube == Cube()
    else:
        assert solved_cube != Cube()
def greedy_solve(model, shuffled_cube, max_time_steps, verbose=False):
    '''
    Attempt to solve the cube greedily by taking the action with the highest
    Q value in each state

    Parameters:
    -----------
    model : tf.keras.Model
        Q function approximator
    shuffled_cube : rubiks_cube.environment.cube.Cube()
        Rubik's cube object to be solved
    max_time_steps : int
        maximum number of time steps allowed to solve the cube
    verbose : boolean
        whether to print the steps taken to solve

    Returns:
    --------
    solved : boolean
    cube : rubiks_cube.environment.cube.Cube()
    solver_steps : list of action function names
    '''
    # initialize solution conditions
    solved_cube = Cube()
    solved = False
    solver_steps = []
    s0 = copy.deepcopy(shuffled_cube.state)
    st = tf.expand_dims(tf.convert_to_tensor(s0), 0)  # (1, 3, 3, 3)

    # at each step take argmax_a Q(s, a)
    for t in range(max_time_steps):
        at_index = tf.math.argmax(model(st, training=False), 1).numpy()[0]
        at = shuffled_cube.func_list[at_index]
        solver_steps.append(at.__name__)
        if verbose:
            # print action taken
            print(at.__name__)
        st1 = at()
        if shuffled_cube == solved_cube:
            # break on solve
            solved = True
            break
        st = tf.expand_dims(tf.convert_to_tensor(st1), 0)
    return solved, shuffled_cube, solver_steps
def test_mcts_solve_call():
    model = CNN()
    shuffled_cube = Cube()
    shuffled_cube.shuffle(2)
    solved, solved_cube = mcts_solve(model, shuffled_cube, c=.1, v=.1,
                                     num_searches=100, verbose=False)
    if solved:
        assert solved_cube == Cube()
    else:
        assert solved_cube != Cube()
def test_cube_init():
    c = Cube()
    assert (c.state == np.arange(0, 27).reshape(3, 3, 3)).all()
def test_cube_equal():
    c1 = Cube()
    c2 = Cube()
    c2.back()
    c2.back_p()
    assert c1 == c2
def test_cube_rotation():
    c = Cube()
    for rotation in c.func_list:
        rotation()
    assert (c.state == np.arange(0, 27).reshape(3, 3, 3)).all()
def mcts_solve(model, shuffled_cube, c=.1, v=.1, num_searches=100, verbose=False):
    '''
    Attempt to solve the cube via Monte Carlo tree search

    Parameters:
    -----------
    model : tf.keras.Model
        Q function approximator
    shuffled_cube : rubiks_cube.environment.cube.Cube()
        Rubik's cube object to be solved
    c : float
        exploration hyperparameter
    v : float
        virtual loss hyperparameter
    num_searches : int
        number of search iterations to perform
    verbose : boolean
        whether to print output on solving progress

    Returns:
    --------
    solved : boolean
    shuffled_cube : rubiks_cube.environment.cube.Cube()
    '''
    # initial conditions
    solved = False
    solved_cube = Cube()
    cube_state = copy.deepcopy(shuffled_cube.state)
    root = Node(cube_state, model, c, v, parent=None)

    # perform search
    for i in range(num_searches):
        # 1) Selection
        if verbose:
            print("Selection")
        # start search at initial state
        current_node = root
        # traverse the search tree until a leaf node is encountered: every
        # simulation starts from the root node and iteratively selects actions
        # by following a tree policy until an unexpanded leaf node, sτ, is reached
        has_children = (sum(
            map(lambda x: x is None, current_node.children.values())) == 0)
        while has_children:
            # calculate values for current node
            Q_st = current_node.get_Q_st()
            U_st = current_node.get_U_st()
            # select "best" action to perform
            A_st = np.argmax(U_st + Q_st)
            A_st_string = current_node.cube_moves[A_st]
            if verbose:
                print(f"Enter Selection: {A_st_string}")
            # save action taken
            current_node.action_taken_string = A_st_string
            # move to next node
            current_node = current_node.children[A_st_string]
            has_children = (sum(
                map(lambda x: x is None, current_node.children.values())) == 0)

        # check if cube has been solved
        if (current_node.state == solved_cube.state).all():
            if verbose:
                print("Cube is solved")
            solved = True
            shuffled_cube.set_state(current_node.state)
            break

        # 2) Expansion
        if verbose:
            print("Expansion")
        # once a leaf node, sτ, is reached, the state is expanded by adding its children
        for move in shuffled_cube.func_list:
            shuffled_cube.set_state(current_node.state)
            move()
            new_state = copy.deepcopy(shuffled_cube.state)
            new_node = Node(new_state, model, c, v, parent=current_node)
            # add resulting states to current node's children
            current_node.children[move.__name__] = new_node

        # 3) Simulation
        if verbose:
            print("Simulation")
        # make copy of current state for simulation
        current_state = copy.deepcopy(current_node.state)
        # convert current state to tensor
        current_state = tf.expand_dims(tf.convert_to_tensor(current_state), 0)
        # find max Q in current state
        q_current_state = model(current_state, training=False).numpy()[0].max()

        # 4) Backpropagation
        if verbose:
            print("Backpropagation")
        # update nodes with results of simulation
        current_node.update_memory(q_current_state)
        # traverse tree
        while current_node.parent is not None:
            # update all past parents with q value from current state
            current_node = current_node.parent
            current_node.update_memory(q_current_state)

        if verbose:
            if i == num_searches - 1:
                print("Time Out")
            else:
                print('--------------')

    return solved, shuffled_cube
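# The selection step above relies on Node.get_Q_st and Node.get_U_st. A hedged
# sketch of the per-action quantities they presumably compute, modelled on a
# PUCT-style rule with virtual loss; the exact weighting used in this repo is
# an assumption (assumes numpy imported as np and the dict attributes from the
# Node sketch above).
def get_U_st(self):
    # exploration term: c * P(s, a) * sqrt(sum_a' N(s, a')) / (1 + N(s, a))
    N = np.array([self.N[m] for m in self.cube_moves], dtype=float)
    P = np.array([self.P[m] for m in self.cube_moves], dtype=float)
    return self.c * P * np.sqrt(N.sum()) / (1 + N)

def get_Q_st(self):
    # exploitation term: value estimate minus accumulated virtual loss
    return np.array([self.W[m] - self.L[m] for m in self.cube_moves])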
if __name__ == "__main__":
    model = CNN()
    shuffled_cube = Cube()
    shuffled_cube.shuffle(2)
    solved, solved_cube = mcts_solve(model, shuffled_cube, c=.1, v=.1,
                                     num_searches=100, verbose=True)
def play_autodidactic_episode(model, loss_object, optimizer, replay_buffer,
                              num_shuffles=5, max_time_steps=10,
                              exploration_rate=.1, end_state_reward=1.0,
                              batch_size=16, discount_factor=.9, training=True):
    '''
    In a single episode, the cube is shuffled up to num_shuffles times; the agent
    tries to solve the cube after every shuffle and has
    2 x (current number of shuffles) + 1 time steps to do so

    Parameters:
    ------------
    model : tf.keras.Model
    loss_object : tf.keras.losses
    optimizer : tf.keras.optimizer
    replay_buffer : rubiks_cube.agent.replay_buffer.ReplayBuffer
    num_shuffles : int (>= 0)
    max_time_steps : int (>= 1)
    exploration_rate : float [0, 1]
    end_state_reward : float
    batch_size : int (>= 1)
    discount_factor : float
    training : boolean

    Returns:
    ---------
    episode_cube : rubiks_cube.environment.cube.Cube()
    episode_loss_result : tf.Tensor
        mean loss over the episode
    '''
    # Initialize episode cube
    episode_cube = Cube()
    # Initialize episode loss
    episode_loss = tf.keras.metrics.Mean()
    # Initialize solved cube
    solved_cube = Cube()

    for shuffle_step in range(num_shuffles):
        # Initialize shuffle step cube state
        episode_cube.shuffle(1)
        shuffle_step_cube = Cube()
        shuffle_step_cube.state = copy.deepcopy(episode_cube.state)

        # Set up training shuffle_step loss
        shuffle_step_loss = tf.keras.metrics.Mean()

        # regular training loop
        s0 = shuffle_step_cube.state
        # convert cube state into tensor to feed into model
        st = tf.expand_dims(tf.convert_to_tensor(s0), 0)  # (1, 3, 3, 3)

        # Play shuffle_step until solved or shuffle_max_time_steps is reached
        shuffle_max_time_steps = 2 * shuffle_step + 1
        for t in range(shuffle_max_time_steps):
            # with some probability select a random action a_t
            if np.random.rand() < exploration_rate:
                at_index = np.random.randint(0, 12)  # WARNING: number of possible rotations
            # otherwise select a_t = max_a Q(s_t, a)
            else:
                at_index = tf.math.argmax(model(st), 1).numpy()[0]

            # Execute action a_t and observe state s_t+1 and reward r_t
            at = shuffle_step_cube.func_list[at_index]
            st1 = at()
            if shuffle_step_cube == solved_cube:
                rt = end_state_reward
            else:
                rt = 0.

            # Store transition in replay buffer, convert state to numpy for convenience
            st_numpy = st.numpy()[0]  # (3, 3, 3)
            transition = (st_numpy, at_index, rt, st1)  # (np.array, int, float, np.array)
            replay_buffer.add(transition)

            # if training is enabled, update q function
            if training:
                loss = update_q_function(model, loss_object, optimizer,
                                         replay_buffer, end_state_reward,
                                         batch_size, discount_factor)
            else:
                loss = 0
            shuffle_step_loss(loss)

            # if reward state has been reached, stop shuffle_step early
            if (rt == end_state_reward):
                break

            # convert next cube state into tensor to feed into model
            st = tf.expand_dims(tf.convert_to_tensor(st1), 0)  # (1, 3, 3, 3)

        shuffle_step_loss_result = shuffle_step_loss.result()
        episode_loss(shuffle_step_loss_result)
        shuffle_step_loss.reset_states()

    episode_loss_result = episode_loss.result()
    episode_loss.reset_states()
    return episode_cube, episode_loss_result
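# Hedged usage sketch: how an outer training loop might drive
# play_autodidactic_episode. ReplayBuffer's constructor arguments, the loss
# object, and the optimizer settings are assumptions, not the repo's actual
# training configuration.
model = CNN()
loss_object = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam()
replay_buffer = ReplayBuffer()  # constructor arguments assumed

for episode in range(1000):
    episode_cube, episode_loss = play_autodidactic_episode(
        model, loss_object, optimizer, replay_buffer,
        num_shuffles=5, exploration_rate=.1)
    if episode % 100 == 0:
        print(f"episode {episode}: loss {float(episode_loss):.4f}")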
# initialize exploration rate scheduler
exploration_rate_scheduler = ExplorationRateSchedule(
    **config['exploration_rate']['params'])

try:
    # Train the model
    train_via_experience_replay(model, loss_object, optimizer,
                                exploration_rate_scheduler, logging=True,
                                train_log_dir=train_log_dir,
                                **config['training_loop']['params'])
except KeyboardInterrupt:
    print("Training got interrupted")
    # save weights as a fail-safe
    save_state = Cube().state
    save_state_tensor = tf.expand_dims(tf.convert_to_tensor(save_state), 0)
    model.predict(save_state_tensor)
    model.save(model_weights_dir)

# Save trained model weights
save_state = Cube().state
save_state_tensor = tf.expand_dims(tf.convert_to_tensor(save_state), 0)
model.predict(save_state_tensor)
model.save(model_weights_dir)
def test_cnn_call_shape():
    model = CNN()
    x = tf.constant(np.stack([Cube().state, Cube().state]))
    assert model(x).shape == tf.TensorShape([2, 12])