import copy

import numpy as np
import tensorflow as tf

from rubiks_cube.environment.cube import Cube

# greedy_solve, mcts_solve, and update_q_function are assumed to be
# defined elsewhere in the package and in scope here.


def get_val_acc(model, validation_cubes, val_max_time_steps=5,
                val_solve_method='greedy', mcts_c=.1, mcts_v=.1,
                mcts_num_search=10):
    '''
    Assess training progress on ability to solve validation cubes

    Parameters:
    -------------
    model : tf.keras.Model
    validation_cubes : list
        list of rubiks_cube.environment.cube.Cube() objects
    val_max_time_steps : int
    val_solve_method : str
        'greedy' or 'mcts'
    mcts_c : float
    mcts_v : float
    mcts_num_search : int

    Returns:
    ----------
    val_acc : float
        fraction of validation cubes solved
    '''
    assert val_solve_method in ['greedy', 'mcts']

    solve_count = 0
    for val_cube in validation_cubes:
        # Solve a copy so the validation cube itself is left untouched
        val_cube_trial = Cube()
        val_cube_trial.state = np.copy(val_cube.state)

        if val_solve_method == 'greedy':
            solved, _, _ = greedy_solve(model, val_cube_trial,
                                        val_max_time_steps)
            solve_count += solved
        elif val_solve_method == 'mcts':
            solved, _ = mcts_solve(model, val_cube_trial, mcts_c, mcts_v,
                                   mcts_num_search)
            solve_count += solved

    return solve_count / len(validation_cubes)
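
# Illustrative usage sketch for get_val_acc. The network below is a
# placeholder only (the repo's actual model is defined elsewhere); the
# Cube.shuffle() call mirrors its use in play_autodidactic_episode below.
def example_validation_run(num_cubes=20, scramble_depth=3):
    model = tf.keras.Sequential([
        tf.keras.layers.Lambda(lambda x: tf.cast(x, tf.float32),
                               input_shape=(3, 3, 3)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(12)  # one Q value per possible rotation
    ])

    # Build a small validation set of lightly scrambled cubes
    validation_cubes = []
    for _ in range(num_cubes):
        cube = Cube()
        cube.shuffle(scramble_depth)
        validation_cubes.append(cube)

    val_acc = get_val_acc(model, validation_cubes, val_max_time_steps=5,
                          val_solve_method='greedy')
    print(f'validation solve rate: {val_acc:.2%}')
    return model, validation_cubes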

def play_autodidactic_episode(model, loss_object, optimizer, replay_buffer,
                              num_shuffles=5, max_time_steps=10,
                              exploration_rate=.1, end_state_reward=1.0,
                              batch_size=16, discount_factor=.9,
                              training=True):
    '''
    In a single episode the cube is shuffled up to num_shuffles times;
    after each shuffle the agent attempts to solve the cube and is given
    2 * shuffle_step + 1 time steps (shuffle_step is the zero-based
    shuffle index), capped at max_time_steps.

    Parameters:
    ------------
    model : tf.keras.Model
    loss_object : tf.keras.losses
    optimizer : tf.keras.optimizer
    replay_buffer : rubiks_cube.agent.replay_buffer.ReplayBuffer
    num_shuffles : int (>= 0)
    max_time_steps : int (>= 1)
    exploration_rate : float [0, 1]
    end_state_reward : float
    batch_size : int (>= 1)
    discount_factor : float
    training : boolean

    Returns:
    ----------
    episode_cube : Cube
        the cube in its final (possibly unsolved) state
    episode_loss_result : tf.Tensor
        mean training loss over the episode
    '''
    # Initialize episode cube
    episode_cube = Cube()

    # Initialize episode loss
    episode_loss = tf.keras.metrics.Mean()

    # Initialize solved cube for reward comparison
    solved_cube = Cube()

    for shuffle_step in range(num_shuffles):
        # Initialize shuffle step cube state from the freshly shuffled cube
        episode_cube.shuffle(1)
        shuffle_step_cube = Cube()
        shuffle_step_cube.state = copy.deepcopy(episode_cube.state)

        # Set up training shuffle_step loss
        shuffle_step_loss = tf.keras.metrics.Mean()

        # Regular training loop
        s0 = shuffle_step_cube.state

        # Convert cube state into tensor to feed into model
        st = tf.expand_dims(tf.convert_to_tensor(s0), 0)  # (1, 3, 3, 3)

        # Play shuffle_step until solved or the step budget is exhausted;
        # the budget grows with shuffle depth, capped at max_time_steps
        shuffle_max_time_steps = min(2 * shuffle_step + 1, max_time_steps)
        for t in range(shuffle_max_time_steps):
            # With some probability select a random action a_t
            if np.random.rand() < exploration_rate:
                at_index = np.random.randint(0, 12)  # 12 possible rotations
            # Otherwise select a_t = max_a Q(s_t, a)
            else:
                at_index = tf.math.argmax(model(st), 1).numpy()[0]

            # Execute action a_t and observe state s_t+1 and reward r_t
            at = shuffle_step_cube.func_list[at_index]
            st1 = at()
            if shuffle_step_cube == solved_cube:
                rt = end_state_reward
            else:
                rt = 0.

            # Store transition in replay buffer; convert state to numpy
            # for convenience
            st_numpy = st.numpy()[0]  # (3, 3, 3)
            transition = (st_numpy, at_index, rt,
                          st1)  # (np.array, int, float, np.array)
            replay_buffer.add(transition)

            # If training is enabled, update the Q function
            if training:
                loss = update_q_function(model, loss_object, optimizer,
                                         replay_buffer, end_state_reward,
                                         batch_size, discount_factor)
            else:
                loss = 0.
            shuffle_step_loss(loss)

            # If the reward state has been reached, stop shuffle_step early
            if rt == end_state_reward:
                break

            # Convert next cube state into tensor to feed into model
            st = tf.expand_dims(tf.convert_to_tensor(st1), 0)  # (1, 3, 3, 3)

        shuffle_step_loss_result = shuffle_step_loss.result()
        episode_loss(shuffle_step_loss_result)
        shuffle_step_loss.reset_states()

    episode_loss_result = episode_loss.result()
    episode_loss.reset_states()

    return episode_cube, episode_loss_result
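
# update_q_function is called above but defined elsewhere in the repo.
# The sketch below shows one plausible one-step DQN-style update, assuming
# replay_buffer.sample(batch_size) returns transitions shaped like those
# stored in play_autodidactic_episode; the repo's real implementation may
# differ in both API and details.
def update_q_function_sketch(model, loss_object, optimizer, replay_buffer,
                             end_state_reward, batch_size, discount_factor):
    # The real implementation likely guards against a not-yet-full buffer;
    # assume here that at least batch_size transitions are available.
    transitions = replay_buffer.sample(batch_size)
    states = tf.convert_to_tensor(np.stack([t[0] for t in transitions]))
    actions = np.array([t[1] for t in transitions])
    rewards = np.array([t[2] for t in transitions], dtype=np.float32)
    next_states = tf.convert_to_tensor(np.stack([t[3] for t in transitions]))

    # Bootstrapped target: r_t + gamma * max_a Q(s_t+1, a); transitions
    # whose reward equals end_state_reward are terminal and take r_t alone
    next_q = tf.reduce_max(model(next_states), axis=1).numpy()
    terminal = rewards == end_state_reward
    targets = rewards + discount_factor * next_q * (~terminal)

    with tf.GradientTape() as tape:
        q_values = model(states)  # (batch, 12)
        # Pick out the Q value of the action actually taken
        action_mask = tf.one_hot(actions, q_values.shape[1])
        q_taken = tf.reduce_sum(q_values * action_mask, axis=1)
        loss = loss_object(targets, q_taken)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss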
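
# Illustrative end-to-end driver tying the pieces together, reusing the
# placeholder model and validation cubes from example_validation_run above.
# The ReplayBuffer constructor arguments are assumed; check
# rubiks_cube.agent.replay_buffer for the real signature.
def example_training_run(num_episodes=1000, val_every=50):
    from rubiks_cube.agent.replay_buffer import ReplayBuffer

    model, validation_cubes = example_validation_run()
    loss_object = tf.keras.losses.MeanSquaredError()
    optimizer = tf.keras.optimizers.Adam()
    replay_buffer = ReplayBuffer()  # assumed no-arg constructor

    for episode in range(num_episodes):
        _, episode_loss = play_autodidactic_episode(
            model, loss_object, optimizer, replay_buffer)
        # Periodically check solve rate on the held-out cubes
        if episode % val_every == 0:
            val_acc = get_val_acc(model, validation_cubes)
            print(f'episode {episode}: loss={float(episode_loss):.4f}, '
                  f'val_acc={val_acc:.2%}')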