import numpy as np

# The trailing "# domtools." comments in this file suggest that
# dominance_filter comes from a domtools module; the exact import path is an
# assumption. State and roll_out, used further below, are likewise defined
# elsewhere in this project and are not imported here.
from domtools import dominance_filter


def select_action_in_rollout(available_after_states, policy_weights,
                             num_features, use_filters_during_rollout,
                             feature_directors, use_dom, use_cumul_dom):
    num_after_states = len(available_after_states)
    action_features = np.zeros((num_after_states, num_features))
    for ix, after_state in enumerate(available_after_states):
        action_features[ix] = after_state.get_features_pure(False)  # , order_by=self.feature_order
    if use_filters_during_rollout:
        # Direct the features so that "larger is better" holds for every
        # column before applying the dominance filter.
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features * feature_directors,
            len_after_states=num_after_states)
        if use_dom:
            action_features = action_features[not_simply_dominated]
            map_back_vector = np.nonzero(not_simply_dominated)[0]
        elif use_cumul_dom:
            action_features = action_features[not_cumu_dominated]
            map_back_vector = np.nonzero(not_cumu_dominated)[0]
        else:
            raise ValueError("Either use_dom or use_cumul_dom has to be true.")
    utilities = action_features.dot(np.ascontiguousarray(policy_weights))
    move_index = np.argmax(utilities)
    if use_filters_during_rollout:
        # Map the index in the filtered array back to the unfiltered list.
        state_tmp = available_after_states[map_back_vector[move_index]]
    else:
        state_tmp = available_after_states[move_index]
    return state_tmp
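
# Illustration (hypothetical data; the _demo_* helpers in this file are not
# part of the original module): the map-back pattern used in
# select_action_in_rollout -- filter feature rows with a boolean mask, take
# the argmax over the filtered rows, then recover the index into the
# unfiltered list via np.nonzero.
def _demo_map_back_indexing():
    features = np.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]])
    weights = np.array([1.0, 2.0])
    keep = np.array([True, False, True])    # pretend row 1 is dominated
    map_back_vector = np.nonzero(keep)[0]   # original indices of kept rows -> [0, 2]
    filtered_utilities = features[keep].dot(weights)
    return map_back_vector[np.argmax(filtered_utilities)]  # -> 2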
def choose_max_util_action_in_rollout(available_after_states, policy_weights,
                                      rollout_dom_filter,
                                      rollout_cumu_dom_filter,
                                      feature_directors, num_features):
    num_states = len(available_after_states)
    action_features = np.zeros((num_states, num_features))
    for ix, after_state in enumerate(available_after_states):
        action_features[ix] = after_state.get_features_and_direct(
            feature_directors, False)  # , order_by=self.feature_order
    if rollout_dom_filter or rollout_cumu_dom_filter:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features, len_after_states=num_states)
        if rollout_cumu_dom_filter:
            action_features = action_features[not_cumu_dominated]
            map_back_vector = np.nonzero(not_cumu_dominated)[0]
        elif rollout_dom_filter:
            action_features = action_features[not_simply_dominated]
            map_back_vector = np.nonzero(not_simply_dominated)[0]
    else:
        map_back_vector = np.arange(num_states)
    utilities = action_features.dot(policy_weights)
    # Break ties among maximal utilities uniformly at random; np.argmax
    # would always return the first maximiser.
    move_index = np.random.choice(
        map_back_vector[utilities == np.max(utilities)])
    move = available_after_states[move_index]
    return move
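
# Sketch (toy numbers) of the uniform tie-breaking used above: np.random.choice
# over the maximising (mapped-back) indices picks one of them uniformly.
def _demo_random_tie_breaking():
    utilities = np.array([0.3, 0.7, 0.7])
    map_back_vector = np.arange(len(utilities))
    ties = map_back_vector[utilities == np.max(utilities)]  # -> [1, 2]
    return np.random.choice(ties)  # 1 or 2, each with probability 0.5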
def choose_greedy_if_reward_else_max_util_from_learned_directions_action_in_rollout(
        available_after_states, policy_weights, rollout_dom_filter,
        rollout_cumu_dom_filter, feature_directors, num_features,
        learned_directions):
    num_states = len(available_after_states)
    action_features = np.zeros((num_states, num_features))
    for ix, after_state in enumerate(available_after_states):
        action_features[ix] = after_state.get_features_and_direct(
            feature_directors, False)  # , order_by=self.feature_order
    if rollout_dom_filter or rollout_cumu_dom_filter:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features, len_after_states=num_states)
        keep = not_cumu_dominated if rollout_cumu_dom_filter else not_simply_dominated
        # Filter the features together with the states so that row indices
        # stay aligned (the original filtered only the states, which
        # desynchronised the two arrays).
        available_after_states = [
            s for (s, d) in zip(available_after_states, keep) if d]
        action_features = action_features[keep]
        num_states = len(available_after_states)
    rewards = np.zeros(num_states)
    max_reward = 0
    for ix, after_state in enumerate(available_after_states):
        reward_of_after_state = after_state.n_cleared_lines
        if reward_of_after_state > 0:
            rewards[ix] = reward_of_after_state
            if reward_of_after_state > max_reward:
                max_reward = reward_of_after_state
    if max_reward > 0:
        # Greedy step: restrict the choice to the after-states that clear
        # the maximal number of lines.
        max_reward_indices = np.where(rewards == max_reward)[0]
        available_after_states = [
            available_after_states[i] for i in max_reward_indices]
        action_features = action_features[max_reward_indices]
        num_states = len(available_after_states)
    utilities = action_features.dot(policy_weights * learned_directions)
    # Uniform tie-breaking among maximal utilities.
    move_index = np.random.choice(
        np.arange(num_states)[utilities == np.max(utilities)])
    move = available_after_states[move_index]
    return move
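
# Sketch (toy numbers) of the "greedy if reward" rule implemented above:
# if any after-state clears lines, restrict the choice to those clearing the
# maximal number of lines, and only then compare utilities.
def _demo_greedy_if_reward():
    n_cleared = np.array([0, 2, 2, 1])
    utilities = np.array([0.9, 0.1, 0.4, 0.8])
    if n_cleared.max() > 0:
        candidates = np.where(n_cleared == n_cleared.max())[0]  # -> [1, 2]
    else:
        candidates = np.arange(len(utilities))
    return candidates[np.argmax(utilities[candidates])]  # -> 2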
def choose_action_using_rollouts(start_state, start_tetromino,
                                 rollout_mechanism, rollout_length,
                                 generative_model, policy_weights,
                                 dom_filter, cumu_dom_filter,
                                 rollout_dom_filter, rollout_cumu_dom_filter,
                                 feature_directors, num_features, gamma,
                                 number_of_rollouts_per_child,
                                 learned_directions):
    children_states = start_tetromino.get_after_states(start_state)
    num_children = len(children_states)
    if num_children == 0:
        # Game over: return a dummy state, child index and feature array.
        return (State(np.zeros((1, 1), dtype=np.bool_),
                      np.zeros(1, dtype=np.int64),
                      np.array([0], dtype=np.int64),
                      np.array([0], dtype=np.int64),
                      0.0, 1, "bcts", True, False),  # dummy state
                0,                                   # dummy child_index
                np.zeros((2, 2)))                    # dummy action_features
    action_features = np.zeros((num_children, num_features), dtype=np.float64)
    for ix in range(num_children):
        action_features[ix] = children_states[ix].get_features_and_direct(
            feature_directors, False)  # , order_by=self.feature_order
    if dom_filter or cumu_dom_filter:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features, len_after_states=num_children)
    child_total_values = np.zeros(num_children)
    for child in range(num_children):
        # Roll out only the children that survive the requested dominance
        # filter; filtered-out children get value -inf.
        if cumu_dom_filter:
            do_rollout = not_cumu_dominated[child]
        elif dom_filter:
            do_rollout = not_simply_dominated[child]
        else:
            do_rollout = True
        if do_rollout:
            for rollout_ix in range(number_of_rollouts_per_child):
                child_total_values[child] += roll_out(
                    children_states[child], rollout_length,
                    rollout_mechanism, generative_model, policy_weights,
                    rollout_dom_filter, rollout_cumu_dom_filter,
                    feature_directors, num_features, gamma,
                    learned_directions)
        else:
            child_total_values[child] = -np.inf
    # Uniform tie-breaking among children with maximal total rollout value.
    max_value = np.max(child_total_values)
    max_value_indices = np.where(child_total_values == max_value)[0]
    child_index = np.random.choice(max_value_indices)
    return children_states[child_index], child_index, action_features
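
# roll_out is defined elsewhere in this project; the accumulation above only
# assumes it returns one sampled (discounted) return per call. A minimal
# sketch of such a return for an assumed reward sequence:
def _demo_discounted_return(rewards=(0.0, 1.0, 0.0, 4.0), gamma=0.9):
    # sum_t gamma^t * r_t
    return sum(gamma ** t * r for t, r in enumerate(rewards))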
def calculate_available_actions(rollout_state_population, generative_model, env):
    """Compare the mean number of available actions with and without the
    cumulative dominance filter over a population of rollout states."""
    print("Calculate available actions with and without filters")
    num_av_acts = np.zeros(len(rollout_state_population))
    num_fil_av_acts = np.zeros(len(rollout_state_population))
    # Hand-coded feature directions: -1 marks "lower is better" features,
    # +1 the one feature where larger is better.
    feature_directors = np.array([-1, -1, -1, -1, -1, -1, 1, -1],
                                 dtype=np.float64)
    for ix in range(len(rollout_state_population)):
        generative_model.next_tetromino()
        child_states = generative_model.get_after_states(
            rollout_state_population[ix])
        num_child_states = len(child_states)
        num_av_acts[ix] = num_child_states
        state_action_features = np.zeros((num_child_states, env.num_features),
                                         dtype=np.float64)
        for child_ix in range(num_child_states):
            state_action_features[child_ix] = \
                child_states[child_ix].get_features_pure(False)  # , order_by=self.feature_order
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            state_action_features * feature_directors,
            len_after_states=num_child_states)
        num_fil_av_acts[ix] = np.sum(not_cumu_dominated)
    print(f"The mean number of available actions was {np.mean(num_av_acts)}")
    print(f"The mean number of FILTERED available actions was {np.mean(num_fil_av_acts)}")
# choose_action belongs to an agent class defined elsewhere (note the self
# parameter).
def choose_action(self, start_state, start_tetromino):
    children_states = start_tetromino.get_after_states(start_state)  # , current_state=
    num_children = len(children_states)
    if num_children == 0:
        # Terminal state: return a dummy state.
        return State(np.zeros((1, 1), dtype=np.bool_),
                     np.zeros(1, dtype=np.int64),
                     np.array([0], dtype=np.int64),
                     np.array([0], dtype=np.int64),
                     0.0, 1, "bcts", True, False)
    action_features = np.zeros((num_children, self.num_features))
    for ix, after_state in enumerate(children_states):
        action_features[ix] = after_state.get_features_pure(False)
    if self.use_filter_in_eval:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features * self.feature_directors,
            len_after_states=num_children)
        if self.use_cumul_dom_filter:
            action_features = action_features[not_cumu_dominated]
            map_back_vector = np.nonzero(not_cumu_dominated)[0]
        else:
            action_features = action_features[not_simply_dominated]
            map_back_vector = np.nonzero(not_simply_dominated)[0]
    utilities = action_features.dot(np.ascontiguousarray(self.policy_weights))
    # Uniform tie-breaking among maximal utilities.
    max_indices = np.where(utilities == np.max(utilities))[0]
    move_index = np.random.choice(max_indices)
    if self.use_filter_in_eval:
        move = children_states[map_back_vector[move_index]]
    else:
        move = children_states[move_index]
    return move
def general_action_value_rollout(use_filters_during_rollout,
                                 use_filters_before_rollout, start_state,
                                 rollout_length, rollouts_per_action, gamma,
                                 generative_model, policy_weights,
                                 value_weights, num_features,
                                 use_state_values, reward_greedy,
                                 use_dom, use_cumul_dom, feature_directors):
    # Note: reward_greedy is accepted for interface compatibility but is not
    # used in this function.
    child_states = generative_model.get_after_states(start_state)
    num_child_states = len(child_states)
    action_value_estimates = np.zeros(num_child_states)
    state_action_features = np.zeros((num_child_states, num_features),
                                     dtype=np.float64)
    if num_child_states == 0:
        # Rollout starting state is a terminal state.
        return action_value_estimates, state_action_features
    for ix in range(num_child_states):
        state_action_features[ix] = child_states[ix].get_features_pure(False)  # , order_by=self.feature_order
    if use_filters_before_rollout:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            state_action_features * feature_directors,
            len_after_states=num_child_states)
    is_not_filtered_out = np.ones(num_child_states, dtype=np.bool_)
    for child_ix in range(num_child_states):
        if use_filters_before_rollout:
            if use_dom:
                do_rollout = not_simply_dominated[child_ix]
            elif use_cumul_dom:
                do_rollout = not_cumu_dominated[child_ix]
            else:
                raise ValueError("Either use_dom or use_cumul_dom has to be true.")
        else:
            do_rollout = True
        if do_rollout:
            start_reward = child_states[child_ix].n_cleared_lines
            for rollout_ix in range(rollouts_per_action):
                # Restart every rollout from the child state; state_tmp is
                # mutated by the loop below.
                state_tmp = child_states[child_ix]
                cumulative_reward = start_reward
                game_ended = False
                count = 0
                while not game_ended and count < rollout_length:  # there are rollout_length rollout steps
                    generative_model.next_tetromino()
                    available_after_states = generative_model.get_after_states(state_tmp)
                    if len(available_after_states) == 0:
                        game_ended = True  # terminal state
                    else:
                        state_tmp = select_action_in_rollout(
                            available_after_states, policy_weights,
                            num_features, use_filters_during_rollout,
                            feature_directors, use_dom, use_cumul_dom)
                        cumulative_reward += (gamma ** count) * state_tmp.n_cleared_lines
                    count += 1
                # One more step (the (rollout_length+1)-th) for the truncation value.
                if use_state_values and not game_ended:
                    generative_model.next_tetromino()
                    available_after_states = generative_model.get_after_states(state_tmp)
                    if len(available_after_states) > 0:
                        state_tmp = select_action_in_rollout(
                            available_after_states, policy_weights,
                            num_features, use_filters_during_rollout,
                            feature_directors, use_dom, use_cumul_dom)
                        # Bootstrap the tail of the rollout with the learned
                        # state value of the last state.
                        final_state_features = state_tmp.get_features_pure(True)
                        cumulative_reward += (gamma ** count) * final_state_features.dot(value_weights)
                action_value_estimates[child_ix] += cumulative_reward
        else:
            is_not_filtered_out[child_ix] = False
            action_value_estimates[child_ix] = -np.inf
    # Drop filtered-out children and average over rollouts.
    action_value_estimates = action_value_estimates[is_not_filtered_out]
    state_action_features = state_action_features[is_not_filtered_out]
    action_value_estimates /= rollouts_per_action
    return action_value_estimates, state_action_features
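
# Sketch (assumed numpy-array inputs) of the truncated return estimated per
# rollout in general_action_value_rollout: discounted rewards over the rollout
# horizon, plus a bootstrap term gamma^L * phi(s_L) . value_weights when
# use_state_values is on.
def _demo_truncated_return(rewards, final_state_features, value_weights, gamma):
    g = sum(gamma ** t * r for t, r in enumerate(rewards))
    return g + gamma ** len(rewards) * final_state_features.dot(value_weights)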