def _optimize(self,
              objective_function_parameters: ObjectiveFunctionParameters,
              algorithm: str,
              objective_scale_factor: float,
              debug: bool) -> Set[MIDSRule]:
    # init objective function
    objective_function = MIDSObjectiveFunction(
        objective_func_params=objective_function_parameters,
        cover_checker=self.cover_checker,
        overlap_checker=self.overlap_checker,
        scale_factor=objective_scale_factor)
    self.objective_function = objective_function

    optimizer: AbstractOptimizer = self.algorithms[algorithm](
        objective_function=objective_function,
        ground_set=objective_function_parameters.all_rules.ruleset,
        debug=debug)

    solution_set: Set[MIDSRule] = optimizer.optimize()
    objective_function_value: float = objective_function.evaluate(
        MIDSRuleSet(solution_set))

    self.solution_set = solution_set
    self.objective_function_value = objective_function_value
    self.nb_of_objective_function_calls_necessary_for_training = objective_function.call_counter
    return solution_set
def evaluate(self, solution_set: MIDSRuleSet):
    if type(solution_set) == set:
        solution_set = MIDSRuleSet(solution_set)
    if type(solution_set) != MIDSRuleSet:
        raise Exception("Type of solution_set must be MIDSRuleSet")

    self.call_counter += 1
    self.set_size_collector.add_value(len(solution_set))
    start_time = time.time()

    lambda_array: List[float] = self.objective_func_params.lambda_array
    ground_set_size = self.objective_func_params.ground_set_size
    current_nb_of_rules = len(solution_set)

    f0 = self.f0_minimize_rule_set_size(ground_set_size, current_nb_of_rules)
    f1 = self.f1_minimize_total_nb_of_literals(solution_set)

    if MIDSObjectiveFunction.should_cache_f2_f3:
        f2, f3 = self.f2_f3_combo_minimize_overlap_predicting_the_same_and_different_class_using_cache(
            solution_set)
    else:
        f2, f3 = self.f2_f3_combo_minimize_overlap_predicting_the_same_and_different_class(
            solution_set)
        # f2 = self.f2_minimize_overlap_predicting_the_same_class(solution_set)
        # f3 = self.f3_minimize_overlap_predicting_different_class(solution_set)

    f4 = self.f4_at_least_one_rule_per_attribute_value_combo(solution_set)
    f5 = self.f5_minimize_incorrect_cover(solution_set)
    f6 = self.f6_cover_each_example(solution_set)

    # Scale the sub-objective values and combine them with the lambda weights.
    fs = np.array([f0, f1, f2, f3, f4, f5, f6]) / self.scale_factor
    result = np.dot(lambda_array, fs)

    if self.stat_collector is not None:
        self.stat_collector.add_values(f0, f1, f2, f3, f4, f5, f6, result)

    end_time = time.time()
    elapsed_time = end_time - start_time
    self.run_time_collector.add_value(elapsed_time)

    self.f0_val = f0
    self.f1_val = f1
    self.f2_val = f2
    self.f3_val = f3
    self.f4_val = f4
    self.f5_val = f5
    self.f6_val = f6

    # print(f"MIDS f1:{f1}")
    return result
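# Illustrative sketch (not part of the original code base): evaluate() combines the
# seven sub-objective values f0..f6 into a single score by dividing them by the
# scale factor and taking the dot product with the lambda weights. The numbers
# below are made up purely to show the arithmetic.
def _example_weighted_objective_combination() -> None:
    import numpy as np

    lambda_array = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]                   # one weight per sub-objective
    sub_objective_values = np.array([0.9, 0.8, 1.0, 1.0, 0.5, 0.7, 0.6])  # f0..f6 (made-up values)
    scale_factor = 1.0
    result = float(np.dot(lambda_array, sub_objective_values / scale_factor))
    assert abs(result - 5.5) < 1e-9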
def get_total_rule_set(self) -> MIDSRuleSet:
    total_rule_set: Set[MIDSRule] = set()
    for mids_classifier in self.target_attribute_to_mids_classifier.values():
        for rule in mids_classifier.rules:
            total_rule_set.add(rule)
    return MIDSRuleSet(total_rule_set)
def _f2_f3_target_attr_to_upper_bound_map(all_rules: MIDSRuleSet,
                                          nb_of_training_examples: int,
                                          target_attrs: Iterable[TargetAttr]
                                          ) -> Dict[TargetAttr, int]:
    """
    f2 and f3 reward low overlap between rules for a given target (averaged over the different targets).

    How do we define the upper bound for a given target? --> the maximum possible overlap for that target.

    Let R_{init, X_j} be the set of ground-set rules predicting target X_j,
    and N the number of training examples.
    For any two rules r_i, r_k: max(overlap(r_i, r_k)) == N.
    Thus, for a given target X_j: upper bound = N * |R_{init, X_j}|^2.
    (Note: can we divide this by two? Otherwise, we count every pair twice!)

    :param all_rules:
    :param nb_of_training_examples:
    :param target_attrs:
    :return:
    """
    target_attrs = list(target_attrs)  # materialize, in case a one-shot iterator was passed

    # X_j --> |R_{init, X_j}|
    target_attr_to_nb_of_predicting_rules_map: Dict[TargetAttr, int] \
        = all_rules.get_nb_of_rules_predicting_each_attribute()

    f2_f3_target_attr_to_upper_bound_map: Dict[TargetAttr, int] = {}
    for target_attr in target_attrs:
        n_ground_set_rules_predicting_target: int = target_attr_to_nb_of_predicting_rules_map.get(
            target_attr, 0)
        f2_f3_upper_bound_for_target: int = \
            nb_of_training_examples * n_ground_set_rules_predicting_target ** 2
        f2_f3_target_attr_to_upper_bound_map[target_attr] = f2_f3_upper_bound_for_target

    # Sanity check: every key in the map must be one of the requested target attributes.
    target_attr_set = set(target_attrs)
    for target_attr in f2_f3_target_attr_to_upper_bound_map.keys():
        if target_attr not in target_attr_set:
            raise Exception(f"ILLEGAL TARGET ATTRIBUTE: {target_attr}")

    # print(f2_f3_target_attr_to_upper_bound_map)
    return f2_f3_target_attr_to_upper_bound_map
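# Illustrative sketch (not part of the original code base): a worked instance of the
# f2/f3 upper bound derived in the docstring above. With N = 100 training examples
# and 5 ground-set rules predicting target X_j, any pair of those rules can overlap
# on at most N examples, so the bound is N * |R_{init, X_j}|^2 = 100 * 5^2 = 2500.
def _example_f2_f3_upper_bound() -> None:
    nb_of_training_examples = 100
    nb_of_rules_predicting_target = 5
    upper_bound = nb_of_training_examples * nb_of_rules_predicting_target ** 2
    assert upper_bound == 2500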
def f1_minimize_total_nb_of_literals(self, solution_set: MIDSRuleSet):
    """
    Minimize the total number of literals (terms) in the rule set.

    :param solution_set:
    :return:
    """
    upper_bound_nb_of_literals = self.objective_func_params.f1_upper_bound_nb_literals
    f1_unnormalized = upper_bound_nb_of_literals - solution_set.sum_rule_length()

    if self.normalize:
        f1 = f1_unnormalized / upper_bound_nb_of_literals
    else:
        f1 = f1_unnormalized

    self._normalized_boundary_check(f1, 'f1')
    return f1
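# Illustrative sketch (not part of the original code base): how the normalized f1
# value behaves. With an upper bound of 50 literals and a solution set containing
# 10 literals in total, f1 = (50 - 10) / 50 = 0.8; sparser rule sets score closer to 1.
def _example_f1_normalization() -> None:
    upper_bound_nb_of_literals = 50
    solution_set_total_nb_of_literals = 10
    f1 = (upper_bound_nb_of_literals - solution_set_total_nb_of_literals) / upper_bound_nb_of_literals
    assert f1 == 0.8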
def f4_at_least_one_rule_per_attribute_value_combo(self, solution_set: MIDSRuleSet):
    """
    The requirement to have at least one rule for each value of each target attribute
    might need to be relaxed, as it is no longer guaranteed that each value of each
    attribute occurs in at least one rule head.

    :param solution_set:
    :return:
    """
    # 1. Gather, for each target attribute, the unique values that are predicted.
    target_attr_to_val_set_dict: Dict[TargetAttr, Set[TargetVal]] \
        = solution_set.get_predicted_values_per_predicted_attribute()

    # 2. Count the total number of values that are predicted, over all target attributes.
    total_nb_of_attribute_values_covered: float = 0
    for target_attr in self.objective_func_params.f4_target_attr_to_dom_size_map.keys():
        predicted_values: Optional[Set[TargetVal]] = target_attr_to_val_set_dict.get(
            target_attr, None)
        if predicted_values is None:
            nb_of_predicted_values: int = 0
        else:
            nb_of_predicted_values = len(predicted_values)

        if self.normalize:
            target_attr_dom_size: int = \
                self.objective_func_params.f4_target_attr_to_dom_size_map[target_attr]
            total_nb_of_attribute_values_covered += nb_of_predicted_values / target_attr_dom_size
        else:
            total_nb_of_attribute_values_covered += nb_of_predicted_values

    f4: float = total_nb_of_attribute_values_covered / self.objective_func_params.nb_of_target_attrs
    self._normalized_boundary_check(f4, 'f4')
    return f4
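# Illustrative sketch (not part of the original code base): the normalized f4 value
# for two target attributes. If the solution set predicts 2 of the 4 values of
# attribute A and 1 of the 2 values of attribute B, then f4 = (2/4 + 1/2) / 2 = 0.5.
# The attribute names and domain sizes are made up for this example.
def _example_f4_value_coverage() -> None:
    nb_of_predicted_values_per_attr = {"A": 2, "B": 1}
    dom_size_per_attr = {"A": 4, "B": 2}
    f4 = sum(nb_of_predicted_values_per_attr[attr] / dom_size_per_attr[attr]
             for attr in dom_size_per_attr) / len(dom_size_per_attr)
    assert f4 == 0.5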
def evaluate_mids_model_for_dataset_fold_target_attribute(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int):
    logger = create_logger(
        logger_name='evaluate_mids_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            get_tree_based_mids_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) + '_model_evaluation_tree_derived_rules.log'))

    # --- Load test data -----------------------------------------------------------------------------------------
    # Read in the original (discretized) test data.
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- Load classifier ----------------------------------------------------------------------------------------
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    # mids_classifier_abs_file_name = get_mids_clf_abs_file_name(dataset_name, fold_i)

    logger.info(f"start loading MIDS model from {tree_based_mids_classifier_abs_file_name}")
    mids_classifier: MIDSClassifier = load_mids_classifier(
        tree_based_mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)

    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier
    mids_classifier.rule_combination_strategy = RuleCombiningStrategy.WEIGHTED_VOTE
    mids_classifier.rule_combinator = WeightedVotingRuleCombinator()

    # --- Evaluate and store predictive performance ----------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
        reconstructed_mids, df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")

    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[target_attr]
        logger.info(f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    # mids_target_attr_to_score_info_abs_file_name: str = get_mids_target_attr_to_score_info_abs_file_name(
    #     dataset_name, fold_i)
    tree_based_mids_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_mids_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth)
    store_mids_target_attr_to_score_info(
        tree_based_mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to "
                f"{tree_based_mids_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics -----------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules),
            df_test_original_column_order,
            target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    # mids_interpret_stats_abs_file_name: str = get_mids_interpret_stats_abs_file_name(
    #     dataset_name, fold_i)
    tree_based_mids_interpret_stats_abs_file_name: str = get_tree_based_mids_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(tree_based_mids_interpret_stats_abs_file_name,
                               interpret_stats)
    logger.info(f"Wrote MIDSInterpretabilityStatistics to "
                f"{tree_based_mids_interpret_stats_abs_file_name}")
    logger.info("---")

    close_logger(logger)
def _f1_upper_bound_single_target_ids(all_rules: MIDSRuleSet) -> int:
    # Upper bound on the total number of literals as used in single-target IDS:
    # the length of the longest ground-set rule times the number of ground-set rules.
    nb_of_ground_rules: int = len(all_rules)
    L_max: int = all_rules.max_rule_length()
    return L_max * nb_of_ground_rules
def _f1_upper_modified_bound(all_rules: MIDSRuleSet) -> int:
    # Modified (tighter) upper bound: the total number of literals over all ground-set rules.
    n_literals_in_ground_set: int = all_rules.sum_rule_length()
    return n_literals_in_ground_set
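# Illustrative sketch (not part of the original code base): the difference between the
# two f1 upper bounds defined above. For a ground set of 3 rules with lengths 2, 3 and 5,
# the single-target IDS bound is L_max * |R_init| = 5 * 3 = 15, while the modified bound
# is the total number of literals, 2 + 3 + 5 = 10. The modified bound is tighter
# whenever rule lengths differ.
def _example_f1_upper_bounds() -> None:
    rule_lengths = [2, 3, 5]
    single_target_ids_bound = max(rule_lengths) * len(rule_lengths)
    modified_bound = sum(rule_lengths)
    assert single_target_ids_bound == 15
    assert modified_bound == 10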
def evaluate_single_target_mids_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        logger_name: str,
        logger_file_name: str,
        mids_classifier_abs_file_name: str,
        mids_target_attr_to_score_info_abs_file_name: str,
        mids_interpret_stats_abs_file_name: str):
    logger = create_logger(logger_name=logger_name,
                           log_file_name=logger_file_name)

    # --- Load test data -----------------------------------------------------------------------------------------
    # Read in the original (discretized) test data.
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- Load classifier ----------------------------------------------------------------------------------------
    # mids_classifier_abs_file_name = get_mids_clf_abs_file_name(dataset_name, fold_i)
    logger.info(f"start loading MIDS model from {mids_classifier_abs_file_name}")
    mids_classifier: MIDSClassifier = load_mids_classifier(
        mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)

    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    # --- Evaluate and store predictive performance ----------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
        reconstructed_mids, df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")

    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[target_attr]
        logger.info(f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    store_mids_target_attr_to_score_info(
        mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to "
                f"{mids_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics -----------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules),
            df_test_original_column_order,
            target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    store_mids_interpret_stats(mids_interpret_stats_abs_file_name,
                               interpret_stats)
    logger.info(f"Wrote MIDSInterpretabilityStatistics to "
                f"{mids_interpret_stats_abs_file_name}")
    logger.info("---")

    close_logger(logger)
def evaluate_greedy_model_for_dataset_fold_target_attribute(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int):
    logger = create_logger(
        logger_name='evaluate_greedy_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            greedy_models_tree_based_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) + '_greedy_model_evaluation_tree_derived_rules.log'))

    # --- Load test data -----------------------------------------------------------------------------------------
    # Read in the original (discretized) test data.
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- Load classifier ----------------------------------------------------------------------------------------
    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)

    logger.info(f"start loading greedy model from {tree_based_greedy_clf_abs_file_name}")
    greedy_classifier: GreedyRoundRobinTargetRuleClassifier = load_greedy_naive_classifier(
        tree_based_greedy_clf_abs_file_name)
    logger.info("finished loading greedy model")
    logger.info(greedy_classifier)

    # --- Evaluate and store predictive performance ----------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[str, ScoreInfo] = score_mt_clf_on_its_targets_without_nans(
        greedy_classifier, df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated greedy classifier on predictive performance")

    target_attrs: List[TargetAttr] = greedy_classifier.target_attributes
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[target_attr]
        logger.info(f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    tree_based_greedy_clf_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_greedy_clf_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth)
    store_mids_target_attr_to_score_info(
        tree_based_greedy_clf_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(f"Wrote greedy Dict[TargetAttr, ScoreInfo] to"
                f" {tree_based_greedy_clf_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics -----------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(greedy_classifier.learned_rule_set),
            df_test_original_column_order,
            target_attributes=target_attrs)
    logger.info("Evaluated greedy classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    tree_based_greedy_clf_interpret_stats_abs_file_name: str = get_tree_based_greedy_clf_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(
        tree_based_greedy_clf_interpret_stats_abs_file_name, interpret_stats)
    logger.info(f"Wrote InterpretabilityStatistics to "
                f"{tree_based_greedy_clf_interpret_stats_abs_file_name}")
    logger.info("---")

    close_logger(logger)
def fit(self,
        quant_dataframe,
        use_targets_from_rule_set: bool,
        targets_to_use: Optional[List[str]] = None,
        class_association_rules: Optional[Iterable[MCAR]] = None,
        lambda_array: Optional[List[float]] = None,
        algorithm="RDGS",
        cache_cover_checks=True,
        cache_overlap_checks=True,
        default_class_type: DefaultClassStrategy = DefaultClassStrategy.MAJORITY_VALUE_OVER_WHOLE_TRAINING_SET,
        rule_combination_strategy=RuleCombiningStrategy.WEIGHTED_VOTE,
        debug=True,
        objective_scale_factor=1,
        ):
    """
    Run the MIDS object on a dataset.
    """
    type_check_dataframe(quant_dataframe)

    # --- Make sure the ground rule set is initialized -------------------------------------------------------------
    if self.mids_ruleset is None and class_association_rules is not None:
        ids_rules = list(map(MIDSRule, class_association_rules))  # type: List[MIDSRule]
        mids_ruleset = MIDSRuleSet(ids_rules)
    elif self.mids_ruleset is not None:
        print("using provided MIDS rule set and not class association rules")
        mids_ruleset = self.mids_ruleset
    else:
        raise Exception("Neither MCARs nor MIDSRules were provided for fitting")

    # --- Use all targets, or only those occurring in the rule set? ------------------------------------------------
    if targets_to_use is not None:
        targets_to_predict = set(targets_to_use)
        self.targets_to_predict = targets_to_use
    else:
        self.use_targets_from_rule_set = use_targets_from_rule_set
        if use_targets_from_rule_set:
            targets_to_predict: Set[TargetAttr] = mids_ruleset.get_target_attributes()
        else:
            targets_to_predict = set(quant_dataframe.columns)
        self.targets_to_predict = targets_to_predict

    # --- Initialize objective function ----------------------------------------------------------------------------
    objective_function_parameters = ObjectiveFunctionParameters(
        all_rules=mids_ruleset,
        quant_dataframe=quant_dataframe,
        lambda_array=lambda_array,
        target_attributes=list(targets_to_predict))

    # --- Initialize cover checker and overlap checker --------------------------------------------------------------
    if self.cover_checker is None:
        if cache_cover_checks:
            self.cover_checker = CachedCoverChecker(mids_ruleset, quant_dataframe)
        else:
            self.cover_checker = CoverChecker()
    else:
        print("Reusing previously instantiated cover checker of type",
              str(type(self.cover_checker)))

    if self.overlap_checker is None:
        if cache_overlap_checks:
            self.overlap_checker = CachedOverlapChecker(mids_ruleset,
                                                        quant_dataframe,
                                                        self.cover_checker,
                                                        debug=debug)
        else:
            self.overlap_checker = OverlapChecker(self.cover_checker, debug=debug)
    else:
        print("Reusing previously instantiated overlap checker of type",
              str(type(self.overlap_checker)))

    # --- Submodular maximization ------------------------------------------------------------------------------------
    # if len(mids_ruleset) > 0:
    #     pass
    # else:
    #     warnings.warn("Empty rule list was given")

    solution_set: Set[MIDSRule] = self._optimize(
        objective_function_parameters=objective_function_parameters,
        algorithm=algorithm,
        objective_scale_factor=objective_scale_factor,
        debug=debug)

    optimization_meta_data: MIDSOptimizationMetaData = MIDSOptimizationMetaData(
        mids_objective_function=self.objective_function,
        optimization_algorithm=algorithm,
        solution_set_size=len(solution_set))

    self.classifier = MIDSClassifier(solution_set,
                                     quant_dataframe,
                                     list(targets_to_predict),
                                     optimization_meta_data,
                                     default_class_type,
                                     rule_combination_strategy)
    return self