def find_best_single_feature_parameters(self, dataset):
    """Search, feature by feature, for a discretization that raises the mean F-score.

    For every entry of ``dataset.suggested_discretize_features`` the candidate
    parameter lists from ``self.generate_feature_parameters`` are evaluated
    with repeated stratified k-fold runs, using ``self.best_fold[dataset]``
    (index 0 is passed to ``generate_k_folds_stratified``, index 1 is the
    number of repetitions).

    A candidate wins when its mean F-score is strictly greater than the best
    seen so far for that feature; the baseline is ``self.best_fcs[dataset]``.
    The first element of the winning candidate is appended to
    ``self.best_discretize_feature_params[dataset]``. ``self.best_fcs`` itself
    is not modified here.
    """
    for feature in dataset.suggested_discretize_features:
        candidates = self.generate_feature_parameters(feature)
        print(candidates)

        # Each feature starts from the same stored baseline.
        top_mean = self.best_fcs[dataset]
        top_param = None

        for perm_no, candidate in enumerate(candidates, start=1):
            logging.error(
                "[Parameters Tester][{}][{}][Perm {:03d}] Current permutation: {}".format(
                    dataset, feature, perm_no, candidate
                )
            )
            model = DataModel.generate_from_file(dataset, discretize_params=candidate)
            labels = model.get_classes_list()

            scores = []
            for _ in range(self.best_fold[dataset][1]):
                for train_set, test_set in model.generate_k_folds_stratified(
                    self.best_fold[dataset][0]
                ):
                    evaluator = ModelEvaluator(train_set, test_set, labels)
                    evaluator.evaluate()
                    score = evaluator.get_f_score()
                    scores.append(score)
                    # The 1-based run number equals the count of scores so far.
                    logging.error(
                        "[Parameters Tester][{}][{}][Perm {:03d}][{:03d}] FCS: {}".format(
                            dataset, feature, perm_no, len(scores), score
                        )
                    )

            mean_score = sum(scores) / len(scores)
            logging.error(
                "[Parameters Tester][{}][{}][Perm {:03d}] Best FCS: {}, Mean FCS {}".format(
                    dataset, feature, perm_no, max(scores), mean_score
                )
            )

            if mean_score > top_mean:
                top_param = candidate[0]
                top_mean = mean_score

        if top_param is not None:
            self.best_discretize_feature_params[dataset].append(top_param)

        logging.error(
            "[Parameters Tester][{}][{}] Best mean FCS: {}, Best parameters: {}".format(
                dataset, feature, top_mean, top_param
            )
        )
def main():
    """Load the PIMA diabetes dataset with three discretized features and print the k-means models.

    'Age', 'SkinThickness' and 'Pregnancies' are each given a
    ``DiscretizeParam`` using ``kbins_discretize`` with 10 as the third
    argument; the data model is generated with ``smooth=True``. Afterwards
    ``Discretizer.kmean_models`` is printed.
    """
    feature_names = ('Age', 'SkinThickness', 'Pregnancies')
    params = [
        DiscretizeParam(feature_name, kbins_discretize, 10)
        for feature_name in feature_names
    ]
    model = DataModel.generate_from_file(
        PIMA_DIABETES_DATASET,
        smooth=True,
        discretize_params=params,
    )
    print(Discretizer.kmean_models)
def find_best_fold(self, dataset):
    """Evaluate every entry of ``FOLDS`` on ``dataset`` and remember the best one.

    Each fold entry is indexed as ``fold[0]`` (passed to
    ``generate_k_folds_stratified``) and ``fold[1]`` (number of repetitions).
    The mean F-score of every entry is recorded through
    ``self.append_result`` with ``'permutation'`` set to -1. When the mean is
    strictly greater than ``self.best_fcs[dataset]``, both
    ``self.best_fold[dataset]`` and ``self.best_fcs[dataset]`` are updated.
    """
    model = DataModel.generate_from_file(dataset)
    labels = model.get_classes_list()

    for fold in FOLDS:
        scores = []
        for _ in range(fold[1]):
            for train_set, test_set in model.generate_k_folds_stratified(fold[0]):
                evaluator = ModelEvaluator(train_set, test_set, labels)
                evaluator.evaluate()
                score = evaluator.get_f_score()
                scores.append(score)
                # The 1-based run number equals the count of scores so far.
                logging.error(
                    "[Parameters Tester][{}][CV{:02d}][{:03d}] FCS: {}".format(
                        dataset, fold[0], len(scores), score
                    )
                )

        mean_score = sum(scores) / len(scores)
        logging.error(
            "[Parameters Tester][{}][CV{:02d}] Best FCS: {}, Mean FCS {}".format(
                dataset, fold[0], max(scores), mean_score
            )
        )

        record = {
            'dataset': dataset.name,
            'fold': fold[0],
            'f_score': mean_score,
            'permutation': -1,
        }
        self.append_result(record)

        if mean_score > self.best_fcs[dataset]:
            self.best_fold[dataset] = fold
            self.best_fcs[dataset] = mean_score

    logging.error(
        "[Parameters Tester][{}] Best mean FCS: {}, Best fold: {}".format(
            dataset, self.best_fcs[dataset], self.best_fold[dataset]
        )
    )
def find_best_parameters(self, dataset):
    """Evaluate every permutation of discretization parameters for ``dataset``.

    Permutations come from ``self.generate_permutations(dataset)``. Each one
    is scored with repeated stratified k-fold runs driven by
    ``self.best_fold[dataset]`` (index 0 is passed to
    ``generate_k_folds_stratified``, index 1 is the number of repetitions).

    One result row per parameter in the permutation is recorded through
    ``self.append_result``. When the mean F-score is strictly greater than
    ``self.best_fcs[dataset]``, the permutation is stored in
    ``self.best_discretize_parameters[dataset]`` and the best score updated.
    """
    candidates = self.generate_permutations(dataset)

    for perm_no, candidate in enumerate(candidates, start=1):
        logging.error(
            "[Parameters Tester][{}][Perm {:08d}] Current permutation: {}".format(
                dataset, perm_no, candidate
            )
        )
        model = DataModel.generate_from_file(dataset, discretize_params=candidate)
        labels = model.get_classes_list()

        scores = []
        for _ in range(self.best_fold[dataset][1]):
            for train_set, test_set in model.generate_k_folds_stratified(
                self.best_fold[dataset][0]
            ):
                evaluator = ModelEvaluator(train_set, test_set, labels)
                evaluator.evaluate()
                score = evaluator.get_f_score()
                scores.append(score)
                # The 1-based run number equals the count of scores so far.
                logging.error(
                    "[Parameters Tester][{}][Perm {:08d}][{:03d}] FCS: {}".format(
                        dataset, perm_no, len(scores), score
                    )
                )

        mean_score = sum(scores) / len(scores)
        logging.error(
            "[Parameters Tester][{}][Perm {:08d}] Best FCS: {}, Mean FCS {}".format(
                dataset, perm_no, max(scores), mean_score
            )
        )

        # One row per discretized feature, all sharing this permutation's mean.
        for param in candidate:
            record = {
                'dataset': dataset.name,
                'fold': self.best_fold[dataset][0],
                'f_score': mean_score,
                'permutation': perm_no,
                'feature': param.feature_name,
                'function': param.discretize_function.__name__,
                'bins': param.buckets_amount,
            }
            self.append_result(record)

        if mean_score > self.best_fcs[dataset]:
            self.best_discretize_parameters[dataset] = candidate
            self.best_fcs[dataset] = mean_score

    logging.error(
        "[Parameters Tester][{}] Best mean FCS: {}, Best parameters: {}".format(
            dataset, self.best_fcs[dataset], self.best_discretize_parameters[dataset]
        )
    )
def visualize(dataset):
    """Load ``dataset`` into a data model and hand it to ``visualize_histograms``.

    The second argument passed to ``visualize_histograms`` is the string
    ``'histograms-'`` followed by ``str(dataset)``.
    """
    model = DataModel.generate_from_file(dataset)
    label = 'histograms-{}'.format(str(dataset))
    visualize_histograms(model, label)