def main():
    """Train a classifier, then round-trip its best fuzzy system via TFF.

    TFF (Trefle Fuzzy system File) is a JSON representation of the fuzzy
    system. It is meant to be saved on disk or consumed by the LFA Toolbox
    (https://github.com/krypty/lfa_toolbox).
    """
    set_random_seed()
    X_test, X_train, y_test, y_train = get_dataset()
    clf = run_and_get_classifier(X_test, X_train, y_test, y_train)

    def report_score(fuzzy_system):
        # Predict on the test set, binarize the raw outputs and print the score.
        # In the future it could be possible to call clf.predict_classes()
        # directly instead (see issue #1).
        raw_predictions = fuzzy_system.predict(X_test)
        print_score(round_to_cls(raw_predictions, n_classes=2), y_test)

    serialized = clf.get_best_fuzzy_system_as_tff()
    print(serialized)

    # Export: persist the fuzzy model to disk
    with open("my_saved_model.tff", mode="w") as model_file:
        model_file.write(serialized)

    # Import from the file that was just written
    report_score(TrefleFIS.from_tff_file("my_saved_model.tff"))

    # Import straight from the in-memory string
    report_score(TrefleFIS.from_tff(serialized))
def predict_classes(self, X):
    """Predict with ``self.predict`` and round each class-valued column.

    Column ``i`` of the prediction is rounded with ``round_to_cls`` using
    ``self.n_classes_per_cons[i]`` classes. Columns whose class count is
    not positive (continuous variables) are left untouched.

    :param X: samples forwarded unchanged to ``self.predict``
    :return: the prediction array, with class columns rounded in place
    """
    predictions = self.predict(X)
    for column, class_count in enumerate(self.n_classes_per_cons):
        if not (class_count > 0):
            # continuous variable: keep the raw value
            continue
        predictions[:, column] = round_to_cls(predictions[:, column], class_count)
    return predictions
def test_distribution_between_multiclass_output_should_be_equal():
    """12 evenly spaced values in [0, 3] must split 3-per-class over 4 classes."""
    n_classes = 4
    samples_per_class = 3

    raw_outputs = (n_classes - 1) * np.linspace(0, 1, 12)
    rounded = round_to_cls(raw_outputs, n_classes=n_classes)

    # [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
    expected = [
        cls for cls in range(n_classes) for _ in range(samples_per_class)
    ]
    assert_array_equal(rounded, expected)
def fit(y_true, y_pred):
    """Accuracy fitness for one-hot-encoded targets.

    ``y_pred`` holds floats in [0, n_classes-1], so it has to be binarized
    with ``round_to_cls()`` before the accuracy metric can be applied.

    Warning: because the target has been one-hot-encoded, every consequent
    is a binary class, hence ``n_classes=2`` rather than the number of
    original classes.
    """
    return accuracy_score(y_true, round_to_cls(y_pred, n_classes=2))
def main():
    """Train and score a TrefleClassifier on one-hot-encoded Iris targets."""
    # Seed NumPy's and Python's global random generators
    np.random.seed(0)
    random.seed(0)

    # Load the dataset; the integer target of shape (150,) is turned into
    # a one-hot matrix of shape (150, 3)
    iris = load_iris()
    X = iris["data"]
    y = create_one_hot_from_array(iris["target"])

    # Hold out a third of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    def fit(y_true, y_pred):
        # Fitness function handed to the classifier.
        # y_pred are floats in [0, n_classes-1]; binarize them with
        # round_to_cls() so that the accuracy metric can be used.
        # Warning: the target is one-hot-encoded, so each consequent is a
        # binary class -> n_classes=2, not the number of original classes.
        return accuracy_score(y_true, round_to_cls(y_pred, n_classes=2))

    classifier_settings = {
        # at least 1 rule per class is needed for a one-hot-encoded
        # problem, hence 3 rules
        "n_rules": 3,
        # 3 consequents with 2 classes each
        "n_classes_per_cons": [2, 2, 2],
        # 4 labels: LOW, MEDIUM, HIGH, VERY HIGH
        "n_labels_per_mf": 4,
        # the default rule yields the class 2
        "default_cons": [0, 0, 1],
        # use the 4 iris variables (PL, PW, SL, SW)
        "n_max_vars_per_rule": 4,
        "n_generations": 30,
        "fitness_function": fit,
        "verbose": True,
    }
    clf = TrefleClassifier(**classifier_settings)

    # Train
    clf.fit(X_train, y_train)

    # Predict raw values, then binarize them manually
    y_pred = round_to_cls(clf.predict(X_test), n_classes=2)

    clf.print_best_fuzzy_system()

    # Evaluate accuracy.
    # Important: the fitness can be different from the scoring function.
    score = accuracy_score(y_test, y_pred)
    print("Score on test set: {:.3f}".format(score))
def test_predict_classes_should_return_same_results_as_predict_plus_manual_round():
    """Class predictions must equal raw predictions rounded to 3 classes."""
    X_train, X_test, y_train, y_test = get_sample_data()
    classifier = get_trefle_classifier_instance(X_train, X_test, y_train, y_test)

    # Round the raw output by hand before asking for the class predictions
    manually_rounded = round_to_cls(classifier.predict_X_test(), n_classes=3)

    predicted_classes = classifier.predict_X_test_classes()
    print(predicted_classes)

    assert_array_equal(predicted_classes, manually_rounded)
def getConfusionMatrixValues(y_true, y_pred):
    """
    Return the four cells of the binary confusion matrix.

    The predictions are first binarized with ``round_to_cls(n_classes=2)``
    and then compared against the true labels.

    :param y_true: True labels (assumed to be encoded as 0/1 -- confirm
        against callers)
    :param y_pred: Raw predictions produced by the algorithm
    :type y_true: [[int]] - required
    :type y_pred: [[int]] - required
    :return: The confusion matrix cells, in the order (tn, fp, fn, tp)
    :rtype: tuple
    """
    y_pred_bin = round_to_cls(y_pred, n_classes=2)
    # Pin the label set: without it the matrix shrinks to 1x1 when only one
    # class occurs in both y_true and the predictions, and the 4-way
    # unpacking below raises ValueError.
    matrix = confusion_matrix(y_true, y_pred_bin, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()
    return tn, fp, fn, tp
def fit(y_true, y_pred):
    """Return the accuracy of ``y_pred`` once thresholded to two classes."""
    return accuracy_score(y_true, round_to_cls(y_pred, n_classes=2))
def run():
    """Train a TrefleClassifier on the breast cancer dataset and report scores."""
    import numpy as np
    import random

    np.random.seed(6)
    random.seed(6)

    # Load the dataset and pull out names, targets and features
    dataset = load_breast_cancer()

    print("target names", dataset["target_names"])
    # the target is reshaped into a single column
    y = dataset["target"].reshape(-1, 1)

    print("features names", dataset["feature_names"])
    X = dataset["data"]

    # Hold out a third of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    def fit(y_true, y_pred):
        # Fitness: accuracy of the predictions thresholded to two classes
        return accuracy_score(y_true, round_to_cls(y_pred, n_classes=2))

    # Configure the classifier
    clf = TrefleClassifier(
        fitness_function=fit,
        n_rules=3,
        n_classes_per_cons=[2],
        default_cons=[1],
        n_max_vars_per_rule=3,
        n_labels_per_mf=3,
        n_generations=10,
        pop_size=100,
        n_lv_per_ind_sp1=40,
        dc_weight=1,
        verbose=True,
    )

    # Train
    clf.fit(X_train, y_train)

    # Raw (not yet thresholded) predictions
    y_pred = clf.predict(X_test)

    clf.print_best_fuzzy_system()

    # Serialize the best fuzzy system, then rebuild it from the string
    serialized = clf.get_best_fuzzy_system_as_tff()
    print(serialized)

    restored_fis = TrefleFIS.from_tff(serialized)
    restored_fis.describe()

    # Evaluate on the held-out set
    print("Simple run score: ")
    y_pred_bin = round_to_cls(y_pred, n_classes=2)
    print("acc", accuracy_score(y_test, y_pred_bin))
    print(classification_report(y_test, y_pred_bin))
def get_recall_and_precision_score(y_true, y_pred):
    """Return ``(recall, precision)`` of the predictions binarized to 2 classes."""
    binarized = round_to_cls(y_pred, n_classes=2)
    return recall_score(y_true, binarized), precision_score(y_true, binarized)
def evaluate(y_true, y_pred):
    """Accuracy of the predictions after binarization.

    ``y_pred`` holds floats in [0, n_classes-1]; ``round_to_cls()`` turns
    them into class labels so that the accuracy metric can be applied.
    """
    return accuracy_score(y_true, round_to_cls(y_pred, n_classes=2))
def test_distribution_between_binary_outputs_should_be_equal():
    """11 evenly spaced values in [0, 1] must round to six 0s then five 1s."""
    rounded = round_to_cls(np.linspace(0, 1, 11), n_classes=2)

    expected = [0] * 6 + [1] * 5
    assert_array_equal(rounded, expected)
def _fitness(y_true, y_pred):
    """Weighted average of binary-classification metrics.

    The raw predictions are binarized with ``round_to_cls(n_classes=2)``
    and compared against ``y_true``. Each metric is multiplied by its
    weight (``acc_w``, ``sen_w``, ``spe_w``, ``f1_w``, ``ppv_w``,
    ``npv_w``, ``fpr_w``, ``fnr_w``, ``fdr_w``, ``mse_w`` -- free variables
    resolved outside this function) and the weighted sum is divided by the
    sum of the weights.

    Source of formulae:
    https://en.wikipedia.org/wiki/Sensitivity_and_specificity

    :param y_true: true labels (assumed to be encoded as 0/1 -- confirm
        against callers)
    :param y_pred: raw predictions; the MSE term uses them unrounded
    :return: the weighted average, or 0 when the total weight is
        (almost) zero
    """
    tot_w = 0
    fit = 0

    y_pred_bin = round_to_cls(y_pred, n_classes=2)
    # Pin the label set: without it the matrix shrinks to 1x1 when only one
    # class occurs in both y_true and the predictions, and the 4-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(
        y_true, y_pred_bin, labels=[0, 1]
    ).ravel()

    # Some metrics are set to 0 (np.nan_to_num) because we want to avoid
    # non-finite numbers, e.g. when dividing by 0. "invalid" has to be
    # silenced as well to cover the 0/0 case. Be careful: this only works
    # for "the higher the better" metrics.
    with np.errstate(divide="ignore", invalid="ignore"):
        # accuracy
        # no need to clip, the denominator is expected to be > 0
        acc = (tp + tn) / (tp + fp + fn + tn)
        fit += acc_w * acc
        tot_w += acc_w

        # sensitivity
        sen = np.nan_to_num(tp / (tp + fn))
        fit += sen_w * sen
        tot_w += sen_w

        # specificity
        spe = np.nan_to_num(tn / (tn + fp))
        fit += spe_w * spe
        tot_w += spe_w

        # f1score, ignore ill-defined value, it will be set to 0
        with catch_warnings():
            filterwarnings("ignore", category=UndefinedMetricWarning)
            f1 = f1_score(y_true, y_pred_bin)
        fit += f1_w * f1
        tot_w += f1_w

        # PPV
        # nan (tp + fp == 0, i.e. no positive prediction) is mapped to 0
        ppv = np.nan_to_num(tp / (tp + fp))
        fit += ppv_w * ppv
        tot_w += ppv_w

        # NPV
        # nan (tn + fn == 0, i.e. no negative prediction) is mapped to 0.
        # note: it is computed from scratch rather than derived from PPV,
        # because PPV could have been set to 0 due to a nan result.
        npv = np.nan_to_num(tn / (tn + fn))
        fit += npv_w * npv
        tot_w += npv_w

        # FPR
        fpr = 1 - spe
        fit += fpr_w * fpr
        tot_w += fpr_w

        # FNR
        fnr = 1 - sen
        fit += fnr_w * fnr
        tot_w += fnr_w

        # FDR
        fdr = 1 - ppv
        fit += fdr_w * fdr
        tot_w += fdr_w

        # MSE, negated so that a smaller error contributes a larger value;
        # computed on the raw (non-binarized) predictions
        mse = -mean_squared_error(y_true, y_pred)
        fit += mse_w * mse
        tot_w += mse_w

    # handle zero-division
    return 0 if abs(tot_w) < 1e-6 else fit / tot_w