예제 #1
0
def _ada_boost_classification_train(table,
                                    feature_cols,
                                    label_col,
                                    max_depth=1,
                                    n_estimators=50,
                                    learning_rate=1.0,
                                    algorithm='SAMME.R',
                                    random_state=None):
    """Fit an AdaBoost classifier over decision trees and wrap it in a model dict.

    Args:
        table: Data indexed by column name; ``table[feature_cols]`` is used as
            the training matrix and ``table[label_col]`` as the target.
        feature_cols: Names of the feature columns.
        label_col: Name of the label column.
        max_depth: ``max_depth`` of the ``DecisionTreeClassifier`` base estimator.
        n_estimators: Forwarded to ``AdaBoostClassifier``.
        learning_rate: Forwarded to ``AdaBoostClassifier``.
        algorithm: Forwarded to ``AdaBoostClassifier``.
        random_state: Forwarded to ``AdaBoostClassifier``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted classifier, its
        ``get_params()`` output, the training arguments, a markdown report
        and a feature-importance table.
    """
    x_train = table[feature_cols]
    y_train = table[label_col]

    base_estimator = DecisionTreeClassifier(max_depth=max_depth)

    # Only the base estimator is passed positionally: current scikit-learn
    # releases accept the remaining constructor arguments by keyword only.
    classifier = AdaBoostClassifier(base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)
    classifier.fit(x_train, y_train)

    feature_importance = classifier.feature_importances_
    sklearn_params = classifier.get_params()

    model = _model_dict('ada_boost_classification_model')
    model['parameters'] = sklearn_params
    model['classifier'] = classifier
    model['params'] = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'algorithm': algorithm,
        'random_state': random_state
    }

    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)
    # Kept under its own name so it cannot be confused with model['params'].
    list_parameters = dict2MD(sklearn_params)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## AdaBoost Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=list_parameters)))

    model['_repr_brtc_'] = rb.get()
    model['feature_importance_table'] = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_cols, feature_importance)],
        columns=['feature_name', 'importance'])
    return {'model': model}
예제 #2
0
def search_bestparam_AdaBoostClassifier(X, y, df_search_best_param):
    """Run ``search_bestparam`` for a default-constructed ``AdaBoostClassifier``.

    The grid covers four ``n_estimators`` values and both boosting
    algorithms. ``X``, ``y`` and ``df_search_best_param`` are passed through
    to ``search_bestparam`` unchanged.
    """
    print("Search best params for AdaBoostClassifier ...")
    estimator = AdaBoostClassifier()
    print("Supported params", estimator.get_params())

    grid = dict(
        n_estimators=[1, 10, 100, 1000],
        algorithm=['SAMME', 'SAMME.R'],
    )
    search_bestparam(estimator, grid, X, y, df_search_best_param)
예제 #3
0
class AdaBoost(ClassicalModel):
    """``ClassicalModel`` backed by a scikit-learn ``AdaBoostClassifier``.

    Extra keyword arguments are forwarded to the ``AdaBoostClassifier``
    constructor. ``name`` embeds the classifier's ``get_params()`` output.
    """

    def __init__(self, input_size, output_size, labels, class_weights=None,
                 **kwargs):
        super().__init__(input_size, output_size, labels, class_weights)
        classifier = AdaBoostClassifier(**kwargs)
        self.model = classifier
        hyperparams = classifier.get_params()
        self.name = "AdaBoost Classifier: \n" + str(hyperparams)
예제 #4
0
    def test_folder_name(self):
        """``util.folder_name`` returns base/class name/category/params path."""
        base = "/hello/world/"
        category = "testing"

        clf = AdaBoostClassifier(n_estimators=23)
        clf.base_estimator.max_depth = 42

        # The trailing path component is derived from the estimator's params.
        expected = os.path.join("/hello/world/AdaBoostClassifier/testing/",
                                util.params_to_path(clf.get_params()))

        self.assertEqual(util.folder_name(base, category, clf), expected)
def adaboostClassifier(X_train, X_test, y_train, y_test):
    """Fit an AdaBoost classifier and print its F1 score and hyperparameters.

    The model is trained on ``X_train``/``y_train`` and scored with
    ``f1_score`` on its predictions for ``X_test``. Nothing is returned.
    """
    print("adaboost")
    booster = AdaBoostClassifier(random_state=1, learning_rate=0.404)
    booster.fit(X_train, y_train)

    predictions = booster.predict(X_test)
    print(f1_score(y_test, predictions))

    # Show the hyperparameters of the fitted AdaBoost model.
    print('Parameters currently in use:\n')
    print(booster.get_params())
예제 #6
0
class _AdaBoostClassifierImpl:
    """Thin wrapper that delegates to an ``SKLModel`` instance.

    ``base_estimator`` may be a Lale individual operator; in that case the
    wrapped model receives the operator's underlying implementation while
    ``get_params`` keeps reporting the operator that was passed in.
    """

    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        algorithm="SAMME.R",
        random_state=None,
    ):
        unwrapped = base_estimator
        if isinstance(base_estimator, lale.operators.Operator):
            if not isinstance(base_estimator, lale.operators.IndividualOp):
                raise ValueError(
                    "If base_estimator is a Lale operator, it needs to be an individual operator. "
                )
            unwrapped = base_estimator._impl_instance()
            # Prefer the implementation's own wrapped model when it has one.
            inner = getattr(unwrapped, "_wrapped_model", None)
            if inner is not None:
                unwrapped = inner

        sk_kwargs = {
            "base_estimator": unwrapped,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "algorithm": algorithm,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**sk_kwargs)
        # Remember the argument as given, not the unwrapped implementation.
        self._hyperparams = dict(sk_kwargs, base_estimator=base_estimator)

    def get_params(self, deep=True):
        """Return the wrapped model's params with the original base_estimator."""
        params = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        params["base_estimator"] = self._hyperparams["base_estimator"]
        return params

    def fit(self, X, y=None):
        """Fit the wrapped model (omitting ``y`` when it is None); return self."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        """Delegate to the wrapped model's ``score``."""
        return self._wrapped_model.score(X, y, sample_weight)
예제 #7
0
def randomForest(train_bow_tf_idf, train_labels, bow_test_tf_idf, test_labels):
    """Fit a 100-estimator ``AdaBoostClassifier`` and print train/test accuracy.

    NOTE(review): the function name and the printed text say "Random Forest",
    but the model built here is AdaBoost; the text is kept as-is.

    Returns:
        The fitted classifier.
    """
    model = AdaBoostClassifier(n_estimators=100)
    model.fit(train_bow_tf_idf, train_labels)

    print()
    print('------- Random Forest -------')
    print('Default hyperparameters:')
    print(model.get_params())

    # Accuracy is the fraction of predictions equal to the true label.
    train_accuracy = (model.predict(train_bow_tf_idf) == train_labels).mean()
    print('Random Forest train accuracy = {}'.format(train_accuracy))

    test_accuracy = (model.predict(bow_test_tf_idf) == test_labels).mean()
    print('Random Forest test accuracy = {}'.format(test_accuracy))

    return model
예제 #8
0
class SklearnBDT(SklearnDT):
    """Boosted decision tree: AdaBoost over a configured decision tree.

    Tree and boosting hyperparameters are read from the 'bdt' configuration
    (sections 'decision tree' and 'adaboost').
    """

    def __init__(self):

        SklearnDT.__init__(self)

        self.boosting = 'adaptive'

        self.config = self.load_config('bdt')

        # `presort` and `min_impurity_split` used to be passed here at their
        # default values; both have been removed from scikit-learn, so they
        # are omitted to keep the constructor call valid on current releases.
        self.classifier = AdaBoostClassifier(
            dtree.DecisionTreeClassifier(
                criterion=self.config.get('decision tree', 'criterion'),
                splitter=self.config.get('decision tree', 'splitter'),
                max_depth=self.config.getint('decision tree', 'max_depth'),
                min_samples_split=self.config.getint('decision tree',
                                                     'min_samples_split'),
                min_samples_leaf=self.config.getint('decision tree',
                                                    'min_samples_leaf'),
                min_weight_fraction_leaf=0.0,
                max_features=None,
                random_state=None,
                max_leaf_nodes=None,
                min_impurity_decrease=0.0,
                class_weight=None),
            algorithm=self.config.get('adaboost', 'algorithm'),
            n_estimators=int(self.config.getint('adaboost', 'n_estimators')),
            learning_rate=self.config.getfloat('adaboost', 'learning_rate'),
            random_state=None)

    def show(self):
        """Print every classifier parameter except 'base_estimator'."""

        dt_options = self.classifier.get_params()

        print('-' * gl.screenwidth)
        print('--- Boosted decision tree options: ' + self.boosting)
        print('-' * gl.screenwidth)
        for name, value in dt_options.items():
            # Compare by value: identity checks against a string literal are
            # unreliable and raise a SyntaxWarning on Python 3.8+.
            if name != 'base_estimator':
                print('--- {:50s} {:s}'.format(name, str(value)))
        print('-' * gl.screenwidth)

    def eval(self, data):
        """Return the classifier's ``decision_function`` output for ``data``."""

        return self.classifier.decision_function(data)
예제 #9
0
def cross_validation(X, y):
    """Grid-search tree depth and min_samples_split for an AdaBoost ensemble.

    Every (depth, split) combination is fitted on the training part of a
    fixed hold-out split and scored on both parts; progress is printed.

    Args:
        X: Feature matrix.
        y: Labels, same length as ``X``.

    Returns:
        ``get_params()`` of the model with the highest test score, or None
        if no model scored above 0.
    """
    assert len(y) == len(X)
    # test_size=0.8: 20% of the samples are used for fitting, 80% for scoring.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=42)

    depth = [8, 16, 32, 64]
    # NOTE(review): newer scikit-learn rejects an integer min_samples_split
    # below 2 -- confirm the target version before keeping the value 1.
    split = [1, 2, 4, 8, 16, 32, 64]

    best_score = 0
    best_train_score = 0
    best_param = None
    for d in depth:
        for s in split:
            estimator = DecisionTreeClassifier(max_features='sqrt',
                                               max_depth=d,
                                               min_samples_split=s)
            model = AdaBoostClassifier(n_estimators=500,
                                       base_estimator=estimator)
            model = model.fit(X_train, y_train)

            # Single-argument print(...) prints the same on Python 2 and 3.
            print("Depth: %d  split: %d" % (d, s))
            print("Model trainning score:")
            score_train = model.score(X_train, y_train)
            print(score_train)
            print("Model test score:")
            score_test = model.score(X_test, y_test)
            print(score_test)

            if score_test > best_score:
                best_score = score_test
                best_train_score = score_train
                best_param = model.get_params()

    print("==================")
    print(best_train_score)
    print(best_score)
    print(best_param)
    return best_param
예제 #10
0
    print "Validation set score: ERF " , clf_etree.score(X_val, y_val)

    clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME", n_estimators=500, random_state=74494, learning_rate=0.8) 
    clf_boost.fit(X_train, y_train)
    print "Validation set score: ABOOST " , clf_boost.score(X_val, y_val)


    #clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2) 
    #clf_gboost.fit(X_train, y_train)
    #print "Validation set score:LR " , clf_gboost.score(X_val, y_val)


    print "Classifier:"
    print clf, clf.get_params()
    print clf_etree, clf_etree.get_params()
    print clf_boost, clf_boost.get_params()
    

    if(fe==1): #L1 norm based feature elimination
        clf_fe = LogisticRegression(C=1000,penalty='l1',random_state=0)
        clf_fe.fit(X_train, y_train)
        X_train = X_train[:,clf_fe.coef_.ravel()!=0]
        print "Xtrain.shape: ", X_train.shape
        X_val = X_val[:,clf_fe.coef_.ravel()!=0]

        clf2_l = svm.SVC(kernel='linear', C=reg)
        clf2_l.fit(X_train, y_train)
        print "Lasso Validation set score filtered coeff linear: " , clf2_l.score(X_val, y_val)
        clf2 = svm.SVC(kernel='rbf', C=reg, gamma=g)
        clf2.fit(X_train, y_train)
        print "Lasso Validation set score filtered coeff: " , clf2.score(X_val, y_val)
class RandomForestAdaRandSearch(object):
    '''This class does the actual work in the following steps:
     * define smaller data frames: database, man_add, transform
     * split the data into training and test set
     * set up and run a randomized search for the best parameters to define a random forest
     * create a new random forest with the best parameters
     * predict with this new random forest on the test data and the cross-validated training data
     * analyse the predictions with graphs and stats
  '''
    def __init__(self, metrix, output_dir):
        self.metrix = metrix
        self.output_dir = output_dir
        self.prepare_metrix_data()
        self.split_data()
        self.forest_best_params()
        self.predict()
        self.analysis()

    def prepare_metrix_data(self):
        '''Function to create smaller dataframe.
    ******
    Input: large data frame
    Output: smaller dataframe
    '''
        print("*" * 80)
        print("*    Preparing input dataframe")
        print("*" * 80)

        columns = [
            "anomalousCC", "anomalousslope", "lowreslimit", "f", "diffF",
            "diffI", "autobuild_success"
        ]

        self.data = self.metrix[columns]

        logging.info(f"Using dataframe with column labels {columns}")

###############################################################################
#
#  creating training and test set
#
###############################################################################

    def split_data(self):
        '''Split features and labels into a stratified 80/20 train/test set.

        Labels come from ``self.metrix["autobuild_success"]`` and features
        from six columns of ``self.data``. The four parts are stored as
        ``self.X_train``, ``self.X_test``, ``self.y_train`` and
        ``self.y_test``; the test parts are also written as CSV files to
        ``self.output_dir`` and all four shapes are logged.
        '''
        banner = "*" * 80
        print(banner)
        print("*    Splitting data into test and training set with test=20%")
        print(banner)

        feature_names = [
            "anomalousCC", "anomalousslope", "lowreslimit", "f", "diffF",
            "diffI"
        ]
        y = self.metrix["autobuild_success"]
        X = self.data[feature_names]

        # Stratify on the label so both parts keep the class proportions.
        parts = train_test_split(X,
                                 y,
                                 test_size=0.2,
                                 random_state=42,
                                 stratify=y)
        self.X_train, self.X_test, self.y_train, self.y_test = parts

        np.savetxt(os.path.join(self.output_dir, "y_test.csv"),
                   self.y_test,
                   delimiter=",")
        np.savetxt(os.path.join(self.output_dir, "X_test.csv"),
                   self.X_test,
                   delimiter=",")

        # NOTE(review): the message text says "test data" for all four
        # shapes, including the two training ones.
        logging.info(f"Shape of test data X_train {self.X_train.shape}")
        logging.info(f"Shape of test data X_test {self.X_test.shape}")
        logging.info(f"Shape of test data y_train {self.y_train.shape}")
        logging.info(f"Shape of test data y_test {self.y_test.shape}")

###############################################################################
#
#  optional step of over/undersampling if there is a large mis-match between classes
#
###############################################################################

#the weight distribution for the classes used by "class_weight" weights = {0:0.1, 1:0.9}

#print('*' *80)
#print('*    Applying Over/Undersampling and SMOTE')
#print('*' *80)

#oversample = RandomOverSampler(sampling_strategy = 'minority')
#oversample = RandomOverSampler(sampling_strategy = 0.1)
#oversample = SMOTE(sampling_strategy = 0.3, random_state=28)
# fit and apply the transform
#X_over, y_over = oversample.fit_resample(self.X_newdata_transform_train, self.y_train)

#undersample = RandomUnderSampler(sampling_strategy=0.7)
#X_over, y_over = undersample.fit_resample(X_over, y_over)
#self.X_over = X_over
#self.y_over = y_over

###############################################################################
#
#  creating classifier with best parameter from IUCrJ publication
#
###############################################################################

    def forest_best_params(self):
        '''Build, fit, evaluate and save the AdaBoost decision-tree classifier.

        Steps, in order:
          * create a ``DecisionTreeClassifier`` wrapped in an
            ``AdaBoostClassifier`` with fixed hyperparameters and fit it on
            ``self.X_train`` / ``self.y_train``
          * bootstrap the training data to log a 95% confidence interval of
            the test-set accuracy and save a histogram of the scores
          * log and plot feature importances
          * dump the fitted classifier with ``joblib``
          * log 3-fold cross-validation scores on the training data

        All files are written to ``self.output_dir``; the fitted classifier
        is stored in ``self.tree_clf2_new_rand``.
        '''
        print("*" * 80)
        print(
            "*    Building new forest based on best parameter combination and save as pickle"
        )
        print("*" * 80)

        # a blank decision tree with Ada Boost that can be used for hyperparameter search
        # when starting from scratch
        #    clf2 = DecisionTreeClassifier(**self.best_params_base_estimator,
        #                                  random_state= 0)
        #    self.tree_clf2_new_rand = AdaBoostClassifier(clf2,
        #                                                 **self.best_params_ada,
        #                                                 algorithm ="SAMME.R",
        #                                                 random_state=100)

        # hyperparameters as were used for the classifier published in IUCrJ; this was first run
        # in deployment with really bad performance;
        # the saved model is named: 2019 calibrated_classifier_20190501_1115.pkl
        clf2 = DecisionTreeClassifier(criterion="entropy",
                                      max_depth=3,
                                      max_features=2,
                                      max_leaf_nodes=17,
                                      min_samples_leaf=8,
                                      min_samples_split=18,
                                      random_state=0,
                                      class_weight="balanced")
        self.tree_clf2_new_rand = AdaBoostClassifier(clf2,
                                                     learning_rate=0.6355,
                                                     n_estimators=5694,
                                                     algorithm="SAMME.R",
                                                     random_state=5)

        # hyperparameters for a new classifier; this one was found after adding some user data
        # from run1 2020 to the training data; this one is now running in the automated data
        # analysis pipelines; the saved model is named: calibrated_classifier_20200408_1552.pkl
        #    clf2 = DecisionTreeClassifier(criterion="entropy",
        #                                  max_depth=5,
        #                                  max_features=2,
        #                                  max_leaf_nodes=15,
        #                                  min_samples_leaf=5,
        #                                  min_samples_split=3,
        #                                  random_state= 0,
        #                                  class_weight = "balanced")
        #    self.tree_clf2_new_rand = AdaBoostClassifier(
        #                                           clf2,
        #                                           learning_rate=0.6846,
        #                                           n_estimators=4693,
        #                                           algorithm ="SAMME.R",
        #                                           random_state=5)

        classifier_params = self.tree_clf2_new_rand.get_params()
        print(classifier_params)

        self.tree_clf2_new_rand.fit(self.X_train, self.y_train)

        logging.info(
            f"Created classifier based on IUCrJ publication and fitted training data.\n"
            f"Classifier parameters: {classifier_params}")

        # ---------------------------------------------------------------
        #  Bootstrapping to find the 95% confidence interval
        # ---------------------------------------------------------------
        print("*" * 80)
        print(
            "*    Calculating confidence interval for best decisiontree with AdaBoost"
        )
        print("*" * 80)

        def bootstrap_calc(data_train, data_test, train_labels, test_labels,
                           found_model):
            '''Log a 95% CI of test accuracy over bootstrap refits of the model.'''
            # Local import keeps this fix self-contained.
            from sklearn.base import clone

            # configure bootstrap
            n_iterations = 1000
            n_size = len(data_train)

            # run bootstrap
            stats = []
            for _ in range(n_iterations):
                # Resample rows and labels together so that every drawn
                # sample keeps its own label.
                train_boot, labels_boot = resample(data_train,
                                                   train_labels,
                                                   n_samples=n_size)
                # Fit an unfitted copy: refitting `found_model` itself would
                # leave the caller's classifier trained on the last
                # bootstrap sample instead of the full training set.
                model = clone(found_model)
                model.fit(train_boot, labels_boot)
                # evaluate model
                predictions = model.predict(data_test)
                stats.append(accuracy_score(test_labels, predictions))

            # plot scores
            plt.hist(stats)
            plt.savefig(os.path.join(self.output_dir,
                                     "bootstrap_hist_ada.png"),
                        dpi=600)
            plt.close()

            # confidence interval: percentiles cutting off (1 - alpha) / 2
            # on each side, clamped to the valid accuracy range [0, 1]
            alpha = 0.95
            p = ((1.0 - alpha) / 2.0) * 100
            lower = max(0.0, np.percentile(stats, p))
            p = (alpha + ((1.0 - alpha) / 2.0)) * 100
            upper = min(1.0, np.percentile(stats, p))

            lower_boundary = round((lower * 100), 2)
            upper_boundary = round((upper * 100), 2)

            logging.info(
                f"Calculating 95% confidence interval from bootstrap exercise\n"
                f"Lower boundary: {lower_boundary}\n"
                f"Upper boundary: {upper_boundary}")

        bootstrap_calc(self.X_train, self.X_test, self.y_train, self.y_test,
                       self.tree_clf2_new_rand)

        # ---------------------------------------------------------------
        #  get feature importances for best tree and full classifier;
        #  plot feature importances for both
        # ---------------------------------------------------------------
        attr = [
            "anomalousCC", "anomalousslope", "lowreslimit", "f", "diffF",
            "diffI"
        ]

        feature_importances = self.tree_clf2_new_rand.feature_importances_
        feature_importances_ls = sorted(zip(feature_importances, attr),
                                        reverse=True)
        feature_importances_tree_mean = np.mean(
            [
                tree.feature_importances_
                for tree in self.tree_clf2_new_rand.estimators_
            ],
            axis=0)

        feature_importances_tree_mean_ls = sorted(
            zip(feature_importances_tree_mean, attr), reverse=True)
        logging.info(
            f"Feature importances, for best tree in classifier: {feature_importances_ls}\n"
            f"Plotting bar plot of feature importances for best tree in classifier\n"
            f"Feature importances, mean over all trees: {feature_importances_tree_mean_ls}\n"
            f"Plotting bar plot of feature importances with mean and std for classifier"
        )

        def feature_importances_best_estimator(feature_list, directory):
            '''Save a bar plot of (importance, name) pairs to `directory`.'''
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            # NOTE(review): element [1] of each pair is the feature name, so
            # this orders the bars by name (descending), not by importance
            # -- confirm this is intended.
            feature_list.sort(key=lambda x: x[1], reverse=True)
            score, feature = zip(*feature_list)
            x_pos = np.arange(len(feature))
            plt.bar(x_pos, score, align="center")
            plt.xticks(x_pos, feature, rotation=90, fontsize=18)
            plt.title(
                "Histogram of Feature Importances for best tree in best classifier"
            )
            plt.xlabel("Features")
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory,
                "feature_importances_besttree_bestclassifier_bar_plot_" +
                datestring + ".png"),
                        dpi=600)
            plt.close()

        feature_importances_best_estimator(feature_importances_ls,
                                           self.output_dir)

        def feature_importances_pandas(clf, X_train, directory):
            '''Save a bar plot of per-feature mean importance with std error bars.'''
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            # One row of importances per fitted tree in the ensemble.
            feature_list = [tree.feature_importances_
                            for tree in clf.estimators_]

            df = pd.DataFrame(feature_list, columns=X_train.columns)
            df_mean = df[X_train.columns].mean(axis=0)
            df_std = df[X_train.columns].std(axis=0)
            df_mean.plot(kind="bar",
                         color="b",
                         yerr=[df_std],
                         align="center",
                         figsize=(20, 10),
                         rot=90,
                         fontsize=18)
            plt.title(
                "Histogram of Feature Importances over all trees in best classifier with std"
            )
            plt.xlabel('Features')
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory,
                "feature_importances_mean_std_bestclassifier_bar_plot_" +
                datestring + ".png"),
                        dpi=600)
            plt.close()

        feature_importances_pandas(self.tree_clf2_new_rand, self.X_train,
                                   self.output_dir)

        # ---------------------------------------------------------------
        #  save best classifier as pickle file for future use
        # ---------------------------------------------------------------
        def write_pickle(forest, directory):
            '''Dump `forest` with joblib under a timestamped file name.'''
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            joblib.dump(
                forest,
                os.path.join(directory, "best_classifier_rand_ada_" +
                             datestring + ".pkl"))

        write_pickle(self.tree_clf2_new_rand, self.output_dir)

        logging.info("Saving best classifier.")

        print("*" * 80)
        print("*    Getting basic stats for new forest")
        print("*" * 80)

        # ---------------------------------------------------------------
        #  get basic stats for 3-fold cross-validation on the training data
        # ---------------------------------------------------------------
        def basic_stats(forest, data_train, labels_train, directory):
            '''Log 3-fold cross-validation scores for several metrics.'''

            def mean_cv_score(scoring):
                # Mean over the three folds, rounded to four decimals.
                return round(
                    cross_val_score(forest,
                                    data_train,
                                    labels_train,
                                    cv=3,
                                    scoring=scoring).mean(), 4)

            # distribution --> accuracy; the mean reuses the per-fold scores
            # instead of running the same cross-validation a second time
            accuracy_each_cv = cross_val_score(forest,
                                               data_train,
                                               labels_train,
                                               cv=3,
                                               scoring="accuracy")
            accuracy_mean_cv = round(accuracy_each_cv.mean(), 4)

            # cross-validated scores with different scoring functions
            train_roc_auc = mean_cv_score("roc_auc")
            train_recall = mean_cv_score("recall")
            train_precision = mean_cv_score("precision")
            train_f1 = mean_cv_score("f1")

            logging.info(
                f"Get various cross_val_scores to evaluate clf performance for best parameters\n"
                f"Training accuracy for individual folds in 3-fold CV: {accuracy_each_cv}\n"
                f"Mean training accuracy over all folds in 3-fold CV: {accuracy_mean_cv}\n"
                f"Mean training recall for 3-fold CV: {train_recall}\n"
                f"Mean training precision for 3-fold CV: {train_precision}\n"
                f"Mean training ROC_AUC for 3-fold CV: {train_roc_auc}\n"
                f"Mean training F1 score for 3-fold CV: {train_f1}")

        basic_stats(self.tree_clf2_new_rand, self.X_train, self.y_train,
                    self.output_dir)

###############################################################################
#
#  predicting with test set
#
###############################################################################

    def predict(self):
        '''Predict on the test set, save the predictions and log basic stats.

        Stores the predicted classes and class probabilities on the instance,
        writes both to CSV files in ``self.output_dir``, logs accuracy and
        class-balance figures, and sets ``self.biggest_class`` to the larger
        of the two class counts in ``self.y_test``.
        '''
        banner = "*" * 80
        print(banner)
        print("*    Predict using new forest and test set")
        print(banner)

        # try out how well the classifier works to predict from the test set
        classifier = self.tree_clf2_new_rand
        self.y_pred = classifier.predict(self.X_test)
        self.y_pred_proba = classifier.predict_proba(self.X_test)
        self.y_pred_proba_ones = self.y_pred_proba[:, 1]  # test data to be class 1
        self.y_pred_proba_zeros = self.y_pred_proba[:, 0]  # test data to be class 0

        np.savetxt(os.path.join(self.output_dir, "y_pred.csv"),
                   self.y_pred,
                   delimiter=",")
        np.savetxt(os.path.join(self.output_dir, "y_pred_proba.csv"),
                   self.y_pred_proba,
                   delimiter=",")

        logging.info(
            f"Storing predictions for test set to y_pred.\n"
            f"Storing probabilities for predictions for the test set to y_pred_proba"
        )

        print(banner)
        print("*    Calculate prediction stats")
        print(banner)

        def prediction_stats(y_test, y_pred, directory):
            '''Log accuracy, class distribution and null accuracy for y_test.'''
            y_accuracy = accuracy_score(y_test, y_pred)

            # class distribution of the test set (pandas Series method)
            class_dist = self.y_test.value_counts()
            class_zero = class_dist[0]
            class_one = class_dist[1]
            self.biggest_class = (class_zero
                                  if class_zero > class_one else class_one)

            # y_test only contains ones and zeros, so its mean is the
            # fraction of ones
            positive_rate = y_test.mean()
            ones = round(positive_rate, 4)
            zeros = round(1 - positive_rate, 4)

            # null accuracy: share of the majority class; only valid for
            # binary problems coded as 0/1
            null_acc = round(max(positive_rate, 1 - positive_rate), 4)

            logging.info(
                f"Accuracy score or agreement between y_test and y_pred: {y_accuracy}\n"
                f"Class distribution for y_test: {class_dist}\n"
                f"Percent 1s in y_test: {ones}\n"
                f"Percent 0s in y_test: {zeros}\n"
                f"Null accuracy in y_test: {null_acc}")

        prediction_stats(self.y_test, self.y_pred, self.output_dir)


###############################################################################
#
#  detailed analysis and stats
#
###############################################################################

    def analysis(self):
        """Evaluate the test-set predictions in detail and save the plots.

        Reads ``self.y_test``, ``self.y_pred``, ``self.y_pred_proba``,
        ``self.y_pred_proba_ones``, ``self.biggest_class`` and
        ``self.output_dir``. Metrics are written to the log; every figure is
        saved as a time-stamped PNG in ``self.output_dir``:

        * confusion matrix heatmap
        * accuracy, classification error, sensitivity, specificity,
          false-positive rate, precision and F1 score (logged, and drawn as a
          radar chart)
        * histogram of the predicted probabilities
        * precision and recall versus decision threshold
        * ROC curve (drawn manually and with scikit-plot) and ROC AUC
        * sensitivity/specificity at thresholds 0.7, 0.6, ..., 0.2

        Finally tries to copy ``training.log`` into the output directory; a
        failure there is only logged as a warning.
        """

        print("*" * 80)
        print("*    Detailed analysis and plotting")
        print("*" * 80)

        ###############################################################################
        #
        #  calculate and draw confusion matrix for test set predictions
        #
        ###############################################################################

        # IMPORTANT: first argument is true values, second argument is predicted values.
        # The indexing further down ([0, 0] ... [1, 1]) relies on a 2x2 matrix,
        # i.e. on binary labels.
        conf_mat_test = confusion_matrix(self.y_test, self.y_pred)

        logging.info(f"confusion matrix using test set: {conf_mat_test}")

        def draw_conf_mat(matrix, directory):
            """Save an annotated heatmap of ``matrix`` as a PNG in ``directory``."""
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            labels = ["0", "1"]
            ax = plt.subplot()
            # self.biggest_class is not assigned in this method; it has to be
            # present on the instance already (it caps the colour scale).
            sns.heatmap(matrix,
                        annot=True,
                        ax=ax,
                        annot_kws={"size": 18},
                        vmin=0,
                        vmax=self.biggest_class)
            plt.title("Confusion matrix of the classifier")
            ax.set_xticklabels(labels, fontdict={"fontsize": 18})
            ax.set_yticklabels(labels, fontdict={"fontsize": 18})
            plt.xlabel("Predicted", fontsize=20)
            plt.ylabel("True", fontsize=20)
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory, "confusion_matrix_for_test_set_predictions" +
                datestring + ".png"),
                        dpi=600)
            plt.close()

        draw_conf_mat(conf_mat_test, self.output_dir)

        ###############################################################################
        #
        #  calculate stats for the test set using classification outcomes
        #
        ###############################################################################

        # Rows are true classes, columns are predicted classes.
        TP = conf_mat_test[1, 1]
        TN = conf_mat_test[0, 0]
        FP = conf_mat_test[0, 1]
        FN = conf_mat_test[1, 0]

        logging.info(f"False-positives in predicting the test set: {FP}")
        logging.info(f"False-negatives in predicting the test set: {FN}")

        # Each metric is computed by hand from the confusion matrix and, where
        # scikit-learn offers it, again with the library function so the two
        # can be compared in the log.
        total = float(TP + TN + FP + FN)

        # accuracy
        acc_score_man_test = round((TP + TN) / total, 4)
        acc_score_sklearn_test = round(
            accuracy_score(self.y_test, self.y_pred), 4)
        # classification error
        class_err_man_test = round((FP + FN) / total, 4)
        class_err_sklearn_test = round(
            1 - accuracy_score(self.y_test, self.y_pred), 4)
        # sensitivity/recall/true positive rate; correctly placed positive cases
        sensitivity_man_test = round(TP / float(FN + TP), 4)
        sensitivity_sklearn_test = round(
            recall_score(self.y_test, self.y_pred), 4)
        # specificity
        specificity_man_test = round(TN / float(TN + FP), 4)
        # false positive rate
        false_positive_rate_man_test = round(FP / float(TN + FP), 4)
        # precision/confidence of placement
        precision_man_test = round(TP / float(TP + FP), 4)
        precision_sklearn_test = round(
            precision_score(self.y_test, self.y_pred), 4)
        # F1 score; uses precision and recall
        f1_score_sklearn_test = round(f1_score(self.y_test, self.y_pred), 4)

        logging.info(
            f"Detailed stats for the test set\n"
            f"Accuracy score:\n"
            f"accuracy score manual test: {acc_score_man_test}\n"
            f"accuracy score sklearn test: {acc_score_sklearn_test}\n"
            f"Classification error:\n"
            f"classification error manual test: {class_err_man_test}\n"
            f"classification error sklearn test: {class_err_sklearn_test}\n"
            f"Sensitivity/Recall/True positives:\n"
            f"sensitivity manual test: {sensitivity_man_test}\n"
            f"sensitivity sklearn test: {sensitivity_sklearn_test}\n"
            f"Specificity:\n"
            f"specificity manual test: {specificity_man_test}\n"
            f"False positive rate or 1-specificity:\n"
            f"false positive rate manual test: {false_positive_rate_man_test}\n"
            f"Precision or confidence of classification:\n"
            f"precision manual: {precision_man_test}\n"
            f"precision sklearn: {precision_sklearn_test}\n"
            f"F1 score:\n"
            f"F1 score sklearn test: {f1_score_sklearn_test}")

        # One-row frame: the "group" column is a label, every other column is
        # one axis of the radar chart (in percent).
        data_dict = {
            "group": "prediction",
            "ACC (%)": (acc_score_man_test * 100),
            "Class Error (%)": (class_err_man_test * 100),
            "Sensitivity (%)": (sensitivity_man_test * 100),
            "Specificity (%)": (specificity_man_test * 100),
            "FPR (%)": (false_positive_rate_man_test * 100),
            "Precision (%)": (precision_man_test * 100),
            "F1 score (%)": (f1_score_sklearn_test * 100)
        }

        df = pd.DataFrame(data=data_dict, index=[0])

        def plot_radar_chart(df, directory):
            """Draw row 0 of ``df`` as a radar chart and save it in ``directory``."""
            datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')

            # ------- PART 1: Create background

            # every column except the leading "group" label is one axis
            categories = list(df)[1:]
            print(categories)
            N = len(categories)

            # angle of each axis; the first angle is repeated to close the polygon
            angles = [n / float(N) * 2 * pi for n in range(N)]
            angles += angles[:1]

            # Initialise the spider plot
            fig = plt.figure(figsize=(7, 6))
            ax = fig.add_subplot(111, polar=True)

            # put the first axis on top and go clockwise
            ax.set_theta_offset(pi / 2)
            ax.set_theta_direction(-1)

            # one axis per variable, with labels
            ax.set_xticks(angles[:-1])
            ax.set_xticklabels(categories, fontsize=20, wrap=True)

            # radial labels
            ax.set_rlabel_position(15)
            ax.set_yticks([20, 40, 60, 80, 100])
            ax.set_yticklabels(["20", "40", "60", "80", "100%"],
                               fontsize=20,
                               wrap=True)
            ax.set_ylim(0, 100)

            # ------- PART 2: Add plots
            values = df.loc[0].drop('group').values.flatten().tolist()
            print(values)
            # repeat the first value so the outline closes, matching `angles`
            values += values[:1]
            ax.plot(angles,
                    values,
                    linewidth=2,
                    linestyle="solid",
                    label="Test set")
            ax.fill(angles, values, "b", alpha=0.1)
            plt.savefig(os.path.join(
                directory, "radar_chart_for_test_set_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_radar_chart(df, self.output_dir)

        ###############################################################################
        #
        #  plot histogram of test set probabilities
        #
        ###############################################################################

        def plot_hist_pred_proba(y_pred_proba, directory):
            """Save overlaid histograms of ``y_pred_proba[1]`` and ``y_pred_proba[0]``."""
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            # NOTE(review): [1] / [0] index the first axis (or column labels if
            # this is a DataFrame). If y_pred_proba is the plain 2-D array
            # returned by predict_proba, this selects two rows rather than the
            # per-class columns - confirm the container type.
            plt.hist(y_pred_proba[1], bins=20, color="b", label="class 1")
            plt.hist(y_pred_proba[0], bins=20, color="g", label="class 0")
            plt.xlim(0, 1)
            plt.title(
                "Histogram of predicted probabilities for class 1 in the test set"
            )
            plt.xlabel("Predicted probability of EP_success")
            plt.ylabel("Frequency")
            plt.legend(loc="best")
            plt.tight_layout()
            plt.savefig(os.path.join(directory,
                                     "hist_pred_proba_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_hist_pred_proba(self.y_pred_proba, self.output_dir)

        ###############################################################################
        #
        #  plot precision-recall curve for class 1 samples in test set
        #
        ###############################################################################

        precisions, recalls, thresholds = precision_recall_curve(
            self.y_test, self.y_pred_proba_ones)

        def plot_precision_recall_vs_threshold(precisions, recalls, thresholds,
                                               directory):
            """Save precision and recall plotted against the decision threshold."""
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            # the last precision/recall entry is dropped so both series have
            # the same length as `thresholds`
            plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
            plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
            plt.title("Precsion-Recall plot for classifier, test set, class 1")
            plt.xlabel("Threshold")
            plt.legend(loc="upper left")
            plt.ylim([0, 1])
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory, "Precision_Recall_class1_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_precision_recall_vs_threshold(precisions, recalls, thresholds,
                                           self.output_dir)

        ###############################################################################
        #
        #  plot ROC curve, calculate AUC and explore thresholds for class 1 samples in test set
        #
        ###############################################################################

        # IMPORTANT: first argument is true values, second argument is predicted
        # probabilities. y_pred must not be used here: it would give incorrect
        # results without raising an error.
        # roc_curve returns fpr (false positive rate), tpr (true positive rate)
        # and the matching thresholds.
        fpr_1, tpr_1, thresholds_1 = roc_curve(self.y_test,
                                               self.y_pred_proba_ones)
        AUC_test_class1 = round(
            roc_auc_score(self.y_test, self.y_pred_proba_ones), 4)
        logging.info(f"AUC score for class 1 in test set: {AUC_test_class1}")

        def plot_roc_curve(fpr, tpr, directory):
            """Save the ROC curve, annotated with the AUC computed above."""
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            plt.plot(fpr, tpr, linewidth=2)
            plt.plot([0, 1], [0, 1], "k--")
            plt.axis([0, 1, 0, 1])
            plt.title("ROC curve for classifier, test set, class 1")
            plt.xlabel("False Positive Rate (1 - Specificity)")
            plt.ylabel("True Positive Rate (Sensitivity)")
            plt.grid(True)
            # f-string, so the value is interpolated (a raw string printed the
            # placeholder text literally)
            plt.text(0.7, 0.1, f"AUC = {AUC_test_class1}")
            plt.tight_layout()
            plt.savefig(os.path.join(directory, "ROC_curve_class1_" +
                                     datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_roc_curve(fpr_1, tpr_1, self.output_dir)

        def plot_roc_curve_skplot(y_test, y_proba, directory):
            """Save the ROC plot produced by ``skplt.metrics.plot_roc``."""
            datestring = datetime.strftime(datetime.now(), "%Y%m%d_%H%M")
            skplt.metrics.plot_roc(y_test, y_proba, title="ROC curve")
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory, "ROC_curve_skplt_class1_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_roc_curve_skplot(self.y_test, self.y_pred_proba, self.output_dir)

        def evaluate_threshold(tpr, fpr, thresholds, threshold):
            """Log sensitivity and specificity at the given decision threshold."""
            # take the last ROC point whose threshold is still above `threshold`
            above = thresholds > threshold
            sensitivity = round(tpr[above][-1], 4)
            specificity = round(1 - fpr[above][-1], 4)

            logging.info(
                f"Sensitivity for class 1 at threshold {threshold}: {sensitivity}\n"
                f"Specificity for class 1 at threshold {threshold}: {specificity}"
            )

        for cutoff in (0.7, 0.6, 0.5, 0.4, 0.3, 0.2):
            evaluate_threshold(tpr_1, fpr_1, thresholds_1, cutoff)

        # Try to copy log file if it was created in training.log
        try:
            shutil.copy("training.log", self.output_dir)
        except FileNotFoundError:
            # a missing source file raises FileNotFoundError, not FileExistsError
            logging.warning("Could not find training.log to copy")
        except Exception:
            # best effort: a failed copy must not abort the analysis
            logging.warning("Could not copy training.log to output directory")
예제 #12
0
파일: adaBoost.py 프로젝트: tibristo/mva
class adaBoost:
    """AdaBoost boosted-decision-tree wrapper with training and plot helpers.

    ``run`` has to be called before any ``plot*`` method: it fits the
    classifier and stores the importances, scores and class probabilities
    that the plotting methods read from the instance.
    """
    __all__=['run','plotFeatureRanking','plotScores']

    def __init__(self, foundVariables, trainingData, trainingClasses, trainingWeights, testingData, testingClasses, adaName, bkg_name):
        """Build the classifier and store the data sets.

        Keyword args:
        foundVariables -- The list of the names of found variables, can get using Sample_x.returnFoundVariables()
        trainingData -- The training data
        trainingClasses -- The training data classes
        trainingWeights -- passed to ``fit`` as its third positional argument
        testingData -- the testing data
        testingClasses -- the testing data classes
        adaName -- the name of the object (eg. sig+bkg_name)
        bkg_name -- stored on the instance; not used by this class itself
        """
        # NOTE(review): compute_importances is passed to both estimators;
        # confirm the installed scikit-learn still accepts this keyword.
        self.ada = AdaBoostClassifier(
            DecisionTreeClassifier(compute_importances=True, max_depth=4,
                                   min_samples_split=2, min_samples_leaf=100),
            n_estimators=400, learning_rate=0.5, algorithm="SAMME",
            compute_importances=True)
        self.foundVariables = foundVariables
        self.trainingData = trainingData
        self.trainingClasses = trainingClasses
        self.testingData = testingData
        self.testingClasses = testingClasses
        self.trainingWeights = trainingWeights
        self.name = adaName
        self.bkg_name = bkg_name
        self.elapsed = 0.0

    def returnName(self):
        """Return the name given to this instance."""
        return self.name

    def run(self):
        """Run the fitting and testing.

        Fits the classifier (timing it into ``self.elapsed``), then stores the
        feature importances and their per-tree standard deviation, the test
        score, the variable names sorted by importance, the decision-function
        output for both data sets and the last-column class probability for
        the testing data. Prints the ranking of up to 12 features.
        """
        # start the fitting and time it
        start = clock()
        print('starting training on AdaBoostClassifier')
        self.ada.fit(self.trainingData, self.trainingClasses, self.trainingWeights)
        self.elapsed = clock()-start
        print('time taken for training: ' + str(self.elapsed))

        # list the importances of each variable in the bdt, get the score on the test data
        self.importancesada = self.ada.feature_importances_
        print('importances')
        print(self.importancesada)
        self.score= self.ada.score(self.testingData,self.testingClasses)
        self.params = self.ada.get_params()
        self.std_mat = np.std([tree.feature_importances_ for tree in self.ada.estimators_],
                           axis=0)
        # indices of the features, most important first
        self.indicesada = np.argsort(self.importancesada)[::-1]
        self.variableNamesSorted = [self.foundVariables[i] for i in self.indicesada]

        # Print the feature ranking (at most 12 entries; fewer features no
        # longer raise an IndexError)
        print("Feature ranking:")
        for f in range(min(12, len(self.indicesada))):
            print("%d. feature %d (%f)" % (f + 1, self.indicesada[f], self.importancesada[self.indicesada[f]]) + " " +self.variableNamesSorted[f])
        self.twoclass_output = self.ada.decision_function(self.testingData)
        self.twoclass_output_train = self.ada.decision_function(self.trainingData)
        self.class_proba = self.ada.predict_proba(self.testingData)[:, -1]

    def plotFeatureRanking(self):
        """Show a bar chart of the feature importances, most important first."""
        # We need this to run in batch because it complains about not being able to open display
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        # one bar and one tick per ranked variable (was hard-coded to 12)
        n_vars = len(self.variableNamesSorted)
        pl.figure()
        pl.title("Feature importances Ada")
        pl.bar(range(n_vars), self.importancesada[self.indicesada],
               color="r", yerr=self.std_mat[self.indicesada], align="center")
        pl.xticks(range(n_vars), self.variableNamesSorted)
        pl.xlim([-1, n_vars])
        pl.show()

    def _testSetRoc(self):
        """Compute the ROC quantities on the whole testing set.

        Returns four one-element lists ``[fpr], [tpr], [area], [rej]`` where
        the values come from ``sc.roc_curve_rej`` and the area is
        ``auc(tpr, rej)``.
        """
        from sklearn.metrics import auc

        n_test = len(self.testingData)
        probas_ = self.ada.predict_proba(self.testingData[0:n_test])
        fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[0:n_test], probas_[:,1])
        roc_auc = auc(tpr,rej)
        return [fpr], [tpr], [roc_auc], [rej]

    def plotScores(self, returnROC = False, rocInput = []):
        """Draw training points, class probabilities, decision scores and ROC.

        Keyword args:
        returnROC -- if True, return ``[fpr_arr, tpr_arr, roc_auc_arr, rej_arr]``
                     instead of showing the figure
        rocInput -- optional result of another instance's ROC call, drawn as a
                    second curve (never modified here)
        """
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        plot_colors = "rb"
        class_names = "AB"

        # Plot the training points (first two columns of the training data)
        pl.subplot(131)
        for i, n, c in zip(range(2), class_names, plot_colors):
            idx = np.where(self.trainingClasses == i)
            pl.scatter(self.trainingData[idx, 0], self.trainingData[idx, 1],
                       c=c, cmap=pl.cm.Paired,
                       label="Class %s" % n)
        pl.axis("tight")
        pl.legend(loc='upper right')
        pl.xlabel("Decision Boundary")

        # Plot the class probabilities
        # NOTE(review): no subplot is selected here, so these histograms land
        # on the same axes as the scatter plot above - confirm this is intended.
        for i, n, c in zip(range(2), class_names, plot_colors):
            pl.hist(self.class_proba[self.testingClasses == i],
                    bins=50,
                    range=(0, 1),
                    facecolor=c,
                    label='Class %s' % n)
        pl.legend(loc='upper center')
        pl.ylabel('Samples')
        pl.xlabel('Class Probability')

        # Plot the two-class decision scores/ bdt scores
        # NOTE(review): `normed` is the legacy spelling of `density`; confirm
        # against the matplotlib version in use.
        pl.subplot(133)
        for i, n, c in zip(range(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i],
                    bins=50,
                    range=(-1, 1),
                    facecolor=c,
                    label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('Two-class Decision Scores')

        pl.subplots_adjust(wspace=0.25)

        # ROC curve in the middle panel
        pl.subplot(132)
        fpr_arr, tpr_arr, roc_auc_arr, rej_arr = self._testSetRoc()
        pl.plot(tpr_arr[0], rej_arr[0], lw=1, label='ROC fold %d (area = %0.2f)' % (0, roc_auc_arr[0]), color=plot_colors[0])
        if len(rocInput)>0:
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1, label='ROC fold %d (area = %0.2f)' % (2, rocInput[2][0]), color=plot_colors[1])
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr]

        pl.show()

    def plotBDTScores(self):
        """Save overlaid histograms of the decision scores per testing class."""
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        plot_colors = "rb"
        alpha_h = [1.0, 0.7]
        class_names = ['Background', 'Signal']
        for i, n, c in zip(range(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i],
                    bins=50,
                    range=(-1, 1),
                    facecolor=c,
                    alpha=alpha_h[i],
                    label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('BDT Scores')
        pl.savefig('BDTScores'+self.name+'.png')

    def plotROC(self, returnROC = False, rocInput = []):
        """Draw and save the ROC curve(s) as ``roc_combined_<name>.png``.

        Keyword args:
        returnROC -- if True, this instance's own curve is not drawn and
                     ``[fpr_arr, tpr_arr, roc_auc_arr, rej_arr, names]`` is
                     returned instead of showing the figure
        rocInput -- optional return value of another instance's ``plotROC``,
                    drawn as an additional curve (never modified here)
        """
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        plot_colors = "rb"

        pl.xlabel("Signal Efficiency")
        pl.ylabel("Background Rejection")
        pl.title("ROC curves")

        fpr_arr, tpr_arr, roc_auc_arr, rej_arr = self._testSetRoc()
        names = [self.name]

        if len(rocInput)>0:
            label_bkg = rocInput[4][0]
            if '_A' in rocInput[4][0]:
                label_bkg = 'even event number'
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1, label='ROC %s (area = %0.2f)' % (label_bkg, rocInput[2][0]), color=plot_colors[1])

        if not returnROC:
            label_bkg = self.name
            if '_B' in self.name:
                label_bkg = 'odd event number'
            pl.plot(tpr_arr[0], rej_arr[0], lw=1, label='ROC %s (area = %0.2f)' % (label_bkg, roc_auc_arr[0]), color=plot_colors[0])
        pl.legend(loc='lower left')
        pl.savefig("roc_combined_"+self.name+".png")
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr, names]
        pl.show()

    @staticmethod
    def _mesh_samples(fill_row, xx, yy, idx1, idx2):
        """Build one sample per mesh point for evaluating the classifier.

        Every row starts as a copy of ``fill_row``; column ``idx1`` is then
        set to the mesh x value and column ``idx2`` to the mesh y value. Rows
        follow the row-major order of ``xx``/``yy``, so a per-row result can be
        reshaped with ``.reshape(xx.shape)`` and still line up with the mesh.
        """
        import numpy as np

        samples = np.tile(np.asarray(fill_row, dtype=float), (xx.size, 1))
        samples[:, idx1] = xx.ravel()
        samples[:, idx2] = yy.ravel()
        return samples

    def plotDecisionBoundaries(self):
        """Plot the class-1 probability over the two most important variables.

        The two top-ranked variables span a mesh; all other variables are held
        at their testing-set mean. The probability surface is drawn as filled
        contours with training and testing points on top, then saved as
        ``adaBoostDecisionBoundaries<name>.png`` and shown.
        """
        import numpy as np
        import pylab as pl
        from matplotlib.colors import ListedColormap

        cm = pl.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])

        # get most important variable indices
        idx1 = self.foundVariables.index(self.variableNamesSorted[0])
        idx2 = self.foundVariables.index(self.variableNamesSorted[1])

        # plotting range: extremes over both data sets, padded a little
        train_x, train_y = self.trainingData[:, idx1], self.trainingData[:, idx2]
        test_x, test_y = self.testingData[:, idx1], self.testingData[:, idx2]
        xmin = min(train_x.min(), test_x.min()) - .1
        xmax = max(train_x.max(), test_x.max()) + .1
        ymin = min(train_y.min(), test_y.min()) - .01
        ymax = max(train_y.max(), test_y.max()) + .01
        xx, yy = np.meshgrid(np.arange(xmin, xmax, float((xmax-xmin)/25.0)),
                             np.arange(ymin, ymax, float((ymax-ymin)/25.0)))

        # Mean values for the other variables, one row per mesh point. The
        # former index arithmetic ((j+1)*(k+1)-1) hit some rows repeatedly and
        # skipped others, so the surface did not correspond to the mesh.
        means = np.mean(self.testingData, axis=0)
        means = self._mesh_samples(means, xx, yy, idx1, idx2)
        print('shape xx: ')
        print(xx.shape)
        print('shape yy: ')
        print(yy.shape)
        print('shape means: ')
        print(means.shape)

        # Probability of the second class at every mesh point
        Z = self.ada.predict_proba(means)[:, 1]
        print('Z shape:')
        print(Z.shape)
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        pl.figure()
        ax = pl.axes()
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(self.trainingData[:, idx1], self.trainingData[:, idx2],
                   c=self.trainingClasses[:], cmap=cm_bright)
        # and the testing points, slightly transparent
        ax.scatter(self.testingData[:, idx1], self.testingData[:, idx2],
                       c=self.testingClasses[:], cmap=cm_bright, alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title("adaBoost")
        # test score, printed without its leading zero
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % self.score).lstrip('0'),
                size=15, horizontalalignment='right')
        pl.savefig("adaBoostDecisionBoundaries"+self.name+".png")
        pl.show()
예제 #13
0
class AdaBoost(Classifier):
    r"""AdaBoost classifier component wrapping scikit-learn's implementation.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        Y. Freund, R. Schapire, “A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting”, 1995.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'AdaBoost'

    def __init__(self, **kwargs):
        r"""Silence scikit-learn warnings, declare the tunable parameters and
        create the wrapped estimator with its default settings.
        """
        silenced_categories = (
            ChangedBehaviorWarning,
            ConvergenceWarning,
            DataConversionWarning,
            DataDimensionalityWarning,
            EfficiencyWarning,
            FitFailedWarning,
            NonBLASDotWarning,
            UndefinedMetricWarning,
        )
        for warning_category in silenced_categories:
            warnings.filterwarnings(action='ignore', category=warning_category)

        # Search space: number of estimators as an unsigned integer range,
        # and the boosting algorithm as a categorical choice.
        estimator_count = ParameterDefinition(MinMax(min=10, max=111), np.uint)
        boosting_variant = ParameterDefinition(['SAMME', 'SAMME.R'])
        self._params = {
            'n_estimators': estimator_count,
            'algorithm': boosting_variant,
        }
        self.__ada_boost = AdaBoostClassifier()

    def set_parameters(self, **kwargs):
        r"""Forward the given keyword arguments to the wrapped estimator.
        """
        self.__ada_boost.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Train the wrapped estimator.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to learn from.
            y (pandas.core.series.Series): n classes of the samples in x.
        """
        self.__ada_boost.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Classify every sample (row) of x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        predicted = self.__ada_boost.predict(x)
        return predicted

    def to_string(self):
        r"""Build a readable description of this classifier.

        Returns:
            str: The base-class template filled with the name and the
            current parameters of the wrapped estimator.
        """
        template = Classifier.to_string(self)
        current_args = self._parameters_to_string(self.__ada_boost.get_params())
        return template.format(name=self.Name, args=current_args)
예제 #14
0
else:
    print(
        'Low precision and recall, DecisionTree is not a good classifier with set parameters'
    )

# Second attempt: tune an AdaBoost classifier inside a feature-selection
# pipeline and evaluate it.
print(
    '################### Try AdaBoostClassifier ###################################'
)
# AdaBoostClassifier - 2
ab_clf_1 = AdaBoostClassifier()

# Two-step pipeline: keep the k best features, then classify.
# `k` is defined outside this fragment.
pipe = Pipeline([('feature_selection', SelectKBest(k=k)),
                 ('classification', ab_clf_1)])

# Check the parameters that can be set for AdaBoostClassifier, and create a param_grid
estimated = ab_clf_1.get_params().keys()
print('param_keys########################', estimated)

# The "classification__" prefix addresses the pipeline step of that name.
param_grid = {'classification__n_estimators': [10, 50, 100]}

# Grid search scored by F1. Note that ab_clf_1 is rebound here: from this
# point on it is the GridSearchCV object, not the bare classifier.
scorer = make_scorer(f1_score)
ab_clf_1 = GridSearchCV(pipe, param_grid=param_grid, scoring=scorer)
ab_clf_1.fit(X_training_features, y_train_poi)

# Cross-validate the whole grid search on `features`/`labels`.
# NOTE(review): `sklearn.cross_validation` is the legacy module path - confirm
# it exists in the installed scikit-learn version.
scores = sklearn.cross_validation.cross_val_score(ab_clf_1, features, labels)
print(scores)
print('AdaBoostClassifier mean score:', scores.mean())

# Best pipeline found by the fit above.
clf_best = ab_clf_1.best_estimator_

# Predictions of the best pipeline on the test features.
y_poi_predicted = clf_best.predict(X_test_features)
예제 #15
0
                     index=["Predicted No", "Predicted Yes"],
                     columns=["Actual No", "Actual Yes"])
# Confusion-matrix heatmap and text report for the random-forest predictions.
plt.figure()
sns.heatmap(df_cm, cmap="bwr", annot=True)
print(classification_report(y_test, prediction_RF, target_names=["no", "yes"]))
#AUC
# Second column of predict_proba is used as the score for the ROC curve.
probs_RF = rf_random.predict_proba(x_test)
preds_RF = probs_RF[:, 1]
fprrfc, tprrfc, thresholdrfc = metrics.roc_curve(y_test, preds_RF)
roc_aucrfc = metrics.auc(fprrfc, tprrfc)
#%% 6.1.2. ADABOOST
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ADA = AdaBoostClassifier()
#explore the hyperparameters
pprint(ADA.get_params())
#learning rate shrinks the contribution of each tree by learning_rate.
# Ten evenly spaced candidate values from 0.2 to 2, rounded to two decimals.
learning_rate = [
    round(float(x), 2) for x in np.linspace(start=0.2, stop=2, num=10)
]
#algorithm ===================================================================
# If ‘SAMME.R’ then use the SAMME.R real boosting algorithm.
# base_estimator must support calculation of class probabilities.
# If ‘SAMME’ then use the SAMME discrete boosting algorithm.
# The SAMME.R algorithm typically converges faster than SAMME,
# achieving a lower test error with fewer boosting iterations.
# =============================================================================
# Candidate values for the hyperparameter search.
algorithm = ["SAMME", "SAMME.R"]
n_estimators = [500]
#The base estimator from which the boosted ensemble is built
base_estimator = [
# Report the validation score of the already-fitted LGB model.
print(LGB.score(X_val, y_val))

from joblib import dump, load

# Persist the fitted models to disk.
# NOTE(review): both calls write to 'LGB.joblib', so the second dump overwrites
# the first -- LBM_1 probably needs its own file name; confirm intent.
dump(LGB, 'LGB.joblib')
dump(LBM_1, 'LGB.joblib')

from sklearn.ensemble import AdaBoostClassifier
from pprint import pprint

# AdaBoost with default hyper-parameters.
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

print(ada.score(X_test, y_test))
print('Parameters currently in use:\n')
pprint(ada.get_params())

# XGBoost
# XGBoost and AdaBoost are bad at multi-class classification. The reason can be found in the LightGBM literature review.

from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score

# NOTE(review): a regressor is fitted here although the surrounding models are
# classifiers, and the score below is computed but neither printed nor stored.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
    # Per-fold scores. Despite the names, these lists collect accuracy_score
    # values (higher is better), not error rates.
    test_error = []
    train_error = []
    # `skf` yields (train_index, test_index) pairs; `i`, `clf`, `X`, `Y`,
    # `start` and `f` come from the enclosing scope (not visible here).
    for train_index, test_index in skf:
       print("for iteration {}".format(i))
       X_train, X_test = X[train_index], X[test_index]
       y_train, y_test = Y[train_index], Y[test_index]
       clf = clf.fit(X_train,y_train)

       # Score on the held-out fold.
       y_pred = clf.predict(X_test)
       test_error.append(accuracy_score(y_pred,y_test))


       # Score on the training fold.
       y_pred = clf.predict(X_train)
       train_error.append(accuracy_score(y_pred,y_train))
       print(clf.get_params())




    # NOTE(review): time.clock() was removed in Python 3.8; `start` must come
    # from the same clock if this is ever migrated.
    print('Time to fit the dataset of alpha = {} is {}'.format(i,time.clock()-start))
#     y_pred = clf.predict(X)
#     train_error = mean_absolute_error(y_pred,Y)

#     y_pred = clf.predict(X_test)
    # Collapse the per-fold lists into their means and log one CSV row:
    # i, mean train score, mean test score.
    test_error = sum(test_error)/len(test_error)
    train_error = sum(train_error)/len(train_error)
    f.write('{},{},{}\n'.format(i,train_error,test_error))
    print('{},{},{}\n'.format(i,train_error,test_error))
    f.flush()
예제 #18
0
class AdaBoost(Model):
    """Train, persist and evaluate an AdaBoost regressor or classifier.

    The estimator is selected by ``type``: ``'regressor'`` builds an
    ``AdaBoostRegressor``; any other value builds an ``AdaBoostClassifier``.
    The classifier-only code paths (label mapping in ``fit``, exact-match
    accuracy, confusion matrix) run only when ``type`` is exactly
    ``'classifier'``.
    """

    # X represents the features, Y represents the labels
    X = None
    Y = None
    # Kept for backward compatibility: predict() stores its output on
    # ``self.predictions`` (plural), not on this attribute.
    prediction = None
    model = None

    def __init__(self,
                 X=None,
                 Y=None,
                 label_headers=None,
                 n_estimators=100,
                 type='regressor',
                 cfg=False):
        """Create the underlying estimator and remember the training data.

        Args:
            X: Training features; stored only when not None.
            Y: Training labels; stored only when not None.
            label_headers: Stored as-is on the instance.
            n_estimators: Number of boosting rounds passed to the estimator.
            type: ``'regressor'`` for regression, otherwise a classifier.
            cfg: When true, ``save`` also writes the estimator parameters
                to ``adaboost_configs.txt``.
        """
        # A second, argument-less ``__init__`` used to precede this one; it was
        # dead code (this definition replaced it) and every parameter here has
        # a default, so ``AdaBoost()`` keeps working.
        if X is not None:
            self.X = X

        if Y is not None:
            self.Y = Y

        self.mapping_dict = None
        self.label_headers = label_headers

        self.type = type
        self.cfg = cfg

        if type == 'regressor':
            self.model = AdaBoostRegressor(n_estimators=n_estimators)
        else:
            self.model = AdaBoostClassifier(n_estimators=n_estimators)

    def fit(self, X=None, Y=None):
        """Fit the estimator on the stored (or newly supplied) data.

        Args:
            X: Optional features replacing the stored ones.
            Y: Optional labels replacing the stored ones.

        Returns:
            The fitted underlying estimator.
        """
        if X is not None:
            self.X = X

        if Y is not None:
            self.Y = Y

        if self.type == 'classifier':
            # Map the labels that will actually be trained on. Using the ``Y``
            # argument here passed None whenever the labels had been given to
            # the constructor instead of to fit().
            self.map_str_to_number(self.Y)

        print('AdaBoost Train started............')
        self.model.fit(self.X, self.Y)
        print('AdaBoost Train completed..........')

        return self.model

    def predict(self, test_features):
        """Predict for ``test_features`` and cache the result on ``self.predictions``."""
        print('Prediction started............')
        self.predictions = self.model.predict(test_features)
        print('Prediction completed..........')
        return self.predictions

    def save(self, filename='adaboost_model.pkl'):
        """Pickle the estimator to ``filename``.

        When ``self.cfg`` is true the estimator parameters are additionally
        written as JSON to ``adaboost_configs.txt``.
        """
        # Context managers guarantee both files are flushed and closed even if
        # serialisation raises.
        if self.cfg:
            with open('adaboost_configs.txt', 'w') as config_file:
                config_file.write(json.dumps(self.model.get_params()))
        with open(filename, 'wb') as model_file:
            pickle.dump(self.model, model_file)

    def featureImportance(self):
        """Return the fitted estimator's ``feature_importances_``."""
        return self.model.feature_importances_

    def getAccuracy(self, test_labels, predictions, origin=0, hitmissr=0.8):
        """Return the fraction of predictions counted as correct.

        For a classifier a prediction is correct when it equals the label.
        Otherwise it is correct when ``1 - |pred - label| / |pred|`` is at
        least ``hitmissr``.

        Args:
            test_labels: Object exposing ``.values`` with the true labels.
            predictions: Array-like exposing ``.flatten()``.
            origin: Unused; kept for interface compatibility.
            hitmissr: Minimum relative closeness for a regression "hit".

        Raises:
            ZeroDivisionError: If ``predictions`` is empty.
        """
        df = pd.DataFrame(data=predictions.flatten())
        # Materialise the arrays once instead of on every loop iteration.
        predicted = df.values
        actual = test_labels.values

        correct = 0
        if self.type == 'classifier':
            for i, value in enumerate(predicted):
                if value == actual[i]:
                    correct += 1
        else:
            for i, value in enumerate(predicted):
                if 1 - abs(value - actual[i]) / abs(value) >= hitmissr:
                    correct += 1
        return float(correct) / len(df)

    def getConfusionMatrix(self, test_labels, predictions, label_headers):
        """Plot a normalised confusion matrix for each label column.

        Returns:
            None for a classifier (the plots are the result); an explanatory
            string for any other model type.
        """
        if self.type != 'classifier':
            return 'No Confusion Matrix for Regression'

        # NOTE: the predictions are flattened into a single column, so only
        # positional index 0 exists in ``df``.
        df = pd.DataFrame(data=predictions.flatten())
        for index, label_header in enumerate(label_headers):
            classes = test_labels[label_header].unique()
            title = 'Normalized confusion matrix for AdaBoost (' + label_header + ')'
            # ``.ix`` was removed from pandas; ``.iloc`` is the positional
            # equivalent of the integer lookups used here.
            self.plot_confusion_matrix(test_labels.iloc[:, index],
                                       df.iloc[:, index],
                                       classes=classes,
                                       normalize=True,
                                       title=title)

    def getRSquare(self, test_labels, predictions, mode='single'):
        """Return R^2 for a regressor, or an explanatory string otherwise.

        With ``mode='multiple'`` the score is variance-weighted across outputs.
        """
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            if mode == 'multiple':
                errors = r2_score(test_labels,
                                  df,
                                  multioutput='variance_weighted')
            else:
                errors = r2_score(test_labels, df)
            return errors
        else:
            return 'No RSquare for Classification'

    def getMSE(self, test_labels, predictions):
        """Return the mean squared error for a regressor, else a message string."""
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = mean_squared_error(test_labels, df)
            return errors
        else:
            return 'No MSE for Classification'

    def getMAPE(self, test_labels, predictions):
        """Return the mean absolute percentage error for a regressor.

        Returns the first element of the per-column result; for any other
        model type an explanatory string is returned instead.
        """
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = np.mean(np.abs(
                (test_labels - df.values) / test_labels)) * 100
            return errors.values[0]
        else:
            return 'No MAPE for Classification'

    def getRMSE(self, test_labels, predictions):
        """Return the root mean squared error for a regressor, else a message string."""
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = sqrt(mean_squared_error(test_labels, df))
            return errors
        else:
            return 'No RMSE for Classification'
예제 #19
0
#     n_estimators=50,
#     learning_rate=1.5,
#     algorithm="SAMME")
# bdt_discrete.fit(X_train, Y1_train)
print('finished fit')
# Confusion table of true test labels vs. predictions of the fitted model.
print(confusion_table(Y1_test, bdt_real.predict(X_test)))


# Class probabilities for the test set, joined column-wise with the labels.
prob = bdt_real.predict_proba(X_test)
# NOTE: despite its name, `pred` holds the TRUE labels (Y1_test), not predictions.
pred = Y1_test.tolist()

table = pd.DataFrame(prob)
table = pd.concat([table, pd.DataFrame(pred)], 1)
# Three probability columns (named -1, 0, 1) plus the true label.
table.columns = [-1, 0, 1, 'true']
# NOTE(review): DataFrame.sort was removed in pandas 0.20 (sort_values replaces
# it), and the result of this expression is discarded -- presumably a leftover
# from interactive inspection, as is the get_params() call below.
table.sort(1, ascending=False)[:1000]
bdt_real.get_params()

# Render the tree of every boosting round to its own PDF file,
# "<data_output_dir>tree_round_<n>.pdf".
round_count = 0
for trees in bdt_real.estimators_:
    dot_data = tree.export_graphviz(trees, out_file=None, filled=True, rounded=True,
                                    special_characters=True,
                                    feature_names=X_train.columns)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf(data_output_dir + "tree_round_" + str(round_count) + ".pdf")
    round_count += 1
# Starts empty; not filled in within this fragment.
real_test_errors = []
# discrete_test_errors = []

# for real_test_predict, discrete_train_predict in zip(
#         bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
예제 #20
0
class _AdaBoostClassifierImpl:
    """Adapter that forwards to ``SKLModel`` while tracking the base estimator.

    The wrapped model is built with the base estimator wrapped in
    ``_FitSpecProxy``; ``get_params`` reports whatever is currently stored
    under ``_hyperparams["base_estimator"]`` instead.
    """

    def __init__(
        self,
        base_estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        algorithm="SAMME.R",
        random_state=None,
    ):
        """Build the wrapped model from the given hyper-parameters."""
        proxied = None if base_estimator is None else _FitSpecProxy(base_estimator)

        self._hyperparams = dict(
            base_estimator=proxied,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            algorithm=algorithm,
            random_state=random_state,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)
        # After construction, remember the caller's estimator rather than the proxy.
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        """Return the wrapped model's parameters with the stored base estimator."""
        params = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        params["base_estimator"] = self._hyperparams["base_estimator"]
        return params

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        When ``X`` is a DataFrame the base estimator is first prefixed with a
        transformer that rebuilds a DataFrame carrying ``X``'s column names,
        and the wrapped model is re-created from the updated hyper-parameters.
        """
        if isinstance(X, pd.DataFrame):
            restore_columns = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            # NOTE(review): the proxy is written back into _hyperparams, so a
            # later fit() on a DataFrame wraps the already-wrapped estimator.
            prefixed = restore_columns >> self._hyperparams["base_estimator"]
            self._hyperparams["base_estimator"] = _FitSpecProxy(prefixed)
            self._wrapped_model = SKLModel(**self._hyperparams)

        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def predict_log_proba(self, X):
        """Delegate to the wrapped model's ``predict_log_proba``."""
        return self._wrapped_model.predict_log_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        """Delegate to the wrapped model's ``score``."""
        return self._wrapped_model.score(X, y, sample_weight)
예제 #21
0
def main():
    """Compare several classifiers on the chess, iris and letter datasets.

    Each dataset is split into train/test parts, then plain learners (MLP,
    decision tree, KNN), bagging, AdaBoost and random forests are fitted on
    the training part and reported through ``display_similarity`` on the
    test part. AdaBoost runs additionally print the estimator parameters.
    """

    def as_split(parts):
        # Same four-way unpacking as a direct assignment would perform.
        data_train, data_test, target_train, target_test = parts
        return data_train, data_test, target_train, target_test

    def evaluate(model, split, caption):
        # Fit on the training part, report predictions for the test part.
        data_train, data_test, target_train, target_test = split
        model.fit(data_train, target_train)
        display_similarity(model.predict(data_test), target_test, caption)
        return model

    # preprocess, then train, test, and split
    chess = as_split(tts_chess_numeric())
    iris = as_split(tts_iris_numeric())
    letter = as_split(tts_letter_numeric())

    # For each dataset
    ## Try at least 3 different "regular" learning algorithms and note the results.
    ### DS1 - chess
    print("")
    evaluate(
        MLPClassifier(solver='adam',
                      alpha=1e-5,
                      hidden_layer_sizes=(40, 30),
                      random_state=1), chess, "Chess - Neural Network")
    evaluate(DecisionTreeClassifier(random_state=0), chess,
             "Chess - Decision Tree")
    evaluate(KNeighborsClassifier(n_neighbors=7), chess, "Chess - KNN")

    ### DS2 - iris
    print("")
    # (A grid search over MLP activation/solver/layer sizes was tried here
    # previously and is no longer part of the run.)
    evaluate(
        MLPClassifier(solver='adam',
                      alpha=1e-5,
                      hidden_layer_sizes=(10, 7),
                      random_state=1), iris, "Iris - Neural Network")
    evaluate(DecisionTreeClassifier(), iris, "Iris - Decision Tree")
    evaluate(KNeighborsClassifier(n_neighbors=3), iris, "Iris - KNN")

    ### DS3 - letter
    print("")
    evaluate(
        MLPClassifier(solver='adam',
                      alpha=1e-5,
                      hidden_layer_sizes=(40, 30),
                      random_state=1), letter, "Letter - Neural Network")
    evaluate(DecisionTreeClassifier(), letter, "Letter - Decision Tree")
    evaluate(KNeighborsClassifier(n_neighbors=3), letter, "Letter - KNN")
    print("")

    ## Use Bagging and note the results. (Play around with a few different options)
    evaluate(BaggingClassifier(bootstrap=True, n_estimators=20), chess,
             "BAGGING - Chess")
    evaluate(BaggingClassifier(bootstrap=True), iris, "BAGGING - Iris")
    evaluate(BaggingClassifier(bootstrap=True, n_estimators=20), letter,
             "BAGGING - Letter")
    print("")

    ## Use AdaBoost and note the results. (Play around with a few different options)
    # Every AdaBoost run also prints the parameters of the fitted estimator.
    print(evaluate(AdaBoostClassifier(), chess,
                   "ADABOOST - Chess").get_params())
    print(evaluate(AdaBoostClassifier(learning_rate=0.3), iris,
                   "ADABOOST - Iris").get_params())
    print(evaluate(AdaBoostClassifier(n_estimators=200), letter,
                   "ADABOOST - Letter").get_params())
    print("")

    ## Use a random forest and note the results. (Play around with a few different options)
    evaluate(
        RandomForestClassifier(criterion='entropy',
                               bootstrap=False,
                               n_estimators=30), chess,
        "RANDOM FOREST - Chess")
    evaluate(RandomForestClassifier(), iris, "RANDOM FOREST - Iris")
    evaluate(RandomForestClassifier(bootstrap=False), letter,
             "RANDOM FOREST - Letter")
예제 #22
0
    score_grid[clf] = [accuracy, precision, recall, f1, f2]

### Print the AdaBoost parameter-tuning results (Python 2 print statements) ####################################
# `score_grid` maps each classifier to [accuracy, precision, recall, f1, f2].
print "\nAdaBoost Tuning:"
print """Note: if 'min_samples_split' and 'max_featues' are None, base_estimator = None.
Else, base_estimator = DecisionTreeClassifier(min_samples_split = min_samples_split, 
max_features = max_features)"""
print "\n"

# Header row of the results table; every column is centred ('^') in a fixed width.
print "{:^17}{:^16}{:^16}{:^16}{:^11}{:^11}{:^11}{:^11}{:^11}"\
.format("", "n_estimator","min_samples_split","max_features","accuracy", "precision", "recall", "f1", "f2")

# One table row per scoring method. `find_best` is defined elsewhere;
# presumably it returns the classifier with the best score for that metric.
scoring_methods = ["accuracy", "precision", "recall", "f1", "f2"]
for sm in scoring_methods:
    clf = find_best(sm, score_grid)
    ne = clf.get_params().get("n_estimators")
    # These two are None when the classifier has no such nested parameters.
    msp = clf.get_params().get('base_estimator__min_samples_split')
    mf = clf.get_params().get('base_estimator__max_features')
    accuracy, precision, recall, f1, f2 = score_grid[clf]
    print "{:^17}{:^16}{:^16}{:^16}{:^11.3f}{:^11.3f}{:^11.3f}{:^11.3f}{:^11.3f}"\
    .format("best_"+sm, ne, msp, mf, accuracy, precision, recall, f1, f2)

### Try PCA with the best estimator above (by f1 score) and feature set sf3 for classification ####################
clf_f1 = find_best("f1", score_grid)
sf_in_use = sfs_dict["sf3"]

# Header of the PCA results table.
print "\nPCA Analysis:\n"
print "clf:", clf_f1, "\n"
print "{:^15}{:^12}{:^12}{:^12}{:^12}{:^12}".format("n_components", "accuracy",
                                                    "precision", "recall",
                                                    "f1", "f2")
예제 #23
0
# Accuracy of Extratrees classifier on test set: 0.8295

#******************************************************************************
#******************************************************************************

# *** Applying Machine Learning Technique #6 (AdaBoost) ***

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Boost five decision trees of depth 3.
Adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=5)

from pprint import pprint
# Look at parameters used by the AdaBoost ensemble
print('Parameters currently in use:\n')
pprint(Adab.get_params())

Adab.fit(X_train, y_train)

score_ABC = Adab.score(X_test, y_test)
# The message used to say "Extratrees" (copied from the previous technique);
# the score printed here belongs to the AdaBoost model.
print('Accuracy of AdaBoost classifier on test set: %0.04f' % (score_ABC))

# Accuracy of AdaBoost classifier on test set: 0.8224
#******************************************************************************
#******************************************************************************

# *** Applying Machine Learning Technique #7 ***

from sklearn.ensemble import RandomForestClassifier

# Random forest with five trees.
Rando = RandomForestClassifier(n_estimators=5)
예제 #24
0
class Algorithm(object):
    # Initialize the chosen algorithm with parameters set by user
    def __init__(
        self,
        algorithm='decision tree',
        parameters={}
    ):  # algorithm: String, Parameters: Dict {paramName:paramValue}

        self.algorithm = algorithm
        if self.algorithm == 'decision tree':  # needed parameter: max_depth
            if parameters == {}:
                self.classifier = DecisionTreeClassifier(max_depth=2)
            else:
                self.classifier = DecisionTreeClassifier(
                    max_depth=parameters['max_depth'])
        elif self.algorithm == 'support vector machine':  # needed parameters: penalty, tol, C, class_weight              # needed parameter: max_depth
            if parameters == {}:
                self.classifier = LinearSVC(dual=False,
                                            penalty='l1',
                                            tol=0.00001,
                                            C=1.0,
                                            class_weight=None)
            else:
                self.classifier = LinearSVC(
                    dual=False,
                    penalty=parameters['penalty'],
                    tol=parameters['tol'],
                    C=parameters['C'],
                    class_weight=parameters['class_weight'])
        elif self.algorithm == 'random forest':  # needed parameters: n_estimators, max_features, min_samples_leaf, max_depth              # needed parameter: max_depth
            if parameters == {}:
                self.classifier = RandomForestClassifier(n_estimators=20,
                                                         max_features='auto',
                                                         min_samples_leaf=3,
                                                         max_depth=3)
            else:
                self.classifier = RandomForestClassifier(
                    n_estimators=parameters['n_estimators'],
                    max_features=parameters['max_features'],
                    min_samples_leaf=parameters['min_samples_leaf'],
                    max_depth=parameters['max_depth'])
        elif self.algorithm == 'adaboost':  # needed parameters: base_estimator, n_estimators, learning_rate, algorithm.
            print 'adaboost start'
            if parameters == {}:
                self.classifier = AdaBoostClassifier(
                    base_estimator=DecisionTreeClassifier(max_depth=2),
                    n_estimators=60,
                    learning_rate=1,
                    algorithm='SAMME')
            else:
                print 'cls1 ' + parameters['base_estimator']
                if parameters['base_estimator'] == 'decision tree':
                    baseEstimator = DecisionTreeClassifier(max_depth=2)
                elif parameters['base_estimator'] == 'support vector machine':
                    baseEstimator = LinearSVC(dual=False,
                                              penalty='l1',
                                              tol=0.00001,
                                              C=1.0,
                                              class_weight=None)
                elif parameters['base_estimator'] == 'random forest':
                    baseEstimator = RandomForestClassifier(n_estimators=20,
                                                           max_features='auto',
                                                           min_samples_leaf=3,
                                                           max_depth=3)
                else:
                    print 'cls2 ' + parameters['base_estimator']
                self.classifier = AdaBoostClassifier(
                    base_estimator=baseEstimator,
                    n_estimators=parameters['n_estimators'],
                    learning_rate=parameters['learning_rate'],
                    algorithm=parameters['algorithm'])

    def getAlgorithm(self):
        return self.algorithm

    def getParameters(self):
        return self.classifier.get_params()

    def fit(self, X, Y):
        self.classifier.fit(X, Y)

    def predict(self, X):
        return self.classifier.predict(X)

    def aggregation(self, Y, maxGap):
        loc1_1 = 0
        for i in range(len(Y) - maxGap):
            if Y[i] > 0.5:
                loc1_2 = loc1_1
                loc1_1 = i
                if 1.5 < loc1_1 - loc1_2 < maxGap:
                    for iter in range(loc1_2 + 1, loc1_1):
                        Y[iter] = 1