Example #1
0
    def general_analysis(self):
        """Baseline evaluation of an out-of-the-box XGBoost classifier.

        Plots a learning curve, reports fit/predict timing, 10-fold CV
        accuracy, and train/test accuracy plus weighted F1.
        """
        print("\n######")
        print("eXtreme Gradient Boosted Decision Tree Classifier:")
        print("Default baseline values")

        clf = XGBClassifier(n_jobs=-1)
        curve_title = '{} XGB Learning Curve'.format(self.data.index.name)
        plot_learning_curve(clf, curve_title, self.data, self.target, cv=5)

        print("~~~~~~")
        print("Execution time metrics")
        x_tr, x_te, y_tr, y_te = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        features = self.data.drop(columns=[self.target], axis=1)
        labels = self.data[self.target]
        fit_times, pred_times = measure_execution_time(
            clf, features, labels, iterations=5)
        print(f"Training time input dim of {x_tr.shape} : "
              f"{np.mean(fit_times):.4f} (+/- {np.std(fit_times):.4f})")
        print(f"Testing time input dim of {x_te.shape}: "
              f"{np.mean(pred_times):.4f} (+/- {np.std(pred_times):.4f})")

        print("\n~~~~~~")
        all_x = pd.concat([x_tr, x_te])
        all_y = pd.concat([y_tr, y_te])
        scores = cross_val_score(clf, all_x, all_y, cv=10, n_jobs=-1)
        print("10 Fold Cross Validation Accuracy: "
              f"{scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

        # Final fit on the train split; report both in- and out-of-sample.
        clf.fit(x_tr, y_tr)
        train_preds = clf.predict(x_tr)
        test_preds = clf.predict(x_te)

        print("Training Accuracy:",
              accuracy_score(y_true=y_tr, y_pred=train_preds))
        print("Training F1:",
              f1_score(y_true=y_tr, y_pred=train_preds, average='weighted'))
        print("Testing Accuracy:",
              accuracy_score(y_true=y_te, y_pred=test_preds))
        print("Testing F1:",
              f1_score(y_true=y_te, y_pred=test_preds, average='weighted'))
        print('~~~~~~\n')
Example #2
0
    def general_analysis(self):
        """Baseline evaluation of KNN with default 5 neighbors.

        Plots learning curves for uniform and distance weighting, times
        fit/predict on standardized features, then reports 10-fold CV
        accuracy and train/test accuracy plus weighted F1 for each
        weighting scheme.
        """
        print("\n######")
        print("KNN Classifier:")
        print("Default Baseline values (5 neighbors)")

        clf = KNeighborsClassifier(n_jobs=-1)
        plot_learning_curve(
            clf,
            '{} KNN Learning Curve (uniform)'.format(self.data.index.name),
            self.data, self.target, cv=5, scale=True)

        clf = KNeighborsClassifier(weights='distance', n_jobs=-1)
        plot_learning_curve(
            clf,
            '{} KNN Learning Curve (distance)'.format(self.data.index.name),
            self.data, self.target, cv=5, scale=True)

        print("\n~~~~~~")
        print("Execution time metrics")
        x_tr, x_te, y_tr, y_te = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        # Standardize using statistics from the training split only.
        scaler = StandardScaler().fit(x_tr.astype('float'))
        x_tr_std = scaler.transform(x_tr.astype('float'))
        x_te_std = scaler.transform(x_te.astype('float'))

        scaled_all = pd.concat([pd.DataFrame(x_tr_std),
                                pd.DataFrame(x_te_std)])
        labels_all = pd.concat([y_tr, y_te])
        # NOTE: timing uses the distance-weighted clf constructed above.
        fit_times, pred_times = measure_execution_time(
            clf, scaled_all, labels_all)
        print(f"Training time input dim of {x_tr.shape} : "
              f"{np.mean(fit_times):.4f} (+/- {np.std(fit_times):.4f})")
        print(f"Testing time input dim of {x_te.shape}: "
              f"{np.mean(pred_times):.4f} (+/- {np.std(pred_times):.4f})")

        for weighting in ['uniform', 'distance']:
            print("\n~~~~~~")
            print('{} weights:'.format(weighting.capitalize()))
            clf = KNeighborsClassifier(weights=weighting, n_jobs=-1)
            scores = cross_val_score(
                clf, scaled_all, labels_all, cv=10, n_jobs=-1)
            print("10 Fold Cross Validation Accuracy: "
                  f"{scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

            clf.fit(x_tr_std, y_tr)
            train_preds = clf.predict(x_tr_std)
            test_preds = clf.predict(x_te_std)
            print("Training Accuracy:",
                  accuracy_score(y_true=y_tr, y_pred=train_preds))
            print("Training F1:",
                  f1_score(y_true=y_tr, y_pred=train_preds,
                           average='weighted'))
            print("Testing Accuracy:",
                  accuracy_score(y_true=y_te, y_pred=test_preds))
            print("Testing F1:",
                  f1_score(y_true=y_te, y_pred=test_preds,
                           average='weighted'))

        print("~~~~~~\n")
Example #3
0
    def general_analysis(self):
        """Baseline evaluation of an unconstrained decision tree.

        Plots a learning curve, reports fit/predict timing, then — for
        both the Gini and entropy split criteria — 10-fold CV accuracy
        and train/test accuracy plus weighted F1.
        """
        print("\n######")
        print("Decision Tree Classifier:")
        print("Default Baseline values (no max depth or max leaf nodes)\n")

        clf = DecisionTreeClassifier(random_state=self.random_state)
        plot_learning_curve(clf,
                            '{} Decision Tree Learning Curve'.format(
                                self.data.index.name),
                            self.data, self.target, cv=5)

        print("\n~~~~~~")
        print("Execution time metrics")
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random_state)

        # `axis` is redundant when `columns=` is given; drop by name only.
        training_time, testing_time = measure_execution_time(
            clf,
            self.data.drop(columns=[self.target]),
            self.data[self.target])
        print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
            X_train.shape, np.mean(training_time), np.std(training_time)))
        print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
            X_test.shape, np.mean(testing_time), np.std(testing_time)))

        def _evaluate(tree_clf):
            # Shared reporting for one split criterion: 10-fold CV on the
            # full data, then train/test accuracy and weighted F1.
            scores = cross_val_score(tree_clf,
                                     pd.concat([X_train, X_test]),
                                     pd.concat([y_train, y_test]),
                                     cv=10, n_jobs=-1)
            print("10 Fold Cross Validation Accuracy: "
                  "{:.4f} (+/- {:.4f})".format(scores.mean(),
                                               scores.std() * 2))

            tree_clf.fit(X_train, y_train)
            preds_train = tree_clf.predict(X_train)
            preds_test = tree_clf.predict(X_test)

            print("Training Accuracy:",
                  accuracy_score(y_true=y_train, y_pred=preds_train))
            print("Training F1:",
                  f1_score(y_true=y_train, y_pred=preds_train,
                           average='weighted'))
            print("Testing Accuracy:",
                  accuracy_score(y_true=y_test, y_pred=preds_test))
            print("Testing F1:",
                  f1_score(y_true=y_test, y_pred=preds_test,
                           average='weighted'))

        print("\n~~~~~~")
        print('Split on Gini Importance:')
        _evaluate(clf)

        print('\n~~~~~~')
        print('Split on Entropy Gain:')
        # FIX: was a hard-coded random_state=7308; use the instance seed
        # like the Gini tree so the two runs are consistently reproducible.
        _evaluate(DecisionTreeClassifier(criterion='entropy',
                                         random_state=self.random_state))
        print("~~~~~~\n")
Example #4
0
    def general_analysis(self):
        """Baseline evaluation of a default MLP classifier.

        Works on standardized features: plots a learning curve, reports
        fit/predict timing, 10-fold CV accuracy, and train/test accuracy
        plus weighted F1. Warnings (e.g. convergence) are suppressed for
        the whole run.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print("\n######")
            print("Multilayer Perceptron Classifier:")
            print('Default Baseline values\n')

            clf = MLPClassifier(random_state=self.random, max_iter=1000)
            plot_learning_curve(
                clf,
                '{} MLP Learning Curve'.format(self.data.index.name),
                self.data, self.target, cv=5, scale=True)

            print("\n~~~~~~")
            print("Execution time metrics")
            x_tr, x_te, y_tr, y_te = prep_data_for_clf(
                self.data, self.target, random_state=self.random)

            # Standardize using statistics from the training split only.
            scaler = StandardScaler().fit(x_tr.astype('float'))
            x_tr_std = scaler.transform(x_tr.astype('float'))
            x_te_std = scaler.transform(x_te.astype('float'))

            scaled_all = pd.concat([pd.DataFrame(x_tr_std),
                                    pd.DataFrame(x_te_std)])
            labels_all = pd.concat([y_tr, y_te])
            fit_times, pred_times = measure_execution_time(
                clf, scaled_all, labels_all)
            print(f"Training time input dim of {x_tr.shape} : "
                  f"{np.mean(fit_times):.4f} (+/- {np.std(fit_times):.4f})")
            print(f"Testing time input dim of {x_te.shape}: "
                  f"{np.mean(pred_times):.4f} (+/- {np.std(pred_times):.4f})")

            print("\n~~~~~~")
            scores = cross_val_score(
                clf, scaled_all, labels_all, cv=10, n_jobs=-1)
            print("10 Fold Cross Validation Accuracy: "
                  f"{scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

            clf.fit(x_tr_std, y_tr)
            train_preds = clf.predict(x_tr_std)
            test_preds = clf.predict(x_te_std)

            print("Training Accuracy:",
                  accuracy_score(y_true=y_tr, y_pred=train_preds))
            print("Training F1:",
                  f1_score(y_true=y_tr, y_pred=train_preds,
                           average='weighted'))
            print("Testing Accuracy:",
                  accuracy_score(y_true=y_te, y_pred=test_preds))
            print("Testing F1:",
                  f1_score(y_true=y_te, y_pred=test_preds,
                           average='weighted'))
            print('~~~~~~\n')