Пример #1
0
    def fit(self, X_train, y_train):
        """Cross-validate and fit one pipeline per enabled target, then score them.

        Every target is switched on by a flag of the same name (``intrusion``,
        ``avoidance``, ...). The flags are free names, not attributes of
        ``self``, so they are resolved outside this method. For each enabled
        target the method

        1. builds target-specific features with ``FeatureEngineering``,
        2. prints the mean F1 of a 5-fold stratified cross-validation,
        3. fits the pipeline on all rows and stores it as ``self.pipe_<name>``.

        The training predictions of the enabled pipelines are then combined
        with a bitwise AND (disabled targets contribute the constant ``1``)
        and accuracy, F1, recall and precision against ``y_train`` are printed.

        Args:
            X_train: indexed with ``self.features`` and with the label column
                of every enabled target except ``PCL_Strict3`` (presumably a
                pandas DataFrame - confirm against the caller).
            y_train: labels the combined prediction is scored against; its
                ``"PCL_Strict3"`` entry supplies the labels of that target.

        Returns:
            None. Scores are printed and the pipelines are stored on ``self``.
            When no target is enabled nothing is fitted and no metrics are
            printed.
        """
        # One row per target:
        #   (enabled flag, name, label column, object holding the label
        #    column, lazy pipeline factory)
        # ``name`` is both the printed label and the suffix of the
        # ``self.pipe_<name>`` attribute. Factories are lambdas so estimators
        # are only constructed for enabled targets.
        target_specs = [
            (intrusion, "intrusion", "intrusion_cutoff", X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection', SelectFpr(alpha=0.05)),
                 ('sampling', BorderlineSMOTE(k_neighbors=10)),
                 ('classifier',
                  XGBClassifier(n_estimators=300, max_depth=5))])),
            (avoidance, "avoidance", "avoidance_cutoff", X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection',
                  RFE(estimator=XGBClassifier(scale_pos_weight=5.88,
                                              n_estimators=100),
                      n_features_to_select=20)),
                 ('classifier',
                  BalancedRandomForestClassifier(n_estimators=300,
                                                 max_depth=10))])),
            # The label column is spelled "hypertention" in the data.
            (hypertension, "hypertension", "hypertention_cutoff", X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection',
                  RFE(estimator=XGBClassifier(n_estimators=100,
                                              scale_pos_weight=3.51),
                      n_features_to_select=20)),
                 ('sampling', SMOTE(k_neighbors=10)),
                 ('classifier',
                  BalancedRandomForestClassifier(n_estimators=100))])),
            (depression, "depression", "depression_cutoff", X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection', SelectFdr(alpha=0.1)),
                 ('sampling', SMOTE(k_neighbors=5)),
                 ('classifier',
                  RandomForestClassifier(n_estimators=100))])),
            (only_avoidance, "only_avoidance", "only_avoidance_cutoff",
             X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection',
                  RFE(XGBClassifier(n_estimators=100, max_depth=3),
                      n_features_to_select=10)),
                 ('classifier',
                  BalancedRandomForestClassifier(n_estimators=500,
                                                 max_depth=10))])),
            # The only target whose labels are taken from y_train.
            (PCL_Strict3, "PCL_Strict3", "PCL_Strict3", y_train,
             lambda: Pipeline(steps=[
                 ('feature_selection', SelectKBest(k=20)),
                 ('sampling', SMOTE(k_neighbors=5)),
                 ('classifier',
                  XGBClassifier(max_depth=3, n_estimators=100))])),
            (regression_cutoff_33, "regression_cutoff_33",
             "regression_cutoff_33", X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection', SelectFpr(alpha=0.033)),
                 ('sampling', SMOTE(k_neighbors=10)),
                 ('classifier',
                  RandomForestClassifier(n_estimators=100, max_depth=5))])),
            (regression_cutoff_50, "regression_cutoff_50",
             "regression_cutoff_50", X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection', SelectKBest(k=10)),
                 ('sampling', SMOTE(k_neighbors=10)),
                 ('classifier',
                  XGBClassifier(max_depth=2, n_estimators=100))])),
            (tred_cutoff, "tred_cutoff", "tred_cutoff", X_train,
             lambda: Pipeline(steps=[
                 ('feature_selection', SelectKBest(k=20)),
                 ('sampling', SMOTE(k_neighbors=10)),
                 ('classifier',
                  XGBClassifier(n_estimators=100, max_depth=2))])),
        ]

        # Phase 1: cross-validate and fit every enabled pipeline.
        train_inputs = {}
        for enabled, name, column, label_source, build_pipeline in target_specs:
            if not enabled:
                continue

            features = FeatureEngineering(
                X_train[self.features], column).engineer_features().values
            labels = label_source[column].apply(int)

            pipeline = build_pipeline()
            setattr(self, f"pipe_{name}", pipeline)

            scores = cross_val_score(pipeline,
                                     features,
                                     labels,
                                     scoring='f1',
                                     cv=StratifiedKFold(5))
            # Divide by the actual number of folds instead of a literal 5.
            print(f"{name} {sum(scores) / len(scores)}")
            pipeline.fit(features, labels)
            train_inputs[name] = features

        if not train_inputs:
            # With every target disabled the combined prediction would be the
            # bare integer 1, which the sklearn metrics below reject.
            return

        # Phase 2: predict on the training rows; a disabled target votes 1,
        # the neutral element of the AND for 0/1 predictions.
        votes = [
            getattr(self, f"pipe_{name}").predict(train_inputs[name])
            if enabled else 1
            for enabled, name, _column, _source, _factory in target_specs
        ]
        y_pred = votes[0]
        for vote in votes[1:]:
            y_pred = y_pred & vote

        # NOTE(review): y_train is indexed with "PCL_Strict3" above but is
        # passed to the metrics as a whole - confirm it is 1-D / single-column.
        y_target = y_train

        acc = accuracy_score(y_target, y_pred)
        f1 = f1_score(y_target, y_pred)
        recall = recall_score(y_target, y_pred)
        precision = precision_score(y_target, y_pred)
        print("training scores")
        print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
Пример #2
0
    def fit(self, X_train, y_train):
        """Fit one pipeline per target and, optionally, a model combining them.

        For every name in ``self.targets_list`` the pipeline registered under
        that name in ``pipeline_per_target`` is cross-validated (5-fold
        stratified F1, mean printed), fitted on all rows and stored in
        ``self.trained_pipelines[target]``.

        Behaviour switches read from ``self``:

        * ``use_feature_engineering`` - build the features with
          ``FeatureEngineering`` instead of using ``self.features`` directly.
        * ``train_on_partial_prediction`` - additionally fit on 75% of the
          rows, print the F1 on the held-out 25% and use the held-out
          ``predict_proba`` column 0 as input of the combined model.
          Otherwise the predictions on the full training data are used.
        * ``check_on_test_set`` - print F1 / recall / precision of every
          target's pipeline on ``self.X_test`` against ``self.y_test``.
        * ``use_and_func`` - when false, train an ``XGBClassifier`` on the
          per-target predictions (one column per target) against the
          ``PCL_Strict3`` labels and store it as ``self.combined_model``.

        Args:
            X_train: indexed with ``self.features`` and with every target name
                except ``"PCL_Strict3"``.
            y_train: indexed with ``"PCL_Strict3"``.

        Returns:
            None.
        """
        predictions_list = []
        # (train rows, held-out rows); drawn once and shared by all targets so
        # that every column of the stacked matrix describes the same samples.
        held_out_split = None

        for target in self.targets_list:
            if self.use_feature_engineering:
                X = FeatureEngineering(X_train[self.features], target).engineer_features().values
            else:
                X = X_train[self.features].values

            # PCL_Strict3 is the only target whose labels live in y_train.
            label_source = y_train if target == "PCL_Strict3" else X_train
            y = label_source[target].apply(int)

            pipeline = pipeline_per_target[target]
            scores = cross_val_score(pipeline, X, y, scoring='f1', cv=StratifiedKFold(5))
            print(f"{target} - {sum(scores)/len(scores)}")

            if self.train_on_partial_prediction:
                combined_y = pd.DataFrame(y, columns=[target])
                if target != "PCL_Strict3":
                    combined_y["PCL_Strict3"] = y_train["PCL_Strict3"].apply(int)

                if held_out_split is None:
                    held_out_split = train_test_split(np.arange(len(X)), test_size=0.25)
                train_rows, test_rows = held_out_split

                _X_train, _X_test = X[train_rows], X[test_rows]
                _y_train = combined_y.iloc[train_rows]
                _y_test = combined_y.iloc[test_rows]

                self.trained_pipelines[target] = pipeline.fit(_X_train, _y_train[target])
                y_pred = self.trained_pipelines[target].predict(_X_test)
                predictions_list.append(self.trained_pipelines[target].predict_proba(_X_test)[:, 0])
                print("test f1", target, f1_score(_y_test[target], y_pred))
                # Refit on all rows; this is the pipeline that stays stored.
                self.trained_pipelines[target] = pipeline.fit(X, y)
                # Labels of the held-out rows for the combined model.
                y = _y_test["PCL_Strict3"]
            else:
                self.trained_pipelines[target] = pipeline.fit(X, y)
                predictions_list.append(self.trained_pipelines[target].predict(X))
                y = y_train["PCL_Strict3"]

            if self.check_on_test_set:
                if self.use_feature_engineering:
                    X_test = FeatureEngineering(self.X_test[self.features], target).engineer_features().values
                else:
                    X_test = self.X_test[self.features].values

                model = self.trained_pipelines[target]
                y_pred = model.predict(X_test)
                # NOTE(review): every target is scored against self.y_test as
                # a whole, not against that target's own test labels - confirm
                # this is intended.
                s_f = f1_score(self.y_test, y_pred)
                s_p = precision_score(self.y_test, y_pred)
                s_r = recall_score(self.y_test, y_pred)
                print(f"test f1 {target}", s_f)
                print(f"test recall {target}", s_r)
                print(f"test precision {target}", s_p)

        if not self.use_and_func:
            c = 2
            pipe = Pipeline(steps=[
                                   ('clf', XGBClassifier(scale_pos_weight=c))])
            # One row per sample, one column per target. A plain
            # reshape(-1, n_targets) of the targets-major array would mix
            # samples and targets inside each row.
            # NOTE(review): the prediction path must build its feature matrix
            # in the same (samples x targets) layout.
            stacked = np.column_stack(predictions_list)
            self.combined_model = pipe.fit(stacked, y)
Пример #3
0
    def predict(self, X_test):
        """Predict with every enabled target pipeline and AND the results.

        Each target is controlled by a flag of the same name that is resolved
        outside this method. For an enabled target the features are built
        with ``FeatureEngineering`` for that target's column and passed to the
        matching ``self.pipe_<name>`` pipeline; a disabled target contributes
        the constant ``1``.

        Args:
            X_test: indexed with ``self.features``.

        Returns:
            The bitwise AND of all nine contributions. If every target is
            disabled this is the plain integer ``1``.
        """
        # (name, enabled flag, column handed to FeatureEngineering)
        targets = (
            ("intrusion", intrusion, "intrusion_cutoff"),
            ("avoidance", avoidance, "avoidance_cutoff"),
            # The column is spelled "hypertention" in the data.
            ("hypertension", hypertension, "hypertention_cutoff"),
            ("depression", depression, "depression_cutoff"),
            ("only_avoidance", only_avoidance, "only_avoidance_cutoff"),
            ("PCL_Strict3", PCL_Strict3, "PCL_Strict3"),
            ("regression_cutoff_33", regression_cutoff_33,
             "regression_cutoff_33"),
            ("regression_cutoff_50", regression_cutoff_50,
             "regression_cutoff_50"),
            ("tred_cutoff", tred_cutoff, "tred_cutoff"),
        )

        votes = {}
        for name, enabled, column in targets:
            if not enabled:
                votes[name] = 1
                continue
            engineered = FeatureEngineering(
                X_test[self.features], column).engineer_features().values
            votes[name] = getattr(self, "pipe_" + name).predict(engineered)

        # Order in which the contributions are AND-ed together.
        and_order = ("hypertension", "avoidance", "intrusion", "depression",
                     "only_avoidance", "PCL_Strict3", "regression_cutoff_33",
                     "regression_cutoff_50", "tred_cutoff")
        y_pred = votes[and_order[0]]
        for name in and_order[1:]:
            y_pred = y_pred & votes[name]

        return y_pred
    def predict(self, X_test):
        """Run the enabled target pipelines, then predict with ``self.pipe_combined``.

        Each target is controlled by a flag of the same name that is resolved
        outside this method. Enabled targets are predicted from
        ``FeatureEngineering`` features with ``self.pipe_<name>``; disabled
        ones contribute ``1``. The AND of those contributions is computed but
        not returned: the returned value comes from ``self.pipe_combined``,
        fed with eight columns selected from ``X_test``.

        Args:
            X_test: indexed with ``self.features`` and with the eight column
                names listed below.

        Returns:
            The output of ``self.pipe_combined.predict``.
        """
        # (name, enabled flag, column handed to FeatureEngineering)
        targets = (
            ("intrusion", intrusion, "intrusion_cutoff"),
            ("avoidance", avoidance, "avoidance_cutoff"),
            # The column is spelled "hypertention" in the data.
            ("hypertension", hypertension, "hypertention_cutoff"),
            ("depression", depression, "depression_cutoff"),
            ("only_avoidance", only_avoidance, "only_avoidance_cutoff"),
            ("PCL_Strict3", PCL_Strict3, "PCL_Strict3"),
            ("regression_cutoff_33", regression_cutoff_33, "regression_cutoff_33"),
            ("regression_cutoff_50", regression_cutoff_50, "regression_cutoff_50"),
            ("tred_cutoff", tred_cutoff, "tred_cutoff"),
        )

        votes = {}
        for name, enabled, column in targets:
            if enabled:
                engineered = FeatureEngineering(X_test[self.features], column).engineer_features().values
                votes[name] = getattr(self, "pipe_" + name).predict(engineered)
            else:
                votes[name] = 1

        # Order in which the contributions are AND-ed together.
        and_order = ("hypertension", "avoidance", "intrusion", "depression",
                     "only_avoidance", "PCL_Strict3", "regression_cutoff_33",
                     "regression_cutoff_50", "tred_cutoff")
        # NOTE(review): this AND result is overwritten below and never used.
        y_pred = votes[and_order[0]]
        for name in and_order[1:]:
            y_pred = y_pred & votes[name]

        # NOTE(review): these are strings used as X_test column names, not the
        # predictions computed above, and no "y_pred_regression" value is
        # produced in this method - confirm X_test really carries such columns.
        preds = ["y_pred_hypertension", "y_pred_avoidance", "y_pred_intrusion", "y_pred_regression"]

        combined_columns = ['q6.11_NUMB_pcl2', 'q6.13_SLEEP_pcl1', 'intrusion_pcl2', 'phq2'] + preds
        X_combined = X_test[combined_columns].values

        y_pred = self.pipe_combined.predict(X_combined)
        return y_pred