# Example 1
def balance_classes(X, X_y, X_strat, target, train_cols, method):
    """Rebalance the rows of X according to the class distribution of X_strat.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix to resample.
    X_y : array-like
        Target values (currently unused except as documented below).
    X_strat : pandas.Series
        Stratification column; its string-cast values define the classes
        to balance and its ``.name`` identifies the column inside X.
    target : str
        Name of the target column extracted from the resampled frame.
    train_cols : list
        Unused here; kept for interface compatibility with callers.
    method : str
        One of "over", "under", "smote". Any other value leaves X unchanged.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The resampled features and the target column taken from them.
    """
    y = X_strat.astype(str)
    x = X.copy()

    if method == "over":
        from imblearn.over_sampling import RandomOverSampler
        # fit_resample replaced the removed fit_sample API (imblearn >= 0.4)
        x, y = RandomOverSampler().fit_resample(x, y)

    elif method == "under":
        from imblearn.under_sampling import RandomUnderSampler
        x, y = RandomUnderSampler().fit_resample(x, y)

    elif method == "smote":
        from imblearn.over_sampling import SMOTE
        # SMOTE interpolates between samples, so only numeric columns
        # can be synthesized; everything else is carried over untouched.
        numeric_cols = x.select_dtypes(include=[
            "uint8", "int16", "int32", "int64", "float16", "float32",
            "float64"
        ]).columns
        # BUG FIX: set(X_strat.name) built a set of the *characters* of the
        # column name; a one-element set {X_strat.name} is what was intended.
        features = list(set(numeric_cols) - {X_strat.name})
        no_action = list(set(x.columns) - set(features))
        x_sm = x[features].copy()
        x_no_action = x[no_action].copy()
        x, y = SMOTE().fit_resample(x_sm, y)
        x[no_action] = x_no_action[no_action]
        x[X_strat.name] = y

    X = x.copy()
    # NOTE(review): assumes `target` is already a column of X — the
    # commented-out `x[target] = X_y` in the original suggests this may
    # not always hold; verify against callers.
    X_y = x[target]
    gc.collect()
    return X, X_y
# Example 2
    def train(self):
        """Train a LightGBM classifier with randomized hyper-parameter search.

        Splits the stored training data, oversamples the minority class,
        runs RandomizedSearchCV over an LGBMClassifier, refits a final
        model with a decaying learning-rate callback, persists it to
        MODELS_DIR, and returns evaluation metrics on the held-out split.

        Returns
        -------
        dict
            "accuracy", "roc", "cm" (normalized confusion matrix) and
            "feature_importance" (DataFrame of Value/Feature pairs).
        """
        fit_params = {
            'early_stopping_rounds': 50,
            'eval_metric': 'auc',
            'eval_names': ['valid'],
            'verbose': 100,
            'categorical_feature': 'auto'
        }

        param_test = {
            'num_leaves': sp_randint(6, 50),
            'min_child_samples': sp_randint(100, 500),
            'min_child_weight':
            [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
            'subsample': sp_uniform(loc=0.2, scale=0.8),
            'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
            'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
            'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
        }
        # split data
        X_train, X_test, y_train, y_test = train_test_split(
            self.df_train_features,
            self.df_train_label,
            test_size=self.test_size)
        # random oversampling of the training split only (the test split
        # stays untouched so metrics reflect the true distribution)
        ros = RandomOverSampler(random_state=0)
        # BUG FIX: the seeded sampler `ros` was created but a fresh,
        # unseeded RandomOverSampler() was used instead, defeating the
        # intended reproducibility.
        X_train, y_train = ros.fit_resample(X_train, y_train)
        clf = lgb.LGBMClassifier(max_depth=-1,
                                 random_state=314,
                                 silent=True,
                                 metric='None',
                                 n_jobs=4,
                                 n_estimators=900)
        # set randomsearch optimization params
        random_search = RandomizedSearchCV(estimator=clf,
                                           param_distributions=param_test,
                                           scoring='roc_auc',
                                           cv=5,
                                           refit=True,
                                           random_state=314,
                                           verbose=False)

        # the test split doubles as the early-stopping validation set
        fit_params = {**fit_params, **{'eval_set': [(X_test, y_test)]}}

        # fit random search
        random_search.fit(X_train.copy(), y_train.copy(), **fit_params)

        # build final classifier from the best found hyper-parameters
        clf_final = lgb.LGBMClassifier(
            **random_search.best_estimator_.get_params())
        clf_final.fit(
            X_train,
            y_train,
            **fit_params,
            callbacks=[
                lgb.reset_parameter(
                    learning_rate=self.learning_rate_010_decay_power_0995)
            ])

        # save model
        joblib.dump(clf_final, os.path.join(MODELS_DIR, "{}.pkl".format("ML")))

        predicted = clf_final.predict(X_test)
        return {
            "accuracy":
            accuracy_score(y_test, predicted),
            "roc":
            roc_auc_score(y_test, predicted),
            "cm":
            confusion_matrix(y_test, predicted, normalize="true"),
            "feature_importance":
            pd.DataFrame(sorted(
                zip(clf_final.feature_importances_, X_test.columns)),
                         columns=['Value', 'Feature'])
        }
    def train(self):
        """Train a LightGBM classifier using hyper-parameters from `config`.

        NOTE(review): this method redefines `train` with the same name as
        the definition above it — Python keeps only the later definition,
        silently shadowing the earlier one. Verify which variant is meant
        to be live.

        Returns
        -------
        dict
            "accuracy", "roc", "cm" (normalized confusion matrix) and
            "feature_importance" (DataFrame of Value/Feature pairs).
        """
        # split data
        X_train, X_test, y_train, y_test = train_test_split(
            self.df_train_features,
            self.df_train_label,
            test_size=self.test_size
        )
        # random oversampling of the training split only
        ros = RandomOverSampler(random_state=0)
        # BUG FIX: the seeded sampler `ros` was created but a fresh,
        # unseeded RandomOverSampler() was used instead, defeating the
        # intended reproducibility.
        X_train, y_train = ros.fit_resample(X_train, y_train)
        clf = lgb.LGBMClassifier(
            max_depth=-1,
            random_state=314,
            silent=True,
            metric='None',
            n_jobs=4,
            n_estimators=10000)
        # randomized search over the externally configured parameter grid
        random_search = RandomizedSearchCV(
            estimator=clf,
            param_distributions=config.param_test,
            scoring='roc_auc',
            cv=5,
            refit=True,
            random_state=314,
            verbose=False)

        # the test split doubles as the early-stopping validation set
        fit_params = {
            **config.fit_params,
            **{'eval_set': [(X_test, y_test)]}}

        # fit random search
        random_search.fit(
            X_train.copy(),
            y_train.copy(),
            **fit_params)

        # build final classifier from the best found hyper-parameters
        clf_final = lgb.LGBMClassifier(**random_search.best_estimator_.get_params())
        clf_final.fit(
            X_train,
            y_train,
            **fit_params,
            callbacks=[
                lgb.reset_parameter(
                    learning_rate=self.learning_rate_010_decay_power_0995
                    )])

        # save model
        joblib.dump(clf_final, os.path.join(
            config.MODELS_DIR,
            "{}.pkl".format("ML")))

        predicted = clf_final.predict(X_test)
        return {
            "accuracy": accuracy_score(y_test, predicted),
            "roc": roc_auc_score(y_test, predicted),
            "cm": confusion_matrix(y_test, predicted, normalize="true"),
            "feature_importance": pd.DataFrame(
                sorted(
                    zip(
                        clf_final.feature_importances_,
                        X_test.columns)),
                    columns=['Value', 'Feature'])
        }