Example #1
from typing import Any, Dict

import optuna


def hp_space_optuna(trial: optuna.trial.Trial) -> Dict[str, Any]:
    # Search space for hyperparameter tuning; the parameter names match
    # Hugging Face TrainingArguments fields.
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 7),
        "per_device_train_batch_size": trial.suggest_categorical(
            "per_device_train_batch_size", [64, 128]
        ),
        "fp16": trial.suggest_categorical("fp16", [True, False]),
        "weight_decay": trial.suggest_float("weight_decay", 0.005, 0.02, log=True),
    }
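A space like this is meant to be handed to Hugging Face's Trainer.hyperparameter_search. A minimal usage sketch (it assumes a `trainer` already built with a `model_init` callable, which is not shown here):

best_run = trainer.hyperparameter_search(
    hp_space=hp_space_optuna,  # the search space defined above
    backend="optuna",
    n_trials=20,               # illustrative trial budget
    direction="minimize",      # minimise the evaluation loss
)
print(best_run.hyperparameters)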
Example #2
    def f(trial):
        # type: (optuna.trial.Trial) -> float

        # suggest_uniform / suggest_loguniform are the legacy API,
        # superseded by suggest_float in current Optuna releases
        x = trial.suggest_uniform('x', -10, 10)
        y = trial.suggest_loguniform('y', 10, 20)
        z = trial.suggest_categorical('z', (10, 20.5, 30))

        return x**2 + y**2 + z
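Running the search is a separate step; a minimal sketch:

import optuna

study = optuna.create_study(direction="minimize")
study.optimize(f, n_trials=100)
print(study.best_params, study.best_value)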
Example #3
    def f(trial: optuna.trial.Trial) -> float:

        x = trial.suggest_uniform("x", -10, 10)
        y = trial.suggest_loguniform("y", 10, 20)
        z = trial.suggest_categorical("z", (10.0, 20.5, 30.0))
        # every categorical choice is a float, so the suggested value must be a float
        assert isinstance(z, float)

        return x**2 + y**2 + z
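suggest_uniform and suggest_loguniform are deprecated in current Optuna releases in favour of suggest_float; a sketch of the same space in the modern API:

import optuna


def f_modern(trial: optuna.trial.Trial) -> float:
    x = trial.suggest_float("x", -10, 10)           # was suggest_uniform
    y = trial.suggest_float("y", 10, 20, log=True)  # was suggest_loguniform
    z = trial.suggest_categorical("z", (10.0, 20.5, 30.0))
    return x**2 + y**2 + z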
Example #4
    def objective(trial: BaseTrial) -> float:
        # BaseTrial (from optuna.trial) covers Trial, FrozenTrial and FixedTrial

        a = trial.suggest_uniform("a", 0.0, 10.0)
        b = trial.suggest_loguniform("b", 0.1, 10.0)
        c = trial.suggest_discrete_uniform("c", 0.0, 10.0, 1.0)
        d = trial.suggest_int("d", 0, 10)
        e = trial.suggest_categorical("e", [0, 1, 2])
        f = trial.suggest_int("f", 1, 10, log=True)

        assert isinstance(e, int)
        return a + b + c + d + e + f
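suggest_discrete_uniform is deprecated as well; suggest_float with a step argument covers it. A sketch of the same space in the current API:

from optuna.trial import BaseTrial


def objective_modern(trial: BaseTrial) -> float:
    a = trial.suggest_float("a", 0.0, 10.0)
    b = trial.suggest_float("b", 0.1, 10.0, log=True)
    c = trial.suggest_float("c", 0.0, 10.0, step=1.0)  # discrete uniform
    d = trial.suggest_int("d", 0, 10)
    e = trial.suggest_categorical("e", [0, 1, 2])
    f = trial.suggest_int("f", 1, 10, log=True)
    return a + b + c + d + e + f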
Example #5
    def __call__(self, trial):

        # optional variable sampling (to implement in the future)

        # if x_var_num_range is not None:
        #   number_of_var = list(range(self.x_var_num_range[0], self.x_var_num_range[1]+1))
        #   x_var = np.random.choice(list(x_var.columns), number_of_var)
        # else:
        #   x_var = x_train.columns

        if self.params is not None:
            # eval() returns the evaluated expression; exec() always returns None
            params = eval(self.params)
        else:
            params = None

        if self.method == 'SVC':  # support vector machine

            SVM_params = {
                'svc_c': trial.suggest_loguniform('svc_c', 1e-10, 1e10)
            }
            if params is not None:
                SVM_params.update(params)
            model = sklearn.svm.SVC(**SVM_params)

        elif self.method == 'RF':  # random forest

            # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.feature_importances_
            RF_params = {
                'n_estimators':
                int(trial.suggest_uniform('n_estimators', 20, 300)),
                'max_depth':
                int(trial.suggest_uniform('max_depth', 2, 50)),
                'criterion':
                trial.suggest_categorical('criterion', ['gini', 'entropy']),
                'bootstrap':
                trial.suggest_categorical('bootstrap', [True, False]),
                'class_weight':
                trial.suggest_categorical(
                    'class_weight', ['balanced', 'balanced_subsample', None])
            }
            if params is not None:
                RF_params.update(params)
            model = RandomForestClassifier(**RF_params, n_jobs=15)

        elif self.method == 'AB':  # ada boost

            AB_params = {
                'n_estimators':
                int(trial.suggest_uniform('n_estimators', 20, 300)),
                'learning_rate':
                trial.suggest_uniform('learning_rate', 0.05, 0.3),
                'algorithm':
                trial.suggest_categorical('algorithm', ['SAMME.R', 'SAMME'])
            }
            if params is not None:
                AB_params.update(params)
            model = AdaBoostClassifier(**AB_params)

        elif self.method == 'GB':  # gradient boost

            GB_params = {
                'n_estimators':
                int(trial.suggest_loguniform('n_estimators', 20, 300)),
                'learning_rate':
                trial.suggest_loguniform('learning_rate', 0.01, 0.5),
                'max_depth':
                int(trial.suggest_loguniform('max_depth', 2, 50)),
                'loss':
                trial.suggest_categorical('loss', ['deviance', 'exponential'])
            }
            if params is not None:
                GB_params.update(params)
            model = GradientBoostingClassifier(**GB_params)

        elif self.method == 'NB':  # naive bayes

            NB_params = {
                'var_smoothing':
                trial.suggest_loguniform('var_smoothing', 1e-10, 1e-05)
            }
            if params is not None:
                NB_params.update(params)
            model = skl.naive_bayes.GaussianNB(**NB_params)

        elif self.method == 'KNN':  # k-nearest neighbours

            KNN_params = {
                'n_neighbors':
                int(trial.suggest_loguniform('n_neighbors', 3, 7))
            }
            if params is not None:
                KNN_params.update(params)
            model = skl.neighbors.KNeighborsClassifier(**KNN_params)

        elif self.method == 'LR':  # logistic regression
            LR_params = {
                'penalty':
                trial.suggest_categorical('penalty', ['l1', 'l2']),
                'fit_intercept':
                trial.suggest_categorical('fit_intercept', [True, False])
            }
            if params is not None:
                LR_params.update(params)
            model = skl.linear_model.LogisticRegression(**LR_params)

        elif self.method == 'LGBM':  # LightGBM
            # illustrative LightGBM search space; LGBMClassifier does not use
            # logistic-regression parameters such as 'penalty' or 'fit_intercept'
            LGBM_params = {
                'n_estimators':
                int(trial.suggest_loguniform('n_estimators', 20, 300)),
                'learning_rate':
                trial.suggest_loguniform('learning_rate', 0.01, 0.5),
                'max_depth':
                int(trial.suggest_loguniform('max_depth', 2, 50))
            }
            if params is not None:
                LGBM_params.update(params)
            model = LGBMClassifier(**LGBM_params)

        elif self.method == 'XGB':  # XGBoost

            # https://xgboost.readthedocs.io/en/latest/parameter.html
            XGB_params = {
                'n_estimators':
                int(trial.suggest_loguniform('n_estimators', 5, 300)),
                'booster':
                trial.suggest_categorical('booster', ['dart', 'gbtree']),
                'eta':
                trial.suggest_loguniform('eta', 0.01, 0.5),
                'max_depth':
                int(trial.suggest_loguniform('max_depth', 3, 30))
                # ,'reg_lambda':       trial.suggest_loguniform( 'reg_lambda', 0, 1)
                # ,'reg_alpha' :       trial.suggest_loguniform( 'reg_alpha', 0, 1)
            }
            if params is not None:
                XGB_params.update(params)
            model = XGBClassifier(**XGB_params, nthread=15)

        elif self.method == 'CAT':  # catboost

            # https://catboost.ai/docs/concepts/python-reference_parameters-list.html#python-reference_parameters-list
            CAT_params = {
                'iterations':
                int(trial.suggest_loguniform('iterations', 20, 300)),
                'learning_rate':
                trial.suggest_loguniform('learning_rate', 0.01, 0.5),
                'depth':
                int(trial.suggest_loguniform('depth', 2, 16))
            }
            if params is not None:
                CAT_params.update(params)
            model = CatBoostClassifier(**CAT_params,
                                       early_stopping_rounds=75,
                                       logging_level='Silent')

        # calibration (optional)
        if self.calibration_method is not None:
            model = CalibratedClassifierCV(model,
                                           cv=3,
                                           method=self.calibration_method)

        # model fit
        model.fit(self.x_train, self.y_train)

        # y labels
        y_labels = list(self.y_train.drop_duplicates())
        y_labels.sort()
        self.y_labels = y_labels

        # model prediction and classification
        prob_train = pd.DataFrame(model.predict_proba(self.x_train),
                                  columns=[str(x) for x in model.classes_])
        prob_train = prob_train.reset_index(drop=True)

        prob_test = pd.DataFrame(model.predict_proba(self.x_test),
                                 columns=[str(x) for x in model.classes_])
        prob_test = prob_test.reset_index(drop=True)

        if self.priori is not None:

            # rescale the predicted probabilities by the class priors,
            # then renormalise each row so it sums to 1 again
            for col in prob_test.columns:
                priori_p = self.priori[col]
                prob_train[col] = prob_train[col] * priori_p
                prob_test[col] = prob_test[col] * priori_p

            prob_train = prob_train.div(prob_train.sum(axis=1), axis=0)
            prob_test = prob_test.div(prob_test.sum(axis=1), axis=0)

        if prob_test.shape[1] == 2 and self.threshold is not None:
            var_1 = list(self.threshold.keys())[0]
            var_2 = list(self.threshold.keys())[1]
            threshold_var_1 = list(self.threshold.values())[0]
            threshold_var_2 = list(self.threshold.values())[1]
            classification_train = pd.DataFrame([
                var_1 if x >= threshold_var_1 else var_2
                for x in prob_train[str(var_1)]
            ])
            classification_test = pd.DataFrame([
                var_1 if x >= threshold_var_1 else var_2
                for x in prob_test[str(var_1)]
            ])

        else:
            classification_train = pd.DataFrame(prob_train.idxmax(
                axis=1))  # kept as a DataFrame for the later 'concat'
            classification_test = pd.DataFrame(prob_test.idxmax(
                axis=1))  # kept as a DataFrame for the later 'concat'

        # selection and determining score function to optimize

        balanced_accuracy = skm.balanced_accuracy_score(
            y_true=self.y_test, y_pred=classification_test)
        accuracy = skm.accuracy_score(y_true=self.y_test,
                                      y_pred=classification_test)

        # this metric is not written with multilabel models in mind
        aps = average_precision_score(y_true=self.y_test,
                                      y_score=prob_test[str(self.pos_label)],
                                      pos_label=str(self.pos_label))

        # recall
        try:
            if len(self.y_labels) < 3:
                recall_train = skm.recall_score(
                    y_true=self.y_train.astype(int),
                    y_pred=classification_train.astype(int),
                    average='binary',
                    pos_label=self.pos_label)
                recall_test = skm.recall_score(
                    y_true=self.y_test.astype(int),
                    y_pred=classification_test.astype(int),
                    average='binary',
                    pos_label=self.pos_label)
            else:
                recall_train = skm.recall_score(
                    y_true=self.y_train.astype(str),
                    y_pred=classification_train.astype(str),
                    average='weighted')
                recall_test = skm.recall_score(
                    y_true=self.y_test.astype(str),
                    y_pred=classification_test.astype(str),
                    average='weighted')
        except Exception:
            recall_train = np.nan
            recall_test = np.nan

        # precision
        try:
            if len(self.y_labels) < 3:
                precision_train = skm.precision_score(
                    y_true=self.y_train.astype(int),
                    y_pred=classification_train.astype(int),
                    average='binary',
                    pos_label=self.pos_label)
                precision_test = skm.precision_score(
                    y_true=self.y_test.astype(int),
                    y_pred=classification_test.astype(int),
                    average='binary',
                    pos_label=self.pos_label)
            else:
                precision_train = skm.precision_score(
                    y_true=self.y_train.astype(str),
                    y_pred=classification_train.astype(str),
                    average='weighted')
                precision_test = skm.precision_score(
                    y_true=self.y_test.astype(str),
                    y_pred=classification_test.astype(str),
                    average='weighted')
        except Exception:
            precision_train = np.nan
            precision_test = np.nan

        # f1
        try:
            if len(self.y_labels) < 3:
                f1_train = skm.f1_score(y_true=self.y_train,
                                        y_pred=classification_train,
                                        average='binary',
                                        pos_label=str(self.pos_label))
                f1_test = skm.f1_score(y_true=self.y_test,
                                       y_pred=classification_test,
                                       average='binary',
                                       pos_label=str(self.pos_label))
            else:
                f1_train = skm.f1_score(y_true=self.y_train,
                                        y_pred=classification_train,
                                        average='weighted')
                f1_test = skm.f1_score(y_true=self.y_test,
                                       y_pred=classification_test,
                                       average='weighted')
        except Exception:
            f1_train = np.nan
            f1_test = np.nan

        scores = pd.DataFrame([[
            balanced_accuracy, accuracy, recall_train, recall_test,
            precision_train, precision_test, f1_train, f1_test, aps
        ]],
                              columns=[
                                  'balanced_accuracy', 'accuracy',
                                  'recall_train', 'recall_test',
                                  'precision_train', 'precision_test',
                                  'f1_train', 'f1_test', 'aps'
                              ])

        self.scores = pd.concat([self.scores, scores])

        # return the score that Optuna uses to drive the optimisation
        # (only a single value can be returned)
        if self.opt_function == 'balanced_accuracy':
            return balanced_accuracy
        elif self.opt_function == 'aps':
            return aps
        elif self.opt_function == 'accuracy':
            return accuracy
        elif self.opt_function == 'recall':
            return recall_test
        elif self.opt_function == 'precision':
            return precision_test
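Because the objective is implemented as __call__, an instance of the class can be passed straight to study.optimize. A usage sketch; the class name ModelObjective and its constructor arguments are assumptions inferred from the attributes used above:

import optuna

objective = ModelObjective(  # hypothetical class name
    method="RF",
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    opt_function="balanced_accuracy",
)

study = optuna.create_study(direction="maximize")  # all returned scores are maximised
study.optimize(objective, n_trials=50)
print(study.best_params)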