Пример #1
0
def test_automl_default_classification():
    """Check a default AutoMLClassifier fit/predict and a save/load round trip.

    For each OpenML dataset id, at most the first 2000 rows are used. The
    model is fitted on an 80/20 split; its ROC AUC on the held-out part
    (rounded to 4 decimals) must lie in (0.5, 1]. The model is then saved,
    loaded into a fresh instance, and must reproduce exactly the same
    rounded score.
    """
    row_limit = 2000
    saved_name = 'AutoML_model_1'

    for openml_id in (179, 4135):
        bunch = fetch_openml(data_id=openml_id, as_frame=True)
        # Encode the labels as integer category codes.
        bunch.target = bunch.target.astype('category').cat.codes

        # Cap the sample so the test stays reasonably quick.
        n_rows = min(len(bunch.data), row_limit)
        features = bunch.data[:n_rows]
        labels = bunch.target[:n_rows]
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            test_size=0.2,
            random_state=RANDOM_SEED,
        )

        fitted = AutoMLClassifier(random_state=RANDOM_SEED)
        fitted.fit(X_train, y_train, timeout=600)
        fitted_predictions = fitted.predict(X_test)
        auc_before = round(
            sklearn.metrics.roc_auc_score(y_test, fitted_predictions), 4)
        assert auc_before is not None
        assert 0.5 < auc_before <= 1

        # Round trip through disk: the restored model must score the same.
        fitted.save(saved_name, folder=TMP_FOLDER)
        restored = AutoMLClassifier(random_state=RANDOM_SEED)
        restored = restored.load(saved_name, folder=TMP_FOLDER)
        restored_predictions = restored.predict(X_test)
        auc_after = round(
            sklearn.metrics.roc_auc_score(y_test, restored_predictions), 4)
        assert auc_after is not None
        assert 0.5 < auc_after <= 1
        assert (auc_before - auc_after) == 0.
Пример #2
0
def test_automl_classifier_bench():
    """Benchmark AutoMLClassifier with stratified K-fold CV on OpenML datasets.

    For every dataset id the target is encoded as integer category codes and
    the data is split into ``CV`` stratified folds. A fresh model is fitted on
    each fold with a ``TIME_LIMIT`` timeout, seeded with the fold number.
    Per-fold AUC, log-loss, accuracy (predictions thresholded at 0.5) and
    elapsed whole minutes are collected, and the accumulated list is rewritten
    to ``./result/<data_id>_metrics.csv`` after every fold so partial results
    survive an interrupted run.
    """
    import os  # local import: the file's top-level imports are not visible here

    # DataFrame.to_csv does not create missing parent directories; make sure
    # the output folder exists up front instead of failing after a long fit.
    os.makedirs("./result", exist_ok=True)

    # Ids disabled in the original benchmark: 179, 4135, 1226, 31, 1067, 1046.
    for data_id in (1461, 1471, 151, 1489, 1494):
        dataset = fetch_openml(data_id=data_id, as_frame=True)
        dataset.target = dataset.target.astype("category").cat.codes

        logger.info("=" * 75)
        logger.info("LOAD DATASET")
        logger.info(f"Dataset: {data_id} {dataset.data.shape}")

        y = dataset.target
        X = dataset.data

        skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=42)

        metrics = []

        for count, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            logger.info(f"START FOLD {count}")
            # Each fold uses its own index as the seed.
            fold_seed = count
            np.random.seed(fold_seed)

            # Shuffle the column order for extra randomisation. X is rebound,
            # so the reordering accumulates across folds; rows are untouched,
            # which keeps the positional fold indices valid.
            shuffled_columns = list(X.columns.values)
            np.random.shuffle(shuffled_columns)
            X = X[shuffled_columns]

            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

            start_time = time.time()

            model = AutoMLClassifier(random_state=fold_seed)
            model.fit(X_train, y_train, timeout=TIME_LIMIT)

            predicts = model.predict(X_test)
            assert predicts is not None

            # Computed once and reused for both the log line and the CSV row.
            auc = round(roc_auc_score(y_test, predicts), 4)

            logger.info("*" * 75)
            logger.info(f"AUC: {auc}")

            # Per-sub-model predictions exposed as attributes on the fitted
            # model (models 3-5 were disabled in the original benchmark).
            for attr in ("predict_model_1", "predict_model_2"):
                sub_auc = round(roc_auc_score(y_test, getattr(model, attr)), 4)
                logger.info(f"{attr} AUC: {sub_auc}")
            logger.info("-" * 75)

            end_time = time.time()

            metrics.append({
                "AUC": auc,
                "log_loss": round(log_loss(y_test, predicts), 4),
                "Accuracy": round(accuracy_score(y_test, predicts > 0.5), 4),
                "Time_min": (end_time - start_time) // 60,
                "Time": datetime.datetime.now(),
            })

            # Rewrite the whole file each fold so partial results are kept.
            pd.DataFrame(metrics).to_csv(
                f"./result/{data_id}_metrics.csv",
                index=False,
            )
            # Drop the reference to the fitted model before the next fold
            # (presumably to let its memory be reclaimed).
            model = None