# Exemplo n.º 1
# 0
    return h2o_auc


if __name__ == '__main__':

    # CLI: dataset name is required, random seed is optional (default 2020).
    parser = argparse.ArgumentParser(description='Modeling h2o')
    parser.add_argument('--dataset',
                        '-d',
                        help="pass dataset name",
                        required=True)
    parser.add_argument('--seed', '-s', help='random seed', default=2020)

    args = parser.parse_args()

    data_name = str(args.dataset)
    seed = int(args.seed)

    # Start (or connect to) the local H2O cluster before any modeling.
    h2o.init()

    X_train, X_test, y_train, y_test = load_data(data_name)

    start_time = time.time()
    # BUG FIX: previously seed=2020 was hard-coded here, silently ignoring
    # the --seed CLI argument parsed above. Use the parsed seed instead.
    score = h2o_train(X_train, X_test, y_train, y_test, seed=seed)
    end_time = time.time()

    # Wall-clock training time in hours.
    hour = (end_time - start_time) / 3600.0

    # Append one TSV row per run: dataset name, AUC score, elapsed hours.
    with open('h2o_result.txt', 'a') as f:
        f.write(f"{data_name}\t{score}\t{hour}\n")
# Exemplo n.º 2
# 0
                                             batch_size=batch_size),
                           dim=1)
        logits = check_numpy(logits)
    return logits


if __name__ == '__main__':

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    hours = 2  # 2 hours for training
    res = []
    for data_name, d in data_config.items():

        # split dataset train/test = 0.7:0.3
        X_train, X_test, y_train, y_test = load_data(data_name,
                                                     combine_y=False,
                                                     split_seed=2020,
                                                     test_size=0.3)

        # general feature generator;
        feature_generator = AutoMLFeatureGenerator()

        print("#" * 50, 'training set preprocessing')
        X_train = feature_generator.fit_transform(X_train,
                                                  drop_duplicates=False)
        print("#" * 50, 'testing set preprocessing')
        X_test = feature_generator.transform(X_test)

        feature_types_metadata = feature_generator.feature_types_metadata

        problem_type = 'binary'
        path = f'LR-{data_name}'
"""
wide and deep test, follow code from autogluon
autogluon's NN architecture is based on wide and deep network
"""
from autogluon import TabularPrediction as task
from data_config.data_config import load_data, data_config

if __name__ == '__main__':
    # Collect per-dataset ROC-AUC results of autogluon's wide-and-deep fit.
    res = {}
    for data_name in data_config.keys():
        ylabel = data_config[data_name]['ylabel']

        # combine_y=True returns frames that include the label column,
        # which is what task.fit / predictor.evaluate expect.
        X_train, X_valid = load_data(data_name, combine_y=True)
        train_data = task.Dataset(df=X_train)
        test_data = task.Dataset(df=X_valid)
        savedir = f'{data_name}/'  # where to save trained models
        predictor = task.fit(
            train_data=train_data,
            label=ylabel,
            output_directory=savedir,
            eval_metric='roc_auc',
            verbosity=2,
            visualizer='tensorboard',
            random_seed=0,
            save_space=True,
            keep_only_best=True,
        )
        # BUG FIX: test_data was built but never used; evaluate was given the
        # raw X_valid frame. Pass the wrapped Dataset (same underlying frame,
        # so the computed AUC is unchanged) to match the declared intent.
        auc = predictor.evaluate(test_data)
        res[data_name] = auc

    print(res)
# Exemplo n.º 4
# 0
import pickle
import torch

from sklearn.model_selection import StratifiedKFold
from data_config.data_config import data_config, load_data
import numpy as np
from sklearn.metrics import roc_auc_score

if __name__ == '__main__':

    res = {}
    for data_name in data_config.keys():

        epoch = 100

        train, test, y_train, y_test = load_data(data_name, combine_y=False)

        types = train.dtypes
        #
        categorical_columns = []
        categorical_dims = {}

        features = list(train.columns)

        print(train.shape)

        for col in train.columns:
            if types[col] == 'object':
                train[col] = train[col].fillna("VV_likely")
                test[col] = test[col].fillna('VV_likely')
                d = train[col].unique()