# NOTE(review): `knobs` and `dropped` for this first call are assigned above
# this chunk (truncated paste) — confirm they exist before running.
Reviews3 = makeTrainingModel(ReviewsKey, target, knobs, drop=dropped)

# Retrain without the Score/ProfileName/Time/Description columns.
dropped = ['Score', 'ProfileName', 'Time', 'Description']
knobs = [6, 2, 7]  # [accuracy, time, interpretability] — pattern used by the experiment calls elsewhere in this file
Reviews4 = makeTrainingModel(ReviewsKey, target, knobs, drop=dropped)

# Same dial settings, different drop list (keeps Time/Description, drops Summary).
dropped = ['Summary', 'Score', 'ProfileName']
knobs = [6, 2, 7]
Reviews5 = makeTrainingModel(ReviewsKey, target, knobs, drop=dropped)

### Cannabis Time Series
##
## Observed runtimes: 4/2/6 dials, seed=212507590, 120 days, 0 gap: 7, 7, 7 minutes
## 4/2/6 dials, 28 days, 1 gap: 7 minutes

# Time-series regression experiment: predict Weekly_Sales per (Store, Dept)
# group, ordered by Date, forecasting one period ahead with no gap.
_ts_settings = dict(
    dataset_key=train_dai.key,
    testset_key=test_dai.key,
    target_col="Weekly_Sales",
    is_classification=False,
    cols_to_drop=["sample_weight"],
    accuracy=5,
    time=3,
    interpretability=1,
    scorer="RMSE",
    enable_gpus=True,
    seed=1234,
    time_col="Date",
    time_groups_columns=["Store", "Dept"],
    num_prediction_periods=1,
    num_gap_periods=0,
)
experiment = h2oai.start_experiment_sync(**_ts_settings)
# --- Example 2 (scraped snippet separator; original marker "示例#2", score 0) ---
# Resolve the reviews train/test split. Fetch the split-job entity ONCE
# instead of issuing the same remote request twice (original queried the
# job a second time just to read entity[1]).
_reviews_split = h2oai.get_dataset_split_job(reviews_split_data).entity
train_key = _reviews_split[0]
test_key = _reviews_split[1]

# Reviews Default

# Identifier / non-feature columns excluded from training.
dropped = [
    'UserID', 'ProductId', 'Id', 'Summary', 'Score', 'HelpfulnessDenominator',
    'HelpfulnessNumerator', 'ProfileName', 'Time'
]
# [accuracy, time, interpretability] dial settings.
knobs = [8, 2, 7]
# Baseline NLP classification experiment on the reviews training split.
reviews1 = h2oai.start_experiment_sync(
    experiment_name="Reviews NLP Big",
    dataset_key=train_key,
    testset_key=test_key,
    target_col=target,
    is_classification=True,
    accuracy=knobs[0],
    time=knobs[1],
    interpretability=knobs[2],
    enable_gpus=True,
    cols_to_drop=dropped,
)

# NOTE(review): this call is truncated by a bad paste — it ends with a
# dangling comma and no closing parenthesis, and the next line jumps into an
# unrelated function definition. Restore the missing argument(s) and ')'.
reviews1a = h2oai.start_experiment_sync(experiment_name="Reviews NLP Big TF",
                                        dataset_key=train_key,
                                        testset_key=test_key,
                                        target_col=target,
                                        is_classification=True,
                                        accuracy=knobs[0],
                                        time=knobs[1],
                                        interpretability=knobs[2],
                                        enable_gpus=True,
                                        cols_to_drop=dropped,
def test_debug_pyclient():
    """End-to-end smoke test for a custom Driverless AI transformer recipe.

    Connects to a DAI server, loads (or reuses) a dataset, uploads a custom
    transformer, runs an experiment restricted to that transformer, inspects
    the engineered features, transforms the data, and cleans up the
    downloaded artifacts.

    NOTE(review): relies on module-level imports of ``pd`` (pandas), ``sys``,
    ``os`` and ``shutil`` — confirm they exist at the top of this file.
    """
    import zipfile  # hoisted from mid-function for visibility

    from h2oai_client import Client

    pd.set_option('display.max_rows', 50)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    # Login info
    dai_url = "http://****:12345"
    dai_user = "******"
    dai_pwd = "****"

    # Data information
    data_file_name = "****.csv"
    y = "****"

    # Transformer recipe to upload and test
    transformer_file_name = "****.py"

    # All official (stock) transformers — excluded from the experiment so the
    # uploaded custom transformer is the only candidate.
    transformers_noncustom = [
        'CVCatNumEncode', 'CVTargetEncode', 'CatOriginalTransformer',
        'ClusterDistTransformer', 'ClusterIdTransformer',
        'ClusterTETransformer', 'DatesTransformer', 'EwmaLagsTransformer',
        'FrequentTransformer', 'InteractionsTransformer',
        'IsHolidayTransformer', 'LagsAggregatesTransformer',
        'LagsInteractionTransformer', 'LagsTransformer', 'LexiLabelEncoder',
        'NumCatTETransformer', 'NumToCatTETransformer',
        'NumToCatWoEMonotonicTransformer', 'NumToCatWoETransformer',
        'OneHotEncodingTransformer', 'OriginalTransformer',
        'SortedLETransformer', 'StrFeatureTransformer',
        'TextClustDistTransformer', 'TextClustTETransformer',
        'TextLinModelTransformer', 'TextTransformer',
        'TruncSVDNumTransformer', 'WeightOfEvidenceTransformer']

    # Any installed custom transformers you don't want to test.
    transformers_custom_nontesting = ['MyLogTransformer']

    all_nontest_transformers = transformers_noncustom + transformers_custom_nontesting

    # STEP ZERO: Connect to Driverless AI
    h2oai = Client(dai_url, dai_user, dai_pwd)

    # STEP ONE: Load data set (and related tasks)

    # view all data sets in DAI
    all_data_sets = h2oai.list_datasets(0, 100)
    all_data_sets = pd.DataFrame({
        'key': list(map(lambda x: x.key, all_data_sets)),
        'name': list(map(lambda x: x.name, all_data_sets))})

    print("PRE-LOADED DATASETS:")
    print(all_data_sets)

    # check if data was pre-loaded - if so use that data set - if not load data
    if data_file_name in all_data_sets['name'].values:
        print("\nData already loaded ", data_file_name)
        # BUGFIX: use positional .iloc[0] — the original plain [0] is a
        # label-based lookup and raises KeyError whenever the matching row
        # does not carry index label 0.
        data_key = all_data_sets.loc[
            all_data_sets["name"] == data_file_name, "key"].iloc[0]
        data_load_job = h2oai.get_dataset_job(data_key).entity
    else:
        print("\nLoading file ", data_file_name)
        data_load_job = h2oai.upload_dataset_sync(data_file_name)
        data_key = data_load_job.key

    # STEP TWO: Load custom transformer (and related tasks)
    # probably not good to just upload every time
    # no function to delete from python, only from ssh-ing in
    # rm tmp/contrib/transformers/[function]_randomletters_content.py
    print("\nUploading Transformer ", transformer_file_name)
    my_transformer = h2oai.upload_custom_recipe_sync(transformer_file_name)

    # returns true or false - exit if fails - check DAI UI for error message
    if my_transformer:
        print("\nTransformer uploaded successfully\n")
    else:
        print("\nTransformer upload failed, exiting program.\n")
        sys.exit()

    # STEP THREE: Run experiment (and related tasks)
    print("\nStarting Experiment\n")
    experiment = h2oai.start_experiment_sync(
        dataset_key=data_key,
        target_col=y,
        is_classification=True,
        accuracy=1,
        time=1,
        interpretability=10,
        scorer="F1",
        score_f_name=None,
        # Disable the feature brain and exclude every stock transformer so
        # the experiment must exercise the uploaded custom transformer.
        config_overrides="""
                                    feature_brain_level=0
                                    exclude_transformers={dont_use}
                                    """.format(dont_use=all_nontest_transformers)
    )

    # experiment = h2oai.get_model_job("lomotare").entity

    # STEP FOUR: Check the transformation was used

    # Download and unzip the experiment summary.
    summary_path = h2oai.download(src_path=experiment.summary_path, dest_dir=".")
    dir_path = "h2oai_experiment_summary_" + experiment.key
    with zipfile.ZipFile(summary_path, 'r') as z:
        z.extractall(dir_path)

    # View the engineered features (read_csv: read_table is deprecated).
    features = pd.read_csv(dir_path + "/features.txt", sep=',', skipinitialspace=True)
    print(features)

    # STEP FIVE: Transform data and ensure it looks as expected
    transform = h2oai.fit_transform_batch_sync(model_key=experiment.key,
                                               training_dataset_key=data_key,
                                               validation_dataset_key=None,
                                               test_dataset_key=None,
                                               validation_split_fraction=0.25,
                                               seed=1234,
                                               fold_column=None)

    # Download the training and validation transformed data
    transform_train_path = h2oai.download(src_path=transform.training_output_csv_path, dest_dir=".")
    transform_validate_path = h2oai.download(src_path=transform.validation_output_csv_path, dest_dir=".")

    transform_train = pd.read_csv(transform_train_path, sep=',', skipinitialspace=True)
    transform_validate = pd.read_csv(transform_validate_path, sep=',', skipinitialspace=True)

    print(transform_train.head())
    print(transform_validate.head())

    # STEP 1000: Clean up downloaded artifacts.
    os.remove(summary_path)
    os.remove(transform_train_path)
    os.remove(transform_validate_path)
    shutil.rmtree(dir_path)
# --- Example 4 (scraped snippet separator; original marker "示例#4", score 0) ---
import h2oai_client
from h2oai_client import Client

# Connect to the Driverless AI server.
h2oai = Client(
    address='http://129.213.63.69:12345',
    username='******',
    password='******',
)

# Register the training and test CSVs with DAI.
train = h2oai.create_dataset_sync('/train.csv')
test = h2oai.create_dataset_sync('/test.csv')

# Time-series classification experiment on LABEL, ordered by DATE,
# predicting one period ahead after a one-period gap.
experiment = h2oai.start_experiment_sync(
    dataset_key=train.key,
    testset_key=test.key,
    accuracy=10,
    time=10,
    interpretability=1,
    is_classification=True,
    target_col='LABEL',
    is_timeseries=True,
    time_col='DATE',
    num_gap_periods=1,
    num_prediction_periods=1,
)

# Report validation and test scores rounded to three decimals.
print(f"Final Model Score on Validation Data: {round(experiment.valid_score, 3)}")
print(f"Final Model Score on Test Data: {round(experiment.test_score, 3)}")
    , time_col = ""
    , ratio = ratio
    , seed = 1234
)
# NOTE(review): the four lines above are the orphaned tail of a dataset-split
# call — its opening line (e.g. `card_split_data = h2oai.make_dataset_split...`)
# is missing from this paste; restore it before running.

# Resolve the credit-card train/test split. Fetch the split-job entity ONCE
# instead of issuing the same remote request twice.
_card_split = h2oai.get_dataset_split_job(card_split_data).entity
train_key = _card_split[0]
test_key = _card_split[1]

# Card Default: [accuracy, time, interpretability] dial settings.
knobs = [6, 4, 6]
card_default = h2oai.start_experiment_sync(
    experiment_name="Card Default",
    dataset_key=train_key,
    testset_key=test_key,
    target_col=target,
    is_classification=True,
    accuracy=knobs[0],
    time=knobs[1],
    interpretability=knobs[2],
    enable_gpus=True,
    # NOTE(review): `dropped` comes from an earlier (truncated) snippet —
    # confirm it holds the intended column list for this dataset.
    cols_to_drop=dropped,
)

# Card Monotonic
knobs = [6, 4, 7]
# NOTE(review): this call is truncated by a bad paste — it stops after
# `accuracy = knobs[0]` with no closing parenthesis, and the lines that
# follow belong to a different, unrelated split call. Restore the missing
# arguments and ')'.
card_monotonic = h2oai.start_experiment_sync(
      experiment_name = "Card Monotonic"
    , dataset_key = train_key
    , testset_key = test_key
    , target_col = target
    , is_classification = True
    , accuracy = knobs[0]
# NOTE(review): orphaned tail of a separate dataset-split call (its opening
# line is missing from this paste).
                                             time_col="",
                                             ratio=ratio,
                                             seed=1234)

# Resolve the Boston housing train/test split with a single job lookup
# (the original issued the same remote request twice).
_boston_split = h2oai.get_dataset_split_job(boston_split_data).entity
train_key = _boston_split[0]
test_key = _boston_split[1]
dropped = []  # keep every column

# Housing Experiment #1: [accuracy, time, interpretability] dials.
knobs = [7, 2, 8]
housing1 = h2oai.start_experiment_sync(experiment_name="Housing",
                                       dataset_key=train_key,
                                       testset_key=test_key,
                                       target_col=target,
                                       is_classification=False,
                                       accuracy=knobs[0],
                                       time=knobs[1],
                                       interpretability=knobs[2],
                                       scorer='RMSE',
                                       enable_gpus=True,
                                       cols_to_drop=dropped)

# Housing Experiment #2
knobs = [4, 2, 8]
# NOTE(review): this call is truncated by a bad paste — it stops after
# `time=knobs[1],` and the lines that follow belong to a different,
# unrelated split call. Restore the missing arguments and ')'.
housing2 = h2oai.start_experiment_sync(experiment_name="Housing Quick",
                                       dataset_key=train_key,
                                       testset_key=test_key,
                                       target_col=target,
                                       is_classification=False,
                                       accuracy=knobs[0],
                                       time=knobs[1],
# NOTE(review): orphaned tail of a separate dataset-split call (its opening
# line is missing from this paste).
                                               fold_col="",
                                               time_col="",
                                               ratio=ratio,
                                               seed=1234)

# Resolve the diabetes train/test split with a single job lookup
# (the original issued the same remote request twice).
_diabetes_split = h2oai.get_dataset_split_job(diabetes_split_data).entity
train_key = _diabetes_split[0]
test_key = _diabetes_split[1]
dropped = []  # keep every column

# Diabetes Default: [accuracy, time, interpretability] dials.
knobs = [8, 2, 8]
diabetes1 = h2oai.start_experiment_sync(experiment_name="Diabetes",
                                        dataset_key=train_key,
                                        testset_key=test_key,
                                        target_col=target,
                                        is_classification=True,
                                        accuracy=knobs[0],
                                        time=knobs[1],
                                        interpretability=knobs[2],
                                        enable_gpus=True,
                                        cols_to_drop=dropped)

# Diabetes GLM
# NOTE(review): this call is truncated by a bad paste — it ends at
# `interpretability=knobs[2],` with no closing parenthesis; the remaining
# arguments were lost. Restore them and ')'.
diabetes2 = h2oai.start_experiment_sync(
    experiment_name="Diabetes GLM",
    dataset_key=train_key,
    testset_key=test_key,
    target_col=target,
    is_classification=True,
    accuracy=knobs[0],
    time=knobs[1],
    interpretability=knobs[2],
# --- Example 8 (scraped snippet separator; original marker "示例#8", score 0) ---
                                              ratio=ratio,
                                              seed=1234)
# NOTE(review): the two lines above are the orphaned tail of a dataset-split
# call from the previous scraped example; its opening line is missing from
# this paste.

# Resolve the Titanic train/test split with a single job lookup
# (the original issued the same remote request twice).
_titanic_split = h2oai.get_dataset_split_job(titanic_split_data).entity
train_key = _titanic_split[0]
test_key = _titanic_split[1]

# [accuracy, time, interpretability] dial settings.
knobs = [8, 2, 8]

# Titanic Default

# Columns excluded from training for the baseline run.
dropped = ['no.title', 'cabin', 'embarked', 'boat', 'body', 'home.dest']
titanic1 = h2oai.start_experiment_sync(experiment_name="Titanic",
                                       dataset_key=train_key,
                                       testset_key=test_key,
                                       target_col=target,
                                       is_classification=True,
                                       accuracy=knobs[0],
                                       time=knobs[1],
                                       interpretability=knobs[2],
                                       enable_gpus=True,
                                       cols_to_drop=dropped)

# Titanic No Name
dropped = [
    'name', 'no.title', 'cabin', 'embarked', 'boat', 'body', 'home.dest'
]
# NOTE(review): this call is truncated by a bad paste — it ends at
# `accuracy=knobs[0],` with the remaining arguments and ')' lost.
titanic2 = h2oai.start_experiment_sync(experiment_name="Titanic no Name",
                                       dataset_key=train_key,
                                       testset_key=test_key,
                                       target_col=target,
                                       is_classification=True,
                                       accuracy=knobs[0],