Example No. 1
def get_project_key(con: h2o.Client, project_name: str) -> str:
    """
    Returns the key of the project with name matching project_name. If such a project does not exist, a new project is
    created and its key is returned.

    :param con: Client to H2O Driverless AI
    :param project_name: Name of the project
    :return: Key of the matching or newly created project
    """
    projects = con.list_projects(offset=0, limit=1000)
    project = next((x for x in projects if x.name == project_name), None)
    if project is None:
        key = con.create_project(project_name, project_name)
        return key
    return project.key
def connect():
    # Login info
    dai_url = "http://IPADDRESS:12345"
    dai_user = "******"
    dai_pwd = "Password"

    return Client(dai_url, dai_user, dai_pwd)
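
# Usage sketch (hedged): wire the two helpers above together. The project name
# is a placeholder, and Client is assumed to come from h2oai_client (the
# import is not shown in the original snippet).
from h2oai_client import Client  # needed by connect() above

con = connect()
project_key = get_project_key(con, "my_project")
print("Project key:", project_key)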
Example No. 3
import os  # needed for os.path.basename below


def upload_dataset_to_project(con: h2o.Client, project_key: str,
                              dataset_file: str, dataset_type: str):
    """
    Uploads the data at dataset_file to Driverless AI and links it to the project. If the project already has a
    dataset of the specified type with the same file name linked, it is not re-uploaded. Returns the key of the newly
    uploaded dataset, or, if the upload was skipped, the key of the existing dataset with the matching file name.

    :param con: Connection to H2O Driverless AI
    :param project_key: Key of the project to link the dataset to
    :param dataset_file: File path of the dataset to upload and link to project
    :param dataset_type: Either 'Training' or 'Testing'
    :return: dataset_key
    """
    file_name = os.path.basename(dataset_file)
    datasets = con.get_datasets_for_project(project_key, dataset_type)
    dataset = next((x for x in datasets if x.name == file_name), None)
    if dataset is None:
        dataset = con.upload_dataset_sync(file_path=dataset_file)
        con.link_dataset_to_project(project_key=project_key,
                                    dataset_key=dataset.key,
                                    dataset_type=dataset_type)
    return dataset.key
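
# Usage sketch (hedged): the project name, file path and dataset type below
# are placeholders; connect() and get_project_key() come from Example No. 1.
con = connect()
project_key = get_project_key(con, "my_project")
train_key = upload_dataset_to_project(con, project_key,
                                      "/data/train.csv", "Training")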
## Set up all of the training experiments

import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

address = 'http://52.90.67.220:12345'

username = '******'
password = '******'

h2oai = Client(address = address
               , username = username
               , password = password)

def splitTrainingData(dataPath, basename, target, ratio=0.8, time=''):
    data = h2oai.create_dataset_sync(dataPath)
    # Split the data
    split_data = h2oai.make_dataset_split(
        dataset_key=data.key
        , output_name1=basename + '_train'
        , output_name2=basename + '_test'
        , target=target
        , fold_col=''
        , time_col=time
        , ratio=ratio
    )
    # key[0] is train, key[1] is test
    key = h2oai.get_dataset_split_job(split_data).entity
    return key
def test_debug_pyclient():
    # Imports used throughout this test; pandas, sys, os and shutil were
    # missing from the original snippet.
    import os
    import shutil
    import sys
    import pandas as pd
    from h2oai_client import Client

    pd.set_option('display.max_rows', 50)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    # Login info
    dai_url = "http://****:12345"
    dai_user = "******"
    dai_pwd = "****"

    # Data Information
    data_file_name = "****.csv"
    y = "****"

    # Transformers information
    transformer_file_name = "****.py"

    transformers_noncustom = []
    transformers_custom_nontesting = []

    # All Official Transformers
    transformers_noncustom = ['CVCatNumEncode', 'CVTargetEncode'
        , 'CatOriginalTransformer', 'ClusterDistTransformer'
        , 'ClusterIdTransformer', 'ClusterTETransformer', 'DatesTransformer'
        , 'EwmaLagsTransformer', 'FrequentTransformer', 'InteractionsTransformer'
        , 'IsHolidayTransformer', 'LagsAggregatesTransformer', 'LagsInteractionTransformer'
        , 'LagsTransformer', 'LexiLabelEncoder', 'NumCatTETransformer', 'NumToCatTETransformer'
        , 'NumToCatWoEMonotonicTransformer', 'NumToCatWoETransformer', 'OneHotEncodingTransformer'
        , 'OriginalTransformer', 'SortedLETransformer', 'StrFeatureTransformer', 'TextClustDistTransformer'
        , 'TextClustTETransformer', 'TextLinModelTransformer', 'TextTransformer', 'TruncSVDNumTransformer'
        , 'WeightOfEvidenceTransformer']

    # Any Installed Custom Transformers you don't want to test
    transformers_custom_nontesting = ['MyLogTransformer']

    all_nontest_transformers = transformers_noncustom + transformers_custom_nontesting

    # STEP ZERO: Connect to Driverless AI
    h2oai = Client(dai_url, dai_user, dai_pwd)

    # STEP ONE: Load data set (and related tasks)

    # view all data sets in DAI
    all_data_sets = h2oai.list_datasets(0, 100)
    all_data_sets = pd.DataFrame({
        'key': list(map(lambda x: x.key, all_data_sets))
        , 'name': list(map(lambda x: x.name, all_data_sets))})

    print("PRE-LOADED DATASETS:")
    print(all_data_sets)

    # check if data was pre-loaded - if so use that data set - if not load data
    if data_file_name in all_data_sets['name'].values:
        print("\nData already loaded ", data_file_name)
        data_key = all_data_sets.loc[all_data_sets["name"] == data_file_name, "key"].values[0]
        data_load_job = h2oai.get_dataset_job(data_key).entity
    else:
        print("\nLoading file ", data_file_name)
        data_load_job = h2oai.upload_dataset_sync(data_file_name)
        data_key = data_load_job.key

    # STEP TWO: Load custom transformer (and related tasks)
    # Note: re-uploading on every run is wasteful, but there is no Python API
    # to delete a recipe; removal requires SSH-ing into the server and running:
    # rm tmp/contrib/transformers/[function]_randomletters_content.py

    print("\nUploading Transformer ", transformer_file_name)
    my_transformer = h2oai.upload_custom_recipe_sync(transformer_file_name)

    # returns true or false - exit if fails - check DAI UI for error message
    if my_transformer:
        print("\nTransformer uploaded successfully\n")
    else:
        print("\nTransformer uploaded failed, exiting program.\n")
        sys.exit()

    # STEP THREE: Run experiment (and related tasks)
    print("\nStarting Experiment\n")
    experiment = h2oai.start_experiment_sync(
        dataset_key=data_key
        , target_col=y
        , is_classification=True
        , accuracy=1
        , time=1
        , interpretability=10
        , scorer="F1"
        , score_f_name=None
        , config_overrides="""
                                    feature_brain_level=0
                                    exclude_transformers={dont_use}
                                    """.format(dont_use=all_nontest_transformers)
    )

    # experiment = h2oai.get_model_job("lomotare").entity

    # STEP FOUR: Check the transformation was used

    # Download Summary
    summary_path = h2oai.download(src_path=experiment.summary_path, dest_dir=".")
    dir_path = "h2oai_experiment_summary_" + experiment.key
    import zipfile
    with zipfile.ZipFile(summary_path, 'r') as z:
        z.extractall(dir_path)

    # View Features
    features = pd.read_table(dir_path + "/features.txt", sep=',', skipinitialspace=True)
    print(features)

    # STEP FIVE: Transform data and ensure it looks as expected
    transform = h2oai.fit_transform_batch_sync(model_key=experiment.key
                                               , training_dataset_key=data_key
                                               , validation_dataset_key=None
                                               , test_dataset_key=None
                                               , validation_split_fraction=0.25
                                               , seed=1234
                                               , fold_column=None)

    # Download the training and validation transformed data
    transform_train_path = h2oai.download(src_path=transform.training_output_csv_path, dest_dir=".")
    transform_validate_path = h2oai.download(src_path=transform.validation_output_csv_path, dest_dir=".")

    transform_train = pd.read_table(transform_train_path, sep=',', skipinitialspace=True)
    transform_validate = pd.read_table(transform_validate_path, sep=',', skipinitialspace=True)

    print(transform_train.head())
    print(transform_validate.head())

    # STEP 1000: Clean up
    os.remove(summary_path)
    os.remove(transform_train_path)
    os.remove(transform_validate_path)
    shutil.rmtree(dir_path)
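
# Hypothetical entry point (an assumption; the original snippet defines the
# test but never invokes it):
if __name__ == "__main__":
    test_debug_pyclient()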
Example No. 6
# TODO: rewrite the already-uploaded-data check to account for the numpy type-mismatch warning
import warnings

import pandas as pd  # needed for the pd.set_option calls below
from h2oai_client import Client  # needed for the Client constructor below

warnings.simplefilter(action='ignore', category=FutureWarning)

# Print and Debug Nicely
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# The following are parameters that need to be set to run these functions
# TODO: redo this in a nicer way

# Connect to Driverless AI
h2oai = Client('', '', '')

# Data Information
data_file_name = ""
data_file_location = "" + data_file_name
y = ""

# Transformers Information
transformer_name = ""
transformer_file_name = ""
transformer_file_location = "" + transformer_file_name

# Location to Download Files
download_file_location = ""

Example No. 7
import h2oai_client
from h2oai_client import Client

h2oai = Client(address='http://129.213.63.69:12345',
               username='******',
               password='******')

train = h2oai.create_dataset_sync('/train.csv')
test = h2oai.create_dataset_sync('/test.csv')

experiment = h2oai.start_experiment_sync(dataset_key=train.key,
                                         testset_key=test.key,
                                         accuracy=10,
                                         time=10,
                                         interpretability=1,
                                         is_classification=True,
                                         target_col='LABEL',
                                         is_timeseries=True,
                                         time_col='DATE',
                                         num_gap_periods=1,
                                         num_prediction_periods=1)

print("Final Model Score on Validation Data: " +
      str(round(experiment.valid_score, 3)))
print("Final Model Score on Test Data: " +
      str(round(experiment.test_score, 3)))
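
# Optional follow-up sketch: download the experiment summary for inspection,
# reusing the h2oai.download(...) pattern from the debug example above.
summary_path = h2oai.download(src_path=experiment.summary_path, dest_dir=".")
print("Summary written to: " + summary_path)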
Example No. 8
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'

h2oai = Client(address=address, username=username, password=password)

### Amazon Reviews

dataPath = '/data/Training/AmazonFineFoodReviews.csv'
basename = 'Reviews'
target = 'PositiveReview'
ratio = 0.8

reviews_data = h2oai.create_dataset_sync(dataPath)

# Split the data
reviews_split_data = h2oai.make_dataset_split(dataset_key=reviews_data.key,
                                              output_name1=basename + "_train",
                                              output_name2=basename + "_test",
                                              target=target,
                                              fold_col="",
                                              time_col="",
                                              ratio=ratio,
                                              seed=1234)
Example No. 9
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'

h2oai = Client(address = address
               , username = username
               , password = password)

dataPath = '/data/Training/CreditCard.csv'
basename = 'Card'
target = 'Default'
ratio = 0.8
dropped = []

card_data = h2oai.create_dataset_sync(dataPath)

# Split the data
card_split_data = h2oai.make_dataset_split(
    dataset_key = card_data.key
    , output_name1 = basename + "_train"
    , output_name2 = basename + "_test"
    , target = target
    , fold_col = ""
    , time_col = ""
    , ratio = ratio
    , seed = 1234
)
Example No. 10
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'

h2oai = Client(address=address, username=username, password=password)

dataPath = '/data/Training/BostonHousing.csv'
basename = 'Housing'
target = 'VALUE'
ratio = 0.8

boston_data = h2oai.create_dataset_sync(dataPath)

# Split the data
boston_split_data = h2oai.make_dataset_split(dataset_key=boston_data.key,
                                             output_name1=basename + "_train",
                                             output_name2=basename + "_test",
                                             target=target,
                                             fold_col="",
                                             time_col="",
                                             ratio=ratio,
                                             seed=1234)
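
# Follow-up sketch: fetch the resulting train/test dataset keys from the
# split job, reusing get_dataset_split_job(...) as in Example No. 12.
# key[0] is train, key[1] is test
boston_keys = h2oai.get_dataset_split_job(boston_split_data).entity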
Example No. 11
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'

h2oai = Client(address=address, username=username, password=password)

### Diabetes Models
dataPath = '/data/Training/PimaDiabetes.csv'
basename = 'Diabetes'
target = 'Outcome'
ratio = 0.8

diabetes_data = h2oai.create_dataset_sync(dataPath)

# Split the data
diabetes_split_data = h2oai.make_dataset_split(dataset_key=diabetes_data.key,
                                               output_name1=basename + "_train",
                                               output_name2=basename + "_test",
                                               target=target,
                                               fold_col="",
                                               time_col="",
                                               ratio=ratio,
                                               seed=1234)
Example No. 12
import h2oai_client
import numpy as np
import pandas as pd
# import h2o
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

address = 'http://18.234.58.12:12345'

username = '******'
password = '******'

h2oai = Client(address=address, username=username, password=password)


def splitTrainingData(dataPath, basename, target, ratio=0.8, time=''):
    data = h2oai.create_dataset_sync(dataPath)
    # Split the data
    split_data = h2oai.make_dataset_split(dataset_key=data.key,
                                          output_name1=basename + '_train',
                                          output_name2=basename + '_test',
                                          target=target,
                                          fold_col='',
                                          time_col=time,
                                          ratio=ratio)
    # key[0] is train, key[1] is test
    key = h2oai.get_dataset_split_job(split_data).entity
    return key
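
# Usage sketch (hedged): the path, basename and target column below are
# placeholders, not from the original snippet.
keys = splitTrainingData('/data/Training/MyData.csv', 'MyData', 'target_column')
train_key, test_key = keys[0], keys[1]  # key[0] is train, key[1] is test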

Example No. 13
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'

h2oai = Client(address=address, username=username, password=password)

### Titanic Models
dataPath = '/data/Training/Titanic.csv'
basename = 'Titanic'
target = 'survived'
ratio = 0.8

titanic_data = h2oai.create_dataset_sync(dataPath)

# Split the data
titanic_split_data = h2oai.make_dataset_split(dataset_key=titanic_data.key,
                                              output_name1=basename + "_train",
                                              output_name2=basename + "_test",
                                              target=target,
                                              fold_col="",
                                              time_col="",
                                              ratio=ratio,
                                              seed=1234)
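
# Follow-up sketch: retrieve the split keys and launch an experiment on the
# training portion (key[0] is train, per the comment in Example No. 12). The
# accuracy/time/interpretability settings and scorer are placeholder
# assumptions; the calls themselves mirror the earlier examples.
titanic_keys = h2oai.get_dataset_split_job(titanic_split_data).entity
experiment = h2oai.start_experiment_sync(dataset_key=titanic_keys[0],
                                         target_col=target,
                                         is_classification=True,
                                         accuracy=5,
                                         time=5,
                                         interpretability=5,
                                         scorer="AUC")
print("Final Model Score on Validation Data: " +
      str(round(experiment.valid_score, 3)))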