Example #1
def deepwater_custom_lenet():
    if not H2ODeepWaterEstimator.available(): return

    frame = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    print(frame.head(5))
    nclasses = frame[1].nlevels()[0]

    print(
        "Creating the model architecture from scratch using the MXNet Python API"
    )
    lenet(nclasses).save("/tmp/symbol_lenet-py.json")

    print("Importing the model architecture for training in H2O")
    model = H2ODeepWaterEstimator(
        epochs=50,
        learning_rate=1e-3,
        mini_batch_size=32,
        network='user',
        network_definition_file="/tmp/symbol_lenet-py.json",
        image_shape=[28, 28],
        channels=1,
        score_interval=0,
        train_samples_per_iteration=1000,
        gpu=False)
    model.train(x=[0], y=1, training_frame=frame)
    model.show()
    error = model.model_performance(train=True).mean_per_class_error()
    assert error < 0.1, "mean classification error is too high : " + str(error)
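# Hedged sketch: lenet(nclasses) is not defined anywhere on this page. Assuming
# it mirrors the classic MXNet LeNet tutorial -- which is what the print
# statement above suggests -- a minimal version could look like this; the layer
# sizes are illustrative, not taken from the original test:
import mxnet as mx

def lenet(nclasses):
    data = mx.symbol.Variable('data')
    # conv/pool block 1
    conv1 = mx.symbol.Convolution(data=data, kernel=(5, 5), num_filter=20)
    tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
    pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # conv/pool block 2
    conv2 = mx.symbol.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
    tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
    pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # fully connected head ending in a softmax over nclasses
    flatten = mx.symbol.Flatten(data=pool2)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
    tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
    fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=nclasses)
    return mx.symbol.SoftmaxOutput(data=fc2, name='softmax')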
Example #2
def deepwater_custom_regression():
    if not H2ODeepWaterEstimator.available(): return

    train = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv"))

    response = 'loan_amnt'
    predictors = list(
        set(train.names) - set([
            response, 'id', 'emp_title', 'title', 'desc', 'revol_util',
            'zip_code'
        ]))  ## remove high-cardinality columns

    print(
        "Creating the model architecture from scratch using the MXNet Python API"
    )
    net().save("/tmp/symbol-py.json")

    print("Importing the model architecture for training in H2O")
    model = H2ODeepWaterEstimator(epochs=100,
                                  learning_rate=1e-4,
                                  mini_batch_size=64,
                                  hidden=[1],
                                  activation="tanh",
                                  nfolds=3)  # nfolds belongs on the estimator, not on train()
    #network='user', network_definition_file="/tmp/symbol-py.json")

    model.train(x=predictors, y=response, training_frame=train)
    model.show()
    error = model.model_performance(xval=True).rmse()
    assert error < 10, "mean xval rmse is too high : " + str(error)
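# Hedged sketch: net() is not shown on this page either. For a single-output
# regression like loan_amnt, any MXNet symbol ending in LinearRegressionOutput
# fits the way it is used above; the sizes and names below are illustrative:
import mxnet as mx

def net():
    data = mx.symbol.Variable('data')
    fc1 = mx.symbol.FullyConnected(data=data, num_hidden=64)
    act1 = mx.symbol.Activation(data=fc1, act_type="relu")
    fc2 = mx.symbol.FullyConnected(data=act1, num_hidden=1)  # single regression output
    return mx.symbol.LinearRegressionOutput(data=fc2, name='lin_reg')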
Example #3
def deepwater_checkpoint():
  if not H2ODeepWaterEstimator.available(): return

  ## build a model
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame = frame.drop(0)           ## drop the ID column (H2OFrame.drop is not in-place)
  frame[0] = frame[0].asfactor()  ## response: CAPSULE, now column 0
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-5, stopping_rounds=0, score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0)
  model.train(y=0, training_frame=frame)

  ## save the model
  model_path = h2o.save_model(model)

  ## delete everything - simulate cluster shutdown and restart
  h2o.remove_all()

  ## reimport the model and the frame
  model = h2o.load_model(model_path)
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame = frame.drop(0)
  frame[0] = frame[0].asfactor()
  
  ## delete the checkpoint file
  os.remove(model_path)

  ## continue training
  model2 = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-5, stopping_rounds=0,score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0, checkpoint=model.model_id)
  model2.train(y=0, training_frame=frame)
  model2.show()
Example #4
def deepwater_inception_resnet_v2():
    if not H2ODeepWaterEstimator.available():
        return

    frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    print(frame.head(5))
    nclasses = frame[1].nlevels()[0]

    print("Creating the model architecture from scratch using the MXNet Python API")
    get_symbol(nclasses).save("/tmp/symbol_inception_resnet_v2-py.json")

    print("Importing the model architecture for training in H2O")
    model = H2ODeepWaterEstimator(
        epochs=50,  # learning_rate=1e-3, learning_rate_annealing=1e-5,
        mini_batch_size=16,
        ## provide network specific information
        network="user",
        network_definition_file="/tmp/symbol_inception_resnet_v2-py.json",
        image_shape=[299, 299],
        channels=3,
    )

    model.train(x=[0], y=1, training_frame=frame)
    model.show()
    error = model.model_performance(train=True).mean_per_class_error()
    assert error < 0.1, "mean classification error is too high : " + str(error)
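# Hedged stand-in: the real get_symbol() builds the full Inception-ResNet-v2
# graph, far too long to reproduce here. Structurally it only has to return an
# MXNet symbol ending in a softmax over nclasses, so a trivial stub that
# exercises the same save/import path would be:
import mxnet as mx

def get_symbol(nclasses):
    data = mx.symbol.Variable('data')
    flat = mx.symbol.Flatten(data=data)
    fc = mx.symbol.FullyConnected(data=flat, num_hidden=nclasses)
    return mx.symbol.SoftmaxOutput(data=fc, name='softmax')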
Example #5
def deepwater_custom_lenet_mnist():
  if not H2ODeepWaterEstimator.available(): return

  train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
  test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

  predictors = list(range(0,784))
  resp = 784
  train[resp] = train[resp].asfactor()
  test[resp] = test[resp].asfactor()
  nclasses = train[resp].nlevels()[0]

  print("Creating the lenet model architecture from scratch using the MXNet Python API")
  lenet(nclasses).save("/tmp/symbol_lenet-py.json")

  print("Importing the lenet model architecture for training in H2O")
  model = H2ODeepWaterEstimator(epochs=10,
                                learning_rate=0.05, 
                                learning_rate_annealing=1e-5, 
                                momentum_start=0.9,
                                momentum_stable=0.9,
                                mini_batch_size=128,
                                train_samples_per_iteration=0,
                                score_duty_cycle=0,
                                stopping_rounds=0,
                                ignore_const_cols=False,
                                network_definition_file="/tmp/symbol_lenet-py.json",
                                image_shape=[28,28],
                                channels=1)

  model.train(x=predictors, y=resp, training_frame=train, validation_frame=test)
  model.show()
  print(model.model_performance(valid=True))
  error = model.model_performance(test).mean_per_class_error()
  assert error < 0.1, "mean classification error on validation set is too high : " + str(error)
Example #6
def deepwater_multi():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-3)
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  assert error < 0.1, "mean classification error is too high : " + str(error)
Example #7
def deepwater_lenet():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, network='lenet', score_interval=0, train_samples_per_iteration=1000)
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  assert error < 0.1, "mean classification error is too high : " + str(error)
Example #8
def deepwater_regression():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame[1] = frame[1].asnumeric()
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-3)
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mae()
  assert error < 0.3, "mean absolute error is too high : " + str(error)
Example #9
def deepwater_lenet():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, network='lenet', score_interval=0, train_samples_per_iteration=1000)
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  h2o.remove_all()
  assert error < 0.1, "mean classification error is too high : " + str(error)
Example #10
def deepwater_inception_bn_feature_extraction():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  nclasses = frame[1].nlevels()[0]

  print("Downloading the model")
  with open("model.json", "wb") as file:
    response = get("https://raw.githubusercontent.com/h2oai/deepwater/master/mxnet/src/main/resources/deepwater/backends/mxnet/models/Inception/Inception_BN-symbol.json")
    file.write(response.content)
  with open("model.params", "wb") as file:
    response = get("https://raw.githubusercontent.com/h2oai/deepwater/master/mxnet/src/main/resources/deepwater/backends/mxnet/models/Inception/Inception_BN-0039.params")
    file.write(response.content)
  with open("mean_224.nd", "wb") as file:
    response = get("https://raw.githubusercontent.com/h2oai/deepwater/master/mxnet/src/main/resources/deepwater/backends/mxnet/models/Inception/mean_224.nd")
    file.write(response.content)

  print("Importing the model architecture for training in H2O")
  model = H2ODeepWaterEstimator(epochs=0, ## no training - just load the state - NOTE: training for this 3-class problem wouldn't work since the model has 1k classes
                                mini_batch_size=32, ## mini-batch size is used for scoring
                                ## all parameters below are needed
                                network='user', 
                                network_definition_file=os.getcwd() + "/model.json", 
                                network_parameters_file=os.getcwd() + "/model.params", 
                                mean_image_file=os.getcwd() + "/mean_224.nd",
                                image_shape=[224,224],
                                channels=3
  )
  model.train(x=[0],y=1, training_frame=frame) ## must call train() to initialize the model, but it isn't training

  ## Extract deep features from final layer before going into Softmax.
  extracted_features = model.deepfeatures(frame, "global_pool_output")
  extracted_features2 = model.deepfeatures(frame, "conv_5b_double_3x3_1_output")

  ## Clean up the downloaded files (before the assertions below)
  os.remove("model.json")
  os.remove("model.params")
  os.remove("mean_224.nd")

  print(extracted_features.ncol)
  assert extracted_features.ncol == 1024

  print(extracted_features2.ncol)
  assert extracted_features2.ncol == 10976

  ## Find the squared cosine similarity between the first 10 images and the rest
  df = extracted_features[:10,:].distance(extracted_features[10:,:], "cosine_sq")
  print(df)
  assert df.shape[0] == 257
  assert df.shape[1] == 10
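  # Hedged follow-up: one way to use the similarity frame is to pull it into
  # pandas and, per reference image (column), find the closest of the other 257
  # images; the comment above calls cosine_sq a similarity, so larger = more alike.
  sims = df.as_data_frame()      # 257 rows x 10 columns; requires pandas
  print(sims.idxmax(axis=0))     # row index of the best match per reference image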
Example #11
def deepwater_tweets():
  if not H2ODeepWaterEstimator.available(): return

  tweets = h2o.import_file(pyunit_utils.locate("/home/arno/tweets.txt"), col_names=["text"], sep="|")
  labels = h2o.import_file(pyunit_utils.locate("/home/arno/labels.txt"), col_names=["label"])
  frame = tweets.cbind(labels)
  print(frame.head(5))

#  cnn = make_text_cnn(sentence_size=100, num_embed=300, batch_size=32,
#            vocab_size=100000, dropout=dropout, with_embedding=with_embedding)
  model = H2ODeepWaterEstimator(epochs=50000, learning_rate=1e-3, hidden=[100,100,100,100,100])
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  assert error < 0.1, "mean classification error is too high : " + str(error)
Example #12
def deepwater_checkpoint():
    if not H2ODeepWaterEstimator.available(): return

    ## build a model
    #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame = frame.drop(0)           ## drop the ID column (H2OFrame.drop is not in-place)
    frame[0] = frame[0].asfactor()  ## response: CAPSULE, now column 0
    print(frame.head(5))
    model = H2ODeepWaterEstimator(epochs=50,
                                  learning_rate=1e-5,
                                  stopping_rounds=0,
                                  score_duty_cycle=1,
                                  train_samples_per_iteration=-1,
                                  score_interval=0)
    model.train(y=0, training_frame=frame)

    ## save the model
    model_path = h2o.save_model(model)

    ## delete everything - simulate cluster shutdown and restart
    h2o.remove_all()

    ## reimport the model and the frame
    model = h2o.load_model(model_path)
    #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame = frame.drop(0)
    frame[0] = frame[0].asfactor()

    ## delete the checkpoint file
    os.remove(model_path)

    ## continue training
    model2 = H2ODeepWaterEstimator(epochs=100,
                                   learning_rate=1e-5,
                                   stopping_rounds=0,
                                   score_duty_cycle=1,
                                   train_samples_per_iteration=-1,
                                   score_interval=0,
                                   checkpoint=model.model_id)
    model2.train(y=0, training_frame=frame)
    model2.show()
Example #13
def deepwater_custom_alexnet():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  nclasses = frame[1].nlevels()[0]

  print("Creating the Alexnet model architecture from scratch using the MXNet Python API")
  alexnet(nclasses).save("/tmp/symbol_alexnet-py.json")

  print("Importing the Alexnet model architecture for training in H2O")
  model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-2, mini_batch_size=128, learning_rate_annealing=1e-5,
                                network='user', network_definition_file="/tmp/symbol_alexnet-py.json", image_shape=[224,224], channels=3,
                                score_interval=0, train_samples_per_iteration=1000,
                                gpu=True)
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  assert error < 0.1, "mean classification error is too high : " + str(error)
Example #14
def deepwater_lenet():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, network='lenet', score_interval=0, train_samples_per_iteration=1000)
  model.train(x=[0],y=1, training_frame=frame)

  extracted = model.deepfeatures(frame, "pooling1_output")
  #print(extracted.describe())
  print(extracted.ncols)
  assert extracted.ncols == 800, "extracted frame doesn't have 800 columns"

  extracted = model.deepfeatures(frame, "activation2_output")
  #print(extracted.describe())
  print(extracted.ncols)
  assert extracted.ncols == 500, "extracted frame doesn't have 500 columns"

  h2o.remove_all()
Example #15
def deepwater_inception_resnet_v2():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  nclasses = frame[1].nlevels()[0]

  print("Creating the model architecture from scratch using the MXNet Python API")
  get_symbol(nclasses).save("/tmp/symbol_inception_resnet_v2-py.json")

  print("Importing the model architecture for training in H2O")
  model = H2ODeepWaterEstimator(epochs=50, #learning_rate=1e-3, learning_rate_annealing=1e-5,
                                mini_batch_size=16,
                                ## provide network specific information
                                network='user', network_definition_file="/tmp/symbol_inception_resnet_v2-py.json", image_shape=[299,299], channels=3)

  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  assert error < 0.1, "mean classification error is too high : " + str(error) 
Example #16
def deepwater_custom_lenet_mnist():
    if not H2ODeepWaterEstimator.available(): return

    train = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = list(range(0, 784))
    resp = 784
    train[resp] = train[resp].asfactor()
    test[resp] = test[resp].asfactor()
    nclasses = train[resp].nlevels()[0]

    print(
        "Creating the lenet model architecture from scratch using the MXNet Python API"
    )
    lenet(nclasses).save("/tmp/symbol_lenet-py.json")

    print("Importing the lenet model architecture for training in H2O")
    model = H2ODeepWaterEstimator(
        epochs=10,
        learning_rate=0.05,
        learning_rate_annealing=1e-5,
        momentum_start=0.9,
        momentum_stable=0.9,
        mini_batch_size=128,
        train_samples_per_iteration=0,
        score_duty_cycle=0,
        stopping_rounds=0,
        ignore_const_cols=False,
        network_definition_file="/tmp/symbol_lenet-py.json",
        image_shape=[28, 28],
        channels=1)

    model.train(x=predictors, y=resp, training_frame=train, validation_frame=test)
    model.show()
    print(model.model_performance(valid=True))
    error = model.model_performance(test).mean_per_class_error()
    assert error < 0.1, "mean classification error on validation set is too high : " + str(
        error)
Example #17
def deepwater_custom_regression():
  if not H2ODeepWaterEstimator.available(): return

  train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv"))

  train = train[:39786,:] ## only the first 39786 records are correct; the rest are from a different dataset

  response = 'loan_amnt'
  predictors = list(set(train.names) - set([response, 'id','emp_title','title','desc','revol_util','zip_code'])) ## remove high-cardinality columns

  print("Creating the model architecture from scratch using the MXNet Python API")
  PATH = "/tmp/symbol-py.json"
  net().save(PATH)

  print("Importing the model architecture for training in H2O")
  model = H2ODeepWaterEstimator(epochs=20, nfolds=3, network_definition_file=PATH)

  model.train(x=predictors, y=response, training_frame=train)
  model.show()
  error = model.model_performance(xval=True).rmse()
  assert error < 2000, "mean xval rmse is too high : " + str(error)
Example #18
def deepwater_custom_cnn_mnist():
    if not H2ODeepWaterEstimator.available(): return

    train = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = list(range(0, 784))
    resp = 784
    train[resp] = train[resp].asfactor()
    test[resp] = test[resp].asfactor()
    nclasses = train[resp].nlevels()[0]

    print(
        "Creating the cnn model architecture from scratch using the MXNet Python API"
    )
    cnn(nclasses).save("/tmp/symbol_custom-py.json")

    print("Importing the cnn model architecture for training in H2O")
    model = H2ODeepWaterEstimator(
        epochs=100,
        learning_rate=1e-3,
        mini_batch_size=64,
        network='user',
        network_definition_file="/tmp/symbol_custom-py.json",
        image_shape=[28, 28],
        channels=1)

    model.train(x=predictors,
                y=resp,
                training_frame=train,
                validation_frame=test)
    model.show()
    error = model.model_performance(valid=True).mean_per_class_error()
    assert error < 0.1, "mean classification error on validation set is too high : " + str(
        error)
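# Hedged sketch: cnn(nclasses) is another helper that is not shown. Given the
# 28x28x1 image_shape above, a small single-conv-block network is a plausible
# shape for it; the filter counts and hidden size are illustrative:
import mxnet as mx

def cnn(nclasses):
    data = mx.symbol.Variable('data')
    conv1 = mx.symbol.Convolution(data=data, kernel=(3, 3), num_filter=32)
    relu1 = mx.symbol.Activation(data=conv1, act_type="relu")
    pool1 = mx.symbol.Pooling(data=relu1, pool_type="max", kernel=(2, 2), stride=(2, 2))
    flat = mx.symbol.Flatten(data=pool1)
    fc1 = mx.symbol.FullyConnected(data=flat, num_hidden=128)
    relu2 = mx.symbol.Activation(data=fc1, act_type="relu")
    fc2 = mx.symbol.FullyConnected(data=relu2, num_hidden=nclasses)
    return mx.symbol.SoftmaxOutput(data=fc2, name='softmax')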
Example #19
def cv_airlines():
    if not H2ODeepWaterEstimator.available(): return

    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"))
    predictors = [
        "Year", "Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "CRSArrTime",
        "UniqueCarrier", "FlightNum"
    ]
    response_col = "IsDepDelayed"

    dl = H2ODeepWaterEstimator(  # cross-validation
        nfolds=3,
        # network (fully-connected)
        hidden=[200, 200],
        activation="Rectifier",
        # regularization
        hidden_dropout_ratios=[0.1, 0.1],
        input_dropout_ratio=0.0,
        # learning rate
        learning_rate=5e-3,
        learning_rate_annealing=1e-6,
        # momentum
        momentum_start=0.9,
        momentum_stable=0.99,
        momentum_ramp=1e7,
        # early stopping
        epochs=100,
        stopping_rounds=4,
        train_samples_per_iteration=30000,
        # score often for early stopping
        mini_batch_size=32,
        score_duty_cycle=0.25,
        score_interval=1)

    dl.train(x=predictors, y=response_col, training_frame=df)
    dl.show()
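    # Hedged follow-up: with nfolds=3 set above, the cross-validated metrics can
    # be pulled off the trained model; IsDepDelayed is binomial, so AUC is defined:
    perf = dl.model_performance(xval=True)
    print(perf.auc())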
Example #20
def deepwater_demo():
  if not H2ODeepWaterEstimator.available(): return

  # Training data
  train_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_model.csv"))
  train_data = train_data.drop('Site')
  train_data['Angaus'] = train_data['Angaus'].asfactor()
  print(train_data.describe())
  train_data.head()

  # Testing data
  test_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_eval.csv"))
  test_data['Angaus'] = test_data['Angaus'].asfactor()
  print(test_data.describe())
  test_data.head()

  # Run DeepWater (ideally, use a GPU - this would be slow on CPUs)

  dl = H2ODeepWaterEstimator(epochs=50, hidden=[4096,4096,4096], hidden_dropout_ratios=[0.2,0.2,0.2])
  dl.train(x=list(range(1,train_data.ncol)),
           y="Angaus",
           training_frame=train_data,
           validation_frame=test_data)
  dl.show()
Example #21
def cv_airlines():
  if not H2ODeepWaterEstimator.available(): return

  df =  h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"))
  predictors = ["Year","Month","DayofMonth","DayOfWeek","CRSDepTime","CRSArrTime","UniqueCarrier","FlightNum"]
  response_col = "IsDepDelayed"

  dl = H2ODeepWaterEstimator(# cross-validation
                             nfolds=3,
                             # network (fully-connected)
                             hidden=[200,200], activation="Rectifier",
                             # regularization
                             hidden_dropout_ratios=[0.1,0.1], input_dropout_ratio=0.0,
                             # learning rate
                             learning_rate=5e-3, learning_rate_annealing=1e-6,
                             # momentum
                             momentum_start=0.9, momentum_stable=0.99, momentum_ramp=1e7,
                             # early stopping
                             epochs=100, stopping_rounds=4, train_samples_per_iteration=30000,
                             # score often for early stopping
                             mini_batch_size=32, score_duty_cycle=0.25, score_interval=1)

  dl.train(x=predictors, y=response_col, training_frame=df)
  dl.show()
Example #22
def deepwater_custom_cnn_mnist():
  if not H2ODeepWaterEstimator.available(): return

  train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
  test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

  predictors = list(range(0,784))
  resp = 784
  train[resp] = train[resp].asfactor()
  test[resp] = test[resp].asfactor()
  nclasses = train[resp].nlevels()[0]

  print("Creating the cnn model architecture from scratch using the MXNet Python API")
  cnn(nclasses).save("/tmp/symbol_custom-py.json")

  print("Importing the cnn model architecture for training in H2O")
  model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, mini_batch_size=64,
                                network='user', network_definition_file="/tmp/symbol_custom-py.json",
                                image_shape=[28,28], channels=1)

  model.train(x=predictors, y=resp, training_frame=train, validation_frame=test)
  model.show()
  error = model.model_performance(valid=True).mean_per_class_error()
  assert error < 0.1, "mean classification error on validation set is too high : " + str(error)
Example #23
def algo_max_runtime_secs():
    '''
    This pyunit test ensures that the various models will not crash when max_runtime_secs
    is set too short.  See PUBDEV-4802.
    '''
    global model_within_max_runtime
    seed = 12345

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA, pca_method=Power
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=Randomized
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Randomized", compute_metrics=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # PCA, pca_method=GLRM
    model = H2OPCA(k=10, transform="STANDARDIZE", pca_method="GLRM", compute_metrics=True, use_all_factor_levels=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model])

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol-1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stacked ensemble: the stacking part is not iterative
    print("******************** Skip testing stack ensemble.  Not an iterative algo.")

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol-1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naive Bayes, not iterative
    print("******************** Skip testing Naive Bayes.  Not an iterative algo.")

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # deepwater
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50, hidden=[4096, 4096, 4096], hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(model, training1_data, x_indices, y_index)
        cleanUp([training1_data, model])

    # GLRM; it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=0.3, gamma_y=0.3, transform="STANDARDIZE",
                                           recover_svd=True)
    grabRuntimeInfo(model, training1_data, x_indices)
    cleanUp([training1_data, model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
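# Hedged sketch of the two helpers this test depends on but does not define:
# grabRuntimeInfo presumably trains with a deliberately short max_runtime_secs
# and records whether the limit was honored, and cleanUp frees H2O-side objects.
# The exact signatures vary (the second version of this test below passes an
# err_bound and a factor first, and the word2vec call would need special-casing);
# this matches the supervised/unsupervised calls in the function above:
import time

model_within_max_runtime = []

def grabRuntimeInfo(model, training_data, x_indices, y_index=None):
    limit = 1  # seconds; intentionally too short for any of these algos
    start = time.time()
    if y_index is None:
        model.train(x=x_indices, training_frame=training_data, max_runtime_secs=limit)
    else:
        model.train(x=x_indices, y=y_index, training_frame=training_data, max_runtime_secs=limit)
    elapsed = time.time() - start
    # flag models that blew well past the requested budget
    model_within_max_runtime.append(1 if elapsed > 2 * limit else 0)

def cleanUp(objects):
    for obj in objects:
        h2o.remove(obj)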
Example #24
def cv_cars_dw():
  if not H2ODeepWaterEstimator.available(): return

  # read in the dataset and construct training set (and validation set)
  cars =  h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

  # choose the type of model-building exercise (restricted to classification here):
  # 0: regression, 1: binomial, 2: multinomial
  problem = random.sample(list(range(2)),1)[0] + 1  # yields 1 or 2, i.e. classification only

  # pick the predictors and the correct response column
  predictors = ["displacement","power","weight","acceleration","year"]
  if problem == 1:
    response_col = "economy_20mpg"
    cars[response_col] = cars[response_col].asfactor()
  elif problem == 2:
    response_col = "cylinders"
    cars[response_col] = cars[response_col].asfactor()
  else:
    response_col = "economy"

  print("Response column: {0}".format(response_col))

  ## cross-validation
  # 1. basic

  dl = H2ODeepWaterEstimator(nfolds=random.randint(3,10),fold_assignment="Modulo",hidden=[20,20],epochs=10)
  dl.train(x=predictors, y=response_col, training_frame=cars)

  # 2. check that cv metrics are different over repeated "Random" runs
  nfolds = random.randint(3,10)
  dl1 = H2ODeepWaterEstimator(nfolds=nfolds,fold_assignment="Random",hidden=[20,20],epochs=10)
  dl1.train(x=predictors,y=response_col,training_frame=cars)
  dl2 = H2ODeepWaterEstimator(nfolds=nfolds,fold_assignment="Random",hidden=[20,20],epochs=10)
  try:
    pyunit_utils.check_models(dl1, dl2, True)
    assert False, "Expected models to be different over repeated Random runs"
  except AssertionError:
    assert True

  # 3. folds_column
  num_folds = random.randint(2,5)
  fold_assignments = h2o.H2OFrame([[random.randint(0,num_folds-1)] for _ in range(cars.nrow)])
  fold_assignments.set_names(["fold_assignments"])
  cars = cars.cbind(fold_assignments)

  dl = H2ODeepWaterEstimator(keep_cross_validation_predictions=True,hidden=[20,20],epochs=10)
  dl.train(x=predictors,y=response_col,training_frame=cars,fold_column="fold_assignments")

  num_cv_models = len(dl._model_json['output']['cross_validation_models'])
  assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                   "{1}".format(num_folds, num_cv_models)
  cv_model1 = h2o.get_model(dl._model_json['output']['cross_validation_models'][0]['name'])
  cv_model2 = h2o.get_model(dl._model_json['output']['cross_validation_models'][1]['name'])


  # 4. keep_cross_validation_predictions
  cv_predictions = dl1._model_json['output']['cross_validation_predictions']



  ## boundary cases
  # 1. nfolds = number of observations (leave-one-out cross-validation)
  dl = H2ODeepWaterEstimator(nfolds=cars.nrow, fold_assignment="Modulo",hidden=[20,20],epochs=10)
  dl.train(x=predictors,y=response_col,training_frame=cars)

  # 2. nfolds = 0
  dl = H2ODeepWaterEstimator(nfolds=0,hidden=[20,20],epochs=10)
  dl.train(x=predictors,y=response_col,training_frame=cars)

  # 3. cross-validation and regular validation attempted
  dl = H2ODeepWaterEstimator(nfolds=random.randint(3,10),hidden=[20,20],epochs=10)
  dl.train(x=predictors, y=response_col, training_frame=cars, validation_frame=cars)


  ## error cases
  # 1. nfolds == 1 or < 0
  try:
    dl = H2ODeepWaterEstimator(nfolds=random.sample([-1,1], 1)[0],hidden=[20,20],epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds is 1 or < 0"
  except EnvironmentError:
    assert True

  # 2. more folds than observations
  try:
    dl = H2ODeepWaterEstimator(nfolds=cars.nrow+1,fold_assignment="Modulo",hidden=[20,20],epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds > nobs"
  except EnvironmentError:
    assert True

  # 3. fold_column and nfolds both specified
  try:
    dl = H2ODeepWaterEstimator(nfolds=3, hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, fold_column="fold_assignments", training_frame=cars)
    assert False, "Expected model-build to fail when fold_column and nfolds both specified"
  except EnvironmentError:
    assert True
Example #25
def algo_max_runtime_secs():
    '''
    This pyunit test ensures that max_runtime_secs can restrict the model training time for all
    h2o algos.  See PUBDEV-4702.
    '''
    global model_within_max_runtime
    global err_bound
    seed = 12345

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stacked ensemble: the stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naive Bayes, not iterative
    print(
        "******************** Skip testing Naive Bayes.  Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # deepwater
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(
            path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50,
                                      hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices,
                        y_index)
        cleanUp([training1_data, model])

    # GLRM; it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE")
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # PCA
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    grabRuntimeInfo(err_bound * 3, 1.2, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(err_bound * 2, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
Example #26
import os
import sys
import random

import pandas as pd

import h2o
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator


# Jupyter shell escape to verify a GPU is visible (only valid inside a notebook)
!nvidia-smi

PATH = os.path.expanduser("~/default/")

h2o.init(nthreads=-1)
if not H2ODeepWaterEstimator.available(): sys.exit(0)



frame = h2o.import_file(PATH + "data_path")
submit_frame = h2o.import_file(PATH + "data_path")
print(frame.dim)
print(frame.head(5))



r = frame.runif(seed=123)
trial_frame = frame[r < 0.01]    ## 1% sample for a quick trial run
train_ensemble = frame[r < 0.8]  ## ~80% for training
test_ensemble = frame[r >= 0.8]  ## ~20% held out
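# Hedged continuation: the snippet ends here. Consistent with the imports above
# (H2ORandomForestEstimator) and the frame names, a plausible next step is
# something like the following; the response column is an assumption, since the
# dataset behind "data_path" is not shown:
response = frame.names[-1]      # assumption: response is the last column
predictors = frame.names[:-1]
rf = H2ORandomForestEstimator(ntrees=50, seed=123)
rf.train(x=predictors, y=response, training_frame=train_ensemble)
print(rf.model_performance(test_ensemble))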