Example #1
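These snippets are excerpted from H2O's Python test suite, and the listing omits the imports each one relies on. A plausible preamble, inferred from the names used below (module paths are assumptions, not part of the original):

import random
import shutil
import zipfile
import h2o
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import ensemble
from sklearn.cluster import KMeans
from sklearn.preprocessing import Imputer
from sklearn.metrics import roc_auc_score
from h2o.expr import Expr  # import path assumed; used only in the Expr example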
def group_by():
    h2o_iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names[0:4]

    print "Running smoke test"

    # smoke test
    for na in na_handling:
      grouped = h2o_iris.group_by("class")
      grouped \
        .count(na=na) \
        .min(  na=na) \
        .max(  na=na) \
        .mean( na=na) \
        .var(  na=na) \
        .sd(   na=na) \
        .ss(   na=na) \
        .sum(  na=na)
      print grouped.get_frame()
Example #2

def getModelKmeans(ip,port):
    # Connect to a pre-existing cluster
    h2o.init(ip,port)  # connect to localhost:54321

    #Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2,7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()
        # TODO: implement h2o.getModel()
        model = h2o.getModel(km_h2o._key)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print "sckit centers"
        print km_sci.cluster_centers_
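        # Side-by-side check of the two sets of centroids (added sketch; uses
        # the centers() accessor that the iris comparison example below also
        # relies on):
        print "h2o centers"
        print km_h2o.centers()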
Example #3
def frame_slicing(ip,port):
    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    iris.show()
    prostate.show()
    airlines.show()

    ###################################################################

    # H2OFrame[int] (column slice)
    res1 = iris[0]
    assert abs(res1[8,:] - 4.4) < 1e-10, "incorrect values"

    # H2OFrame[int,int]
    res2 = prostate[13, 3]
    assert abs(res2 - 1) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res3 = airlines[12, 0:3]
    assert abs(res3[0,0] - 1987) < 1e-10 and abs(res3[0,1] - 10) < 1e-10 and abs(res3[0,2] - 29) < 1e-10, \
        "incorrect values"

    # H2OFrame[slice, int]
    res4 = iris[5:8, 1]
    assert abs(res4[0,:] - 3.9) < 1e-10 and abs(res4[1,:] - 3.4) < 1e-10 and abs(res4[2,:] - 3.4) < 1e-10, "incorrect values"

    # H2OFrame[slice, slice]
    res5 = prostate[5:8, 0:3]
    assert abs(res5[0,0] - 6) < 1e-10 and abs(res5[1,1] - 0) < 1e-10 and abs(res5[2,2] - 61) < 1e-10, "incorrect values"
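    # Besides integer and slice indexing, an H2OFrame can also be row-filtered
    # with a boolean mask, as later examples do (added sketch):
    tall = iris[iris[0] > 5.0]   # rows whose first column exceeds 5.0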
Example #4
def plot_test():
    kwargs = {}
    kwargs['server'] = True
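    # server=True is presumably set so the ROC plots below render without
    # opening an interactive window during automated runs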

    air = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    s = air[0].runif()
    air_train = air[s <= 0.8]
    air_valid = air[s > 0.8]

    myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
    myY = "IsDepDelayed"

    air_gbm = h2o.gbm(x=air_train[myX], y=air_train[myY], validation_x=air_valid[myX], validation_y=air_valid[myY],
                      distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01)

    # Plot ROC for training and validation sets
    air_gbm.plot(type="roc", train=True, **kwargs)
    air_gbm.plot(type="roc", valid=True, **kwargs)

    air_test = h2o.import_file(h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf = air_gbm.model_performance(air_test)

    #Plot ROC for test set
    perf.plot(type="roc", **kwargs)
Example #5
def fiftycatGBM(ip,port):
  # Training set has only 45 categories cat1 through cat45
  #Log.info("Importing 50_cattest_train.csv data...\n")
  train = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
  train["y"] = train["y"].asfactor()

  #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
  #train.summary()
  
  # Train H2O GBM Model:
  #Log.info(paste("H2O GBM with parameters:\nntrees = 10, max_depth = 20, nbins = 20\n", sep = ""))
  model = h2o.gbm(x=train[["x1","x2"]], y=train["y"], distribution="bernoulli", ntrees=10, max_depth=5, nbins=20)
  model.show()
 
  # Test dataset has all 50 categories cat1 through cat50
  #Log.info("Importing 50_cattest_test.csv data...\n")
  test = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
  #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
  #test.summary()
  
  # Predict on test dataset with GBM model:
  #Log.info("Performing predictions on test dataset...\n")
  predictions = model.predict(test)
  predictions.show()
  
  # Get the confusion matrix and AUC
  #Log.info("Confusion matrix of predictions (max accuracy):\n")
  performance = model.model_performance(test)
  test_cm = performance.confusion_matrix()
  test_auc = performance.auc()
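  # The metrics above are computed but never displayed; a minimal inspection
  # (not in the original snippet) would be:
  print test_cm
  print test_auc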
Example #6

def anomaly(ip, port):
    h2o.init(ip, port)

    print "Deep Learning Anomaly Detection MNIST"

    train = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_frame(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = range(0,784)
    resp = 784

    # unsupervised -> drop the response column (digit: 0-9)
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on train_hex
    ae_model = h2o.deeplearning(x=train[predictors], training_frame=train, activation="Tanh", autoencoder=True,
                                hidden=[50], l1=1e-5, ignore_const_cols=False, epochs=1)

    # 2) DETECT OUTLIERS
    # anomaly app computes the per-row reconstruction error for the test data set
    # (passing it through the autoencoder model and computing mean square error (MSE) for each row)
    test_rec_error = ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Let's look at the test set points with low/median/high reconstruction errors.
    # We will now visualize the original test set points and their reconstructions obtained
    # by propagating them through the narrow neural net.

    # Convert the test data into its autoencoded representation (pass through narrow neural net)
    test_recon = ae_model.predict(test)
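    # A minimal look at the spread of per-row reconstruction errors (added
    # sketch; assumes test_rec_error is the one-column H2OFrame of MSEs that
    # the comments above describe):
    print "min/max reconstruction error:"
    print test_rec_error.min(), test_rec_error.max()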
Example #7

def iris_h2o_vs_sciKmeans(ip,port):
  # Connect to a pre-existing cluster
  h2o.init(ip,port)  # connect to localhost:54321

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
  iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
  iris_sci = iris_sci[:,0:4]

  s =[[4.9,3.0,1.4,0.2],
  [5.6,2.5,3.9,1.1],
  [6.5,3.0,5.2,2.0]]

  start = h2o.H2OFrame(s)
  start_key = start.send_frame()

  h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

  sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
  sci_km.fit(iris_sci)

  # Log.info("Cluster centers from H2O:")
  print "Cluster centers from H2O:"
  h2o_centers = h2o_km.centers()
  print h2o_centers

  # Log.info("Cluster centers from scikit:")
  print "Cluster centers from scikit:"
  sci_centers = sci_km.cluster_centers_.tolist()
  print sci_centers

  for hcenter, scenter in zip(h2o_centers, sci_centers):
    for hpoint, spoint in zip(hcenter,scenter):
      assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
Example #8

def offsets_and_distributions(ip,port):

    # cars
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    offset = h2o.H2OFrame(python_obj=[[.5] for x in range(398)])
    offset.setNames(["x1"])
    cars = cars.cbind(offset)

    # insurance
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    #predictions = dl.predict(cars)

    # gamma
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gamma", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # gaussian
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # poisson
    dl = h2o.deeplearning(x=insurance[0:3], y=insurance["Claims"], distribution="poisson", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)

    # tweedie
    dl = h2o.deeplearning(x=insurance.names[0:3], y="Claims", distribution="tweedie", offset_column="offset", training_frame=insurance)
    predictions = dl.predict(insurance)
Example #9
def fiftycatRF(ip, port):

    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()

    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O DRF Model:
    # Log.info(paste("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n", sep = ""))
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_file(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with DRF model:
    # Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)
Example #10

def link_functions_binomial(ip,port):
	# Connect to h2o
	h2o.init(ip,port)

	print("Read in prostate data.")
	h2o_data = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
	h2o_data.head()

	sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open("prostate_complete.csv")).as_matrix()
	sm_data_response = sm_data[:,2]
	sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

	print("Testing for family: BINOMIAL")
	print("Set variables for h2o.")
	myY = "CAPSULE"
	myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

	print("Create models with canonical link: LOGIT")
	h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5], Lambda=[0])
	sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial(sm.families.links.logit)).fit()

	print("Compare model deviances for link function logit")
	h2o_deviance = h2o_model._model_json['output']['residual_deviance'] / h2o_model._model_json['output']['null_deviance']
	sm_deviance = sm_model.deviance / sm_model.null_deviance
	assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"
Example #11
def wide_dataset_large(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial", lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"
Example #12

def link_functions_gamma(ip,port):
	# Connect to h2o
	h2o.init(ip,port)

	print("Read in prostate data.")
	h2o_data = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
	h2o_data.head()

	sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open("prostate_complete.csv")).as_matrix()
	sm_data_response = sm_data[:,5]
	sm_data_features = sm_data[:,[1,2,3,4,6,7,8,9]]

	print("Testing for family: GAMMA")
	print("Set variables for h2o.")
	myY = "DPROS"
	myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","CAPSULE"]

	print("Create models with canonical link: INVERSE")
	h2o_model_in = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="inverse",alpha=[0.5], Lambda=[0])
	sm_model_in = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Gamma(sm.families.links.inverse_power)).fit()

	print("Compare model deviances for link function inverse")
	h2o_deviance_in = h2o_model_in._model_json['output']['residual_deviance'] / h2o_model_in._model_json['output']['null_deviance']
	sm_deviance_in = sm_model_in.deviance / sm_model_in.null_deviance
	assert h2o_deviance_in - sm_deviance_in < 0.01, "expected h2o to have an equivalent or better deviance measure"

	print("Create models with canonical link: LOG")
	h2o_model_log = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gamma", link="log",alpha=[0.5], Lambda=[0])
	sm_model_log = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Gamma(sm.families.links.log)).fit()

	print("Compare model deviances for link function log")
	h2o_deviance_log = h2o_model_log._model_json['output']['residual_deviance'] / h2o_model_log._model_json['output']['null_deviance']
	sm_deviance_log = sm_model_log.deviance / sm_model_log.null_deviance
	assert h2o_deviance_log - sm_deviance_log < 0.01, "expected h2o to have an equivalent or better deviance measure"
Example #13

def link_functions_gaussian(ip,port):
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).
                          open("prostate_complete.csv")).as_matrix()
    sm_data_response = sm_data[:,9]
    sm_data_features = sm_data[:,1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity",alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"
Example #14
def pubdev_1953():

    # small_test = [h2o.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay=1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"]     = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"), h2o.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay=1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[ rain.isna() ] = 0
    # bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[  r  < 0.6]
    # test  = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    predictors = ['DayOfWeek', 'WC1', 'start station name', 'Temperature (C)', 'Days', 'Month', 'Humidity Fraction', 'Rain (mm)', 'Dew Point (C)']

    train = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_train.csv"))
    test = h2o.import_file(h2o.locate("smalldata/glm_test/citibike_small_test.csv"))

    glm0 = h2o.glm(x=train[predictors], y=train["bikes"], validation_x=test[predictors], validation_y=test["bikes"], family="poisson")
Example #15
def smallcatGBM(ip,port):
  # Training set has 26 categories from A to Z
  # Categories A, C, E, G, ... are perfect predictors of y = 1
  # Categories B, D, F, H, ... are perfect predictors of y = 0

  #Log.info("Importing alphabet_cattest.csv data...\n")
  alphabet = h2o.import_file(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
  alphabet["y"] = alphabet["y"].asfactor()
  #Log.info("Summary of alphabet_cattest.csv from H2O:\n")
  #alphabet.summary()

  # Prepare data for scikit use
  trainData = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"), delimiter=',', skiprows=1,
                         converters={0:lambda s: ord(s.split("\"")[1])})
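  # the converter maps the quoted letter in column 0 (e.g. "A") to its
  # character code via ord(), so numpy can load the categorical column as a
  # number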
  trainDataResponse = trainData[:,1]
  trainDataFeatures = trainData[:,0]
  
  # Train H2O GBM Model:
  #Log.info("H2O GBM (Naive Split) with parameters:\nntrees = 1, max_depth = 1, nbins = 100\n")
  gbm_h2o = h2o.gbm(x=alphabet[['X']], y=alphabet["y"], distribution="bernoulli", ntrees=1, max_depth=1, nbins=100)
  gbm_h2o.show()
  
  # Train scikit GBM Model:
  # Log.info("scikit GBM with same parameters:")
  gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
  gbm_sci.fit(trainDataFeatures[:,np.newaxis],trainDataResponse)
Example #16
def dim_checks():
  # Log.info("Uploading logreg/princeton/cuse.dat")
  h2o_data = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))
  np_data = np.loadtxt(h2o.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

  h2o_rows, h2o_cols = h2o_data.dim
  np_rows, np_cols = list(np_data.shape)

  print 'The dimensions of the h2o frame are: {0} x {1}'.format(h2o_rows, h2o_cols)
  print 'The dimensions of the numpy array are: {0} x {1}'.format(np_rows, np_cols)

  assert [h2o_rows, h2o_cols] == [np_rows, np_cols], "expected equal number of columns and rows"

  # Log.info("Slice out a column and data frame it, try dim on it...")

  h2o_slice = h2o_data[4]
  np_slice = np_data[:,4]

  h2o_rows, h2o_cols = h2o_slice.dim
  np_rows = np_slice.shape[0]

  print 'The dimensions of the h2o column slice are: {0} x {1}'.format(h2o_rows, h2o_cols)
  print 'The dimensions of the numpy array column slice are: {0} x 1'.format(np_rows)

  assert [h2o_rows, h2o_cols] == [np_rows, 1], "expected equal number of columns and rows"

  # Log.info("OK, now try an operator, e.g. '&', and then check dimensions agao...")

  h2oColAmpFive = h2o_slice & 5

  assert h2oColAmpFive.nrow == h2o_rows, "expected the number of rows to remain unchanged"
Example #17
def sdev(ip,port):
  # Connect to h2o
  h2o.init(ip,port)

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
  iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                          delimiter=',',
                          skip_header=1,
                          usecols=(0, 1, 2, 3))

  sd_np = np.std(iris_np, axis=0, ddof=1)
  for i in range(4):
    sd_h2o = iris_h2o[i].sd()
    assert abs(sd_np[i] - sd_h2o) < 1e-10, "expected standard deviations to be the same"

  try:
    iris_h2o[4].sd()
    assert False, "expected an error. column is categorical."
  except EnvironmentError:
    assert True

  try:
    iris_h2o[0:2].sd()
    assert False, "expected an error. more than one column."
  except EnvironmentError:
    assert True
Example #18
def asnumeric(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    h2oframe =  h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    rows = h2oframe.nrow()

    h2oframe['cylinders'] = h2oframe['cylinders'].ascharacter()
    assert h2oframe["cylinders"].isfactor(), "expected the column to be a factor"

    # H2OFrame case
    h2oframe = h2o.asnumeric(h2oframe)
    h2oframe['cylinders'] = h2oframe['cylinders'] - h2oframe['cylinders']
    h2oframe = h2oframe[h2oframe['cylinders'] == 0]
    assert h2oframe.nrow() == rows, "expected the same number of rows as before {0}, but got {1}".format(rows, h2oframe.nrow())

    h2oframe =  h2o.import_frame(path=h2o.locate("smalldata/junit/cars.csv"))
    h2oframe['cylinders'] = h2oframe['cylinders'].ascharacter()
    assert h2oframe["cylinders"].isfactor(), "expected the column to be a factor"

    # H2OVec case
    h2oframe['cylinders'] = h2o.asnumeric(h2oframe['cylinders'])
    h2oframe['cylinders'] = h2oframe['cylinders'] - h2oframe['cylinders']
    h2oframe = h2oframe[h2oframe['cylinders'] == 0]
    assert h2oframe.nrow() == rows, "expected the same number of rows as before {0}, but got {1}".format(rows, h2oframe.nrow())
Example #19
def group_by(ip,port):
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    h2o_iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    h2o_agg_funcs = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names()[0:4]

    print "Running smoke test"

    # smoke test
    for a in h2o_agg_funcs:
       for n in na_handling:
           for c in col_names:
               print "group by : " + str(a) + "; " + str(n) + "; " + str(c)
               h2o.group_by(h2o_iris, ["class"], {"foo":[a,c,n]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for k in h2o_np_agg_dict.keys():
        for c in col_names:
            print "group by comparison: " + str(k) + "; " + str(c)
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo":[k,c,"all"]})
            pd_res = pd_iris.groupby("class")[c].aggregate(h2o_np_agg_dict[k])
            for i in range(3):
                h2o_val = h2o_res[i,1]
                pd_val = pd_res[h2o_res[i,0]]
                assert abs(h2o_val - pd_val) < 1e-06, \
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} " \
                    "values between h2o and pandas on column {3}".format(h2o_val,pd_val,k,c)
Example #20
def milsong_checkpoint(ip,port):

    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)
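    # with checkpoint=, model2 resumes from the restored model's ntrees1 trees
    # and grows only the additional 50 up to ntrees2, rather than retraining
    # from scratch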

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
Example #21

def shuffling_large(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=h2o.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))


    print("Create model on original Arcene dataset.")
    h2o_model = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = h2o.glm(x=train_data[0:1000], y=train_data[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = h2o.glm(x=train_data_shuffled[0:1000], y=train_data_shuffled[1000], family="binomial", lambda_search=True, alpha=[0.5], use_all_factor_levels=True)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")

    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_2._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1],float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2],float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"

    for x, y in zip(h2o_model._model_json['output']['coefficients_table'].cell_values,h2o_model_s._model_json['output']['coefficients_table'].cell_values):
        assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), "coefficients should be the same type"
        if isinstance(x[1],float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2],float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
Example #22
def get_model_test(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected regression predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected binomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r,0]
        p2 = predictions2[r,0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}" \
                         "".format(r, p1, p2)
Example #23
def test_locate():

    iris_path = h2o.locate("smalldata/iris/iris.csv")

    try:
        h2o.locate("smalldata/iris/afilethatdoesnotexist.csv")
        assert False, "Expected h2o.locate to raise a ValueError"
    except ValueError:
        assert True
Example #24
def frame_show(ip, port):

    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=h2o.locate("smalldata/airlines/allyears2k.zip"))

    iris.show()
    prostate.show()
    airlines.show()
Example #25
def bernoulliGBM(ip,port):
  #Log.info("Importing prostate.csv data...\n")
  prostate_train = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate_train.csv"))

  #Log.info("Converting CAPSULE and RACE columns to factors...\n")
  prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

  #Log.info("H2O Summary of prostate frame:\n")
  #prostate.summary()

  # Import prostate_train.csv as numpy array for scikit comparison
  trainData = np.loadtxt(h2o.locate("smalldata/logreg/prostate_train.csv"), delimiter=',', skiprows=1)
  trainDataResponse = trainData[:,0]
  trainDataFeatures = trainData[:,1:]

  ntrees = 100
  learning_rate = 0.1
  depth = 5
  min_rows = 10
  # Build H2O GBM classification model:
  #Log.info(paste("H2O GBM with parameters:\ndistribution = 'bernoulli', ntrees = ", ntrees, ", max_depth = 5,
  # min_rows = 10, learn_rate = 0.1\n", sep = ""))
  gbm_h2o = h2o.gbm(x=prostate_train[1:], y=prostate_train["CAPSULE"], ntrees=ntrees, learn_rate=learning_rate,
                    max_depth=depth, min_rows=min_rows, distribution="bernoulli")

  # Build scikit GBM classification model
  #Log.info("scikit GBM with same parameters\n")
  gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=ntrees, max_depth=depth,
                                                min_samples_leaf=min_rows, max_features=None)
  gbm_sci.fit(trainDataFeatures,trainDataResponse)

  #Log.info("Importing prostate_test.csv data...\n")
  prostate_test = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate_test.csv"))

  #Log.info("Converting CAPSULE and RACE columns to factors...\n")
  prostate_test["CAPSULE"] = prostate_test["CAPSULE"].asfactor()

  # Import prostate_test.csv as numpy array for scikit comparison
  testData = np.loadtxt(h2o.locate("smalldata/logreg/prostate_test.csv"), delimiter=',', skiprows=1)
  testDataResponse = testData[:,0]
  testDataFeatures = testData[:,1:]

  # Score on the test data and compare results

  # scikit
  auc_sci = roc_auc_score(testDataResponse, gbm_sci.predict_proba(testDataFeatures)[:,1])

  # h2o
  gbm_perf = gbm_h2o.model_performance(prostate_test)
  auc_h2o = gbm_perf.auc()

  #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o))
  assert auc_h2o >= auc_sci, "h2o AUC should be at least as good as scikit's"
Example #26
def headers(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    headers = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    headers_and = h2o.import_frame(h2o.locate("smalldata/airlines/allyears2k.zip"), col_names=headers)
    print headers.names()
    print headers_and.names()
    assert headers.names() == headers_and.names(), "Expected the same column names but got {0} and {1}". \
        format(headers.names(), headers_and.names())
Example #27
def headers():

    headers = h2o.import_file(h2o.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    headers_and = h2o.import_file(h2o.locate("smalldata/airlines/allyears2k.zip"))
    headers_and.set_names(headers.names)
    print headers.names
    print headers_and.names
    assert headers.names == headers_and.names, "Expected the same column names but got {0} and {1}".format(
        headers.names, headers_and.names
    )
Example #28

def pubdev_1839(ip, port):

    train = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test  = h2o.import_file(h2o.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    glm0 = h2o.glm(x=train.drop("bikes"),
                   y=train["bikes"],
                   validation_x=test.drop("bikes"),
                   validation_y=test["bikes"],
                   family="poisson")
Example #29

def deeplearning_autoencoder(ip, port):
    resp = 784
    nfeatures = 20 # number of features (smallest hidden layer)

    train_hex = h2o.upload_file(h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    train_hex[resp] = train_hex[resp].asfactor()

    test_hex = h2o.upload_file(h2o.locate("bigdata/laptop/mnist/test.csv.gz"))
    test_hex[resp] = test_hex[resp].asfactor()

    # split data into two parts
    sid = train_hex[0].runif(1234)

    # unsupervised data for autoencoder
    train_unsupervised = train_hex[sid >= 0.5]
    train_unsupervised = train_unsupervised.drop(resp)
    train_unsupervised.describe()

    # supervised data for drf
    train_supervised = train_hex[sid < 0.5]
    train_supervised.describe()

    # train autoencoder
    ae_model = h2o.deeplearning(x=train_unsupervised[0:resp],
                                activation="Tanh",
                                autoencoder=True,
                                hidden=[nfeatures],
                                epochs=1,
                                reproducible=True, #slow, turn off for real problems
                                seed=1234)

    # convert train_supervised with autoencoder to lower-dimensional space
    train_supervised_features = ae_model.deepfeatures(train_supervised[0:resp]._frame(), 0)

    assert train_supervised_features.ncol() == nfeatures, "Dimensionality of reconstruction is wrong!"

    # Train DRF on extracted feature space
    drf_model = h2o.random_forest(x=train_supervised_features[0:20],
                                  y=train_supervised[resp],
                                  ntrees=10,
                                  min_rows=10,
                                  seed=1234)

    # Test the DRF model on the test set (processed through deep features)
    test_features = ae_model.deepfeatures(test_hex[0:resp]._frame(), 0)
    test_features = test_features.cbind(test_hex[resp])._frame()

    # Confusion Matrix and assertion
    cm = drf_model.confusion_matrix(test_features)
    cm.show()

    # 10% error +/- 0.001
    assert abs(cm.cell_values[10][10] - 0.1078) < 0.001, "Error. Expected 0.1078, but got {0}".format(cm.cell_values[10][10])
Example #30

def pub_445_long_request_uri(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    mnistTrain = h2o.import_frame(path=h2o.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnistTest = h2o.import_frame(path=h2o.locate("bigdata/laptop/mnist/train.csv.gz"))

    mnistTrain[784]._name = "label"
    mnistTest[784]._name = "label"

    mnistModel = h2o.gbm(x=mnistTrain.drop("label"), y=mnistTrain["label"], validation_x=mnistTest.drop("label"), validation_y=mnistTest["label"], ntrees=100, max_depth=10)
Example #31
def swpredsRF(ip, port):
    # Training set has two predictor columns
    # X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise
    # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)

    #Log.info("Importing swpreds_1000x3.csv data...\n")
    swpreds = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    #Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
    #swpreds.summary()

    # Train H2O DRF without Noise Column
    #Log.info("Distributed Random Forest with only Predictor Column")
    model1 = h2o.random_forest(x=swpreds[["X1"]],
                               y=swpreds["y"],
                               ntrees=50,
                               max_depth=20,
                               nbins=500)
    model1.show()
    perf1 = model1.model_performance(swpreds)
    print(perf1.auc())

    # Train H2O DRF Model including Noise Column:
    #Log.info("Distributed Random Forest including Noise Column")
    model2 = h2o.random_forest(x=swpreds[["X1", "X2"]],
                               y=swpreds["y"],
                               ntrees=50,
                               max_depth=20,
                               nbins=500)
    model2.show()
    perf2 = model2.model_performance(swpreds)
    print(perf2.auc())
Example #32
def var_test(ip, port):

    iris_h2o = h2o.import_file(
        path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))

    var_np = np.var(iris_np, axis=0, ddof=1)
    for i in range(4):
        var_h2o = iris_h2o[i].var()
        assert abs(var_np[i] - var_h2o) < 1e-10, "expected equal variances"

    var_cov_h2o = iris_h2o[0:4].var()
    var_cov_np = np.cov(iris_np, rowvar=0, ddof=1)
Example #33
def nfold_predict(ip,port):
  fr = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate_train.csv"))
  m  = h2o.gbm(x=fr[2:], y=fr[1], nfolds=10, ntrees=10)
  xval_models = m.get_xval_models()
  fr["weights"]=1
  preds = [model.predict(fr) for model in xval_models]
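  # element-wise average of the 10 fold-model prediction frames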
  (sum(preds)/10).show()
Example #34
def multi_dim_slicing(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    prostate = h2o.import_frame(
        path=h2o.locate("smalldata/logreg/prostate.csv"))

    # prostate[int,int] case
    # 48,0,68,1,2,1,12.3,16.3,8
    pros = prostate[47:51, 7]
    assert pros[0] == 16.3, "Incorrect slicing result"
    pros = prostate[172, 8]
    assert pros == 7, "Incorrect slicing result"

    # prostate[slice,int] case
    # rows:
    # 171,1,74,1,3,1,7,0,6
    # 172,1,71,1,3,1,3.3,0,6
    # 173,1,60,1,4,1,7.3,0,7
    # 174,1,62,1,2,1,17.2,0,7
    # 175,0,71,1,2,1,3.8,19,6
    # 176,0,67,1,3,1,5.7,15.4,6
    pros = prostate[170:176, 2]
    assert pros[0] == 74, "Incorrect slicing result"
    assert pros[1] == 71, "Incorrect slicing result"
    assert pros[2] == 60, "Incorrect slicing result"
    assert pros[3] == 62, "Incorrect slicing result"
    assert pros[4] == 71, "Incorrect slicing result"
    assert pros[5] == 67, "Incorrect slicing result"

    # # prostate[int,list] case
    # # 353,0,54,1,3,1,21.6,25,7
    # # 226,0,68,1,2,1,12.7,0,7
    # # 238,0,66,1,2,1,11,36.6,6
    # pros = prostate[6,[352,225,237]]
    # assert (pros[0,0] - 12.7) < 1e-10, "Incorrect slicing result"
    # assert (pros[0,1] - 11) < 1e-10, "Incorrect slicing result"
    # assert (pros[0,2] - 21.6) < 1e-10, "Incorrect slicing result"

    # prostate [int,slice] case
    # 189,1,69,1,3,2,8,31.2,6
    pros = prostate[188, 0:3]
    assert pros[0, 0] == 189, "Incorrect slicing result"
    assert pros[0, 1] + 1 == 2, "Incorrect slicing result"
    assert pros[0, 2] == 69, "Incorrect slicing result"

    # prostate [slice,slice] case
    # 84,0,75,1,2,1,11,35,7
    # 85,0,75,1,1,1,9.9,15.4,7
    # 86,1,75,1,3,1,3.7,0,6
    pros = prostate[83:86, 1:4]
    assert pros[0, 0] == 0, "Incorrect slicing result"
    assert pros[0, 1] == 75, "Incorrect slicing result"
    assert pros[0, 2] - 1 == 0, "Incorrect slicing result"
    assert pros[1, 0] == 0, "Incorrect slicing result"
    assert pros[1, 1] + 75 == 150, "Incorrect slicing result"
    assert pros[1, 2] == 1, "Incorrect slicing result"
    assert pros[2, 0] + 1 == 2, "Incorrect slicing result"
    assert pros[2, 1] == 75, "Incorrect slicing result"
    assert pros[2, 2] == 1, "Incorrect slicing result"
Example #35
def sub_gsub_check(ip, port):
    frame = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    # single column (frame)
    frame["C5"] = frame["C5"].gsub("s", "z")
    assert frame[
        0, 4] == "Iriz-zetoza", "Expected 'Iriz-zetoza', but got {0}".format(
            frame[0, 4])

    frame["C5"] = frame["C5"].sub("z", "s")
    assert frame[
        1, 4] == "Iris-zetoza", "Expected 'Iris-zetoza', but got {0}".format(
            frame[1, 4])

    # single column (vec)
    vec = frame["C5"]
    vec = vec.sub("z", "s")
    assert vec[
        2, 0] == "Iris-setoza", "Expected 'Iris-setoza', but got {0}".format(
            vec[2, 0])

    vec = vec.gsub("s", "z")
    assert vec[
        3, 0] == "Iriz-zetoza", "Expected 'Iriz-zetoza', but got {0}".format(
            vec[3, 0])
Example #36
def demo_prostateGBM(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # execute ipython notebook
    h2o.ipy_notebook_exec(h2o.locate("h2o-py/demos/prostate_gbm.ipynb"),
                          save_and_norun=False)
Example #37
def rf_balance_classes(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # execute ipython notebook
    h2o.ipy_notebook_exec(h2o.locate("h2o-py/demos/rf_balance_classes.ipynb"),
                          save_and_norun=False)
Example #38

def link_functions_tweedie_basic(ip, port):

    print "Read in prostate data."
    hdf = h2o.upload_file(
        h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))

    print "Testing for family: TWEEDIE"
    print "Set variables for h2o."
    y = "CAPSULE"
    x = ["AGE", "RACE", "DCAPS", "PSA", "VOL", "DPROS", "GLEASON"]

    print "Create models with canonical link: TWEEDIE"
    model_h2o_tweedie = h2o.glm(x=hdf[x],
                                y=hdf[y],
                                family="tweedie",
                                link="tweedie",
                                alpha=[0.5],
                                Lambda=[0])

    print "Compare model deviances for link function tweedie (using precomputed values from R)"
    deviance_h2o_tweedie = model_h2o_tweedie.residual_deviance() / model_h2o_tweedie.null_deviance()

    assert 0.721452 - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than R's. h2o: " \
                                                    "{0}, r: {1}".format(deviance_h2o_tweedie, 0.721452)
Example #39
def expr_show(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print "iris:"
    iris.show()

    ###################################################################

    # expr[int], expr._data is pending
    res = 2 - iris
    res2 = res[0]
    print "res2:"
    res2.show()

    # expr[int], expr._data is remote
    res3 = res[0]
    print "res3:"
    res3.show()

    # expr[int], expr._data is local
    expr = Expr([1, 2, 3])
    print "expr:"
    expr.show()

    # expr[tuple], expr._data is local
    expr = Expr([[1, 2, 3], [4, 5, 6]])
    print "expr:"
    expr.show()
Example #40
def center_scale(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))[0:4]

    # frame (default args)
    foo = iris.scale()
    # TODO: the below assertion fails. Should it?
    #assert abs(foo[0,0] - -0.8976739) < 1e-6 and  abs(foo[0,1] - 1.01560199) < 1e-6 and abs(foo[0,2] - -1.335752) < 1e-6 \
    #       and abs(foo[0,3] - -1.311052) < 1e-6, "h2o differed from r. h2o got {0}, {1}, {2}, and {3}" \
    #                                             "".format(foo[0,0],foo[0,1],foo[0,2],foo[0,3])

    # frame (centers=True, scale=False)
    foo = iris.scale(center=True, scale=False)

    # frame (centers=False, scale=True)
    foo = iris.scale(center=False, scale=True)

    # frame (centers=False, scale=False)
    foo = iris.scale(center=False, scale=False)

    # vec (default args)
    foo = iris[0].scale()

    # vec (centers=True, scale=False)
    foo = iris[1].scale(center=True, scale=False)

    # vec (centers=False, scale=True)
    foo = iris[2].scale(center=False, scale=True)

    # vec (centers=False, scale=False)
    foo = iris[3].scale(center=False, scale=False)
Example #41
def demo_citibike_small(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # execute ipython notebook
    h2o.ipy_notebook_exec(h2o.locate("h2o-py/demos/citi_bike_small.ipynb"),
                          save_and_norun=False)
Example #42
def offset_gaussian(ip,port):
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))

    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3], y=insurance["Claims"], distribution="gaussian", ntrees=600, max_depth=1, min_rows=1,
                  learn_rate=.1, offset_column="offset", training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from R's gbm:
    #	fit2 <- gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,
    #               shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #   data = Insurance, distribution ="gaussian", n.trees = 600)
    #   pg = predict(fit2, newdata = Insurance, type = "response", n.trees=600)
    #   pr = pg - - log(Insurance$Holders)
    assert abs(44.33016 - gbm._model_json['output']['init_f']) < 1e-5, "expected init_f to be {0}, but got {1}". \
        format(44.33016, gbm._model_json['output']['init_f'])
    assert abs(1491.135 - gbm.mse()) < 1e-2, "expected mse to be {0}, but got {1}".format(1491.135, gbm.mse())
    assert abs(49.23438 - predictions.mean()) < 1e-2, "expected prediction mean to be {0}, but got {1}". \
        format(49.23438, predictions.mean())
    assert abs(-45.5720659304 - predictions.min()) < 1e-2, "expected prediction min to be {0}, but got {1}". \
        format(-45.5720659304, predictions.min())
    assert abs(207.387 - predictions.max()) < 1e-2, "expected prediction max to be {0}, but got {1}". \
        format(207.387, predictions.max())
Example #43

def benign(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    training_data = h2o.import_frame(h2o.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = [x for x in range(2, 11) if x != Y]

    #Log.info("Build the model")
    model = h2o.glm(y=training_data[Y].asfactor(),
                    x=training_data[X],
                    family="binomial",
                    alpha=[0],
                    Lambda=[1e-5])

    #Log.info("Check that the columns used in the model are the ones we passed in.")
    #Log.info("===================Columns passed in: ================")
    in_names = [training_data.names()[i] for i in X]
    #Log.info("===================Columns passed out: ================")
    out_names = [
        model._model_json['output']['coefficients_table'].cell_values[c][0]
        for c in range(len(X))
    ]
    assert in_names == out_names
Example #44

def deep_learning_metrics_test(ip, port):
  h2o.init(ip, port)               # connect to existing cluster
  df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

  del df['ID']                               # remove ID
  df['CAPSULE'] = df['CAPSULE'].asfactor()   # make CAPSULE categorical
  vol = df['VOL']
  vol[vol == 0] = None                       # 0 VOL means 'missing'

  r = vol.runif()                            # random train/test split
  train = df[r < 0.8]
  test  = df[r >= 0.8]

  # See that the data is ready
  train.describe()
  train.head()
  test.describe()
  test.head()

  # Run DeepLearning

  print "Train a Deeplearning model: "
  dl = h2o.deeplearning(x           = train[1:],
                        y           = train['CAPSULE'],
                        epochs = 100,
                        hidden = [10, 10, 10],
                        loss   = 'CrossEntropy')
  print "Binomial Model Metrics: "
  print
  dl.show()
  # print dl._model_json
  dl.model_performance(test).show()
Example #45
def prep_airlines(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    air = h2o.import_frame(
        h2o.locate("smalldata/airlines/allyears2k_headers.zip"))

    numRows, numCols = air.dim()

    x_cols = [
        "Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "CRSArrTime",
        "UniqueCarrier", "CRSElapsedTime", "Origin", "Dest", "Distance"
    ]
    y_col = "SynthDepDelayed"

    noDepDelayedNAs = air[air["DepDelay"].isna() == 0]
    print "Dimensions of new dataset: {0}".format(noDepDelayedNAs.dim())

    minutesOfDelayWeTolerate = 15
    noDepDelayedNAs = noDepDelayedNAs.cbind(
        noDepDelayedNAs["DepDelay"] > minutesOfDelayWeTolerate)
    noDepDelayedNAs[numCols] = noDepDelayedNAs[numCols].asfactor()
    noDepDelayedNAs._vecs[numCols].setName(y_col)

    gbm = h2o.gbm(x=noDepDelayedNAs[x_cols],
                  y=noDepDelayedNAs[y_col],
                  loss="bernoulli")
    gbm.show()
Example #46

def link_incompatible_error(ip, port):

    print("Reading in original prostate data.")
    prostate = h2o.import_file(
        path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    print(
        "Throw error when trying to create model with incompatible logit link."
    )
    try:
        h2o.model = h2o.glm(x=prostate[1:8],
                            y=prostate[8],
                            family="gaussian",
                            link="logit")
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.model = h2o.glm(x=prostate[1:8],
                            y=prostate[8],
                            family="tweedie",
                            link="log")
        assert False, "expected an error"
    except EnvironmentError:
        assert True

    try:
        h2o.model = h2o.glm(x=prostate[2:9],
                            y=prostate[1],
                            family="binomial",
                            link="inverse")
        assert False, "expected an error"
    except EnvironmentError:
        assert True
Example #47
def offset_gamma(ip, port):
    insurance = h2o.import_file(h2o.locate("smalldata/glm_test/insurance.csv"))

    insurance["offset"] = insurance["Holders"].log()

    gbm = h2o.gbm(x=insurance[0:3],
                  y=insurance["Claims"],
                  distribution="gamma",
                  ntrees=600,
                  max_depth=1,
                  min_rows=1,
                  learn_rate=.1,
                  offset_column="offset",
                  training_frame=insurance)

    predictions = gbm.predict(insurance)

    # Comparison result generated from harrysouthworth's gbm:
    #	fit2 = gbm(Claims ~ District + Group + Age+ offset(log(Holders)) , interaction.depth = 1,n.minobsinnode = 1,shrinkage = .1,bag.fraction = 1,train.fraction = 1,
    #           data = Insurance, distribution ="gamma", n.trees = 600)
    #	pr = predict(fit2, Insurance)
    #	pr = exp(pr+log(Insurance$Holders))
    assert abs(-1.714958 - gbm._model_json['output']['init_f']) < 1e-5, "expected init_f to be {0}, but got {1}". \
        format(-1.714958, gbm._model_json['output']['init_f'])
    assert abs(50.1087 - predictions.mean()) < 1e-2, "expected prediction mean to be {0}, but got {1}". \
        format(50.1087, predictions.mean())
    assert abs(0.9133843 - predictions.min()) < 1e-4, "expected prediction min to be {0}, but got {1}". \
        format(0.9133843, predictions.min())
    assert abs(392.6667 - predictions.max()) < 0.1, "expected prediction max to be {0}, but got {1}". \
        format(392.6667, predictions.max())
Example #48
def expr_slicing(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    ###################################################################

    # expr[int] (column slice), expr is pending
    res = 2 - iris
    res2 = res[0]
    assert abs(res2[3,0] - -2.6) < 1e-10 and abs(res2[17,0] - -3.1) < 1e-10 and abs(res2[24,0] - -2.8) < 1e-10, \
        "incorrect values"

    # expr[int,int], expr is remote
    assert abs(res[13, 3] - 1.9) < 1e-10, "incorrect values"

    # expr[int, slice], expr is remote
    res4 = res[12, 0:4]
    assert abs(res4[0,0] - -2.8) < 1e-10 and abs(res4[0,1] - -1.0) < 1e-10 and abs(res4[0,2] - 0.6) < 1e-10 and \
        abs(res4[0,3] - 1.9) < 1e-10, "incorrect values"

    # expr[slice, int], expr is remote
    res5 = res[5:9, 1]
    assert abs(res5[0,0] - -1.9) < 1e-10 and abs(res5[1,0] - -1.4) < 1e-10 and abs(res5[2,0] - -1.4) < 1e-10 and \
           abs(res5[3,0] - -0.9) < 1e-10, "incorrect values"

    # expr[slice, slice], expr is pending
    res = iris * 2
    res6 = res[5:9, 0:4]
    assert abs(res6[0,0] - 10.8) < 1e-10 and abs(res6[1,1] - 6.8) < 1e-10 and abs(res6[2,2] - 3.0) < 1e-10 and \
           abs(res6[3,3] - 0.4) < 1e-10, "incorrect values"
Example #49
def demo_imputation(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # execute ipython notebook
    h2o.ipy_notebook_exec(h2o.locate("h2o-py/demos/imputation.ipynb"),
                          save_and_norun=False)
Example #50
def weights_gamma(ip, port):

    htable = h2o.upload_file(h2o.locate("smalldata/gbm_test/moppe.csv"))
    htable["premiekl"] = htable["premiekl"].asfactor()
    htable["moptva"] = htable["moptva"].asfactor()
    htable["zon"] = htable["zon"]
    #gg = gbm(formula = medskad ~ premiekl + moptva + zon,data = table.1.2,distribution = "gamma", weights = table.1.2$antskad ,
    #     n.trees = 20,interaction.depth = 1,n.minobsinnode = 1,shrinkage = 1,bag.fraction = 1,train.fraction = 1)
    #pr = predict(gg,newdata = table.1.2,type = "response")
    #htable= as.h2o(table.1.2,destination_frame = "htable")
    hh = h2o.gbm(x=htable[0:3],
                 y=htable["medskad"],
                 training_frame=htable,
                 distribution="gamma",
                 weights_column="antskad",
                 ntrees=20,
                 max_depth=1,
                 min_rows=1,
                 learn_rate=1)
    ph = hh.predict(htable)

    assert abs(8.804447 - hh._model_json['output']['init_f']) < 1e-6 * 8.804447
    assert abs(3751.01 - ph[0].min()) < 1e-4 * 3751.01
    assert abs(15298.87 - ph[0].max()) < 1e-4 * 15298.87
    assert abs(8121.98 - ph[0].mean()) < 1e-4 * 8121.98
Example #51
def ls_test(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    h2o.ls()
Example #52
def perfectSeparation_balanced(ip, port):

    # Connect to h2o
    h2o.init(ip, port)

    print("Read in synthetic balanced dataset")
    data = h2o.import_frame(
        path=h2o.locate("smalldata/synthetic_perfect_separation/balanced.csv"))

    print("Fit model on dataset")
    model = h2o.glm(x=data[["x1", "x2"]],
                    y=data["y"],
                    family="binomial",
                    lambda_search=True,
                    use_all_factor_levels=True,
                    alpha=[0.5],
                    Lambda=[0])

    print("Extract models' coefficients and assert reasonable values (i.e. no greater than 50)")
    print("Balanced dataset")
    coef = [
        c[1]
        for c in model._model_json['output']['coefficients_table'].cell_values
        if c[0] != "Intercept"
    ]
    for c in coef:
        assert c < 50, "coefficient is too large"
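The comprehension that pulls non-intercept coefficients out of the model JSON recurs in several of these tests; factored into a helper it might read as follows (nonintercept_coefs is a hypothetical name; the JSON layout is the one used above):

def nonintercept_coefs(model):
    # Rows of cell_values are (name, value, ...); skip the intercept row.
    table = model._model_json['output']['coefficients_table'].cell_values
    return [row[1] for row in table if row[0] != "Intercept"]

# for c in nonintercept_coefs(model): assert c < 50, "coefficient is too large"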
Example #53
def covtype(ip,port):

  # Connect to h2o
  h2o.init(ip,port)

  # Log.info("Importing covtype.20k.data...\n")
  covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))
  myY = 54
  myX = [x for x in range(0,54) if x not in [20,28]]

  # Set response to be indicator of a particular class
  res_class = random.randint(1,4)
  # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
  covtype[54] = (covtype[54] == res_class)

  #covtype.summary()

  # L2: alpha = 0, lambda = 0
  covtype_mod1 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", n_folds=0, alpha=[0], Lambda=[0])
  covtype_mod1.show()

  # Elastic: alpha = 0.5, lambda = 1e-4
  covtype_mod2 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", n_folds=0, alpha=[0.5], Lambda=[1e-4])
  covtype_mod2.show()

  # L1: alpha = 1, lambda = 1e-4
  covtype_mod3 = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial", n_folds=0, alpha=[1], Lambda=[1e-4])
  covtype_mod3.show()
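The three fits differ only in their regularization settings: alpha=0 is pure L2 (ridge), alpha=1 pure L1 (lasso), and 0.5 an even elastic-net mix. The same sweep written as a loop, as a sketch:

# (alpha, lambda) pairs: L2, elastic net, L1
for alpha, lam in [(0, 0), (0.5, 1e-4), (1, 1e-4)]:
    mod = h2o.glm(y=covtype[myY], x=covtype[myX], family="binomial",
                  n_folds=0, alpha=[alpha], Lambda=[lam])
    mod.show()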
Example #54
def link_functions_tweedie_vpow(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(h2o.locate("smalldata/glm_test/auto.csv"))
    y = "y"
    x = list(set(hdf.names) - set(["y"]))

    print "Testing for family: TWEEDIE"
    print "Create models with canonical link: TWEEDIE"
    # Iterate over different variance powers for tweedie
    vpower = [0, 1, 1.5]
    r_dev = [0.7516627, 0.6708826, 0.7733762]
    r_null = [221051.88369951, 32296.29783702, 20229.47425307]
    for ridx, vpow in enumerate(vpower):
        print "Fit h2o.glm:"
        h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow, tweedie_link_power=1-vpow,
                         alpha=[0.5], Lambda=[0])

        print "Testing Tweedie variance power: {0}".format(vpow)

        print "Compare model deviances for link function tweedie"
        deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance()

        assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than " \
                                                           "R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie, r_dev[ridx])

        print "compare null and residual deviance between R glm and h2o.glm for tweedie"
        assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: {0}, r: " \
                                                                   "{1}".format(h2ofit.null_deviance(), r_null[ridx])
Example #55
def vi_toy_test(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    toy_data = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/toy_data_RF.csv"))
    #toy_data.summary()

    toy_data[6] = toy_data[6].asfactor()
    toy_data.show()
    rf = h2o.random_forest(x=toy_data[[0, 1, 2, 3, 4, 5]],
                           y=toy_data[6],
                           ntrees=500,
                           max_depth=20,
                           nbins=100,
                           seed=0)

    ranking = [
        rf._model_json['output']['variable_importances'].cell_values[v][0]
        for v in range(toy_data.ncol() - 1)
    ]
    print(ranking)
    assert tuple(ranking) == tuple(
        ["V3", "V2", "V6", "V5", "V1",
         "V4"]), "expected specific variable importance ranking"
Example #56
def bigcatRF(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Training set has 100 categories from cat001 to cat100
    # Categories cat001, cat003, ... are perfect predictors of y = 1
    # Categories cat002, cat004, ... are perfect predictors of y = 0

    #Log.info("Importing bigcat_5000x2.csv data...\n")
    bigcat = h2o.import_frame(
        path=h2o.locate("smalldata/gbm_test/bigcat_5000x2.csv"))
    bigcat["y"] = bigcat["y"].asfactor()

    #Log.info("Summary of bigcat_5000x2.csv from H2O:\n")
    #bigcat.summary()

    # Train H2O DRF Model:
    #Log.info("H2O DRF (Naive Split) with parameters:\nclassification = TRUE, ntree = 1, depth = 1, nbins = 100, nbins_cats=10\n")
    model = h2o.random_forest(x=bigcat[["X"]],
                              y=bigcat["y"],
                              ntrees=1,
                              max_depth=1,
                              nbins=100,
                              nbins_cats=10)
    model.show()
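For reference, data with the structure the comments describe (odd-numbered categories implying y = 1, even-numbered implying y = 0) is easy to synthesize in plain Python; a sketch, not the actual generator of bigcat_5000x2.csv:

import random

rows = []
for _ in range(5000):
    k = random.randint(1, 100)
    rows.append(("cat%03d" % k, k % 2))  # odd categories -> y = 1, even -> y = 0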
Example #57
def parametersKmeans(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)  # connect to localhost:54321

    #Log.info("Getting data...")
    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    #Log.info("Create and and duplicate...")
    iris_km = h2o.kmeans(x=iris[0:4], k=3, seed=1234)
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in parameters:
        param_dict[p['label']] = p['actual_value']

    iris_km_again = h2o.kmeans(x=iris[0:4], **param_dict)

    #Log.info("wss")
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    #Log.info("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
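One detail worth noting in the wss comparison: sorted() is used rather than list.sort(), because sort() works in place and returns None, so comparing its return values would always vacuously pass. A plain-Python illustration:

xs = [3.0, 1.0, 2.0]
assert xs.sort() is None                            # in-place, returns None
assert sorted([3.0, 1.0, 2.0]) == [1.0, 2.0, 3.0]   # returns a new sorted list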
Example #58
def link_correct_default(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    print("Reading in original prostate data.")
    h2o_data = h2o.upload_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

    print("Compare models with link unspecified and canonical link specified.")
    print("GAUSSIAN: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian")
    h2o_model_specified = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian", link="identity")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficients should be equal"

    print("BINOMIAL: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial")
    h2o_model_specified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial", link="logit")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficients should be equal"

    print("POISSON: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson")
    h2o_model_specified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson", link="log")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficients should be equal"

    print("GAMMA: ")
    h2o_model_unspecified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma")
    h2o_model_specified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma", link="inverse")
    assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
           h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficients should be equal"
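The four blocks repeat one check per (family, canonical link) pair; a compact sketch of the same comparison, using the column slices from above:

cases = [("gaussian", "identity", h2o_data[1:8], h2o_data[8]),
         ("binomial", "logit",    h2o_data[2:9], h2o_data[1]),
         ("poisson",  "log",      h2o_data[2:9], h2o_data[1]),
         ("gamma",    "inverse",  h2o_data[3:9], h2o_data[2])]
for family, link, x, y in cases:
    unspecified = h2o.glm(x=x, y=y, family=family)
    specified = h2o.glm(x=x, y=y, family=family, link=link)
    assert specified._model_json['output']['coefficients_table'].cell_values == \
           unspecified._model_json['output']['coefficients_table'].cell_values, \
        "coefficients should be equal"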
Example #59
def demo_chicago_crimes(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    # execute ipython notebook
    h2o.ipy_notebook_exec(h2o.locate("h2o-py/demos/chicago_crimes.ipynb"),
                          save_and_norun=False)
Example #60
def asfactor_basic(ip,port):
  # Connect to h2o
  h2o.init(ip,port)

  #Log.info("Printing out the head of the cars dataset")
  h2oframe = h2o.import_file(path=h2o.locate("smalldata/junit/cars.csv"))
  h2oframe.show()

  h2oframe['cylinders'].show()
  foo = h2oframe["cylinders"]
  foo.show()

  h2oframe['cylinders'].asfactor().show()

  meow = h2oframe['cylinders'].asfactor()
  meow.show()

  foo = h2oframe["cylinders"].isfactor()
  assert not foo, "expected the foo H2OVec not to be a factor"

  h2oframe["cylinders"] = h2oframe['cylinders'].asfactor()
  h2oframe.show()

  bar = h2oframe["cylinders"].isfactor()
  assert bar, "expected the bar H2OVec to be a factor"