def smallcatGBM():
    """Train a one-tree, depth-1 GBM on the alphabet data with H2O and scikit-learn.

    The training set has 26 categories from A to Z.  Categories A, C, E, G, ...
    are perfect predictors of y = 1; categories B, D, F, H, ... are perfect
    predictors of y = 0.  Both models are only fitted (the H2O one is also
    displayed); nothing is compared or asserted.
    """
    # H2O side: import the frame and make the response categorical.
    h2o_frame = h2o.import_file(path=tests.locate("smalldata/gbm_test/alphabet_cattest.csv"))
    h2o_frame["y"] = h2o_frame["y"].asfactor()

    # scikit side: column 0 is parsed by taking the piece that follows the
    # first double quote and encoding it as its code point.
    def quoted_char_code(field):
        return ord(field.split("\"")[1])

    sk_table = np.loadtxt(tests.locate("smalldata/gbm_test/alphabet_cattest.csv"),
                          delimiter=',',
                          skiprows=1,
                          converters={0: quoted_char_code})
    sk_features = sk_table[:, 0]
    sk_response = sk_table[:, 1]

    # H2O GBM: a single stump with 100 bins.
    h2o_model = h2o.gbm(x=h2o_frame[['X']],
                        y=h2o_frame["y"],
                        distribution="bernoulli",
                        ntrees=1,
                        max_depth=1,
                        nbins=100)
    h2o_model.show()

    # scikit GBM: a single stump as well; the feature vector is reshaped to
    # one column because fit() is given a 2-D feature array.
    sk_model = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    sk_model.fit(sk_features[:, np.newaxis], sk_response)
def dim_checks():
    """Check that H2O frame dimensions agree with numpy on the prostate data.

    Three checks are made:
      1. the whole frame has the same (rows, cols) as the numpy array;
      2. a single-column slice is (rows, 1);
      3. applying ``&`` with a scalar to that column keeps the row count.

    Raises:
        AssertionError: if any of the dimension checks fails.
    """
    # Load the same CSV through both libraries (numpy skips the header row).
    h2o_data = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    np_data = np.loadtxt(tests.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

    h2o_rows, h2o_cols = h2o_data.dim
    np_rows, np_cols = np_data.shape

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('The dimensions of h2o frame is: {0} x {1}'.format(h2o_rows, h2o_cols))
    print('The dimensions of numpy array is: {0} x {1}'.format(np_rows, np_cols))

    assert [h2o_rows, h2o_cols] == [np_rows, np_cols], "expected equal number of columns and rows"

    # Slice out a single column and check its dimensions.
    h2o_slice = h2o_data[4]
    np_slice = np_data[:, 4]

    h2o_rows, h2o_cols = h2o_slice.dim
    np_rows = np_slice.shape[0]  # the numpy slice is 1-D, so only a row count exists

    print('The dimensions of h2o column slice is: {0} x {1}'.format(h2o_rows, h2o_cols))
    print('The dimensions of numpy array column slice is: {0} x 1'.format(np_rows))

    assert [h2o_rows, h2o_cols] == [np_rows, 1], "expected equal number of columns and rows"

    # Now apply an operator ('&') and check the row count again.
    h2oColAmpFive = h2o_slice & 5

    assert h2oColAmpFive.nrow == h2o_rows, "expected the number of rows to remain unchanged"
def link_functions_binomial():
    """Compare H2O and statsmodels binomial GLMs (logit link) on the prostate data.

    H2O is fitted on the named columns in ``myX`` against ``CAPSULE``;
    statsmodels is fitted on matrix column 2 against columns 1 and 3-9
    (presumably the same variables -- confirm against the CSV header).

    Raises:
        AssertionError: if H2O's residual/null deviance ratio exceeds the
            statsmodels ratio by 0.01 or more.
    """
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    # Context managers close both the archive and the member handle once the
    # CSV is parsed.  `.values` replaces DataFrame.as_matrix(), which was
    # deprecated in pandas 0.23 and removed in 1.0.
    with zipfile.ZipFile(tests.locate("smalldata/prostate/prostate_complete.csv.zip")) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(sm.families.links.logit)).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # One-sided on purpose: H2O may be better than statsmodels, just not worse.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
# Example #4
# 0
def deeplearning_demo():
    """Train a three-hidden-layer Deep Learning classifier on the ecology data.

    The model predicts the ``Angaus`` factor from every training column after
    the first, is validated on the evaluation file, and is then displayed.
    Nothing is asserted.
    """
    # Training data: drop the 'Site' column and make the response categorical.
    train_data = h2o.import_file(
        path=tests.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(train_data.describe())
    train_data.head()

    # Testing data
    test_data = h2o.import_file(
        path=tests.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    print(test_data.describe())
    test_data.head()

    # Run DeepLearning
    dl = H2ODeepLearningEstimator(loss="CrossEntropy",
                                  epochs=1000,
                                  hidden=[20, 20, 20])
    # Materialise the column indices so a real list is passed on both
    # Python 2 (where range() already is one) and Python 3.
    dl.train(x=list(range(1, train_data.ncol)),
             y="Angaus",
             training_frame=train_data,
             validation_frame=test_data)
    dl.show()
# Example #5
# 0
def benignKmeans():
    """Print H2O and scikit-learn k-means centers on benign.csv for k = 1..6.

    The scikit-learn copy of the data has missing values replaced by the
    column mean before clustering.  The centers are only printed; nothing is
    asserted.
    """
    benign_h2o = h2o.import_file(
        path=tests.locate("smalldata/logreg/benign.csv"))

    benign_sci = np.genfromtxt(tests.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for k in range(1, 7):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        benign_h2o_km = h2o.kmeans(x=benign_h2o, k=k)
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=k, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("sckit centers")
        print(benign_sci_km.cluster_centers_)
def link_functions_gaussian():
    """Compare H2O and statsmodels gaussian GLMs (identity link) on the prostate data.

    H2O is fitted on the named columns in ``myX`` against ``GLEASON``;
    statsmodels is fitted on matrix column 9 against columns 1-8
    (presumably the same variables -- confirm against the CSV header).

    Raises:
        AssertionError: if H2O's residual/null deviance ratio exceeds the
            statsmodels ratio by 0.01 or more.
    """
    print("Read in prostate data.")
    h2o_data = h2o.import_file(
        path=tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    # Context managers close both the archive and the member handle once the
    # CSV is parsed.  `.values` replaces DataFrame.as_matrix(), which was
    # deprecated in pandas 0.23 and removed in 1.0.
    with zipfile.ZipFile(
            tests.locate("smalldata/prostate/prostate_complete.csv.zip")) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX],
                        y=h2o_data[myY],
                        family="gaussian",
                        link="identity",
                        alpha=[0.5],
                        Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response,
                      exog=sm_data_features,
                      family=sm.families.Gaussian(
                          sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # One-sided on purpose: H2O may be better than statsmodels, just not worse.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
# Example #7
# 0
def group_by():
    """Smoke-test chained group-by aggregations on iris for each NA mode.

    For every NA-handling option the frame is grouped by ``class``, the
    aggregations count/min/max/mean/var/sd/ss/sum are chained onto the
    group-by object, and the resulting frame is printed.  Nothing is
    asserted; the test passes when no call raises.
    """
    h2o_iris = h2o.import_file(
        path=tests.locate("smalldata/iris/iris_wheader.csv"))

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Running smoke test")

    for na in ["ignore", "rm", "all"]:
        grouped = h2o_iris.group_by("class")
        # The chain's return value is discarded on purpose: the result is
        # read back from `grouped` itself below.
        (grouped
            .count(na=na)
            .min(na=na)
            .max(na=na)
            .mean(na=na)
            .var(na=na)
            .sd(na=na)
            .ss(na=na)
            .sum(na=na))
        print(grouped.get_frame())
# Example #8
# 0
def additional_parameters():
    """Check import_file's destination_frame, col_names and col_types options.

    The same file is imported twice: once with ``col_types`` given as a
    positional list and once as a dict keyed by column name.  After each
    import the frame id, the column names and the per-column types reported
    by ``h2o.frame`` are checked.

    Raises:
        AssertionError: if the frame id, names or types differ from what was
            requested.
    """
    dest_frame = "dev29&hex%"
    c_names = ["a", "b", "c"]
    type_specs = (
        ["enum", "enum", "string"],                  # col_types as list
        {"c": "string", "a": "enum", "b": "enum"},   # col_types as dictionary
    )

    for c_types in type_specs:
        fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                               destination_frame=dest_frame,
                               col_names=c_names,
                               col_types=c_types)
        fhex.describe()

        # The id is expected to come back with '%' and '&' replaced by '.'.
        assert fhex._id == dest_frame.replace("%", ".").replace("&", ".")
        assert fhex.col_names == c_names

        col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
        for position, column in enumerate(col_summary):
            # A list is looked up by position, a dict by the column's name.
            key = c_names[position] if isinstance(c_types, dict) else position
            assert column["type"] == c_types[key]
# Example #9
# 0
def plot_test():
    """Train a bernoulli GBM on the airlines data and plot its ROC curves.

    ROC plots are requested for the training split, the validation split and
    a separately imported test file.  Nothing is asserted.
    """
    # NOTE(review): `server=True` presumably selects a non-interactive
    # plotting mode -- confirm against the plot() implementation.
    plot_opts = {'server': True}

    air = h2o.import_file(tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    draw = air[0].runif()
    air_train = air[draw <= 0.8]
    air_valid = air[draw > 0.8]

    response = "IsDepDelayed"
    predictors = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]

    air_gbm = h2o.gbm(x=air_train[predictors],
                      y=air_train[response],
                      validation_x=air_valid[predictors],
                      validation_y=air_valid[response],
                      distribution="bernoulli",
                      ntrees=100,
                      max_depth=3,
                      learn_rate=0.01)

    # ROC for the training and validation sets
    air_gbm.plot(type="roc", train=True, **plot_opts)
    air_gbm.plot(type="roc", valid=True, **plot_opts)

    # ROC for the held-out test file
    air_test = h2o.import_file(tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    test_perf = air_gbm.model_performance(air_test)
    test_perf.plot(type="roc", **plot_opts)
def fiftycatGBM():
    """Train a GBM on 45 categories and score it on a test set holding 50.

    The training set has only categories cat1 through cat45, while the test
    set has all of cat1 through cat50, so prediction and the performance
    metrics are exercised on levels the model never saw.  Nothing is
    asserted.
    """
    train_frame = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train_frame["y"] = train_frame["y"].asfactor()

    # H2O GBM with ntrees = 10, max_depth = 5, nbins = 20
    gbm = h2o.gbm(x=train_frame[["x1", "x2"]],
                  y=train_frame["y"],
                  distribution="bernoulli",
                  ntrees=10,
                  max_depth=5,
                  nbins=20)
    gbm.show()

    test_frame = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Predict on the test dataset with the GBM model
    scored = gbm.predict(test_frame)
    scored.show()

    # Fetch the confusion matrix and AUC; the values themselves are not checked.
    test_perf = gbm.model_performance(test_frame)
    test_perf.confusion_matrix()
    test_perf.auc()
# Example #11
# 0
def pubdev_1953():
    """Fit a poisson GLM for bike counts on the small citibike train/test files.

    The model is built with a validation set and its result is not inspected;
    the test passes when the build does not raise.
    """
    # The commented-out recipe below is kept for reference; it appears to be
    # how the small train/test files were derived from the full data.
    #
    # small_test = [tests.locate("bigdata/laptop/citibike-nyc/2013-10.csv")]
    # data = h2o.import_file(path=small_test)
    # startime = data["starttime"]
    # secsPerDay=1000*60*60*24
    # data["Days"] = (startime/secsPerDay).floor()
    # grouped = data.group_by(["Days","start station name"])
    # bpd = grouped.count(name="bikes").get_frame()
    # secs = bpd["Days"]*secsPerDay
    # bpd["Month"]     = secs.month().asfactor()
    # bpd["DayOfWeek"] = secs.dayOfWeek()
    # wthr1 = h2o.import_file(path=[tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"), tests.locate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
    # wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
    # wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
    # wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
    # wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
    # wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
    # secsPerDay=1000*60*60*24
    # wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
    # wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
    # rain = wthr4["Rain (mm)"]
    # rain[ rain.isna() ] = 0
    # bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
    # r = bpd_with_weather['Days'].runif(seed=356964763)
    # train = bpd_with_weather[  r  < 0.6]
    # test  = bpd_with_weather[(0.6 <= r) & (r < 0.9)]

    feature_cols = [
        'DayOfWeek',
        'WC1',
        'start station name',
        'Temperature (C)',
        'Days',
        'Month',
        'Humidity Fraction',
        'Rain (mm)',
        'Dew Point (C)',
    ]

    train_frame = h2o.import_file(tests.locate("smalldata/glm_test/citibike_small_train.csv"))
    test_frame = h2o.import_file(tests.locate("smalldata/glm_test/citibike_small_test.csv"))

    h2o.glm(x=train_frame[feature_cols],
            y=train_frame["bikes"],
            validation_x=test_frame[feature_cols],
            validation_y=test_frame["bikes"],
            family="poisson")
def frame_as_list():
    """Check h2o.as_list (without pandas) against known cell values.

    One row from each of the iris, prostate and airlines frames is spot
    checked: its first three cells, converted with float(), must match the
    expected numbers to within 1e-10.

    Raises:
        AssertionError: with message "incorrect values" on any mismatch.
    """
    def leading_cells_match(row, expected):
        # Compare cell by cell, stopping at the first mismatch.
        return all(abs(float(row[i]) - want) < 1e-10 for i, want in enumerate(expected))

    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k.zip"))

    iris_rows = h2o.as_list(iris, use_pandas=False)
    assert leading_cells_match(iris_rows[9], (4.4, 2.9, 1.4)), "incorrect values"

    prostate_rows = h2o.as_list(prostate, use_pandas=False)
    assert leading_cells_match(prostate_rows[7], (7, 0, 68)), "incorrect values"

    airlines_rows = h2o.as_list(airlines, use_pandas=False)
    assert leading_cells_match(airlines_rows[4], (1987, 10, 18)), "incorrect values"
def checkpoint_new_category_in_predictor():
    """Check that a checkpoint restart rejects new categories in a predictor.

    A Deep Learning model is trained on the setosa/versicolor data and
    continued from its checkpoint on a second copy of the same data.
    Continuing from the checkpoint on the virginica data is expected to fail
    with EnvironmentError, while predicting on the virginica data with the
    continued model is expected to work.

    Raises:
        AssertionError: if the restart on the virginica data does not fail.
    """
    sv1 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    sv2 = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    vir = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    predictor_cols = [0, 1, 2, 4]

    base_model = h2o.deeplearning(x=sv1[predictor_cols], y=sv1[3], epochs=100)

    continued_model = h2o.deeplearning(x=sv2[predictor_cols],
                                       y=sv2[3],
                                       epochs=200,
                                       checkpoint=base_model.model_id)

    # Continuing with an expanded categorical predictor domain should fail.
    try:
        h2o.deeplearning(x=vir[predictor_cols],
                         y=vir[3],
                         epochs=200,
                         checkpoint=base_model.model_id)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"

    # Predicting on observations with the expanded domain must still run.
    continued_model.predict(vir)
def wide_dataset_large():
    """Fit a binomial lasso GLM on the wide Arcene data and check its AUC.

    Labels of -1 are recoded to 0 and everything else to 1; the label column
    is placed first and the features after it.  The model is trained on
    columns 1..3249 and must beat an AUC of 0.5 on the validation set.

    Raises:
        AssertionError: if the validation AUC is not above 0.5.
    """
    def load_arcene(labels_file, data_file):
        # Build an H2OFrame with the 0/1 response in column 0.
        labels = np.genfromtxt(tests.locate(labels_file), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(tests.locate(data_file), delimiter=' ')
        return h2o.H2OFrame(np.column_stack((labels, features)).tolist())

    print("Reading in Arcene training data for binomial modeling.")
    train_frame = load_arcene("smalldata/arcene/arcene_train_labels.labels",
                              "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    glm_model = h2o.glm(x=train_frame[1:3250],
                        y=train_frame[0].asfactor(),
                        family="binomial",
                        lambda_search=False,
                        alpha=[1])

    print("Test model on validation set.")
    valid_frame = load_arcene("smalldata/arcene/arcene_valid_labels.labels",
                              "smalldata/arcene/arcene_valid.data")
    glm_model.predict(valid_frame)

    print("Check performance of predictions.")
    valid_perf = glm_model.model_performance(valid_frame)

    print("Check that prediction AUC better than guessing (0.5).")
    assert valid_perf.auc() > 0.5, "predictions should be better then pure chance"
def group_by():
    """Smoke-test chained group-by aggregations on iris for each NA mode.

    The frame is grouped by ``class`` once per NA-handling option
    ("ignore", "rm", "all"); count, min, max, mean, var, sd, ss and sum are
    chained onto the group-by object and the resulting frame is printed.
    Nothing is asserted; the test passes when no call raises.
    """
    h2o_iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Running smoke test")

    na_handling = ["ignore", "rm", "all"]
    for na in na_handling:
        grouped = h2o_iris.group_by("class")
        # The chain's return value is not kept: the frame is fetched from
        # `grouped` afterwards.
        (grouped
            .count(na=na)
            .min(na=na)
            .max(na=na)
            .mean(na=na)
            .var(na=na)
            .sd(na=na)
            .ss(na=na)
            .sum(na=na))
        print(grouped.get_frame())
# Example #16
# 0
def anomaly():
    """Run a Deep Learning autoencoder on MNIST for anomaly detection.

    An autoencoder with one 50-unit hidden layer is trained on columns
    0..783 of the training file, then used to compute per-row reconstruction
    errors and reconstructions for the test file.  Nothing is asserted.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Deep Learning Anomaly Detection MNIST")

    train = h2o.import_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(tests.locate("bigdata/laptop/mnist/test.csv.gz"))

    # A real list of column indices on both Python 2 and 3.  Column 784 is
    # the response (digit: 0-9) and is deliberately left out.
    predictors = list(range(0, 784))

    # unsupervised -> drop the response column
    train = train[predictors]
    test = test[predictors]

    # 1) LEARN WHAT'S NORMAL
    # train unsupervised Deep Learning autoencoder model on the training frame
    ae_model = h2o.deeplearning(x=train[predictors], training_frame=train, activation="Tanh", autoencoder=True,
                                hidden=[50], l1=1e-5, ignore_const_cols=False, epochs=1)

    # 2) DETECT OUTLIERS
    # anomaly() computes the per-row reconstruction error for the test data set
    # (passing it through the autoencoder model and computing mean square error (MSE) for each row)
    test_rec_error = ae_model.anomaly(test)

    # 3) VISUALIZE OUTLIERS
    # Convert the test data into its autoencoded representation (pass it
    # through the narrow neural net); the reconstruction is not inspected here.
    test_recon = ae_model.predict(test)
def iris_h2o_vs_sciKmeans():
    """Check that H2O and scikit-learn k-means find the same iris centers.

    Both libraries start from the same three user-supplied points and
    cluster the first four iris columns without standardization.

    Raises:
        AssertionError: if any coordinate of a center differs by 1e-10 or
            more between the two libraries.
    """
    iris_h2o = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(tests.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    # Shared starting centers, so the two runs are comparable.
    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]

    start = h2o.H2OFrame(s)

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # abs() is required: without it any H2O coordinate smaller than
            # the scikit one passes no matter how far apart they are.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
# Example #18
# 0
def javapredict_cars():
    """Run the javapredict check for a numeric-response GBM on the cars data.

    The GBM parameters are fixed except ``balance_classes``, which is drawn
    at random.  The same file is imported as both the training and the test
    frame, and everything is handed to ``tests.javapredict``.
    """
    # optional parameters
    params = {
        'ntrees': 5000,
        'max_depth': 10,
        'min_rows': 1,
        'learn_rate': 0.1,
        'balance_classes': random.sample([True, False], 1)[0]
    }
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Parameter list:")
    for k, v in params.items():
        print("{0}, {1}".format(k, v))

    train = h2o.import_file(
        tests.locate("smalldata/junit/cars_nice_header.csv"))
    test = h2o.import_file(
        tests.locate("smalldata/junit/cars_nice_header.csv"))
    x = [
        "name", "economy", "displacement", "power", "weight", "acceleration",
        "year"
    ]
    y = "cylinders"

    tests.javapredict("gbm", "numeric", train, test, x, y, **params)
def frame_slicing():
    """Check H2OFrame indexing with int / slice combinations against known cells.

    Covers ``frame[int]``, ``frame[int, int]``, ``frame[int, slice]``,
    ``frame[slice, int]`` and ``frame[slice, slice]`` on the iris, prostate
    and airlines frames.

    Raises:
        AssertionError: with message "incorrect values" on any mismatch.
    """
    def near(value, target):
        return abs(value - target) < 1e-10

    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k.zip"))
    for frame in (iris, prostate, airlines):
        frame.show()

    # H2OFrame[int] (column slice)
    first_col = iris[0]
    assert near(first_col[8, :], 4.4), "incorrect values"

    # H2OFrame[int,int]
    single_cell = prostate[13, 3]
    assert near(single_cell, 1), "incorrect values"

    # H2OFrame[int, slice]
    row_part = airlines[12, 0:3]
    for col, want in enumerate((1987, 10, 29)):
        assert near(row_part[0, col], want), \
            "incorrect values"

    # H2OFrame[slice, int]
    col_part = iris[5:8, 1]
    for row, want in enumerate((3.9, 3.4, 3.4)):
        assert near(col_part[row, :], want), "incorrect values"

    # H2OFrame[slice, slice]: the diagonal of the 3 x 3 result is checked
    sub_frame = prostate[5:8, 0:3]
    for idx, want in enumerate((6, 0, 61)):
        assert near(sub_frame[idx, idx], want), "incorrect values"
def link_functions_gaussian():
    """Compare H2O and statsmodels gaussian GLMs (identity link) on the prostate data.

    H2O is fitted on the named columns in ``myX`` against ``GLEASON``;
    statsmodels is fitted on matrix column 9 against columns 1-8
    (presumably the same variables -- confirm against the CSV header).

    Raises:
        AssertionError: if H2O's residual/null deviance ratio exceeds the
            statsmodels ratio by 0.01 or more.
    """
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=tests.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    # Context managers close both the archive and the member handle once the
    # CSV is parsed.  `.values` replaces DataFrame.as_matrix(), which was
    # deprecated in pandas 0.23 and removed in 1.0.
    with zipfile.ZipFile(tests.locate("smalldata/prostate/prostate_complete.csv.zip")) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity",
                        alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    # One-sided on purpose: H2O may be better than statsmodels, just not worse.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def fiftycatRF():
    """Train a random forest on 45 categories and score it on a test set holding 50.

    The training set has only categories cat1 through cat45, while the test
    set has all of cat1 through cat50.  Predictions, the performance summary
    and the confusion matrix are shown; nothing is asserted.
    """
    train_frame = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train_frame["y"] = train_frame["y"].asfactor()

    # H2O DRF with ntrees = 50, max_depth = 20, nbins = 500
    forest = h2o.random_forest(x=train_frame[["x1", "x2"]],
                               y=train_frame["y"],
                               ntrees=50,
                               max_depth=20,
                               nbins=500)

    test_frame = h2o.import_file(path=tests.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Predict on the test dataset with the DRF model
    scored = forest.predict(test_frame)
    scored.head()

    # Show the performance summary, then print the confusion matrix
    test_perf = forest.model_performance(test_frame)
    test_perf.show()
    print(test_perf.confusion_matrix())
def offsets_and_distributions():
    """Build Deep Learning models with an offset column for several distributions.

    The insurance data gets an ``offset`` column equal to log(Holders);
    gamma, gaussian, poisson and tweedie models are trained with it and each
    is asked to predict on the same frame.  Nothing is asserted.
    """
    # cars: rows with a missing economy_20mpg are dropped, the response is
    # made categorical and a constant 0.5 column "x1" of 398 rows is bound on.
    cars = h2o.upload_file(tests.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    constant_offset = h2o.H2OFrame(python_obj=[[.5] for _ in range(398)])
    constant_offset.set_name(0, "x1")
    cars = cars.cbind(constant_offset)

    # insurance
    insurance = h2o.import_file(tests.locate("smalldata/glm_test/insurance.csv"))
    insurance["offset"] = insurance["Holders"].log()

    # bernoulli - offset not supported
    #dl = h2o.deeplearning(x=cars[2:8], y=cars["economy_20mpg"], distribution="bernoulli", offset_column="x1",
    #                       training_frame=cars)
    #predictions = dl.predict(cars)

    # gamma, gaussian and poisson: predictors and response passed as frames
    for family in ("gamma", "gaussian", "poisson"):
        model = h2o.deeplearning(x=insurance[0:3],
                                 y=insurance["Claims"],
                                 distribution=family,
                                 offset_column="offset",
                                 training_frame=insurance)
        model.predict(insurance)

    # tweedie: predictors and response passed by name instead
    model = h2o.deeplearning(x=insurance.names[0:3],
                             y="Claims",
                             distribution="tweedie",
                             offset_column="offset",
                             training_frame=insurance)
    model.predict(insurance)
# Example #23
# 0
def col_names_check():
    """Check column names and types of imported and Python-built frames.

    Covers a CSV with a header, a CSV without one (names C1..C5), a frame
    built from Python data with explicit names and "Enum" types, and a frame
    built from Python data with defaults (names C1..C4, type "Numeric").

    Raises:
        AssertionError: if any frame reports unexpected names or types.
    """
    # CSV with a header row: names come from the file
    header_names = ["sepal_len","sepal_wid","petal_len","petal_wid","class"]
    iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == header_names, \
        "Expected {0} for column names but got {1}".format(header_names, iris_wheader.col_names)

    # CSV without a header row: default C1..C5
    default_names5 = ["C1","C2","C3","C4","C5"]
    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == default_names5, \
        "Expected {0} for column names but got {1}".format(default_names5, iris.col_names)

    # Python data with explicit names and types
    named = list("ABCD")
    enum_types = {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}
    df = h2o.H2OFrame(np.random.randn(100,4).tolist(), column_names=list("ABCD"), column_types=["Enum"]*4)
    df.head()
    assert df.col_names == named, "Expected {} for column names but got {}".format(named, df.col_names)
    assert df.types == enum_types, \
        "Expected {} for column types but got {}".format(enum_types, df.types)

    # Python data with default names and types
    default_names4 = ["C1","C2","C3","C4"]
    numeric_types = {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}
    df = h2o.H2OFrame(np.random.randn(100,4).tolist())
    df.head()
    assert df.col_names == default_names4, \
        "Expected {} for column names but got {}".format(default_names4, df.col_names)
    assert df.types == numeric_types, \
        "Expected {} for column types but got {}".format(numeric_types, df.types)
# Example #24
# 0
def hit_ratio_test():
    """Show the hit-ratio tables of a multinomial GBM on the airlines data.

    The model predicts ``fDayOfWeek`` (as a factor) and is built with a
    validation set.  Hit-ratio tables are shown for training, validation and
    a separately imported test frame; the validation and test frames are
    imported from the same file.  Nothing is asserted.
    """
    air_train = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    air_valid = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    air_test = h2o.import_file(
        path=tests.locate("smalldata/airlines/AirlinesTest.csv.zip"))

    predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "IsDepDelayed",
        "fDayofMonth", "fMonth"
    ]
    response = "fDayOfWeek"

    gbm_mult = h2o.gbm(x=air_train[predictors],
                       y=air_train[response].asfactor(),
                       validation_x=air_valid[predictors],
                       validation_y=air_valid[response].asfactor(),
                       distribution="multinomial")

    # Training and validation tables come from the model itself ...
    gbm_mult.hit_ratio_table(train=True).show()
    gbm_mult.hit_ratio_table(valid=True).show()

    # ... the test table from a fresh performance object.
    test_perf = gbm_mult.model_performance(air_test)
    test_perf.hit_ratio_table().show()
# Example #25
# 0
def iris_h2o_vs_sciKmeans():
    """Check that H2O and scikit-learn k-means find the same iris centers.

    Both libraries start from the same three user-supplied points and
    cluster the first four iris columns without standardization.

    Raises:
        AssertionError: if any coordinate of a center differs by 1e-10 or
            more between the two libraries.
    """
    iris_h2o = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(tests.locate("smalldata/iris/iris.csv"),
                             delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    # Shared starting centers, so the two runs are comparable.
    s = [[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]]

    start = h2o.H2OFrame(s)

    h2o_km = h2o.kmeans(x=iris_h2o[0:4],
                        k=3,
                        user_points=start,
                        standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # abs() is required: without it any H2O coordinate smaller than
            # the scikit one passes no matter how far apart they are.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
# Example #26
# 0
def separator():
    """Check import_file's ``sep`` argument with right, wrong and invalid values.

    With "," no cell may be missing; with ";" every cell is expected to be
    missing; with "--" the import is expected to raise ValueError.

    Raises:
        AssertionError: if a missing count is off or "--" is accepted.
    """
    def total_missing(frame):
        # Sum the per-column missing counts from the frame's REST summary.
        columns = h2o.H2OConnection.get_json(
            "Frames/" + urllib.quote(frame._id) +
            "/summary")["frames"][0]["columns"]
        return sum([column["missing_count"] for column in columns])

    path = "smalldata/jira/hexdev_29.csv"

    # Correct separator: nothing missing
    fhex = h2o.import_file(tests.locate(path), sep=",")
    fhex.summary()
    assert total_missing(fhex) == 0

    # Wrong separator: every cell missing
    fhex_wrong_separator = h2o.import_file(tests.locate(path), sep=";")
    fhex_wrong_separator.summary()
    expected_missing = fhex_wrong_separator._nrows * fhex_wrong_separator._ncols
    assert total_missing(fhex_wrong_separator) == expected_missing

    # Invalid separator: must be rejected
    rejected = False
    try:
        h2o.import_file(tests.locate(path), sep="--")
    except ValueError:
        rejected = True
    assert rejected
# Example #27
# 0
def test_locate():
    """Check that locating a file that does not exist raises ValueError.

    Raises:
        AssertionError: if ``tests.locate`` returns instead of raising.
    """
    # Locating an existing file must simply succeed; the path is not used.
    # NOTE(review): this call goes through h2o.locate while the failing case
    # below uses tests.locate -- confirm that the difference is intended.
    h2o.locate("smalldata/iris/iris.csv")

    try:
        tests.locate("smalldata/iris/afilethatdoesnotexist.csv")
    except ValueError:
        pass
    else:
        assert False, "Expected h2o.locate to raise a ValueError"
def test_locate():
    """Verify that a missing file makes the locate helper raise ValueError.

    Raises:
        AssertionError: if ``tests.locate`` returns for the missing file.
    """
    # Sanity call on a file that exists; its result is not needed.
    # NOTE(review): h2o.locate here versus tests.locate below -- confirm
    # that mixing the two helpers is intended.
    h2o.locate("smalldata/iris/iris.csv")

    raised = False
    try:
        tests.locate("smalldata/iris/afilethatdoesnotexist.csv")
    except ValueError:
        raised = True
    if not raised:
        assert False, "Expected h2o.locate to raise a ValueError"
def get_model_test():
    """Check that ``h2o.get_model`` retrieves models that behave like the originals.

    Regression, binomial and multinomial models are built on a random ~70%
    split of the prostate data. For each, the model fetched by id must report
    the expected model category and give the same first prediction column as
    the original model on the remaining rows. A k-means model built on the
    benign data is only checked for its model category.
    """

    def assert_same_predictions(original, fetched, template):
        # Row-by-row comparison of the first prediction column.
        for row in range(original.nrow):
            from_original = original[row, 0]
            from_fetched = fetched[row, 0]
            assert from_original == from_fetched, template.format(row, from_original, from_fetched)

    def category_of(model):
        return model._model_json['output']['model_category']

    prostate = h2o.import_file(path=tests.locate("smalldata/logreg/prostate.csv"))

    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    # Regression
    gbm_regression = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    original_preds = gbm_regression.predict(test)

    gbm_regression_copy = h2o.get_model(gbm_regression._id)
    assert category_of(gbm_regression_copy) == "Regression"
    fetched_preds = gbm_regression_copy.predict(test)

    assert_same_predictions(
        original_preds, fetched_preds,
        "expected regression predictions to be the same for row {}, but got {} and {}")

    # Binomial
    train[1] = train[1].asfactor()
    gbm_binomial = h2o.gbm(y=train[1], x=train[2:], distribution="bernoulli")
    original_preds = gbm_binomial.predict(test)

    gbm_binomial_copy = h2o.get_model(gbm_binomial._id)
    assert category_of(gbm_binomial_copy) == "Binomial"
    fetched_preds = gbm_binomial_copy.predict(test)

    assert_same_predictions(
        original_preds, fetched_preds,
        "expected binomial predictions to be the same for row {}, but got {} and {}")

    # Clustering
    benign = h2o.import_file(path=tests.locate("smalldata/logreg/benign.csv"))
    kmeans_model = h2o.kmeans(x=benign, k=3)
    kmeans_copy = h2o.get_model(kmeans_model._id)
    assert category_of(kmeans_copy) == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    dl_multinomial = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    original_preds = dl_multinomial.predict(test)

    dl_multinomial_copy = h2o.get_model(dl_multinomial._id)
    assert category_of(dl_multinomial_copy) == "Multinomial"
    fetched_preds = dl_multinomial_copy.predict(test)

    assert_same_predictions(
        original_preds, fetched_preds,
        "expected multinomial predictions to be the same for row {0}, but got {1} and {2}")
Пример #30
0
def bernoulliGBM():
    """Compare an H2O bernoulli GBM with scikit's GradientBoostingClassifier.

    Both models are trained on prostate_train.csv with the same number of
    trees, learning rate, depth and minimum leaf size, then scored on
    prostate_test.csv. The test asserts that the H2O AUC is at least the
    scikit AUC.
    """

    def load_for_scikit(relative_path):
        # Column 0 is used as the response, the remaining columns as features.
        matrix = np.loadtxt(tests.locate(relative_path), delimiter=',', skiprows=1)
        return matrix[:, 1:], matrix[:, 0]

    # H2O training frame; the CAPSULE column is converted to a factor
    h2o_train = h2o.import_file(path=tests.locate("smalldata/logreg/prostate_train.csv"))
    h2o_train["CAPSULE"] = h2o_train["CAPSULE"].asfactor()

    # Same training data as a numpy array for the scikit comparison
    train_features, train_response = load_for_scikit("smalldata/logreg/prostate_train.csv")

    # Hyper-parameters shared by both implementations
    n_trees = 100
    shrinkage = 0.1
    tree_depth = 5
    leaf_min = 10

    # H2O GBM classification model
    h2o_model = h2o.gbm(x=h2o_train[1:], y=h2o_train["CAPSULE"], ntrees=n_trees, learn_rate=shrinkage,
                        max_depth=tree_depth, min_rows=leaf_min, distribution="bernoulli")

    # scikit GBM classification model
    sci_model = ensemble.GradientBoostingClassifier(learning_rate=shrinkage, n_estimators=n_trees,
                                                    max_depth=tree_depth, min_samples_leaf=leaf_min,
                                                    max_features=None)
    sci_model.fit(train_features, train_response)

    # H2O test frame; the CAPSULE column is converted to a factor
    h2o_test = h2o.import_file(path=tests.locate("smalldata/logreg/prostate_test.csv"))
    h2o_test["CAPSULE"] = h2o_test["CAPSULE"].asfactor()

    # Same test data as a numpy array for the scikit comparison
    test_features, test_response = load_for_scikit("smalldata/logreg/prostate_test.csv")

    # Score both models on the test data
    sci_auc = roc_auc_score(test_response, sci_model.predict_proba(test_features)[:, 1])
    h2o_auc = h2o_model.model_performance(h2o_test).auc()

    assert h2o_auc >= sci_auc, "h2o (auc) performance degradation, with respect to scikit"
def pubdev_1839():
    """Build a Poisson GLM with a validation set on the pubdev_1839 repro data.

    There is no assertion; the test passes as long as model building does
    not raise. The response column is "bikes", all other columns are used as
    predictors.
    """
    train_frame = h2o.import_file(tests.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test_frame = h2o.import_file(tests.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    response = "bikes"
    h2o.glm(x=train_frame.drop(response),
            y=train_frame[response],
            validation_x=test_frame.drop(response),
            validation_y=test_frame[response],
            family="poisson")
def deeplearning_autoencoder():
    """Train an autoencoder on MNIST and a random forest on its deep features.

    Half of the training rows (seeded runif split) train a single-hidden-layer
    autoencoder; the other half is projected through it and used to train a
    DRF. The DRF confusion matrix on the projected test set must have
    ``cell_values[10][10]`` within 0.001 of 0.081.
    """
    response_idx = 784
    n_hidden = 20  # number of features (smallest hidden layer)

    mnist_train = h2o.upload_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_train[response_idx] = mnist_train[response_idx].asfactor()

    mnist_test = h2o.upload_file(tests.locate("bigdata/laptop/mnist/test.csv.gz"))
    mnist_test[response_idx] = mnist_test[response_idx].asfactor()

    # Seeded uniform column used to split the training data in two
    split = mnist_train[0].runif(1234)

    # Rows for the (unsupervised) autoencoder
    unlabeled = mnist_train[split >= 0.5]
    # NOTE(review): the result of drop() is discarded here; the response is
    # excluded below by slicing [0:response_idx] -- confirm whether drop() is
    # meant to act in place.
    unlabeled.drop(response_idx)
    unlabeled.describe()

    # Rows for the (supervised) random forest
    labeled = mnist_train[split < 0.5]
    labeled.describe()

    # Train the autoencoder
    autoencoder = h2o.deeplearning(
        x=unlabeled[0:response_idx],
        activation="Tanh",
        autoencoder=True,
        hidden=[n_hidden],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234)

    # Project the supervised rows into the lower-dimensional feature space
    labeled_features = autoencoder.deepfeatures(labeled[0:response_idx]._frame(), 0)

    assert labeled_features.ncol == n_hidden, "Dimensionality of reconstruction is wrong!"

    # Train DRF on the extracted features
    forest = h2o.random_forest(
        x=labeled_features[0:n_hidden],
        y=labeled[response_idx],
        ntrees=10,
        min_rows=10,
        seed=1234)

    # Project the test set the same way and re-attach its response column
    projected_test = autoencoder.deepfeatures(mnist_test[0:response_idx]._frame(), 0)
    projected_test = projected_test.cbind(mnist_test[response_idx])._frame()

    # Confusion matrix and assertion
    cm = forest.confusion_matrix(projected_test)
    cm.show()

    # Expected value 0.081 +/- 0.001 (presumably the overall error cell -- TODO confirm)
    observed = cm.cell_values[10][10]
    assert abs(observed - 0.081) < 0.001, "Error. Expected 0.081, but got {0}".format(observed)
def deeplearning_autoencoder():
    """Train an autoencoder on MNIST and a random forest on its deep features.

    Half of the training rows (seeded runif split) train a single-hidden-layer
    autoencoder; the other half is projected through it and used to train a
    DRF. The DRF confusion matrix on the projected test set must have
    ``cell_values[10][10]`` within 0.001 of 0.082.
    """
    response_idx = 784
    n_hidden = 20  # number of features (smallest hidden layer)

    mnist_train = h2o.upload_file(tests.locate("bigdata/laptop/mnist/train.csv.gz"))
    mnist_train[response_idx] = mnist_train[response_idx].asfactor()

    mnist_test = h2o.upload_file(tests.locate("bigdata/laptop/mnist/test.csv.gz"))
    mnist_test[response_idx] = mnist_test[response_idx].asfactor()

    # Seeded uniform column used to split the training data in two
    split = mnist_train[0].runif(1234)

    # Rows for the (unsupervised) autoencoder
    unlabeled = mnist_train[split >= 0.5]
    # NOTE(review): the result of drop() is discarded here; the response is
    # excluded below by slicing [0:response_idx] -- confirm whether drop() is
    # meant to act in place.
    unlabeled.drop(response_idx)
    unlabeled.describe()

    # Rows for the (supervised) random forest
    labeled = mnist_train[split < 0.5]
    labeled.describe()

    # Train the autoencoder
    autoencoder = h2o.deeplearning(
        x=unlabeled[0:response_idx],
        activation="Tanh",
        autoencoder=True,
        hidden=[n_hidden],
        epochs=1,
        reproducible=True,  # slow, turn off for real problems
        seed=1234)

    # Project the supervised rows into the lower-dimensional feature space
    labeled_features = autoencoder.deepfeatures(labeled[0:response_idx]._frame(), 0)

    assert labeled_features.ncol == n_hidden, "Dimensionality of reconstruction is wrong!"

    # Train DRF on the extracted features
    forest = h2o.random_forest(
        x=labeled_features[0:n_hidden],
        y=labeled[response_idx],
        ntrees=10,
        min_rows=10,
        seed=1234)

    # Project the test set the same way and re-attach its response column
    projected_test = autoencoder.deepfeatures(mnist_test[0:response_idx]._frame(), 0)
    projected_test = projected_test.cbind(mnist_test[response_idx])._frame()

    # Confusion matrix and assertion
    cm = forest.confusion_matrix(projected_test)
    cm.show()

    # Expected value 0.082 +/- 0.001 (presumably the overall error cell -- TODO confirm)
    observed = cm.cell_values[10][10]
    assert abs(observed - 0.082) < 0.001, "Error. Expected 0.082, but got {0}".format(observed)
Пример #34
0
def shuffling_large():
    """Check that binomial GLM coefficients on Arcene are reproducible.

    Three models are built with identical settings (lambda search,
    alpha = 0.5): two on the original training file and one on the shuffled
    version of it. The coefficient table of the first model is compared with
    each of the other two: the tables must have the same number of rows and,
    row by row, columns 1 and 2 must have the same type and, when they are
    floats, agree to within 5e-10.
    """

    def build_glm(frame):
        # Columns 0..999 are the predictors, column 1000 is the response.
        return h2o.glm(x=frame[0:1000],
                       y=frame[1000],
                       family="binomial",
                       lambda_search=True,
                       alpha=[0.5])

    def assert_same_coefficients(reference, candidate):
        expected_rows = reference._model_json['output']['coefficients_table'].cell_values
        actual_rows = candidate._model_json['output']['coefficients_table'].cell_values

        # zip() stops at the shorter table, so without this check surplus rows
        # in either table would be ignored silently.
        assert len(expected_rows) == len(actual_rows), \
            "coefficient tables should have the same number of rows"

        for x, y in zip(expected_rows, actual_rows):
            assert (type(x[1]) == type(y[1])) and (type(x[2]) == type(y[2])), \
                "coefficients should be the same type"
            for column in (1, 2):
                if isinstance(x[column], float):
                    assert abs(x[column] - y[column]) < 5e-10, "coefficients should be equal"

    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(
        path=tests.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=tests.locate(
        "smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = build_glm(train_data)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = build_glm(train_data)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = build_glm(train_data_shuffled)

    print(
        "Assert that number of predictors remaining and their respective coefficients are equal."
    )

    assert_same_coefficients(h2o_model, h2o_model_2)
    assert_same_coefficients(h2o_model, h2o_model_s)
Пример #35
0
def headers():
    """Check that column names can be copied from a header-only file.

    The names of the header-only airlines frame are applied to the full
    airlines frame with ``set_names``; afterwards both frames must report the
    same names.
    """
    header_only = h2o.import_file(tests.locate("smalldata/airlines/allyears2k_headers_only.csv"))
    full_data = h2o.import_file(tests.locate("smalldata/airlines/allyears2k.zip"))

    full_data.set_names(header_only.names)

    print(header_only.names)
    print(full_data.names)

    mismatch = "Expected the same column names but got {0} and {1}"
    assert header_only.names == full_data.names, mismatch.format(header_only.names, full_data.names)
def frame_show():
    """Import the iris, prostate and airlines data sets and call ``show()`` on each.

    There is no assertion; the test passes if none of the calls raises.
    """
    data_files = (
        "smalldata/iris/iris_wheader.csv",
        "smalldata/prostate/prostate.csv.zip",
        "smalldata/airlines/allyears2k.zip",
    )

    # Import everything first, then display, in the same order.
    frames = [h2o.import_file(path=tests.locate(data_file)) for data_file in data_files]
    for frame in frames:
        frame.show()
Пример #37
0
def runif_check():
    """Print the mean of a seeded ``runif`` column for an uploaded and an imported frame.

    The same MNIST file is loaded once with ``upload_file`` and once with
    ``import_file``; ``runif(1234)`` is taken on column 0 of each and the two
    means are printed. Nothing is asserted.
    """
    mnist_file = "bigdata/laptop/mnist/train.csv.gz"

    uploaded = h2o.upload_file(tests.locate(mnist_file))
    uploaded_runif = uploaded[0].runif(1234)

    imported = h2o.import_file(tests.locate(mnist_file))
    imported_runif = imported[0].runif(1234)

    report = ("This demonstrates that seeding runif on identical frames with different chunk distributions provides "
              "different results. upload_file: {0}, import_frame: {1}.")
    print(report.format(uploaded_runif.mean(), imported_runif.mean()))
Пример #38
0
def frame_as_list():
    """Apply the ``%`` operator to H2O frames and columns and show the results.

    Covers a whole frame (prostate % 10), a column selected by index
    (prostate[4] % 10) and a column selected by name
    (airlines["CRSArrTime"] % 100). Nothing is asserted.
    """
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))

    frame_mod = prostate % 10
    frame_mod.show()

    column_mod = prostate[4] % 10
    column_mod.show()

    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k_headers.zip"))

    arrival_mod = airlines["CRSArrTime"] % 100
    arrival_mod.show()
Пример #39
0
def pubdev_1839():
    """Build a Poisson GLM with a validation set on the pubdev_1839 repro data.

    Nothing is asserted; the test passes when model building completes
    without raising. "bikes" is the response and every other column is a
    predictor.
    """
    response = "bikes"

    train_frame = h2o.import_file(tests.locate("smalldata/jira/pubdev_1839_repro_train.csv"))
    test_frame = h2o.import_file(tests.locate("smalldata/jira/pubdev_1839_repro_test.csv"))

    h2o.glm(
        x=train_frame.drop(response),
        y=train_frame[response],
        validation_x=test_frame.drop(response),
        validation_y=test_frame[response],
        family="poisson",
    )
Пример #40
0
def frame_show():
    """Import the iris, prostate and airlines data sets and call ``show()`` on each.

    Nothing is asserted; the test passes if no call raises.
    """
    def load(relative_path):
        return h2o.import_file(path=tests.locate(relative_path))

    # All three imports happen before the first show(), as in the original flow.
    loaded = (
        load("smalldata/iris/iris_wheader.csv"),
        load("smalldata/prostate/prostate.csv.zip"),
        load("smalldata/airlines/allyears2k.zip"),
    )

    for frame in loaded:
        frame.show()
def col_names_check():
    """Check ``col_names`` for iris imported with and without a header row.

    The file with a header must yield the header names; the file without one
    must yield the names C1..C5.
    """
    message = "Expected {0} for column names but got {1}"

    # File that carries its own header
    named = ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"]
    iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == named, message.format(named, iris_wheader.col_names)

    # File without a header
    generated = ["C1", "C2", "C3", "C4", "C5"]
    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == generated, message.format(generated, iris.col_names)
def kmeans_mllib():
    """Compare H2O k-means on BigCross with stored MLlib reference results.

    The test only does work when ``tests.is_running_internal_to_h2o()`` is
    true, because the data is read from the internal HDFS name node. For each
    cluster count listed in bigcross_wcsse.csv, a k-means++ model (10
    iterations, no standardization) is built, the sorted cluster centers of
    both implementations are printed, and the average within-cluster SSE of
    H2O is asserted to equal the MLlib reference value.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not tests.is_running_internal_to_h2o():
        return

    hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
    hdfs_cross_file = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
    cross_h2o = h2o.import_file(url)
    n = cross_h2o.nrow

    # Each reference row holds the cluster count (column 0) and the MLlib
    # within-cluster SSE (column 1).
    err_mllib = np.genfromtxt(
        tests.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
        delimiter=",",
        skip_header=1)

    # Iterate over the reference rows directly instead of looking k up in a
    # hard-coded slice of the first four rows, so any number of rows works.
    for reference_row in err_mllib:
        k = int(reference_row[0])
        wcsse_mllib = reference_row[1]

        print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
        cross_km = h2o.kmeans(training_frame=cross_h2o,
                              x=cross_h2o,
                              k=k,
                              init="PlusPlus",
                              max_iterations=10,
                              standardize=False)

        clust_mllib = np.genfromtxt(
            tests.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) + ".csv"),
            delimiter=",").tolist()
        clust_h2o = cross_km.centers()

        # Sort in ascending order by first dimension for comparison purposes
        clust_mllib.sort(key=lambda center: center[0])
        clust_h2o.sort(key=lambda center: center[0])

        print("\nMLlib Cluster Centers:\n")
        print(clust_mllib)
        print("\nH2O Cluster Centers:\n")
        print(clust_h2o)

        wcsse_h2o = cross_km.tot_withinss() / n
        # The format strings previously had no placeholder, so the values
        # passed to format() were never printed.
        print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
        print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))

        # NOTE(review): exact float equality is kept from the original test;
        # confirm whether a tolerance would be more appropriate.
        assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                         "got {1}".format(wcsse_mllib, wcsse_h2o)
def pubdev_1829():
    """Check that a GBM resumed from a checkpoint matches one trained from scratch.

    A 5-tree bernoulli GBM is built first. A second model continues from that
    checkpoint up to 10 trees, and a third is trained to 10 trees directly
    with otherwise identical settings. The validation AUC, Gini coefficient
    and log loss of the resumed model and the from-scratch model must be
    exactly equal.
    """
    train = h2o.import_file(path=tests.locate("smalldata/jira/gbm_checkpoint_train.csv"))
    valid = h2o.import_file(path=tests.locate("smalldata/jira/gbm_checkpoint_valid.csv"))

    features = ["displacement", "power", "weight", "acceleration", "year"]
    target = "economy_20mpg"
    family = "bernoulli"
    train[target] = train[target].asfactor()
    valid[target] = valid[target].asfactor()

    # Settings shared by all three models
    depth = 5
    leaf_rows = 10
    initial_trees = 5
    final_trees = 10

    # Model 1: the checkpoint
    checkpoint_model = h2o.gbm(x=train[features],
                               y=train[target],
                               ntrees=initial_trees,
                               max_depth=depth,
                               min_rows=leaf_rows,
                               score_each_iteration=True,
                               distribution=family,
                               validation_x=valid[features],
                               validation_y=valid[target])

    # Model 2: continue from the checkpoint up to the final tree count
    resumed = h2o.gbm(x=train[features],
                      y=train[target],
                      ntrees=final_trees,
                      max_depth=depth,
                      min_rows=leaf_rows,
                      distribution=family,
                      score_each_iteration=True,
                      validation_x=valid[features],
                      validation_y=valid[target],
                      checkpoint=checkpoint_model._id)

    # Model 4: train the final tree count in one go
    scratch = h2o.gbm(x=train[features],
                      y=train[target],
                      ntrees=final_trees,
                      max_depth=depth,
                      min_rows=leaf_rows,
                      distribution=family,
                      score_each_iteration=True,
                      validation_x=valid[features],
                      validation_y=valid[target])

    resumed_auc, scratch_auc = resumed.auc(valid=True), scratch.auc(valid=True)
    assert resumed_auc == scratch_auc, \
        "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(resumed_auc, scratch_auc)

    resumed_gini, scratch_gini = resumed.giniCoef(valid=True), scratch.giniCoef(valid=True)
    assert resumed_gini == scratch_gini, \
        "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(resumed_gini, scratch_gini)

    resumed_logloss, scratch_logloss = resumed.logloss(valid=True), scratch.logloss(valid=True)
    assert resumed_logloss == scratch_logloss, \
        "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(resumed_logloss,
                                                                                         scratch_logloss)
Пример #44
0
def javapredict_iris_drf():
    """Run ``tests.javapredict`` for a random forest predicting iris species.

    The model parameters are printed first. Both the train and the test frame
    are imported from iris_train.csv.
    """
    # optional parameters
    params = {'ntrees':100, 'max_depth':5, 'min_rows':10}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))

    # NOTE(review): train and test both come from iris_train.csv -- confirm this is intended.
    iris_file = "smalldata/iris/iris_train.csv"
    train = h2o.import_file(tests.locate(iris_file))
    test = h2o.import_file(tests.locate(iris_file))

    predictors = ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]
    response = "species"

    tests.javapredict("random_forest", "class", train, test, predictors, response, **params)
def javapredict_smallcat():
    """Run ``tests.javapredict`` for a random forest with a numeric response.

    The model is trained on setosa_versicolor.csv and evaluated on
    virginica.csv; column 3 is the response and columns 0, 1, 2 and 4 are the
    predictors. The model parameters are printed first.
    """
    # optional parameters
    params = {'ntrees':100, 'max_depth':5, 'min_rows':10}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))

    train = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    predictor_idx = [0, 1, 2, 4]
    response_idx = 3

    tests.javapredict("random_forest", "numeric", train, test, predictor_idx, response_idx, **params)
def javapredict_cars():
    """Run ``tests.javapredict`` for a GBM predicting ``cylinders`` on the cars data.

    ``balance_classes`` is picked at random for each run; the chosen
    parameters are printed first. Train and test frames are both imported
    from cars_nice_header.csv.
    """
    # optional parameters
    balance = random.sample([True,False],1)[0]
    params = {'ntrees':5000, 'max_depth':10, 'min_rows':1, 'learn_rate':0.1, 'balance_classes':balance}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))

    cars_file = "smalldata/junit/cars_nice_header.csv"
    train = h2o.import_file(tests.locate(cars_file))
    test = h2o.import_file(tests.locate(cars_file))

    predictors = ["name", "economy", "displacement", "power", "weight", "acceleration", "year"]
    response = "cylinders"

    tests.javapredict("gbm", "numeric", train, test, predictors, response, **params)
def javapredict_smallcat():
    """Run ``tests.javapredict`` for a deep learning model with a numeric response.

    The model is trained on setosa_versicolor.csv and evaluated on
    virginica.csv; column 3 is the response and columns 0, 1, 2 and 4 are the
    predictors. The model parameters are printed first.
    """
    # optional parameters
    params = {'epochs':100}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))

    train = h2o.upload_file(tests.locate("smalldata/iris/setosa_versicolor.csv"))
    test = h2o.upload_file(tests.locate("smalldata/iris/virginica.csv"))

    predictor_idx = [0, 1, 2, 4]
    response_idx = 3

    tests.javapredict("deeplearning", "numeric", train, test, predictor_idx, response_idx, **params)
Пример #48
0
def colname_set_basic():
    """Check that ``set_names`` copies column names from one iris frame to another.

    The names of both frames are printed, the names of the frame uploaded
    from iris_header.csv are applied to the frame uploaded from iris.csv, and
    the two sets of names must then be equal.
    """
    print("Uploading iris data...")

    unnamed = h2o.upload_file(tests.locate("smalldata/iris/iris.csv"))
    named = h2o.upload_file(tests.locate("smalldata/iris/iris_header.csv"))

    print(unnamed.names)
    print(named.names)

    unnamed.set_names(named.names)

    mismatch = "Expected the same column names but got {0} and {1}"
    assert unnamed.names == named.names, mismatch.format(unnamed.names, named.names)
def javapredict_smallcat():
    """Run ``tests.javapredict`` for a deep learning model with a numeric response.

    Training data comes from setosa_versicolor.csv and test data from
    virginica.csv. Column 3 is the response; columns 0, 1, 2 and 4 are the
    predictors. The model parameters are printed before the run.
    """
    # optional parameters
    params = {'epochs': 100}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))

    train_path = tests.locate("smalldata/iris/setosa_versicolor.csv")
    train = h2o.upload_file(train_path)
    test_path = tests.locate("smalldata/iris/virginica.csv")
    test = h2o.upload_file(test_path)

    predictor_idx, response_idx = [0, 1, 2, 4], 3

    tests.javapredict("deeplearning", "numeric", train, test,
                      predictor_idx, response_idx, **params)
Пример #50
0
def prostate():
    """Compare the null deviance of an H2O binomial GLM with statsmodels.

    Both models use column 1 of prostate.csv as the response and columns 2
    onwards as predictors. The two null deviances are printed and must agree
    to within 1e-5.
    """
    h2o_data = h2o.upload_file(path=tests.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    # DataFrame.values returns the same ndarray as the deprecated
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    sm_data = pd.read_csv(tests.locate("smalldata/logreg/prostate.csv")).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = h2o.glm(y=h2o_data[1], x=h2o_data[2:], family="binomial", nfolds=10, alpha=[0.5])
    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial()).fit()

    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, "Expected null deviances to be the same"
def javapredict_smallcat():
    """Run ``tests.javapredict`` for a random forest with a numeric response.

    Training data comes from setosa_versicolor.csv and test data from
    virginica.csv. Column 3 is the response; columns 0, 1, 2 and 4 are the
    predictors. The model parameters are printed before the run.
    """
    # optional parameters
    params = {'ntrees': 100, 'max_depth': 5, 'min_rows': 10}
    print("Parameter list:")
    for name, value in params.items():
        print("{0}, {1}".format(name, value))

    train_path = tests.locate("smalldata/iris/setosa_versicolor.csv")
    train = h2o.upload_file(train_path)
    test_path = tests.locate("smalldata/iris/virginica.csv")
    test = h2o.upload_file(test_path)

    predictor_idx, response_idx = [0, 1, 2, 4], 3

    tests.javapredict("random_forest", "numeric", train, test,
                      predictor_idx, response_idx, **params)