# Shared imports for the examples below. The module preamble was stripped when
# these tests were extracted; the imports here follow H2O's pyunit conventions
# and cover every name the examples use.
import sys
import time
import random

import xgboost as xgb  # some examples below re-import this after a python-2 guard

import h2o
from h2o.estimators.xgboost import H2OXGBoostEstimator

sys.path.insert(1, "../../../")  # make the H2O "tests" package importable
from tests import pyunit_utils


def get_native_parameters_test():
    assert H2OXGBoostEstimator.available() is True
    ntrees = 17
    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {"ntrees":ntrees, "max_depth":4, "seed":1, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                  "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method": "exact", "backend":"cpu"}

    nrows = 1000
    ncols = 10
    factorL = 11
    numCols = 0
    enumCols = ncols-numCols

    trainFile = pyunit_utils.genTrainFrame(nrows, 0, enumCols=enumCols, enumFactors=factorL, miscfrac=0.1,
                                           randseed=17)
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)

    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)

    print(h2oModelS._model_json["output"]["native_parameters"].as_data_frame())

    assert h2oModelS._model_json["output"]["native_parameters"]._table_header == u"Native XGBoost Parameters"
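# In H2O's pyunit suite each of these examples is driven by a small standalone
# runner; a sketch of that harness for the test above (standalone_test is the
# helper the suite uses for this):
if __name__ == "__main__":
    pyunit_utils.standalone_test(get_native_parameters_test)
else:
    get_native_parameters_test()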
# Example #2
def interaction_constraint_test():
    assert H2OXGBoostEstimator.available() is True

    # CPU Backend is forced for the results to be comparable
    h2o_params = {
        "tree_method": "exact",
        "seed": 123,
        "backend": "cpu",
        "ntrees": 5
    }

    train = pyunit_utils.genTrainFrame(100, 10, enumCols=0, randseed=17)
    print(train)
    x = train.names
    y = 'response'
    x.remove(y)

    h2o_params["interaction_constraints"] = [["C1", "C2"], ["C3", "C4", "C5"]]

    model = H2OXGBoostEstimator(**h2o_params)
    model.train(x=x, y=y, training_frame=train)

    native_params = model._model_json["output"][
        "native_parameters"].as_data_frame()
    print(native_params)

    constraints = (native_params[native_params['name'] ==
                                 "interaction_constraints"])['value'].values[0]

    assert constraints == u'[[0,1],[2,3,4]]', "Constraints should be [[0,1],[2,3,4]] but it is: " + constraints
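# The assertion above relies on H2O translating the named constraint groups into
# zero-based column indices before handing them to native XGBoost. A minimal
# sketch of that translation (an illustration, not H2O's actual implementation):
def names_to_index_constraints(groups, feature_names):
    # map each feature name in each constraint group to its column index
    return [[feature_names.index(name) for name in group] for group in groups]

# with predictors C1..C10 in order, the groups above become [[0, 1], [2, 3, 4]]
print(names_to_index_constraints([["C1", "C2"], ["C3", "C4", "C5"]],
                                 ["C" + str(i) for i in range(1, 11)]))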
# Example #3
def get_native_parameters_test():
    assert H2OXGBoostEstimator.available() is True

    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {
        "tree_method": "exact",
        "seed": 123,
        "backend": "cpu",
        "ntrees": 5
    }

    trainFile = pyunit_utils.genTrainFrame(100, 10, enumCols=0, randseed=17)
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)

    h2oParamsS["monotone_constraints"] = {"C1": -1, "C3": 1, "C7": 1}

    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)

    native_params = h2oModelS._model_json["output"][
        "native_parameters"].as_data_frame()
    print(native_params)

    constraints = (native_params[native_params['name'] ==
                                 "monotone_constraints"])['value'].values[0]

    assert constraints == u'(-1,0,1,0,0,0,1,0,0,0)'
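# The tuple asserted above is the sparse monotone_constraints dict expanded over
# all ten predictors, with unconstrained columns defaulting to 0. A small sketch
# of that expansion (illustrative only, not H2O's implementation):
def expand_monotone_constraints(constraints, feature_names):
    # one entry per column: -1 decreasing, 1 increasing, 0 unconstrained
    return tuple(constraints.get(name, 0) for name in feature_names)

# with predictors C1..C10 this returns (-1, 0, 1, 0, 0, 0, 1, 0, 0, 0)
print(expand_monotone_constraints({"C1": -1, "C3": 1, "C7": 1},
                                  ["C" + str(i) for i in range(1, 11)]))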
# Example #4
def get_native_parameters_test():
    assert H2OXGBoostEstimator.available() is True

    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {"tree_method": "exact", "seed": 123, "backend": "cpu", "ntrees": 5}

    trainFile = pyunit_utils.genTrainFrame(100, 10, enumCols=0, randseed=17)
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)

    h2oParamsS["monotone_constraints"] = {
        "C1": -1,
        "C3": 1,
        "C7": 1
    }

    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)

    native_params = h2oModelS._model_json["output"]["native_parameters"].as_data_frame()
    print(native_params)

    constraints = (native_params[native_params['name'] == "monotone_constraints"])['value'].values[0]

    assert constraints == u'(-1,0,1,0,0,0,1,0,0,0)'
# Example #5
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True

    runSeed = 1
    testTol = 1e-6
    ntrees = 17
    maxdepth = 5
    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                 "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method":"exact", "backend":"cpu"}
    nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                   'tree_method': 'exact',
                   'seed': h2oParamsD["seed"],
                   'booster': 'gbtree',
                   'objective': 'binary:logistic',
                   'lambda': 0.0,
                   'eta': h2oParamsD["learn_rate"],
                   'grow_policy': 'depthwise',
                   'alpha': 0.0,
                   'subsample': 1.0,
                   'colsample_bylevel': 1.0,
                   'max_delta_step': 0.0,
                   'min_child_weight': h2oParamsD["min_rows"],
                   'gamma': 0.0,
                   'max_depth': h2oParamsD["max_depth"]}

    nrows = 10000
    ncols = 10
    factorL = 11
    numCols = 5
    enumCols = ncols-numCols
    responseL = 2

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5, responseLevel=responseL)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time()-time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
    time1 = time.time()  # reset the timer so only native training is measured
    nativeModel = xgb.train(params=nativeParam,
                            dtrain=nativeTrain, num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                          nativeScoreTime, tolerance=testTol)
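# summarizeResult_binomial is expected to check that the two prediction vectors
# agree within the tolerance; a sketch of the core of such a check (the helper's
# actual internals are not shown in these examples, so this is an assumption):
import numpy as np

def compare_binomial_preds(h2o_predict_frame, native_pred, tolerance=1e-6):
    # H2O's binomial predict frame has columns predict/p0/p1; p1 corresponds to
    # the class-1 probability that binary:logistic returns natively
    h2o_p1 = h2o_predict_frame["p1"].as_data_frame()["p1"].values
    assert np.allclose(h2o_p1, np.asarray(native_pred), atol=tolerance), \
        "H2O and native XGBoost predictions differ beyond the tolerance"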
# Example #6
def comparison_test():
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        maxdepth = 5
        nrows = 10000
        ncols = 12
        factorL = 20
        numCols = 1
        enumCols = ncols-numCols
        responseL = 4
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7,
                      "col_sample_rate_per_tree" : 0.9, "min_rows" : 5, "score_tree_interval": ntrees+1,
                      "tree_method": "exact", "backend":"cpu"}

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                               responseLevel=responseL, miscfrac=0.01,randseed=dataSeed)

        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        newNames = []
        for ind in range(0, len(myX)):
            myX[ind] = myX[ind]+"_"+str(ind) # avoid duplicated column names
            newNames.append(myX[ind])
        newNames.append(y)
        trainFile.set_names(newNames)

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oPredictD = h2oModelD.predict(trainFile)

        # derive native XGBoost parameters and a DMatrix from the h2oXGBoost model and H2OFrame
        nativeXGBoostParam = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()
        nativeXGBoostInput = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)

        nativeModel = xgb.train(params=nativeXGBoostParam[0],
                                dtrain=nativeXGBoostInput, num_boost_round=nativeXGBoostParam[1])
        nativePred = nativeModel.predict(data=nativeXGBoostInput, ntree_limit=nativeXGBoostParam[1])
        pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred, -1, -1, -1,
                                                 -1, tolerance=1e-6)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #7
def scale_pos_weight_test():
    assert H2OXGBoostEstimator.available() is True

    train = pyunit_utils.genTrainFrame(1000,
                                       0,
                                       enumCols=10,
                                       enumFactors=2,
                                       miscfrac=0.1,
                                       randseed=17)

    xgboost = H2OXGBoostEstimator(ntrees=1, seed=1, scale_pos_weight=1.2)
    xgboost.train(y='response', training_frame=train)

    native_params = xgboost._model_json["output"][
        "native_parameters"].as_data_frame()
    assert min(native_params[native_params['name'] == 'scale_pos_weight']
               ["value"]) == 1.2
# Example #8
def comparison_test():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        maxdepth = 5
        nrows = 10000
        ncols = 12
        factorL = 20
        numCols = 11
        enumCols = ncols-numCols
        responseL = 4
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                     "min_rows" : 5, "score_tree_interval": ntrees+1, "tree_method": "exact", "backend":"cpu"}

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                               responseLevel=responseL, miscfrac=0.01,randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        newNames = []
        for ind in range(0, len(myX)):
            myX[ind] = myX[ind]+"_"+str(ind) # avoid duplicated column names
            newNames.append(myX[ind])
        newNames.append(y)
        trainFile.set_names(newNames)

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oPredictD = h2oModelD.predict(trainFile)

        # derive native XGBoost parameters and a DMatrix from the h2oXGBoost model and H2OFrame
        nativeXGBoostParam = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()
        nativeXGBoostInput = trainFile.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)

        nativeModel = xgb.train(params=nativeXGBoostParam[0],
                                dtrain=nativeXGBoostInput, num_boost_round=nativeXGBoostParam[1])
        nativePred = nativeModel.predict(data=nativeXGBoostInput, ntree_limit=nativeXGBoostParam[1])
        pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred, -1, -1, -1,
                                                 -1, tolerance=1e-6)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #9
def get_native_parameters_test():
    assert H2OXGBoostEstimator.available() is True
    ntrees = 17
    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {
        "ntrees": ntrees,
        "max_depth": 4,
        "seed": 1,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "sparse",
        "tree_method": "exact",
        "backend": "cpu"
    }

    nrows = 1000
    ncols = 10
    factorL = 11
    numCols = 0
    enumCols = ncols - numCols

    trainFile = pyunit_utils.genTrainFrame(nrows,
                                           0,
                                           enumCols=enumCols,
                                           enumFactors=factorL,
                                           miscfrac=0.1,
                                           randseed=17)
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)

    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)

    print(h2oModelS._model_json["output"]["native_parameters"].as_data_frame())

    assert h2oModelS._model_json["output"][
        "native_parameters"]._table_header == u"Native XGBoost Parameters"
# Example #10
def comparison_test():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        h2oParamsS = {
            "ntrees": ntrees,
            "max_depth": 4,
            "seed": runSeed,
            "learn_rate": 0.7,
            "col_sample_rate_per_tree": 0.9,
            "min_rows": 5,
            "score_tree_interval": ntrees + 1,
            "dmatrix_type": "sparse",
            "tree_method": "exact",
            "backend": "cpu"
        }
        nativeParam = {
            'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
            'tree_method': 'exact',
            'seed': h2oParamsS["seed"],
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'lambda': 0.0,
            'eta': h2oParamsS["learn_rate"],
            'grow_policy': 'depthwise',
            'alpha': 0.0,
            'subsample': 1.0,
            'colsample_bylevel': 1.0,
            'max_delta_step': 0.0,
            'min_child_weight': h2oParamsS["min_rows"],
            'gamma': 0.0,
            'max_depth': h2oParamsS["max_depth"]
        }

        nrows = 10000
        ncols = 11
        factorL = 0
        numCols = 11
        enumCols = ncols - numCols

        trainFile = pyunit_utils.genTrainFrame(nrows,
                                               numCols,
                                               enumCols=enumCols,
                                               enumFactors=factorL,
                                               miscfrac=0.5,
                                               randseed=dataSeed)
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)  # keep the response out of the predictor list, as in the other examples

        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile,
                                                                  y,
                                                                  enumCols=[])
        nrounds = ntrees
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds)
        modelsfound = False
        while not modelsfound:  # loop to make sure the requested number of trees is built
            modelInfo = nativeModel.get_dump()
            print(modelInfo)
            print("num_boost_round: {1}, Number of trees built: {0}".format(
                len(modelInfo), nrounds))
            if len(modelInfo) >= ntrees:
                modelsfound = True
            else:
                nrounds = nrounds + 1
                nativeModel = xgb.train(params=nativeParam,
                                        dtrain=nativeTrain,
                                        num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1

        print(
            "Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse....."
        )
        pyunit_utils.summarizeResult_binomial(h2oPredictS,
                                              nativePred,
                                              h2oTrainTimeS,
                                              nativeTrainTime,
                                              h2oPredictTimeS,
                                              nativeScoreTime,
                                              tolerance=1e-6)
    else:
        print(
            "********  Test skipped.  This test cannot be performed in multinode environment."
        )
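# The retraining loop above guards against XGBoost building fewer trees than
# requested; factored into a helper it reads like this (a sketch under the same
# assumption that get_dump() returns one string per built tree):
def train_at_least(params, dtrain, min_trees):
    # keep adding boosting rounds until the dumped model has min_trees trees
    nrounds = min_trees
    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=nrounds)
    while len(model.get_dump()) < min_trees:
        nrounds += 1
        model = xgb.train(params=params, dtrain=dtrain, num_boost_round=nrounds)
    return model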
# Example #11
def comparison_test():
    assert H2OXGBoostEstimator.available() is True
    runSeed = 1
    ntrees = 10
    responseL = 11
    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {
        "ntrees": ntrees,
        "max_depth": 4,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "sparse",
        "tree_method": "exact",
        "backend": "cpu"
    }
    nativeParam = {
        'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsS["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'lambda': 0.0,
        'eta': h2oParamsS["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsS["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsS["max_depth"],
        'num_class': responseL
    }

    nrows = 10000
    ncols = 10
    factorL = 11
    numCols = 0
    enumCols = ncols - numCols

    trainFile = pyunit_utils.genTrainFrame(
        nrows,
        numCols,
        enumCols=enumCols,
        enumFactors=factorL,
        miscfrac=0.5,
        responseLevel=responseL)  # load in dataset and add response column
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile,
                                                              y,
                                                              enumCols=myX)

    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    # gather, print and save performance numbers for h2o model
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictS = h2oModelS.predict(trainFile)
    h2oPredictTimeS = time.time() - time1

    # train the native XGBoost
    time1 = time.time()  # reset the timer so only native training is measured
    nativeModel = xgb.train(params=nativeParam,
                            dtrain=nativeTrain,
                            num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    print(
        "Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse....."
    )
    pyunit_utils.summarizeResult_multinomial(h2oPredictS,
                                             nativePred,
                                             h2oTrainTimeS,
                                             nativeTrainTime,
                                             h2oPredictTimeS,
                                             nativeScoreTime,
                                             tolerance=1e-6)
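# For the multinomial case, summarizeResult_multinomial presumably lines up H2O's
# per-class probability columns against the (nrows, num_class) matrix returned by
# multi:softprob; a sketch of that core comparison (helper internals are assumed):
import numpy as np

def compare_multinomial_preds(h2o_predict_frame, native_pred, num_class, tolerance=1e-6):
    # drop H2O's leading 'predict' label column, keep the per-class probabilities
    h2o_probs = h2o_predict_frame[:, 1:].as_data_frame().values
    native = np.asarray(native_pred).reshape(-1, num_class)
    assert np.allclose(h2o_probs, native, atol=tolerance), \
        "H2O and native XGBoost class probabilities differ beyond the tolerance"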
# Example #12
def comparison_test():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        responseL = 11
        # CPU Backend is forced for the results to be comparable
        h2oParamsS = {"ntrees":ntrees, "max_depth":4, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                      "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method": "exact", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsS["seed"],
                       'booster': 'gbtree',
                       'objective': 'multi:softprob',
                       'eta': h2oParamsS["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsS["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsS["max_depth"],
                       'num_class':responseL}

        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols-numCols

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)       # load in dataset and add response column
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=myX)

        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time()-time1

        # train the native XGBoost
        nrounds = ntrees
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time()-time1

        print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
        pyunit_utils.summarizeResult_multinomial(h2oPredictS, nativePred, h2oTrainTimeS, nativeTrainTime, h2oPredictTimeS,
                                                 nativeScoreTime, tolerance=1e-6)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #13
def comparison_test():
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        # CPU Backend is forced for the results to be comparable
        h2oParamsS = {
            "ntrees": ntrees,
            "max_depth": 4,
            "seed": runSeed,
            "learn_rate": 0.7,
            "col_sample_rate_per_tree": 0.9,
            "min_rows": 5,
            "score_tree_interval": ntrees + 1,
            "dmatrix_type": "sparse",
            "tree_method": "exact",
            "backend": "cpu"
        }
        nativeParam = {
            'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
            'tree_method': 'exact',
            'seed': h2oParamsS["seed"],
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eta': h2oParamsS["learn_rate"],
            'grow_policy': 'depthwise',
            'alpha': 0.0,
            'subsample': 1.0,
            'colsample_bylevel': 1.0,
            'max_delta_step': 0.0,
            'min_child_weight': h2oParamsS["min_rows"],
            'gamma': 0.0,
            'max_depth': h2oParamsS["max_depth"],
            'eval_metric': ['auc', 'aucpr']
        }

        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols - numCols

        trainFile = pyunit_utils.genTrainFrame(
            nrows,
            0,
            enumCols=enumCols,
            enumFactors=factorL,
            miscfrac=0.1,
            randseed=dataSeed)  # load in dataset and add response column
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile,
                                                                  y,
                                                                  enumCols=myX)
        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1

        # train the native XGBoost
        nrounds = ntrees
        time1 = time.time()  # reset the timer so only native training is measured
        evals_result = {}
        watch_list = [(nativeTrain, 'train')]
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds,
                                evals=watch_list,
                                verbose_eval=True,
                                evals_result=evals_result)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(
            len(modelInfo), nrounds))
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1

        print(
            "Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse....."
        )
        pyunit_utils.summarizeResult_binomial(h2oPredictS,
                                              nativePred,
                                              h2oTrainTimeS,
                                              nativeTrainTime,
                                              h2oPredictTimeS,
                                              nativeScoreTime,
                                              tolerance=1e-6)

        print(
            "Comparing H2OXGBoost metrics with native XGBoost metrics when DMatrix is set to sparse....."
        )
        h2o_metrics = [
            h2oModelS.training_model_metrics()["AUC"],
            h2oModelS.training_model_metrics()["pr_auc"]
        ]
        xgboost_metrics = [
            evals_result['train']['auc'][ntrees - 1],
            evals_result['train']['aucpr'][ntrees - 1]
        ]
        # TODO: less tolerance ?
        pyunit_utils.summarize_metrics_binomial(h2o_metrics,
                                                xgboost_metrics,
                                                ["auc", "aucpr"],
                                                tolerance=1e-3)

    else:
        print(
            "********  Test skipped.  This test cannot be performed in multinode environment."
        )
# Example #14
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {
            "ntrees": ntrees,
            "max_depth": maxdepth,
            "seed": runSeed,
            "learn_rate": 0.7,
            "col_sample_rate_per_tree": 0.9,
            "min_rows": 5,
            "score_tree_interval": ntrees + 1,
            "dmatrix_type": "dense",
            "tree_method": "exact",
            "backend": "cpu"
        }
        nativeParam = {
            'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
            'tree_method': 'exact',
            'seed': h2oParamsD["seed"],
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eta': h2oParamsD["learn_rate"],
            'grow_policy': 'depthwise',
            'alpha': 0.0,
            'subsample': 1.0,
            'colsample_bylevel': 1.0,
            'max_delta_step': 0.0,
            'min_child_weight': h2oParamsD["min_rows"],
            'gamma': 0.0,
            'max_depth': h2oParamsD["max_depth"]
        }

        nrows = 10000
        ncols = 20
        factorL = 20
        numCols = 10
        enumCols = ncols - numCols

        trainFile = pyunit_utils.genTrainFrame(
            nrows,
            numCols,
            enumCols=enumCols,
            enumFactors=factorL,
            miscfrac=0.01,
            randseed=dataSeed)  # load in dataset and add response column
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        t1Array = h2oModelD._model_json["output"]["scoring_history"]
        print("classifier error: {0}".format(
            t1Array._cell_values[len(t1Array._cell_values) - 1][
                t1Array._col_header.index("training_classification_error")]))
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1

        # train the native XGBoost
        nrounds = ntrees
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile,
                                                            y,
                                                            enumCols=enumCols)
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain,
                                num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(
            len(modelInfo), nrounds))

        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1

        pyunit_utils.summarizeResult_binomial(h2oPredictD,
                                              nativePred,
                                              h2oTrainTimeD,
                                              nativeTrainTime,
                                              h2oPredictTimeD,
                                              nativeScoreTime,
                                              tolerance=testTol)
    else:
        print(
            "********  Test skipped.  This test cannot be performed in multinode environment."
        )
# Example #15
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                      "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"dense","tree_method": "exact", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}

        nrows = 10000
        ncols = 10
        factorL = 20
        numCols = 5
        enumCols = ncols-numCols

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.01,
                                               randseed=dataSeed)     # load in dataset and add response column
        y = 'response'
        trainFile = trainFile.drop(y)   # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000, misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time()-time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time()-time1

        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #16
def comparison_test_dense():
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                     "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method":"exact", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'binary:logistic',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}

        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 5
        enumCols = ncols-numCols
        responseL = 2

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time()-time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelsfound = False
        while not modelsfound:  # loop to make sure the requested number of trees is built
            modelInfo = nativeModel.get_dump()
            print(modelInfo)
            print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
            if len(modelInfo) >= ntrees:
                modelsfound = True
            else:
                nrounds = nrounds + 1
                nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)

        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1

        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                              nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #17
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed, "learn_rate": 0.7,
                      "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1, "dmatrix_type": "dense", "tree_method": "exact",
                      "backend": "cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'binary:logistic',
                       'lambda': 0.0,
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}

        nrows = 10000
        ncols = 20
        factorL = 20
        numCols = 10
        enumCols = ncols - numCols

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                               miscfrac=0.01, randseed=dataSeed)  # load in dataset and add response column
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        t1Array = h2oModelD._model_json["output"]["scoring_history"]
        print("classifier error: {0}".format(t1Array._cell_values[len(t1Array._cell_values) - 1][t1Array._col_header.index("training_classification_error")]))
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1

        # train the native XGBoost
        nrounds = ntrees
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))

        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1

        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                              nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #18
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True

    runSeed = 1
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu"
    }
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"]
    }

    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols

    trainFile = pyunit_utils.genTrainFrame(
        nrows, numCols, enumCols=enumCols, enumFactors=factorL,
        miscfrac=0.01)  # load in dataset and add response column
    y = 'response'
    trainFile = trainFile.drop(
        y)  # drop the enum response and generate real values here
    yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(
        nrows, 1, integerR=1000000, misFrac=0)
    yresp.set_name(0, y)
    trainFile = trainFile.cbind(yresp)
    myX = trainFile.names
    myX.remove(y)
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile,
                                                        y,
                                                        enumCols=enumCols)
    time1 = time.time()  # reset the timer so only native training is measured
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                            num_boost_round=ntrees)  # xgb.train defaults to 10 rounds; be explicit
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_regression(h2oPredictD,
                                            nativePred,
                                            h2oTrainTimeD,
                                            nativeTrainTime,
                                            h2oPredictTimeD,
                                            nativeScoreTime,
                                            tolerance=testTol)
# Example #19
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        responseL = 11
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                     "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'auto',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'multi:softprob',
                       'lambda': 0.0,
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"],
                       'num_class':responseL}

        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols-numCols

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]  # all remaining predictors are categorical here

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time()-time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam,
                                dtrain=nativeTrain, num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num of boosters found in model: {0} and num_boost_round specified: {1}.".format(len(modelInfo), nrounds))

        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time()-time1

        pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                                 nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #20
def comparison_test_dense():
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                     "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method": "exact", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}

        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols-numCols
        responseL = 2

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)

        y = 'response'
        trainFile = trainFile.drop(y)   # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000, misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time()-time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        time1 = time.time()  # reset the timer so only native training is measured
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        nativeTrainTime = time.time()-time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time()-time1

        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
# Example #21
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True

    runSeed = random.randint(1, 1073741824)
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    nrows = random.randint(100000, 500000)
    ncols = random.randint(1, 10)
    factorL = random.randint(2, 10)
    numCols = random.randint(1, ncols)
    enumCols = ncols - numCols
    responseL = random.randint(3, 10)

    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense"
    }
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'auto',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"],
        'num_class': responseL
    }
    trainFile = pyunit_utils.genTrainFrame(nrows,
                                           numCols,
                                           enumCols=enumCols,
                                           enumFactors=factorL,
                                           responseLevel=responseL,
                                           miscfrac=0.01)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile,
                                                        y,
                                                        enumCols=enumCols)
    time1 = time.time()  # reset the timer so only native training is measured
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                            num_boost_round=ntrees)  # xgb.train defaults to 10 rounds; be explicit
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_multinomial(h2oPredictD,
                                             nativePred,
                                             h2oTrainTimeD,
                                             nativeTrainTime,
                                             h2oPredictTimeD,
                                             nativeScoreTime,
                                             tolerance=testTol)