Пример #1
0
def mojo_model_ifr_test():
    """Round-trip an Isolation Forest model through a MOJO file.

    Trains a single-tree ``H2OIsolationForestEstimator`` on the airlines
    training data, downloads it as a MOJO, reloads that file with
    ``H2OGenericEstimator.from_file`` and then checks that the generic model:

    * passes ``compare_params`` / ``compare_output`` against the original,
    * scores the training frame, yielding one prediction row per input row,
    * exposes a non-empty model summary and no variable importances,
    * can itself be downloaded as a MOJO whose file size equals the original.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    ifr = H2OIsolationForestEstimator(ntrees=1)
    ifr.train(x=["Origin", "Dest"], y="Distance", training_frame=airlines)
    print(ifr)
    with Capturing() as original_output:
        ifr.show()

    # download_mojo receives a fresh temporary directory; its return value is
    # used from here on as the path of the saved MOJO.
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = ifr.download_mojo(original_model_filename)

    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    print(model)
    compare_params(ifr, model)
    with Capturing() as generic_output:
        model.show()

    strip_part = "'Model Summary: '"
    algo_name = 'ModelMetricsAnomaly: isolationforest'
    generic_algo_name = 'ModelMetricsAnomaly: generic'

    compare_output(str(original_output), str(generic_output), strip_part, algo_name, generic_algo_name)

    predictions = model.predict(airlines)
    assert predictions is not None
    # Compare against the scored frame instead of a hard-coded row count so the
    # check does not silently depend on the exact size of the data file.
    assert predictions.nrows == airlines.nrows

    model_output = model._model_json["output"]
    assert model_output["variable_importances"] is None
    assert model_output["model_summary"] is not None
    assert len(model_output["model_summary"]._cell_values) > 0

    # The re-downloaded MOJO must be the same size as the original one.
    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)
Пример #2
0
def test(x, y, output_test, strip_part, algo_name, generic_algo_name):
    """Round-trip a Random Forest model through a MOJO file.

    Trains a single-tree ``H2ORandomForestEstimator`` (3-fold CV) on the
    airlines training data, downloads it as a MOJO, reloads that file with
    ``H2OGenericEstimator.from_file`` and verifies the generic model.

    :param x: predictor columns passed to ``train``.
    :param y: response column passed to ``train``.
    :param output_test: callable invoked as
        ``output_test(original, generic, strip_part, algo_name, generic_algo_name)``
        with the stringified captured ``show()`` output of both models.
    :param strip_part: forwarded unchanged to ``output_test``.
    :param algo_name: forwarded unchanged to ``output_test``.
    :param generic_algo_name: forwarded unchanged to ``output_test``.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    drf = H2ORandomForestEstimator(ntrees=1, nfolds=3)
    drf.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(drf)
    with Capturing() as original_output:
        drf.show()

    # download_mojo receives a fresh temporary directory; its return value is
    # used from here on as the path of the saved MOJO.
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = drf.download_mojo(original_model_filename)

    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    print(model)
    compare_params(drf, model)
    with Capturing() as generic_output:
        model.show()

    output_test(str(original_output), str(generic_output), strip_part, algo_name, generic_algo_name)

    predictions = model.predict(airlines)
    assert predictions is not None
    # Compare against the scored frame instead of a hard-coded row count so the
    # check does not silently depend on the exact size of the data file.
    assert predictions.nrows == airlines.nrows

    model_output = model._model_json["output"]
    assert model_output["variable_importances"] is not None
    assert len(model_output["variable_importances"]._cell_values) > 0
    assert model_output["model_summary"] is not None
    assert len(model_output["model_summary"]._cell_values) > 0

    # The re-downloaded MOJO must be the same size as the original one.
    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)
Пример #3
0
def test(x, y, output_test, strip_part, algo_name, generic_algo_name, family):
    """Round-trip a GLM model through a MOJO file.

    Trains an ``H2OGeneralizedLinearEstimator`` (3-fold CV, ``alpha=1``,
    ``lambda_=1``) on the airlines training data, downloads it as a MOJO,
    reloads that file with ``H2OGenericEstimator.from_file`` and verifies the
    generic model.

    :param x: predictor columns passed to ``train``.
    :param y: response column passed to ``train``.
    :param output_test: callable invoked as
        ``output_test(original, generic, strip_part, algo_name, generic_algo_name)``
        with the stringified captured ``show()`` output of both models.
    :param strip_part: forwarded unchanged to ``output_test``.
    :param algo_name: forwarded unchanged to ``output_test``.
    :param generic_algo_name: forwarded unchanged to ``output_test``.
    :param family: passed as ``family`` to the GLM estimator.
    """
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family, alpha=1, lambda_=1)
    glm.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(glm)
    with Capturing() as original_output:
        glm.show()

    # download_mojo receives a fresh temporary directory; its return value is
    # used from here on as the path of the saved MOJO.
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = glm.download_mojo(original_model_filename)

    generic_mojo_model_from_file = H2OGenericEstimator.from_file(original_model_filename)
    assert generic_mojo_model_from_file is not None
    print(generic_mojo_model_from_file)
    compare_params(glm, generic_mojo_model_from_file)
    with Capturing() as generic_output:
        generic_mojo_model_from_file.show()

    output_test(str(original_output), str(generic_output), strip_part, algo_name, generic_algo_name)

    predictions = generic_mojo_model_from_file.predict(airlines)
    assert predictions is not None
    # Compare against the scored frame instead of a hard-coded row count so the
    # check does not silently depend on the exact size of the data file.
    assert predictions.nrows == airlines.nrows

    model_summary = generic_mojo_model_from_file._model_json["output"]["model_summary"]
    assert model_summary is not None
    assert len(model_summary._cell_values) > 0

    # The re-downloaded MOJO must be the same size as the original one.
    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic_mojo_model_from_file.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)
Пример #4
0
def test(x, ties, stratify_by, use_all_factor_levels):
    """Check that a CoxPH model survives a MOJO export / generic import.

    The heart data set is used for training and ``heart_test`` for scoring.
    Every column named in ``stratify_by`` is converted to a factor in both
    frames before the model is built.

    :param x: predictor columns passed to ``train``.
    :param ties: passed as ``ties`` to the CoxPH estimator.
    :param stratify_by: columns to stratify by; also converted to factors.
    :param use_all_factor_levels: passed through to the CoxPH estimator.
    """
    from h2o.estimators import H2OGenericEstimator

    heart = h2o.import_file(path=pyunit_utils.locate("smalldata/coxph_test/heart.csv"))
    heart_test = h2o.import_file(path=pyunit_utils.locate("smalldata/coxph_test/heart_test.csv"))
    for column in stratify_by:
        for frame in (heart, heart_test):
            frame[column] = frame[column].asfactor()

    estimator_settings = dict(
        start_column="start",
        stop_column="stop",
        stratify_by=stratify_by,
        use_all_factor_levels=use_all_factor_levels,
        ties=ties,
    )
    coxph = H2OCoxProportionalHazardsEstimator(**estimator_settings)
    coxph.train(x=x, y="event", training_frame=heart)
    coxph.show()

    # The MOJO is written under the located "results" directory.
    mojo_path = coxph.download_mojo(pyunit_utils.locate("results"))

    model = H2OGenericEstimator.from_file(mojo_path)
    assert model is not None
    # Printing the generic model must not fail; its output is not compared
    # with the original's because the two are not expected to match.
    model.show()
    compare_params(coxph, model)

    assert isinstance(model.model_performance(), H2OModelMetricsRegressionCoxPH)
    # Training metrics reported by both models have to agree exactly.
    for metric_name in ("concordance", "concordant", "tied_y"):
        expected = getattr(coxph.model_performance(), metric_name)()
        actual = getattr(model.model_performance(), metric_name)()
        assert expected == actual

    # Metrics must also be obtainable on previously unseen data.
    new_data_metrics = model.model_performance(test_data=heart_test)
    assert isinstance(new_data_metrics, H2OModelMetricsRegressionCoxPH)

    predictions = model.predict(heart_test)
    predictions_orig = coxph.predict(heart_test)
    assert predictions is not None
    expected_rows = heart_test.nrows
    assert predictions.nrows == expected_rows
    assert predictions_orig.nrows == expected_rows

    pyunit_utils.compare_string_frames_local(predictions, predictions_orig, 0.001)

    # Re-exporting the generic model must yield a file of identical size.
    reexport_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=reexport_dir)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(mojo_path)
def test(x, y, output_test, strip_part, algo_name, generic_algo_name):
    """Round-trip a Deep Learning model through a MOJO file.

    Trains an ``H2ODeepLearningEstimator`` for one epoch on the airlines
    training data, downloads it as a MOJO and then rebuilds a generic model
    from it in two ways: from the lazily imported MOJO frame
    (``H2OGenericEstimator(model_key=...)``) and directly from the file
    (``H2OGenericEstimator.from_file``). Both generic models must be able to
    score the training frame and expose a non-empty model summary.

    :param x: predictor columns passed to ``train``.
    :param y: response column passed to ``train``.
    :param output_test: callable invoked as
        ``output_test(original, generic, strip_part, algo_name, generic_algo_name)``
        with the stringified captured ``show()`` output of both models.
    :param strip_part: forwarded unchanged to ``output_test``.
    :param algo_name: forwarded unchanged to ``output_test``.
    :param generic_algo_name: forwarded unchanged to ``output_test``.
    """
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    # Named for what it is: a deep learning model (the previous local name
    # suggested a GBM).
    dl_model = H2ODeepLearningEstimator(epochs=1)
    dl_model.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(dl_model)
    with Capturing() as original_output:
        dl_model.show()

    # download_mojo receives a fresh temporary directory; its return value is
    # used from here on as the path of the saved MOJO.
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = dl_model.download_mojo(original_model_filename)

    # Build the generic model from the MOJO imported as a frame.
    key = h2o.lazy_import(original_model_filename)
    fr = h2o.get_frame(key[0])
    generic_mojo_model = H2OGenericEstimator(model_key=fr)
    generic_mojo_model.train()
    compare_params(dl_model, generic_mojo_model)
    print(generic_mojo_model)
    with Capturing() as generic_output:
        generic_mojo_model.show()

    output_test(str(original_output), str(generic_output), strip_part,
                algo_name, generic_algo_name)

    predictions = generic_mojo_model.predict(airlines)
    assert predictions is not None
    # Compare against the scored frame instead of a hard-coded row count so the
    # check does not silently depend on the exact size of the data file.
    assert predictions.nrows == airlines.nrows
    model_summary = generic_mojo_model._model_json["output"]["model_summary"]
    assert model_summary is not None
    assert len(model_summary._cell_values) > 0

    # Test constructor generating the model from existing MOJO file
    generic_mojo_model_from_file = H2OGenericEstimator.from_file(
        original_model_filename)
    assert generic_mojo_model_from_file is not None
    predictions = generic_mojo_model_from_file.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == airlines.nrows
    model_summary = generic_mojo_model_from_file._model_json["output"]["model_summary"]
    assert model_summary is not None
    assert len(model_summary._cell_values) > 0

    # The re-downloaded MOJO must be the same size as the original one.
    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic_mojo_model_from_file.download_mojo(
        path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(
        original_model_filename)
Пример #6
0
def stackedensemble_mojo_model_test():
    """Check that a Stacked Ensemble survives a MOJO export / generic import.

    A GBM and a Random Forest are trained on the iris training data with
    2-fold "Modulo" cross-validation (keeping CV predictions) and combined in
    an ``H2OStackedEnsembleEstimator``. The ensemble's MOJO is then loaded as
    a generic model both via a lazily imported frame and via ``from_file``;
    each generic model must be able to score the iris test frame. Finally the
    file-based generic model is exported again and the file sizes compared.
    """
    train = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    holdout = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    predictors = train.columns
    response = "species"

    # Both base learners share the same cross-validation configuration.
    cv_settings = dict(nfolds=2,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True)
    gbm = H2OGradientBoostingEstimator(**cv_settings)
    gbm.train(x=predictors, y=response, training_frame=train)
    rf = H2ORandomForestEstimator(**cv_settings)
    rf.train(x=predictors, y=response, training_frame=train)

    base_model_ids = [gbm.model_id, rf.model_id]
    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=holdout,
                                     base_models=base_model_ids)
    se.train(x=predictors, y=response, training_frame=train)
    print(se)
    with Capturing() as original_output:
        se.show()

    # The value returned by download_mojo is used as the MOJO's path.
    original_model_filename = se.download_mojo(tempfile.mkdtemp())

    # Variant 1: generic model built from the MOJO imported as a frame.
    imported_keys = h2o.lazy_import(original_model_filename)
    mojo_frame = h2o.get_frame(imported_keys[0])
    generic_mojo_model = H2OGenericEstimator(model_key=mojo_frame)
    generic_mojo_model.train()
    compare_params(se, generic_mojo_model)

    assert generic_mojo_model.predict(holdout) is not None

    # Variant 2: generic model built straight from the MOJO file.
    generic_mojo_model_from_file = H2OGenericEstimator.from_file(original_model_filename)
    assert generic_mojo_model_from_file is not None
    assert generic_mojo_model_from_file.predict(holdout) is not None

    # Re-exporting the generic model must yield a file of identical size.
    reexport_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic_mojo_model_from_file.download_mojo(path=reexport_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def test(output_test, x, ties, stratify_by, use_all_factor_levels):
    """Check CoxPH predictions are unchanged after a MOJO export / generic import.

    The heart data set is used for training and ``heart_test`` for scoring.
    Every column named in ``stratify_by`` is converted to a factor in both
    frames before the model is built.

    :param output_test: accepted but not used inside this function.
    :param x: predictor columns passed to ``train``.
    :param ties: passed as ``ties`` to the CoxPH estimator.
    :param stratify_by: columns to stratify by; also converted to factors.
    :param use_all_factor_levels: passed through to the CoxPH estimator.
    """
    heart = h2o.import_file(path=pyunit_utils.locate("smalldata/coxph_test/heart.csv"))
    heart_test = h2o.import_file(path=pyunit_utils.locate("smalldata/coxph_test/heart_test.csv"))
    for column in stratify_by:
        for frame in (heart, heart_test):
            frame[column] = frame[column].asfactor()

    estimator_settings = dict(
        start_column="start",
        stop_column="stop",
        stratify_by=stratify_by,
        use_all_factor_levels=use_all_factor_levels,
        ties=ties,
    )
    coxph = H2OCoxProportionalHazardsEstimator(**estimator_settings)
    coxph.train(x=x, y="event", training_frame=heart)

    # show() is invoked twice: once inside Capturing (the captured value is
    # not inspected afterwards) and once outside of it.
    with Capturing() as captured_show:
        coxph.show()
    coxph.show()

    # The value returned by download_mojo is used as the MOJO's path.
    original_model_filename = coxph.download_mojo(tempfile.mkdtemp())

    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    compare_params(coxph, model)

    predictions = model.predict(heart_test)
    predictions_orig = coxph.predict(heart_test)
    assert predictions is not None
    expected_rows = heart_test.nrows
    assert predictions.nrows == expected_rows
    assert predictions_orig.nrows == expected_rows

    pyunit_utils.compare_string_frames_local(predictions, predictions_orig, 0.001)

    # Re-exporting the generic model must yield a file of identical size.
    reexport_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=reexport_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def mojo_model_eif_test():
    """Round-trip an Extended Isolation Forest model through a MOJO file.

    Trains a single-tree ``H2OExtendedIsolationForestEstimator`` (seeded,
    ``extension_level = ncol - 1``) on the ``single_blob`` data, downloads it
    as a MOJO, reloads that file with ``H2OGenericEstimator.from_file`` and
    then checks that the generic model:

    * passes ``compare_params`` / ``compare_output`` against the original,
    * scores the training frame, yielding one prediction row per input row,
    * passes a strict ``compare_frames`` check against the original model's
      predictions,
    * exposes a non-empty model summary and no variable importances,
    * can itself be downloaded as a MOJO whose file size equals the original.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/anomaly/single_blob.csv"))
    eif = H2OExtendedIsolationForestEstimator(ntrees=1, extension_level=train.ncol - 1, seed=1234)
    eif.train(training_frame=train)
    prediction_orig = eif.predict(train)
    print(eif)
    print(prediction_orig)
    with Capturing() as original_output:
        eif.show()

    # download_mojo receives a fresh temporary directory; its return value is
    # used from here on as the path of the saved MOJO.
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = eif.download_mojo(original_model_filename)

    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    print(model)
    compare_params(eif, model)
    with Capturing() as generic_output:
        model.show()

    strip_part = "'Model Summary: '"
    algo_name = 'ModelMetricsAnomaly: extendedisolationforest'
    generic_algo_name = 'ModelMetricsAnomaly: generic'

    compare_output(str(original_output), str(generic_output), strip_part, algo_name, generic_algo_name)

    predictions = model.predict(train)
    print(predictions)
    assert predictions is not None
    # Compare against the scored frame instead of a hard-coded row count so the
    # check does not silently depend on the exact size of the data file.
    assert predictions.nrows == train.nrows

    model_output = model._model_json["output"]
    assert model_output["variable_importances"] is None
    assert model_output["model_summary"] is not None
    assert len(model_output["model_summary"]._cell_values) > 0
    assert compare_frames(prediction_orig, predictions, numElements=-1, strict=True)

    # The re-downloaded MOJO must be the same size as the original one.
    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)