def test_mojo_ids():
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines, verbose=False)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)
    original_model_id = model.model_id
    print(original_model_id)

    # Import the MOJO from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename, model_id=original_model_id)
    print(mojo_model.model_id)
    assert_equals(mojo_model.model_id, original_model_id, "Ids should be the same.")

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Upload the MOJO from the temporary file
    mojo_model_up = h2o.upload_mojo(original_model_filename, model_id=original_model_id)
    print(mojo_model_up.model_id)
    assert_equals(mojo_model_up.model_id, original_model_id, "Ids should be the same.")

    # Load the MOJO model from file
    mojo_model_from_file = H2OGenericEstimator.from_file(original_model_filename, original_model_id)
    print(mojo_model_from_file.model_id)
    assert_equals(mojo_model_from_file.model_id, original_model_id, "Ids should be the same.")

    # Test that model_id is initialized from the path when none is given;
    # the MOJO file is named after the original model, so the derived id matches it
    mojo_model_up_wid = h2o.upload_mojo(original_model_filename)
    print(mojo_model_up_wid.model_id)
    assert_equals(mojo_model_up_wid.model_id, original_model_id, "Ids should be the same.")

    mojo_model_im_wid = h2o.import_mojo(original_model_filename)
    print(mojo_model_im_wid.model_id)
    assert_equals(mojo_model_im_wid.model_id, original_model_id, "Ids should be the same.")
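# The tests here call assert_equals without showing where it comes from; in the
# H2O test suite it is provided by the shared test utilities. A minimal stand-in
# (an assumption for illustration, not the real helper) would be:
def assert_equals(expected, actual, message=""):
    # The message is only shown when the comparison fails.
    assert expected == actual, "%s Expected: %s, actual: %s" % (message, expected, actual)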
def mojo_convenience():
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)

    # Load the model from the temporary file
    # (import_mojo reads a path that is visible to the H2O server)
    mojo_model = h2o.import_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # MOJO UPLOAD TEST
    #####

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Load the model from the temporary file
    # (upload_mojo sends a client-local file to the server)
    mojo_model = h2o.upload_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
def titanic():
    df = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"),
                         col_types={'pclass': "enum", 'survived': "enum"})
    x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y="survived", validation_frame=test)

    assert rfit.rmse(valid=True) is not None, "validation metrics should be present"
    print(rfit.rule_importance())
    assert rfit._model_json["output"]["model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)
def iris():
    df = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"),
                         col_types={'species': "enum"})
    x = df.columns
    y = "species"
    x.remove(y)

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y=y, validation_frame=test)

    assert rfit.rmse(valid=True) is not None, "validation metrics should be present"
    print(rfit.rule_importance())
    assert rfit._model_json["output"]["model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    frame = rfit.predict_rules(train, ['M0T38N5_Iris-virginica'])
    assert frame.sum().getrow()[0] == 49.0

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)

    # Test predict_rules also on linear variable input
    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules_and_linear")
    rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
    print(rfit.rule_importance())
    frame = rfit.predict_rules(train, ['linear.petal_len_Iris-setosa',
                                       'linear.petal_wid_Iris-virginica'])
    assert frame.sum().getrow()[0] == train.nrows
def mojo_convenience():
    # Train a model
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    model = H2OGradientBoostingEstimator(ntrees=1)
    model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # Save the previously created model into a temporary file
    original_model_filename = tempfile.mkdtemp()
    original_model_filename = model.save_mojo(original_model_filename)

    # Load the model from the temporary file
    mojo_model = h2o.import_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # MOJO UPLOAD TEST
    #####

    # Download the MOJO
    original_model_filename = model.download_mojo(original_model_filename)

    # Load the model from the temporary file
    mojo_model = h2o.upload_mojo(original_model_filename)
    assert isinstance(mojo_model, H2OGenericEstimator)

    # Test scoring is available on the model
    predictions = mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    #####
    # POJO download with re-import as a generic model
    #####
    pojo_directory = os.path.join(pyunit_utils.locate("results"), model.model_id + ".java")
    pojo_path = model.download_pojo(path=pojo_directory)
    pojo_model = h2o.import_mojo(pojo_path)
    predictions2 = pojo_model.predict(airlines)
    assert predictions2 is not None
    assert predictions2.nrows == 24421
    # Predictions from the re-imported POJO must match the MOJO predictions exactly
    assert_frame_equal(predictions.as_data_frame(), predictions2.as_data_frame())
def gbm_mojo_reproducibility_info():
    prostate_hex = h2o.import_file(pyunit_utils.locate("smalldata/testng/prostate.csv"))

    # Note: despite the function name, this test trains an Isolation Forest
    model = H2OIsolationForestEstimator()
    model.train(training_frame=prostate_hex)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    if_model = h2o.upload_mojo(mojo_path=mojo_path)
    # Bare isinstance() calls verify nothing; assert them so the test can fail
    assert isinstance(if_model._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(if_model._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert if_model._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
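# The reproducibility tests above and below index reproducibility_information_table
# positionally, which is brittle if the order of subtables changes. A small helper
# (an assumption for illustration -- it relies on H2OTwoDimTable's `_table_header`
# attribute and is not part of the original tests) could look the subtable up by name:
def get_repro_subtable(model, name):
    # Each entry in the list is a two-dimensional table carrying its own header.
    for table in model._model_json['output']['reproducibility_information_table']:
        if name in table._table_header:
            return table
    return None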
def dl_mojo_reproducibility_info():
    # Training data
    train_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_model.csv"))
    train_data = train_data.drop('Site')
    train_data['Angaus'] = train_data['Angaus'].asfactor()
    train_data.describe()  # describe() prints the summary itself
    train_data.head()

    # Testing data
    test_data = h2o.import_file(path=tests.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_data['Angaus'] = test_data['Angaus'].asfactor()
    test_data.describe()
    test_data.head()

    # Run DeepLearning
    model = H2ODeepLearningEstimator(loss="CrossEntropy", epochs=1000, hidden=[20, 20, 20])
    model.train(x=list(range(1, train_data.ncol)), y="Angaus",
                training_frame=train_data, validation_frame=test_data)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    dlModel = h2o.upload_mojo(mojo_path=mojo_path)
    assert isinstance(dlModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(dlModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert dlModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
    assert dlModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][1] == 'validation_frame'
def test_helper(train_path, test_path, target, classification, blending, metalearner_transform):
    train = h2o.import_file(path=pu.locate(train_path))
    test = h2o.import_file(path=pu.locate(test_path))
    if classification:
        train[target] = train[target].asfactor()

    if blending:
        train, blend = train.split_frame(ratios=[.7], seed=seed)

    # Base models need cross-validation predictions unless a blending frame is used
    model_args = dict() if blending else dict(nfolds=3, fold_assignment="Modulo",
                                              keep_cross_validation_predictions=True)

    gbm = H2OGradientBoostingEstimator(ntrees=10, seed=seed, **model_args)
    gbm.train(y=target, training_frame=train)
    rf = H2ORandomForestEstimator(ntrees=10, seed=seed, **model_args)
    rf.train(y=target, training_frame=train)
    se = H2OStackedEnsembleEstimator(base_models=[rf, gbm],
                                     metalearner_transform=metalearner_transform)
    se.train(y=target, training_frame=train,
             **(dict(blending_frame=blend) if blending else dict()))

    se_predictions = se.predict(test)

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = se.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pu.compare_frames(se_predictions, mojo_predictions, 0)
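# A hypothetical invocation of test_helper above, mirroring the style of the other
# tests. The dataset paths, the target column, and the module-level `seed` value are
# assumptions for illustration; test_helper reads `seed` as a global.
seed = 1234

def test_se_mojo_logit_transform():
    test_helper("smalldata/testng/airlines_train.csv",
                "smalldata/testng/airlines_test.csv",
                target="IsDepDelayed",
                classification=True,
                blending=False,
                metalearner_transform="Logit")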
def xgb_mojo_reproducibility_info():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    df["Angaus"] = df["Angaus"].asfactor()
    df["Weights"] = h2o.H2OFrame.from_python(abs(np.random.randn(df.nrow, 1)).tolist())[0]
    print(df.col_names)
    train, calib = df.split_frame(ratios=[.8],
                                  destination_frames=["eco_train", "eco_calib"],
                                  seed=42)

    model = H2OXGBoostEstimator(ntrees=100, distribution="bernoulli", min_rows=10,
                                max_depth=5, weights_column="Weights",
                                calibrate_model=True, calibration_frame=calib)
    model.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    xgbModel = h2o.upload_mojo(mojo_path=mojo_path)
    assert isinstance(xgbModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(xgbModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert xgbModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
    assert xgbModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][2] == 'calibration_frame'
def titanic():
    df = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"),
                         col_types={'pclass': "enum", 'survived': "enum"})
    x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]

    # Split the dataset into train and test
    train, test = df.split_frame(ratios=[.8], seed=1234)

    rfit = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y="survived", validation_frame=test)

    # A weaker lambda penalty should keep more rules in the model
    rfit2 = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                                seed=1234, model_type="rules", lambda_=1e-8)
    rfit2.train(training_frame=train, x=x, y="survived", validation_frame=test)
    assert len(rfit.rule_importance()['rule']) < len(rfit2.rule_importance()['rule'])

    assert rfit.rmse(valid=True) is not None, "validation metrics should be present"
    print(rfit.rule_importance())

    # Verify the reported support of the top rule by re-applying its conditions;
    # the thresholds below are taken from rfit.rule_importance()['rule'][0].
    # Convert the frame once up front instead of on every loop iteration.
    train_df = train.as_data_frame()
    count = 0
    for i in range(train.nrows):
        if (train_df['age'][i] >= 14.977890968322754 or math.isnan(train_df['age'][i])) and \
           (train_df['fare'][i] < 56.036006927490234 or math.isnan(train_df['fare'][i])) and \
           (train_df['sex'][i] == "male") and \
           (train_df['sibsp'][i] < 3.5 or math.isnan(train_df['sibsp'][i])):
            count = count + 1
    assert abs(rfit.rule_importance()['support'][0] - count / train.nrows) < 1e-6

    assert rfit._model_json["output"]["model_summary"] is not None, "model_summary should be present"
    assert len(rfit._model_json["output"]["model_summary"]._cell_values) > 0, "model_summary's content should be present"

    rfit_predictions = rfit.predict(test)

    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        mojo_path = rfit.save_mojo(tmpdir)
        mojo_model = h2o.upload_mojo(mojo_path)
    finally:
        import shutil
        shutil.rmtree(tmpdir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(rfit_predictions, mojo_predictions, 0)

    rfit = H2ORuleFitEstimator(min_rule_length=1, max_rule_length=1, max_num_rules=3,
                               seed=1234, model_type="rules")
    rfit.train(training_frame=train, x=x, y="survived", validation_frame=test)
    print(rfit.rule_importance())

    # This condition is taken from the resulting rule rfit.rule_importance()['rule'][0]
    count = 0
    for i in range(train.nrows):
        if train_df['sex'][i] == 'female':
            count = count + 1
    assert abs(rfit.rule_importance()['support'][0] - count / train.nrows) < 1e-6
def __setstate__(self, state):
    # Rebuild the live H2O model handle from the persisted MOJO path on unpickling
    self._mojo_path = state.path
    self._mojo_model = h2o.upload_mojo(state.path)
    self._column_names = state.colnames
def __init__(self, mojo_path, column_names=None):
    self._mojo_path = mojo_path
    self._mojo_model = h2o.upload_mojo(mojo_path)
    self._column_names = column_names
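# __setstate__ above expects a state object exposing `path` and `colnames`
# attributes, so the class also needs a matching __getstate__. The sketch below is
# an assumption based on those attribute names, not the original implementation:
# the live H2O model handle is not picklable, so only the MOJO path and column
# names are stored, and the handle is re-uploaded on unpickling.
from collections import namedtuple

_MojoState = namedtuple("_MojoState", ["path", "colnames"])  # hypothetical state container

def __getstate__(self):
    # Persist just enough for __setstate__ to rebuild the model.
    return _MojoState(path=self._mojo_path, colnames=self._column_names)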
def generate_and_import_combined_pojo():
    if sys.version_info[0] < 3:  # Python 2
        print("This example needs Python 3.x+")
        return

    weather_orig = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/weather.csv"))
    weather = weather_orig  # working copy
    features = list(set(weather.names) - {"Date", "RainTomorrow", "Sunshine"})
    features.sort()
    response = "RISK_MM"

    glm_model = H2OGeneralizedLinearEstimator()
    glm_model.train(x=features, y=response, training_frame=weather)
    glm_preds = glm_model.predict(weather)

    gbm_model = H2OGradientBoostingEstimator(ntrees=5)
    gbm_model.train(x=features, y=response, training_frame=weather)
    gbm_preds = gbm_model.predict(weather)

    # Drop columns that we will calculate in POJO manually (we will recreate them in POJO to be the exact same)
    weather = weather.drop("ChangeTemp")
    weather = weather.drop("ChangeTempDir")

    (combined_pojo_name, combined_pojo_path) = generate_combined_pojo(glm_model, gbm_model)
    print("Combined POJO was stored in: " + combined_pojo_path)

    # Note: when using upload_mojo - always specify model_id=<POJO class name>
    pojo_model = h2o.upload_mojo(combined_pojo_path, model_id=combined_pojo_name)

    # Testing begins

    # Sanity test - test parameterization that delegates to GLM
    weather["Bias"] = 1  # behave like GLM
    pojo_glm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_glm_preds.as_data_frame(), glm_preds.as_data_frame())

    # Sanity test - test parameterization that delegates to GBM
    weather["Bias"] = 0  # behave like GBM
    pojo_gbm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_gbm_preds.as_data_frame(), gbm_preds.as_data_frame())

    # Test per-segment specific behavior, segments are defined by ChangeWindDirect
    weather["Bias"] = float("NaN")
    for change_wind_dir in weather["ChangeWindDirect"].levels()[0]:
        weather_cwd = weather[weather["ChangeWindDirect"] == change_wind_dir]
        weather_orig_cwd = weather_orig[weather_orig["ChangeWindDirect"] == change_wind_dir]
        pojo_weather_cwd_preds = pojo_model.predict(weather_cwd)
        if change_wind_dir == "c" or change_wind_dir == "l":
            expected = glm_model.predict(weather_orig_cwd) * 2
            assert_frame_equal(pojo_weather_cwd_preds.as_data_frame(), expected.as_data_frame())
        elif change_wind_dir == "n":
            expected = (glm_model.predict(weather_orig_cwd) + gbm_model.predict(weather_orig_cwd)) / 2
            assert_frame_equal(pojo_weather_cwd_preds.as_data_frame(), expected.as_data_frame())
        elif change_wind_dir == "s":
            expected = gbm_model.predict(weather_orig_cwd)
            assert_frame_equal(pojo_weather_cwd_preds.as_data_frame(), expected.as_data_frame())
def gbm_mojo_reproducibility_info():
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    x = list(set(df.names) - {"response"})
    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}
    gbmModel = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert gbmModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'

    ecology = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    ecology['Angaus'] = ecology['Angaus'].asfactor()
    train, calib = ecology.split_frame(seed=12354)
    predictors = ecology.columns[3:13]

    # Add a synthetic binary weight column to the training split
    w = h2o.create_frame(binary_fraction=1, binary_ones_fraction=0.5,
                         missing_fraction=0, rows=744, cols=1)
    w.set_names(["weight"])
    train = train.cbind(w)

    model = H2OGradientBoostingEstimator(ntrees=10, max_depth=5, min_rows=10,
                                         learn_rate=0.1, distribution="multinomial",
                                         weights_column="weight",
                                         calibrate_model=True, calibration_frame=calib)
    model.train(x=predictors, y="Angaus", training_frame=train)

    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", model._id))
    os.makedirs(TMPDIR)
    mojo_path = model.download_mojo(path=TMPDIR)

    gbmModel = h2o.upload_mojo(mojo_path=mojo_path)
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][1]['h2o_cluster_uptime'][0], float)
    assert isinstance(gbmModel._model_json['output']['reproducibility_information_table'][0]['java_version'][0], str)
    assert gbmModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][0] == 'training_frame'
    assert gbmModel._model_json['output']['reproducibility_information_table'][2]['input_frame'][2] == 'calibration_frame'
def predict_risk_scores(**kwargs):
    mongo_connect = kwargs["dag_run"].conf.get("mongo_connect")
    database = kwargs["dag_run"].conf.get("database")
    from_date = kwargs["dag_run"].conf.get("from_date")
    to_date = kwargs["dag_run"].conf.get("to_date")
    speciality_list = kwargs["dag_run"].conf.get("speciality_list")

    p_auth = authenticate.prediction_login(**kwargs)
    client = pymongo.MongoClient(mongo_connect)
    db = client[database]

    for i in speciality_list:
        speciality_name = i[0].replace("-", "").replace(" ", "").replace(",", "").replace("&", "").lower()

        # Generate the name for the filtered claim collection
        provider_collection = "provider_" + speciality_name + "_" + from_date + to_date
        provider_collection = provider_collection.replace("-", "").replace(" ", "").replace(",", "").replace("&", "").lower()

        pros = h2o.import_file("/data/" + provider_collection + ".csv")
        pros["provider_fraudulent"] = pros["provider_fraudulent"].asfactor()

        # Load the per-speciality MOJO and infer the algorithm from its parameters
        # (only GLMs expose an "alpha" parameter)
        path = "/data/models/" + speciality_name + ".zip"
        model_key = h2o.upload_mojo(path)
        model_type = "GLM" if "alpha" in model_key.params else "GBM"
        print(model_type)

        if model_type in ['GBM', 'XRT']:
            # Tree-based models support per-feature prediction contributions directly
            contrib = model_key.predict_contributions(pros)
            contribdrop = contrib.drop("_id")
            contributions = contribdrop.cbind(pros["_id"])
            export_path = "/data/" + speciality_name + "_contrib_oot.csv"
            h2o.export_file(contributions, path=export_path, force=True)
            db[speciality_name + "_contrib_oot"].drop()
            d.csv_import(p_auth, "medscheme_new", speciality_name + "_contrib_oot",
                         speciality_name + "_contrib_oot.csv")
        elif model_type == 'GLM':
            print("HERE")
            # For GLM, compute contributions manually as coefficient * value,
            # using the coefficients table from the MOJO's model details
            detail_path = ("/data/models/" + speciality_name[0].upper() + speciality_name[1:]
                           + "/experimental/modelDetails.json")
            with open(detail_path) as json_file:
                mojo_dict = json.load(json_file)
            coeff_dict = dict(zip(mojo_dict['output']['coefficients_table']['data'][0],
                                  mojo_dict['output']['coefficients_table']['data'][1]))

            contrib_table = speciality_name + "_contrib_oot"
            db[contrib_table].drop()
            for doc in db[provider_collection].find():
                insert_dict = {"_id": doc["_id"]}
                for j in coeff_dict:
                    if j in ["Intercept", "_id"]:
                        pass
                    elif "." in j:
                        # Categorical coefficient names look like "<column>.<level>"
                        var_value = j[j.index(".") + 1:]
                        var_name = j[:j.index(".")]
                        if doc[var_name] == var_value:
                            insert_dict[var_name] = coeff_dict[j]
                    elif j not in doc:
                        insert_dict[j] = 0
                    else:
                        insert_dict[j] = doc[j] * coeff_dict[j]
                db[contrib_table].insert_one(insert_dict)

        pred = model_key.predict(pros)
        colsCombine_df = pred.cbind(pros["_id"])
        export_path = "/data/" + speciality_name + "_prediction_oot.csv"
        h2o.export_file(colsCombine_df, path=export_path, force=True)
        db[speciality_name + "_prediction_oot"].drop()
        d.csv_import(p_auth, "medscheme_new", speciality_name + "_prediction_oot",
                     speciality_name + "_prediction_oot.csv")
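# Hypothetical wiring of predict_risk_scores into an Airflow DAG; the DAG id,
# start date, and schedule are assumptions for illustration. On Airflow 2.x the
# task context (including kwargs["dag_run"]) is passed to the callable automatically.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id="risk_scoring",
         start_date=datetime(2021, 1, 1),
         schedule_interval=None,  # triggered manually with a conf payload
         catchup=False) as dag:
    predict_task = PythonOperator(task_id="predict_risk_scores",
                                  python_callable=predict_risk_scores)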