示例#1
0
def expr_slicing(ip, port):
    """Check slicing of H2O expressions derived from the iris frame.

    Connects to H2O using ``ip`` and ``port``, imports the iris data set and
    asserts that column, cell, row and sub-frame slices of ``2 - iris`` and
    ``iris * 2`` contain the expected values.
    """
    def near(actual, expected):
        # Every comparison in this test uses the same absolute tolerance.
        return abs(actual - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    # expr[int] (column slice), expr is pending
    res = 2 - iris
    column = h2o.as_list(res[0])
    assert (
        near(column[3][0], -2.6)
        and near(column[17][0], -3.1)
        and near(column[24][0], -2.8)
    ), "incorrect values"

    # expr[int,int], expr is remote
    res.eager()
    cell = h2o.as_list(res[13, 3])
    assert near(cell[0][0], 1.9), "incorrect values"

    # expr[int, slice], expr is remote
    row = h2o.as_list(res[12, 0:3])
    assert (
        near(row[0][0], -2.8)
        and near(row[0][1], -1.0)
        and near(row[0][2], 0.6)
        and near(row[0][3], 1.9)
    ), "incorrect values"

    # expr[slice, int], expr is remote
    rows = h2o.as_list(res[5:8, 1])
    assert (
        near(rows[0][0], -1.9)
        and near(rows[1][0], -1.4)
        and near(rows[2][0], -1.4)
        and near(rows[3][0], -0.9)
    ), "incorrect values"

    # expr[slice, slice], expr is pending
    res = iris * 2
    block = h2o.as_list(res[5:8, 0:3])
    assert (
        near(block[0][0], 10.8)
        and near(block[1][1], 6.8)
        and near(block[2][2], 3.0)
        and near(block[3][3], 0.4)
    ), "incorrect values"
示例#2
0
def expr_as_list():
    """Check ``h2o.as_list(..., use_pandas=False)`` on expressions and local data.

    Covers a full-frame expression, a single-column expression and frames
    built from Python lists.
    """
    tolerance = 1e-10
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    expr = 2 - iris
    listed = h2o.as_list(expr, use_pandas=False)
    # Transpose the result before indexing it as [first][second].
    flipped = list(zip(*listed))
    checks = ((0, 4, -2.6), (1, 5, -1.6), (2, 11, 0.5))
    assert all(abs(float(flipped[a][b]) - want) < tolerance
               for a, b, want in checks), "incorrect values"

    # single column
    expr = 2 - iris
    listed = h2o.as_list(expr[0], use_pandas=False)
    flipped = list(zip(*listed))
    checks = ((0, 4, -2.6), (0, 18, -3.1), (0, 25, -2.8))
    assert all(abs(float(flipped[a][b]) - want) < tolerance
               for a, b, want in checks), "incorrect values"

    # local data
    one_row = h2o.as_list(h2o.H2OFrame([[1, 2, 3]]), use_pandas=False)
    assert float(one_row[1][2]) == 3, "incorrect values"

    two_rows = h2o.as_list(h2o.H2OFrame([[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(two_rows[2][1]) == 5, "incorrect values"
示例#3
0
def expr_as_list(ip, port):
    """Check ``h2o.as_list(..., use_pandas=False)`` after connecting to H2O.

    Connects using ``ip`` and ``port``, then verifies selected cells of
    ``2 - iris`` (whole frame and first column) and of frames built from
    local Python lists.
    """
    def cell_is(table, first, second, expected):
        # Cells are converted with float() before the tolerance comparison.
        return abs(float(table[first][second]) - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    expr = 2 - iris
    table = h2o.as_list(expr, use_pandas=False)
    assert (
        cell_is(table, 4, 0, -2.6)
        and cell_is(table, 5, 1, -1.6)
        and cell_is(table, 11, 2, 0.5)
    ), "incorrect values"

    # single column
    expr = 2 - iris
    table = h2o.as_list(expr[0], use_pandas=False)
    assert (
        cell_is(table, 4, 0, -2.6)
        and cell_is(table, 18, 0, -3.1)
        and cell_is(table, 25, 0, -2.8)
    ), "incorrect values"

    # local data
    flat = h2o.H2OFrame(python_obj=[1, 2, 3])
    frm = h2o.as_list(flat, use_pandas=False)
    assert float(frm[1][2]) == 3, "incorrect values"

    nested = h2o.H2OFrame(python_obj=[[1, 2, 3], [4, 5, 6]])
    frm = h2o.as_list(nested, use_pandas=False)
    assert float(frm[2][1]) == 5, "incorrect values"
示例#4
0
def auc(m, v, t):
    """Return ``roc_auc_score`` of model ``m``'s predictions on frame ``v``.

    Column ``t`` of ``v`` supplies the true labels. Labels and predictions are
    both converted through ``h2o.as_list(..., use_pandas=True).values`` before
    scoring.
    """
    labels_frame = v[t]
    scores_frame = m.predict(v)
    labels = h2o.as_list(labels_frame, use_pandas=True).values
    scores = h2o.as_list(scores_frame, use_pandas=True).values
    return roc_auc_score(labels, scores)
示例#5
0
def frame_slicing(ip, port):
    """Check the slicing forms of H2OFrame on three imported data sets.

    Connects to H2O using ``ip`` and ``port``, imports iris, prostate and
    airlines, and asserts expected values for column, cell, row and sub-frame
    slices.
    """
    def near(actual, expected):
        # Shared absolute tolerance for every comparison below.
        return abs(actual - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    for frame in (iris, prostate, airlines):
        frame.show()

    # H2OFrame[int] (column slice)
    column = h2o.as_list(iris[0])
    assert near(column[8][0], 4.4), "incorrect values"

    # H2OFrame[int,int]
    cell = h2o.as_list(prostate[13, 3])
    assert near(cell[0][0], 1), "incorrect values"

    # H2OFrame[int, slice]
    row = h2o.as_list(airlines[12, 0:3])
    assert (
        near(row[0][0], 1987)
        and near(row[0][1], 10)
        and near(row[0][2], 29)
    ), "incorrect values"

    # H2OFrame[slice, int]
    rows = h2o.as_list(iris[5:8, 1])
    assert (
        near(rows[0][0], 3.9)
        and near(rows[1][0], 3.4)
        and near(rows[2][0], 3.4)
        and near(rows[3][0], 2.9)
    ), "incorrect values"

    # H2OFrame[slice, slice]
    block = h2o.as_list(prostate[5:8, 0:3])
    assert (
        near(block[0][0], 6)
        and near(block[1][1], 0)
        and near(block[2][2], 61)
    ), "incorrect values"
def expr_as_list(ip, port):
    """Verify list conversion of H2O expressions and locally built frames.

    After connecting with ``ip`` and ``port``, converts ``2 - iris`` (whole
    frame, then first column) and two frames built from Python lists with
    ``h2o.as_list(..., use_pandas=False)`` and asserts selected cells.
    """
    tolerance = 1e-10

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    res = 2 - iris
    res = h2o.as_list(res, use_pandas=False)
    expected_cells = [(4, 0, -2.6), (5, 1, -1.6), (11, 2, 0.5)]
    assert all(
        abs(float(res[i][j]) - want) < tolerance for i, j, want in expected_cells
    ), "incorrect values"

    # single column
    res = 2 - iris
    res = h2o.as_list(res[0], use_pandas=False)
    expected_cells = [(4, 0, -2.6), (18, 0, -3.1), (25, 0, -2.8)]
    assert all(
        abs(float(res[i][j]) - want) < tolerance for i, j, want in expected_cells
    ), "incorrect values"

    # local data
    local_frame = h2o.H2OFrame(python_obj=[1, 2, 3])
    frm = h2o.as_list(local_frame, use_pandas=False)
    assert float(frm[1][2]) == 3, "incorrect values"

    local_frame = h2o.H2OFrame(python_obj=[[1, 2, 3], [4, 5, 6]])
    frm = h2o.as_list(local_frame, use_pandas=False)
    assert float(frm[2][1]) == 5, "incorrect values"
def expr_as_list():
    """Verify ``h2o.as_list`` (without pandas) on expressions and local frames.

    The converted result of each expression is transposed with ``zip`` before
    the selected cells are compared against their expected values.
    """
    def cell_is(table, first, second, expected):
        # Cells are converted with float() before the tolerance comparison.
        return abs(float(table[first][second]) - expected) < 1e-10

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    expr = 2 - iris
    transposed = list(zip(*h2o.as_list(expr, use_pandas=False)))
    assert (
        cell_is(transposed, 0, 4, -2.6)
        and cell_is(transposed, 1, 5, -1.6)
        and cell_is(transposed, 2, 11, 0.5)
    ), "incorrect values"

    # single column
    expr = 2 - iris
    transposed = list(zip(*h2o.as_list(expr[0], use_pandas=False)))
    assert (
        cell_is(transposed, 0, 4, -2.6)
        and cell_is(transposed, 0, 18, -3.1)
        and cell_is(transposed, 0, 25, -2.8)
    ), "incorrect values"

    # local data
    frm = h2o.as_list(h2o.H2OFrame([[1, 2, 3]]), use_pandas=False)
    assert float(frm[1][2]) == 3, "incorrect values"

    frm = h2o.as_list(h2o.H2OFrame([[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(frm[2][1]) == 5, "incorrect values"
示例#8
0
def group_by(ip, port):
    """Smoke-test ``h2o.group_by`` and compare its aggregates with pandas.

    Connects to H2O using ``ip`` and ``port``, runs every combination of
    aggregate function, NA handling mode and numeric iris column through
    ``h2o.group_by``, then checks that the min/max/mean/sum aggregates per
    class agree with ``pandas.DataFrame.groupby`` to within 1e-06.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    iris_path = h2o.locate("smalldata/iris/iris_wheader.csv")
    h2o_iris = h2o.import_frame(path=iris_path)
    pd_iris = pd.read_csv(iris_path)

    h2o_agg_funcs = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names()[0:4]

    # smoke test
    for a in h2o_agg_funcs:
        for n in na_handling:
            for c in col_names:
                h2o.group_by(h2o_iris, ["class"], {"foo":[a,c,n]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for k, np_func in h2o_np_agg_dict.items():
        for c in col_names:
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo":[k,c,"all"]})
            pd_res = pd_iris.groupby("class")[c].aggregate(np_func)
            # Convert the result once per aggregate instead of twice per
            # compared row; the conversion does not depend on the row index.
            h2o_rows = h2o.as_list(h2o_res)
            for i in range(3):
                h2o_val = h2o_rows[i][1]
                # The first cell of each result row is used as the position
                # of the matching group in the pandas result.
                pd_val = pd_res.values[int(h2o_rows[i][0])]
                assert abs(h2o_val - pd_val) < 1e-06, \
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} values between h2o and pandas on column {3}".format(h2o_val,pd_val,k,c)
def frame_as_list():
    """Check ``h2o.as_list(..., use_pandas=False)`` on three imported frames.

    For iris, prostate and airlines, one row of the converted frame is
    compared against the expected values of its first three cells.
    """
    def row_starts_with(table, row, expected):
        # Compare the leading cells of table[row] with ``expected``, stopping
        # at the first mismatch.
        return all(
            abs(float(table[row][col]) - want) < 1e-10
            for col, want in enumerate(expected)
        )

    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k.zip"))

    iris_rows = h2o.as_list(iris, use_pandas=False)
    assert row_starts_with(iris_rows, 9, (4.4, 2.9, 1.4)), "incorrect values"

    prostate_rows = h2o.as_list(prostate, use_pandas=False)
    assert row_starts_with(prostate_rows, 7, (7, 0, 68)), "incorrect values"

    airlines_rows = h2o.as_list(airlines, use_pandas=False)
    assert row_starts_with(airlines_rows, 4, (1987, 10, 18)), "incorrect values"
def glrm_set_loss_by_col():
    """Check the GLRM objective when the loss is overridden per column.

    Fits a rank-3 GLRM on USArrests with Absolute loss on column 0, Huber loss
    on column 3 and the default Quadratic loss elsewhere, recomputes the
    objective from the fitted X and Y matrices with numpy, and asserts that it
    matches the objective reported by the model to within 1e-6.
    """
    def huber_scalar(a):
        # Quadratic for |a| <= 1, linear beyond.
        if abs(a) <= 1:
            return a*a/2
        return abs(a)-0.5

    print("Importing USArrests.csv data...")
    arrests_frame = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests_np = np.array(h2o.as_list(arrests_frame))
    arrests_frame.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    model = h2o.glrm(
        x=arrests_frame,
        k=3,
        loss="Quadratic",
        loss_by_col=["Absolute", "Huber"],
        loss_by_col_idx=[0, 3],
        regularization_x="None",
        regularization_y="None",
    )
    model.show()

    # The first cell of every archetype row is skipped before conversion.
    archetype_rows = model._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]] for row in archetype_rows])
    fit_x = h2o.get_frame(model._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    residual = arrests_np - np.dot(fit_x_np, fit_y_np)
    huber = np.vectorize(huber_scalar)
    per_row = (
        np.absolute(residual[:, 0])
        + np.square(residual[:, 1])
        + np.square(residual[:, 2])
        + huber(residual[:, 3])
    )
    obj_val = np.sum(per_row)
    glrm_obj = model._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
示例#11
0
def predict_churn(State, AccountLength, AreaCode, Phone, IntlPlan, VMailPlan,
                  VMailMessage, DayMins, DayCalls, DayCharge, EveMins,
                  EveCalls, EveCharge, NightMins, NightCalls, NightCharge,
                  IntlMins, IntlCalls, IntlCharge, CustServCalls):
    """Score one customer record with the saved model and log the result.

    Starts/connects to H2O, loads the model stored at ``'AutoML-leader'``,
    predicts on a one-row frame built from the arguments, appends the
    prediction frame to the ``predictions`` table of the ``customers`` MySQL
    database (credentials and host come from the ``BRETT_MYSQL_*``
    environment variables), and returns a summary string.

    Returns:
        A string of the form ``"Prediction: ... |Probability to Churn: ...
        |Probability to Retain: ..."``.

    Raises:
        KeyError: if one of the ``BRETT_MYSQL_*`` environment variables is
            not set.
    """
    # connect to the model scoring service
    h2o.init(
        nthreads=1,
        max_mem_size=1,
        start_h2o=True,
        strict_version_check=False,
    )

    # open the downloaded model
    churn_model = h2o.load_model(path='AutoML-leader')

    # define a feature vector to evaluate with the model
    feature_row = {
        'State': State,
        'Account Length': AccountLength,
        'Area Code': AreaCode,
        'Phone': Phone,
        "Int'l Plan": IntlPlan,
        'VMail Plan': VMailPlan,
        'VMail Message': VMailMessage,
        'Day Mins': DayMins,
        'Day Calls': DayCalls,
        'Day Charge': DayCharge,
        'Eve Mins': EveMins,
        'Eve Calls': EveCalls,
        'Eve Charge': EveCharge,
        'Night Mins': NightMins,
        'Night Calls': NightCalls,
        'Night Charge': NightCharge,
        'Intl Mins': IntlMins,
        'Intl Calls': IntlCalls,
        'Intl Charge': IntlCharge,
        'CustServ Calls': CustServCalls,
    }
    new_data = pd.DataFrame(feature_row, index=[0])

    # evaluate the feature vector using the model
    predictions = churn_model.predict(h2o.H2OFrame(new_data))
    listed = h2o.as_list(predictions, use_pandas=False)
    # Index 1 is read; presumably index 0 is the header row -- TODO confirm.
    scored = listed[1]
    prediction = scored[0]
    probability_churn = scored[1]
    probability_retain = scored[2]

    user = os.environ['BRETT_MYSQL_USERNAME']
    password = os.environ['BRETT_MYSQL_PASSWORD']
    host = os.environ['BRETT_MYSQL_IP']

    engine = create_engine(
        "".join(["mysql+mysqldb://", user, ":", password, "@", host, "/customers"]))
    predictions_df = h2o.as_list(predictions, use_pandas=True)
    predictions_df.to_sql(con=engine, name='predictions', if_exists='append')

    return "".join([
        "Prediction: ", str(prediction),
        " |Probability to Churn: ", str(probability_churn),
        " |Probability to Retain: ", str(probability_retain),
    ])
def glrm_nnmf():
    """Check GLRM with non-negative regularization on X and Y.

    Trains on the product of two random uniform matrices, then asserts that
    the fitted X and Y are non-negative, that the reported objective equals
    the sum of squared errors of X.Y against the training data, that
    predictions equal X.Y, and that the reported numeric/categorical error
    metrics are consistent with the objective.
    """
    m, n, k = 1000, 100, 10

    print("Uploading random uniform matrix with rows = " + str(m) +
          " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())

    estimator_args = dict(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="NonNegative",
        regularization_y="NonNegative",
        gamma_x=1,
        gamma_y=1,
    )
    glrm_h2o = H2OGeneralizedLowRankEstimator(**estimator_args)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    # The first cell of every archetype row is skipped before conversion.
    archetype_rows = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array(
        [[float(cell) for cell in list(row)[1:]] for row in archetype_rows])
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, \
        "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), \
        "Imputation for numerics with quadratic loss should equal XY product"
    metrics = glrm_h2o._model_json['output']['training_metrics']._metric_json
    glrm_numerr = metrics['numerr']
    glrm_caterr = metrics['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, \
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, \
        "Categorical error was " + str(glrm_caterr) + " but should be zero"
def svd_1_golden():
    """Compare H2O's SVD of USArrests against reference values computed in R.

    Fits an untransformed 4-vector SVD on the first four columns and asserts
    that the singular values match the R results to 1e-6 and that the right
    and left singular vectors match in absolute value to 1e-5.

    Written with single-argument ``print(...)`` calls and ``list(zip(...))``
    so that it runs on both Python 2 and Python 3.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")
    fitH2O = h2o.svd(x=arrestsH2O[0:4],
                     nv=4,
                     transform="NONE",
                     max_iterations=2000)

    print("Compare singular values (D)")
    h2o_d = fitH2O._model_json['output']['d']
    r_d = [
        1419.06139509772, 194.825846110138, 45.6613376308754, 18.0695566224677
    ]
    print("R Singular Values: {0}".format(r_d))
    print("H2O Singular Values: {0}".format(h2o_d))
    for r, h in zip(r_d, h2o_d):
        assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare right singular vectors (V)")
    h2o_v = h2o.as_list(h2o.get_frame(
        fitH2O._model_json['output']['v_key']['name']),
                        use_pandas=False)
    # Materialize the transpose: on Python 3 zip() returns an iterator, which
    # has no pop().
    h2o_v = list(zip(*h2o_v))
    # Drop the first entry before comparing (presumably the header -- TODO confirm).
    h2o_v.pop(0)
    r_v = [[-0.04239181, 0.01616262, -0.06588426, 0.99679535],
           [-0.94395706, 0.32068580, 0.06655170, -0.04094568],
           [-0.30842767, -0.93845891, 0.15496743, 0.01234261],
           [-0.10963744, -0.12725666, -0.98347101, -0.06760284]]
    print("R Right Singular Vectors: {0}".format(r_v))
    print("H2O Right Singular Vectors: {0}".format(h2o_v))
    for rl, hl in zip(r_v, h2o_v):
        for r, h in zip(rl, hl):
            # Only magnitudes are compared, so a sign difference is accepted.
            assert abs(abs(r) - abs(float(h))
                       ) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare left singular vectors (U)")
    h2o_u = h2o.as_list(h2o.get_frame(
        fitH2O._model_json['output']['u_key']['name']),
                        use_pandas=False)
    h2o_u = list(zip(*h2o_u))
    h2o_u.pop(0)
    # Only six reference rows are listed; zip() stops at the shorter input.
    r_u = [[-0.1716251, 0.096325710, 0.06515480, 0.15369551],
           [-0.1891166, 0.173452566, -0.42665785, -0.17801438],
           [-0.2155930, 0.078998111, 0.02063740, -0.28070784],
           [-0.1390244, 0.059889811, 0.01392269, 0.01610418],
           [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
           [-0.1558794, -0.064555293, -0.28288280, -0.11797419]]
    print("R Left Singular Vectors: {0}".format(r_u))
    print("H2O Left Singular Vectors: {0}".format(h2o_u))
    for rl, hl in zip(r_u, h2o_u):
        for r, h in zip(rl, hl):
            assert abs(abs(r) - abs(float(h))
                       ) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)
示例#14
0
def auc(m, v, t):
    """Print and return ``roc_auc_score`` of model ``m``'s predictions on ``v``.

    Column ``t`` of frame ``v`` supplies the true labels. Labels and
    predictions are both converted through
    ``h2o.as_list(..., use_pandas=True).values`` before scoring.
    """
    y_true = h2o.as_list(v[t], use_pandas=True).values
    y_scores = h2o.as_list(m.predict(v), use_pandas=True).values
    d = roc_auc_score(y_true, y_scores)
    print('AUC:', d)
    # The former ``del m`` after the return statement was unreachable and has
    # been removed.
    return d
示例#15
0
def distance_check_without_empty_strings():
    """Check Jaro-Winkler string distance with ``compare_empty=False``.

    The first two distances are compared against reference values and the
    third (whose right-hand string is empty) is expected to be NA.
    """
    left = h2o.H2OFrame.from_python(['Martha', 'Dwayne', 'Dixon'], column_types=['factor'])
    right = h2o.H2OFrame.from_python(['Marhta', 'Duane', ''], column_types=['string'])
    dist = left.strdistance(right, measure="jw", compare_empty=False)

    # Compare only the first two values; the last pair has an empty string.
    listed = h2o.as_list(dist, use_pandas=False, header=False)
    observed = [float(row[0]) for row in listed[0:2]]
    tst.assert_allclose(observed, [0.961111, 0.84], atol=0.001)

    # The last value must be NA.
    na_flags = h2o.as_list(dist.isna(), use_pandas=False, header=False)
    assert na_flags == [['0'], ['0'], ['1']]
示例#16
0
def deep_1(
        K, dfs, dfs_collector, test,
        test_collector
):
    """Train an H2O deep-learning model per fold and collect its predictions.

    For each of the ``K`` folds, a model is trained on the concatenation of
    all other folds in ``dfs``. Its predictions for fold ``i`` are stored in
    ``dfs_collector[i]['deep_1']`` and its predictions for ``test`` are
    accumulated; their average over the folds is stored in
    ``test_collector['deep_1']``.

    Args:
        K: number of folds.
        dfs: indexable collection of ``K`` pandas frames, one per fold.
        dfs_collector: indexable collection of ``K`` frames receiving the
            out-of-fold predictions.
        test: frame scored by every fold's model.
        test_collector: frame receiving the averaged test prediction.

    Returns:
        The tuple ``(dfs_collector, test_collector, 'deep_1')``.
    """
    r = 'deep_1'

    # Build a private feature list without the label column. Removing
    # 'target' from the shared ``on_top2`` list inside the fold loop would
    # mutate that module-level list and raise ValueError on the second fold.
    features = [name for name in on_top2 if name != 'target']
    val_hf = h2o.H2OFrame(test)
    v = np.zeros(shape=[len(test)])
    for i in range(K):
        print()
        print('in model:', r, ' k-fold:', i + 1, '/', K)
        print()
        # Train on every fold except the i-th, which is held out.
        dt = pd.concat([dfs[j] for j in range(K) if j != i])
        train_hf = h2o.H2OFrame(dt)
        del dt
        dfs_i = h2o.H2OFrame(dfs[i])

        print('- ' * 10)
        for name in features:
            print("'{}',".format(name))
        print('- ' * 10)
        model = H2ODeepLearningEstimator(hidden=[200,200], epochs=500)
        model.train(x=features,
                    y='target',
                    training_frame=train_hf)
        del train_hf
        p = model.predict(dfs_i)
        dfs_collector[i][r] = h2o.as_list(p, use_pandas=True).values
        print(dfs_collector[i].head())
        print(dfs_collector[i].head().dtypes)
        q = model.predict(val_hf)

        dd = h2o.as_list(q, use_pandas=True)
        # Accumulate this fold's test predictions; averaged after the loop.
        v += dd['predict'].values
        print('# ' * 10)
        for show_v in range(5):
            print(v[show_v])
        print('# ' * 10)

    test_collector[r] = v / K
    print(test_collector.head())
    return dfs_collector, test_collector, r
示例#17
0
def glrm_nnmf():
    """Check GLRM with non-negative regularization on X and Y.

    Trains on the product of two random uniform matrices, then asserts that
    the fitted X and Y are non-negative, that the reported objective equals
    the sum of squared errors of X.Y against the training data, that
    predictions equal X.Y, and that the reported numeric/categorical error
    metrics are consistent with the objective.

    Written with single-argument ``print(...)`` calls and ``list(zip(...))``
    so that it runs on both Python 2 and Python 3.
    """
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    # list(...) keeps the argument a concrete list on Python 3, where zip()
    # returns an iterator.
    train_h2o = h2o.H2OFrame.fromPython(list(zip(*train.tolist())))

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(n, k)
    initial_y_h2o = h2o.H2OFrame.fromPython(initial_y.tolist())

    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="NonNegative",
        regularization_y="NonNegative",
        gamma_x=1,
        gamma_y=1,
    )
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    # The first cell of every archetype row is skipped before conversion.
    fit_y = glrm_h2o._model_json["output"]["archetypes"].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_x = h2o.get_frame(glrm_h2o._model_json["output"]["representation_name"])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json["output"]["objective"]
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["numerr"]
    glrm_caterr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["caterr"]
    assert abs(glrm_numerr - glrm_obj) < 1e-3, (
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    )
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_unitonesparse():
    """Check GLRM with unit one-sparse regularization on X.

    Trains on the product of a random matrix of basis-vector rows and a random
    uniform matrix, then asserts that every row of the fitted X is a basis
    vector, that the reported objective equals the sum of squared errors of
    X.Y, that predictions equal X.Y, and that the reported error metrics are
    consistent with the objective.
    """
    m, n, k = 1000, 100, 10

    def one_hot_row(width):
        # A row of zeros with a single 1 at a random position.
        row = [0] * width
        row[np.random.randint(0,width)] = 1
        return row

    def is_basis(a):
        zero_count = np.where(a == 0)[0].size
        one_count = np.where(a == 1)[0].size
        basis = one_count == 1 and (zero_count + one_count) == k
        assert basis, "Got " + str(one_count) + " ones and " + str(zero_count) + " zeros, but expected all zeros except a single 1"
        return basis

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k,n)
    X = np.array([one_hot_row(k) for _ in range(m)])
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="UnitOneSparse",
        regularization_y="None",
        gamma_x=1,
        gamma_y=0,
    )
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    # is_basis asserts internally, so a bad row fails the test right here.
    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    # The first cell of every archetype row is skipped before conversion.
    archetype_rows = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]] for row in archetype_rows])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    metrics = glrm_h2o._model_json['output']['training_metrics']._metric_json
    glrm_numerr = metrics['numerr']
    glrm_caterr = metrics['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def vec_as_list():
    """Check ``h2o.as_list(..., use_pandas=False)`` on single-column frames.

    Verifies selected values of the first iris column and of the first column
    of ``2 - iris``.
    """
    def cell_is(table, first, second, expected):
        # Cells are converted with float() before the tolerance comparison.
        return abs(float(table[first][second]) - expected) < 1e-10

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    raw_column = h2o.as_list(iris[0], use_pandas=False)
    assert (
        cell_is(raw_column, 0, 4, 4.6)
        and cell_is(raw_column, 0, 6, 5.4)
        and cell_is(raw_column, 0, 10, 4.9)
    ), "incorrect values"

    expr = 2 - iris
    expr_column = h2o.as_list(expr[0], use_pandas=False)
    assert (
        cell_is(expr_column, 0, 4, -2.6)
        and cell_is(expr_column, 0, 18, -3.1)
        and cell_is(expr_column, 0, 25, -2.8)
    ), "incorrect values"
def glrm_simplex():
    """Check GLRM with simplex (quadratic mixtures) regularization on X.

    Trains on the product of a random matrix of basis-vector rows and a random
    uniform matrix, then asserts that every row of the fitted X sums to 1,
    that the reported objective equals the sum of squared errors of X.Y, that
    predictions equal X.Y, and that the reported error metrics are consistent
    with the objective.
    """
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k,n)
    def ind_list(k):
        # A row of zeros with a single 1 at a random position.
        tmp = [0] * k
        tmp[np.random.randint(0,k)] = 1
        return tmp
    X = np.array([ind_list(k) for _ in range(m)])
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with quadratic mixtures (simplex) regularization on X")
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="Simplex", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.train(x=train_h2o.names,training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows within standard probability simplex")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        # str() is required: concatenating the numeric sum directly would
        # raise TypeError and hide the real assertion failure.
        assert simplex, "Got sum over row = " + str(row_sum) + ", but expected 1"
        return simplex
    # is_simplex asserts internally, so a bad row fails the test right here.
    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print("Check final objective function value")
    # The first cell of every archetype row is skipped before conversion.
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_unitonesparse():
    """Check GLRM with unit one-sparse regularization on X.

    Trains on the product of a random matrix of basis-vector rows and a random
    uniform matrix, then asserts that every row of the fitted X is a basis
    vector, that the reported objective equals the sum of squared errors of
    X.Y, that predictions equal X.Y, and that the reported error metrics are
    consistent with the objective.

    Written with single-argument ``print(...)`` calls, ``range`` and
    ``list(zip(...))`` so that it runs on both Python 2 and Python 3.
    """
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k,n)
    def ind_list(k):
        # A row of zeros with a single 1 at a random position.
        tmp = [0] * k
        tmp[np.random.randint(0,k)] = 1
        return tmp
    X = np.array([ind_list(k) for _ in range(m)])
    train = np.dot(X,Y)
    # list(...) keeps the argument a concrete list on Python 3, where zip()
    # returns an iterator.
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="UnitOneSparse", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis
    # is_basis asserts internally, so a bad row fails the test right here.
    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    # The first cell of every archetype row is skipped before conversion.
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
示例#22
0
def vec_as_list():
    """Verify list conversion (without pandas) of single-column H2O frames.

    Checks three values of the first iris column, then three values of the
    first column of ``2 - iris``.
    """
    tolerance = 1e-10
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    res = h2o.as_list(iris[0], use_pandas=False)
    expected_cells = ((4, 4.6), (6, 5.4), (10, 4.9))
    assert all(abs(float(res[0][idx]) - want) < tolerance
               for idx, want in expected_cells), "incorrect values"

    res = 2 - iris
    res = h2o.as_list(res[0], use_pandas=False)
    expected_cells = ((4, -2.6), (18, -3.1), (25, -2.8))
    assert all(abs(float(res[0][idx]) - want) < tolerance
               for idx, want in expected_cells), "incorrect values"
示例#23
0
def compare_frames(expected, actual):
    """Assert that two frames agree in shape, column names, column types and values.

    :param expected: the reference frame.
    :param actual: the frame under test.
    :raises AssertionError: on the first mismatch that is found.
    """
    assert actual.shape == expected.shape
    # The message used to reference an undefined name, which turned a column
    # mismatch into a NameError instead of a readable assertion failure.
    assert actual.columns == expected.columns, \
        "Columns differ: %r vs %r" % (actual.columns, expected.columns)
    for i, colname in enumerate(actual.columns):
        t1 = expected.types[colname]
        t2 = actual.types[colname]
        assert t1 == t2, ("Bad types %s: expected %s, got %s" % (colname, t1, t2))
        # Values are compared through the string form of the downloaded columns.
        s1 = str(h2o.as_list(expected[colname]))
        s2 = str(h2o.as_list(actual[colname]))
        assert s1 == s2, ("bad values: expected[%d] = %r, actual[%d] = %r"
                          % (i, s1, i, s2))
示例#24
0
def distance_check_without_empty_strings():
    """Check "jw" string distances when empty strings are excluded from comparison.

    The first two pairs must match the reference distances; the third pair,
    whose right-hand value is the empty string, must come back as NA.
    """
    left = h2o.H2OFrame.from_python(['Martha', 'Dwayne', 'Dixon'],
                                    column_types=['factor'])
    right = h2o.H2OFrame.from_python(['Marhta', 'Duane', ''],
                                     column_types=['string'])
    dist = left.strdistance(right, measure="jw", compare_empty=False)

    rows = h2o.as_list(dist, use_pandas=False, header=False)
    # Only the first two rows carry a distance; the last one is checked as NA below.
    observed = [float(row[0]) for row in rows[0:2]]
    tst.assert_allclose(observed, [0.961111, 0.84], atol=0.001)

    na_flags = h2o.as_list(dist.isna(), use_pandas=False, header=False)
    assert na_flags == [['0'], ['0'], ['1']]
def svd_1_golden():
    """Compare H2O's SVD of USArrests against hard-coded reference values from R."""

    def fetch_rows(output_key):
        # Download the frame named in the model output and drop its header row.
        frame_name = fitH2O._model_json["output"][output_key]["name"]
        rows = h2o.as_list(h2o.get_frame(frame_name), use_pandas=False)
        rows.pop(0)
        return rows

    def check_vectors(reference, computed):
        # Absolute values are compared, so sign differences are ignored.
        for ref_row, h2o_row in zip(reference, computed):
            for r, h in zip(ref_row, h2o_row):
                assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")
    from h2o.transforms.decomposition import H2OSVD

    fitH2O = H2OSVD(nv=4, transform="NONE", max_iterations=2000)
    fitH2O.train(x=list(range(4)), training_frame=arrestsH2O)

    print("Compare singular values (D)")
    h2o_d = fitH2O._model_json["output"]["d"]
    r_d = [1419.06139509772, 194.825846110138, 45.6613376308754, 18.0695566224677]
    print("R Singular Values: {0}".format(r_d))
    print("H2O Singular Values: {0}".format(h2o_d))
    for r, h in zip(r_d, h2o_d):
        assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare right singular vectors (V)")
    h2o_v = fetch_rows("v_key")
    r_v = [
        [-0.04239181, 0.01616262, -0.06588426, 0.99679535],
        [-0.94395706, 0.32068580, 0.06655170, -0.04094568],
        [-0.30842767, -0.93845891, 0.15496743, 0.01234261],
        [-0.10963744, -0.12725666, -0.98347101, -0.06760284],
    ]
    print("R Right Singular Vectors: {0}".format(r_v))
    print("H2O Right Singular Vectors: {0}".format(h2o_v))
    check_vectors(r_v, h2o_v)

    print("Compare left singular vectors (U)")
    h2o_u = fetch_rows("u_key")
    # Only the first six reference rows are listed; zip stops at the shorter input.
    r_u = [
        [-0.1716251, 0.096325710, 0.06515480, 0.15369551],
        [-0.1891166, 0.173452566, -0.42665785, -0.17801438],
        [-0.2155930, 0.078998111, 0.02063740, -0.28070784],
        [-0.1390244, 0.059889811, 0.01392269, 0.01610418],
        [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
        [-0.1558794, -0.064555293, -0.28288280, -0.11797419],
    ]
    print("R Left Singular Vectors: {0}".format(r_u))
    print("H2O Left Singular Vectors: {0}".format(h2o_u))
    check_vectors(r_u, h2o_u)
示例#26
0
def distance_check():
    """Check "jw" string distances between three name pairs against reference values."""
    left = h2o.H2OFrame.from_python(['Martha', 'Dwayne', 'Dixon'], column_types=['factor'])
    right = h2o.H2OFrame.from_python(['Marhta', 'Duane', 'Dicksonx'], column_types=['string'])
    rows = h2o.as_list(left.strdistance(right, measure="jw"), use_pandas=False, header=False)

    observed = [float(row[0]) for row in rows]
    tst.assert_allclose(observed, [0.961111, 0.84, 0.813333], atol=0.001)
示例#27
0
def save_histogram(dataset, feature, max_value=100):
    """Plot the distribution of one column and save it as '<feature>_hist.png'.

    :param dataset: frame holding the column.
    :param feature: column name; also used as the output file prefix.
    :param max_value: upper limit of the x axis (the lower limit is 0).
    """
    sns.set()
    values = h2o.as_list(dataset[feature]).values
    axes = sns.distplot(values)
    axes.set(xlim=(0, max_value))
    axes.get_figure().savefig(feature + "_hist.png")
示例#28
0
def vec_as_list(ip, port):
    """Spot-check h2o.as_list on single iris columns, raw and after `2 - iris`.

    `ip` and `port` are accepted but not used in this body.
    """

    def near(cell, expected):
        # Cells arrive as text when use_pandas=False, hence the float() cast.
        return abs(float(cell) - expected) < 1e-10

    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    first_col = h2o.as_list(iris[0], use_pandas=False)
    assert near(first_col[4][0], 4.6) and near(first_col[6][0], 5.4) and \
        near(first_col[10][0], 4.9), "incorrect values"

    diff = 2 - iris
    diff_c0 = h2o.as_list(diff[0], use_pandas=False)
    assert near(diff_c0[4][0], -2.6) and near(diff_c0[18][0], -3.1) and \
        near(diff_c0[25][0], -2.8), "incorrect values"

    diff_c1 = h2o.as_list(diff[1], use_pandas=False)
    assert near(diff_c1[4][0], -1.1) and near(diff_c1[6][0], -1.9) and \
        near(diff_c1[10][0], -1.1), "incorrect values"
示例#29
0
def h2o_grid():
    """Grid-search a GBM on 'output/diamonds_PCA.csv' and write test predictions.

    Splits the data into train/validation/test frames, trains every
    hyper-parameter combination, prints the best model's RMSE on the held-out
    split, then scores 'output/diamonds_test_PCA.csv' and writes the
    predictions to 'output/pred_h2o.csv'.
    """
    h2o.init()
    data = h2o.import_file('output/diamonds_PCA.csv')
    # Two ratios produce three frames; the remainder becomes the third one.
    train, valid, holdout = data.split_frame(ratios=[0.7, 0.15], seed=1)

    y = 'price'
    x = list(data.columns)
    x.remove(y)

    hyper_parameters = {
        'learn_rate': [0.01, 0.1],
        'max_depth': [3, 5, 9],
        'sample_rate': [0.8, 1.0],
        'col_sample_rate': [0.2, 0.5, 1.0],
    }

    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_parameters)
    gs.train(x=x, y=y, training_frame=train, validation_frame=valid)

    # RMSE is an error metric: sort ascending so that models[0] is the model
    # with the lowest error. Sorting in decreasing order selected the worst one.
    sorted_grid = gs.get_grid(sort_by='rmse', decreasing=False)
    best_m = sorted_grid.models[0]
    best_mp = best_m.model_performance(holdout)
    print(best_mp.rmse())

    scoring_frame = h2o.import_file('output/diamonds_test_PCA.csv')
    predictions = h2o.as_list(best_m.predict(scoring_frame))
    predictions.to_csv('output/pred_h2o.csv')
示例#30
0
def sdev(ip,port):
  """Compare H2O column standard deviations on iris with numpy's sample std.

  Also checks that sd() fails on the column at index 4 and on a
  multi-column slice.
  """
  # Connect to h2o
  h2o.init(ip,port)

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
  iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                          delimiter=',',
                          skip_header=1,
                          usecols=(0, 1, 2, 3))

  # ddof=1 selects the sample standard deviation.
  expected_sd = np.std(iris_np, axis=0, ddof=1)
  for idx in range(4):
    observed_sd = h2o.as_list(iris_h2o[idx].sd())[0][0]
    assert abs(expected_sd[idx] - observed_sd) < 1e-10, "expected standard deviations to be the same"

  try:
    iris_h2o[4].sd().eager()
  except EnvironmentError:
    pass
  else:
    assert False, "expected an error. column is categorical."

  try:
    iris_h2o[0:2].sd().eager()
  except AttributeError:
    pass
  else:
    assert False, "expected an error. more than one column."
示例#31
0
def _from_frame(frame):
    """Create a float numpy array from an H2OFrame object.

    The frame is downloaded with ``use_pandas=False``. The first row (the
    header) and the first element of every remaining row are dropped before
    conversion.

    :param frame: the H2OFrame to convert.
    :return: numpy array of floats built from the remaining cells.
    """
    preds = h2o.as_list(frame, use_pandas=False)
    # Skip the header row and the leading column without mutating the download.
    body = [row[1:] for row in preds[1:]]
    # `np.float` was only an alias of the builtin and was removed in NumPy 1.24.
    return np.asarray(body, dtype=float)
示例#32
0
def convert_h2o_list(lst):
    """
    Download an H2O object into local memory via h2o.as_list

    :param lst: the H2O object to convert
    :return: whatever h2o.as_list returns for it with default arguments
    """
    local_copy = h2o.as_list(lst)
    return local_copy
def evalmodel(df):
    """Score `df` with the model stored at './model' and return the predictions as floats."""
    classifier = h2o.load_model('./model')
    rows = h2o.as_list(classifier.predict(df), use_pandas=False)
    rows.pop(0)  # drop the column header
    # Each remaining row is a list; its first element is taken as the prediction.
    return [float(row[0]) for row in rows]
示例#34
0
def numeric_quantile_bin(data_df, cal_numeric_cols, nbin=20):
    """
    Bucket numerical columns by their quantiles.

    Each listed column of ``data_df`` is replaced in place by its bucketed
    version, labelled ``<col>_1`` .. ``<col>_<nbin>``.

    :param data_df: a data frame
    :param cal_numeric_cols: numerical columns to be cut
    :param nbin: bucket number
    :return: the data frame after cutting, and a dict mapping each column to
        ``[labels, break points]``
    """
    probs = [i * 1.0 / nbin for i in range(nbin + 1)]
    numeric_bin_dict = {}
    for col in cal_numeric_cols:
        quantile_frame = data_df[col].quantile(prob=probs,
                                               combine_method=u'interpolate')
        # Column 1 of the quantile frame is read as the break values.
        raw_breaks = h2o.as_list(quantile_frame[:, 1],
                                 use_pandas=False,
                                 header=False)
        breaks = [float(cell[0]) for cell in raw_breaks]
        labels = [col + '_' + str(k + 1) for k in range(nbin)]
        data_df[col] = data_df[col].cut(breaks,
                                        labels=labels,
                                        include_lowest=True,
                                        right=True,
                                        dig_lab=3)
        numeric_bin_dict[col] = [labels, breaks]

    return data_df, numeric_bin_dict
示例#35
0
def sdev(ip, port):
    """Compare H2O column standard deviations on iris with numpy's sample std.

    Also checks that sd() fails on the column at index 4 and on a
    multi-column slice.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))

    # ddof=1 selects the sample standard deviation.
    expected_sd = np.std(iris_np, axis=0, ddof=1)
    for idx in range(4):
        observed_sd = h2o.as_list(iris_h2o[idx].sd())[0][0]
        assert abs(expected_sd[idx] - observed_sd) < 1e-10, \
            "expected standard deviations to be the same"

    try:
        iris_h2o[4].sd().eager()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    try:
        iris_h2o[0:2].sd().eager()
    except AttributeError:
        pass
    else:
        assert False, "expected an error. more than one column."
def save_histogram(dataset,feature,max_value=100):
    """Draw the distribution of `feature` and write it to '<feature>_hist.png'.

    The x axis is limited to the range 0..max_value.
    """
    sns.set()
    column_values = h2o.as_list(dataset[feature]).values
    plot_axes = sns.distplot(column_values)
    plot_axes.set(xlim=(0,max_value))
    figure = plot_axes.get_figure()
    figure.savefig(feature+"_hist.png")
示例#37
0
文件: score.py 项目: zhang-lucy/MLOps
def run(raw_data):
    """Score a table-oriented JSON payload with the module-level `model`.

    :param raw_data: JSON text in pandas "table" orientation.
    :return: the predictions serialised as table-oriented JSON.
    """
    frame = h2o.H2OFrame(pd.read_json(raw_data, orient='table'))
    # make prediction
    predictions = h2o.as_list(model.predict(frame))
    # any JSON-serializable value may be returned
    return predictions.to_json(orient="table")
示例#38
0
def check_leaderboard(aml,
                      excluded_algos,
                      expected_metrics,
                      expected_sort_metric,
                      expected_sorted_desc=False):
    """Assert that an AutoML leaderboard has the expected columns, models and ordering.

    :param aml: AutoML object exposing `leaderboard` and `_state_json`.
    :param excluded_algos: algorithm names that must not appear in any model id.
    :param expected_metrics: metric column names expected after 'model_id'.
    :param expected_sort_metric: expected sort metric, or None to skip the check.
    :param expected_sorted_desc: expected sort direction, or None to skip the check.
    """
    print("AutoML leaderboard")
    leaderboard = aml.leaderboard
    print(leaderboard)

    # check that correct leaderboard columns exist
    expected_columns = ['model_id'] + expected_metrics
    assert leaderboard.names == expected_columns, \
        "expected leaderboard columns to be {expected} but got {actual}".format(expected=expected_columns, actual=leaderboard.names)

    model_ids = list(h2o.as_list(leaderboard['model_id'])['model_id'])

    def on_leaderboard(algo):
        # An algorithm counts as present when its name occurs in any model id.
        return any(algo in model_id for model_id in model_ids)

    assert not any(on_leaderboard(algo) for algo in excluded_algos), \
        "leaderboard contains some excluded algos among {excluded}: {models}".format(excluded=excluded_algos, models=model_ids)

    included_algos = list(set(all_algos) - set(excluded_algos)) + (
        [] if 'DRF' in excluded_algos else ['XRT'])
    assert all(on_leaderboard(algo) for algo in included_algos), \
        "leaderboard is missing some algos from {included}: {models}".format(included=included_algos, models=model_ids)

    j_leaderboard = aml._state_json['leaderboard']
    if expected_sort_metric is not None:
        sort_metric = j_leaderboard['sort_metric']
        assert sort_metric == expected_sort_metric, \
            "expected leaderboard sorted by {expected} but was sorted by {actual}".format(expected=expected_sort_metric, actual=sort_metric)
    if expected_sorted_desc is not None:
        sorted_desc = j_leaderboard['sort_decreasing']
        assert sorted_desc == expected_sorted_desc, \
            "expected leaderboard sorted {expected} but was sorted {actual}".format(expected="desc" if expected_sorted_desc else "asc",
                                                                                    actual="desc" if sorted_desc else "asc")
示例#39
0
 def check_values(h2o_data, numpy_data):
     """Compare ten randomly chosen cells of an H2O frame with a numpy array.

     Uses the `row` and `col` names from the enclosing scope as the bounds
     for the random indices. Returns True only if every sampled cell matches.
     """
     all_match = True
     for _ in range(10):
         r = random.randint(0,row-1)
         c = random.randint(0,col-1)
         h2o_cell = h2o.as_list(h2o_data[r,c])[0][0]
         if not abs(h2o_cell - numpy_data[r,c]) < 1e-06:
             all_match = False
     return all_match
示例#40
0
文件: frame.py 项目: Vishnu24/h2o-3
  def hist(self, breaks="Sturges", plot=True, **kwargs):
    """
    Compute a histogram over a numeric column. If breaks=="FD", the MAD is used over the IQR in computing bin width.

    :param breaks: Can be one of the following: A string: "Sturges", "Rice", "sqrt", "Doane", "FD", "Scott." A single number for the number of breaks splitting the range of the vec into number of breaks bins of equal width. Or, A vector of numbers giving the split points, e.g., c(-50,213.2123,9324834)
    :param plot: A logical value indicating whether or not a plot should be generated (default is TRUE).
    :param kwargs: only the key 'server' is read here; when truthy, matplotlib is switched to the 'Agg' backend and plt.show() is skipped.
    :return: if plot is True, then return None, else, an self._newExpr with these columns: breaks, counts, mids_true, mids, and density
    """
    frame = self._newExpr("hist", self, breaks)
    total = frame["counts"].sum(True)
    # Density of row i: its share of the total count divided by the gap between
    # break i and break i-1. Row 0 has no previous break and gets 0 below.
    densities = [(frame[i,"counts"]/total)*(1/(frame[i,"breaks"]-frame[i-1,"breaks"])) for i in range(1,frame["counts"].nrow)]
    densities.insert(0,0)
    densities_frame = H2OFrame.fromPython(densities)
    densities_frame.set_names(["density"])
    frame = frame.cbind(densities_frame)

    if plot:
      try:
        # matplotlib is optional; when it is missing, print a hint and return None.
        imp.find_module('matplotlib')
        import matplotlib
        if 'server' in kwargs.keys() and kwargs['server']: matplotlib.use('Agg', warn=False)
        import matplotlib.pyplot as plt
      except ImportError:
        print "matplotlib is required to make the histogram plot. Set `plot` to False, if a plot is not desired."
        return

      lower = float(frame[0,"breaks"])
      clist = h2o.as_list(frame["counts"], use_pandas=False)
      # NOTE(review): calling .pop() on the result of zip() requires Python 2,
      # where zip returns a list. The two pops presumably discard the header
      # and the first data row - confirm against the as_list layout.
      clist = zip(*clist)
      clist.pop(0)
      clist.pop(0)
      mlist = h2o.as_list(frame["mids"], use_pandas=False)
      mlist = zip(*mlist)
      mlist.pop(0)
      mlist.pop(0)
      # Re-insert a leading bar: count 0 positioned at the first break value.
      counts = [float(c[0]) for c in clist]
      counts.insert(0,0)
      mids = [float(m[0]) for m in mlist]
      mids.insert(0,lower)
      plt.xlabel(self.names[0])
      plt.ylabel('Frequency')
      plt.title('Histogram of {0}'.format(self.names[0]))
      plt.bar(mids, counts)
      if not ('server' in kwargs.keys() and kwargs['server']): plt.show()

    else: return frame
示例#41
0
def gbm_1_R(K, dfs, dfs_collector, test, test_collector):
    """Train a GBM per fold and collect out-of-fold and averaged test predictions.

    For fold ``i`` the model is trained on every entry of ``dfs`` except
    ``dfs[i]``. Its predictions for ``dfs[i]`` are stored in
    ``dfs_collector[i]['gbm_1']``, and its predictions for ``test`` are
    accumulated and finally averaged into ``test_collector['gbm_1']``.

    :param K: number of folds.
    :param dfs: per-fold pandas data frames containing a 'target' column.
    :param dfs_collector: per-fold frames receiving the out-of-fold predictions.
    :param test: pandas frame scored by every fold model.
    :param test_collector: frame receiving the averaged test prediction.
    :return: ``(dfs_collector, test_collector, 'gbm_1')``
    """
    r = 'gbm_1'

    val_hf = h2o.H2OFrame(test)
    ntrees = 100
    seed = 1155
    test_sum = np.zeros(shape=[len(test)])
    for fold in range(K):
        print()
        print('in model:', r, ' k-fold:', fold + 1, '/', K)
        print()
        # Train on every fold except the current one.
        training_parts = [dfs[j] for j in range(K) if j != fold]
        train_hf = h2o.H2OFrame(pd.concat(training_parts))
        fold_hf = h2o.H2OFrame(dfs[fold])

        features = list(train_hf.columns)
        features.remove('target')
        model = H2OGradientBoostingEstimator(model_id='gbm_manual',
                                             seed=seed,
                                             ntrees=ntrees,
                                             sample_rate=0.9,
                                             col_sample_rate=0.9)

        model.train(x=features, y='target', training_frame=train_hf)
        del train_hf

        fold_pred = model.predict(fold_hf)
        dfs_collector[fold][r] = h2o.as_list(fold_pred, use_pandas=True).values
        print(dfs_collector[fold].head())
        print(dfs_collector[fold].head().dtypes)

        test_pred = h2o.as_list(model.predict(val_hf), use_pandas=True)
        predicted = np.array(test_pred['predict'], dtype=pd.Series).tolist()
        test_sum += predicted

    test_collector[r] = test_sum / K
    print(test_collector.head())
    return dfs_collector, test_collector, r
示例#42
0
def get_partitioned_model_names(leaderboard):
    """Split the leaderboard's model ids into stacked ensembles and base models.

    :param leaderboard: frame with a 'model_id' column.
    :return: Namespace with `all` (every id), `se` (ids starting with
        'StackedEnsemble') and `base` (the remaining ids).
    """
    all_ids = list(h2o.as_list(leaderboard['model_id'])['model_id'])
    stacked = [name for name in all_ids if name.startswith('StackedEnsemble')]
    base = [name for name in all_ids if name not in stacked]

    model_names = Namespace()
    model_names.all = all_ids
    model_names.se = stacked
    model_names.base = base
    return model_names
示例#43
0
文件: wnvh.py 项目: tanayz/Kaggle
def ntrain():
    """Train GBM, deep learning and random forest models and write a blended submission.

    Loads weather, training and testing data through the module's helpers,
    trains three H2O models on the training features, blends their test
    predictions with weights 0.35 (deep learning), 0.3 (random forest) and
    0.35 (GBM), and writes the result to "wnvh.csv" with the columns
    'Id' and 'WnvPresent'.
    """
    h2o.init(ip="zurich.h2o.ai",strict_version_check=False)
    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)
    xd = [l.tolist() for l in X]

    y = np.asarray(y,dtype='bool_')

    xtr = H2OFrame(python_obj=xd)
    ytr = H2OFrame(python_obj=y.tolist())

    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x =xtr[1:39],y =ytr['C40'],
                distribution = "bernoulli",
                ntrees=1000, # 500 works well
                max_depth=12,
                learn_rate=0.01)

    dl = h2o.deeplearning(x =xtr[1:39],y =ytr['C40'],
                variable_importances=True,balance_classes=True,
                input_dropout_ratio=0.2,rho=0.899,
                hidden_dropout_ratios=[0.4,0.4,0.4,0.4],
                activation="Tanh",hidden=[39,325,325,1],epochs=100)

    rf = h2o.random_forest(x =xtr[1:39],y =ytr['C40'],
                seed=1234, ntrees=600,
                max_depth=20, balance_classes=False)

    testing = load_testing()
    X_test = assemble_X(testing, weather)
    # Reuse the training mean/std so test features are scaled identically.
    normalize(X_test, mean, std)

    xd = [l.tolist() for l in X_test]
    xts = H2OFrame(python_obj=xd)

    dp = dl.predict(xts)
    rp = rf.predict(xts)
    gbp = gb.predict(xts)

    # Weighted blend of the three models' predictions.
    gp = dp*0.35+rp*0.3+gbp*0.35

    gph = h2o.as_list(gp)
    # Ids run from 1 to nrow, shaped as a single column.
    Id = np.arange(gp.nrow()+1)[1:].reshape(gp.nrow(),1)
    df = pd.DataFrame(Id)
    # Attribute access `gph.True` is a SyntaxError on Python 3 because True is
    # a keyword; subscripting selects the same "True" column on both versions.
    df_concat = pd.concat([df, gph["True"]],axis=1)
    df_concat.columns=['Id','WnvPresent']
    df_concat.to_csv("wnvh.csv",index=False)
示例#44
0
    def _evaluate(request):
        """
        Score the incoming rows with the H2O model saved next to this file.

        Collects the numeric values of every row in the request, loads the
        model stored in the "kmeans_iris" directory beside this module,
        predicts on the collected rows and yields one bundle of response rows.
        :param request: an iterable sequence of bundled rows
        :return: generator yielding a single SSE.BundledRows whose rows each
            hold the first prediction column converted to int
        """
        params = []
        logging.info('_evaluate')
        logging.info('_evaluate request {}'.format(request))

        print(request)

        # Iterate over bundled rows
        for bundle in request:
            bundle_rows = bundle.rows
            print(bundle_rows)
            print(len(bundle_rows))
            logging.info('_evaluate request_rows {}'.format(bundle_rows))
            for row in bundle_rows:
                # One numeric value per dual (column) of the row
                values = [dual.numData for dual in row.duals]
                logging.info('_evaluate row {}'.format(values))
                params.append(values)

        h2o.init()

        module_dir = os.path.dirname(os.path.realpath(__file__))
        model_path = module_dir + "\\kmeans_iris"
        results = h2o.load_model(model_path)
        new_frame = h2o.H2OFrame(params)

        predicted_as_list = h2o.as_list(results.predict(new_frame),
                                        use_pandas=False)
        # Drop the header row
        predicted_as_list.pop(0)

        # One response row per prediction, each wrapping a single numeric Dual
        response_rows = [
            SSE.Row(duals=iter([SSE.Dual(numData=int(prediction[0]))]))
            for prediction in predicted_as_list
        ]

        logging.info('_evaluate params {}'.format(params))
        logging.info(
            '_evaluate predicted_as_list {}'.format(predicted_as_list))

        # Yield the row data constructed
        yield SSE.BundledRows(rows=response_rows)
示例#45
0
def cal_vars_levels_amount(data_df, factor_var):
    """
    Count the rows of each group of ``factor_var``.

    :param data_df: a data frame
    :param factor_var: column (or columns) to group by
    :return: a dict built from the rows of the group-by count frame
    """
    counts_frame = data_df.group_by(by=factor_var).count().frame
    pairs = h2o.as_list(counts_frame, use_pandas=False, header=False)
    return dict(pairs)
 def check_values(h2o_data, np_data):
     success = True
     for i in range(10):
         h2o_val = h2o.as_list(h2o_data[i,0])[0][0]
         num_val = np_data[i]
         if not abs(h2o_val - num_val) < 1e-06:
             success = False
             print "check unsuccessful! h2o computed {0} and numpy computed {1}".format(h2o_val,num_val)
     return success
示例#47
0
def as_python_test():
  """Import three sample datasets, show each, then print each via h2o.as_list."""
  dataset_paths = [
    "smalldata/iris/iris_wheader.csv",
    "smalldata/prostate/prostate.csv.zip",
    "smalldata/airlines/allyears2k.zip",
  ]
  # All imports happen first, then all shows, then all conversions.
  frames = [h2o.import_file(path=pyunit_utils.locate(p)) for p in dataset_paths]

  for frame in frames:
    frame.show()

  for frame in frames:
    print(h2o.as_list(frame))
示例#48
0
def vec_as_list(ip,port):
    """Spot-check h2o.as_list on an iris column and on H2OVecs built from `2 - iris`."""

    def near(cell, expected):
        # Cells arrive as text when use_pandas=False, hence the float() cast.
        return abs(float(cell) - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    first_col = h2o.as_list(iris[0], use_pandas=False)
    assert near(first_col[4][0], 4.6) and near(first_col[6][0], 5.4) and \
        near(first_col[10][0], 4.9), "incorrect values"

    diff = 2 - iris
    diff_c0 = h2o.as_list(H2OVec(name="C0", expr=diff[0]._expr), use_pandas=False)
    assert near(diff_c0[4][0], -2.6) and near(diff_c0[18][0], -3.1) and \
        near(diff_c0[25][0], -2.8), "incorrect values"

    diff_c1 = h2o.as_list(H2OVec(name="C1", expr=diff[1]._expr), use_pandas=False)
    assert near(diff_c1[4][0], -1.1) and near(diff_c1[6][0], -1.9) and \
        near(diff_c1[10][0], -1.1), "incorrect values"
示例#49
0
def vec_as_list(ip, port):
    """Check sample cells of as_list output for a raw column and two derived H2OVecs."""
    tolerance = 1e-10

    def cell_is(table, row_idx, expected):
        # Compare the single cell of row `row_idx` after converting it to float.
        return abs(float(table[row_idx][0]) - expected) < tolerance

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    raw = h2o.as_list(iris[0], use_pandas=False)
    assert cell_is(raw, 4, 4.6) and cell_is(raw, 6, 5.4) and cell_is(raw, 10, 4.9), \
        "incorrect values"

    shifted = 2 - iris
    vec0 = h2o.as_list(H2OVec(name="C0", expr=shifted[0]._expr), use_pandas=False)
    assert cell_is(vec0, 4, -2.6) and cell_is(vec0, 18, -3.1) and cell_is(vec0, 25, -2.8), \
        "incorrect values"

    vec1 = h2o.as_list(H2OVec(name="C1", expr=shifted[1]._expr), use_pandas=False)
    assert cell_is(vec1, 4, -1.1) and cell_is(vec1, 6, -1.9) and cell_is(vec1, 10, -1.1), \
        "incorrect values"
def vec_as_list(ip,port):
    """Spot-check default h2o.as_list output for an iris column and two H2OVecs."""

    def near(cell, expected):
        # Cells are compared directly; no float() conversion is applied here.
        return abs(cell - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    first_col = h2o.as_list(iris[0])
    assert near(first_col[3][0], 4.6) and near(first_col[5][0], 5.4) and near(first_col[9][0], 4.9), \
        "incorrect values"

    diff = 2 - iris
    diff_c0 = h2o.as_list(H2OVec(name="C0", expr=diff[0]))
    assert near(diff_c0[3][0], -2.6) and near(diff_c0[17][0], -3.1) and near(diff_c0[24][0], -2.8), \
        "incorrect values"

    diff_c1 = h2o.as_list(H2OVec(name="C1", expr=diff[1]))
    assert near(diff_c1[3][0], -1.1) and near(diff_c1[5][0], -1.9) and near(diff_c1[9][0], -1.1), \
        "incorrect values"
def vec_as_list():
    """Spot-check h2o.as_list on single iris columns, raw and after `2 - iris`."""

    def near(cell, expected):
        # Cells arrive as text when use_pandas=False, hence the float() cast.
        return abs(float(cell) - expected) < 1e-10

    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))

    first_col = h2o.as_list(iris[0], use_pandas=False)
    assert near(first_col[4][0], 4.6) and near(first_col[6][0], 5.4) and \
        near(first_col[10][0], 4.9), "incorrect values"

    diff = 2 - iris
    diff_c0 = h2o.as_list(diff[0], use_pandas=False)
    assert near(diff_c0[4][0], -2.6) and near(diff_c0[18][0], -3.1) and \
        near(diff_c0[25][0], -2.8), "incorrect values"

    diff_c1 = h2o.as_list(diff[1], use_pandas=False)
    assert near(diff_c1[4][0], -1.1) and near(diff_c1[6][0], -1.9) and \
        near(diff_c1[10][0], -1.1), "incorrect values"
def compare_frames(d1 = saving_meanImputed_fp, 
                  d2 = saving_modelImputed_fp,
                  imputed = to_impute):
  """Load two saved frames and compute summary statistics of the imputed columns.

  The defaults are module-level names, evaluated once when the function is defined.

  :param d1: path of the first saved frame (loaded as `meanI`).
  :param d2: path of the second saved frame (loaded as `modelI`).
  :param imputed: column selection applied to both frames.

  NOTE(review): the statistics computed below are neither returned nor used in
  the visible body - the function looks cut off; confirm against the full source.
  """
  print "Comparing the resulting two matrices..."
  # Load the saved frames back in
  meanI  = h2o.import_file(path = d1)
  modelI = h2o.import_file(path = d2)
  
  # Quantiles of the selected columns at nine fixed probabilities, downloaded locally
  meanIquantiles = h2o.as_list(meanI[imputed].quantile(prob=[0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]))
  modelIquantiles = h2o.as_list(modelI[imputed].quantile(prob=[0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]))
  
  # Per-column means
  meanIcolmeans = [v.mean() for v in meanI[imputed]]  
  modelIcolmeans = [v.mean() for v in modelI[imputed]]  
  
  # Per-column medians
  meanIcolmedians = [v.median() for v in meanI[imputed]]  
  modelIcolmedians = [v.median() for v in modelI[imputed]]  
  
  # Per-column minima
  meanIcolmin = [v.min() for v in meanI[imputed]]  
  modelIcolmin = [v.min() for v in modelI[imputed]]
def frame_as_list(ip,port):
    """Spot-check the first three cells of one row of as_list output for three datasets."""

    def near(cell, expected):
        return abs(cell - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))

    iris_rows = h2o.as_list(iris)
    assert near(iris_rows[8][0], 4.4) and near(iris_rows[8][1], 2.9) and near(iris_rows[8][2], 1.4), \
        "incorrect values"

    prostate_rows = h2o.as_list(prostate)
    assert near(prostate_rows[6][0], 7) and near(prostate_rows[6][1], 0) and near(prostate_rows[6][2], 68), \
        "incorrect values"

    airlines_rows = h2o.as_list(airlines)
    assert near(airlines_rows[3][0], 1987) and near(airlines_rows[3][1], 10) and near(airlines_rows[3][2], 18), \
        "incorrect values"
def vec_slicing(ip,port):
    """Check as_list output for an H2OVec built from a column and for a row slice of a column."""

    def near(cell, expected):
        return abs(cell - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    # H2OVec[int]
    diff = 2 - iris
    diff_c0 = h2o.as_list(H2OVec(name="C0", expr=diff[0]))
    assert near(diff_c0[3][0], -2.6) and near(diff_c0[17][0], -3.1) and near(diff_c0[24][0], -2.8), \
        "incorrect values"

    # H2OVec[slice]
    sliced = h2o.as_list(iris[1][12:25])
    assert near(sliced[0][0], 3.0) and near(sliced[1][0], 3.0) and near(sliced[5][0], 3.5), \
        "incorrect values"
示例#55
0
文件: frame.py 项目: Vishnu24/h2o-3
  def levels(self, col=None):
    """
    Return the factor levels of this frame, or of the selected column(s).

    :param col: A column index in this H2OFrame; None uses the whole frame
    :return: A list of lists of strings that are the factor levels for columns.
    """
    if col is None:
      target = self
    else:
      target = self._newExpr("cols", self, col)
    per_column = h2o.as_list(self._newExpr("levels", target), False)
    # The first entry of each list is the column header; drop it
    for column_levels in per_column:
      column_levels.pop(0)
    return per_column
 def check_values(h2o_data, numpy_data):
     success = True
     for i in range(10):
         r = random.randint(0,row-1)
         c = random.randint(0,col-1)
         h2o_val = h2o.as_list(h2o_data[r,c])[0][0]
         num_val = numpy_data[r,c]
         if not abs(h2o_val - num_val) < 1e-06:
             success = False
             print "check unsuccessful! h2o computed {0} and numpy computed {1}".format(h2o_val,num_val)
     return success
示例#57
0
文件: frame.py 项目: Vishnu24/h2o-3
  def level(self, col=None):
    """
    Return the factor levels of one column.

    :param col: A column index in this H2OFrame; None uses the frame itself
    :return: a list of strings that are the factor levels for the one column.
    :raises ValueError: if the selection has more than one column
    """
    if col is None:
      target = self
    else:
      target = self._newExpr("cols", self, col)
    if target.ncol > 1:
      raise ValueError("level takes only a single column")
    column_levels = h2o.as_list(self._newExpr("levels", target), False)[0]
    column_levels.pop(0)  # the first entry is the column header
    return column_levels