def glrm_catagorical_bug_fix():
  """Check GLRM archetypes on the categorical prostate data against stored reference values.

  Uploads smalldata/prostate/prostate_cat.csv, trains a GLRM with k=5,
  recover_svd=True and seed=1234 using every column of the frame, prints the
  model, and asserts that the model's "archetypes" table matches the
  hard-coded reference rows below (pyunit_utils.equal_2D_tables, tolerance=1e-4).

  Raises:
    AssertionError: if the archetypes produced by the current model differ
      from the stored reference values.
  """
  print("Importing prostate.csv data...")

  # H2OTwoDimTable containing the correct archetype values run before Wendy optimized memory for GLRM.
  # Each row starts with the archetype label, followed by one value per entry of col_header.
  cell_values = [['Arch1', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 58.295918367346935,
                  8.810102040816325, 11.344897959183678, 6.285714285714286],
                 ['Arch2', 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 69.35514018691589, 7.538224299065424,
                  10.087757009345797, 5.6168224299065415],
                 ['Arch3', 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 64.68, 75.892, 10.812000000000001,
                  7.44],
                 ['Arch4', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 68.77083333333333, 13.368750000000002,
                  49.44583333333334, 5.9375],
                 ['Arch5', 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 69.04901960784314, 16.140196078431373,
                  11.510000000000005, 7.235294117647059]]
  col_header = ['dprosboth', 'dprosleft', 'dprosnone', 'dprosright', 'raceblack', 'racena', 'racewhite', 'capsuleno',
                'capsuleyes', 'dcapsno', 'dcapsyes', 'age', 'psa', 'vol', 'gleason']
  row_header = ['Arch1', 'Arch2', 'Arch3', 'Arch4', 'Arch5']
  table_header = "archetypes"
  correct_archetype = H2OTwoDimTable(cell_values=cell_values, col_header=col_header, row_header=row_header,
                                     table_header=table_header)

  prostateF = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

  # A fixed seed keeps the fitted archetypes comparable to the stored reference values.
  glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, recover_svd=True, seed=1234)
  glrm_h2o.train(x=prostateF.names, training_frame=prostateF)
  glrm_h2o.show()

  assert pyunit_utils.equal_2D_tables(glrm_h2o._model_json["output"]["archetypes"]._cell_values,
                                      correct_archetype._cell_values, tolerance=1e-4), \
      "GLRM model archetypes generated from current model are not correct."
def glrm_arrests():
    """Verify that PCA and GLRM report matching importance tables on USArrests.

    Uploads smalldata/pca_test/USArrests.csv, fits an H2OPCA model (k=4,
    transform="STANDARDIZE") on columns 0-3, then fits a GLRM (k=4,
    transform="STANDARDIZE", loss="Quadratic", gamma_x=0, gamma_y=0,
    init="SVD", recover_svd=True) on all columns of the frame. The
    "importance" tables of both models must agree according to
    pyunit_utils.equal_2D_tables with tolerance=1e-4.

    Raises:
        AssertionError: if the two importance tables do not agree.
    """
    print("Importing USArrests.csv data...")
    data_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests_frame = h2o.upload_file(data_path)

    # Reference model: PCA on the first four columns.
    pca_model = H2OPCA(k=4, transform="STANDARDIZE")
    pca_model.train(x=[0, 1, 2, 3], training_frame=arrests_frame)
    pca_model.summary()
    pca_model.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_settings = dict(k=4,
                         transform="STANDARDIZE",
                         loss="Quadratic",
                         gamma_x=0,
                         gamma_y=0,
                         init="SVD",
                         recover_svd=True)
    glrm_model = H2OGeneralizedLowRankEstimator(**glrm_settings)
    glrm_model.train(x=arrests_frame.names, training_frame=arrests_frame)
    glrm_model.show()

    # compare table values and make sure they are the same between PCA and GLRM
    pca_importance = pca_model._model_json["output"]["importance"]._cell_values
    glrm_importance = glrm_model._model_json["output"]["importance"]._cell_values
    assert pyunit_utils.equal_2D_tables(pca_importance, glrm_importance, tolerance=1e-4), \
        "PCA and GLRM variance metrics do not agree.  Fix it please."

    sys.stdout.flush()
def glrm_catagorical_bug_fix():
    """Check GLRM archetypes on the categorical prostate data against stored reference values.

    Uploads smalldata/prostate/prostate_cat.csv, trains a GLRM with k=5,
    recover_svd=True and seed=1234 using every column of the frame, prints
    the model, and asserts that the model's "archetypes" table matches the
    hard-coded reference rows below (pyunit_utils.equal_2D_tables,
    tolerance=1e-4).

    Raises:
        AssertionError: if the archetypes produced by the current model
            differ from the stored reference values.
    """
    print("Importing prostate.csv data...")

    # H2OTwoDimTable containing the correct archetype values run before Wendy optimized memory for GLRM.
    # Each row starts with the archetype label, followed by one value per entry of col_header.
    cell_values = [
        [
            'Arch1', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
            58.295918367346935, 8.810102040816325, 11.344897959183678,
            6.285714285714286
        ],
        [
            'Arch2', 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
            69.35514018691589, 7.538224299065424, 10.087757009345797,
            5.6168224299065415
        ],
        [
            'Arch3', 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
            64.68, 75.892, 10.812000000000001, 7.44
        ],
        [
            'Arch4', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
            68.77083333333333, 13.368750000000002, 49.44583333333334, 5.9375
        ],
        [
            'Arch5', 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
            69.04901960784314, 16.140196078431373, 11.510000000000005,
            7.235294117647059
        ],
    ]
    col_header = [
        'dprosboth', 'dprosleft', 'dprosnone', 'dprosright', 'raceblack',
        'racena', 'racewhite', 'capsuleno', 'capsuleyes', 'dcapsno',
        'dcapsyes', 'age', 'psa', 'vol', 'gleason'
    ]
    row_header = ['Arch1', 'Arch2', 'Arch3', 'Arch4', 'Arch5']
    table_header = "archetypes"
    correct_archetype = H2OTwoDimTable(cell_values=cell_values,
                                       col_header=col_header,
                                       row_header=row_header,
                                       table_header=table_header)

    prostateF = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

    # A fixed seed keeps the fitted archetypes comparable to the stored reference values.
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, recover_svd=True, seed=1234)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF)
    glrm_h2o.show()

    assert pyunit_utils.equal_2D_tables(glrm_h2o._model_json["output"]["archetypes"]._cell_values,
                                        correct_archetype._cell_values, tolerance=1e-4), \
        "GLRM model archetypes generated from current model are not correct."
# Example #4
# 0
def performOneTest(frameWithNA, frameWithoutNA, interactionColumn, xcols, standard=True):
    """Train binomial GLMs on data with and without NAs and compare their coefficients.

    Two H2OFrames are built from each input (one for training, one for
    validation), treating the string "UNKNOWN" as a missing value. A GLM
    (family="Binomial", alpha=0, lambda_search=False) is trained on each pair
    with the given interaction columns, and the resulting coefficient tables
    are compared.

    Args:
        frameWithNA: data passed to h2o.H2OFrame for the model trained with NAs.
        frameWithoutNA: data passed to h2o.H2OFrame for the model trained without NAs.
        interactionColumn: value forwarded as the GLM `interactions` argument.
        xcols: predictor columns forwarded as `x` to train(); the response is 'label'.
        standard: value forwarded as the GLM `standardize` argument.

    Raises:
        AssertionError: if the coefficient tables have the same length but
            pyunit_utils.equal_2D_tables reports that they differ.
    """
    # default missing value handling = meanImputation
    h2o_df_NA = h2o.H2OFrame(frameWithNA, na_strings=["UNKNOWN"])
    h2o_df_NA_Valid = h2o.H2OFrame(frameWithNA, na_strings=["UNKNOWN"])
    h2o_df = h2o.H2OFrame(frameWithoutNA, na_strings=["UNKNOWN"])
    h2o_df_valid = h2o.H2OFrame(frameWithoutNA, na_strings=["UNKNOWN"])
    # build model with and without NA in Frame
    modelNA = H2OGeneralizedLinearEstimator(family="Binomial", alpha=0, lambda_search=False,
                                            interactions=interactionColumn, standardize=standard)
    modelNA.train(x=xcols, y='label', training_frame=h2o_df_NA, validation_frame=h2o_df_NA_Valid)
    model = H2OGeneralizedLinearEstimator(family="Binomial", alpha=0, lambda_search=False,
                                          interactions=interactionColumn, standardize=standard)
    model.train(x=xcols, y='label', training_frame=h2o_df, validation_frame=h2o_df_valid)
    # extract GLM coefficients
    coef_m_NA = modelNA._model_json['output']['coefficients_table']
    coef_m = model._model_json['output']['coefficients_table']

    if len(coef_m_NA.cell_values) != len(coef_m.cell_values):   # deal with 0 coeff for NA
        assert_arrays_equal_NA(coef_m_NA.cell_values, coef_m.cell_values)
    else:
        # The comparison result must be asserted; a bare call would let
        # mismatching coefficients pass silently.
        assert pyunit_utils.equal_2D_tables(coef_m_NA.cell_values, coef_m.cell_values), \
            "GLM coefficients from models trained with and without NAs do not agree."
def glrm_arrests():
  """Verify that PCA and GLRM report matching importance tables on USArrests.

  Uploads smalldata/pca_test/USArrests.csv, fits an H2OPCA model (k=4,
  transform="STANDARDIZE") on columns 0-3, then fits a GLRM (k=4,
  transform="STANDARDIZE", loss="Quadratic", gamma_x=0, gamma_y=0,
  init="SVD", recover_svd=True) on all columns of the frame. The
  "importance" tables of both models must agree according to
  pyunit_utils.equal_2D_tables with tolerance=1e-4.

  Raises:
    AssertionError: if the two importance tables do not agree.
  """
  print("Importing USArrests.csv data...")
  data_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
  arrests_frame = h2o.upload_file(data_path)

  # Reference model: PCA on the first four columns.
  pca_model = H2OPCA(k=4, transform="STANDARDIZE")
  pca_model.train(x=[0, 1, 2, 3], training_frame=arrests_frame)
  pca_model.summary()
  pca_model.show()

  print("H2O GLRM on standardized data with quadratic loss:\n")
  glrm_settings = dict(k=4, transform="STANDARDIZE", loss="Quadratic", gamma_x=0, gamma_y=0,
                       init="SVD", recover_svd=True)
  glrm_model = H2OGeneralizedLowRankEstimator(**glrm_settings)
  glrm_model.train(x=arrests_frame.names, training_frame=arrests_frame)
  glrm_model.show()

  # compare table values and make sure they are the same between PCA and GLRM
  pca_importance = pca_model._model_json["output"]["importance"]._cell_values
  glrm_importance = glrm_model._model_json["output"]["importance"]._cell_values
  assert pyunit_utils.equal_2D_tables(pca_importance, glrm_importance, tolerance=1e-4), \
    "PCA and GLRM variance metrics do not agree.  Fix it please."

  sys.stdout.flush()