def glrm_catagorical_bug_fix():
    """Check GLRM archetypes on the categorical prostate data against stored values.

    Uploads ``smalldata/prostate/prostate_cat.csv``, trains a GLRM with
    ``k=5``, ``recover_svd=True`` and ``seed=1234`` on all columns, and
    compares the model's ``archetypes`` table with the reference table
    below using a tolerance of 1e-4.

    :raises AssertionError: if the archetype tables do not match.
    """
    print("Importing prostate.csv data...")

    # H2OTwoDimTable containing the correct archetype values run before
    # Wendy optimized memory for GLRM
    cell_values = [
        ['Arch1', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
         58.295918367346935, 8.810102040816325, 11.344897959183678, 6.285714285714286],
        ['Arch2', 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
         69.35514018691589, 7.538224299065424, 10.087757009345797, 5.6168224299065415],
        ['Arch3', 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
         64.68, 75.892, 10.812000000000001, 7.44],
        ['Arch4', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
         68.77083333333333, 13.368750000000002, 49.44583333333334, 5.9375],
        ['Arch5', 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
         69.04901960784314, 16.140196078431373, 11.510000000000005, 7.235294117647059],
    ]
    col_header = [
        'dprosboth', 'dprosleft', 'dprosnone', 'dprosright',
        'raceblack', 'racena', 'racewhite',
        'capsuleno', 'capsuleyes',
        'dcapsno', 'dcapsyes',
        'age', 'psa', 'vol', 'gleason',
    ]
    row_header = ['Arch1', 'Arch2', 'Arch3', 'Arch4', 'Arch5']
    table_header = "archetypes"
    correct_archetype = H2OTwoDimTable(cell_values=cell_values,
                                       col_header=col_header,
                                       row_header=row_header,
                                       table_header=table_header)

    prostateF = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

    # The seed is fixed so the trained archetypes can be compared with stored values.
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, recover_svd=True, seed=1234)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF)
    glrm_h2o.show()

    assert pyunit_utils.equal_2D_tables(glrm_h2o._model_json["output"]["archetypes"]._cell_values,
                                        correct_archetype._cell_values, tolerance=1e-4), \
        "GLRM model archetypes generated from current model are not correct."
def glrm_arrests():
    """Compare the importance tables of PCA and GLRM trained on USArrests.

    A 4-component PCA and a rank-4 GLRM (quadratic loss, no regularization,
    SVD initialization, ``recover_svd=True``) are both trained on the
    standardized data. Their ``importance`` tables must agree within 1e-4.

    :raises AssertionError: if the two importance tables differ.
    """
    print("Importing USArrests.csv data...")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests_frame = h2o.upload_file(arrests_path)

    # Reference model: PCA on the first four columns.
    pca_model = H2OPCA(k=4, transform="STANDARDIZE")
    predictor_indices = list(range(4))
    pca_model.train(x=predictor_indices, training_frame=arrests_frame)
    pca_model.summary()
    pca_model.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_settings = dict(
        k=4,
        transform="STANDARDIZE",
        loss="Quadratic",
        gamma_x=0,
        gamma_y=0,
        init="SVD",
        recover_svd=True,
    )
    glrm_model = H2OGeneralizedLowRankEstimator(**glrm_settings)
    glrm_model.train(x=arrests_frame.names, training_frame=arrests_frame)
    glrm_model.show()

    # compare table values and make sure they are the same between PCA and GLRM
    pca_importance = pca_model._model_json["output"]["importance"]._cell_values
    glrm_importance = glrm_model._model_json["output"]["importance"]._cell_values
    assert pyunit_utils.equal_2D_tables(pca_importance, glrm_importance, tolerance=1e-4), \
        "PCA and GLRM variance metrics do not agree. Fix it please."

    sys.stdout.flush()
def glrm_catagorical_bug_fix():
    """Verify that GLRM on ``prostate_cat.csv`` reproduces the stored archetypes.

    The frame is uploaded, a GLRM (``k=5``, ``recover_svd=True``,
    ``seed=1234``) is trained on every column, and the resulting
    ``archetypes`` table is compared with the hard-coded reference table
    at a tolerance of 1e-4.

    :raises AssertionError: if the trained archetypes differ from the reference.
    """
    print("Importing prostate.csv data...")

    # H2OTwoDimTable containing the correct archetype values run before
    # Wendy optimized memory for GLRM
    cell_values = [
        [
            'Arch1', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
            58.295918367346935, 8.810102040816325, 11.344897959183678,
            6.285714285714286,
        ],
        [
            'Arch2', 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
            69.35514018691589, 7.538224299065424, 10.087757009345797,
            5.6168224299065415,
        ],
        [
            'Arch3', 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
            64.68, 75.892, 10.812000000000001, 7.44,
        ],
        [
            'Arch4', 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0,
            68.77083333333333, 13.368750000000002, 49.44583333333334, 5.9375,
        ],
        [
            'Arch5', 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0,
            69.04901960784314, 16.140196078431373, 11.510000000000005,
            7.235294117647059,
        ],
    ]
    col_header = [
        'dprosboth', 'dprosleft', 'dprosnone', 'dprosright', 'raceblack',
        'racena', 'racewhite', 'capsuleno', 'capsuleyes', 'dcapsno',
        'dcapsyes', 'age', 'psa', 'vol', 'gleason',
    ]
    row_header = ['Arch1', 'Arch2', 'Arch3', 'Arch4', 'Arch5']
    table_header = "archetypes"
    correct_archetype = H2OTwoDimTable(cell_values=cell_values,
                                       col_header=col_header,
                                       row_header=row_header,
                                       table_header=table_header)

    prostateF = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))

    # A fixed seed lets the trained archetypes be compared with stored values.
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=5, recover_svd=True, seed=1234)
    glrm_h2o.train(x=prostateF.names, training_frame=prostateF)
    glrm_h2o.show()

    assert pyunit_utils.equal_2D_tables(glrm_h2o._model_json["output"]["archetypes"]._cell_values,
                                        correct_archetype._cell_values, tolerance=1e-4), \
        "GLRM model archetypes generated from current model are not correct."
def performOneTest(frameWithNA, frameWithoutNA, interactionColumn, xcols, standard=True):
    """Train the same binomial GLM on two inputs and compare coefficient tables.

    Each input is converted to an H2OFrame twice (training and validation
    copies) with the string "UNKNOWN" parsed as NA. One GLM is trained per
    input with identical settings, and their ``coefficients_table`` outputs
    are compared.

    :param frameWithNA: data passed to ``h2o.H2OFrame`` for the first model.
    :param frameWithoutNA: data passed to ``h2o.H2OFrame`` for the second model.
    :param interactionColumn: forwarded as the GLM ``interactions`` argument.
    :param xcols: predictor columns; the response column is always ``'label'``.
    :param standard: forwarded as the GLM ``standardize`` argument.
    :raises AssertionError: if equal-length coefficient tables do not match.
        Tables of different lengths are checked by ``assert_arrays_equal_NA``.
    """
    # default missing value handling = meanImputation
    h2o_df_NA = h2o.H2OFrame(frameWithNA, na_strings=["UNKNOWN"])
    h2o_df_NA_Valid = h2o.H2OFrame(frameWithNA, na_strings=["UNKNOWN"])
    h2o_df = h2o.H2OFrame(frameWithoutNA, na_strings=["UNKNOWN"])
    h2o_df_valid = h2o.H2OFrame(frameWithoutNA, na_strings=["UNKNOWN"])

    # build model with and without NA in Frame
    modelNA = H2OGeneralizedLinearEstimator(family="Binomial", alpha=0, lambda_search=False,
                                            interactions=interactionColumn, standardize=standard)
    modelNA.train(x=xcols, y='label', training_frame=h2o_df_NA, validation_frame=h2o_df_NA_Valid)
    model = H2OGeneralizedLinearEstimator(family="Binomial", alpha=0, lambda_search=False,
                                          interactions=interactionColumn, standardize=standard)
    model.train(x=xcols, y='label', training_frame=h2o_df, validation_frame=h2o_df_valid)

    # extract GLM coefficients
    coef_m_NA = modelNA._model_json['output']['coefficients_table']
    coef_m = model._model_json['output']['coefficients_table']

    if len(coef_m_NA.cell_values) != len(coef_m.cell_values):
        # deal with 0 coeff for NA
        assert_arrays_equal_NA(coef_m_NA.cell_values, coef_m.cell_values)
    else:
        # equal_2D_tables only returns a boolean, so the result has to be
        # asserted; discarding it would let a mismatch pass silently.
        assert pyunit_utils.equal_2D_tables(coef_m_NA.cell_values, coef_m.cell_values), \
            "GLM coefficients from frames with and without NA do not agree."
def glrm_arrests():
    """Train PCA and GLRM on USArrests and require matching importance tables.

    Both models use ``k=4`` and the ``STANDARDIZE`` transform. The GLRM is
    configured with quadratic loss, zero ``gamma_x``/``gamma_y``, SVD
    initialization and ``recover_svd=True``. The ``importance`` tables of
    the two models are compared with a tolerance of 1e-4.

    :raises AssertionError: if the importance tables disagree.
    """
    print("Importing USArrests.csv data...")
    usarrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    )

    # PCA baseline trained on the first four columns.
    pca_estimator = H2OPCA(k=4, transform="STANDARDIZE")
    pca_estimator.train(x=list(range(4)), training_frame=usarrests)
    pca_estimator.summary()
    pca_estimator.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_estimator = H2OGeneralizedLowRankEstimator(
        k=4,
        transform="STANDARDIZE",
        loss="Quadratic",
        gamma_x=0,
        gamma_y=0,
        init="SVD",
        recover_svd=True,
    )
    glrm_estimator.train(x=usarrests.names, training_frame=usarrests)
    glrm_estimator.show()

    # compare table values and make sure they are the same between PCA and GLRM
    expected_rows = pca_estimator._model_json["output"]["importance"]._cell_values
    actual_rows = glrm_estimator._model_json["output"]["importance"]._cell_values
    assert pyunit_utils.equal_2D_tables(expected_rows, actual_rows, tolerance=1e-4), \
        "PCA and GLRM variance metrics do not agree. Fix it please."

    sys.stdout.flush()