def test_expr_rpy2(self):
    """Fuzz-compare h2o Exec expression results against R (via rpy2).

    Generates random operands (a, b, c), builds a random expression string
    with h2o_eqns.Expression, evaluates it both in h2o (h2e.exec_expr) and
    in R (robjects.r), and raises if Inf/NaN classification disagrees.
    """
    for k in range(20):
        a = random.randint(1, 10)
        # b = random.randint(49,50)
        b = random.randint(1, 10)
        c = random.randint(0, 3)
        # NOTE(review): inner loop reuses loop variable name 'k', shadowing
        # the outer index — harmless here since k is never read.
        for k in range(50):
            # Expression(a, b, c) presumably randomizes the operators each
            # call, so each inner iteration tests a fresh expression — TODO confirm.
            execExpr = "a=" + str(h2o_eqns.Expression(a, b, c)) + ";"
            (resultExec, hResult) = h2e.exec_expr(execExpr=execExpr)
            print "h2o:", hResult
            # Same expression string is valid R; take the first element of
            # the R result vector.
            rResult = robjects.r(execExpr)[0]
            print "R:", rResult
            if math.isinf(rResult):
                # covers pos/neg inf?
                # h2o reports infinities as the string 'Infinity'
                if not 'Infinity' in str(hResult):
                    raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
            elif math.isnan(rResult):
                if not 'NaN' in str(hResult):
                    raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
            elif 'Infinity' in str(hResult) or 'NaN' in str(hResult):
                # R got a finite number but h2o got Inf/NaN: mismatch
                raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
            else:
                # skip Inf
                # don't do logicals..h2o 1/0, R True/False
                # NOTE(review): approxEqual's return value is ignored here;
                # if it returns a bool (rather than raising), a numeric
                # mismatch is silently dropped — verify against h2o_util.
                h2o_util.approxEqual(
                    rResult, hResult, tol=1e-12,
                    msg='mismatch h2o/R expression result')
def test_expr_rpy2(self):
    """Fuzz-compare h2o Exec results against R (rpy2) with beta features on.

    Same scheme as the non-beta variant: random operands feed a random
    h2o_eqns.Expression, evaluated in both h2o and R; Inf/NaN handling
    must agree, otherwise an Exception is raised.
    """
    h2o.beta_features = True
    for k in range(20):
        a = random.randint(1, 10)
        # b = random.randint(49,50)
        b = random.randint(1, 10)
        c = random.randint(0, 3)
        # NOTE(review): inner 'k' shadows the outer loop variable (unused).
        for k in range(50):
            execExpr = "a=" + str(h2o_eqns.Expression(a, b, c)) + ";"
            (resultExec, hResult) = h2e.exec_expr(execExpr=execExpr)
            print "h2o:", hResult
            # Evaluate the identical expression text in R
            rResult = robjects.r(execExpr)[0]
            print "R:", rResult
            if math.isinf(rResult):
                # covers pos/neg inf?
                if not "Infinity" in str(hResult):
                    raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
            elif math.isnan(rResult):
                if not "NaN" in str(hResult):
                    raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
            elif "Infinity" in str(hResult) or "NaN" in str(hResult):
                # finite in R but Inf/NaN in h2o: mismatch
                raise Exception("h2o: %s R: %s not equal" % (hResult, rResult))
            else:
                # skip Inf
                # don't do logicals..h2o 1/0, R True/False
                # NOTE(review): return value ignored; if approxEqual only
                # returns a bool, mismatches are silently dropped — verify.
                h2o_util.approxEqual(rResult, hResult, tol=1e-12,
                                     msg="mismatch h2o/R expression result")
def test_summary2_unifiles(self):
    """Check Summary2 percentiles/min/max against expected values from R.

    For each (csvFilename, hex_key, expectedCols) triple: parse the file,
    run Summary2 with MAX_QBINS bins, and compare per-column min, 25/50/75th
    percentile and max against the expected tuples within a bin-derived
    tolerance. Also cross-checks the Quantiles endpoint and (via
    h2o_summ.quantile_comparisons) a scipy/sort-based reference.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # new with 1000 bins. copy expected from R
    # Each entry: (csv, destination key, [(colname, min, 25th, 50th, 75th, max), ...])
    # A None colname means "skip this column" (e.g. string columns).
    tryList = [
        ('cars.csv', 'c.hex', [
            (None, None, None, None, None, None),
            ('economy (mpg)', None, None, None, None, None),
            ('cylinders', None, None, None, None, None),
        ],
        ),
        ('runifA.csv', 'A.hex', [
            (None, 1.00, 25.00, 50.00, 75.00, 100.0),
            ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
        ],
        ),
        # colname, (min, 25th, 50th, 75th, max)
        ('runif.csv', 'x.hex', [
            (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
            ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
            ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
            ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
        ],
        ),
        ('runifB.csv', 'B.hex', [
            (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
            # NOTE(review): '100,00' is likely a typo for 100.00 — as written
            # it makes this a 7-tuple ending in an extra 0. Verify intent.
            ('x', -100.00, -50.1, 0.974, 51.7, 100,00),
        ],
        ),
        ('runifC.csv', 'C.hex', [
            (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
            ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
        ],
        ),
    ]
    timeoutSecs = 15
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    timeoutSecs = 60  # overrides the 15 above
    for (csvFilename, hex_key, expectedCols) in tryList:
        csvPathname = csvFilename
        csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
            schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]
        # okay to get more cols than we want
        # okay to vary MAX_QBINS because we adjust the expected accuracy
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
        summaries = summaryResult['summaries']
        # scipyCol tracks the file column index for the scipy comparison,
        # counting skipped (None-named) columns too.
        scipyCol = 0
        for expected, column in zip(expectedCols, summaries):
            colname = column['colname']
            if expected[0]:
                # NOTE(review): the trailing ", colname, expected[0]" makes
                # this a tuple expression; only the assertEqual call has any
                # effect. Probably meant as a msg argument — verify.
                self.assertEqual(colname, expected[0]), colname, expected[0]
            else:
                # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                scipyCol += 1
                continue
            quantile = 0.5 if DO_MEDIAN else .999
            # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2,
                interpolation_type=7)  # for comparing to summary2
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)
            # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            print stattype
            # FIX! we should compare mean and sd to expected?
            # enums don't have mean or sd?
            if stattype != 'Enum':
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)
                pct = stats['pct']
                print "pct:", pct
                print ""
                # the thresholds h2o used, should match what we expected
                # NOTE(review): expectedPct is assigned but never compared below.
                expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
            # assumes non-skipped columns always have 'pctile' — TODO confirm
            # that Enum columns never reach here with expected[0] set.
            pctile = stats['pctile']
            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?
            else:
                print "Test won't calculate max expected error"
                maxErr = 0
            # hack..assume just one None is enough to ignore for cars.csv
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr,
                    msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr,
                    msg='max is not approx. expected')
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']
            # NOTE(review): this loop only computes 'e'; the real check is
            # commented out, so it's currently a no-op.
            for b in hcnt:
                # should we be able to check for a uniform distribution in the files?
                e = .1 * numRows
                # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))
            if stattype != 'Enum':
                pt = h2o_util.twoDecimals(pctile)
                print "colname:", colname, "pctile (2 places):", pt
                mx = h2o_util.twoDecimals(maxs)
                mn = h2o_util.twoDecimals(mins)
                print "colname:", colname, "maxs: (2 places):", mx
                print "colname:", colname, "mins: (2 places):", mn
                # FIX! we should do an exec and compare using the exec quantile too
                actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                print "min/25/50/75/max colname:", colname, "(2 places):", actual
                print "maxs colname:", colname, "(2 places):", mx
                print "mins colname:", colname, "(2 places):", mn
                # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                # need to ignore the car names
                if colname != '' and expected[scipyCol]:
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # FIX! ignore for now
                        h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        h2oSummary2MaxErr=maxErr,
                        )
                    # debugging hook, disabled by the leading False
                    if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'):
                        raise Exception("stopping to look")
            scipyCol += 1
        trial += 1
def test_GLM1_GLM2_predict(self):
    """Train the same model with GLM1 and GLM2 and compare results.

    Runs GLM1 (beta_features off) and GLM2 (beta_features on) on the same
    dataset (iris by default; covtype variants are disabled via if 1==0),
    compares iterations, coefficients, intercept and (binomial) AUC within
    loose tolerances, then predicts on the training data and checks the
    misclassification rate against expectedPctWrong.
    """
    # h2b.browseTheCloud()
    h2o.beta_features = False
    SYNDATASETS_DIR = h2o.make_syn_dir()
    trees = 15
    timeoutSecs = 120
    predictHexKey = 'predict_0.hex'
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'
    # Disabled dataset configuration (covtype, full)
    if 1==0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        y = 54
        expectedPctWrong = 0
    # Disabled dataset configuration (covtype, 10% shuffle)
    if 1==0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        y = 54
        expectedPctWrong = 0
    # Active dataset configuration (iris)
    if 1==1:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'smalldata'
        # no header
        csvPathname = 'iris/iris.csv'
        hexKey = 'iris.hex'
        y = 4
        expectedPctWrong = 26
    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname, schema='put', returnFullPath=True)
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
    h2o_cmd.runSummary(key=hexKey)
    # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
    trainKey = parseResult['destination_key']
    # just to check. are there any NA/constant cols?
    ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
    #**************************************************************************
    # first glm1
    h2o.beta_features = False
    CLASS = 1
    # try ignoring the constant col to see if it makes a diff
    kwargs = {
        'lsm_solver': LSM_SOLVER,
        'standardize': STANDARDIZE,
        'y': 'C' + str(y+1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON,
        'case': CLASS,
        'case_mode': '=',
    }
    timeoutSecs = 120
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
    glm['GLMModel']['GLMParams']['family'] = FAMILY
    print "glm1 end on ", csvPathname, 'took', time.time() - start, 'seconds'
    (warnings, coefficients1, intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    # Capture GLM1 reference numbers for the later comparison
    iterations1 = glm['GLMModel']['iterations']
    err1 = glm['GLMModel']['validations'][0]['err']
    nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
    resDev1 = glm['GLMModel']['validations'][0]['resDev']
    if FAMILY == 'binomial':
        classErr1 = glm['GLMModel']['validations'][0]['classErr']
        auc1 = glm['GLMModel']['validations'][0]['auc']
    #**************************************************************************
    # then glm2
    h2o.beta_features = True
    kwargs = {
        # 'ignored_cols': 'C29',
        'standardize': STANDARDIZE,
        'response': 'C' + str(y+1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON}
    timeoutSecs = 120
    # class 1=1, all else 0
    # GLM2 has no 'case' param, so build a 0/1 response column with Exec2
    if FAMILY == 'binomial':
        execExpr="B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (trainKey, y+1, trainKey, y+1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        bHack = {'destination_key': 'B.hex'}
    else:
        bHack = parseResult
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    # kwargs.update({'alpha': 0.0, 'lambda': 0})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5)
    # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs, **kwargs)
    print "glm2 end on ", csvPathname, 'took', time.time() - start, 'seconds'
    (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    #**************************************************************************
    modelKey = glm['glm_model']['_key']
    submodels = glm['glm_model']['submodels']
    # hackery to make it work when there's just one
    validation = submodels[-1]['validation']
    iteration = submodels[-1]['iteration']
    resDev = validation['residual_deviance']
    nullDev = validation['null_deviance']
    if FAMILY == 'binomial':
        auc = validation['auc']
    self.assertLess(iterations1, MAX_ITER-1,
        msg="GLM1: Too many iterations, didn't converge %s" % iterations1)
    self.assertLess(iteration, MAX_ITER-1,
        msg="GLM2: Too many iterations, didn't converge %s" % iteration)
    nullDevExpected = nullDev1
    # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2,
    #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))
    iterationExpected = iterations1
    # self.assertAlmostEqual(iteration, iterationExpected, delta=2,
    #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))
    # coefficients is a list.
    coeff0 = coefficients[0]
    coeff0Expected = coefficients1[0]
    print "coeff0 pct delta:", "%0.3f" % (100.0 * (abs(coeff0) - abs(coeff0Expected))/abs(coeff0Expected))
    self.assertTrue(h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
        msg='GLM2 coefficient 0 %s is too different from GLM1 %s' % (coeff0, coeff0Expected))
    coeff2 = coefficients[2]
    coeff2Expected = coefficients1[2]
    print "coeff2 pct delta:", "%0.3f" % (100.0 * (abs(coeff2) - abs(coeff2Expected))/abs(coeff2Expected))
    self.assertTrue(h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
        msg='GLM2 coefficient 2 %s is too different from GLM1 %s' % (coeff2, coeff2Expected))
    # compare to known values GLM1 got for class 1 case, with these parameters
    # aucExpected = 0.8428
    if FAMILY == 'binomial':
        aucExpected = auc1
        # delta=10 is effectively a smoke check, not a tight comparison
        self.assertAlmostEqual(auc, aucExpected, delta=10,
            msg='GLM2 auc %s is too different from GLM1 %s' % (auc, aucExpected))
    interceptExpected = intercept1
    print "intercept pct delta:", 100.0 * (abs(intercept) - abs(interceptExpected))/abs(interceptExpected)
    self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected, rel=0.5),
        msg='GLM2 intercept %s is too different from GLM1 %s' % (intercept, interceptExpected))
    # avg_errExpected = 0.2463
    avg_errExpected = err1
    # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected,
    #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))
    # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold,
    #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))
    #********************
    # Print comparison
    #********************
    interceptDelta = abs(abs(intercept1) - abs(intercept))
    cDelta = [abs(abs(a) - abs(b)) for a,b in zip(coefficients1, coefficients)]
    # Pretty-print one coefficient comparison row (value, pct diff, abs diff)
    def printit(self, a, b, c, d):
        pctDiff = abs(d/c)*100
        print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
            ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
        # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")
    printit(self, "intercept", "", intercept1, interceptDelta)
    print "compare lengths coefficients1, coefficients, cDelta:", len(coefficients1), len(coefficients), len(cDelta)
    print "GLM1:", coefficients1
    print "GLM2:", coefficients
    print "cDelta:", cDelta
    for i,cValue in enumerate(coefficients):
        printit(self, "coefficient", "C"+str(i), cValue, cDelta[i])
    # Predict back on the binomial-converted training frame
    hexKey = 'B.hex'
    pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey, predictHexKey,
        csvSrcOutputPathname, csvPredictPathname, skipSrcOutputHeader, skipPredictHeader,
        translate=None, y=y)
    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    self.assertAlmostEqual(pctWrong, expectedPctWrong, delta = 2.0,
        msg="predicted pctWrong: %s should be small because we're predicting with training data %s" % (pctWrong, expectedPctWrong))
def test_impute_with_na(self):
    """Insert NAs into covtype and verify Impute removes them all.

    Per trial: copy the frame, insert_missing_values with a fixed fraction,
    convert the last 6 columns to enums (ToEnum2), check the NA counts are
    as expected, then impute every (non-enum, if AVOID_BUG) column with a
    random method and assert no NAs remain and the means are unchanged
    within tolerance.
    """
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = "covtype.hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, schema='local', timeoutSecs=20)
    print "Just insert some NAs and see what happens"
    inspect = h2o_cmd.runInspect(key=hex_key)
    origNumRows = inspect['numRows']
    origNumCols = inspect['numCols']
    missing_fraction = 0.1
    # NOT ALLOWED TO SET AN ENUM COL?
    # NOTE(review): this disabled branch references enumColList before it is
    # defined (it's assigned later inside the trial loop) — would NameError
    # if ever enabled.
    if 1 == 0:
        # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec?
        # just one in row 1
        for enumCol in enumColList:
            print "hack: Putting NA in row 0 of col %s" % enumCol
            execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        print "missingValuesList after exec:", missingValuesList
        if len(missingValuesList) != len(enumColList):
            raise Exception(
                "Didn't get missing values in expected number of cols: %s %s" % (enumColList, missingValuesList))
    for trial in range(5):
        # copy the dataset
        hex_key2 = 'c.hex'
        execExpr = '%s = %s' % (hex_key2, hex_key)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
        imvResult = h2o.nodes[0].insert_missing_values(
            key=hex_key2, missing_fraction=missing_fraction, seed=SEED)
        print "imvResult", h2o.dump_json(imvResult)
        # maybe make the output col a factor column
        # maybe one of the 0,1 cols too?
        # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns.
        # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? (like Exec3)
        print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before"
        expectedMissing = missing_fraction * origNumRows  # per col
        enumColList = [49, 50, 51, 52, 53, 54]
        for e in enumColList:
            # +1: ToEnum2 uses 1-based column indexing (see note above)
            enumResult = h2o.nodes[0].to_enum(src_key=hex_key2, column_index=(e + 1))
        inspect = h2o_cmd.runInspect(key=hex_key2)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        self.assertEqual(origNumRows, numRows)
        self.assertEqual(origNumCols, numCols)
        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        print "missingValuesList", missingValuesList
        if len(missingValuesList) != numCols:
            raise Exception(
                "Why is missingValuesList not right afer ToEnum2?: %s %s" % (enumColList, missingValuesList))
        for mv in missingValuesList:
            self.assertAlmostEqual(mv, expectedMissing, delta=0.1 * mv,
                msg='mv %s is not approx. expected %s' % (mv, expectedMissing))
        summaryResult = h2o_cmd.runSummary(key=hex_key2)
        h2o_cmd.infoFromSummary(summaryResult)
        # h2o_cmd.infoFromSummary(summaryResult)
        print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
        print "trial", trial
        print "expectedMissing:", expectedMissing
        print "Now get rid of all the missing values, but imputing means. We know all columns should have NAs from above"
        print "Do the columns in random order"
        # don't do the enum cols ..impute doesn't support right?
        if AVOID_BUG:
            shuffledColList = range(0, 49)  # 0 to 48
            # drop the enum columns (keep only the first 49)
            execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
            # summaryResult = h2o_cmd.runSummary(key=hex_key2)
            # h2o_cmd.infoFromSummary(summaryResult)
            inspect = h2o_cmd.runInspect(key=hex_key2)
            numCols = inspect['numCols']
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after impute:", missingValuesList
            if len(missingValuesList) != 49:
                raise Exception(
                    "expected missing values in all cols after pruning enum cols: %s" % missingValuesList)
        else:
            shuffledColList = range(0, 55)  # 0 to 54
        origInspect = inspect
        random.shuffle(shuffledColList)
        for column in shuffledColList:
            # get a random set of column. no duplicate. random order? 0 is okay? will be []
            groupBy = random.sample(range(55), random.randint(0, 54))
            # header names start with 1, not 0. Empty string if []
            # NOTE(review): groupByNames/columnName/NEWSEED are computed but
            # never passed to impute() below — dead setup, verify intent.
            groupByNames = ",".join(map(lambda x: "C" + str(x + 1), groupBy))
            # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap
            columnName = "C%s" % (column + 1)
            print "don't use mode if col isn't enum"
            # re-roll the method until it's legal: 'mode' only works on enum cols
            badChoices = True
            while badChoices:
                method = random.choice(["mean", "median", "mode"])
                badChoices = column not in enumColList and method == "mode"
            NEWSEED = random.randint(0, sys.maxint)
            print "does impute modify the source key?"
            # we get h2o error (argument exception) if no NAs
            impResult = h2o.nodes[0].impute(source=hex_key2, column=column, method=method)
        print "Now check that there are no missing values"
        print "FIX! broken..insert missing values doesn't insert NAs in enum cols"
        inspect = h2o_cmd.runInspect(key=hex_key2)
        numRows2 = inspect['numRows']
        numCols2 = inspect['numCols']
        self.assertEqual(numRows, numRows2,
            "imput shouldn't have changed frame numRows: %s %s" % (numRows, numRows2))
        self.assertEqual(numCols, numCols2,
            "imput shouldn't have changed frame numCols: %s %s" % (numCols, numCols2))
        # check that the mean didn't change for the col
        # the enum cols with mode, we'll have to think of something else
        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        print "missingValuesList after impute:", missingValuesList
        if missingValuesList:
            raise Exception(
                "Not expecting any missing values after imputing all cols: %s" % missingValuesList)
        cols = inspect['cols']
        origCols = origInspect['cols']
        for i, (c, oc) in enumerate(zip(cols, origCols)):
            # I suppose since we impute to either median or mean, we can't assume the mean stays the same
            # but for this tolerance it's okay (maybe a different dataset, that wouldn't be true
            # NOTE(review): return value ignored — if approxEqual returns a
            # bool rather than raising, this check is a no-op. Verify.
            h2o_util.approxEqual(
                c['mean'], oc['mean'], tol=0.000000001,
                msg="col %i original mean: %s not equal to mean after impute: %s" % (i, c['mean'], oc['mean']))
def test_summary2_unifiles(self):
    """Validate Summary2 column stats against R-derived expected values.

    For each dataset in tryList: parse, run Summary2 (MAX_QBINS bins), and
    assert min/25th/50th/75th/max per column within a tolerance derived
    from the expected bin width. Cross-checks the Quantiles endpoint and a
    scipy/sort-based reference via h2o_summ.quantile_comparisons.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # new with 1000 bins. copy expected from R
    # Entries: (csv, destination key, [(colname, min, 25th, 50th, 75th, max), ...]);
    # colname None means skip that column (e.g. string cols).
    tryList = [
        (
            'cars.csv',
            'c.hex',
            [
                (None, None, None, None, None, None),
                ('economy (mpg)', None, None, None, None, None),
                ('cylinders', None, None, None, None, None),
            ],
        ),
        (
            'runifA.csv',
            'A.hex',
            [
                (None, 1.00, 25.00, 50.00, 75.00, 100.0),
                ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
            ],
        ),
        # colname, (min, 25th, 50th, 75th, max)
        (
            'runif.csv',
            'x.hex',
            [
                (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
            ],
        ),
        (
            'runifB.csv',
            'B.hex',
            [
                (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                # NOTE(review): '100, 00' looks like a typo for 100.00 — it
                # makes this a 7-tuple with a trailing 0. Verify intent.
                ('x', -100.00, -50.1, 0.974, 51.7, 100, 00),
            ],
        ),
        (
            'runifC.csv',
            'C.hex',
            [
                (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
            ],
        ),
    ]
    timeoutSecs = 15
    trial = 1
    n = h2o.nodes[0]
    lenNodes = len(h2o.nodes)
    timeoutSecs = 60  # overrides the 15 above
    for (csvFilename, hex_key, expectedCols) in tryList:
        csvPathname = csvFilename
        csvPathnameFull = h2i.find_folder_and_filename(
            'smalldata', csvPathname, returnFullPath=True)
        parseResult = h2i.import_parse(
            bucket='smalldata', path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=10, doSummary=False)
        print "Parse result['destination_key']:", parseResult[
            'destination_key']
        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvFilename
        numRows = inspect["numRows"]
        numCols = inspect["numCols"]
        # okay to get more cols than we want
        # okay to vary MAX_QBINS because we adjust the expected accuracy
        summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
        h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
        summaries = summaryResult['summaries']
        # scipyCol tracks the file column index for the scipy comparison,
        # counting skipped columns too.
        scipyCol = 0
        for expected, column in zip(expectedCols, summaries):
            colname = column['colname']
            if expected[0]:
                # NOTE(review): the trailing ", colname, expected[0]" forms
                # a throwaway tuple; only the assertEqual has effect.
                self.assertEqual(colname, expected[0]), colname, expected[0]
            else:
                # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                scipyCol += 1
                continue
            quantile = 0.5 if DO_MEDIAN else .999
            # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
            q = h2o.nodes[0].quantiles(
                source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2,
                interpolation_type=7)  # for comparing to summary2
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)
            # ('', '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            print stattype
            # FIX! we should compare mean and sd to expected?
            # enums don't have mean or sd?
            if stattype != 'Enum':
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                    mean)
                print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                    sd)
                pct = stats['pct']
                print "pct:", pct
                print ""
                # the thresholds h2o used, should match what we expected
                # NOTE(review): expectedPct is never compared below.
                expectedPct = [
                    0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95,
                    0.99
                ]
            # assumes every non-skipped column carries 'pctile' — TODO
            # confirm Enum columns never reach here with expected[0] set.
            pctile = stats['pctile']
            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange / (MAX_QBINS - 2)
                maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?
            else:
                print "Test won't calculate max expected error"
                maxErr = 0
            # hack..assume just one None is enough to ignore for cars.csv
            if expected[1]:
                h2o_util.assertApproxEqual(
                    mins[0], expected[1], tol=maxErr,
                    msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxErr,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxErr,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxErr,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(
                    maxs[0], expected[5], tol=maxErr,
                    msg='max is not approx. expected')
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']
            # NOTE(review): the real histogram check is commented out, so
            # this loop is currently a no-op.
            for b in hcnt:
                # should we be able to check for a uniform distribution in the files?
                e = .1 * numRows
                # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))
            if stattype != 'Enum':
                pt = h2o_util.twoDecimals(pctile)
                print "colname:", colname, "pctile (2 places):", pt
                mx = h2o_util.twoDecimals(maxs)
                mn = h2o_util.twoDecimals(mins)
                print "colname:", colname, "maxs: (2 places):", mx
                print "colname:", colname, "mins: (2 places):", mn
                # FIX! we should do an exec and compare using the exec quantile too
                actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                print "min/25/50/75/max colname:", colname, "(2 places):", actual
                print "maxs colname:", colname, "(2 places):", mx
                print "mins colname:", colname, "(2 places):", mn
                # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                # need to ignore the car names
                if colname != '' and expected[scipyCol]:
                    # don't do for enums
                    # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=scipyCol,
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # FIX! ignore for now
                        h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        h2oSummary2MaxErr=maxErr,
                    )
                    # debugging hook, disabled by the leading False
                    if False and h2o_util.approxEqual(
                            pctile[5], 0.990238116744, tol=0.002,
                            msg='stop here'):
                        raise Exception("stopping to look")
            scipyCol += 1
        trial += 1
def test_GLM1_GLM2_predict(self):
    """Train the same model with GLM1 and GLM2 and compare the results.

    Runs GLM1 and GLM2 on the same dataset (iris active; covtype variants
    disabled via if 1 == 0), compares iterations, coefficients, intercept
    and (binomial) AUC within loose tolerances, then predicts on the
    training data and checks misclassification against expectedPctWrong.
    """
    # h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    trees = 15
    timeoutSecs = 120
    predictHexKey = 'predict_0.hex'
    predictCsv = 'predict_0.csv'
    actualCsv = 'actual_0.csv'
    # Disabled dataset configuration (covtype, full)
    if 1 == 0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'covtype.data.hex'
        y = 54
        expectedPctWrong = 0
    # Disabled dataset configuration (covtype, 10% shuffle)
    if 1 == 0:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.shuffled.10pct.data'
        hexKey = 'covtype.shuffled.10pct.data.hex'
        y = 54
        expectedPctWrong = 0
    # Active dataset configuration (iris)
    if 1 == 1:
        skipSrcOutputHeader = 1
        skipPredictHeader = 1
        bucket = 'smalldata'
        # no header
        csvPathname = 'iris/iris.csv'
        hexKey = 'iris.hex'
        y = 4
        expectedPctWrong = 26
    csvPredictPathname = SYNDATASETS_DIR + "/" + predictCsv
    csvSrcOutputPathname = SYNDATASETS_DIR + "/" + actualCsv
    # for using below in csv reader
    csvFullname = h2i.find_folder_and_filename(bucket, csvPathname,
                                               schema='put',
                                               returnFullPath=True)
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname,
                                   schema='put', hex_key=hexKey)
    h2o_cmd.runSummary(key=hexKey)
    # do the binomial conversion with Exec2, for both training and test (h2o won't work otherwise)
    trainKey = parseResult['destination_key']
    # just to check. are there any NA/constant cols?
    ignore_x = h2o_glm.goodXFromColumnInfo(
        y, key=parseResult['destination_key'], timeoutSecs=300)
    #**************************************************************************
    # first glm1
    CLASS = 1
    # try ignoring the constant col to see if it makes a diff
    kwargs = {
        'lsm_solver': LSM_SOLVER,
        'standardize': STANDARDIZE,
        'y': 'C' + str(y + 1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON,
        'case': CLASS,
        'case_mode': '=',
    }
    timeoutSecs = 120
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs,
                         **kwargs)
    # hack. fix bad 'family' ('link' is bad too)..so h2o_glm.py works right
    glm['GLMModel']['GLMParams']['family'] = FAMILY
    print "glm1 end on ", csvPathname, 'took', time.time(
    ) - start, 'seconds'
    (warnings, coefficients1,
     intercept1) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    # Capture GLM1 reference numbers for the later comparison
    iterations1 = glm['GLMModel']['iterations']
    err1 = glm['GLMModel']['validations'][0]['err']
    nullDev1 = glm['GLMModel']['validations'][0]['nullDev']
    resDev1 = glm['GLMModel']['validations'][0]['resDev']
    if FAMILY == 'binomial':
        classErr1 = glm['GLMModel']['validations'][0]['classErr']
        auc1 = glm['GLMModel']['validations'][0]['auc']
    #**************************************************************************
    # then glm2
    kwargs = {
        # 'ignored_cols': 'C29',
        'standardize': STANDARDIZE,
        'response': 'C' + str(y + 1),
        'family': FAMILY,
        'n_folds': 0,
        'max_iter': MAX_ITER,
        'beta_epsilon': BETA_EPSILON
    }
    timeoutSecs = 120
    # class 1=1, all else 0
    # GLM2 has no 'case' param, so build a 0/1 response with Exec2
    if FAMILY == 'binomial':
        execExpr = "B.hex=%s; B.hex[,%s]=(%s[,%s]==%s)" % (
            trainKey, y + 1, trainKey, y + 1, CLASS)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
        bHack = {'destination_key': 'B.hex'}
    else:
        bHack = parseResult
    kwargs.update({'alpha': TRY_ALPHA, 'lambda': TRY_LAMBDA})
    # kwargs.update({'alpha': 0.0, 'lambda': 0})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4})
    # kwargs.update({'alpha': 0.5, 'lambda': 1e-4}) # bad model (auc=0.5)
    # kwargs.update({'alpha': 0.0, 'lambda': 0.0})
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=bHack, timeoutSecs=timeoutSecs,
                         **kwargs)
    print "glm2 end on ", csvPathname, 'took', time.time(
    ) - start, 'seconds'
    (warnings, coefficients,
     intercept) = h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
    #**************************************************************************
    modelKey = glm['glm_model']['_key']
    submodels = glm['glm_model']['submodels']
    # hackery to make it work when there's just one
    validation = submodels[-1]['validation']
    iteration = submodels[-1]['iteration']
    resDev = validation['residual_deviance']
    nullDev = validation['null_deviance']
    if FAMILY == 'binomial':
        auc = validation['auc']
    self.assertLess(iterations1, MAX_ITER - 1,
                    msg="GLM1: Too many iterations, didn't converge %s" %
                    iterations1)
    self.assertLess(iteration, MAX_ITER - 1,
                    msg="GLM2: Too many iterations, didn't converge %s" %
                    iteration)
    nullDevExpected = nullDev1
    # self.assertAlmostEqual(nullDev, nullDevExpected, delta=2,
    #     msg='GLM2 nullDev %s is too different from GLM1 %s' % (nullDev, nullDevExpected))
    iterationExpected = iterations1
    # self.assertAlmostEqual(iteration, iterationExpected, delta=2,
    #     msg='GLM2 iteration %s is too different from GLM1 %s' % (iteration, iterationExpected))
    # coefficients is a list.
    coeff0 = coefficients[0]
    coeff0Expected = coefficients1[0]
    print "coeff0 pct delta:", "%0.3f" % (
        100.0 * (abs(coeff0) - abs(coeff0Expected)) / abs(coeff0Expected))
    self.assertTrue(
        h2o_util.approxEqual(coeff0, coeff0Expected, rel=0.5),
        msg='GLM2 coefficient 0 %s is too different from GLM1 %s' %
        (coeff0, coeff0Expected))
    coeff2 = coefficients[2]
    coeff2Expected = coefficients1[2]
    print "coeff2 pct delta:", "%0.3f" % (
        100.0 * (abs(coeff2) - abs(coeff2Expected)) / abs(coeff2Expected))
    self.assertTrue(
        h2o_util.approxEqual(coeff2, coeff2Expected, rel=0.5),
        msg='GLM2 coefficient 2 %s is too different from GLM1 %s' %
        (coeff2, coeff2Expected))
    # compare to known values GLM1 got for class 1 case, with these parameters
    # aucExpected = 0.8428
    if FAMILY == 'binomial':
        aucExpected = auc1
        # delta=10 is effectively a smoke check, not a tight comparison
        self.assertAlmostEqual(
            auc, aucExpected, delta=10,
            msg='GLM2 auc %s is too different from GLM1 %s' %
            (auc, aucExpected))
    interceptExpected = intercept1
    print "intercept pct delta:", 100.0 * (
        abs(intercept) - abs(interceptExpected)) / abs(interceptExpected)
    self.assertTrue(h2o_util.approxEqual(intercept, interceptExpected,
                                         rel=0.5),
                    msg='GLM2 intercept %s is too different from GLM1 %s' %
                    (intercept, interceptExpected))
    # avg_errExpected = 0.2463
    avg_errExpected = err1
    # self.assertAlmostEqual(avg_err, avg_errExpected, delta=0.50*avg_errExpected,
    #     msg='GLM2 avg_err %s is too different from GLM1 %s' % (avg_err, avg_errExpected))
    # self.assertAlmostEqual(best_threshold, 0.35, delta=0.10*best_threshold,
    #     msg='GLM2 best_threshold %s is too different from GLM1 %s' % (best_threshold, 0.35))
    #********************
    # Print comparison
    #********************
    interceptDelta = abs(abs(intercept1) - abs(intercept))
    cDelta = [
        abs(abs(a) - abs(b)) for a, b in zip(coefficients1, coefficients)
    ]
    # Pretty-print one coefficient comparison row (value, pct diff, abs diff)
    def printit(self, a, b, c, d):
        pctDiff = abs(d / c) * 100
        print "%-20s %-20.5e %8s %5.2f%% %10s %-20.5e" % \
            ("GLM2: " + a + " " + b + ":", c, "pct. diff:", pctDiff, "abs diff:", d)
        # self.assertLess(pctDiff,1,"Expect <1% difference between H2O and R coefficient/intercept")
    printit(self, "intercept", "", intercept1, interceptDelta)
    print "compare lengths coefficients1, coefficients, cDelta:", len(
        coefficients1), len(coefficients), len(cDelta)
    print "GLM1:", coefficients1
    print "GLM2:", coefficients
    print "cDelta:", cDelta
    for i, cValue in enumerate(coefficients):
        printit(self, "coefficient", "C" + str(i), cValue, cDelta[i])
    # Predict back on the binomial-converted training frame
    hexKey = 'B.hex'
    pctWrong = h2o_rf.predict_and_compare_csvs(modelKey, hexKey,
                                               predictHexKey,
                                               csvSrcOutputPathname,
                                               csvPredictPathname,
                                               skipSrcOutputHeader,
                                               skipPredictHeader,
                                               translate=None, y=y)
    # we are predicting using training data...so error is really low
    # self.assertAlmostEqual(pctWrong, classification_error, delta = 0.2,
    #     msg="predicted pctWrong: %s should be close to training classification error %s" % (pctWrong, classification_error))
    self.assertAlmostEqual(
        pctWrong, expectedPctWrong, delta=2.0,
        msg=
        "predicted pctWrong: %s should be small because we're predicting with training data %s"
        % (pctWrong, expectedPctWrong))
def test_many_fp_formats_libsvm_2_fvec(self):
    """Parse randomly generated SVMLight (libsvm) datasets and cross-check
    h2o's per-column stats against sums remembered at generation time.

    For each (rowCount, colCount, hex_key, timeoutSecs, distribution)
    config, one floating-point text format is chosen at random out of
    h2o_util.fp_format() cases, a synthetic libsvm file is written, and
    parsed with parser_type='SVMLight'. Then:
      - parsed col/row counts must match what the generator produced
      - optionally (DO_COMPARE_SUM) exec column sums are compared
      - each column's mean is compared to generated-sum / rowCount
      - each column must have zero NAs
    """
    #h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # pick a single random fp-format case for this config
        for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2,
                    max_column_display=colNumberMax + 1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # expect generator's max col index + 1 (the output col) parsed columns
            self.assertEqual(colNumberMax + 1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs, print_params=False)
                #print "\n*************"
                #print "colResultList", colResultList
                #print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k, v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k >= 0 and k < len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0) / rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    # dump everything useful before failing, to pin down which col/format went wrong
                    execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
                    sys.stdout.flush()
                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
def test_many_fp_formats_libsvm_2_fvec(self):
    """Generate SVMLight (libsvm) files in a randomly selected fp text
    format, parse them into h2o, and verify the parsed frame.

    Checks per config: parsed row/col counts match the generator's,
    optional exec column sums (DO_COMPARE_SUM) match the sums recorded
    while writing, each column mean equals recorded-sum / rowCount, and
    no column has NAs.
    """
    #h2b.browseTheCloud()
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # (rowCount, colCount, hex_key, timeoutSecs, distribution)
    tryList = [
        (100, 10000, 'cA', 300, 'sparse50'),
        (100, 10000, 'cB', 300, 'sparse'),
        # (100, 40000, 'cC', 300, 'sparse50'),
        # (100, 40000, 'cD', 300, 'sparse'),
    ]

    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
        NUM_CASES = h2o_util.fp_format()
        # a single randomly chosen fp-format case per config
        for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            # dict of col sums for comparison to exec col sums below
            (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount,
                SEEDPERFILE, sel, distribution)

            selKey2 = hex_key + "_" + str(sel)
            print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2,
                timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'],
                max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "\n" + csvFilename

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values,
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

            if DO_SUMMARY:
                summaryResult = h2o_cmd.runSummary(key=selKey2,
                    max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # generator's max col index + 1 (the output col) should equal parsed cols
            self.assertEqual(colNumberMax+1, numCols,
                msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax+1, numCols))

            # Exec (column sums)*************************************************
            if DO_COMPARE_SUM:
                h2e.exec_zero_list(zeroList)
                colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2,
                    maxCol=colNumberMax+1, timeoutSecs=timeoutSecs, print_params=False)
                #print "\n*************"
                #print "colResultList", colResultList
                #print "*************"

            self.assertEqual(rowCount, numRows,
                msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

            # need to fix this for compare to expected
            # we should be able to keep the list of fp sums per col above
            # when we generate the dataset
            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
            print sortedColSumDict
            for k,v in sortedColSumDict.iteritems():
                print k
                if DO_COMPARE_SUM:
                    # k should be integers that match the number of cols
                    self.assertTrue(k>=0 and k<len(colResultList))
                    compare = colResultList[k]
                    print "\nComparing col sums:", v, compare
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(v, compare, places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                synMean = (v + 0.0)/rowCount
                # enums don't have mean, but we're not enums
                mean = float(inspect['cols'][k]['mean'])
                # our fp formats in the syn generation sometimes only have two places?
                if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                    # dump diagnostics for the failing column before raising
                    execExpr = 'sum(%s[,%s])' % (selKey2, k+1)
                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
                    print "Result of remembered sum on failing col:..:", k, v
                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                    print "k: ",k , "mean: ", mean, "remembered sum/rowCount : ", synMean
                    sys.stdout.flush()
                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
def test_impute_with_na(self):
    """Insert missing values into a copy of covtype and impute them away.

    Per trial: copy covtype.hex, inject ~missing_fraction NAs with
    insert_missing_values, convert cols 49-54 to enums (ToEnum2, 1-based),
    verify the NA counts are roughly as expected, then impute every
    column (mean/median/mode chosen at random) and verify no NAs remain
    and per-column means are unchanged from before imputing.
    """
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    hex_key = "covtype.hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, schema='local', timeoutSecs=20)

    print "Just insert some NAs and see what happens"
    inspect = h2o_cmd.runInspect(key=hex_key)
    origNumRows = inspect['numRows']
    origNumCols = inspect['numCols']
    missing_fraction = 0.1

    # NOT ALLOWED TO SET AN ENUM COL?
    # NOTE(review): disabled dead branch. If enabled as-is it would NameError:
    # enumColList isn't defined until inside the trial loop below.
    if 1==0:
        # since insert missing values (below) doesn't insert NA into enum rows, make it NA with exec?
        # just one in row 1
        for enumCol in enumColList:
            print "hack: Putting NA in row 0 of col %s" % enumCol
            execExpr = '%s[1, %s+1] = NA' % (hex_key, enumCol)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        print "missingValuesList after exec:", missingValuesList
        if len(missingValuesList) != len(enumColList):
            raise Exception ("Didn't get missing values in expected number of cols: %s %s" % (enumColList, missingValuesList))

    for trial in range(5):
        # copy the dataset so each trial starts from the clean parse
        hex_key2 = 'c.hex'
        execExpr = '%s = %s' % (hex_key2, hex_key)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

        imvResult = h2o.nodes[0].insert_missing_values(key=hex_key2,
            missing_fraction=missing_fraction, seed=SEED)
        print "imvResult", h2o.dump_json(imvResult)
        # maybe make the output col a factor column
        # maybe one of the 0,1 cols too?
        # java.lang.IllegalArgumentException: Method `mode` only applicable to factor columns.
        # ugh. ToEnum2 and ToInt2 take 1-based column indexing. This should really change back to 0 based for h2o-dev? (like Exec3)
        print "Doing the ToEnum2 AFTER the NA injection, because h2o doesn't work right if we do it before"
        expectedMissing = missing_fraction * origNumRows # per col
        enumColList = [49, 50, 51, 52, 53, 54]
        for e in enumColList:
            # +1: ToEnum2 uses 1-based column indexing (see note above)
            enumResult = h2o.nodes[0].to_enum(src_key=hex_key2, column_index=(e+1))

        inspect = h2o_cmd.runInspect(key=hex_key2)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        # NA injection and enum conversion must not change frame dimensions
        self.assertEqual(origNumRows, numRows)
        self.assertEqual(origNumCols, numCols)

        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        print "missingValuesList", missingValuesList
        if len(missingValuesList) != numCols:
            raise Exception ("Why is missingValuesList not right afer ToEnum2?: %s %s" % (enumColList, missingValuesList))

        # each col should have ~10% NAs; allow 10% slop
        for mv in missingValuesList:
            self.assertAlmostEqual(mv, expectedMissing, delta=0.1 * mv,
                msg='mv %s is not approx. expected %s' % (mv, expectedMissing))

        summaryResult = h2o_cmd.runSummary(key=hex_key2)
        h2o_cmd.infoFromSummary(summaryResult)
        # h2o_cmd.infoFromSummary(summaryResult)
        print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
        print "trial", trial
        print "expectedMissing:", expectedMissing

        print "Now get rid of all the missing values, but imputing means. We know all columns should have NAs from above"
        print "Do the columns in random order"

        # don't do the enum cols ..impute doesn't support right?
        if AVOID_BUG:
            shuffledColList = range(0,49) # 0 to 48
            # drop the enum cols (50-55, 1-based) from the frame entirely
            execExpr = '%s = %s[,1:49]' % (hex_key2, hex_key2)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
            # summaryResult = h2o_cmd.runSummary(key=hex_key2)
            # h2o_cmd.infoFromSummary(summaryResult)
            inspect = h2o_cmd.runInspect(key=hex_key2)
            numCols = inspect['numCols']
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList after impute:", missingValuesList
            if len(missingValuesList) != 49:
                raise Exception ("expected missing values in all cols after pruning enum cols: %s" % missingValuesList)
        else:
            shuffledColList = range(0,55) # 0 to 54

        # snapshot pre-impute stats so means can be compared afterwards
        origInspect = inspect
        random.shuffle(shuffledColList)

        for column in shuffledColList:
            # get a random set of column. no duplicate. random order? 0 is okay? will be []
            groupBy = random.sample(range(55), random.randint(0, 54))
            # header names start with 1, not 0. Empty string if []
            groupByNames = ",".join(map(lambda x: "C" + str(x+1), groupBy))

            # what happens if column and groupByNames overlap?? Do we loop here and choose until no overlap
            columnName = "C%s" % (column + 1)
            print "don't use mode if col isn't enum"
            # NOTE(review): this only rejects mode-on-non-enum; mean/median on an
            # enum col is still allowed, which h2o may reject (see the
            # IllegalArgumentException note above). Harmless when AVOID_BUG has
            # pruned the enum cols, but confirm it's intended otherwise.
            badChoices = True
            while badChoices:
                method = random.choice(["mean", "median", "mode"])
                badChoices = column not in enumColList and method=="mode"

            NEWSEED = random.randint(0, sys.maxint)
            print "does impute modify the source key?"
            # we get h2o error (argument exception) if no NAs
            impResult = h2o.nodes[0].impute(source=hex_key2, column=column, method=method)

        print "Now check that there are no missing values"
        print "FIX! broken..insert missing values doesn't insert NAs in enum cols"

        inspect = h2o_cmd.runInspect(key=hex_key2)
        numRows2 = inspect['numRows']
        numCols2 = inspect['numCols']
        self.assertEqual(numRows, numRows2,
            "imput shouldn't have changed frame numRows: %s %s" % (numRows, numRows2))
        self.assertEqual(numCols, numCols2,
            "imput shouldn't have changed frame numCols: %s %s" % (numCols, numCols2))

        # check that the mean didn't change for the col
        # the enum cols with mode, we'll have to think of something else
        missingValuesList = h2o_cmd.infoFromInspect(inspect)
        print "missingValuesList after impute:", missingValuesList
        if missingValuesList:
            raise Exception ("Not expecting any missing values after imputing all cols: %s" % missingValuesList)

        cols = inspect['cols']
        origCols = origInspect['cols']
        for i, (c, oc) in enumerate(zip(cols, origCols)):
            # I suppose since we impute to either median or mean, we can't assume the mean stays the same
            # but for this tolerance it's okay (maybe a different dataset, that wouldn't be true
            # NOTE(review): return value is ignored here — if approxEqual just
            # returns a bool this line checks nothing. Elsewhere in this suite
            # it's wrapped in self.assertTrue(...); confirm whether approxEqual
            # asserts internally, else wrap it.
            h2o_util.approxEqual(c['mean'], oc['mean'], tol=0.000000001,
                msg="col %i original mean: %s not equal to mean after impute: %s" % (i, c['mean'], oc['mean']))