def test_GLM2_tweedie(self): csvFilename = "AutoClaim.csv" csvPathname = 'standard/' + csvFilename print "\nStarting", csvPathname parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') # columns start at 0 # regress: glm(CLM_AMT ~ CAR_USE + REVOLKED + GENDER + AREA + MARRIED + CAR_TYPE, data=AutoClaim, family=tweedie(1.34)) coefs = [7, 13, 20, 27, 21, 11] y = 4 ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult['destination_key'], cols=coefs, response=y) # sapply(c('CLM_AMT', 'CAR_USE', 'REVOLKED', 'GENDER', 'AREA', 'MARRIED', 'CAR_TYPE'), function(x) which(x==colnames(AutoClaim)) - 1) kwargs = { 'family': 'tweedie', 'tweedie_variance_power': 1.36, 'response': y, 'ignored_cols' : ignored_cols, 'max_iter': 10, 'lambda': 0, 'alpha': 0, 'n_folds': 0, 'beta_epsilon': 1e-4, } glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs) coefficientsExpected = {'Intercept': 0, 'GENDER.M': 0.0014842488782470984, 'CAR_TYPE.Sports Car': 0.07786742314454961, 'MARRIED.Yes': 0.0007748552195851079, 'CAR_TYPE.SUV': 0.07267702940249621, 'CAR_TYPE.Pickup': 0.04952083408742968, 'CAR_TYPE.Van': 0.026422137690691405, 'CAR_TYPE.Sedan': 0.05128350794060489, 'CAR_USE.Private': -0.03050194832853935, 'REVOLKED.Yes': -0.05095942737408699} deltaExpected = 0.05 (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None, coefficientsExpected=coefficientsExpected, deltaExpected=deltaExpected, **kwargs) print 'coefficients: %s' % (str(coefficients))
def test_GLM2_umass(self): h2o.beta_features = True csvFilenameList = [ ('cgd.dat', 'gaussian', 12, 30, None), ('chdage.dat', 'binomial', 2, 30, None), # leave out ID and birth weight ('clslowbwt.dat', 'binomial', 7, 60, [1, 2, 3, 4, 5]), ('icu.dat', 'binomial', 1, 60, None), # need to exclude col 0 (ID) and col 10 (bwt) # but -x doesn't work..so do 2:9...range doesn't work? FIX! ('lowbwt.dat', 'binomial', 1, 60, [2, 3, 4, 5, 6, 7, 8, 9]), ('lowbwtm11.dat', 'binomial', 1, 60, None), ('meexp.dat', 'gaussian', 3, 60, None), ('nhanes3.dat', 'binomial', 15, 60, None), ('pbc.dat', 'gaussian', 1, 60, None), ('pharynx.dat', 'gaussian', 12, 60, None), ('pros.dat', 'binomial', 1, 60, None), ('uis.dat', 'binomial', 8, 60, None), ] trial = 0 for i in range(3): for (csvFilename, family, y, timeoutSecs, x) in csvFilenameList: csvPathname = "logreg/umass_statdata/" + csvFilename kwargs = { 'n_folds': 3, 'response': y, 'family': family, 'alpha': 1, 'lambda': 1e-4 } parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs) if x is not None: ignored_cols = h2o_cmd.createIgnoredCols( key=parseResult['destination_key'], cols=x, response=y) kwargs['ignored_cols'] = ignored_cols start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end (w/check) on ", csvPathname, 'took', time.time( ) - start, 'seconds' trial += 1 print "\nTrial #", trial
def test_GLM2_umass(self): h2o.beta_features = True csvFilenameList = [ ('cgd.dat', 'gaussian', 12, 5, None), ('chdage.dat', 'binomial', 2, 5, None), # leave out ID and birth weight ('clslowbwt.dat', 'binomial', 7, 30, [1,2,3,4,5]), ('icu.dat', 'binomial', 1, 30, None), # need to exclude col 0 (ID) and col 10 (bwt) # but -x doesn't work..so do 2:9...range doesn't work? FIX! ('lowbwt.dat', 'binomial', 1, 30, [2,3,4,5,6,7,8,9]), ('lowbwtm11.dat', 'binomial', 1, 30, None), ('meexp.dat', 'gaussian', 3, 30, None), ('nhanes3.dat', 'binomial', 15, 30, None), ('pbc.dat', 'gaussian', 1, 30, None), ('pharynx.dat', 'gaussian', 12, 30, None), ('pros.dat', 'binomial', 1, 30, None), ('uis.dat', 'binomial', 8, 30, None), ] trial = 0 for i in range(3): for (csvFilename, family, y, timeoutSecs, x) in csvFilenameList: csvPathname = "logreg/umass_statdata/" + csvFilename kwargs = {'n_folds': 3, 'response': y, 'family': family, 'alpha': 1, 'lambda': 1e-4} parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=timeoutSecs) if x is not None: ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult['destination_key'], cols=x, response=y) kwargs['ignored_cols'] = ignored_cols start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end (w/check) on ", csvPathname, 'took', time.time() - start, 'seconds' trial += 1 print "\nTrial #", trial
def test_GLM2_umass(self): h2o.beta_features = True csvFilenameList = [ ("cgd.dat", "gaussian", 12, 5, None), ("chdage.dat", "binomial", 2, 5, None), # leave out ID and birth weight ("clslowbwt.dat", "binomial", 7, 10, [1, 2, 3, 4, 5]), ("icu.dat", "binomial", 1, 10, None), # need to exclude col 0 (ID) and col 10 (bwt) # but -x doesn't work..so do 2:9...range doesn't work? FIX! ("lowbwt.dat", "binomial", 1, 10, [2, 3, 4, 5, 6, 7, 8, 9]), ("lowbwtm11.dat", "binomial", 1, 10, None), ("meexp.dat", "gaussian", 3, 10, None), ("nhanes3.dat", "binomial", 15, 10, None), ("pbc.dat", "gaussian", 1, 10, None), ("pharynx.dat", "gaussian", 12, 10, None), ("pros.dat", "binomial", 1, 10, None), ("uis.dat", "binomial", 8, 10, None), ] trial = 0 for i in range(3): for (csvFilename, family, y, timeoutSecs, x) in csvFilenameList: csvPathname = "logreg/umass_statdata/" + csvFilename kwargs = {"n_folds": 2, "response": y, "family": family, "alpha": 1, "lambda": 1e-4} parseResult = h2i.import_parse( bucket="smalldata", path=csvPathname, schema="put", timeoutSecs=timeoutSecs ) if x is not None: ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult["destination_key"], cols=x, response=y) kwargs["ignored_cols"] = ignored_cols start = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "glm end (w/check) on ", csvPathname, "took", time.time() - start, "seconds" trial += 1 print "\nTrial #", trial