def test_GLM_params_rand2_newargs(self): csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') paramDict = define_params() y = 54 print "Want to see if there are constant columns" goodX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) print "goodX:", goodX # intermittent fail on the forced params? for trial in range(10 if DO_FAIL_ONLY else 20): if DO_FAIL_ONLY: params = define_params_fail() else: # params is mutable. This is default. params = {'y': y, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1} h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_newargs(self): csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_poisson_rand2(self): csvPathname = "standard/covtype.data" parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, schema="put") paramDict = define_params() for trial in range(20): params = { "response": 54, "n_folds": 3, "family": "poisson", "alpha": 0.5, "lambda": 1e-4, "beta_epsilon": 0.001, "max_iter": 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs["n_folds"] * 40) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1))) start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, "took", time.time() - start, "seconds" h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_NOPASS_GLM2_tweedie_rand2(self): h2o.beta_features = True if 1==1: csvPathname = 'standard/covtype.data' hex_key = 'covtype.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='put') else: csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') paramDict = define_params() for trial in range(10): # params is mutable. This is default. params = { 'response': 54, 'lambda': 0, 'alpha': 0, 'n_folds': 1, 'family': 'tweedie' } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 10 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM2_params_rand2(self): csvPathname = 'covtype/covtype.20k.data' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k") CLASS = 1 # make a binomial version execExpr="B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ('covtype.20k', 54+1, 54+1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'response': 54, 'alpha': 0.1, # 'lambda': 1e-4, 'lambda': 0, 'n_folds': 1, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() if 'family' not in kwargs or kwargs['family']=='binomial': bHack = {'destination_key': 'B.hex'} else: bHack = parseResult start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'num_cross_validation_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=150, parseKey=parseKey, **kwargs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds' : 3, 'family' : 'binomial', 'max_iter' : 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds']*10 + params['max_iter']*10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_8977501266014959103(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) # SEED = random.randint(0, sys.maxint) SEED = 8977501266014959103 # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'alpha': 0, 'lambda': 0, 'case': 1, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_tweedie_rand2(self): if 1 == 1: csvPathname = "standard/covtype.data" hex_key = "covtype.hex" parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, schema="put" ) else: csvPathname = "covtype/covtype.20k.data" hex_key = "covtype.20k.hex" parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, hex_key=hex_key, schema="put") paramDict = define_params() for trial in range(10): # params is mutable. This is default. params = {"y": 54, "case": 4, "case_mode": "=", "lambda": 0, "alpha": 0, "n_folds": 1, "family": "tweedie"} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, "took", time.time() - start, "seconds" print "Trial #", trial, "completed\n"
def test_GLM_params_rand2(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key="covtype.20k") # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_newargs(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') key = 'covtype.20k' parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'case': 1, 'lambda': 0, 'alpha': 0, 'n_folds': 1 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_poisson_rand2(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): params = { 'response': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*40) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_gaussian_rand2(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=120, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_gamma_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 24 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'num_cross_validation_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_4082088627997819015(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key='covtype') paramDict = define_params() for trial in range(40): # params is mutable. This is default. params = { 'y': 54, 'n_folds': 3, 'family': 'binomial', 'max_iter': 5, 'case': 1, 'alpha': 0, 'lambda': 0 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() timeoutSecs = max(150, params['n_folds'] * 10 + params['max_iter'] * 10) glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) # FIX! I suppose we have the problem of stdout/stderr not having flushed? # should hook in some way of flushing the remote node stdout/stderr h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "Trial #", trial, "completed\n"
def test_GLM_poisson_rand2(self): csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 54, 'n_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 15, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['n_folds']*40) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(5): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 5, 'n_folds': 1, 'family': "poisson", 'alpha': 0.0, 'lambda': 0, 'beta_epsilon': 0.001, 'max_iter': 3, 'standardize': 1, 'expert': 1, 'lsm_solver': 'GenGradient', } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds'] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv') print "\nParsing", csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) # need more info about the dataset for debug h2o_cmd.info_from_inspect(inspect, csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 6, 'num_cross_validation_folds': 3, 'family': "binomial", 'case_mode': ['>'], 'case': ['20'], 'alpha': 0, 'lambda': 0, 'beta_epsilon': 0.001, 'max_iter': 8 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 120 + (kwargs['num_cross_validation_folds']*30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time()-start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_file("smalldata/poisson/Goalies.csv") parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey["destination_key"]) # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(5): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { "y": 5, "n_folds": 1, "family": "poisson", "alpha": 0.0, "lambda": 0, "beta_epsilon": 0.001, "max_iter": 3, "standardize": 1, "expert": 1, "lsm_solver": "GenGradient", } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs["n_folds"] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, "took", time.time() - start, "seconds" print "Trial #", trial, "completed\n"
def test_GLM2_binomial_goalies(self): h2o.beta_features = True csvPathname = 'poisson/Goalies.csv' print "\nParsing", csvPathname parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="A.hex") inspect = h2o_cmd.runInspect(None, "A.hex") # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) case = 20 execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % (6 + 1, 6 + 1, case) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(5): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'response': 6, 'n_folds': 1, 'family': "binomial", 'alpha': 0, # seems we always need a little regularization 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 8 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds'] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult={'destination_key': 'A.hex'}, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_8977501266014959103(self): csvPathname = 'covtype/covtype.20k.data' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'alpha': 0, 'lambda': 0, 'case': 1, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_gaussian_rand2(self): csvPathname = "standard/covtype.data" parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, schema="put") paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {"response": 54, "n_folds": 3, "family": "gaussian", "alpha": 0.5, "lambda": 1e-4, "max_iter": 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, "took", time.time() - start, "seconds" h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_gaussian_rand2(self): csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'n_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=120, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_gamma_rand2(self): csvPathname = 'UCI/UCI-large/covtype/covtype.data' parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(10): # params is mutable. This is default. params = {'y': 54, 'n_folds': 3, 'family': "gamma", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 24} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=300, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_poisson_goalies_gg(self): csvPathname = 'poisson/Goalies.csv' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) paramDict = define_params() for trial in range(5): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 5, 'n_folds': 1, 'family': "poisson", 'alpha': 0.0, 'lambda': 0, 'beta_epsilon': 0.001, 'max_iter': 3, 'standardize': 1, 'expert_settings': 1, 'lsm_solver': 'GenGradient', } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds'] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_8977501266014959103(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file('smalldata/covtype/covtype.20k.data') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = {'y': 54, 'alpha': 0, 'lambda': 0, 'case': 1, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_binomial_goalies(self): h2o.beta_features = True csvPathname = 'poisson/Goalies.csv' print "\nParsing", csvPathname parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="A.hex") inspect = h2o_cmd.runInspect(None, "A.hex") # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) case = 20 execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % (6+1, 6+1, case) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'response': 6, 'n_folds': 1, 'family': "binomial", 'alpha': 0, # seems we always need a little regularization 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 8 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds']*30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult={'destination_key': 'A.hex'}, **kwargs) elapsed = time.time()-start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv') parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) # need more info about the dataset for debug info_from_inspect(inspect, csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 6, 'num_cross_validation_folds': 3, 'family': "poisson", 'alpha': 0.5, 'lambda': 1e-4, 'beta_eps': 0.001, 'max_iter': 30 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs['num_cross_validation_folds']*20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM_binomial_goalies(self): csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv') print "\nParsing", csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 6, 'n_folds': 2, 'family': "binomial", 'case_mode': '>', 'case': '20', 'alpha': 0, # seems we always need a little regularization 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 8 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds'] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_binomial_goalies(self): csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv') print "\nParsing", csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 6, 'n_folds': 2, 'family': "binomial", 'case_mode': '>', 'case': '20', 'alpha': 0, # seems we always need a little regularization 'lambda': 1e-4, 'beta_epsilon': 0.001, 'max_iter': 8 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds']*30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) elapsed = time.time()-start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_binomial_goalies(self): csvPathname = "poisson/Goalies.csv" print "\nParsing", csvPathname parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="put") inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) # need more info about the dataset for debug h2o_cmd.infoFromInspect(inspect, csvPathname) paramDict = define_params() for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { "y": 6, "n_folds": 1, "family": "binomial", "case_mode": ">", "case": "20", "alpha": 0, # seems we always need a little regularization "lambda": 1e-4, "beta_epsilon": 0.001, "max_iter": 8, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs["n_folds"] * 30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) elapsed = time.time() - start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_params_rand2_newargs(self): # csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') csvPathname = h2o.find_file("smalldata/covtype/covtype.20k.data") key = "covtype.20k" parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key) paramDict = define_params() for trial in range(50): # params is mutable. This is default. params = {"y": 54, "case": 1, "lambda": 0, "alpha": 0, "n_folds": 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=70, parseKey=parseKey, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, "took", time.time() - start, "seconds" print "Trial #", trial, "completed\n"
def test_GLM_poisson_goalies_gg(self): csvPathname = 'poisson/Goalies.csv' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put') inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) paramDict = define_params() for trial in range(5): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { 'y': 5, 'n_folds': 1, 'family': "poisson", 'alpha': 0.0, 'lambda': 0, 'beta_epsilon': 0.001, 'max_iter': 3, 'standardize': 1, 'expert_settings': 1, 'lsm_solver': 'GenGradient', } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 180 + (kwargs['n_folds']*30) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter']+1))) start = time.time() print "May not solve. Expanded categorical columns causing a large # cols, small # of rows" glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs, parseResult=parseResult, **kwargs) elapsed = time.time()-start print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) start = time.time() h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_tweedie_rand2(self): h2o.beta_features = True if 1 == 1: csvPathname = 'standard/covtype.data' hex_key = 'covtype.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='put') else: csvPathname = 'covtype/covtype.20k.data' hex_key = 'covtype.20k.hex' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put') paramDict = define_params() for trial in range(10): # params is mutable. This is default. params = { 'response': 54, 'lambda': 0, 'alpha': 0, 'n_folds': 1, 'family': 'tweedie' } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=180, parseResult=parseResult, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_lambda_search(self): h2o.beta_features = True csvPathname = "covtype/covtype.20k.data" parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="put", hex_key="covtype.20k") CLASS = 1 # make a binomial version execExpr = "B.hex=%s; B.hex[,%s]=(B.hex[,%s]==%s)" % ("covtype.20k", 54 + 1, 54 + 1, CLASS) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) paramDict = define_params() for trial in range(20): params = {} colX = h2o_glm.pickRandGlmParams(paramDict, params) # override choices with these params = { "response": 54, "alpha": 0.1, "max_iter": 8, # 'lambda': 1e-4, # 'lambda': 0, "lambda": None, "lambda_search": 1, "n_folds": 1, } kwargs = params.copy() if "family" not in kwargs or kwargs["family"] == "binomial": bHack = {"destination_key": "B.hex"} else: bHack = parseResult start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=bHack, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, "took", time.time() - start, "seconds" print "Trial #", trial, "completed\n"
def test_loop_random_param_covtype(self): csvPathname = h2o.find_dataset("UCI/UCI-large/covtype/covtype.data") parseKey = h2o_cmd.parseFile(csvPathname=csvPathname) # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) paramDict = define_params() print "\nUsing random seed:", SEED for trial in range(20): # params is mutable. This is default. # FIX! does it never end if we don't have alpha specified? params = { "y": 54, "n_folds": 3, "family": "poisson", "alpha": 0.5, "lambda": 1e-4, "beta_eps": 0.001, "max_iter": 30, } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() # make timeout bigger with xvals timeoutSecs = 60 + (kwargs["n_folds"] * 20) # or double the 4 seconds per iteration (max_iter+1 worst case?) timeoutSecs = max(timeoutSecs, (8 * (kwargs["max_iter"] + 1))) start = time.time() glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs, parseKey=parseKey, **kwargs) print "glm end on ", csvPathname, "took", time.time() - start, "seconds" h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_params_rand2(self): h2o.beta_features = True csvPathname = 'covtype/covtype.20k.data' parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', hex_key="covtype.20k") paramDict = define_params() for trial in range(20): # params is mutable. This is default. y = 54 params = {'response': y, 'case': 1, 'alpha': 0, 'lambda': 0, 'n_folds': 1} colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() case = kwargs.pop('case') execExpr = "aHack=%s; aHack[,%s] = aHack[,%s]==%s" % ('covtype.20k', y+1, y+1, case) h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=70, parseResult={'destination_key': 'aHack'}, **kwargs) # pass the kwargs with all the params, so we know what we asked for! h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) h2o.check_sandbox_for_errors() print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' print "Trial #", trial, "completed\n"
def test_GLM2_gaussian_rand2(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(20): # params is mutable. This is default. params = { 'response': 54, 'n_folds': 3, 'family': "gaussian", 'alpha': 0.5, 'lambda': 1e-4, 'max_iter': 30 } colX = h2o_glm.pickRandGlmParams(paramDict, params) kwargs = params.copy() start = time.time() glm = h2o_cmd.runGLM(timeoutSecs=300, parseResult=parseResult, **kwargs) print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o_glm.simpleCheckGLM(self, glm, None, **kwargs) print "Trial #", trial, "completed\n"
def test_GLM_params_rand2(self): importFolderPath = "covtype" csvFilename = "covtype.20k.data" hex_key = "covtype20k.hex" binomial_key = "covtype20k.b.hex" b = Key(hex_key) csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, timeoutSecs=180, doSummary=False) ## columnTypeDict = {54: 'Enum'} columnTypeDict = None parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=binomial_key, columnTypeDict=columnTypeDict, check_header=1, timeoutSecs=180, doSummary=False) # don't have to make it enum, if 0/1 (can't operate on enums like this) # make 1-7 go to 0-6. 0 isn't there. Assign(b[:, 54], b[:, 54] - 1) # make 1 thru 6 go to 1 Assign(b[:, 54], b[:, 54] != 0) # now we have just 0 and 1 pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 # loop, to see if we get same centers labelListUsed = list(labelList) numColsUsed = numCols paramDict = define_params() for trial in range(5): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? # params is mutable. This is default. parameters = { 'response_column': 'C55', 'alpha': 0.1, # 'lambda': 1e-4, 'lambda': 0, } h2o_glm.pickRandGlmParams(paramDict, parameters) if 'family' not in parameters or parameters['family'] == 'binomial': bHack = binomial_key else: bHack = hex_key co = h2o_cmd.runSummary(key=binomial_key, column=54) print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) co = h2o_cmd.runSummary(key=hex_key, column=54) print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum( co.histogram_bins) # fix stupid params fixList = [ 'alpha', 'lambda', 'ignored_columns', 'class_sampling_factors' ] for f in fixList: if f in parameters: parameters[f] = "[%s]" % parameters[f] model_key = 'rand_glm.hex' bmResult = h2o.n0.build_model(algo='glm', model_id=model_key, training_frame=bHack, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowNaN=True) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') # FIX! when is this legal doClassification = False if doClassification: mcms = OutputObj( {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms') m1 = mcms.data[1:] h0 = mcms.data[0] print "\nmcms", tabulate(m1, headers=h0) if doClassification: thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms') cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms') if 1 == 0: print "" for i, c in enumerate(cmms.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
def test_GLM_params_rand2(self): importFolderPath = "covtype" csvFilename = "covtype.20k.data" hex_key = "covtype20k.hex" binomial_key = "covtype20k.b.hex" b = Key(hex_key) csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, timeoutSecs=180, doSummary=False) ## columnTypeDict = {54: 'Enum'} columnTypeDict = None parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=binomial_key, columnTypeDict=columnTypeDict, check_header=1, timeoutSecs=180, doSummary=False) # don't have to make it enum, if 0/1 (can't operate on enums like this) # make 1-7 go to 0-6. 0 isn't there. Assign(b[:,54], b[:,54]-1) # make 1 thru 6 go to 1 Assign(b[:,54], b[:,54]!=0) # now we have just 0 and 1 pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList expected = [] allowedDelta = 0 # loop, to see if we get same centers labelListUsed = list(labelList) numColsUsed = numCols paramDict = define_params() for trial in range(5): # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie'] # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie'] # can we do classification with probabilities? # are only lambda and alpha grid searchable? # params is mutable. This is default. parameters = { 'response_column': 'C55', 'alpha': 0.1, # 'lambda': 1e-4, 'lambda': 0, } h2o_glm.pickRandGlmParams(paramDict, parameters) if 'family' not in parameters or parameters['family']=='binomial': bHack = binomial_key else: bHack = hex_key co = h2o_cmd.runSummary(key=binomial_key, column=54) print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins) co = h2o_cmd.runSummary(key=hex_key, column=54) print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(co.histogram_bins) # fix stupid params fixList = ['alpha', 'lambda', 'ignored_columns', 'class_sampling_factors'] for f in fixList: if f in parameters: parameters[f] = "[%s]" % parameters[f] model_key = 'rand_glm.hex' bmResult = h2o.n0.build_model( algo='glm', model_id=model_key, training_frame=bHack, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowNaN=True) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') # FIX! when is this legal doClassification = False if doClassification: mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms') m1 = mcms.data[1:] h0 = mcms.data[0] print "\nmcms", tabulate(m1, headers=h0) if doClassification: thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms') cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms') if 1==0: print "" for i,c in enumerate(cmms.cm): print "\ncmms.cm[%s]" % i, tabulate(c) print "" mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')