def test_NN_mnist_multi(self): # h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = "mnist/train.csv.gz" csvPathname_test = "mnist/test.csv.gz" hex_key = "mnist_train.hex" validation_key = "mnist_test.hex" timeoutSecs = 60 parseResult = h2i.import_parse( bucket="smalldata", path=csvPathname_train, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs ) parseResultV = h2i.import_parse( bucket="smalldata", path=csvPathname_test, schema="put", hex_key=validation_key, timeoutSecs=timeoutSecs ) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, " numRows:", "{:,}".format( inspect["numRows"] ), " numCols:", "{:,}".format(inspect["numCols"]) response = inspect["numCols"] - 1 modes = [ ###'SingleThread', ### too slow (and slightly less accurate) "SingleNode", ### wastes N-1 nodes, since their weight matrices are updated but never looked at... ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: # Making random id identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = "nn_" + identifier + ".hex" kwargs = { "ignored_cols": None, "response": response, "classification": 1, "mode": mode, "activation": "RectifierWithDropout", "input_dropout_ratio": 0.2, "hidden": "117,131,129", "rate": 0.005, "rate_annealing": 1e-6, "momentum_start": 0.5, "momentum_ramp": 100000, "momentum_stable": 0.9, "l1": 0.00001, "l2": 0.0000001, "seed": 98037452452, "loss": "CrossEntropy", "max_w2": 15, "warmup_samples": 0, "initial_weight_distribution": "UniformAdaptive", #'initial_weight_scale' : 0.01, "epochs": 20.0, "destination_key": model_key, "validation": validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds" relTol = 0.02 if mode == "SingleThread" else 0.10 ### 10% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError( self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs ) ### Now score using the model, and check the validation error kwargs = { "source": validation_key, "max_rows": 0, "response": response, "ignored_cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "destination_key": "score_" + identifier + ".hex", "model": model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) if mode != "MapReduce": print "WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results." h2o.beta_features = False
def test_NN_covtype(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'covtype/covtype.20k.data' csvPathname_test = 'covtype/covtype.20k.data' hex_key = 'covtype.hex' validation_key = hex_key timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) ###No need - use training as validation ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ] for mode in modes: #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'mode' : mode, 'activation' : 'Tanh', #'input_dropout_ratio' : 0.1, 'hidden' : '200,200', 'rate' : 0.005, 'rate_annealing' : 1e-5, 'momentum_start' : 0.1, 'momentum_ramp' : 100000, 'momentum_stable' : 0.3, 'l1' : 0.0000, 'l2' : 0.0000, 'seed' : 28372348842, 'loss' : 'CrossEntropy', #'max_w2' : 10, 'warmup_samples' : 0, 'initial_weight_distribution' : 'Normal', 'initial_weight_scale' : 1, 'epochs' : 2.0, 'destination_key' : model_key, 'validation' : validation_key, } expectedErr = 0.3413 if mode == 'SingleThread' else 0.3 ## expected validation error for the above model timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' relTol = 0.03 if mode == 'SingleThread' else 0.20 ### 20% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source' : validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_covtype(self): # h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = "covtype/covtype.20k.data" csvPathname_test = "covtype/covtype.20k.data" hex_key = "covtype.hex" validation_key = hex_key timeoutSecs = 30 parseResult = h2i.import_parse( bucket="smalldata", path=csvPathname_train, schema="local", hex_key=hex_key, timeoutSecs=timeoutSecs ) ###No need - use training as validation ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, " numRows:", "{:,}".format( inspect["numRows"] ), " numCols:", "{:,}".format(inspect["numCols"]) response = inspect["numCols"] - 1 modes = ["SingleThread", "SingleNode"] for mode in modes: # Making random id identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = "nn_" + identifier + ".hex" kwargs = { "ignored_cols": None, "response": response, "classification": 1, "mode": mode, "activation": "Tanh", #'input_dropout_ratio' : 0.1, "hidden": "200,200", "rate": 0.005, "rate_annealing": 1e-5, "momentum_start": 0.1, "momentum_ramp": 100000, "momentum_stable": 0.3, "l1": 0.0000, "l2": 0.0000, "seed": 28372348842, "loss": "CrossEntropy", #'max_w2' : 10, "warmup_samples": 0, "initial_weight_distribution": "Normal", "initial_weight_scale": 1, "epochs": 2.0, "destination_key": model_key, "validation": validation_key, } expectedErr = 0.3413 if mode == "SingleThread" else 0.3 ## expected validation error for the above model timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds" relTol = 0.03 if mode == "SingleThread" else 0.15 ### 15% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError( self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs ) ### Now score using the model, and check the validation error kwargs = { "source": validation_key, "max_rows": 0, "response": response, "ignored_cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "destination_key": "score_" + identifier + ".hex", "model": model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_covtype_1(self): tryList = ["covtype.shuffled.90pct.sorted.data", "covtype.shuffled.90pct.data"] importFolderPath = "standard" for csvFilename in tryList: csvPathname = importFolderPath + "/" + csvFilename hex_key = "covtype.hex" parseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=10 ) inspect = h2o_cmd.runInspect(None, parseResult["destination_key"]) print "\n" + csvPathname, " num_rows:", "{:,}".format( inspect["num_rows"] ), " num_cols:", "{:,}".format(inspect["num_cols"]) # print "WARNING: just doing the first 33 features, for comparison to ??? numbers" # x = ",".join(map(str,range(33))) x = "" response = 54 modelKey = "a.hex" kwargs = { # this is ignore?? "response": response, # 'cols': x, # apparently no longer required? "ignored_cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "validation": hex_key, "activation": "Tanh", # 'Rectifier' "hidden": 500, # comma separated values, or from:to:step "rate": 0.01, # learning rate "l2": 1.0e-4, # regularization "epochs": 1, # how many times dataset should be iterated "destination_key": modelKey, } timeoutSecs = 600 start = time.time() h2o.beta_features = True nnFirstResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) print "nnFirstResult:", h2o.dump_json(nnFirstResult) print "Hack: neural net apparently doesn't support the right polling response yet?" h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) print "neural net end on ", csvPathname, "took", time.time() - start, "seconds" # hack it! job_key = nnFirstResult["job_key"] # is the job finishing before polling would say it's done? params = {"job_key": job_key, "destination_key": modelKey} a = h2o.nodes[0].completion_redirect(jsonRequest="2/NeuralNetProgress.json", params=params) # fake it ## response = {'redirect_url': "2/NeuralNetProgress.json?job_key=%s&destination_key=%s" % (job_key, modelKey)} ## a = h2o.nodes[0].poll_url(response, timeoutSecs=30) print "NeuralNetProgress:", h2o.dump_json(a) # print 'From hack url for neural net result:', h2o.dump_json(a) if DO_SCORE: kwargs = { "max_rows": 0, "response": response, # 'cols': x, # apparently no longer required? "ignored_cols": None, # this is not consistent with ignored_cols_by_name "cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "destination_key": "b.hex", "model": modelKey, } nnScoreFirstResult = h2o_cmd.runNNetScore( key=parseResult["destination_key"], timeoutSecs=timeoutSecs, noPoll=True, **kwargs ) h2o.beta_features = False print "Hack: neural net apparently doesn't support the right polling response yet?" h2o_jobs.pollWaitJobs( pattern=None, errorIfCancelled=True, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5 ) print "neural net score end on ", trainCsvFilename, "took", time.time() - start, "seconds" print "nnScoreResult:", h2o.dump_json(nnScoreResult) h2o.beta_features = False
def test_NN_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A','B',0, 14), ('A','B',1, 14), (0,1,0, 12), (0,1,1, 12), (0,1,'NaN', 12), (1,0,'NaN', 12), (-1,1,0, 12), (-1,1,1, 12), (-1e1,1e1,1e1, 12), (-1e1,1e1,-1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 kwargs = { 'ignored_cols' : None, 'response' : 'C' + str(response), 'classification' : 1, 'mode' : 'SingleThread', 'activation' : 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden' : '500', 'rate' : 0.01, 'rate_annealing' : 1e-6, 'momentum_start' : 0, 'momentum_ramp' : 0, 'momentum_stable' : 0, 'l1' : 0.0, 'l2' : 1e-4, 'seed' : 80023842348, 'loss' : 'CrossEntropy', #'max_w2' : 15, #'warmup_samples' : 0, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 1.0, 'destination_key' : model_key, 'validation' : hex_key, } timeoutSecs = 60 start = time.time() h2o.beta_features = True h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "NN end on ", csvFilename, ' took', time.time() - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.0 relTol = 0.01 kwargs = { 'source' : hex_key, 'max_rows': 0, 'response': 'C' + str(response), 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score' + str(trial) + '.hex', 'model': model_key } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.check_sandbox_for_errors() trial += 1
def test_NN_mnist(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'mode' : mode, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117,131,129', 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'warmup_samples' : 0, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 2.0, 'destination_key' : model_key, 'validation' : validation_key, } expectedErr = 0.0565 ## expected validation error for the above model on 1 thread timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' #### Look at model progress, and check the last reported validation error relTol = 0.3 if mode == 'SingleThread' else 0.15 h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) #### Now score using the model, and check the validation error kwargs = { 'source' : validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_mnist(self): csvFilelist = [ ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', hex_key=testKey2, timeoutSecs=timeoutSecs, noise=('StoreView', None)) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, schema='put', hex_key=trainKey2, timeoutSecs=timeoutSecs, noise=('StoreView', None)) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # NN**************************************** inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + trainCsvFilename, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) response = inspect['num_cols'] - 1 # up to but not including x = ",".join(map(str,range(response))) modelKey = 'a.hex' kwargs = { # this is ignore?? 'response': 0, # first column is pixel value # 'cols': x, # apparently no longer required? 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'validation': trainKey2, 'activation': 'Tanh', # 'Rectifier' 'hidden': 500, # comma separated values, or from:to:step 'rate': 0.01, # learning rate 'l2': 1.0E-4, # regularization 'epochs': 2, # how many times dataset should be iterated 'destination_key': modelKey, } timeoutSecs = 600 start = time.time() h2o.beta_features = True nnFirstResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) print "nnFirstResult:", h2o.dump_json(nnFirstResult) print "Hack: neural net apparently doesn't support the right polling response yet?" h2o_jobs.pollWaitJobs(pattern=None, errorIfCancelled=True, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) print "neural net end on ", trainCsvFilename, 'took', time.time() - start, 'seconds' # hack it! job_key = nnFirstResult['job_key'] params = {'job_key': job_key, 'destination_key': modelKey} a = h2o.nodes[0].completion_redirect(jsonRequest="2/NeuralNetProgress.json", params=params) print "NeuralNetProgress:", h2o.dump_json(a) # print 'From hack url for neural net result:', h2o.dump_json(a) if DO_SCORE: kwargs = { 'max_rows': 0, 'response': 0, # first column is pixel value # 'cols': x, # apparently no longer required? 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'b.hex', 'model': modelKey, } nnScoreFirstResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2o.beta_features = False print "Hack: neural net apparently doesn't support the right polling response yet?" h2o_jobs.pollWaitJobs(pattern=None, errorIfCancelled=True, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) print "neural net score end on ", trainCsvFilename, 'took', time.time() - start, 'seconds' print "nnScoreResult:", h2o.dump_json(nnScoreResult) h2o.beta_features = False
def test_NN_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 60 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ ###'SingleThread', ### too slow (and slightly less accurate) 'SingleNode', ### wastes N-1 nodes, since their weight matrices are updated but never looked at... ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'mode': mode, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'warmup_samples': 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 20.0, 'destination_key': model_key, 'validation': validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' relTol = 0.02 if mode == 'SingleThread' else 0.10 ### 10% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source': validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) if mode != 'MapReduce': print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.' h2o.beta_features = False
def test_NN_covtype_1(self): h2o.beta_features = True tryList = [ ("covtype.shuffled.90pct.sorted.data", "covtype.shuffled.10pct.sorted.data"), ("covtype.shuffled.90pct.data", "covtype.shuffled.10pct.data"), ] importFolderPath = "standard" for trainFilename, testFilename in tryList: # Parse Train******************************** trainPathname = importFolderPath + "/" + trainFilename trainHexKey = "covtype90.hex" trainParseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=trainPathname, schema="local", hex_key=trainHexKey, timeoutSecs=10 ) inspect = h2o_cmd.runInspect(None, trainParseResult["destination_key"]) print "\n" + trainPathname, " numRows:", "{:,}".format( inspect["numRows"] ), " numCols:", "{:,}".format(inspect["numCols"]) # Parse Test******************************** testPathname = importFolderPath + "/" + testFilename testHexKey = "covtype10.hex" testParseResult = h2i.import_parse( bucket="home-0xdiag-datasets", path=testPathname, schema="local", hex_key=testHexKey, timeoutSecs=10 ) # NN Train******************************** x = "" response = "C54" modelKey = "a.hex" kwargs = { # this is ignore?? "response": response, # 'cols': x, # apparently no longer required? "ignored_cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "validation": testHexKey, "activation": "Tanh", # 'Rectifier' "hidden": 500, # comma separated values, or from:to:step "rate": 0.01, # learning rate "l2": 1.0e-2, # regularization # can overfit the training data "epochs": 5, # how many times dataset should be iterated "destination_key": modelKey, } timeoutSecs = 600 start = time.time() nnResult = h2o_cmd.runNNet(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) print "nnResult:", h2o.dump_json(nnResult) print "neural net end on ", trainPathname, "took", time.time() - start, "seconds" # NN Score******************************** kwargs = { "max_rows": 0, "response": response, # 'cols': x, # apparently no longer required? "ignored_cols": None, # this is not consistent with ignored_cols_by_name "cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "destination_key": "b.hex", "model": modelKey, } # doesn't need polling? nnScoreResult = h2o_cmd.runNNetScore( key=testParseResult["destination_key"], timeoutSecs=timeoutSecs, noPoll=True, **kwargs ) print "neural net score end on ", testPathname, "took", time.time() - start, "seconds" # print "nnScoreResult:", h2o.dump_json(nnScoreResult) cm = nnScoreResult["confusion_matrix"] mean_square_error = nnScoreResult["mean_square_error"] classification_error = nnScoreResult["classification_error"] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print "classification_error:", classification_error print "mean_square_error:", mean_square_error print h2o_gbm.pp_cm(cm)
def test_NN_mnist(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'mode': mode, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'warmup_samples': 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 2.0, 'destination_key': model_key, 'validation': validation_key, } expectedErr = 0.0565 ## expected validation error for the above model on 1 thread timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' #### Look at model progress, and check the last reported validation error relTol = 0.3 if mode == 'SingleThread' else 0.15 h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) #### Now score using the model, and check the validation error kwargs = { 'source': validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A', 'B', 0, 14), ('A', 'B', 1, 14), (0, 1, 0, 12), (0, 1, 1, 12), (0, 1, 'NaN', 12), (1, 0, 'NaN', 12), (-1, 1, 0, 12), (-1, 1, 1, 12), (-1e1, 1e1, 1e1, 12), (-1e1, 1e1, -1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 kwargs = { 'ignored_cols': None, 'response': 'C' + str(response), 'classification': 1, 'mode': 'SingleThread', 'activation': 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden': '500', 'rate': 0.01, 'rate_annealing': 1e-6, 'momentum_start': 0, 'momentum_ramp': 0, 'momentum_stable': 0, 'l1': 0.0, 'l2': 1e-4, 'seed': 80023842348, 'loss': 'CrossEntropy', #'max_w2' : 15, #'warmup_samples' : 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 1.0, 'destination_key': model_key, 'validation': hex_key, } timeoutSecs = 60 start = time.time() h2o.beta_features = True h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "NN end on ", csvFilename, ' took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.0 relTol = 0.01 kwargs = { 'source': hex_key, 'max_rows': 0, 'response': 'C' + str(response), 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score' + str(trial) + '.hex', 'model': model_key } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.check_sandbox_for_errors() trial += 1
def test_NN_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ ###'SingleThread', ### too slow (and slightly less accurate) 'SingleNode', ### wastes N-1 nodes, since their weight matrices are updated but never looked at... ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'mode' : mode, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117,131,129', 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'warmup_samples' : 0, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 20.0, 'destination_key' : model_key, 'validation' : validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.0331 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' relTol = 0.02 if mode == 'SingleThread' else 0.10 ### 10% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source' : validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) if mode != 'MapReduce': print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.' h2o.beta_features = False
def test_NN_covtype(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'covtype/covtype.20k.data' csvPathname_test = 'covtype/covtype.20k.data' hex_key = 'covtype.hex' validation_key = hex_key timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) ###No need - use training as validation ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ] for mode in modes: #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'mode': mode, 'activation': 'Tanh', #'input_dropout_ratio' : 0.1, 'hidden': '200,200', 'rate': 0.005, 'rate_annealing': 1e-5, 'momentum_start': 0.1, 'momentum_ramp': 100000, 'momentum_stable': 0.3, 'l1': 0.0000, 'l2': 0.0000, 'seed': 28372348842, 'loss': 'CrossEntropy', #'max_w2' : 10, 'warmup_samples': 0, 'initial_weight_distribution': 'Normal', 'initial_weight_scale': 1, 'epochs': 2.0, 'destination_key': model_key, 'validation': validation_key, } expectedErr = 0.35195 if mode == 'SingleThread' else 0.3 ## expected validation error for the above model timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' relTol = 0.03 if mode == 'SingleThread' else 0.20 ### 20% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source': validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False