def test_NN2_params_rand2(self):
    csvPathname = 'covtype/covtype.20k.data'
    hex_key = 'covtype.20k.hex'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
    paramDict = define_params()
    for trial in range(5):
        # params is mutable. This is the default.
        params = {'response': 'C55'}
        h2o_nn.pickRandDeepLearningParams(paramDict, params)
        kwargs = params.copy()

        start = time.time()
        nn = h2o_cmd.runDeepLearning(timeoutSecs=300, parseResult=parseResult, **kwargs)
        print "nn result:", h2o.dump_json(nn)
        h2o.check_sandbox_for_errors()

        # FIX! simple check?
        print "Deep Learning end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
def test_NN2_params_rand2(self):
    h2o.beta_features = True
    csvPathname = 'covtype/covtype.20k.data'
    hex_key = 'covtype.20k.hex'
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
    paramDict = define_params()
    for trial in range(3):
        # params is mutable. This is the default.
        params = {'response': 'C55', 'epochs': '1'}
        h2o_nn.pickRandDeepLearningParams(paramDict, params)
        kwargs = params.copy()

        start = time.time()
        nn = h2o_cmd.runDeepLearning(timeoutSecs=500, parseResult=parseResult, **kwargs)
        print "nn result:", h2o.dump_json(nn)
        h2o.check_sandbox_for_errors()

        deeplearning_model = nn['deeplearning_model']
        errors = deeplearning_model['errors']
        # print "errors", h2o.dump_json(errors)
        # print "errors, classification", errors['classification']
        # assert 1==0

        # unstable = nn['model_info']['unstable']
        # unstable case caused by:
        # normal initial distribution with amplitude 1 and input_dropout_ratio=1,
        # blowing up numerically during propagation of all zeroes as input repeatedly.
        # arnon added logging to stdout in addition to html in 7899b92ad67.
        # Will have to check that first before making predictions.
        # print "unstable:", unstable

        # FIX! simple check?
        print "Deep Learning end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "Trial #", trial, "completed\n"
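# Both random-parameter tests above depend on a define_params() helper (defined
# elsewhere in the test module) that returns a dict mapping each Deep Learning
# argument to a list of candidate values, from which
# h2o_nn.pickRandDeepLearningParams() picks one per trial. A minimal sketch of
# that shape follows; the parameter names and value ranges here are illustrative
# assumptions, not the harness's actual definition.
def define_params():
    # each key is a Deep Learning kwarg; each value is a list of candidates.
    # None means "omit the parameter and let h2o use its default".
    paramDict = {
        'activation': ['Tanh', 'Rectifier', 'RectifierWithDropout', None],
        'hidden': ['117', '50,50', '100,100,100', None],
        'rate': [0.005, 0.01, None],
        'l1': [0, 1e-5, None],
        'l2': [0, 1e-5, None],
        'epochs': [0.1, 1, None],
    }
    return paramDict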
def test_DeepLearning_mnist(self):
    # h2b.browseTheCloud()
    h2o.beta_features = True
    csvPathname_train = 'mnist/train.csv.gz'
    csvPathname_test = 'mnist/test.csv.gz'
    hex_key = 'mnist_train.hex'
    validation_key = 'mnist_test.hex'
    timeoutSecs = 300
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = inspect['numCols'] - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'deeplearning_' + identifier + '.hex'

    kwargs = {
        'ignored_cols' : None,
        'response' : response,
        'classification' : 1,
        'activation' : 'RectifierWithDropout',
        'input_dropout_ratio' : 0.2,
        'hidden' : '1024,1024,2048',
        'adaptive_rate' : 1,
        'rho' : 0.99,
        'epsilon' : 1e-8,
        'train_samples_per_iteration' : -1, ## 0: better accuracy! -1: best scalability! 10000: best accuracy?
        # 'rate' : 0.01,
        # 'rate_annealing' : 1e-6,
        # 'momentum_start' : 0.5,
        # 'momentum_ramp' : 1800000,
        # 'momentum_stable' : 0.99,
        'l1' : 1e-5,
        'l2' : 0.0,
        'seed' : 98037452452,
        'loss' : 'CrossEntropy',
        'max_w2' : 15,
        'initial_weight_distribution' : 'UniformAdaptive',
        'epochs' : 128, # enough for 64 nodes
        'destination_key' : model_key,
        'validation' : validation_key,
        'score_interval' : 10000, # don't score until the end
    }
    timeoutSecs = 7200
    start = time.time()
    deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    predict_key = 'score_' + identifier + '.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    h2o.beta_features = True
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)

    h2o.beta_features = False
def test_DeepLearning_c21(self):
    importFolderPath = '/mnt/0xcustomer-datasets/c21'
    csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
    csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
    hex_key = 'train.hex'
    validation_key = 'test.hex'
    timeoutSecs = 300
    parseResult = h2i.import_parse(path=csvPathname_train, hex_key=hex_key,
        timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY)
    parseResultV = h2i.import_parse(path=csvPathname_test, hex_key=validation_key,
        timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = 'any_response'

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    # use defaults otherwise
    # need to change epochs otherwise it takes too long
    kwargs = {
        'epochs' : 0.001,
        'response' : response,
        'destination_key' : model_key,
        'validation' : validation_key,
    }
    ### expectedErr = 0.0362 ## from single-threaded mode
    expectedErr = 0.03 ## observed actual value with Hogwild

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    #### Now score using the model, and check the validation error
    expectedErr = 0.046
    relTol = 0.35 # allow 35% tolerance. kbn
    predict_key = 'Predict.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)
    print "expected classification error:" + format(expectedErr)
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))
def test_NN_airlines_small(self):
    # h2b.browseTheCloud()
    csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
    csvPathname_test = 'airlines/AirlinesTest.csv.zip'
    hex_key = 'airlines_train.hex'
    validation_key = 'airlines_test.hex'
    timeoutSecs = 30
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    # this gives the last col number, which is IsDepDelayed_REC (1 or -1)
    # response = inspect['numCols'] - 1
    # this is "YES"/"NO"
    response = 'IsDepDelayed'

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    # get the column names
    colNames = [c['name'] for c in inspect['cols']]
    print "colNames:", colNames
    usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier",
        "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance")

    ignoredCols = []
    for c in colNames:
        # don't put the response in the ignore list (is there a problem if so?)
        if c not in usedCols and c != response:
            ignoredCols.append(c)

    ignoredColsString = ",".join(ignoredCols)
    print "Telling h2o to ignore these cols:"
    print ignoredColsString

    kwargs = {
        'ignored_cols': ignoredColsString,
        'response': response,
        'classification': 1,
        'destination_key': model_key,
    }
    expectedErr = 0.45 ## expected validation error for the above model
    relTol = 0.50 ## 50% rel. error tolerance due to Hogwild!

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    predict_key = 'score_' + identifier + '.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)
    print "expected classification error:" + format(expectedErr)
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))
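# The relative-tolerance assertion at the end of the test above is repeated
# verbatim in most tests in this file. A small helper (hypothetical; not part of
# the existing h2o_nn/h2o_gbm utilities) could centralize it:
def check_error_within_tolerance(actualErr, expectedErr, relTol):
    # exact match always passes; otherwise require the relative deviation to be
    # within relTol (treat expectedErr == 0 as an exact-match requirement, to
    # avoid dividing by zero)
    if actualErr == expectedErr:
        return
    if expectedErr == 0 or abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))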
def test_DeepLearning_twovalues(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_twovalues.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
    rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

    twoValueList = [
        ('A', 'B', 0, 14),
        ('A', 'B', 1, 14),
        (0, 1, 0, 12),
        (0, 1, 1, 12),
        (0, 1, 'NaN', 12),
        (1, 0, 'NaN', 12),
        (-1, 1, 0, 12),
        (-1, 1, 1, 12),
        (-1e1, 1e1, 1e1, 12),
        (-1e1, 1e1, -1e1, 12),
    ]

    trial = 0
    for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
        write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

        start = time.time()
        hex_key = csvFilename + "_" + str(trial)
        model_key = 'trial_' + str(trial) + '.hex'
        validation_key = hex_key

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        response = inspect['numCols'] - 1
        kwargs = {
            'ignored_cols': None,
            'response': 'C' + str(response),
            'classification': 1,
            'activation': 'Tanh',
            # 'input_dropout_ratio' : 0.2,
            'hidden': '500',
            'rate': 0.01,
            'rate_annealing': 1e-6,
            'momentum_start': 0,
            'momentum_stable': 0,
            'l1': 0.0,
            'l2': 1e-4,
            'seed': 80023842348,
            'loss': 'CrossEntropy',
            # 'max_w2' : 15,
            # 'warmup_samples' : 0,
            'initial_weight_distribution': 'UniformAdaptive',
            # 'initial_weight_scale' : 0.01,
            'epochs': 1.0,
            'destination_key': model_key,
            'validation': hex_key,
        }
        timeoutSecs = 60
        start = time.time()
        h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "trial #", trial, "Deep Learning end on ", csvFilename, \
            ' took', time.time() - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.001
        relTol = 0.01
        predict_key = 'Predict.hex'
        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key,
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}
        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)
        cm = predictCMResult['cm']
        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
        print "actual classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                (actualErr, float(relTol) * 100, expectedErr))

        trial += 1
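# The twovalues tests call a file-local write_syn_dataset() helper defined
# elsewhere in their module. Given the call site above, it presumably alternates
# the two fixed feature rows and appends the respective output label as the last
# column; a minimal sketch under that assumption (the exact row layout is
# inferred, not confirmed):
def write_syn_dataset(csvPathname, rowCount, rowDataTrue, rowDataFalse, outputTrue, outputFalse):
    dsf = open(csvPathname, "w+")
    for i in range(rowCount / 2):
        # one "true" row and one "false" row per iteration, label last
        dsf.write(rowDataTrue + ", " + outputTrue + "\n")
        dsf.write(rowDataFalse + ", " + outputFalse + "\n")
    dsf.close()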
def test_DeepLearning_mnist(self):
    # h2b.browseTheCloud()
    csvPathname_train = 'mnist/train.csv.gz'
    csvPathname_test = 'mnist/test.csv.gz'
    hex_key = 'mnist_train.hex'
    validation_key = 'mnist_test.hex'
    timeoutSecs = 300
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = inspect['numCols'] - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'deeplearning_' + identifier + '.hex'

    kwargs = {
        'ignored_cols' : None,
        'response' : response,
        'classification' : 1,
        'activation' : 'RectifierWithDropout',
        'input_dropout_ratio' : 0.2,
        'hidden' : '1024,1024,2048',
        'adaptive_rate' : 1,
        'rho' : 0.99,
        'epsilon' : 1e-8,
        'train_samples_per_iteration' : -1, ## 0: better accuracy! -1: best scalability! 10000: best accuracy?
        # 'rate' : 0.01,
        # 'rate_annealing' : 1e-6,
        # 'momentum_start' : 0.5,
        # 'momentum_ramp' : 1800000,
        # 'momentum_stable' : 0.99,
        'l1' : 1e-5,
        'l2' : 0.0,
        'seed' : 98037452452,
        'loss' : 'CrossEntropy',
        'max_w2' : 15,
        'initial_weight_distribution' : 'UniformAdaptive',
        'epochs' : 128, # enough for 64 nodes
        'destination_key' : model_key,
        'validation' : validation_key,
        'score_interval' : 10000, # don't score until the end
    }
    timeoutSecs = 7200
    start = time.time()
    deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    predict_key = 'score_' + identifier + '.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)
def test_NN2_mnist_multi(self):
    # h2b.browseTheCloud()
    h2o.beta_features = True
    csvPathname_train = 'mnist/train.csv.gz'
    csvPathname_test = 'mnist/test.csv.gz'
    hex_key = 'mnist_train.hex'
    validation_key = 'mnist_test.hex'
    timeoutSecs = 60
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = inspect['numCols'] - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    kwargs = {
        'ignored_cols' : None,
        'response' : response,
        'classification' : 1,
        'activation' : 'RectifierWithDropout',
        'input_dropout_ratio' : 0.2,
        'hidden' : '117,131,129',
        'rate' : 0.005,
        'rate_annealing' : 1e-6,
        'momentum_start' : 0.5,
        'momentum_ramp' : 100000,
        'momentum_stable' : 0.9,
        'l1' : 0.00001,
        'l2' : 0.0000001,
        'seed' : 98037452452,
        'loss' : 'CrossEntropy',
        'max_w2' : 15,
        'initial_weight_distribution' : 'UniformAdaptive',
        # 'initial_weight_scale' : 0.01,
        'epochs' : 20.0,
        'destination_key' : model_key,
        'validation' : validation_key,
    }
    ### expectedErr = 0.0362 ## from single-threaded mode
    expectedErr = 0.03 ## observed actual value with Hogwild
    relTol = 0.10 ### 10% relative error is acceptable for Hogwild

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    ### Now score using the model, and check the validation error
    kwargs = {
        'source': validation_key,
        'max_rows': 0,
        'response': response,
        'ignored_cols': None, # this is not consistent with ignored_cols_by_name
        'classification': 1,
        'destination_key': 'score_' + identifier + '.hex',
        'model': model_key,
    }
    nnScoreResult = h2o_cmd.runDeepLearningScore(key=parseResult['destination_key'],
        timeoutSecs=timeoutSecs, **kwargs)
    h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)
def test_anomaly_uniform_w_NA(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # colname, (min, 25th, 50th, 75th, max) (ROWS, COLS, 'x.hex', 1, 20000), (ROWS, COLS, 'x.hex', -5000, 0), (ROWS, COLS, 'x.hex', -100000, 100000), (ROWS, COLS, 'x.hex', -1, 1), (ROWS, COLS, 'A.hex', 1, 100), (ROWS, COLS, 'A.hex', -99, 99), (ROWS, COLS, 'B.hex', 1, 10000), (ROWS, COLS, 'B.hex', -100, 100), (ROWS, COLS, 'C.hex', 1, 100000), (ROWS, COLS, 'C.hex', -101, 101), ] trial = 1 x = 0 timeoutSecs = 60 for (rowCount, colCount, hex_key, expectedMin, expectedMax) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) x += 1 csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True) print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) numRows = inspect["numRows"] numCols = inspect["numCols"] print "numRows:", numRows, "numCols:", numCols model_key = "m.hex" kwargs = { 'ignored_cols' : None, 'response' : numCols-1, 'classification' : 0, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117', 'adaptive_rate' : 0, 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, # 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 2.0, 'destination_key' : model_key, # 'validation' : None, 'score_interval' : 10000, 'autoencoder' : 1, } timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end. took", time.time() - start, "seconds" kwargs = { 'destination_key': "a.hex", 'source': parseResult['destination_key'], 'dl_autoencoder_model': model_key, 'thresh': 1.0 } anomaly = h2o.nodes[0].anomaly(timeoutSecs=30, **kwargs) inspect = h2o_cmd.runInspect(None, "a.hex") numRows = inspect["numRows"] numCols = inspect["numCols"] print "anomaly: numRows:", numRows, "numCols:", numCols self.assertEqual(numCols,1) # twice as many rows because of NA injection self.assertEqual(numRows,rowCount*(1 + NA_ROW_RATIO)) # first col has the anomaly info. other cols are the same as orig data aSummary = h2o_cmd.runSummary(key='a.hex', cols=0) h2o_cmd.infoFromSummary(aSummary) print "anomaly:", h2o.dump_json(anomaly) trial += 1 h2i.delete_keys_at_all_nodes()
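# The anomaly test relies on a file-local write_syn_dataset() that emits uniform
# random values in [expectedMin, expectedMax] and injects NA rows; the assertion
# numRows == rowCount * (1 + NA_ROW_RATIO) implies NA_ROW_RATIO extra NA rows per
# data row. A sketch under those assumptions (the exact layout and the value of
# NA_ROW_RATIO are inferred, not confirmed):
NA_ROW_RATIO = 1  # assumed: one injected NA row per generated data row

def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEED):
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    naRow = ",".join(["NA"] * colCount)
    for i in range(rowCount):
        # one uniform-random data row...
        rowData = [str(r.uniform(expectedMin, expectedMax)) for j in range(colCount)]
        dsf.write(",".join(rowData) + "\n")
        # ...followed by the injected NA rows
        for k in range(NA_ROW_RATIO):
            dsf.write(naRow + "\n")
    dsf.close()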
def test_NN_mnist(self):
    # NOTE: despite the name, this test runs on the covtype dataset, not mnist.
    # h2b.browseTheCloud()
    csvPathname_train = 'standard/covtype.shuffled.90pct.data'
    csvPathname_test = 'standard/covtype.shuffled.10pct.data'
    # NOTE: hex_key and validation_key are the same, so the 10pct parse below
    # overwrites the 90pct parse.
    hex_key = 'covtype.hex'
    validation_key = 'covtype.hex'
    timeoutSecs = 30
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_train, schema='local',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_test, schema='local',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = inspect['numCols'] - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    kwargs = {
        'ignored_cols': None,
        'response': response,
        'classification': 1,
        'activation': 'RectifierWithDropout',
        'input_dropout_ratio': 0.2,
        'hidden': '117,131,129',
        'adaptive_rate': 0,
        'rate': 0.005,
        'rate_annealing': 1e-6,
        'momentum_start': 0.5,
        'momentum_ramp': 100000,
        'momentum_stable': 0.9,
        'l1': 0.00001,
        'l2': 0.0000001,
        'seed': 98037452452,
        'loss': 'CrossEntropy',
        'max_w2': 15,
        'initial_weight_distribution': 'UniformAdaptive',
        # 'initial_weight_scale' : 0.01,
        'epochs': 96.0,
        'destination_key': model_key,
        'validation': validation_key,
        'score_interval': 10000,
    }
    expectedErr = 0.24 ## expected validation error for the above model
    relTol = 0.20 ## 20% rel. error tolerance due to Hogwild!

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    predict_key = 'score_' + identifier + '.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)
    print "expected classification error:" + format(expectedErr)
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))
def test_NN_airlines_small(self):
    # h2b.browseTheCloud()
    h2o.beta_features = True
    csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
    csvPathname_test = 'airlines/AirlinesTest.csv.zip'
    hex_key = 'airlines_train.hex'
    validation_key = 'airlines_test.hex'
    timeoutSecs = 30
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    # this gives the last col number, which is IsDepDelayed_REC (1 or -1)
    # response = inspect['numCols'] - 1
    # this is "YES"/"NO"
    response = 'IsDepDelayed'

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    # get the column names
    colNames = [c['name'] for c in inspect['cols']]
    print "colNames:", colNames
    usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier",
        "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance")

    ignoredCols = []
    for c in colNames:
        # don't put the response in the ignore list (is there a problem if so?)
        if c not in usedCols and c != response:
            ignoredCols.append(c)

    ignoredColsString = ",".join(ignoredCols)
    print "Telling h2o to ignore these cols:"
    print ignoredColsString

    kwargs = {
        'ignored_cols' : ignoredColsString,
        'response' : response,
        'classification' : 1,
        'destination_key' : model_key,
    }
    expectedErr = 0.45 ## expected validation error for the above model
    relTol = 0.50 ## 50% rel. error tolerance due to Hogwild!

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    predict_key = 'score_' + identifier + '.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    h2o.beta_features = True
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)
    print "expected classification error:" + format(expectedErr)
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))

    h2o.beta_features = False
def test_NN_mnist(self):
    # h2b.browseTheCloud()
    csvPathname_train = 'mnist/train.csv.gz'
    csvPathname_test = 'mnist/test.csv.gz'
    hex_key = 'mnist_train.hex'
    validation_key = 'mnist_test.hex'
    timeoutSecs = 30
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = inspect['numCols'] - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    kwargs = {
        'ignored_cols' : None,
        'response' : response,
        'classification' : 1,
        'activation' : 'RectifierWithDropout',
        'input_dropout_ratio' : 0.2,
        'hidden' : '117,131,129',
        'adaptive_rate' : 0,
        'rate' : 0.005,
        'rate_annealing' : 1e-6,
        'momentum_start' : 0.5,
        'momentum_ramp' : 100000,
        'momentum_stable' : 0.9,
        'l1' : 0.00001,
        'l2' : 0.0000001,
        'seed' : 98037452452,
        'loss' : 'CrossEntropy',
        'max_w2' : 15,
        'initial_weight_distribution' : 'UniformAdaptive',
        # 'initial_weight_scale' : 0.01,
        'epochs' : 2.0,
        'destination_key' : model_key,
        'validation' : validation_key,
        'score_interval' : 10000,
    }
    expectedErr = 0.057 ## expected validation error for the above model
    relTol = 0.20 ## 20% rel. error tolerance due to Hogwild!

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    predict_key = 'score_' + identifier + '.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)
    print "expected classification error:" + format(expectedErr)
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))
def test_DeepLearning_twovalues(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_twovalues.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename

    rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
    rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

    twoValueList = [
        ('A', 'B', 0, 14),
        ('A', 'B', 1, 14),
        (0, 1, 0, 12),
        (0, 1, 1, 12),
        (0, 1, 'NaN', 12),
        (1, 0, 'NaN', 12),
        (-1, 1, 0, 12),
        (-1, 1, 1, 12),
        (-1e1, 1e1, 1e1, 12),
        (-1e1, 1e1, -1e1, 12),
    ]

    trial = 0
    for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
        write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

        start = time.time()
        hex_key = csvFilename + "_" + str(trial)
        model_key = 'trial_' + str(trial) + '.hex'
        validation_key = hex_key

        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        response = inspect['numCols']
        response = 'C' + str(response)

        kwargs = {
            'ignored_cols' : None,
            'response' : response,
            'classification' : 1,
            'activation' : 'Tanh',
            # 'input_dropout_ratio' : 0.2,
            'hidden' : '113,71,54',
            'rate' : 0.01,
            'rate_annealing' : 1e-6,
            'momentum_start' : 0,
            'momentum_stable' : 0,
            'l1' : 0.0,
            'l2' : 1e-6,
            'seed' : 80023842348,
            'loss' : 'CrossEntropy',
            # 'max_w2' : 15,
            'initial_weight_distribution' : 'UniformAdaptive',
            # 'initial_weight_scale' : 0.01,
            'epochs' : 100,
            'destination_key' : model_key,
            'validation' : hex_key,
        }
        timeoutSecs = 60
        start = time.time()
        h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "trial #", trial, "Deep Learning end on ", csvFilename, \
            ' took', time.time() - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.00
        relTol = 0.01
        predict_key = 'Predict.hex'
        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key,
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}
        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)
        cm = predictCMResult['cm']
        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
        print "actual classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        # expectedErr is 0.00 here, so guard the relative-error division:
        # any mismatch against a zero expectation is a failure
        if actualErr != expectedErr and (expectedErr == 0 or
                abs((expectedErr - actualErr) / expectedErr) > relTol):
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                (actualErr, float(relTol) * 100, expectedErr))

        trial += 1
def test_NN2_mnist_multi(self):
    # h2b.browseTheCloud()
    h2o.beta_features = True
    csvPathname_train = 'mnist/train.csv.gz'
    csvPathname_test = 'mnist/test.csv.gz'
    hex_key = 'mnist_train.hex'
    validation_key = 'mnist_test.hex'
    timeoutSecs = 90
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = inspect['numCols'] - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    kwargs = {
        'ignored_cols': None,
        'response': response,
        'classification': 1,
        'activation': 'RectifierWithDropout',
        'input_dropout_ratio': 0.2,
        'hidden': '117,131,129',
        'rate': 0.005,
        'rate_annealing': 1e-6,
        'momentum_start': 0.5,
        'momentum_ramp': 100000,
        'momentum_stable': 0.9,
        'l1': 0.00001,
        'l2': 0.0000001,
        'seed': 98037452452,
        'loss': 'CrossEntropy',
        'max_w2': 15,
        'initial_weight_distribution': 'UniformAdaptive',
        # 'initial_weight_scale' : 0.01,
        'epochs': 20.0,
        'destination_key': model_key,
        'validation': validation_key,
    }
    ### expectedErr = 0.0362 ## from single-threaded mode
    expectedErr = 0.03 ## observed actual value with Hogwild

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    #### Now score using the model, and check the validation error
    expectedErr = 0.046
    relTol = 0.1
    predict_key = 'Predict.hex'
    kwargs = {
        'data_key': validation_key,
        'destination_key': predict_key,
        'model_key': model_key,
    }
    predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
    h2o_cmd.runInspect(key=predict_key, verbose=True)

    kwargs = {}
    predictCMResult = h2o.nodes[0].predict_confusion_matrix(
        actual=validation_key,
        vactual=response,
        predict=predict_key,
        vpredict='predict',
        timeoutSecs=timeoutSecs, **kwargs)
    cm = predictCMResult['cm']
    print h2o_gbm.pp_cm(cm)
    actualErr = h2o_gbm.pp_cm_summary(cm) / 100.
    print "actual classification error:" + format(actualErr)
    print "expected classification error:" + format(expectedErr)
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))
def test_NN2_mnist_multi(self):
    # h2b.browseTheCloud()
    h2o.beta_features = True
    csvPathname_train = 'mnist/train.csv.gz'
    csvPathname_test = 'mnist/test.csv.gz'
    hex_key = 'mnist_train.hex'
    validation_key = 'mnist_test.hex'
    timeoutSecs = 60
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put',
        hex_key=hex_key, timeoutSecs=timeoutSecs)
    parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put',
        hex_key=validation_key, timeoutSecs=timeoutSecs)

    inspect = h2o_cmd.runInspect(None, hex_key)
    print "\n" + csvPathname_train, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    response = inspect['numCols'] - 1

    # Making random id
    identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
    model_key = 'nn_' + identifier + '.hex'

    kwargs = {
        'ignored_cols': None,
        'response': response,
        'classification': 1,
        'activation': 'RectifierWithDropout',
        'input_dropout_ratio': 0.2,
        'hidden': '117,131,129',
        'rate': 0.005,
        'rate_annealing': 1e-6,
        'momentum_start': 0.5,
        'momentum_ramp': 100000,
        'momentum_stable': 0.9,
        'l1': 0.00001,
        'l2': 0.0000001,
        'seed': 98037452452,
        'loss': 'CrossEntropy',
        'max_w2': 15,
        'initial_weight_distribution': 'UniformAdaptive',
        # 'initial_weight_scale' : 0.01,
        'epochs': 20.0,
        'destination_key': model_key,
        'validation': validation_key,
    }
    ### expectedErr = 0.0362 ## from single-threaded mode
    expectedErr = 0.03 ## observed actual value with Hogwild

    timeoutSecs = 600
    start = time.time()
    nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
    print "neural net end on ", csvPathname_train, " and ", csvPathname_test, \
        'took', time.time() - start, 'seconds'

    relTol = 0.10 ### 10% relative error is acceptable for Hogwild
    h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'],
        expectedErr, relTol, **kwargs)

    ### Now score using the model, and check the validation error
    kwargs = {
        'source': validation_key,
        'max_rows': 0,
        'response': response,
        'ignored_cols': None, # this is not consistent with ignored_cols_by_name
        'classification': 1,
        'destination_key': 'score_' + identifier + '.hex',
        'model': model_key,
    }
    nnScoreResult = h2o_cmd.runDeepLearningScore(key=parseResult['destination_key'],
        timeoutSecs=timeoutSecs, **kwargs)
    h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

    h2o.beta_features = False
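# h2o_nn.checkScoreResult() and h2o_nn.checkLastValidationError() come from the
# shared test utilities and are not shown here. As a rough, hypothetical sketch
# of the kind of check they perform (the real helpers' result key paths and
# messages may differ):
def check_score_within_tolerance(scoreResult, expectedErr, relTol):
    # 'classification_error' as a key is an assumption for illustration only
    actualErr = scoreResult['classification_error']
    print "actual classification error:", actualErr
    if actualErr != expectedErr and abs((expectedErr - actualErr) / expectedErr) > relTol:
        raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
            (actualErr, float(relTol) * 100, expectedErr))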