def testCheckWithModelAPI(self):
    ######################################################################
    # Now look for kmeans_model_name using the one-model API and find_compatible_frames, and check it
    model = self.a_node.models(key=self.kmeans_model_name, find_compatible_frames=True)
    h2o_util.assertKeysExist(model['models'][0], '', ['compatible_frames'])
    assert self.prostate_key in model['models'][0]['compatible_frames'], \
        "Failed to find " + self.prostate_key + " in compatible_frames list."

    ######################################################################
    # Now look for prostate_key using the one-frame API and find_compatible_models, and check it
    result = self.a_node.frames(key='prostate.hex', find_compatible_models=True)
    frames = result['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert_true('prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list.")

    compatible_models = result['compatible_models']
    models_dict = h2o_util.list_to_dict(compatible_models, 'key')
    assert_true(self.dl_prostate_model_name in models_dict, "Failed to find " +
                self.dl_prostate_model_name + " in compatible models list.")

    assert_true(self.dl_prostate_model_name in frames[0]['compatible_models'])
    assert_true(self.kmeans_model_name in frames[0]['compatible_models'])
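# NOTE: list_to_dict is used throughout these tests but defined in h2o_util.
# A minimal sketch of the behavior the tests assume (not the actual
# implementation): the second argument is a '/'-separated path into each list
# element, so 'key/name' indexes each frame as frame['key']['name'] and uses
# that value as the dict key.
def list_to_dict(json_list, key_path):
    keys = key_path.split('/')
    result = {}
    for item in json_list:
        value = item
        for k in keys:
            value = value[k]
        result[value] = item
    return result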
def validate_predictions(result, model_name, frame_key, expected_rows):
    '''
    Validate a /Predictions result.
    '''
    assert result is not None, "FAIL: Got a null result for scoring: " + model_name + " on: " + frame_key
    assert 'model_metrics' in result, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a model_metrics object."
    mm = result['model_metrics'][0]
    h2o.H2O.verboseprint('mm: ', repr(mm))
    assert 'auc' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an AUC."
    assert 'cm' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a CM."
    assert 'predictions' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a predictions section."
    assert 'key' in mm['predictions'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key."
    assert 'name' in mm['predictions']['key'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key name."

    predictions_key = mm['predictions']['key']['name']
    f = a_node.frames(key=predictions_key, find_compatible_models=True, len=5)
    frames = f['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert predictions_key in frames_dict, "FAIL: Failed to find predictions key " + predictions_key + " in Frames list."

    predictions = mm['predictions']
    h2o.H2O.verboseprint('result: ', repr(result))
    assert 'columns' in predictions, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a columns section."
    assert len(predictions['columns']) > 0, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain any columns."
    assert 'label' in predictions['columns'][0], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 has no label element."
    assert 'predict' == predictions['columns'][0]['label'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 is not 'predict'."
    assert expected_rows == predictions['rows'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " has an unexpected number of rows."
def validate_predictions(result, model_name, frame_key, expected_rows, destination_key=None):
    '''
    Validate a /Predictions result.
    '''
    assert result is not None, "FAIL: Got a null result for scoring: " + model_name + " on: " + frame_key
    assert 'model_metrics' in result, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a model_metrics object."
    mm = result['model_metrics'][0]
    h2o.H2O.verboseprint('mm: ', repr(mm))
    # assert 'auc' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an AUC."
    # assert 'cm' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a CM."
    assert 'predictions' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a predictions section."
    assert 'key' in mm['predictions'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key."
    assert 'name' in mm['predictions']['key'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key name."

    predictions_key = mm['predictions']['key']['name']
    f = a_node.frames(key=predictions_key, find_compatible_models=True, row_count=5)
    frames = f['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert predictions_key in frames_dict, "FAIL: Failed to find predictions key " + predictions_key + " in Frames list."

    predictions = mm['predictions']
    h2o.H2O.verboseprint('result: ', repr(result))
    assert 'columns' in predictions, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a columns section."
    assert len(predictions['columns']) > 0, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain any columns."
    assert 'label' in predictions['columns'][0], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 has no label element."
    assert 'predict' == predictions['columns'][0]['label'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 is not 'predict'."
    assert expected_rows == predictions['rows'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " has an unexpected number of rows."

    assert 'destination_key' in result, "FAIL: failed to find 'destination_key' in predict result: " + h2o_util.dump_json(result)
    assert 'name' in result['destination_key'], "FAIL: failed to find name in 'destination_key' in predict result: " + h2o_util.dump_json(result)

    if destination_key is not None:
        assert destination_key == result['destination_key']['name'], "FAIL: bad value for 'destination_key' in predict result; expected: " + destination_key + ", got: " + result['destination_key']['name']
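# A hypothetical call site for validate_predictions, for illustration only.
# The predict() wrapper, model name, and destination key below are assumptions,
# not taken from this test suite (380 is the prostate.csv row count used elsewhere):
# result = a_node.predict(model='dl_prostate_model', frame='prostate.hex',
#                         destination_key='dl_prostate_predictions')
# validate_predictions(result, 'dl_prostate_model', 'prostate.hex', 380,
#                      destination_key='dl_prostate_predictions')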
def test_simple2(self):
    # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
    # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
    csvPathname = find_file("smalldata/logreg/prostate.csv")
    import_result = h2o.n0.import_files(path=csvPathname)
    # print dump_json(import_result)

    k = import_result['keys'][0]
    frames_result = h2o.n0.frames(key=k)

    frame = frames_result['frames'][0]
    rows = frame['rows']
    columns = frame['columns']
    for c in columns:
        label = c['label']
        missing = c['missing_count']
        stype = c['type']
        domain = c['domain']

    # print dump_json(frame)

    # let's see what ray's util does
    frames = h2o.n0.frames()['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    # print "frames:", dump_json(frames)
    # print "frames_dict:", dump_json(frames_dict)
    for k, v in frames_dict.items():
        print "frames_dict key:", k

    # interesting. we can do dictionary comprehensions
    # { k:v for k,v in my_dict.items() if 'Peter' in k }

    # how do you parse multiple files
    parse_result = h2o.n0.parse(key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

    frame = parse_result['frames'][0]
    hex_key = frame['key']['name']

    colCount = 9
    rowCount = 380
    # colCount = 11
    # rowCount = 1000000

    start = time.time()
    inspect = h2o_cmd.runInspect(None, hex_key)
    print "Inspect:", hex_key, "took", time.time() - start, "seconds"

    numCols = len(inspect['frames'][0]['columns'])
    numRows = inspect['frames'][0]['rows']
    print "\n" + csvPathname, \
        " rows:", "{:,}".format(numRows), \
        " len(columns):", "{:,}".format(numCols)

    # should match # of cols in header or ??
    self.assertEqual(numCols, colCount,
                     "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
    self.assertEqual(numRows, rowCount,
                     "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
                     (numRows, rowCount))

    verboseprint(hex_key, ":", dump_json(parse_result))
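# Following up the dictionary-comprehension note in test_simple2 above; a tiny
# self-contained example of filtering a frames_dict-style mapping by key
# substring (the sample keys are illustrative):
frames_dict_example = {'prostate.hex': {}, 'iris.hex': {}}
prostate_only = {k: v for k, v in frames_dict_example.items() if 'prostate' in k}
assert sorted(prostate_only) == ['prostate.hex']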
def testCheckWithModelAPI(self):
    ######################################################################
    # Now look for kmeans_model_name using the one-model API and find_compatible_frames, and check it
    model = self.a_node.models(key=self.kmeans_model_name, find_compatible_frames=True)
    h2o_util.assertKeysExist(model['models'][0], '', ['compatible_frames'])
    assert self.prostate_key in model['models'][0]['compatible_frames'], \
        "Failed to find " + self.prostate_key + " in compatible_frames list."

    ######################################################################
    # Now look for prostate_key using the one-frame API and find_compatible_models, and check it
    result = self.a_node.frames(key='prostate.hex', find_compatible_models=True)
    frames = result['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."

    compatible_models = result['compatible_models']
    models_dict = h2o_util.list_to_dict(compatible_models, 'key')
    assert self.dl_prostate_model_name in models_dict, "Failed to find " + \
        self.dl_prostate_model_name + " in compatible models list."

    assert self.dl_prostate_model_name in frames[0]['compatible_models']
    assert self.kmeans_model_name in frames[0]['compatible_models']
def testImportProstate(self):
    cleanup(self.a_node)

    import_result = self.a_node.import_files(path="/Users/radu/h2o-dev/smalldata/logreg/prostate.csv")
    parse_result = self.a_node.parse(key=import_result['keys'][0])  # TODO: handle multiple files
    self.prostate_key = parse_result['frames'][0]['key']['name']

    # Test /Frames for prostate.csv
    frames = self.a_node.frames()['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."

    # Test /Frames/{key} for prostate.csv
    frames = self.a_node.frames(key='prostate.hex')['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'CAPSULE' in columns_dict, "Failed to find CAPSULE in Frames/prostate.hex."
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
    assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
    assert None is columns_dict['AGE']['bins'], "Failed to clear bins field."  # should be cleared except for /summary

    frames = self.a_node.columns(key='prostate.hex')['frames']
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'ID' in columns_dict, "Failed to find ID in Frames/prostate.hex/columns."
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
    assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
    assert None is columns_dict['AGE']['bins'], "Failed to clear bins field."  # should be cleared except for /summary

    frames = self.a_node.column(key='prostate.hex', column='AGE')['frames']
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
    assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
    assert None is columns_dict['AGE']['bins'], "Failed to clear bins field."  # should be cleared except for /summary

    frames = self.a_node.summary(key='prostate.hex', column='AGE')['frames']
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns/AGE/summary."
    col = columns_dict['AGE']
    h2o_util.assertKeysExistAndNonNull(col, '', [
        'label', 'missing', 'zeros', 'pinfs', 'ninfs', 'mins', 'maxs', 'mean',
        'sigma', 'type', 'data', 'precision', 'bins', 'base', 'stride', 'pctiles'
    ])
    h2o_util.assertKeysExist(col, '', ['domain', 'str_data'])
    assert col['mins'][0] == 43, 'Failed to find 43 as the first min for AGE.'
    assert col['maxs'][0] == 79, 'Failed to find 79 as the first max for AGE.'
    assert col['mean'] == 66.03947368421052, 'Failed to find 66.03947368421052 as the mean for AGE.'
    assert col['sigma'] == 6.527071269173308, 'Failed to find 6.527071269173308 as the sigma for AGE.'
    assert col['type'] == 'int', 'Failed to find int as the type for AGE.'
    assert col['data'][0] == 65, 'Failed to find 65 as the first data for AGE.'
    assert col['precision'] == -1, 'Failed to find -1 as the precision for AGE.'
    assert col['bins'][0] == 1, 'Failed to find 1 as the first bin for AGE.'
    assert col['base'] == 43, 'Failed to find 43 as the base for AGE.'
    assert col['stride'] == 1, 'Failed to find 1 as the stride for AGE.'
    assert col['pctiles'][0] == 50.5, 'Failed to find 50.5 as the first pctile for AGE.'
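# assertKeysExist and assertKeysExistAndNonNull live in h2o_util. A minimal
# sketch of what the tests above assume they do (the 'path' argument appears
# to exist only to build readable failure messages; the real implementation
# may differ):
def assertKeysExist(d, path, keys):
    for key in keys:
        assert key in d, "Failed to find key " + path + "/" + key + " in: " + repr(d)


def assertKeysExistAndNonNull(d, path, keys):
    for key in keys:
        assert key in d, "Failed to find key " + path + "/" + key + " in: " + repr(d)
        assert d[key] is not None, "Key " + path + "/" + key + " is null in: " + repr(d)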
def find_key(pattern=None):
    try:
        patternObj = re.compile(pattern)
    except:
        raise Exception("Need legal pattern in find_key, not %s" % pattern)

    frames = h2o_nodes.nodes[0].frames()['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')

    result = []
    for key in frames_dict:
        if patternObj.search(key):
            result.append(key)

    if len(result) == 0:
        verboseprint("Warning: No match for %s" % pattern)
        return None

    if len(result) > 1:
        verboseprint("Warning: multiple imported keys match the key pattern %s, Using: %s" % (pattern, result[0]))

    return result[0]
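# Example use of find_key, matching how the multi-file parse tests below use it
# (the pattern is a regex searched against every imported frame key):
# header = find_key('syn_header')
# if not header:
#     raise Exception("Didn't find syn_header* key in the import")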
    DatasetSpec('airlines_binomial', '../../smalldata/airlines/allyears2k_headers.zip', 43978,
                'Binomial', 'IsDepDelayed', ['IsArrDelayed', 'ArrDelay', 'DepDelay']),  # TODO: more ignored?
    DatasetSpec('iris_multinomial', '../../smalldata/iris/iris_wheader.csv', 150,
                'Multinomial', 'class', []),
]

datasets = {}  # the dataset spec
for dataset_spec in datasets_to_import:
    dataset = dataset_spec.import_and_validate_dataset(a_node)  # it's also stored in dataset_spec['dataset']
    datasets[dataset_spec['dest_key']] = dataset_spec

################################################
# Test /Frames for prostate.csv
frames = a_node.frames(len=5)['frames']
frames_dict = h2o_util.list_to_dict(frames, 'key/name')

# TODO: remove:
if h2o.H2O.verbose:
    print "frames: "
    pp.pprint(frames)

if h2o.H2O.verbose:
    print "frames_dict: "
    pp.pprint(frames_dict)

# TODO: test len and offset (they aren't working yet)
assert 'prostate_binomial' in frames_dict, "FAIL: Failed to find " + 'prostate_binomial' + " in Frames list."
assert not frames_dict['prostate_binomial']['isText'], "FAIL: Parsed Frame is isText"
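# DatasetSpec is defined elsewhere in this test harness. From the call sites
# above it behaves like a dict built from positional args (dest_key, path,
# expected_rows, model_category, response_column, ignored_columns), with an
# import_and_validate_dataset() method. A rough sketch under those assumptions,
# not the real implementation:
class DatasetSpec(dict):
    def __init__(self, dest_key, path, expected_rows, model_category, response_column, ignored_columns):
        super(DatasetSpec, self).__init__(
            dest_key=dest_key, path=path, expected_rows=expected_rows,
            model_category=model_category, response_column=response_column,
            ignored_columns=ignored_columns)

    def import_and_validate_dataset(self, a_node):
        # import and parse the file, check the row count, and remember the frame
        import_result = a_node.import_files(path=self['path'])
        parse_result = a_node.parse(key=import_result['keys'][0])
        frame = parse_result['frames'][0]
        assert frame['rows'] == self['expected_rows'], \
            "FAIL: expected " + str(self['expected_rows']) + " rows, got: " + str(frame['rows'])
        self['dataset'] = frame
        return frame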
def testImportProstate(self):
    cleanup(self.a_node)

    prostate_tuple = self.cfg.data['prostate']
    if prostate_tuple[0] == "file":
        import_result = self.a_node.import_files(path=os.path.abspath(prostate_tuple[1]))
    else:
        raise RuntimeError("Unsupported file type specified")

    parse_result = self.a_node.parse(key=import_result['keys'][0])  # TODO: handle multiple files
    self.prostate_key = parse_result['frames'][0]['key']['name']

    # Test /Frames for prostate.csv
    frames = self.a_node.frames()['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."

    # Test /Frames/{key} for prostate.csv
    frames = self.a_node.frames(key='prostate.hex')['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'CAPSULE' in columns_dict, "Failed to find CAPSULE in Frames/prostate.hex."
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
    assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
    assert None is columns_dict['AGE']['bins'], "Failed to clear bins field."  # should be cleared except for /summary

    frames = self.a_node.columns(key='prostate.hex')['frames']
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'ID' in columns_dict, "Failed to find ID in Frames/prostate.hex/columns."
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
    assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
    assert None is columns_dict['AGE']['bins'], "Failed to clear bins field."  # should be cleared except for /summary

    frames = self.a_node.column(key='prostate.hex', column='AGE')['frames']
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
    assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
    assert None is columns_dict['AGE']['bins'], "Failed to clear bins field."  # should be cleared except for /summary

    frames = self.a_node.summary(key='prostate.hex', column='AGE')['frames']
    columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
    assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns/AGE/summary."
    col = columns_dict['AGE']
    h2o_util.assertKeysExistAndNonNull(col, '', [
        'label', 'missing', 'zeros', 'pinfs', 'ninfs', 'mins', 'maxs', 'mean',
        'sigma', 'type', 'data', 'precision', 'bins', 'base', 'stride', 'pctiles'
    ])
    h2o_util.assertKeysExist(col, '', ['domain', 'str_data'])
    assert col['mins'][0] == 43, 'Failed to find 43 as the first min for AGE.'
    assert col['maxs'][0] == 79, 'Failed to find 79 as the first max for AGE.'
    assert col['mean'] == 66.03947368421052, 'Failed to find 66.03947368421052 as the mean for AGE.'
    assert col['sigma'] == 6.527071269173308, 'Failed to find 6.527071269173308 as the sigma for AGE.'
    assert col['type'] == 'int', 'Failed to find int as the type for AGE.'
    assert col['data'][0] == 65, 'Failed to find 65 as the first data for AGE.'
    assert col['precision'] == -1, 'Failed to find -1 as the precision for AGE.'
    assert col['bins'][0] == 1, 'Failed to find 1 as the first bin for AGE.'
    assert col['base'] == 43, 'Failed to find 43 as the base for AGE.'
    assert col['stride'] == 1, 'Failed to find 1 as the stride for AGE.'
    assert col['pctiles'][0] == 50.5, 'Failed to find 50.5 as the first pctile for AGE.'
def test_parse_multi_header_single(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

    # cols must be 9 to match the header above, otherwise a different bug is hit
    # extra output is added, so it's 10 total
    tryList = [
        (57, 300, 9, 'cA', 60, 0),
        # try with 1-3 data lines in the header file too
        (57, 300, 9, 'cB', 60, 1),
        (57, 300, 9, 'cC', 60, 2),
        (57, 300, 9, 'cD', 60, 3),
    ]

    trial = 0
    for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
        trial += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        for fileN in range(fileNum):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount)
            dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
            totalDataRows += dataRowsDone

        # create the header file
        # can make it pass by not doing this
        if HEADER:
            csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
            totalDataRows += dataRowsDone

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = "syn_" + str(trial)
        hex_key = "syn_" + str(trial) + ".hex"

        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files).
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
            print f

        # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
        # frames = h2o.nodes[0].frames()['frames']
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)

        if HEADER:
            header = h2i.find_key('syn_header')
            if not header:
                raise Exception("Didn't find syn_header* key in the import")

        # use regex. the only files in the dir will be the ones we just created with *fileN* match
        print "Header Key = " + header
        start = time.time()

        # does h2o-dev take a regex? or do we need to glob
        parseResult = h2i.parse_only(pattern='*' + rowxcol + '*',
                                     hex_key=hex_key, timeoutSecs=timeoutSecs,
                                     checkHeader="1")  # header_from_file=header

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        expectedLabelList = headerData.split(",")
        iA = h2o_cmd.InspectObj(pA.parse_key,
                                expectedNumRows=totalDataRows, expectedNumCols=totalCols,
                                expectedMissinglist=[], expectedLabelList=expectedLabelList)

        if DO_RF:
            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1,
                          'ignored_cols_by_name': 'ID,CAPSULE'}
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        h2o.check_sandbox_for_errors()
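# rand_rowData and write_syn_dataset are defined in the original test module; a
# minimal sketch consistent with the call sites above (signatures and return
# value inferred from usage, not copied from the source):
import random

def rand_rowData(colCount):
    # a reusable row of random ints, plus one extra 'output' column
    return [random.randint(0, 7) for _ in range(colCount + 1)]

def write_syn_dataset(csvPathname, rowCount, headerData=None, rList=None):
    dsf = open(csvPathname, 'w')
    if headerData is not None:
        dsf.write(headerData + '\n')
    for _ in range(rowCount):
        dsf.write(','.join(str(v) for v in rList) + '\n')
    dsf.close()
    return rowCount  # number of data rows written (the header line is not counted)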
    DatasetSpec('airlines_binomial', '../../smalldata/airlines/allyears2k_headers.zip', 43978,
                'Binomial', 'IsDepDelayed', ['IsArrDelayed', 'ArrDelay', 'DepDelay']),  # TODO: more ignored?
    DatasetSpec('iris_multinomial', '../../smalldata/iris/iris_wheader.csv', 150,
                'Multinomial', 'class', []),
]

datasets = {}  # the dataset spec
for dataset_spec in datasets_to_import:
    dataset = dataset_spec.import_and_validate_dataset(a_node)  # it's also stored in dataset_spec['dataset']
    datasets[dataset_spec['dest_key']] = dataset_spec

################################################
# Test /Frames for prostate.csv
frames = a_node.frames(row_count=5)['frames']
frames_dict = h2o_util.list_to_dict(frames, 'key/name')

# TODO: remove:
if h2o.H2O.verbose:
    print "frames: "
    pp.pprint(frames)

if h2o.H2O.verbose:
    print "frames_dict: "
    pp.pprint(frames_dict)

assert 'prostate_binomial' in frames_dict, "FAIL: Failed to find " + 'prostate_binomial' + " in Frames list."
assert not frames_dict['prostate_binomial']['is_text'], "FAIL: Parsed Frame is is_text"

# Test /Frames/{key} for prostate.csv
def test_parse_multi_header_single(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

    # cols must be 9 to match the header above, otherwise a different bug is hit
    # extra output is added, so it's 10 total
    tryList = [
        (57, 300, 9, 'cA', 60, 0),
        # try with 1-3 data lines in the header file too
        (57, 300, 9, 'cB', 60, 1),
        (57, 300, 9, 'cC', 60, 2),
        (57, 300, 9, 'cD', 60, 3),
    ]

    trial = 0
    for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
        trial += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        for fileN in range(fileNum):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount)
            dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
            totalDataRows += dataRowsDone

        # create the header file
        # can make it pass by not doing this
        if HEADER:
            csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
            totalDataRows += dataRowsDone

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = "syn_" + str(trial)
        hex_key = "syn_" + str(trial) + ".hex"

        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files).
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
            print f

        # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
        # frames = h2o.nodes[0].frames()['frames']
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)

        if HEADER:
            header = h2i.find_key('syn_header')
            if not header:
                raise Exception("Didn't find syn_header* key in the import")

        # use regex. the only files in the dir will be the ones we just created with *fileN* match
        print "Header Key = " + header
        start = time.time()

        # does h2o-dev take a regex? or do we need to glob
        parseResult = h2i.parse_only(pattern='*' + rowxcol + '*',
                                     hex_key=hex_key, timeoutSecs=timeoutSecs,
                                     checkHeader="1")  # header_from_file=header

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        expectedLabelList = headerData.split(",")
        iA = h2o_cmd.InspectObj(pA.parse_key,
                                expectedNumRows=totalDataRows, expectedNumCols=totalCols,
                                expectedMissinglist=[], expectedLabelList=expectedLabelList)

        if DO_RF:
            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1,
                          'ignored_cols_by_name': 'ID,CAPSULE'}
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        h2o.check_sandbox_for_errors()
def test_simple2(self):
    # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
    # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
    csvPathname = find_file("smalldata/logreg/prostate.csv")
    import_result = h2o.n0.import_files(path=csvPathname)
    # print dump_json(import_result)

    k = import_result['keys'][0]
    frames_result = h2o.n0.frames(key=k)

    frame = frames_result['frames'][0]
    rows = frame['rows']
    columns = frame['columns']
    for c in columns:
        label = c['label']
        missing = c['missing_count']
        stype = c['type']
        domain = c['domain']

    # print dump_json(frame)

    # let's see what ray's util does
    frames = h2o.n0.frames()['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    # print "frames:", dump_json(frames)
    # print "frames_dict:", dump_json(frames_dict)
    for k, v in frames_dict.items():
        print "frames_dict key:", k

    # interesting. we can do dictionary comprehensions
    # { k:v for k,v in my_dict.items() if 'Peter' in k }

    # how do you parse multiple files
    parse_result = h2o.n0.parse(key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

    frame = parse_result['frames'][0]
    hex_key = frame['key']['name']

    colCount = 9
    rowCount = 380
    # colCount = 11
    # rowCount = 1000000

    start = time.time()
    inspect = h2o_cmd.runInspect(None, hex_key)
    print "Inspect:", hex_key, "took", time.time() - start, "seconds"

    numCols = len(inspect['frames'][0]['columns'])
    numRows = inspect['frames'][0]['rows']
    print "\n" + csvPathname, \
        " rows:", "{:,}".format(numRows), \
        " len(columns):", "{:,}".format(numCols)

    # should match # of cols in header or ??
    self.assertEqual(numCols, colCount,
                     "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
    self.assertEqual(numRows, rowCount,
                     "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
                     (numRows, rowCount))

    verboseprint(hex_key, ":", dump_json(parse_result))