def test_KMeansGrid_params_rand2_fvec(self):
    """Launch randomized KMeans grid builds on covtype.data and time them.

    The dataset is imported from the ``home-0xdiag-datasets`` bucket and
    inspected once. Three trials then start a KMeans job (``k`` of "2,3" plus
    whatever ``pickRandKMeansParams`` selects) with ``noPoll=True`` and wait
    for it through ``h2j.pollWaitJobs``. Nothing is asserted about the
    resulting models; only progress and timing are printed.
    """
    # The localhost and non-localhost lists are currently identical.
    if h2o.localhost:
        datasets = [("covtype.data", 800)]
    else:
        datasets = [("covtype.data", 800)]

    folder = "standard"
    for fileName, jobTimeout in datasets:
        pathname = folder + "/" + fileName
        parsed = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=pathname,
            timeoutSecs=2000,
            pollTimeoutSecs=60,
        )
        summary = h2o_cmd.runInspect(None, parsed["destination_key"])
        # One pre-joined argument emits the same bytes as the comma-separated
        # print statement did.
        print(" ".join([
            "\n" + pathname,
            " numRows:", "{:,}".format(summary["numRows"]),
            " numCols:", "{:,}".format(summary["numCols"]),
        ]))

        randomChoices = define_params(SEED)
        for trial in range(3):
            # Defaults; presumably pickRandKMeansParams adds to this dict.
            modelKey = fileName + "_" + str(trial) + ".hex"
            params = {"k": "2,3", "destination_key": modelKey}
            h2o_kmeans.pickRandKMeansParams(randomChoices, params)
            kwargs = params.copy()

            began = time.time()
            h2o_cmd.runKMeans(
                parseResult=parsed,
                timeoutSecs=jobTimeout,
                retryDelaySecs=2,
                pollTimeoutSecs=60,
                noPoll=True,
                **kwargs
            )
            h2j.pollWaitJobs(timeoutSecs=jobTimeout, pollTimeoutSecs=jobTimeout)
            elapsed = time.time() - began

            print("FIX! how do we get results..need redirect_url")
            print("Have to inspect different models? (grid)")
            pctOfTimeout = "%d pct. of timeout" % ((elapsed / jobTimeout) * 100)
            print(" ".join([
                "kmeans end on ", pathname, "took", str(elapsed), "seconds.",
                pctOfTimeout,
            ]))

            # No result checks yet: the original author noted that
            # destination_key is ignored by kmeans, so the keys holding the
            # grid results are not known here.
            print(" ".join(["Trial #", str(trial), "completed\n"]))
def test_KMeansGrid_params_rand2_fvec(self):
    """Time randomized KMeans grid jobs on covtype.data with beta features on.

    Sets ``h2o.beta_features`` to True, imports and inspects the dataset, then
    runs three trials. Each trial submits a KMeans job with ``k`` of "2,3"
    and randomly picked extra parameters without polling, and blocks on
    ``h2j.pollWaitJobs`` until the job queue drains. Only timing output is
    produced; the models themselves are not checked.
    """
    h2o.beta_features = True

    # Same dataset and timeout regardless of where the cloud runs.
    if localhost:
        fileSpecs = [("covtype.data", 800)]
    else:
        fileSpecs = [("covtype.data", 800)]

    importFolderPath = "standard"
    for dataFile, maxSecs in fileSpecs:
        dataPath = importFolderPath + "/" + dataFile
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=dataPath,
            timeoutSecs=2000,
            pollTimeoutSecs=60,
        )
        frameInfo = h2o_cmd.runInspect(None, parseResult["destination_key"])
        rowText = "{:,}".format(frameInfo["numRows"])
        colText = "{:,}".format(frameInfo["numCols"])
        # Joined with single spaces to match what the multi-item print
        # statement wrote.
        print(" ".join(["\n" + dataPath, " numRows:", rowText, " numCols:", colText]))

        paramChoices = define_params(SEED)
        for trial in range(3):
            # Starting values handed to the random parameter picker.
            params = {
                "k": "2,3",
                "destination_key": dataFile + "_" + str(trial) + ".hex",
            }
            h2o_kmeans.pickRandKMeansParams(paramChoices, params)
            kwargs = params.copy()

            t0 = time.time()
            h2o_cmd.runKMeans(
                parseResult=parseResult,
                timeoutSecs=maxSecs,
                retryDelaySecs=2,
                pollTimeoutSecs=60,
                noPoll=True,
                **kwargs
            )
            h2j.pollWaitJobs(timeoutSecs=maxSecs, pollTimeoutSecs=maxSecs)
            elapsed = time.time() - t0

            print("FIX! how do we get results..need redirect_url")
            print("Have to inspect different models? (grid)")
            timingParts = [
                "kmeans grid end on ",
                dataPath,
                "took",
                str(elapsed),
                "seconds.",
                "%d pct. of timeout" % ((elapsed / maxSecs) * 100),
            ]
            print(" ".join(timingParts))

            # Result inspection is still pending: per the original note,
            # destination_key is ignored by kmeans and the result keys are
            # unknown.
            print(" ".join(["Trial #", str(trial), "completed\n"]))
def test_KMeans_params_rand2(self):
    """Run three randomized KMeans builds on covtype20x.data and check each.

    The file is parsed from the import folder
    ``/home/0xdiag/datasets/standard`` and inspected. Each trial starts from
    ``k=1`` and a per-trial destination key, lets ``pickRandKMeansParams``
    choose further parameters, runs ``runKMeansOnly``, validates the result
    with ``simpleCheckKMeans`` and dumps an inspect of the model key.
    """
    # Both branches currently select the same file and timeout.
    if localhost:
        fileSpecs = [("covtype20x.data", 800)]
    else:
        fileSpecs = [("covtype20x.data", 800)]

    importFolderPath = "/home/0xdiag/datasets/standard"
    h2i.setupImportFolder(None, importFolderPath)

    for csvFilename, timeoutSecs in fileSpecs:
        # Parse the file found in the import folder.
        parseKey = h2i.parseImportFolderFile(
            None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60
        )
        parsedInfo = h2o_cmd.runInspect(None, parseKey["destination_key"])
        csvPathname = importFolderPath + "/" + csvFilename
        # Single joined argument: same output as the multi-item print
        # statement.
        print(" ".join([
            "\n" + csvPathname,
            " num_rows:", "{:,}".format(parsedInfo["num_rows"]),
            " num_cols:", "{:,}".format(parsedInfo["num_cols"]),
        ]))

        choices = define_params(SEED)
        for trial in range(3):
            # Defaults passed to the random parameter picker.
            params = {
                "k": 1,
                "destination_key": csvFilename + "_" + str(trial) + ".hex",
            }
            h2o_kmeans.pickRandKMeansParams(choices, params)
            kwargs = params.copy()

            startedAt = time.time()
            kmeans = h2o_cmd.runKMeansOnly(
                parseKey=parseKey,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=2,
                pollTimeoutSecs=60,
                **kwargs
            )
            elapsed = time.time() - startedAt
            pctText = "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print(" ".join([
                "kmeans end on ", csvPathname, "took", str(elapsed),
                "seconds.", pctText,
            ]))

            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # Dump the inspect of the key the KMeans response reports.
            modelInfo = h2o_cmd.runInspect(None, key=kmeans["destination_key"])
            print(h2o.dump_json(modelInfo))

            print(" ".join(["Trial #", str(trial), "completed\n"]))
def test_KMeansGrid_params_rand2(self):
    """Time three randomized KMeans grid jobs on covtype.data.

    The dataset is imported and inspected first (reading the ``num_rows`` /
    ``num_cols`` inspect fields), and only afterwards is
    ``h2o.beta_features`` switched on for the grid runs. Each trial submits
    KMeans with ``k`` of "c(2,3)" and randomly picked parameters using
    ``noPoll=True``, then waits via ``h2j.pollWaitJobs``. No model checks are
    performed.
    """
    # Identical selections for local and remote runs.
    if localhost:
        targets = [("covtype.data", 800)]
    else:
        targets = [("covtype.data", 800)]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in targets:
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=csvPathname,
            timeoutSecs=2000,
            pollTimeoutSecs=60,
        )
        info = h2o_cmd.runInspect(None, parseResult["destination_key"])
        header = [
            "\n" + csvPathname,
            " num_rows:",
            "{:,}".format(info["num_rows"]),
            " num_cols:",
            "{:,}".format(info["num_cols"]),
        ]
        # Space-joined to reproduce the multi-item print statement's output.
        print(" ".join(header))

        paramDict = define_params(SEED)
        # Original note: "no grid for VA", so the flag is turned on only
        # after the parse and inspect above.
        h2o.beta_features = True

        for trial in range(3):
            # Defaults supplied to the random parameter picker.
            resultKey = csvFilename + "_" + str(trial) + ".hex"
            params = {"k": "c(2,3)", "destination_key": resultKey}
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            tick = time.time()
            h2o_cmd.runKMeans(
                parseResult=parseResult,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=2,
                pollTimeoutSecs=60,
                noPoll=True,
                **kwargs
            )
            h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
            elapsed = time.time() - tick

            print("FIX! how do we get results..need redirect_url")
            print("Have to inspect different models? (grid)")
            print(" ".join([
                "kmeans grid end on ",
                csvPathname,
                "took",
                str(elapsed),
                "seconds.",
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100),
            ]))

            # Checking results is deferred: the original author noted that
            # destination_key is ignored by kmeans and the result keys are
            # not known.
            print(" ".join(["Trial #", str(trial), "completed\n"]))
def test_KMeans_params_rand2_fvec(self):
    """Run and check three randomized KMeans builds on covtype.data.

    Enables ``h2o.beta_features``, imports the dataset from the
    ``home-0xdiag-datasets`` bucket and inspects it. Every trial starts from
    ``max_iter=20``, ``k=1`` and a per-trial destination key, lets
    ``pickRandKMeansParams`` choose further parameters, runs
    ``h2o_cmd.runKMeans`` and validates the response with
    ``h2o_kmeans.simpleCheckKMeans``.
    """
    h2o.beta_features = True

    # The two environments use the same file list today.
    if localhost:
        fileList = [("covtype.data", 800)]
    else:
        fileList = [("covtype.data", 800)]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in fileList:
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(
            bucket="home-0xdiag-datasets",
            path=csvPathname,
            timeoutSecs=2000,
            pollTimeoutSecs=60,
        )
        frame = h2o_cmd.runInspect(None, parseResult["destination_key"])
        rows = "{:,}".format(frame["numRows"])
        cols = "{:,}".format(frame["numCols"])
        # Pre-joined so the output matches the comma-separated print
        # statement.
        print(" ".join(["\n" + csvPathname, " numRows:", rows, " numCols:", cols]))

        paramDict = define_params(SEED)
        for trial in range(3):
            # Defaults passed to the random parameter picker.
            params = {
                "max_iter": 20,
                "k": 1,
                "destination_key": csvFilename + "_" + str(trial) + ".hex",
            }
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            clockStart = time.time()
            kmeans = h2o_cmd.runKMeans(
                parseResult=parseResult,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=2,
                pollTimeoutSecs=60,
                **kwargs
            )
            elapsed = time.time() - clockStart

            share = "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print(" ".join([
                "kmeans end on ", csvPathname, "took", str(elapsed),
                "seconds.", share,
            ]))

            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            print(" ".join(["Trial #", str(trial), "completed\n"]))
def test_KMeans_params_rand2(self):
    """Run three randomized KMeans grid-only builds on covtype.data.

    The file is parsed from the import folder
    ``/home/0xdiag/datasets/standard`` and inspected. Each trial starts from
    ``k=1`` only (no destination key is supplied), adds randomly picked
    parameters, calls ``h2o_cmd.runKMeansGridOnly``, validates the response
    with ``simpleCheckKMeans`` and dumps an inspect of the reported key.
    """
    # Local and remote runs share the same file and timeout.
    if localhost:
        inputs = [("covtype.data", 800)]
    else:
        inputs = [("covtype.data", 800)]

    importFolderPath = "/home/0xdiag/datasets/standard"
    h2i.setupImportFolder(None, importFolderPath)

    for csvFilename, timeoutSecs in inputs:
        # Parse the file found in the import folder.
        parseKey = h2i.parseImportFolderFile(
            None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60
        )
        parseInfo = h2o_cmd.runInspect(None, parseKey["destination_key"])
        csvPathname = importFolderPath + "/" + csvFilename
        banner = [
            "\n" + csvPathname,
            " num_rows:", "{:,}".format(parseInfo["num_rows"]),
            " num_cols:", "{:,}".format(parseInfo["num_cols"]),
        ]
        # Joined with single spaces, as the multi-item print statement did.
        print(" ".join(banner))

        paramDict = define_params(SEED)
        for trial in range(3):
            # Only k is preset here; no destination_key default is passed.
            params = {"k": 1}
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            kmeans = h2o_cmd.runKMeansGridOnly(
                parseKey=parseKey,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=2,
                pollTimeoutSecs=60,
                **kwargs
            )
            elapsed = time.time() - start

            print(" ".join([
                "kmeans grid end on ",
                csvPathname,
                "took",
                str(elapsed),
                "seconds.",
                "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100),
            ]))

            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # Dump the inspect of the key the KMeans response reports.
            resultInfo = h2o_cmd.runInspect(None, key=kmeans["destination_key"])
            print(h2o.dump_json(resultInfo))

            print(" ".join(["Trial #", str(trial), "completed\n"]))
def test_KMeans_create_frame_fvec(self):
    """Create 20 random frames with create_frame and run KMeans on each.

    Every trial picks random create_frame parameters (starting from 5 rows
    by 10 columns), reconciles the randomize/fraction settings, creates the
    frame on node 0 and inspects it. KMeans is then run on that frame with
    randomized parameters (defaults ``max_iter=20``, ``k=1``) and the
    response is validated with ``h2o_kmeans.simpleCheckKMeans``.
    """
    timeoutSecs = 300
    for trial in range(20):
        cfParamDict = define_create_frame_params(SEED)
        # Defaults passed to the random parameter picker.
        frameParams = {"rows": 5, "cols": 10}
        h2o_util.pickRandParams(cfParamDict, frameParams)

        # h2o does some strict checking on the combinations of these things:
        # fractions have to add up to <= 1 and only be used if randomize.
        # (Original author's open question: is h2o's default randomize=1?)
        if frameParams.get("randomize", None):
            intFraction = frameParams.get("integer_fraction", None) or 0
            catFraction = frameParams.get("categorical_fraction", None) or 0
            if (intFraction and catFraction) and (intFraction + catFraction) >= 1.0:
                # Shrink the categorical share so the two sum to 1.0.
                catFraction = 1.0 - intFraction
            frameParams["integer_fraction"] = intFraction
            frameParams["categorical_fraction"] = catFraction
            frameParams["value"] = None
        else:
            frameParams["randomize"] = 0
            frameParams["integer_fraction"] = 0
            frameParams["categorical_fraction"] = 0

        hex_key = "temp_%s.hex" % trial
        h2o.nodes[0].create_frame(
            key=hex_key, timeoutSecs=timeoutSecs, **frameParams.copy()
        )
        inspect = h2o_cmd.runInspect(None, hex_key)
        # A single pre-joined argument prints the same bytes as a multi-item
        # Python 2 print statement.
        print(" ".join([
            "\n%s" % hex_key,
            " numRows:", "{:,}".format(inspect["numRows"]),
            " numCols:", "{:,}".format(inspect["numCols"]),
        ]))

        kmeansParamDict = define_KMeans_params(SEED)
        # Defaults passed to the random parameter picker.
        params = {
            "max_iter": 20,
            "k": 1,
            "destination_key": "KM_" + str(trial) + ".hex",
        }
        h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params)
        kwargs = params.copy()

        start = time.time()
        parseResult = {"destination_key": hex_key}
        kmeans = h2o_cmd.runKMeans(
            parseResult=parseResult,
            timeoutSecs=timeoutSecs,
            retryDelaySecs=2,
            pollTimeoutSecs=60,
            **kwargs
        )
        elapsed = time.time() - start

        # Format the trial and frame key into the message; previously the
        # "%s" placeholder was printed literally.
        print(" ".join([
            "kmeans trial %s end on %s" % (trial, hex_key),
            "took", str(elapsed), "seconds.",
            "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100),
        ]))

        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        print(" ".join(["Trial #", str(trial), "completed\n"]))