def test_KMeans_covtype_cols_fvec(self): h2o.beta_features = True # just do the import folder once # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ ("covtype.binary.svm", "cC", 30, 1), # normal csv ] ### csvFilenameList = random.sample(csvFilenameAll,1) # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False importFolderPath = "libsvm" for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print "Parse result['destination_key']:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] # KMEANS****************************************** for trial in range(1): kwargs = { 'k': 3, 'initialization': 'Furthest', 'ignored_cols': range(11, numCols), 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseResult['destination_key'], for trial2 in range(3): timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    """Parse one csv file and run a k=1 KMeans over its first 11 columns.

    The first call caches the resulting cluster centers in self.clusters1;
    subsequent calls compare their centers to that baseline (callers pass
    replicated copies of the same data, so centers should agree).

    :param csvFilename: bare filename, used to name the parsed .hex key
    :param csvPathname: full path handed to h2o_cmd.parseFile
    :param timeoutSecs: KMeans job timeout (parse has its own 10s timeout)
    """
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str,range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex'
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    ### print h2o.dump_json(inspect)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    """Parse one csv file and run a k=1 KMeans over its first 11 columns.

    NOTE(review): this is byte-for-byte the same helper as the other
    kmeans_doit in this file; if both live in the same class, this later
    definition shadows the earlier one — consider deduplicating.

    First call caches centers in self.clusters1; later calls compare
    against that baseline.
    """
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str, range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex'
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    ### print h2o.dump_json(inspect)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
def test_libsvm(self):
    """Write a synthetic svmlight-style file twice, parse/inspect/summarize
    it, and (when DO_KMEANS is set) run a fixed-seed KMeans on the frame.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    for trial in range(2):
        csvFilename = "syn_ints.csv"
        hex_key = "1.hex"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # trial is passed so the generator can vary content per iteration
        write_syn_dataset(csvPathname, trial)
        timeoutSecs = 10

        # have to import each time, because h2o deletes source after parse

        # PARSE******************************************
        # creates csvFilename.hex from file in importFolder dir
        # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
        parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvFilename)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key)
        h2o_cmd.infoFromSummary(summaryResult)

        if DO_KMEANS:
            # KMEANS******************************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }
            # fails if I put this in kwargs..i.e. source = dest
            # 'destination_key': parseResult['destination_key'],
            timeoutSecs = 600
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            # this does an inspect of the model and prints the clusters
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_KMeans2_winesPCA(self):
    """Beta-features KMeans on winesPCA.csv: 10 trials, k=3, fixed seed.

    Each trial's (center, rows, error) tuples are compared against
    hard-coded expected values; OLD_KMEANS selects which expectation set.
    """
    h2o.beta_features = True
    csvPathname = 'winesPCA.csv'
    start = time.time()
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10)
    print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    kwargs = {
        'initialization': 'Furthest',
        # 'initialization': '',
        # 'initialization': 'PlusPlus',
        'max_iter': 50,
        'k': 3,
        'seed': '265211114317615310',
    }

    timeoutSecs = 480

    # try the same thing 5 times
    for trial in range (10):
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        (centers, tupleResultList) = \
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

        # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster
        # now compare expected vs actual. By sorting on center, we should be able to compare
        # since the centers should be separated enough to have the order be consistent
        if OLD_KMEANS:
            expected = [
                ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794),
                ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745),
                ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474),
            ]
        else:
            # error: 258.051462872
            expected = [
                ([-2.23406681758209, -0.7729819755373136], 67, 96.85372611195429),
                ([0.25174392601612905, 1.792222172419355], 62, 99.21823733913352),
                ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474),
            ]

        # multipliers on the expected values for allowed
        # within 2% of best with random seeds?
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def test_kmeans_sphere5(self):
    """KMeans on 5 generated spheres (10000 pts each); 5 fixed-seed trials.

    h2o's centers (sorted on the first coordinate) must match the generated
    centers within delta=1 on each of x, y, z.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    CLUSTERS = 5
    SPHERE_PTS = 10000
    csvFilename = 'syn_spheres100.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    # returns the true centers used to generate the data
    centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS)

    print "\nStarting", csvFilename
    parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex")

    # try 5 times, to see if all inits by h2o are good
    for trial in range(5):
        # pass SEED so it's repeatable
        kwargs = {
            'k': CLUSTERS,
            'max_iter': 10,
            'initialization': 'Furthest',
            'cols': None,
            'destination_key': 'syn_spheres100.hex',
            'seed': SEED
        }
        timeoutSecs = 30
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex')

        ### print h2o.dump_json(kmeans)
        ### print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        # cluster centers can return in any order
        clusters = kmeansResult['KMeansModel']['clusters']
        clustersSorted = sorted(clusters, key=itemgetter(0))
        ### print clustersSorted

        print "\nh2o result, centers sorted"
        print clustersSorted
        print "\ngenerated centers"
        print centersList
        for i,center in enumerate(centersList):
            a = center
            b = clustersSorted[i]
            print "\nexpected:", a
            print "h2o:", b # h2o result
            aStr = ",".join(map(str,a))
            bStr = ",".join(map(str,b))
            iStr = str(i)
            self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" x not correct.")
            self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" y not correct.")
            self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+" z not correct.")

        print "Trial #", trial, "completed"
def test_C_kmeans_prostate(self): csvFilename = "prostate.csv" print "\nStarting", csvFilename csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename) parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") kwargs = {'k': 1, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'} kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
def test_KMeans_params_rand2(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] importFolderPath = '/home/0xdiag/datasets/standard' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params() for trial in range(3): randomV = paramDict['k'] k = random.choice(randomV) randomV = paramDict['epsilon'] epsilon = random.choice(randomV) randomV = paramDict['cols'] cols = random.choice(randomV) kwargs = {'k': k, 'epsilon': epsilon, 'cols': cols, 'destination_key': csvFilename + "_" + str(trial) + '.hex'} start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_KMeans_winesPCA(self):
    """KMeans k=3, Furthest init, fixed seed, on winesPCA.csv (put schema);
    10 trials, each compared to hard-coded expected centers/rows/errors."""
    csvPathname = 'winesPCA.csv'
    start = time.time()
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', timeoutSecs=10)
    print "parse end on ", csvPathname, 'took', time.time( ) - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    kwargs = {
        #appears not to take 'cols'?
        'cols': None,
        'initialization': 'Furthest',
        'k': 3,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }

    timeoutSecs = 480

    # try the same thing 5 times
    for trial in range(10):
        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        (centers, tupleResultList) = \
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

        # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster
        # now compare expected vs actual. By sorting on center, we should be able to compare
        # since the centers should be separated enough to have the order be consistent
        expected = [
            ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794),
            ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745),
            ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474),
        ]
        # multipliers on the expected values for allowed
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def test_libsvm(self):
    """Write a synthetic svmlight-style file twice, parse/inspect/summarize
    it, and (when DO_KMEANS is set) run a fixed-seed KMeans on the frame.

    NOTE(review): duplicates the other test_libsvm in this file — if both
    are in the same class, the later definition shadows the earlier one.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    for trial in range(2):
        csvFilename = "syn_ints.csv"
        hex_key = "1.hex"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        # trial is passed so the generator can vary content per iteration
        write_syn_dataset(csvPathname, trial)
        timeoutSecs = 10

        # have to import each time, because h2o deletes source after parse

        # PARSE******************************************
        # creates csvFilename.hex from file in importFolder dir
        # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
        parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvFilename)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        summaryResult = h2o_cmd.runSummary(key=hex_key)
        h2o_cmd.infoFromSummary(summaryResult)

        if DO_KMEANS:
            # KMEANS******************************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }
            # fails if I put this in kwargs..i.e. source = dest
            # 'destination_key': parseResult['destination_key'],
            timeoutSecs = 600
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            # this does an inspect of the model and prints the clusters
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_KMeans_covtype20x_fvec(self):
    """Fvec KMeans (k=2, Furthest init, fixed seed) on covtype20x from the
    home-0xdiag-datasets bucket, with bigCheckResults validation."""
    h2o.beta_features = True
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 1200, 'cA'),
        ]
    else:
        # None is okay for hex_key
        csvFilenameList = [
            ('covtype20x.data', 1200, 'cA'),
            # ('covtype200x.data', 1000,'cE'),
        ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs, hex_key in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseResult = h2i.import_parse(
            bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, hex_key=hex_key) # noise=('JStack', None)
        print "parse end on ", csvPathname, 'took', time.time( ) - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        k = 2
        kwargs = {
            'max_iter': 25,
            'initialization': 'Furthest',
            'k': k,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }

        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
            self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_KMeans_winesPCA(self): if localhost: csvFilenameList = [ #with winesPCA2.csv speciy cols = "1,2" ('winesPCA.csv', 480, 'cA'), ] else: # None is okay for key2 csvFilenameList = [ ('winesPCA.csv', 480,'cA'), # ('covtype200x.data', 1000,'cE'), ] importFolderPath = os.path.abspath(h2o.find_file('smalldata')) h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs, key2 in csvFilenameList: csvPathname = importFolderPath + "/" + csvFilename # creates csvFilename.hex from file in importFolder dir start = time.time() parseKey = h2i.parseImportFolderFile(None, 'winesPCA.csv', importFolderPath, timeoutSecs=2000, key2=key2) # noise=('JStack', None) print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds' h2o.check_sandbox_for_errors() inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) kwargs = { #appears not to take 'cols'? 'cols': None, 'epsilon': 1e-6, 'k': 3 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) centers = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs) print "Expected centers: [-2.276318, -0.965151], with 59 rows." print " [0.0388763, 1.63886039], with 71 rows." print " [2.740469, -1.237816], with 48 rows." model_key = kmeans['destination_key'] kmeansScoreResult = h2o.nodes[0].kmeans_score( key = parseKey['destination_key'], model_key = model_key) score = kmeansScoreResult['score']
def test_KMeans_params_rand2(self):
    """Random-parameter KMeans sweep on covtype20x: 3 trials per file with
    params drawn by h2o_kmeans.pickRandKMeansParams from define_params(SEED)."""
    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype20x.data', 800),
        ]
    else:
        csvFilenameList = [
            ('covtype20x.data', 800),
        ]

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        paramDict = define_params(SEED)
        for trial in range(3):
            # default
            params = {
                'k': 1,
                'destination_key': csvFilename + "_" + str(trial) + '.hex'
            }
            # overlays random choices from paramDict onto the defaults
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print h2o.dump_json(inspect)

            print "Trial #", trial, "completed\n"
def test_KMeansGrid_basic(self):
    """KMeans grid run (k=2, fixed seed, normalize off, max_iter='2' as a
    grid-style string) on covtype.data; 3 trials with identical params."""
    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype.data', 800),
        ]
    else:
        csvFilenameList = [
            ('covtype.data', 800),
        ]

    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        # note: path is hard-coded to standard/covtype.data regardless of csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path='standard/covtype.data',
            schema='local', timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "python_source:", parseResult['python_source']
        csvPathname = parseResult['python_source']
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        destination_key = 'd.hex'
        params = {
            'k': 2,
            # 'initialization': 'Furthest',
            'initialization': None,
            'seed': 3923021996079663354,
            'normalize': 0,
            'max_iter': '2',
            'destination_key': destination_key
        }
        for trial in range(3):
            kwargs = params.copy()
            h2o.beta_features = True
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans (with grid) end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # This doesn't work (inspecting the model)
            # inspect = h2o_cmd.runInspect(None,key=destination_key)
            # print h2o.dump_json(inspect)

            print "Trial #", trial, "completed\n"
def test_KMeans_covtype20x_fvec(self):
    """Fvec KMeans (k=2, Furthest init, fixed seed) on covtype20x, then a
    gap-statistic run (k_max=8) on the parsed frame.

    NOTE(review): same name as the other test_KMeans_covtype20x_fvec in
    this file; if both are in one class, this later def shadows the other.
    """
    h2o.beta_features = True
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 1200, 'cA'),
        ]
    else:
        # None is okay for hex_key
        csvFilenameList = [
            ('covtype20x.data', 1200,'cA'),
            # ('covtype200x.data', 1000,'cE'),
        ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs, hex_key in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, hex_key=hex_key) # noise=('JStack', None)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        k = 2
        kwargs = {
            'max_iter': 25,
            'initialization': 'Furthest',
            'k': k,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }

        start = time.time()
        kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

        gs = h2o.nodes[0].gap_statistic(source=hex_key, k_max=8)
        print "gap_statistic:", h2o.dump_json(gs)
def test_GLM_covtype20x(self):
    """Run KMeans (k=2, epsilon=1e-4) on covtype20x with a browser attached.

    NOTE(review): despite the GLM name, the body runs KMeans — presumably
    cloned from a GLM test; consider renaming to a KMeans test name.
    """
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 480, 'cA'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            ('covtype20x.data', 480,'cA'),
            # ('covtype200x.data', 1000,'cE'),
        ]

    # a browser window too, just because we can
    h2b.browseTheCloud()

    importFolderPath = '/home/0xdiag/datasets'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, key2=key2, noise=('JStack', None))
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            'cols': None,
            'epsilon': 1e-4,
            'k': 2
        }

        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        ### print h2o.dump_json(kmeans)
        inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
        print h2o.dump_json(inspect)
def test_C_kmeans_prostate(self):
    """KMeans k=3 on prostate.csv, run twice to see whether the resulting
    centers repeat; each run's centers are shown via show_results.

    NOTE(review): same name as the other test_C_kmeans_prostate in this
    file; if both are in one class, this later def shadows the other.
    """
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    # loop, to see if we get same centers
    for i in range(2):
        kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'prostate_k.hex'}
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        model_key = kmeans['destination_key']
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        centers = kmeansResult['KMeansModel']['clusters']
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        show_results(csvPathname, parseKey, model_key, centers, 'd')
def test_kmeans_benign(self):
    """Newer REST-API KMeans (h2o.n0.build_model, algo='kmeans') on
    logreg/benign.csv: 5 trials with fresh random seeds, K=4, PlusPlus
    init, normalization off; each model is checked via simpleCheckKMeans.
    """
    importFolderPath = "logreg"
    csvFilename = "benign.csv"
    hex_key = "benign.hex"
    csvPathname = importFolderPath + "/" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
        checkHeader=1, timeoutSecs=180, doSummary=False)
    numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

    inspectResult = h2o_cmd.runInspect(key=parse_key)
    missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

    # NOTE(review): expected/allowedDelta are built here but never compared
    # in this visible body — presumably a compareResultsToExpected call was
    # planned or removed; confirm before relying on these values.
    expected = [
        ([8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None),
        ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None),
        ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None),
        ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None),
    ]

    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01, 0.01)

    # loop, to see if we get same centers
    # no cols ignored
    labelListUsed = list(labelList)
    numColsUsed = numCols
    for trial in range(5):
        kmeansSeed = random.randint(0, sys.maxint)
        # kmeansSeed = 6655548259421773879
        parameters = {
            'validation_frame': parse_key,
            'ignored_columns': None,
            'score_each_iteration': False,
            'K': 4,
            'max_iters': 50,
            'normalize': False,
            'seed': kmeansSeed,
            'init': 'PlusPlus',
        }

        model_key = 'benign_k.hex'
        kmeansResult = h2o.n0.build_model(
            algo='kmeans',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=10)

        modelResult = h2o.n0.models(key=model_key)

        # this prints too
        tuplesSorted, iters, mse, names = \
            h2o_kmeans.simpleCheckKMeans(self, modelResult, parameters, numRows, numColsUsed, labelListUsed)
        h2o_cmd.runStoreView()

        # zip with * is it's own inverse here. It's sorted by centers for easy comparisons
        ids, mses, rows, clusters = zip(*tuplesSorted)
def test_KMeans_params_rand2_fvec(self):
    """Fvec random-parameter KMeans sweep on covtype.data: 3 trials with
    params from h2o_kmeans.pickRandKMeansParams over define_params(SEED)."""
    h2o.beta_features = True
    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype.data', 800),
        ]
    else:
        csvFilenameList = [
            ('covtype.data', 800),
        ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])
        paramDict = define_params(SEED)
        for trial in range(3):
            # default
            params = {
                'max_iter': 20,
                'k': 1,
                'destination_key': csvFilename + "_" + str(trial) + '.hex'
            }
            # overlays random choices from paramDict onto the defaults
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)
            print "Trial #", trial, "completed\n"
def test_kmeans_sphere3(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres3_' + str(SEED) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename write_syn_dataset(csvPathname, 1000000, SEED) print "\nStarting", csvFilename parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex") kwargs = { 'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex' } timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ( (elapsed / timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='spheres3.hex') ### print h2o.dump_json(kmeans) print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) clusters = kmeansResult['KMeansModel']['clusters'] # cluster centers can return in any order clustersSorted = sorted(clusters, key=itemgetter(0)) self.assertAlmostEqual(clustersSorted[0][0], 100, delta=.2) self.assertAlmostEqual(clustersSorted[1][0], 200, delta=.2) self.assertAlmostEqual(clustersSorted[2][0], 300, delta=.2) self.assertAlmostEqual(clustersSorted[0][1], 100, delta=.2) self.assertAlmostEqual(clustersSorted[1][1], 200, delta=.2) self.assertAlmostEqual(clustersSorted[2][1], 300, delta=.2) self.assertAlmostEqual(clustersSorted[0][2], 100, delta=.2) self.assertAlmostEqual(clustersSorted[1][2], 200, delta=.2) self.assertAlmostEqual(clustersSorted[2][2], 300, delta=.2)
def test_KMeans_winesPCA(self):
    """Older-API KMeans (parseFile/runKMeansOnly) on smalldata/winesPCA.csv:
    10 trials, k=3, Furthest init, fixed seed, compared to expected centers.

    NOTE(review): same name as the other test_KMeans_winesPCA defs in this
    file; if they share a class, only the last definition is run.
    """
    csvPathname = h2o.find_file('smalldata/winesPCA.csv')
    start = time.time()
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, timeoutSecs=10)
    print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    kwargs = {
        #appears not to take 'cols'?
        'cols': None,
        'initialization': 'Furthest',
        'k': 3,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }

    timeoutSecs = 480

    # try the same thing 5 times
    for trial in range (10):
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans #", trial, "end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

        (centers, tupleResultList) = \
            h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

        # tupleResultList has tuples = center, rows_per_cluster, sqr_error_per_cluster
        # now compare expected vs actual. By sorting on center, we should be able to compare
        # since the centers should be separated enough to have the order be consistent
        expected = [
            ([-2.25977535371875, -0.8631572635625001], 64, 83.77800617624794) ,
            ([0.16232721958461543, 1.7626161107230771], 65, 111.64440134649745) ,
            ([2.7362112930204074, -1.2107751495102044], 49, 62.6290553489474) ,
        ]
        # multipliers on the expected values for allowed
        allowedDelta = (0.01, 0.01, 0.01)
        h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial)
def test_KMeans_params_rand2(self):
    # Parse covtype.data from the import folder and run KMeans grid three
    # times with randomly-picked parameters (from define_params), inspecting
    # the grid result after each trial.
    if localhost:
        csvFilenameList = [
            # ('covtype.data', 60),
            ('covtype.data', 800),
        ]
    else:
        csvFilenameList = [
            ('covtype.data', 800),
        ]

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        csvPathname = importFolderPath + "/" + csvFilename
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])
        paramDict = define_params(SEED)
        for trial in range(3):
            # default
            params = {'k': 1 }
            # 'destination_key': csvFilename + "_" + str(trial) + '.hex'}
            h2o_kmeans.pickRandKMeansParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            kmeans = h2o_cmd.runKMeansGridOnly(parseKey=parseKey, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
            print h2o.dump_json(inspect)
            print "Trial #", trial, "completed\n"
def test_KMeans_covtype20x(self):
    # Parse covtype20x.data from the import folder and run KMeans (k=2,
    # fixed seed); bigCheckResults validates the resulting clusters.
    if localhost:
        csvFilenameList = [
            # 68 secs on my laptop?
            ('covtype20x.data', 480, 'cA'),
        ]
    else:
        # None is okay for key2
        csvFilenameList = [
            ('covtype20x.data', 480,'cA'),
            # ('covtype200x.data', 1000,'cE'),
        ]

    importFolderPath = '/home/0xdiag/datasets/standard'
    h2i.setupImportFolder(None, importFolderPath)
    for csvFilename, timeoutSecs, key2 in csvFilenameList:
        csvPathname = importFolderPath + "/" + csvFilename
        # creates csvFilename.hex from file in importFolder dir
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            timeoutSecs=2000, key2=key2) # noise=('JStack', None)
        print "parse end on ", csvPathname, 'took', time.time() - start, 'seconds'
        h2o.check_sandbox_for_errors()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        kwargs = {
            'cols': None,
            'epsilon': 1e-4,
            'k': 2,
            # reuse the same seed, to get deterministic results (otherwise sometimes fails
            'seed': 265211114317615310,
        }
        start = time.time()
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
            timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
        (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_many_cols_with_syn(self):
    # Generate small random csv datasets with decreasing column counts, parse
    # each, run KMeans (k=2), and print the clusters from the model inspect.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (100, 11, 'cA', 5),
        (100, 10, 'cB', 5),
        (100, 9, 'cC', 5),
        (100, 8, 'cD', 5),
        (100, 7, 'cE', 5),
        (100, 6, 'cF', 5),
        (100, 5, 'cG', 5),
    ]

    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    cnum = 0
    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        cnum += 1
        csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEED)

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")
        print "Parse result['destination_key']:", parseKey['destination_key']

        kwargs = {'k': 2, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'benign_k.hex'}
        kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
        model_key = kmeans['destination_key']
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        ## h2o.nodes[0].kmeans_apply(data_key=parseKey['destination_key'], model_key=model_key, destination_key='a')
        # this is failing for some reason
        ## h2o.nodes[0].kmeans_score(key=parseKey['destination_key'], model_key=model_key)
        clusters = kmeansResult['KMeansModel']['clusters']
        for i,c in enumerate(clusters):
            print "clusters["+str(i)+"]: ", clusters[i]
        ## print h2o.dump_json(kmeans)
        ## print h2o.dump_json(kmeansResult)
        h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
def test_C_kmeans_prostate(self):
    # Parse smalldata/logreg/prostate.csv and run a degenerate KMeans (k=1),
    # dumping the model inspect; simpleCheckKMeans validates the response.
    csvFilename = "prostate.csv"
    print "\nStarting", csvFilename
    csvPathname = h2o.find_file('smalldata/logreg' + '/' + csvFilename)
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': None,
        'destination_key': 'prostate_k.hex'
    }
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=5, **kwargs)
    # inspect the model key named in destination_key
    kmeansResult = h2o_cmd.runInspect(key='prostate_k.hex')
    print h2o.dump_json(kmeans)
    print h2o.dump_json(kmeansResult)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
def test_KMeans_covtype_fvec(self):
    # fvec (beta_features) variant: parse covtype.data, then run KMeans
    # (k=2, Furthest init, max_iter=50) for three trials.
    h2o.beta_features = True
    csvFilenameList = [
        ('covtype.data', 800),
    ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        for trial in range(3):
            kwargs = {
                # NOTE(review): hardwired source key — presumably must match the
                # parse destination key produced by import_parse; confirm.
                'source': u'covtype.hex',
                'destination_key': 'covtype.data_2.hex',
                'initialization': 'Furthest',
                # 'max_iter': 20,
                'max_iter': 50,
                'k': 2,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            print "Trial #", trial, "completed\n"
def test_kmeans_sphere3(self):
    # Generate 3 well-separated spheres (1M rows), run KMeans with k=3, and
    # check recovered centers against the generated ones (100/200/300 per axis).
    # NOTE(review): near-duplicate of another test_kmeans_sphere3 in this file
    # chunk — if both defs end up in the same class, the later one silently
    # shadows the earlier; confirm they live in separate test modules.
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = 'syn_spheres3_' + str(SEED) + '.csv'
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    write_syn_dataset(csvPathname, 1000000, SEED)

    print "\nStarting", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex")

    kwargs = {'k': 3, 'epsilon': 1e-6, 'cols': None, 'destination_key': 'spheres3.hex'}
    timeoutSecs = 30
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    kmeansResult = h2o_cmd.runInspect(key='spheres3.hex')
    ### print h2o.dump_json(kmeans)
    print h2o.dump_json(kmeansResult)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

    clusters = kmeansResult['KMeansModel']['clusters']
    # cluster centers can return in any order; sort on first coordinate
    clustersSorted = sorted(clusters, key=itemgetter(0))
    self.assertAlmostEqual(clustersSorted[0][0],100,delta=.2)
    self.assertAlmostEqual(clustersSorted[1][0],200,delta=.2)
    self.assertAlmostEqual(clustersSorted[2][0],300,delta=.2)
    self.assertAlmostEqual(clustersSorted[0][1],100,delta=.2)
    self.assertAlmostEqual(clustersSorted[1][1],200,delta=.2)
    self.assertAlmostEqual(clustersSorted[2][1],300,delta=.2)
    self.assertAlmostEqual(clustersSorted[0][2],100,delta=.2)
    self.assertAlmostEqual(clustersSorted[1][2],200,delta=.2)
    self.assertAlmostEqual(clustersSorted[2][2],300,delta=.2)
def test_KMeans_allstate_s3n_thru_hdfs(self): bucket = 'home-0xdiag-datasets' importFolderPath = 'allstate' csvFilename = "train_set.csv" csvPathname = importFolderPath + "/" + csvFilename timeoutSecs = 600 trialMax = 3 for trial in range(trialMax): trialStart = time.time() hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print 'h2o reported parse time:', parseResult['response']['time'] print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = {'cols': None, 'initialization': 'Furthest', 'k': 12} start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_KMeans_covtype_fvec(self):
    # fvec (beta_features) covtype variant: k=3, Furthest init, fixed seed,
    # ignoring columns 11..numCols, three trials.
    h2o.beta_features = True
    csvFilenameList = [
        ('covtype.data', 800),
    ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        for trial in range(3):
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'ignored_cols': range(11, inspect['numCols']),
                'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            print "Trial #", trial, "completed\n"
def test_KMeans_allstate_s3n_thru_hdfs(self): bucket = 'home-0xdiag-datasets' importFolderPath = 'allstate' csvFilename = "train_set.csv" csvPathname = importFolderPath + "/" + csvFilename timeoutSecs = 600 trialMax = 3 for trial in range(trialMax): trialStart = time.time() hex_key = csvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60) elapsed = time.time() - start print 'h2o reported parse time:', parseResult['response']['time'] print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { 'cols': None, 'initialization': 'Furthest', 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_KMeans_covtype(self):
    # Parse covtype.data and run KMeans (k=2, Furthest init, max_iter=50)
    # for three trials; simpleCheckKMeans validates each model response.
    csvFilenameList = [
        ('covtype.data', 800),
    ]

    importFolderPath = "standard"
    for csvFilename, timeoutSecs in csvFilenameList:
        # creates csvFilename.hex from file in importFolder dir
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            timeoutSecs=2000, pollTimeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        for trial in range(3):
            kwargs = {
                # NOTE(review): hardwired source_key — presumably must match the
                # parse destination key produced by import_parse; confirm.
                'source_key': u'covtype.hex',
                'destination_key': 'covtype.data_2.hex',
                'initialization': 'Furthest',
                # 'max_iter': 20,
                'max_iter': 50,
                'k': 2,
            }

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            ### print h2o.dump_json(kmeans)
            print "Trial #", trial, "completed\n"
def test_parse_bounds_libsvm(self):
    # Import and parse a set of libsvm datasets from the NFS import folder,
    # inspect each, then run KMeans (k=3, fixed seed) twice per dataset and
    # validate the clusters via bigCheckResults.
    # just do the import folder once
    importFolderPath = "/home/0xdiag/datasets/libsvm"
    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameList = [
        ("mnist_train.svm", "cM", 30, 1),
        # FIX! fails KMeansScore
        ("tmc2007_train.svm", "cJ", 30, 1),
        ("covtype.binary.svm", "cC", 30, 1),
        ("colon-cancer.svm", "cA", 30, 1),
        ("connect4.svm", "cB", 30, 1),
        ("duke.svm", "cD", 30, 1),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1),
        ("gisette_scale.svm", "cF", 30, 1),
        ("mushrooms.svm", "cG", 30, 1),
        ("news20.svm", "cH", 30, 1),
        ("syn_6_1000_10.svm", "cK", 30, 1),
        ("syn_0_100_1000.svm", "cL", 30, 1),
        # normal csv
    ]

    ### csvFilenameList = random.sample(csvFilenameAll,1)
    # h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        h2i.setupImportFolder(None, importFolderPath)
        csvPathname = importFolderPath + "/" + csvFilename

        # PARSE******************************************
        # creates csvFilename.hex from file in importFolder dir
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
            key2=key2, timeoutSecs=2000)
        print csvPathname, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=360)
        print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvFilename)

        # KMEANS******************************************
        for trial in range(2):
            kwargs = {
                'k': 3,
                'epsilon': 1e-6,
                # 'cols': 2,
                # 'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310
            }

            # fails if I put this in kwargs..i.e. source = dest
            # 'destination_key': parseKey['destination_key'],
            timeoutSecs = 600
            start = time.time()
            kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            # this does an inspect of the model and prints the clusters
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)
def test_KMeans_create_frame_fvec(self): for trial in range(20): cfParamDict = define_create_frame_params(SEED) # default params = { 'rows': 5, 'cols': 10 } h2o_util.pickRandParams(cfParamDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strict checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 kwargs = params.copy() timeoutSecs = 300 hex_key = 'temp_%s.hex' % trial cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n%s" % hex_key, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) kmeansParamDict = define_KMeans_params(SEED) # default params = { 'max_iter': 20, 'k': 1, 'destination_key': "KM_" + str(trial) + '.hex' } h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params) kwargs = params.copy() start = time.time() parseResult = {'destination_key': hex_key } kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans trial %s end on ", trial, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) print "Trial #", trial, "completed\n"
def test_KMeans_libsvm_fvec(self):
    # fvec (beta_features) variant: import/parse a set of libsvm datasets,
    # inspect each, then run KMeans (k=3, Furthest init, fixed seed) once per
    # dataset and validate clusters via bigCheckResults.
    h2o.beta_features = True
    # just do the import folder once
    # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
    # so probably 10x that for covtype200
    csvFilenameList = [
        # FIX! fails KMeansScore
        ("colon-cancer.svm", "cA", 30, 1),
        ("connect4.svm", "cB", 30, 1),
        ("covtype.binary.svm", "cC", 30, 1),
        # multi-label class
        # ("tmc2007_train.svm", "cJ", 30, 1),
        ("mnist_train.svm", "cM", 30, 1),
        ("duke.svm", "cD", 30, 1),
        # too many features? 150K inspect timeout?
        # ("E2006.train.svm", "cE", 30, 1),
        ("gisette_scale.svm", "cF", 120, 1 ), #Summary2 is slow with 5001 columns
        ("mushrooms.svm", "cG", 30, 1),
        # ("news20.svm", "cH", 120, 1), #Summary2 is very slow - disable for now
        ("syn_6_1000_10.svm", "cK", 30, 1),
        ("syn_0_100_1000.svm", "cL", 30, 1),
    ]

    ### csvFilenameList = random.sample(csvFilenameAll,1)
    # h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    importFolderPath = "libsvm"
    for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
        # have to import each time, because h2o deletes source after parse
        csvPathname = importFolderPath + "/" + csvFilename

        # PARSE******************************************
        # creates csvFilename.hex from file in importFolder dir
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
            hex_key=hex_key, timeoutSecs=2000)
        print "Parse result['destination_key']:", parseResult[ 'destination_key']

        # INSPECT******************************************
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
        print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvFilename)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        # KMEANS******************************************
        for trial in range(1):
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results (otherwise sometimes fails
                'seed': 265211114317615310,
            }

            # fails if I put this in kwargs..i.e. source = dest
            # 'destination_key': parseResult['destination_key'],
            timeoutSecs = 600
            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            # this does an inspect of the model and prints the clusters
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                self, kmeans, csvPathname, parseResult, 'd', **kwargs)
parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)) elapsed = time.time() - start print 'h2o reported parse time:', parseResult['response']['time'] print "parse end on ", hex_key, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { 'cols': None, 'initialization': 'Furthest', 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \ if __name__ == '__main__': h2o.unit_main()
def test_KMeans_allstate_s3n_thru_hdfs(self): # csvFilename = "covtype20x.data" # csvPathname = csvFilename csvFilename = "CAT*" csvPathname = "cats/" + csvFilename # https://s3.amazonaws.com/home-0xdiag-datasets/allstate/train_set.csv URI = "s3n://home-0xdiag-datasets/" s3nKey = URI + csvPathname trialMax = 1 for trial in range(trialMax): trialStart = time.time() # since we delete the key, we have to re-import every iteration # s3n URI thru HDFS is not typical. importHDFSResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList), 8, "Didn't see more than 8 files in s3n?") storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) for s in storeView['keys']: print "\nkey:", s['key'] if 'rows' in s: print "rows:", s['rows'], "value_size_bytes:", s[ 'value_size_bytes'] key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' # ec2 is about 400 secs on four m2.4xlarge nodes # should be less on more nodes? timeoutSecs = 600 start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noise=('JStack', None)) elapsed = time.time() - start print s3nKey, 'h2o reported parse time:', parseKey['response'][ 'time'] print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] kwargs = {'cols': None, 'epsilon': 1e-6, 'k': 12} start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() # pattern matching problem # h2o removes key afte parse now ### print "Removing", s3nKey ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \
def test_kmeans_sphere100(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) if SHUFFLE_SPHERES: # since we create spheres in order csvFilename2 = 'syn_spheres100_shuffled.csv' csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2 h2o_util.file_shuffle(csvPathname, csvPathname2) else: csvFilename2 = csvFilename csvPathname2 = csvPathname print "\nStarting", csvFilename parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex") ### h2b.browseTheCloud() # try 5 times, to see if all inits by h2o are good # does it break if cols is not specified? cols = ",".join(map(str,range(DIMENSIONS))) for trial in range(10): kwargs = { 'k': CLUSTERS, 'initialization': 'Furthest', 'cols': cols, 'destination_key': 'syn_spheres100.hex' } timeoutSecs = 100 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') # print h2o.dump_json(kmeansResult) ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) # cluster centers can return in any order clusters = kmeansResult['KMeansModel']['clusters'] # the way we create the centers above, if we sort on the sum of xyz # we should get the order the same as when they were created. 
# to be safe, we'll sort the centers that were generated too, the same way clustersSorted = sorted(clusters, key=sum) centersSorted = sorted(centersList, key=sum) ### print clustersSorted print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)" cf = '{0:6.2f}' for c in clustersSorted: print ' '.join(map(cf.format,c)) print "\ngenerated centers (sorted by key=sum)" for c in centersSorted: print ' '.join(map(cf.format,c)) for i,center in enumerate(centersSorted): # Doing the compare of gen'ed/actual centers is kind of a hamming distance problem. # Assuming that the difference between adjacent sums of all center values, # is greater than 2x the sum of all max allowed variance on each value, # Then the sums will be unique and non-overlapping with allowed variance. # So a sort of the centers, keyed on sum of all values for a center. # will create an ordering that can be compared. # sort gen'ed and actual separately. # Adjacent center hamming distance check is done during gen above. a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str,a)) bStr = ",".join(map(str,b)) iStr = str(i) for i, v in enumerate(a): emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct." self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg) print "Trial #", trial, "completed"
def test_kmeans2_sphere100(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) if SHUFFLE_SPHERES: # since we create spheres in order csvFilename2 = 'syn_spheres100_shuffled.csv' csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename2 h2o_util.file_shuffle(csvPathname, csvPathname2) else: csvFilename2 = csvFilename csvPathname2 = csvPathname print "\nStarting", csvFilename parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename2 + ".hex") ### h2b.browseTheCloud() # try 5 times, to see if all inits by h2o are good # does it break if cols is not specified? cols = ",".join(map(str,range(DIMENSIONS))) for trial in range(10): kwargs = { 'k': CLUSTERS, 'initialization': 'Furthest', 'destination_key': 'syn_spheres100.hex', 'max_iter': 15, } timeoutSecs = 100 start = time.time() kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # can't inspect a kmeans2 model # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') # print h2o.dump_json(kmeansResult) ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs) # cluster centers can return in any order model = kmeansResult['model'] clusters = model["centers"] cluster_variances = model["within_cluster_variances"] error = model["total_within_SS"] iterations = model["iterations"] normalized = model["normalized"] max_iter = model["max_iter"] # the way we create the centers above, if we sort on the sum of xyz # we should get the order the same as when they were created. 
# to be safe, we'll sort the centers that were generated too, the same way clustersSorted = sorted(clusters, key=sum) centersSorted = sorted(centersList, key=sum) ### print clustersSorted print "\ntrial #", trial, "h2o result, centers (sorted by key=sum)" cf = '{0:6.2f}' for c in clustersSorted: print ' '.join(map(cf.format,c)) print "\ngenerated centers (sorted by key=sum)" for c in centersSorted: print ' '.join(map(cf.format,c)) for i,center in enumerate(centersSorted): # Doing the compare of gen'ed/actual centers is kind of a hamming distance problem. # Assuming that the difference between adjacent sums of all center values, # is greater than 2x the sum of all max allowed variance on each value, # Then the sums will be unique and non-overlapping with allowed variance. # So a sort of the centers, keyed on sum of all values for a center. # will create an ordering that can be compared. # sort gen'ed and actual separately. # Adjacent center hamming distance check is done during gen above. a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str,a)) bStr = ",".join(map(str,b)) iStr = str(i) for i, v in enumerate(a): emsg = aStr+" != "+bStr+". Sorted cluster center "+iStr+" axis "+str(i)+" not correct." self.assertAlmostEqual(a[i], b[i], delta=ALLOWED_CENTER_DELTA, msg=emsg) print "Trial #", trial, "completed"
def test_KMeans_libsvm_fvec(self): # hack this into a function so we can call it before and after kmeans # kmeans is changing the last col to enum?? (and changing the data) def do_summary_and_inspect(): # SUMMARY****************************************** summaryResult = h2o_cmd.runSummary(key=hex_key) coltypeList = h2o_cmd.infoFromSummary(summaryResult) # INSPECT****************************************** inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] # Now check both inspect and summary if csvFilename=='covtype.binary.svm': for k in range(55): naCnt = inspect['cols'][k]['naCnt'] self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0)) stype = inspect['cols'][k]['type'] print k, stype self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int')) # summary may report type differently than inspect..check it too! # we could check na here too for i,c in enumerate(coltypeList): print "column index: %s column type: %s" % (i, c) # inspect says 'int?" assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c) # just do the import folder once # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ # FIX! fails KMeansScore ("colon-cancer.svm", "cA", 30, 1), ("connect4.svm", "cB", 30, 1), ("covtype.binary.svm", "cC", 30, 1), # multi-label class # ("tmc2007_train.svm", "cJ", 30, 1), ("mnist_train.svm", "cM", 30, 1), ("duke.svm", "cD", 30, 1), # too many features? 150K inspect timeout? 
# ("E2006.train.svm", "cE", 30, 1), ("gisette_scale.svm", "cF", 120, 1), #Summary2 is slow with 5001 columns ("mushrooms.svm", "cG", 30, 1), # ("news20.svm", "cH", 120, 1), #Summary2 is very slow - disable for now ("syn_6_1000_10.svm", "cK", 30, 1), ("syn_0_100_1000.svm", "cL", 30, 1), ] csvFilenameList = [ ("covtype.binary.svm", "cC", 30, 1), ] ### csvFilenameList = random.sample(csvFilenameAll,1) h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False importFolderPath = "libsvm" for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000, doSummary=False) do_summary_and_inspect() # KMEANS****************************************** for trial in range(1): kwargs = { 'k': 3, 'initialization': 'Furthest', 'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseResult['destination_key'], timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) do_summary_and_inspect() # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) print "hello" (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs) do_summary_and_inspect()
def test_KMeans_covtype_fvec(self): csvFilenameList = [ ('covtype.data', 800), ] importFolderPath = "standard" for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) for trial in range(2): kwargs = { 'k': 6, 'initialization': 'Furthest', # 'initialization': '', # 'ignored_cols': range(11, inspect['numCols']), # ignore the response 'ignored_cols_by_name': 'C55', 'max_iter': 100, # 'normalize': 0, # reuse the same seed, to get deterministic results 'seed': 265211114317615310 } start = time.time() kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs) expected = [ ([ 2781.64184460309, 162.69950733599902, 16.545275983574268, 243.73547234768156, 50.48239522121315, 942.4480922085701, 208.3915356763203, 218.7135425941215, 140.10956243018794, 1040.6795741397266, 0.22024185323685105, 0.0845245225799837, 0.4957505706376572, 0.19948305354550802, 0.01635558145683929, 0.033196811983660604, 0.026025394050259283, 0.04566180477986607, 0.008617572941792261, 0.03547936261257615, 0.0, 0.0, 0.006189327591882107, 0.13606268110663236, 0.037222303163733886, 0.024007252359445064, 0.040891651692487006, 0.003232264365769295, 1.6188302332734367e-05, 0.004667627172605076, 0.00910861811255187, 9.173371321882807e-05, 0.0025415634662392956, 0.008946735089224526, 0.0023095311328034363, 0.04957397784361021, 0.09252154393235448, 0.03887890610245037, 0.0, 0.0, 0.0010792201555156243, 0.004867282901375466, 0.08281935473426902, 0.045640220376755754, 0.04933654940939677, 0.08426550974265995, 0.07772003949945769, 0.001327440791284218, 0.0014191745045030462, 0.0, 0.0, 0.009513325670870229, 0.010970272880816322, 0.009443176360761713 ], 185319, 116283720155.37769), ([ 2892.8730376693256, 119.94759695676377, 11.22516236778623, 189.0301354611245, 24.621525329374652, 2631.9842642419744, 219.94967526442753, 223.3794395991835, 135.71226572647987, 5409.1797365002785, 0.883243644460939, 0.11675635553906105, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325, 0.0, 0.0, 0.0, 0.23410651326776769, 0.0, 0.0, 0.0, 0.026498422712933754, 0.0, 0.04152904063833735, 0.005158656522545927, 0.0695490814622379, 0.0, 0.0634997216552236, 0.05418444980515866, 0.010391538318797551, 0.0002969010948227871, 0.0, 0.0, 0.0, 0.3677862312117276, 0.07596956763778066, 0.0, 0.01109667841900167, 0.005641120801632956, 0.0, 0.0018185192057895714, 0.0, 0.0, 0.0021154203006123586, 0.018444980515865652, 0.010354425681944703 ], 26945, 46932273891.61873), ([ 3022.020861415003, 
137.8546989122598, 13.3449108178427, 282.99227296949937, 45.23691263596753, 1606.0215197015768, 216.64941537882825, 222.64791856054669, 137.40339644525253, 2529.4366555907336, 0.4113429046111407, 0.08617284724616782, 0.5024842481426914, 0.0, 0.0, 0.0052506191028494405, 0.0, 0.014176671577693489, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018949249239835743, 0.029850161436945546, 0.05403435628977148, 0.020892761982382997, 0.0, 0.0, 0.0018494718033917432, 0.011731607159650168, 0.005979436381304661, 0.0047098837027052445, 0.013714303626845553, 0.0007601642581737249, 0.047788470580859534, 0.10631328171530674, 0.04641704021817498, 0.0036519231372057308, 0.011872668568383437, 0.0, 0.00034481677690354536, 0.17267483777937995, 0.044473527475627724, 0.05637754302372967, 0.1292435973793925, 0.11970627880003762, 0.0013871038525438075, 0.004858781856368139, 0.0, 0.0, 0.03151155136202627, 0.028988119494686687, 0.012491771417823892 ], 127604, 95229063588.02844), ([ 3051.365089986695, 168.1268450579292, 14.114846831985933, 287.6101588092033, 50.702549817536706, 2835.266162979793, 209.89460702308608, 226.92302305495684, 148.84282479633362, 1461.8985753079312, 0.3284728328107128, 0.0006069141527711857, 0.670920253036516, 0.0, 0.0, 0.0054700083256172235, 0.0, 0.01653452018767653, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03886584862938554, 0.013250959002170886, 0.04277966681969203, 0.05480901656564399, 0.0, 0.0, 0.0010426473906581905, 0.0018440853103432178, 0.0, 0.0035014278044491476, 0.011671426014830491, 0.002435437561761296, 0.044405885511091744, 0.10662236712081483, 0.042756323967662366, 0.0, 0.007384122192049426, 0.006263665294625696, 0.0, 0.14390868276285998, 0.022152366576148275, 0.07071327974851968, 0.14799368186805065, 0.1011367968938445, 0.009111493242244337, 0.006427065258833325, 0.0009259331305098857, 0.002318723301612991, 0.03055579330682623, 0.041044514818820564, 0.024074261393257027 ], 128519, 106432862495.53804), ([ 3052.088693852026, 149.15056174929376, 11.549996765359152, 328.4748452763461, 
44.2420589567205, 4786.68757682272, 215.8348392383499, 226.91413106764713, 143.9780260065124, 4192.589071226791, 0.8949819938326181, 0.0, 0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0022642485929312314, 0.002415198499126647, 0.0, 0.00012938563388178466, 0.0, 0.1351648588618377, 0.0, 0.0, 0.0, 0.014836219351777974, 0.0, 0.0, 0.010674314795247235, 0.03553792077286352, 0.0, 0.039290104155435275, 0.09289888512712138, 0.03864317598602636, 0.0, 0.0, 0.0, 0.0, 0.4371509283419232, 0.08636491061609126, 0.0003665926293317232, 0.002717098311517478, 0.017100467944709204, 0.0, 0.0028249196730856323, 0.0, 0.0, 0.03226015138119164, 0.017316110667845514, 0.03204450865805533 ], 46373, 77991941653.19676), ([ 3119.4885286481917, 165.13178470083923, 11.672206122079334, 271.2690333876713, 39.407851838435064, 4959.81440560285, 212.5861709835175, 227.95909557447322, 148.6725381875264, 1613.4457676749382, 0.9052556903942522, 0.0, 0.09474430960574776, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00037734709895550323, 0.0, 0.0, 0.0, 0.008346917828895732, 0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865, 0.0, 0.0, 0.02815009358208054, 0.012512829801364487, 0.0, 0.13355068526233171, 0.11424560767976816, 0.008799734347642335, 0.0, 0.0018867354947775161, 0.0012226046006158305, 0.0, 0.44056028497252914, 0.10774014369377528, 0.0033810300066413087, 0.014580691903640641, 0.02313892410795146, 0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0, 0.06503954597597053, 0.022625732053371973, 0.008256354525146411 ], 66252, 74666940350.2879), ] ### print h2o.dump_json(kmeans) predictKey = 'd' (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeansResult, csvPathname, parseResult, predictKey, **kwargs) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # these clusters were sorted compared to the cluster order in training h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial) # why is the expected # of rows not right in 
KMeans2. That means predictions are wrong h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial) print "Trial #", trial, "completed\n"
def test_KMeans_libsvm_fvec(self): # hack this into a function so we can call it before and after kmeans # kmeans is changing the last col to enum?? (and changing the data) def do_summary_and_inspect(): # SUMMARY****************************************** summaryResult = h2o_cmd.runSummary(key=hex_key) coltypeList = h2o_cmd.infoFromSummary(summaryResult) # INSPECT****************************************** inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) h2o_cmd.infoFromInspect(inspect, csvFilename) numRows = inspect['numRows'] numCols = inspect['numCols'] # Now check both inspect and summary if csvFilename == 'covtype.binary.svm': for k in range(55): naCnt = inspect['cols'][k]['naCnt'] self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0)) stype = inspect['cols'][k]['type'] print k, stype self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int')) # summary may report type differently than inspect..check it too! # we could check na here too for i, c in enumerate(coltypeList): print "column index: %s column type: %s" % (i, c) # inspect says 'int?" assert c == 'Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % ( i, c) # just do the import folder once # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ # FIX! fails KMeansScore ("colon-cancer.svm", "cA", 30, 1), ("connect4.svm", "cB", 30, 1), ("covtype.binary.svm", "cC", 30, 1), # multi-label class # ("tmc2007_train.svm", "cJ", 30, 1), ("mnist_train.svm", "cM", 30, 1), ("duke.svm", "cD", 30, 1), # too many features? 150K inspect timeout? 
# ("E2006.train.svm", "cE", 30, 1), ("gisette_scale.svm", "cF", 120, 1 ), #Summary2 is slow with 5001 columns ("mushrooms.svm", "cG", 30, 1), # ("news20.svm", "cH", 120, 1), #Summary2 is very slow - disable for now ("syn_6_1000_10.svm", "cK", 30, 1), ("syn_0_100_1000.svm", "cL", 30, 1), ] csvFilenameList = [ ("covtype.binary.svm", "cC", 30, 1), ] ### csvFilenameList = random.sample(csvFilenameAll,1) h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False importFolderPath = "libsvm" for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000, doSummary=False) do_summary_and_inspect() # KMEANS****************************************** for trial in range(1): kwargs = { 'k': 3, 'initialization': 'Furthest', 'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 265211114317615310, } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseResult['destination_key'], timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) do_summary_and_inspect() # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) print "hello" (centers, tupleResultList) = h2o_kmeans.bigCheckResults( self, kmeans, csvPathname, parseResult, 'd', **kwargs) do_summary_and_inspect()
def test_kmeans_sphere5(self): SYNDATASETS_DIR = h2o.make_syn_dir() CLUSTERS = 5 SPHERE_PTS = 10000 csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename centersList = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) print "\nStarting", csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex") # try 5 times, to see if all inits by h2o are good for trial in range(5): # pass SEED so it's repeatable kwargs = { 'k': CLUSTERS, 'max_iter': 10, 'initialization': 'Furthest', 'cols': None, 'destination_key': 'syn_spheres100.hex', 'seed': SEED } timeoutSecs = 30 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) # cluster centers can return in any order clusters = kmeansResult['KMeansModel']['clusters'] clustersSorted = sorted(clusters, key=itemgetter(0)) ### print clustersSorted print "\nh2o result, centers sorted" print clustersSorted print "\ngenerated centers" print centersList for i, center in enumerate(centersList): a = center b = clustersSorted[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str, a)) bStr = ",".join(map(str, b)) iStr = str(i) self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " x not correct.") self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " y not correct.") self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + " z not correct.") print "Trial #", trial, "completed"
start = time.time() parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)) elapsed = time.time() - start print "parse end on ", hex_key, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] kwargs = { 'cols': None, 'initialization': 'Furthest', 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \ if __name__ == '__main__': h2o.unit_main()
def test_KMeans2_sphere5_inits(self): SYNDATASETS_DIR = h2o.make_syn_dir() CLUSTERS = 5 SPHERE_PTS = 10000 csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename expectedCenters = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) print "\nStarting", csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex") # try 5 times, to see if all inits by h2o are good savedResults = [] Result = collections.namedtuple( 'Result', 'trial clusters size cluster_variances error iterations normalized max_iter clustersSorted' ) # save the best for comparison. Print messages when we update best sameAsBest = 0 # big number? to init bestResult = Result(None, None, None, None, None, None, None, None, None) for trial in range(TRIALS): # pass SEED so it's repeatable kwargs = { 'normalize': 0, 'k': CLUSTERS, 'max_iter': MAX_ITER, 'initialization': INIT, # 'initialization': 'PlusPlus', 'destination_key': 'syn_spheres100.hex', 'seed': SEED } timeoutSecs = 30 start = time.time() kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) # see if we took the full limit to get an answer # inspect of model doesn't work # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs) model = kmeansResult['model'] clusters = model["centers"] size = model["size"] cluster_variances = model["within_cluster_variances"] # round to int to avoid fp error when saying "same" error = int(model["total_within_SS"]) iterations = model["iterations"] normalized = model["normalized"] max_iter = model["max_iter"] # clustersSorted = sorted(clusters, key=itemgetter(2)) clustersSorted = sorted(clusters) r = Result( trial, clusters, size, cluster_variances, error, iterations, normalized, max_iter, clustersSorted, ) savedResults.append(r) if iterations >= ( max_iter - 1): # h2o hits the limit at max_iter-1..shouldn't hit it raise Exception( "KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s", (iterations, max_iter)) print "iterations", iterations ### print clustersSorted # For now, just analyze the one with the lowest error # we could analyze how many are not best, and how many are best (maybe just look at error print "savedResults, error" print r.error if bestResult.error and r.error <= bestResult.error: sameAsBest += 1 # we can check that if it has the same error, the sizes should be the same (integer) and reflects centers? # should if r.size != bestResult.size: raise Exception( "Would expect that if two trials got the same error (rounded to int), the cluster sizes would likely be the same? 
%s %s" % (r.size, bestResult.size)) if not bestResult.error: # init case bestResult = r elif r.error < bestResult.error: print "Trial", r.trial, "has a lower error", r.error, "than current lowest error", bestResult.error print "Using it for best now" bestResult = r print "Trial #", trial, "completed" print "\nApparently, %s out of %s trials, got the same best error: %s (lowest) " % ( sameAsBest, TRIALS, bestResult.error) print "\nh2o best result was from trial %s, centers sorted:" % bestResult.trial print bestResult.clustersSorted print "\ngenerated centers for comparison" print expectedCenters for i, center in enumerate(expectedCenters): a = center bb = bestResult.clustersSorted print "bb:", bb b = bb[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str, a)) bStr = ",".join(map(str, b)) iStr = str(i) self.assertAlmostEqual(a[0], b[0], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + "; x not correct.") self.assertAlmostEqual(a[1], b[1], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + "; y not correct.") self.assertAlmostEqual(a[2], b[2], delta=1, msg=aStr + "!=" + bStr + ". Sorted cluster center " + iStr + "; z not correct.")
def test_KMeans2_sphere5_bad_inits(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = 'syn_spheres100.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename expectedCenters = write_spheres_dataset(csvPathname, CLUSTERS, SPHERE_PTS) print "\nStarting", csvFilename parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=csvFilename + ".hex") # try 5 times, to see if all inits by h2o are good savedResults = [] Result = collections.namedtuple('Result', 'trial clusters size cluster_variances error iterations normalized max_iter clustersSorted') # save the best for comparison. Print messages when we update best sameAsBest = 1 # big number? to init bestResult = Result(None, None, None, None, None, None, None, None, None) for trial in range(TRIALS): # pass SEED so it's repeatable kwargs = { 'normalize': 0, 'k': CLUSTERS, 'max_iter': MAX_ITER, 'initialization': INIT, # 'initialization': 'PlusPlus', 'destination_key': 'syn_spheres100.hex', 'seed': SEED } timeoutSecs = 30 start = time.time() kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.',\ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) # see if we took the full limit to get an answer # inspect of model doesn't work # kmeansResult = h2o_cmd.runInspect(key='syn_spheres100.hex') ### print h2o.dump_json(kmeans) ### print h2o.dump_json(kmeansResult) h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs) model = kmeansResult['model'] clusters = model["centers"] size = model["size"] cluster_variances = model["within_cluster_variances"] # round to int to avoid fp error when saying "same" error = int(model["total_within_SS"]) iterations = model["iterations"] normalized = model["normalized"] max_iter = model["max_iter"] # clustersSorted = sorted(clusters, key=itemgetter(0)) clustersSorted = sorted(clusters) r = Result ( trial, clusters, size, cluster_variances, error, iterations, normalized, max_iter, clustersSorted, ) savedResults.append(r) if iterations >= (max_iter-1): # h2o hits the limit at max_iter-1..shouldn't hit it raise Exception("KMeans unexpectedly took %s iterations..which was the full amount allowed by max_iter %s", (iterations, max_iter)) print "iterations", iterations ### print clustersSorted # For now, just analyze the one with the lowest error # we could analyze how many are not best, and how many are best (maybe just look at error print "savedResults, error" print r.error if bestResult.error and r.error <= bestResult.error: sameAsBest += 1 # we can check that if it has the same error, the sizes should be the same (integer) and reflects centers? # should if sorted(r.size)!=sorted(bestResult.size): raise Exception("Would expect that if two trials got the same error (rounded to int), the cluster sizes would likely be the same? 
%s %s" % (r.size, bestResult.size)) if not bestResult.error: # init case bestResult = r elif r.error < bestResult.error: print "Trial", r.trial, "has a lower error", r.error, "than current lowest error", bestResult.error print "Using it for best now" bestResult = r print "Trial #", trial, "completed" print "\nApparently, %s out of %s trials, got the same best error: %s (lowest) " % (sameAsBest, TRIALS, bestResult.error) print "\nh2o best result was from trial %s, centers sorted:" % bestResult.trial print bestResult.clustersSorted print "\ngenerated centers for comparison" print expectedCenters for i,center in enumerate(expectedCenters): a = center bb = bestResult.clustersSorted print "bb:", bb b = bb[i] print "\nexpected:", a print "h2o:", b # h2o result aStr = ",".join(map(str,a)) bStr = ",".join(map(str,b)) iStr = str(i) self.assertAlmostEqual(a[0], b[0], delta=2, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+"; x not correct.") self.assertAlmostEqual(a[1], b[1], delta=2, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+"; y not correct.") self.assertAlmostEqual(a[2], b[2], delta=2, msg=aStr+"!="+bStr+". Sorted cluster center "+iStr+"; z not correct.")
def test_KMeans_covtype_fvec(self): csvFilenameList = [ ('covtype.data', 800), ] importFolderPath = "standard" for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) for trial in range(2): kwargs = { 'k': 6, 'initialization': 'Furthest', # 'initialization': '', # 'ignored_cols': range(11, inspect['numCols']), # ignore the response 'ignored_cols_by_name': 'C55', 'max_iter': 100, # 'normalize': 0, # reuse the same seed, to get deterministic results 'seed': 265211114317615310 } start = time.time() kmeansResult = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeansResult, **kwargs) expected = [ ([2781.64184460309, 162.69950733599902, 16.545275983574268, 243.73547234768156, 50.48239522121315, 942.4480922085701, 208.3915356763203, 218.7135425941215, 140.10956243018794, 1040.6795741397266, 0.22024185323685105, 0.0845245225799837, 0.4957505706376572, 0.19948305354550802, 0.01635558145683929, 0.033196811983660604, 0.026025394050259283, 0.04566180477986607, 0.008617572941792261, 0.03547936261257615, 0.0, 0.0, 0.006189327591882107, 0.13606268110663236, 0.037222303163733886, 0.024007252359445064, 0.040891651692487006, 0.003232264365769295, 1.6188302332734367e-05, 0.004667627172605076, 0.00910861811255187, 9.173371321882807e-05, 0.0025415634662392956, 0.008946735089224526, 0.0023095311328034363, 0.04957397784361021, 0.09252154393235448, 0.03887890610245037, 0.0, 0.0, 0.0010792201555156243, 0.004867282901375466, 0.08281935473426902, 0.045640220376755754, 0.04933654940939677, 0.08426550974265995, 0.07772003949945769, 0.001327440791284218, 0.0014191745045030462, 0.0, 0.0, 0.009513325670870229, 0.010970272880816322, 0.009443176360761713], 185319, 116283720155.37769) , ([2892.8730376693256, 119.94759695676377, 11.22516236778623, 189.0301354611245, 24.621525329374652, 2631.9842642419744, 219.94967526442753, 223.3794395991835, 135.71226572647987, 5409.1797365002785, 0.883243644460939, 0.11675635553906105, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0015587307478196325, 0.0, 0.0, 0.0, 0.23410651326776769, 0.0, 0.0, 0.0, 0.026498422712933754, 0.0, 0.04152904063833735, 0.005158656522545927, 0.0695490814622379, 0.0, 0.0634997216552236, 0.05418444980515866, 0.010391538318797551, 0.0002969010948227871, 0.0, 0.0, 0.0, 0.3677862312117276, 0.07596956763778066, 0.0, 0.01109667841900167, 0.005641120801632956, 0.0, 0.0018185192057895714, 0.0, 0.0, 0.0021154203006123586, 0.018444980515865652, 0.010354425681944703], 26945, 46932273891.61873) , ([3022.020861415003, 
137.8546989122598, 13.3449108178427, 282.99227296949937, 45.23691263596753, 1606.0215197015768, 216.64941537882825, 222.64791856054669, 137.40339644525253, 2529.4366555907336, 0.4113429046111407, 0.08617284724616782, 0.5024842481426914, 0.0, 0.0, 0.0052506191028494405, 0.0, 0.014176671577693489, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018949249239835743, 0.029850161436945546, 0.05403435628977148, 0.020892761982382997, 0.0, 0.0, 0.0018494718033917432, 0.011731607159650168, 0.005979436381304661, 0.0047098837027052445, 0.013714303626845553, 0.0007601642581737249, 0.047788470580859534, 0.10631328171530674, 0.04641704021817498, 0.0036519231372057308, 0.011872668568383437, 0.0, 0.00034481677690354536, 0.17267483777937995, 0.044473527475627724, 0.05637754302372967, 0.1292435973793925, 0.11970627880003762, 0.0013871038525438075, 0.004858781856368139, 0.0, 0.0, 0.03151155136202627, 0.028988119494686687, 0.012491771417823892], 127604, 95229063588.02844) , ([3051.365089986695, 168.1268450579292, 14.114846831985933, 287.6101588092033, 50.702549817536706, 2835.266162979793, 209.89460702308608, 226.92302305495684, 148.84282479633362, 1461.8985753079312, 0.3284728328107128, 0.0006069141527711857, 0.670920253036516, 0.0, 0.0, 0.0054700083256172235, 0.0, 0.01653452018767653, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03886584862938554, 0.013250959002170886, 0.04277966681969203, 0.05480901656564399, 0.0, 0.0, 0.0010426473906581905, 0.0018440853103432178, 0.0, 0.0035014278044491476, 0.011671426014830491, 0.002435437561761296, 0.044405885511091744, 0.10662236712081483, 0.042756323967662366, 0.0, 0.007384122192049426, 0.006263665294625696, 0.0, 0.14390868276285998, 0.022152366576148275, 0.07071327974851968, 0.14799368186805065, 0.1011367968938445, 0.009111493242244337, 0.006427065258833325, 0.0009259331305098857, 0.002318723301612991, 0.03055579330682623, 0.041044514818820564, 0.024074261393257027], 128519, 106432862495.53804) , ([3052.088693852026, 149.15056174929376, 11.549996765359152, 328.4748452763461, 
44.2420589567205, 4786.68757682272, 215.8348392383499, 226.91413106764713, 143.9780260065124, 4192.589071226791, 0.8949819938326181, 0.0, 0.10501800616738188, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0022642485929312314, 0.002415198499126647, 0.0, 0.00012938563388178466, 0.0, 0.1351648588618377, 0.0, 0.0, 0.0, 0.014836219351777974, 0.0, 0.0, 0.010674314795247235, 0.03553792077286352, 0.0, 0.039290104155435275, 0.09289888512712138, 0.03864317598602636, 0.0, 0.0, 0.0, 0.0, 0.4371509283419232, 0.08636491061609126, 0.0003665926293317232, 0.002717098311517478, 0.017100467944709204, 0.0, 0.0028249196730856323, 0.0, 0.0, 0.03226015138119164, 0.017316110667845514, 0.03204450865805533], 46373, 77991941653.19676) , ([3119.4885286481917, 165.13178470083923, 11.672206122079334, 271.2690333876713, 39.407851838435064, 4959.81440560285, 212.5861709835175, 227.95909557447322, 148.6725381875264, 1613.4457676749382, 0.9052556903942522, 0.0, 0.09474430960574776, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00037734709895550323, 0.0, 0.0, 0.0, 0.008346917828895732, 0.0021584254060254783, 0.0, 0.0, 0.0031395278633097865, 0.0, 0.0, 0.02815009358208054, 0.012512829801364487, 0.0, 0.13355068526233171, 0.11424560767976816, 0.008799734347642335, 0.0, 0.0018867354947775161, 0.0012226046006158305, 0.0, 0.44056028497252914, 0.10774014369377528, 0.0033810300066413087, 0.014580691903640641, 0.02313892410795146, 0.0002565960272897422, 3.018776791644026e-05, 0.0, 0.0, 0.06503954597597053, 0.022625732053371973, 0.008256354525146411], 66252, 74666940350.2879) , ] ### print h2o.dump_json(kmeans) predictKey = 'd' (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeansResult, csvPathname, parseResult, predictKey, **kwargs) # all are multipliers of expected tuple value allowedDelta = (0.01, 0.01, 0.01) # these clusters were sorted compared to the cluster order in training h2o_kmeans.showClusterDistribution(self, tupleResultList, expected, trial=trial) # why is the expected # of rows not right in 
KMeans2. That means predictions are wrong h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=False, allowRowError=True, trial=trial) print "Trial #", trial, "completed\n"
def test_parse_bounds_libsvm(self): # just do the import folder once # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation) # so probably 10x that for covtype200 csvFilenameList = [ # FIX! fails KMeansScore ("colon-cancer.svm", "cA", 30, 1), ("connect4.svm", "cB", 30, 1), ("covtype.binary.svm", "cC", 30, 1), # multi-label class # ("tmc2007_train.svm", "cJ", 30, 1), ("mnist_train.svm", "cM", 30, 1), ("duke.svm", "cD", 30, 1), # too many features? 150K inspect timeout? # ("E2006.train.svm", "cE", 30, 1), ("gisette_scale.svm", "cF", 30, 1), ("mushrooms.svm", "cG", 30, 1), ("news20.svm", "cH", 30, 1), ("syn_6_1000_10.svm", "cK", 30, 1), ("syn_0_100_1000.svm", "cL", 30, 1), # normal csv ] ### csvFilenameList = random.sample(csvFilenameAll,1) # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False importFolderPath = "libsvm" for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList: # have to import each time, because h2o deletes source after parse csvPathname = importFolderPath + "/" + csvFilename # PARSE****************************************** # creates csvFilename.hex from file in importFolder dir parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, timeoutSecs=2000) print csvPathname, 'parse time:', parseResult['response']['time'] print "Parse result['destination_key']:", parseResult['destination_key'] # INSPECT****************************************** start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvFilename) # KMEANS****************************************** for trial in range(1): kwargs = { 'k': 3, 'initialization': 'Furthest', 'cols': range(10), # 'max_iter': 10, # 'normalize': 0, # reuse the same seed, to get deterministic results (otherwise sometimes fails 'seed': 
265211114317615310, } # fails if I put this in kwargs..i.e. source = dest # 'destination_key': parseResult['destination_key'], timeoutSecs = 600 start = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) # this does an inspect of the model and prints the clusters h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
def test_KMeans_params_rand2(self): SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED if localhost: csvFilenameList = [ # ('covtype.data', 60), ('covtype20x.data', 400), ] else: csvFilenameList = [ ('covtype20x.data', 400), ('covtype200x.data', 2000), ] importFolderPath = '/home/0xdiag/datasets' h2i.setupImportFolder(None, importFolderPath) for csvFilename, timeoutSecs in csvFilenameList: # creates csvFilename.hex from file in importFolder dir parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000, pollTimeoutSecs=60) inspect = h2o_cmd.runInspect(None, parseKey['destination_key']) csvPathname = importFolderPath + "/" + csvFilename print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) paramDict = define_params() for trial in range(3): randomV = paramDict['k'] k = random.choice(randomV) randomV = paramDict['epsilon'] epsilon = random.choice(randomV) randomV = paramDict['cols'] cols = random.choice(randomV) kwargs = { 'k': k, 'epsilon': epsilon, 'cols': cols, 'destination_key': csvFilename + "_" + str(trial) + '.hex' } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Trial #", trial, "completed\n"
def test_KMeans_allstate_s3n_thru_hdfs(self): csvFilename = "CAT*" URI = "s3n://home-0xdiag-datasets/cats" s3nKey = URI + "/" + csvFilename trialMax = 1 for trial in range(trialMax): trialStart = time.time() # since we delete the key, we have to re-import every iteration # s3n URI thru HDFS is not typical. importHDFSResult = h2o.nodes[0].import_hdfs(URI) s3nFullList = importHDFSResult['succeeded'] ### print "s3nFullList:", h2o.dump_json(s3nFullList) self.assertGreater(len(s3nFullList),1,"Didn't see more than 1 files in s3n?") storeView = h2o.nodes[0].store_view() ### print "storeView:", h2o.dump_json(storeView) for s in storeView['keys']: print "\nkey:", s['key'] if 'rows' in s: print "rows:", s['rows'], "value_size_bytes:", s['value_size_bytes'] key2 = csvFilename + "_" + str(trial) + ".hex" print "Loading s3n key: ", s3nKey, 'thru HDFS' # ec2 is about 400 secs on four m2.4xlarge nodes # should be less on more nodes? timeoutSecs = 600 start = time.time() parseKey = h2o.nodes[0].parse(s3nKey, key2, timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60, noise=('JStack', None)) elapsed = time.time() - start print s3nKey, 'h2o reported parse time:', parseKey['response']['time'] print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] kwargs = { 'cols': None, 'epsilon': 1e-6, 'k': 12 } start = time.time() kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120, **kwargs) elapsed = time.time() - start print "kmeans end on ", csvFilename, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key']) print h2o.dump_json(inspect) print "Deleting key in H2O so we get it from S3 (if ec2) or nfs again.", \ "Otherwise it would just parse the cached key." storeView = h2o.nodes[0].store_view() # pattern matching problem # h2o removes key afte parse now ### print "Removing", s3nKey ### removeKeyResult = h2o.nodes[0].remove_key(key=s3nKey) print "Trial #", trial, "completed in", time.time() - trialStart, "seconds.", \