def test_PCA_UCIwine(self): csvFilename = "wine.data" timeoutSecs = 300 trialStart = time.time() #parse trainKey = "wine.hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] #PCA params params = { 'destination_key': "python_PCA_key", 'tolerance': 0.0, 'standardize': 1 } kwargs = params.copy() h2o.beta_features = True #TODO(spencer): Hack around no polling FVEC PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, returnFast=False, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2) #time.sleep(100) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) #check PCA results pcaView = h2o_cmd.runPCAView(modelKey="python_PCA_key") h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView)
def test_PCA_UCIwine(self): csvFilename = "wine.data" timeoutSecs=300 trialStart = time.time() #parse trainKey = "wine.hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] #PCA params params = { 'destination_key': "python_PCA_key", 'tolerance':0.0, 'standardize':1 } kwargs = params.copy() h2o.beta_features = True #TODO(spencer): Hack around no polling FVEC PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, returnFast=False, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2) #time.sleep(100) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) #check PCA results pcaView = h2o_cmd.runPCAView(modelKey = "python_PCA_key") h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView)
def test_PCA_manyfiles_fvec(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'PCAModelKey' files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800) ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378] print ignore_x ignored_cols = ",".join(map(lambda x: "C" + str(x), ignore_x)) # for comparison ignore_x = h2o_glm.goodXFromColumnInfo(378, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) print ignore_x # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'ignored_cols': ignored_cols, 'tolerance': tolerance, 'standardize': 1, 'max_pc': None, } print "Using these parameters for PCA: ", params kwargs = params.copy() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout" print "Checking PCA results: " h2o_pca.simpleCheckPCA(self,pcaResult) h2o_pca.resultsCheckPCA(self,pcaResult) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = h2o_cmd.runInspect(key=modelKey) # errrs from end of list? is that the last tree? sdevs = pcaInspect["PCAModel"]["stdDev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["PCAModel"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_PCA_ignore_enums_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 3, 'cA', 300), # (10001, 2, 'cA', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** start = time.time() modelKey = 'PCAModelKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'ignored_cols': 'C1', 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey = modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_PCA_many_cols_enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] if localhost: tryList = [ (10000, 100, 'cA', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, 'cD', 300), (10000, 200, 'cE', 300), (10000, 300, 'cF', 300), (10000, 400, 'cG', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE **************************************** h2o.beta_features = False #turn off beta_features start = time.time() modelKey = 'PCAModelKey' # Parse **************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseResult['destination_key'] for h2o" parseResult['destination_key'] = trainKey elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # PCA(tolerance iterate)**************************************** #h2o.beta_features = True for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() #h2o.beta_features = True PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, noPoll = True, timeoutSecs=timeoutSecs, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey = modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) #h2o.beta_features = True pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_PCA_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 50, 'cB', 300), (10000, 100, 'cC', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print(rowCount, colCount, hex_key, timeoutSecs) SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** h2o.beta_features = False #turn off beta_features start = time.time() #h2o.beta_features = False modelKey = 'PCAModelKey' scoreKey = 'PCAScoreKey' # Parse **************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseResult['destination_key'] for h2o" parseResult['destination_key'] = trainKey elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # PCA(tolerance iterate)**************************************** h2o.beta_features = True for tolerance in [i / 10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } kwargs = params.copy() h2o.beta_features = True PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) pcaView = h2o_cmd.runPCAView(modelKey=modelKey) h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) #h2o.beta_features = True pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print #h2o.beta_features=False print print print num_pc = pcaInspect['pca_model']['num_pc'] print "The number of standard deviations obtained: ", num_pc print print print if DO_PCA_SCORE: # just score with same data score_params = { 'destination_key': scoreKey, 'model': modelKey, 'num_pc': num_pc, 'source': hex_key, } kwargs = score_params.copy() pcaScoreResult = h2o.nodes[0].pca_score( timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) print "PCAScore completed in", pcaScoreResult[ 'python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaScoreResult[ 'python_%timeout'], "% of the timeout" # Logging to a benchmark file algo = "PCAScore " + " num_pc=" + str( score_params['num_pc']) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaScoreResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l)
def test_PCA_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 50, 'cB', 300), (10000, 100, 'cC', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print (rowCount, colCount, hex_key, timeoutSecs) SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** start = time.time() modelKey = 'PCAModelKey' scoreKey = 'PCAScoreKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } kwargs = params.copy() PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) pcaView = h2o_cmd.runPCAView(modelKey=modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print print print print num_pc = pcaInspect['pca_model']['num_pc'] print "The number of standard deviations obtained: ", num_pc print print print if DO_PCA_SCORE: # just score with same data score_params = { 'destination_key': scoreKey, 'model': modelKey, 'num_pc': num_pc, 'source': hex_key, } kwargs = score_params.copy() pcaScoreResult = h2o.nodes[0].pca_score(timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) print "PCAScore completed in", pcaScoreResult['python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaScoreResult['python_%timeout'], "% of the timeout" # Logging to a benchmark file algo = "PCAScore " + " num_pc=" + str(score_params['num_pc']) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaScoreResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l)
def test_PCA_many_cols_enum_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", ] if localhost: tryList = [ (1001, 2, "cA", 300), # (10001, 2, 'cA', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, "cD", 300), (10000, 200, "cE", 300), (10000, 300, "cF", 300), (10000, 400, "cG", 300), (10000, 500, "cH", 300), (10000, 1000, "cI", 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE **************************************** start = time.time() modelKey = "PCAModelKey" # Parse **************************************** parseResult = h2i.import_parse( bucket=None, path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False ) elapsed = time.time() - start print "parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseResult["destination_key"] # Logging to a benchmark file algo = "Parse" l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed ) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult["destination_key"]) print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), " numCols:", "{:,}".format( inspect["numCols"] ) numRows = inspect["numRows"] numCols = inspect["numCols"] # PCA(tolerance iterate)**************************************** for tolerance in [i / 10.0 for i in range(11)]: params = {"ignored_cols": "C1", "destination_key": modelKey, "tolerance": tolerance, "standardize": 1} print "Using these parameters for PCA: ", params kwargs = params.copy() PCAResult = {"python_elapsed": 0, "python_%timeout": 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start PCAResult["python_elapsed"] = elapsed PCAResult["python_%timeout"] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult["python_elapsed"], "seconds.", "%f pct. of timeout" % ( PCAResult["python_%timeout"] ) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey=modelKey) h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult["python_elapsed"] ) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_PCA_manyfiles(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800) ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseResult['destination_key'] for h2o" parseResult['destination_key'] = hexKey elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # PCA(tolerance iterate)**************************************** #h2o.beta_features = True for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'ignore': 0, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() #h2o.beta_features = True pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout" print "Checking PCA results: " h2o_pca.simpleCheckPCA(self,pcaResult) h2o_pca.resultsCheckPCA(self,pcaResult) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) #h2o.beta_features = True pcaInspect = h2o_cmd.runInspect(key=modelKey) # errrs from end of list? is that the last tree? sdevs = pcaInspect["PCAModel"]["stdDev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["PCAModel"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print