def test_PCA_UCIwine(self): csvFilename = "wine.data" timeoutSecs = 300 trialStart = time.time() #parse trainKey = "wine.hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] #PCA params params = { 'destination_key': "python_PCA_key", 'tolerance': 0.0, 'standardize': 1 } kwargs = params.copy() h2o.beta_features = True #TODO(spencer): Hack around no polling FVEC PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, returnFast=False, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2) #time.sleep(100) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) #check PCA results pcaView = h2o_cmd.runPCAView(modelKey="python_PCA_key") h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView)
def test_PCA_UCIwine(self): csvFilename = "wine.data" timeoutSecs=300 trialStart = time.time() #parse trainKey = "wine.hex" start = time.time() parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='local', hex_key=trainKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] #PCA params params = { 'destination_key': "python_PCA_key", 'tolerance':0.0, 'standardize':1 } kwargs = params.copy() h2o.beta_features = True #TODO(spencer): Hack around no polling FVEC PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, returnFast=False, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2) #time.sleep(100) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) #check PCA results pcaView = h2o_cmd.runPCAView(modelKey = "python_PCA_key") h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView)
def test_PCA_manyfiles_fvec(self): h2o.beta_features = True bucket = 'home-0xdiag-datasets' modelKey = 'PCAModelKey' files = [ # None forces numCols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800) ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files: # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378] print ignore_x ignored_cols = ",".join(map(lambda x: "C" + str(x), ignore_x)) # for comparison ignore_x = h2o_glm.goodXFromColumnInfo(378, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) print ignore_x # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'ignored_cols': ignored_cols, 'tolerance': tolerance, 'standardize': 1, 'max_pc': None, } print "Using these parameters for PCA: ", params kwargs = params.copy() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout" print "Checking PCA results: " h2o_pca.simpleCheckPCA(self,pcaResult) h2o_pca.resultsCheckPCA(self,pcaResult) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = h2o_cmd.runInspect(key=modelKey) # errrs from end of list? is that the last tree? sdevs = pcaInspect["PCAModel"]["stdDev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["PCAModel"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_PCA_ignore_enums_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (100, 3, 'cA', 300), # (10001, 2, 'cA', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** start = time.time() modelKey = 'PCAModelKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'ignored_cols': 'C1', 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey = modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def doPCA(f, folderPath): debug = False bench = "bench" if debug: print "Doing PCA DEBUG" bench = "bench/debug" #date = '-'.join([str(x) for x in list(time.localtime())][0:3]) retryDelaySecs = 5 #if f == 'AirlinesTrain1x' else 30 overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' pcabenchcsv = 'benchmarks/'+build+'/'+pre+'pcabench.csv' if not os.path.exists(pcabenchcsv): output = open(pcabenchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(pcabenchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' trainParseWallStart = time.time() hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 7200, retryDelaySecs = retryDelaySecs, pollTimeoutSecs = 7200, doSummary = False ) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime ," seconds." inspect = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row = {'h2o_build' : build, 'nMachines' : nMachines, 'nJVMs' : len(h2o.nodes), 'Xmx/JVM' : java_heap_GB, 'dataset' : f, 'nRows' : inspect['num_rows'], 'nCols' : inspect['num_cols'], 'parseWallTime' : parseWallTime, } params = {'destination_key' : "python_PCA_key", 'tolerance' : 0.0, 'standardize' : 1, } kwargs = params.copy() pcaStart = time.time() #h2o.beta_features = True pcaResult = h2o_cmd.runPCA(parseResult = parseResult, noPoll = True, timeoutSecs = 7200, **kwargs) h2j.pollWaitJobs(timeoutSecs=4800, pollTimeoutSecs=4800, retryDelaySecs=2) pcaTime = time.time() - pcaStart cmd = 'bash startloggers.sh ' + json + ' stop_' #stop all loggers os.system(cmd) row.update({'pcaBuildTime' : pcaTime}) csvWrt.writerow(row) finally: output.close()
def test_PCA_many_cols_enum(self): SYNDATASETS_DIR = h2o.make_syn_dir() translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u'] if localhost: tryList = [ (10000, 100, 'cA', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, 'cD', 300), (10000, 200, 'cE', 300), (10000, 300, 'cF', 300), (10000, 400, 'cG', 300), (10000, 500, 'cH', 300), (10000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE **************************************** h2o.beta_features = False #turn off beta_features start = time.time() modelKey = 'PCAModelKey' # Parse **************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseResult['destination_key'] for h2o" parseResult['destination_key'] = trainKey elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # PCA(tolerance iterate)**************************************** #h2o.beta_features = True for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() #h2o.beta_features = True PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, noPoll = True, timeoutSecs=timeoutSecs, **kwargs) h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey = modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) #h2o.beta_features = True pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_PCA_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 50, 'cB', 300), (10000, 100, 'cC', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print(rowCount, colCount, hex_key, timeoutSecs) SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** h2o.beta_features = False #turn off beta_features start = time.time() #h2o.beta_features = False modelKey = 'PCAModelKey' scoreKey = 'PCAScoreKey' # Parse **************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseResult['destination_key'] for h2o" parseResult['destination_key'] = trainKey elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # PCA(tolerance iterate)**************************************** h2o.beta_features = True for tolerance in [i / 10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } kwargs = params.copy() h2o.beta_features = True PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) pcaView = h2o_cmd.runPCAView(modelKey=modelKey) h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) #h2o.beta_features = True pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print #h2o.beta_features=False print print print num_pc = pcaInspect['pca_model']['num_pc'] print "The number of standard deviations obtained: ", num_pc print print print if DO_PCA_SCORE: # just score with same data score_params = { 'destination_key': scoreKey, 'model': modelKey, 'num_pc': num_pc, 'source': hex_key, } kwargs = score_params.copy() pcaScoreResult = h2o.nodes[0].pca_score( timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) print "PCAScore completed in", pcaScoreResult[ 'python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaScoreResult[ 'python_%timeout'], "% of the timeout" # Logging to a benchmark file algo = "PCAScore " + " num_pc=" + str( score_params['num_pc']) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaScoreResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l)
def test_PCA_many_cols(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), (10000, 50, 'cB', 300), (10000, 100, 'cC', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: print (rowCount, colCount, hex_key, timeoutSecs) SEEDPERFILE = random.randint(0, sys.maxint) # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) # PARSE **************************************** start = time.time() modelKey = 'PCAModelKey' scoreKey = 'PCAScoreKey' # Parse **************************************** parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) numRows = inspect['numRows'] numCols = inspect['numCols'] # PCA(tolerance iterate)**************************************** for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'tolerance': tolerance, 'standardize': 1, } kwargs = params.copy() PCAResult = {'python_elapsed': 0, 'python_%timeout': 0} start = time.time() h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) elapsed = time.time() - start PCAResult['python_elapsed'] = elapsed PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \ "%f pct. of timeout" % (PCAResult['python_%timeout']) pcaView = h2o_cmd.runPCAView(modelKey=modelKey) h2o_pca.simpleCheckPCA(self,pcaView) h2o_pca.resultsCheckPCA(self,pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print print print print num_pc = pcaInspect['pca_model']['num_pc'] print "The number of standard deviations obtained: ", num_pc print print print if DO_PCA_SCORE: # just score with same data score_params = { 'destination_key': scoreKey, 'model': modelKey, 'num_pc': num_pc, 'source': hex_key, } kwargs = score_params.copy() pcaScoreResult = h2o.nodes[0].pca_score(timeoutSecs=timeoutSecs, noPoll=True, **kwargs) h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2) print "PCAScore completed in", pcaScoreResult['python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaScoreResult['python_%timeout'], "% of the timeout" # Logging to a benchmark file algo = "PCAScore " + " num_pc=" + str(score_params['num_pc']) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaScoreResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l)
def doPCA(f, folderPath): debug = False bench = "bench" if debug: print "Doing PCA DEBUG" bench = "bench/debug" #date = '-'.join([str(x) for x in list(time.localtime())][0:3]) retryDelaySecs = 5 #if f == 'AirlinesTrain1x' else 30 overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' pcabenchcsv = 'benchmarks/' + build + '/' + pre + 'pcabench.csv' if not os.path.exists(pcabenchcsv): output = open(pcabenchcsv, 'w') output.write(','.join(csv_header) + '\n') else: output = open(pcabenchcsv, 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in [ 'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x' ]): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' trainParseWallStart = time.time() hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, header=1, header_from_file=headerKey, separator=44, timeoutSecs=7200, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=7200, doSummary=False) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime, " seconds." inspect = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row = { 'h2o_build': build, 'nMachines': nMachines, 'nJVMs': len(h2o.nodes), 'Xmx/JVM': java_heap_GB, 'dataset': f, 'nRows': inspect['num_rows'], 'nCols': inspect['num_cols'], 'parseWallTime': parseWallTime, } params = { 'destination_key': "python_PCA_key", 'tolerance': 0.0, 'standardize': 1, } kwargs = params.copy() pcaStart = time.time() #h2o.beta_features = True pcaResult = h2o_cmd.runPCA(parseResult=parseResult, noPoll=True, timeoutSecs=7200, **kwargs) h2j.pollWaitJobs(timeoutSecs=4800, pollTimeoutSecs=4800, retryDelaySecs=2) pcaTime = time.time() - pcaStart cmd = 'bash startloggers.sh ' + json + ' stop_' #stop all loggers os.system(cmd) row.update({'pcaBuildTime': pcaTime}) csvWrt.writerow(row) finally: output.close()
def test_PCA_many_cols_enum_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() translateList = [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", ] if localhost: tryList = [ (1001, 2, "cA", 300), # (10001, 2, 'cA', 300), # (10000, 500, 'cH', 300), # (10000, 1000, 'cI', 300), ] else: tryList = [ # (10000, 10, 'cB', 300), # (10000, 50, 'cC', 300), (10000, 100, "cD", 300), (10000, 200, "cE", 300), (10000, 300, "cF", 300), (10000, 400, "cG", 300), (10000, 500, "cH", 300), (10000, 1000, "cI", 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv" csvPathname = SYNDATASETS_DIR + "/" + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList) # PARSE **************************************** start = time.time() modelKey = "PCAModelKey" # Parse **************************************** parseResult = h2i.import_parse( bucket=None, path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False ) elapsed = time.time() - start print "parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs ) print "parse result:", parseResult["destination_key"] # Logging to a benchmark file algo = "Parse" l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed ) print l h2o.cloudPerfH2O.message(l) inspect = h2o_cmd.runInspect(key=parseResult["destination_key"]) print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), " numCols:", "{:,}".format( inspect["numCols"] ) numRows = inspect["numRows"] numCols = inspect["numCols"] # PCA(tolerance iterate)**************************************** for tolerance in [i / 10.0 for i in range(11)]: params = {"ignored_cols": "C1", "destination_key": modelKey, "tolerance": tolerance, "standardize": 1} print "Using these parameters for PCA: ", params kwargs = params.copy() PCAResult = {"python_elapsed": 0, "python_%timeout": 0} start = time.time() pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start PCAResult["python_elapsed"] = elapsed PCAResult["python_%timeout"] = 1.0 * elapsed / timeoutSecs print "PCA completed in", PCAResult["python_elapsed"], "seconds.", "%f pct. of timeout" % ( PCAResult["python_%timeout"] ) print "Checking PCA results: " pcaView = h2o_cmd.runPCAView(modelKey=modelKey) h2o_pca.simpleCheckPCA(self, pcaView) h2o_pca.resultsCheckPCA(self, pcaView) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult["python_elapsed"] ) print l h2o.cloudPerfH2O.message(l) pcaInspect = pcaView # errrs from end of list? is that the last tree? sdevs = pcaInspect["pca_model"]["sdev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["pca_model"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print
def test_PCA_manyfiles(self): bucket = 'home-0xdiag-datasets' modelKey = 'GBMModelKey' files = [ # None forces num_cols to be used. assumes you set it from Inspect ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800) ] # if I got to hdfs, it's here # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz # h2b.browseTheCloud() for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files: h2o.beta_features = False #turn off beta_features # PARSE train**************************************** start = time.time() xList = [] eList = [] fList = [] # Parse (train)**************************************** if h2o.beta_features: print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!" csvPathname = importFolderPath + "/" + csvFilename parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', hex_key=hexKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False) # hack if h2o.beta_features: h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs) print "Filling in the parseResult['destination_key'] for h2o" parseResult['destination_key'] = hexKey elapsed = time.time() - start print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # Logging to a benchmark file algo = "Parse" l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed) print l h2o.cloudPerfH2O.message(l) # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA # h2o.beta_features = True inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] # PCA(tolerance iterate)**************************************** #h2o.beta_features = True for tolerance in [i/10.0 for i in range(11)]: params = { 'destination_key': modelKey, 'ignore': 0, 'tolerance': tolerance, 'standardize': 1, } print "Using these parameters for PCA: ", params kwargs = params.copy() #h2o.beta_features = True pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout" print "Checking PCA results: " h2o_pca.simpleCheckPCA(self,pcaResult) h2o_pca.resultsCheckPCA(self,pcaResult) # Logging to a benchmark file algo = "PCA " + " tolerance=" + str(tolerance) l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format( len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed']) print l h2o.cloudPerfH2O.message(l) #h2o.beta_features = True pcaInspect = h2o_cmd.runInspect(key=modelKey) # errrs from end of list? is that the last tree? sdevs = pcaInspect["PCAModel"]["stdDev"] print "PCA: standard deviations are :", sdevs print print propVars = pcaInspect["PCAModel"]["propVar"] print "PCA: Proportions of variance by eigenvector are :", propVars print print