Exemplo n.º 1
0
    def test_PCA_UCIwine(self):
        csvFilename = "wine.data"
        timeoutSecs = 300
        trialStart = time.time()
        #parse
        trainKey = "wine.hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       schema='local',
                                       hex_key=trainKey,
                                       timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        #PCA params
        params = {
            'destination_key': "python_PCA_key",
            'tolerance': 0.0,
            'standardize': 1
        }

        kwargs = params.copy()
        h2o.beta_features = True
        #TODO(spencer): Hack around no polling FVEC
        PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
        start = time.time()
        h2o_cmd.runPCA(parseResult=parseResult,
                       timeoutSecs=timeoutSecs,
                       noPoll=True,
                       returnFast=False,
                       **kwargs)
        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                         pollTimeoutSecs=120,
                         retryDelaySecs=2)
        #time.sleep(100)
        elapsed = time.time() - start
        PCAResult['python_elapsed'] = elapsed
        PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs
        print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
              "%f pct. of timeout" % (PCAResult['python_%timeout'])
        #check PCA results
        pcaView = h2o_cmd.runPCAView(modelKey="python_PCA_key")
        h2o_pca.simpleCheckPCA(self, pcaView)
        h2o_pca.resultsCheckPCA(self, pcaView)
Exemplo n.º 2
0
    def test_PCA_UCIwine(self):
        csvFilename = "wine.data"
        timeoutSecs=300
        trialStart = time.time()
        #parse
        trainKey = "wine.hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='local',
            hex_key=trainKey, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        #PCA params
        params = { 
            'destination_key': "python_PCA_key",
            'tolerance':0.0,
            'standardize':1
            }   

        kwargs = params.copy()
        h2o.beta_features = True
        #TODO(spencer): Hack around no polling FVEC
        PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
        start = time.time()
        h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, returnFast=False, **kwargs)
        h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2)
        #time.sleep(100)
        elapsed = time.time() - start
        PCAResult['python_elapsed']  = elapsed
        PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
        print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
              "%f pct. of timeout" % (PCAResult['python_%timeout'])
        #check PCA results
        pcaView = h2o_cmd.runPCAView(modelKey = "python_PCA_key")
        h2o_pca.simpleCheckPCA(self,pcaView)
        h2o_pca.resultsCheckPCA(self,pcaView)
Exemplo n.º 3
0
    def test_PCA_manyfiles_fvec(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'PCAModelKey'
        files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800)
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=hexKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]
            print ignore_x
            ignored_cols = ",".join(map(lambda x: "C" + str(x), ignore_x))
            
            # for comparison
            ignore_x = h2o_glm.goodXFromColumnInfo(378, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
            print ignore_x


            # PCA(tolerance iterate)****************************************
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'ignored_cols': ignored_cols,
                    'tolerance': tolerance,
                    'standardize': 1,
                    'max_pc': None,
                }

                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult,
                     timeoutSecs=timeoutSecs, **kwargs)
                print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname
                print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout"
                print "Checking PCA results: "
        
                h2o_pca.simpleCheckPCA(self,pcaResult)
                h2o_pca.resultsCheckPCA(self,pcaResult)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = h2o_cmd.runInspect(key=modelKey)
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["PCAModel"]["stdDev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["PCAModel"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
Exemplo n.º 4
0
    def test_PCA_ignore_enums_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100, 3, 'cA', 300), 
            # (10001, 2, 'cA', 300), 
            # (10000, 500, 'cH', 300), 
            # (10000, 1000, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            start = time.time()
            modelKey = 'PCAModelKey'

            # Parse ****************************************
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # PCA(tolerance iterate)****************************************
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'ignored_cols': 'C1',
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                h2j.pollStatsWhileBusy(timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                PCAResult['python_elapsed']  = elapsed
                PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])            
    
                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey = modelKey) 
                h2o_pca.simpleCheckPCA(self,pcaView)
                h2o_pca.resultsCheckPCA(self,pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = pcaView
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
Exemplo n.º 5
0
def doPCA(f, folderPath):
    debug = False
    bench = "bench"
    if debug:
        print "Doing PCA DEBUG"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    retryDelaySecs = 5 #if f == 'AirlinesTrain1x' else 30
    overallWallStart = time.time()
    pre = ""
    if debug: pre    = 'DEBUG'
    pcabenchcsv      = 'benchmarks/'+build+'/'+pre+'pcabench.csv'
    if not os.path.exists(pcabenchcsv):
        output = open(pcabenchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output = open(pcabenchcsv,'a')
    csvWrt     = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB     = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else: 
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        
        hex_key             = f + '.hex'
        trainParseWallStart = time.time()
        hK                  = folderPath + "Header.csv"
        headerPathname      = importFolderPath + "/" + hK
        
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey           = h2i.find_key(hK)
        parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets', 
                                       path             = csvPathname, 
                                       schema           = 'local', 
                                       hex_key          = hex_key,
                                       header           = 1, 
                                       header_from_file = headerKey, 
                                       separator        = 44,
                                       timeoutSecs      = 7200, 
                                       retryDelaySecs   = retryDelaySecs,
                                       pollTimeoutSecs  = 7200,
                                       doSummary        = False
                                      )
        parseWallTime       = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime ," seconds." 
        inspect             = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200)
        
        nMachines           = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
        row                 =  {'h2o_build'          : build, 
                                'nMachines'          : nMachines,
                                'nJVMs'              : len(h2o.nodes),
                                'Xmx/JVM'            : java_heap_GB,
                                'dataset'            : f,
                                'nRows'              : inspect['num_rows'],
                                'nCols'              : inspect['num_cols'],
                                'parseWallTime'      : parseWallTime,
                               }
    
        params              =  {'destination_key'    : "python_PCA_key",
                                'tolerance'          : 0.0,
                                'standardize'        : 1,
                               }

        kwargs              = params.copy()
        pcaStart            = time.time()
        #h2o.beta_features   = True
        pcaResult = h2o_cmd.runPCA(parseResult = parseResult, noPoll = True,
                                   timeoutSecs = 7200, 
                                   **kwargs)

        h2j.pollWaitJobs(timeoutSecs=4800, pollTimeoutSecs=4800, retryDelaySecs=2)
        pcaTime   = time.time() - pcaStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        #stop all loggers
        os.system(cmd)
        row.update({'pcaBuildTime' : pcaTime})
        csvWrt.writerow(row)
    finally:
        output.close()
    def test_PCA_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        if localhost:
            tryList = [
                (10000, 100, 'cA', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (10000, 100, 'cD', 300), 
                (10000, 200, 'cE', 300), 
                (10000, 300, 'cF', 300), 
                (10000, 400, 'cG', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE ****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            modelKey = 'PCAModelKey'

            # Parse ****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseResult['destination_key'] for h2o"
                parseResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # PCA(tolerance iterate)****************************************
            #h2o.beta_features = True
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                #h2o.beta_features = True
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, noPoll = True,
                     timeoutSecs=timeoutSecs, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=120, retryDelaySecs=2)
                elapsed = time.time() - start
                PCAResult['python_elapsed']  = elapsed
                PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])            
    
                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey = modelKey) 
                h2o_pca.simpleCheckPCA(self,pcaView)
                h2o_pca.resultsCheckPCA(self,pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                #h2o.beta_features = True
                pcaInspect = pcaView
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
Exemplo n.º 7
0
    def test_PCA_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 10, 'cA', 300),
            (10000, 50, 'cB', 300),
            (10000, 100, 'cC', 300),
            # (10000, 500, 'cH', 300),
            # (10000, 1000, 'cI', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print(rowCount, colCount, hex_key, timeoutSecs)
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            h2o.beta_features = False  #turn off beta_features
            start = time.time()

            #h2o.beta_features = False
            modelKey = 'PCAModelKey'
            scoreKey = 'PCAScoreKey'

            # Parse ****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseResult = h2i.import_parse(bucket=None,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           noPoll=h2o.beta_features,
                                           doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseResult['destination_key'] for h2o"
                parseResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # PCA(tolerance iterate)****************************************
            h2o.beta_features = True
            for tolerance in [i / 10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                kwargs = params.copy()
                h2o.beta_features = True
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                h2o_cmd.runPCA(parseResult=parseResult,
                               timeoutSecs=timeoutSecs,
                               noPoll=True,
                               **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300,
                                 pollTimeoutSecs=120,
                                 retryDelaySecs=2)
                elapsed = time.time() - start
                PCAResult['python_elapsed'] = elapsed
                PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])

                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self, pcaView)
                h2o_pca.resultsCheckPCA(self, pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)

                #h2o.beta_features = True
                pcaInspect = pcaView
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["pca_model"]["sdev"]
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
                #h2o.beta_features=False
                print
                print
                print
                num_pc = pcaInspect['pca_model']['num_pc']
                print "The number of standard deviations obtained: ", num_pc
                print
                print
                print

                if DO_PCA_SCORE:
                    # just score with same data
                    score_params = {
                        'destination_key': scoreKey,
                        'model': modelKey,
                        'num_pc': num_pc,
                        'source': hex_key,
                    }
                    kwargs = score_params.copy()
                    pcaScoreResult = h2o.nodes[0].pca_score(
                        timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                    h2j.pollWaitJobs(timeoutSecs=300,
                                     pollTimeoutSecs=120,
                                     retryDelaySecs=2)
                    print "PCAScore completed in", pcaScoreResult[
                        'python_elapsed'], "seconds. On dataset: ", csvPathname
                    print "Elapsed time was ", pcaScoreResult[
                        'python_%timeout'], "% of the timeout"

                    # Logging to a benchmark file
                    algo = "PCAScore " + " num_pc=" + str(
                        score_params['num_pc'])
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                        csvFilename, pcaScoreResult['python_elapsed'])
                    print l
                    h2o.cloudPerfH2O.message(l)
Exemplo n.º 8
0
    def test_PCA_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 10, 'cA', 300), 
            (10000, 50, 'cB', 300), 
            (10000, 100, 'cC', 300), 
            # (10000, 500, 'cH', 300), 
            # (10000, 1000, 'cI', 300), 
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print (rowCount, colCount, hex_key, timeoutSecs)
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            start = time.time()

            modelKey = 'PCAModelKey'
            scoreKey = 'PCAScoreKey'

            # Parse ****************************************
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # PCA(tolerance iterate)****************************************
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                kwargs = params.copy()
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2)
                elapsed = time.time() - start
                PCAResult['python_elapsed']  = elapsed
                PCAResult['python_%timeout'] = 1.0*elapsed / timeoutSecs
                print "PCA completed in",     PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])
                
                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self,pcaView)
                h2o_pca.resultsCheckPCA(self,pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)

                pcaInspect = pcaView
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
                print
                print
                print 
                num_pc = pcaInspect['pca_model']['num_pc']
                print "The number of standard deviations obtained: ", num_pc
                print 
                print
                print


                if DO_PCA_SCORE:
                    # just score with same data
                    score_params = {
                        'destination_key': scoreKey,
                        'model': modelKey,
                        'num_pc': num_pc,
                        'source':  hex_key,
                    }
                    kwargs = score_params.copy()
                    pcaScoreResult = h2o.nodes[0].pca_score(timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                    h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=2)
                    print "PCAScore completed in", pcaScoreResult['python_elapsed'], "seconds. On dataset: ", csvPathname
                    print "Elapsed time was ", pcaScoreResult['python_%timeout'], "% of the timeout"

                    # Logging to a benchmark file
                    algo = "PCAScore " + " num_pc=" + str(score_params['num_pc'])
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaScoreResult['python_elapsed'])
                    print l
                    h2o.cloudPerfH2O.message(l)
Exemplo n.º 9
0
def doPCA(f, folderPath):
    debug = False
    bench = "bench"
    if debug:
        print "Doing PCA DEBUG"
        bench = "bench/debug"
    #date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    retryDelaySecs = 5  #if f == 'AirlinesTrain1x' else 30
    overallWallStart = time.time()
    pre = ""
    if debug: pre = 'DEBUG'
    pcabenchcsv = 'benchmarks/' + build + '/' + pre + 'pcabench.csv'
    if not os.path.exists(pcabenchcsv):
        output = open(pcabenchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(pcabenchcsv, 'a')
    csvWrt = csv.DictWriter(output,
                            fieldnames=csv_header,
                            restval=None,
                            dialect='excel',
                            extrasaction='ignore',
                            delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in [
                'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x',
                'AllBedroomsTrain100x'
        ]):
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            csvPathname = importFolderPath + "/" + f + "/*linked*"

        hex_key = f + '.hex'
        trainParseWallStart = time.time()
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK

        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=retryDelaySecs,
                                       pollTimeoutSecs=7200,
                                       doSummary=False)
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime, " seconds."
        inspect = h2o.nodes[0].inspect(parseResult['destination_key'],
                                       timeoutSecs=7200)

        nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
        row = {
            'h2o_build': build,
            'nMachines': nMachines,
            'nJVMs': len(h2o.nodes),
            'Xmx/JVM': java_heap_GB,
            'dataset': f,
            'nRows': inspect['num_rows'],
            'nCols': inspect['num_cols'],
            'parseWallTime': parseWallTime,
        }

        params = {
            'destination_key': "python_PCA_key",
            'tolerance': 0.0,
            'standardize': 1,
        }

        kwargs = params.copy()
        pcaStart = time.time()
        #h2o.beta_features   = True
        pcaResult = h2o_cmd.runPCA(parseResult=parseResult,
                                   noPoll=True,
                                   timeoutSecs=7200,
                                   **kwargs)

        h2j.pollWaitJobs(timeoutSecs=4800,
                         pollTimeoutSecs=4800,
                         retryDelaySecs=2)
        pcaTime = time.time() - pcaStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        #stop all loggers
        os.system(cmd)
        row.update({'pcaBuildTime': pcaTime})
        csvWrt.writerow(row)
    finally:
        output.close()
Exemplo n.º 10
0
    def test_PCA_many_cols_enum_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            "a",
            "b",
            "c",
            "d",
            "e",
            "f",
            "g",
            "h",
            "i",
            "j",
            "k",
            "l",
            "m",
            "n",
            "o",
            "p",
            "q",
            "r",
            "s",
            "t",
            "u",
        ]

        if localhost:
            tryList = [
                (1001, 2, "cA", 300),
                # (10001, 2, 'cA', 300),
                # (10000, 500, 'cH', 300),
                # (10000, 1000, 'cI', 300),
            ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300),
                # (10000, 50, 'cC', 300),
                (10000, 100, "cD", 300),
                (10000, 200, "cE", 300),
                (10000, 300, "cF", 300),
                (10000, 400, "cG", 300),
                (10000, 500, "cH", 300),
                (10000, 1000, "cI", 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE ****************************************
            start = time.time()
            modelKey = "PCAModelKey"

            # Parse ****************************************
            parseResult = h2i.import_parse(
                bucket=None, path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False
            )

            elapsed = time.time() - start
            print "parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # Logging to a benchmark file
            algo = "Parse"
            l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed
            )
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult["destination_key"])
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # PCA(tolerance iterate)****************************************
            for tolerance in [i / 10.0 for i in range(11)]:
                params = {"ignored_cols": "C1", "destination_key": modelKey, "tolerance": tolerance, "standardize": 1}
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                PCAResult = {"python_elapsed": 0, "python_%timeout": 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                PCAResult["python_elapsed"] = elapsed
                PCAResult["python_%timeout"] = 1.0 * elapsed / timeoutSecs
                print "PCA completed in", PCAResult["python_elapsed"], "seconds.", "%f pct. of timeout" % (
                    PCAResult["python_%timeout"]
                )

                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self, pcaView)
                h2o_pca.resultsCheckPCA(self, pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult["python_elapsed"]
                )
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = pcaView
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["pca_model"]["sdev"]
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
Exemplo n.º 11
0
    def test_PCA_manyfiles(self):
        """Parse one manyfiles-nflx-gz file and run PCA at tolerances 0.0 .. 1.0.

        Each entry of `files` is parsed from the 'home-0xdiag-datasets'
        bucket and inspected; PCA is then run once per tolerance. Every
        PCA result is checked with h2o_pca and its elapsed time is reported
        to h2o.cloudPerfH2O, after which the model key is inspected to print
        the standard deviations and proportions of variance.
        """
        bucket = 'home-0xdiag-datasets'
        # NOTE(review): named after GBM, but below it is only used as the PCA
        # 'destination_key' and as the key passed to runInspect.
        modelKey = 'GBMModelKey'
        # each entry: (importFolderPath, csvFilename, hexKey, timeoutSecs)
        files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800)
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, csvFilename, hexKey, timeoutSecs) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            # these three lists are not used in the visible part of this method
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            # beta_features was forced to False just above, so this message is
            # skipped unless that assignment is changed
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + csvFilename
            # noPoll follows the beta_features flag
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=hexKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            # when the parse was started with noPoll, wait for the job here and
            # fill in the destination key by hand
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseResult['destination_key'] for h2o"
                parseResult['destination_key'] = hexKey

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the PCA
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            # not read again in the visible part of this method
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # PCA(tolerance iterate)****************************************
            #h2o.beta_features = True
            # tolerance takes the eleven values 0.0, 0.1, ..., 1.0
            for tolerance in [i/10.0 for i in range(11)]:
                params = {
                    'destination_key': modelKey,
                    'ignore': 0,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                #h2o.beta_features = True

                # no noPoll here: the returned dict is read directly below for
                # 'python_elapsed' and 'python_%timeout'
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult,
                     timeoutSecs=timeoutSecs, **kwargs)
                print "PCA completed in", pcaResult['python_elapsed'], "seconds. On dataset: ", csvPathname
                print "Elapsed time was ", pcaResult['python_%timeout'], "% of the timeout"
                print "Checking PCA results: "
        
                h2o_pca.simpleCheckPCA(self,pcaResult)
                h2o_pca.resultsCheckPCA(self,pcaResult)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, pcaResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                #h2o.beta_features = True
                # the model is read back through Inspect on the model key
                pcaInspect = h2o_cmd.runInspect(key=modelKey)
                # errrs from end of list? is that the last tree?
                sdevs = pcaInspect["PCAModel"]["stdDev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["PCAModel"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print