Example #1
    def process_dataset(self, parseResult, Y, e_coefs, e_ndev, e_rdev, e_aic, **kwargs):
        # no regularization
        kwargs['alpha'] = 0
        kwargs['lambda'] = 0
        kwargs['response'] = 'CAPSULE'
        glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, **kwargs)

        (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
        cstring = "".join([("%.5e  " % c) for c in clist])
        h2p.green_print("h2o coefficient list:", cstring)
        h2p.green_print("h2o intercept", "%.5e  " %  intercept)

        # other stuff in the json response

        # the first submodel is the right one, if only one lambda is provided as a parameter above
        glm_model = glmResult['glm_model']
        submodels = glm_model['submodels'][0]
        validation = submodels['validation']
        null_deviance = validation['null_deviance']
        residual_deviance = validation['residual_deviance']

        errors = []
        # FIX! our null deviance doesn't seem to match
        h2o.verboseprint("Comparing:", null_deviance, e_ndev)
        # if abs(float(null_deviance) - e_ndev) > (0.001 * e_ndev):
        #    errors.append('NullDeviance: %f != %s' % (e_ndev, null_deviance))

        # FIX! our res deviance doesn't seem to match
        h2o.verboseprint("Comparing:", residual_deviance, e_rdev)
        # if abs(float(residual_deviance) - e_rdev) > (0.001 * e_rdev):
        #    errors.append('ResDeviance: %f != %s' % (e_rdev, residual_deviance))

        # FIX! we don't have an AIC to compare?
        return errors
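The commented-out checks above describe a relative-tolerance comparison against expected deviances. A minimal standalone sketch of that idea, assuming the 0.1% tolerance from the comments (the helper name and values are illustrative, not part of the h2o harness):

def check_relative(name, actual, expected, errors, tol=0.001):
    # append an error string if actual is more than tol (relative) away from expected
    if abs(float(actual) - float(expected)) > (tol * abs(float(expected))):
        errors.append('%s: %s != %s' % (name, actual, expected))

errors = []
check_relative('NullDeviance', 101.3, 101.2, errors)
print errors  # [] because 101.3 is within 0.1% of 101.2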
Example #2
    def test_exec_assign(self):
        ### h2b.browseTheCloud()

        lenNodes = len(h2o.nodes)
        trial = 0
        while (trial < 200):
            for execExpr in initList:
                if (trial==100):
                    print "\nNow switching between nodes"

                if (trial < 100):
                    nodeX = 0
                else:
                    nodeX = random.randint(0,lenNodes-1)
                ### print nodeX

                resultKey = "Result" + str(trial % period)
                execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr, 
                    resultKey=resultKey, timeoutSecs=4)

                ### print "\nexecResult:", execResultInspect

                print "trial: #" + str(trial), min_value, execExpr
                h2o.verboseprint("min_value: ", min_value, "trial:", trial)
                self.assertEqual(float(min_value), float((trial % period) - 1), 
                    "exec constant assigns don't seem to be getting done and visible to Inspect")

                sys.stdout.write('.')
                sys.stdout.flush()

                ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                trial += 1
Example #3
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = [
                'covtype.data',
                ]
        else:
            csvFilenameList = [
                'covtype200x.data',
                'covtype200x.data',
                'covtype.data',
                'covtype.data',
                'covtype20x.data',
                'covtype20x.data',
                ]

        # a browser window too, just because we can
        ## h2b.browseTheCloud()
        importFolderPath = "standard"
        validations1= {}
        coefficients1= {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {'y': 54, 'n_folds': 2, 'family': "binomial", 'case': 1}
            glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm['GLMModel']
            coefficients = GLMModel['coefficients']
            validationsList = GLMModel['validations']
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, 'err', validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, '0', coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write('.')
            sys.stdout.flush() 
Example #4
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = glmGridResult["destination_key"]
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key, ":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    type = inspectGG["type"]
    if "unparsed" in type:
        print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
        print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG["response"]  # dict
    ### rows = inspectGG['rows']
    value_size_bytes = inspectGG["value_size_bytes"]

    model0 = glmGridResult["models"][0]
    alpha = model0["alpha"]
    area_under_curve = model0["area_under_curve"]
    error_0 = model0["error_0"]
    error_1 = model0["error_1"]
    key = model0["key"]
    print "best GLM model key:", key

    glm_lambda = model0["lambda"]

    # now indirect to the GLM result/model that's first in the list (best)
    inspectGLM = h2o_cmd.runInspect(None, key)
    h2o.verboseprint("GLMGrid inspectGLM:", h2o.dump_json(inspectGLM))
    simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)
Example #5
    def test_exec2_fast_locks(self):
        csvPathname = 'iris/iris2.csv'
        src_key='iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these return values look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        for trial in range (1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                    src_key=src_key, timeoutSecs=10)
                # just as a reminder of what these return values look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse is unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key,
                delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)
            execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)
            
        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example #6
        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            start = time.time()  # reset so elapsed2 measures only the SpeeDRF run
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult, ntrees=ntrees, max_depth=40, response=response,
                timeoutSecs=600, retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large..basically we can't check very well for this dataset"
            for i, (j,k) in enumerate(zip(classErrorPctList1, classErrorPctList2)):
                print "classErrorPctList[%s]:i %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i], 
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(totalError1, totalError2, delta=.2 * totalError2, msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(elapsed1, elapsed2, delta=.5 * elapsed2, msg="Comparing RF times for DRF2 and SpeeDRF")
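The "within 20%" check above is expressed through assertAlmostEqual with a relative delta. The same idea as a plain predicate, as a hedged sketch (the helper name is illustrative):

def within_pct(a, b, pct=0.20):
    # true when a is within pct (relative to b) of b
    return abs(a - b) <= pct * abs(b)

print within_pct(0.11, 0.10)  # True: 10% apart, inside the 20% band
print within_pct(0.13, 0.10)  # False: 30% apart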
Example #7
def glm_score(self, csvFilename, csvPathname, modelKey, thresholds="0.5",
    timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    key2 = csvFilename + ".hex"
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"
    x = ""
    kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, thresholds=thresholds,
        timeoutSecs=timeoutSecs)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))
    ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # Compare this glm to the first one. Since the files are replications,
    # the results should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    validation = glmScore['validation']
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validation, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validation)
Example #8
    def tryThemAll(self,set,rows):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            for tokenCase in range(len(self.tokenChangeDict)):
                newRows1 = self.changeTokens(rows,tokenCase)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    parseResult = h2i.import_parse(path=csvPathname, schema='local', noPrint=not h2o.verbose)
                    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
                    print "\n" + csvPathname, \
                        "    num_rows:", "{:,}".format(inspect['num_rows']), \
                        "    num_cols:", "{:,}".format(inspect['num_cols'])
                    num_rows = inspect['num_rows']
                    num_cols = inspect['num_cols']
                    self.assertEqual(num_cols, 4, "Parsed wrong number of cols: %s" % num_cols)
                    self.assertEqual(num_rows, 29, "Parsed wrong number of rows: %s" % num_rows)

                    h2o_cmd.runRF(parseResult=parseResult, trees=1, 
                        timeoutSecs=10, retryDelaySecs=1.0, noPrint=True)
                    h2o.verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #9
    def test_import_file(self):
        timeoutSecs = 500
        cAll = [
            'smalldata/jira/v-3.csv',
            'smalldata/jira/v-3.csv',
            'smalldata/jira/v-3.csv',
            'smalldata/jira/v-3.csv',
            ]

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        for c in cAll:

            for i in range(10):
                # race between remove and import?
                csvPathname = h2o.find_file(c)
                h2o.nodes[0].remove_all_keys()
                importResult = h2o.nodes[0].import_files(csvPathname, timeoutSecs=15)
                h2o.verboseprint(h2o.dump_json(importResult))
                files = importResult['files']
                keys = importResult['keys']
                fails = importResult['fails']
                dels = importResult['dels']

                if len(files) == 0:
                    raise Exception("empty files: %s after import" % files)
                if len(keys) == 0:
                    raise Exception("empty keys: %s after import" % keys)
                if len(fails) != 0:
                    raise Exception("non-empty fails: %s after import" % fails)
                if len(dels) != 0:
                    raise Exception("non-empty dels: %s after import" % dels)
Example #10
def exec_list(exprList, lenNodes, csvFilename, key2):
        h2e.exec_zero_list(zeroList)
        # start with trial = 1 because trial-1 is used to point to Result0 which must be initted
        trial = 1
        while (trial < 100):
            for exprTemplate in exprList:
                # do each expression at a random node, to facilitate key movement
                nodeX = random.randint(0,lenNodes-1)
                colX = random.randint(1,54)
                # FIX! should tune this for covtype20x vs 200x vs covtype.data..but for now
                row = str(random.randint(1,400000))

                execExpr = h2e.fill_in_expr_template(exprTemplate, colX, trial, row, key2)
                execResultInspect = h2e.exec_expr(h2o.nodes[nodeX], execExpr, 
                    resultKey="Result"+str(trial)+".hex", timeoutSecs=60)

                eri0 = execResultInspect[0]
                eri1 = execResultInspect[1]
                columns = eri0.pop('cols')
                columnsDict = columns[0]
                print "\nexecResult columns[0]:", h2o.dump_json(columnsDict)
                print "\nexecResult [0]:", h2o.dump_json(eri0)
                print "\nexecResult [1] :", h2o.dump_json(eri1)
                
                min_value = columnsDict["min"]
                h2o.verboseprint("min_value: ", min_value, "trial:", trial)
                ### self.assertEqual(float(min_value), float(trial), "what can we check here")

                ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                # slows things down to check every iteration, but good for isolation
                h2o.check_sandbox_for_errors()
                print "Trial #", trial, "completed\n"
                trial += 1
Example #11
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        for x in [10000]:
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            # the algorithm for creating the path and filename is hardwired in parity.pl, i.e.:
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        trial = 1
        for x in xrange (1,10,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            # just use one file for now
            csvFilename = "parity_128_4_" + str(10000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            parseResult = h2i.import_parse(path=csvPathname, schema='put')

            h2o.verboseprint("Trial", trial)
            h2o_cmd.runRF(parseResult=parseResult, trees=237, depth=45, timeoutSecs=480)

            # don't change tree count yet
            ## trees += 10
            ### timeoutSecs += 2
            trial += 1
Example #12
 def parseS3File(self, s3bucket, filename, **kwargs):
     start      = time.time()
     parseKey   = h2o_cmd.parseS3File(bucket=s3bucket, filename=filename, **kwargs)
     parse_time = time.time() - start 
     h2o.verboseprint("py-S3 parse took {0} sec".format(parse_time))
     parseKey['python_call_timer'] = parse_time
     return parseKey
Example #13
def simpleCheckGBMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = glmGridResult['destination_key']
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    #type = inspectGG['type']
    #if 'unparsed' in type:
    #    print "Warning: GBM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
    #    print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG['response'] # dict
    ### rows = inspectGG['rows']
    #value_size_bytes = inspectGG['value_size_bytes']

    model0 = glmGridResult['models'][0]
    alpha = model0['alpha']
    area_under_curve = model0['area_under_curve']
    error_0 = model0['error_0']
    error_1 = model0['error_1']
    model_key = model0['key']
    print "best GBM model key:", model_key

    glm_lambda = model0['lambda']

    # now indirect to the GBM result/model that's first in the list (best)
    inspectGBM = h2o_cmd.runInspect(None, model_key)
    h2o.verboseprint("GBMGrid inspectGBM:", h2o.dump_json(inspectGBM))
    simpleCheckGBM(self, inspectGBM, colX, allowFailWarning=allowFailWarning, **kwargs)
Example #14
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = (
                "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad " + SYNDATASETS_DIR
            )
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 3):
            sys.stdout.write(".")
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30
            )

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=10000, depth=2, timeoutSecs=900, retryDelaySecs=3)
            print "RF #", trial, "end on ", csvFilename, "took", time.time() - start, "seconds"

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #15
    def tryThemAll(self,set,rows):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            for tokenCase in range(len(self.tokenChangeDict)):
                newRows1 = self.changeTokens(rows,tokenCase)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    if "'" in self.tokenChangeDict[tokenCase][0]:
                        single_quotes = 1
                    else:
                        single_quotes = 0
                    parseResult = h2i.import_parse(path=csvPathname, schema='put', single_quotes=single_quotes,
                        noPrint=not h2o.verbose)

                    if DO_RF:
                        h2o_cmd.runRF(parseResult=parseResult, trees=1, timeoutSecs=30, retryDelaySecs=0.1)
                    h2o.verboseprint("Set", set)
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #16
    def test_1ktrees_job_cancel_many_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range (1,5):
            # random 0 or 1 delay
            delay = random.uniform(0,1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, rfView=False, noPoll=True, timeoutSecs=30, retryDelaySecs=0.25)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult, trees=trial, max_depth=50, timeoutSecs=600, retryDelaySecs=3)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example #17
def exec_expr_list_across_cols(lenNodes, exprList, keyX, 
    minCol=0, maxCol=54, timeoutSecs=10, incrementingResult=True):
    colResultList = []
    for colX in range(minCol, maxCol):
        for i, exprTemplate in enumerate(exprList):

            # do each expression at a random node, to facilitate key movement
            # UPDATE: all execs are to a single node. No mixed node streams
            # eliminates some store/store race conditions that caused problems.
            # always go to node 0 (forever?)
            if lenNodes is None:
                execNode = 0
            else:
                ### execNode = random.randint(0,lenNodes-1)
                ### print execNode
                execNode = 0

            execExpr = fill_in_expr_template(exprTemplate, colX, colX, 0, keyX)
            if incrementingResult: # the Result<col> pattern
                resultKey = "Result"+str(colX)
            else: # assume it's a re-assign to self
                resultKey = keyX

            # kbn

            # v1
            # execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, resultKey, timeoutSecs)
            # v2
            execResultInspect = exec_expr(h2o.nodes[execNode], execExpr, None, timeoutSecs)
            print "\nexecResult:", h2o.dump_json(execResultInspect)
            execResultKey = execResultInspect[0]['key']

            # v2: Exec2 'apply' can have no key field? (null) maybe just use keyX then
            if execResultKey:
                resultInspect = h2o_cmd.runInspect(None, execResultKey)
            else:
                resultInspect = h2o_cmd.runInspect(None, keyX)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

            # min is a builtin. shouldn't shadow it.
            if incrementingResult: # a col will have a single min
                min_value = checkScalarResult(execResultInspect, resultKey)
                h2o.verboseprint("min_value: ", min_value, "col:", colX)
                print "min_value: ", min_value, "col:", colX
            else:
                min_value = None

            sys.stdout.write('.')
            sys.stdout.flush()

            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # slows things down to check every iteration, but good for isolation
            if (h2o.check_sandbox_for_errors()):
                raise Exception(
                    "Found errors in sandbox stdout or stderr, on col #%s." % colX)

        print "Column #", colX, "completed\n"
        colResultList.append(min_value)

    return colResultList
Example #18
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            cmd.runRFOnly(parseKey=parseKey, trees=1000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #19
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
    destination_key = glmGridResult['destination_key']
    inspectGG = h2o_cmd.runInspect(None, destination_key)
    h2o.verboseprint("Inspect of destination_key", destination_key,":\n", h2o.dump_json(inspectGG))

    # FIX! currently this is all unparsed!
    #type = inspectGG['type']
    #if 'unparsed' in type:
    #    print "Warning: GLM Grid result destination_key is unparsed, can't interpret. Ignoring for now"
    #    print "Run with -b arg to look at the browser output, for minimal checking of result"

    ### cols = inspectGG['cols']
    response = inspectGG['response'] # dict
    ### rows = inspectGG['rows']
    #value_size_bytes = inspectGG['value_size_bytes']

    # FIX! does error_0/1 only exist for binomial?
    for m, model in enumerate(glmGridResult['models']):
        alpha = model['alpha']
        area_under_curve = model['area_under_curve']
        # FIX! should check max error?
        error_0 = model['error_0']
        error_1 = model['error_1']
        model_key = model['key']
        print "#%s GLM model key: %s" % (m, model_key)
        glm_lambda = model['lambda']

    # now indirect to the GLM result/model that's first in the list (best)
    inspectGLM = h2o_cmd.runInspect(None, glmGridResult['models'][0]['key'])
    h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(inspectGLM))
    g = simpleCheckGLM(self, inspectGLM, colX, allowFailWarning=allowFailWarning, **kwargs)

    return g
Example #20
    def test_1ktrees_job_cancel_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

        print "Kick off twenty, then cancel them all..there's a timeout on the wait after cancelling"
        for trial in range (1,20):
            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=trial, depth=50, rfView=False, noPoll=True,
                timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "started on ", csvFilename, 'took', time.time() - start, 'seconds'


        h2o.check_sandbox_for_errors()
        h2o_jobs.cancelAllJobs(timeoutSecs=10)
Example #21
    def test_rf_1ktrees_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [500]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in range (1,5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult, trees=1000, max_depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'

        print "Waiting 60 secs for TIME_WAIT sockets to go away"
        time.sleep(60)
Example #22
    def test_exec_filter_slice2(self):
        timeoutSecs = 10
        csvFilename = "covtype.data"
        csvPathname = 'UCI/UCI-large/covtype/covtype.data'
        hex_key = 'c'

        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key=hex_key, 
            timeoutSecs=10)

        print csvFilename, 'parse time:', parseResult['response']['time']
        print "Parse result['desination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

        for trial in range(10):
            print "Doing the execs in order, to feed filters into slices"
            nodeX = 0
            for exprTemplate in exprList:
                execExpr = h2e.fill_in_expr_template(exprTemplate, colX=0, n=0, row=1, keyX=hex_key, m=2)
                time.sleep(2)
                h2o.check_sandbox_for_errors()

                execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr, 
                    resultKey="Result.hex", timeoutSecs=4)
                print "min_value:", min_value, "execExpr:", execExpr
                h2o.verboseprint("min: ", min_value, "trial:", trial)
Example #23
    def test_GLM2_model_key_unique(self):
        h2o.beta_features = True
        modelKeyDict = {}
        for trial in range (1,5):
            csvPathname = 'iris/iris2.csv'
            start = time.time()
            # make sure each parse is a unique dest key (not in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', 
                hex_key=hex_key, timeoutSecs=10)
            y = 4
            execExpr="%s[,%s]=(%s[,%s]==%s)" % (hex_key, y+1, hex_key, y+1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
            
            # h2o.py now sets destination_key for a fixed default model name, 
            # we want h2o to create model names for this test, so use none here
            kwargs = {'destination_key': None, 'response':4, 'family': 'gaussian'}
            glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=10, noPoll=True, **kwargs )
            print "GLM #%d" % trial,  "started on ", csvPathname, 'took', time.time() - start, 'seconds'

            model_key = glmResult['destination_key']
            print "GLM model_key:", model_key
            if model_key in modelKeyDict:
                raise Exception("same model_key used in GLM #%d that matches prior GLM #%d" % (trial, modelKeyDict[model_key]))
            modelKeyDict[model_key] = trial

        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example #24
    def tryThemAll(self, set, rows, enumsOnly=False):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            if enumsOnly:
                tcd = self.tokenChangeDictEnumsOnly
            else:
                tcd = self.tokenChangeDict

            for tokenCase in range(len(tcd)):
                newRows1 = self.changeTokens(rows, tokenCase, tcd)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1,sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname,newRows2,eol)
                    if "'" in self.tokenChangeDict[tokenCase]:
                        single_quotes = 1
                    else:
                        single_quotes = 0
                    parseResult = h2i.import_parse(path=csvPathname, schema='put', single_quotes=single_quotes,
                        noPrint=not h2o.verbose)

                    h2o_cmd.runRF(parseResult=parseResult, trees=1,
                        timeoutSecs=10, retryDelaySecs=0.1, noPrint=True, print_params=True)
                    h2o.verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #25
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 "+ str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(),4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"  

        # always match the gen above!
        for trial in xrange (1,3,1):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"  
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            key = h2o.nodes[0].put_file(csvPathname)
            parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")

            h2o.verboseprint("Trial", trial)
            start = time.time()
            cmd.runRFOnly(parseKey=parseKey, trees=10000, depth=2, timeoutSecs=600, retryDelaySecs=3)
            print "RF #", trial,  "end on ", csvFilename, 'took', time.time() - start, 'seconds'
Example #26
    def test_exec2_result_race(self):
        ### h2b.browseTheCloud()

        lenNodes = len(h2o.nodes)
        # zero the list of Results using node[0]
        # FIX! is the zerolist not being seen correctly? is it not initializing to non-zero?
        for execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey="Result.hex", timeoutSecs=20)
            ### print "\nexecResult:", execResult

        trial = 0
        while (trial < 200):
            for execExpr in exprList:
                # for the first 100 trials: do each expression at node 0,
                # for the second 100 trials: do each expression at a random node, to facilitate key movement
                # FIX! there's some problem with the initList not taking if rotated amongst nodes?
                if (trial < 100):
                    nodeX = 0
                else:
                    nodeX = random.randint(0,lenNodes-1)
                
                resultKey = "Result.hex"
                execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                    resultKey=resultKey, timeoutSecs=20)

                print min_value, execExpr
                h2o.verboseprint("min_value: ", min_value, "trial:", trial)

                ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                trial += 1
Example #27
def runRFOnly(node=None, parseKey=None, trees=5, 
        timeoutSecs=20, retryDelaySecs=2, rfview=True, noise=None, noPrint=False, **kwargs):
    if not parseKey: raise Exception('No parsed key for RF specified')
    if not node: node = h2o.nodes[0]
    #! FIX! what else is in parseKey that we should check?
    h2o.verboseprint("runRFOnly parseKey:", parseKey)
    Key = parseKey['destination_key']
    rf = node.random_forest(Key, trees, timeoutSecs, **kwargs)

    # FIX! check all of these somehow?
    # if model_key was given to rf via **kwargs, remove it, since we're passing 
    # model_key from rf. can't pass it in two places. (ok if it doesn't exist in kwargs)
    data_key  = rf['data_key']
    kwargs.pop('model_key',None)
    model_key = rf['model_key']
    # /ip:port of cloud (can't use h2o name)
    rfCloud = rf['response']['h2o']

    # same thing. if we use random param generation and have ntree in kwargs, get rid of it.
    kwargs.pop('ntree',None)

    # this is important. it's the only accurate value for how many trees RF was asked for.
    ntree    = rf['ntree']

    # the response variable RF used
    rfClass = rf['response_variable']

    rfViewResult = None
    if rfview:
        rfViewResult = runRFView(node, data_key, model_key, ntree, 
            timeoutSecs, retryDelaySecs, noise=noise, noPrint=noPrint, **kwargs)
    
    return rfViewResult
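A hedged usage sketch for runRFOnly; the file and key names are illustrative, and parseFile is the harness call used by other examples here:

parseKey = h2o_cmd.parseFile(csvPathname='syn.data', key2='syn.hex')  # names illustrative
rfViewResult = runRFOnly(parseKey=parseKey, trees=10, timeoutSecs=120)
print "RFView result:", rfViewResult is not None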
Example #28
def glm_score(self, csvFilename, bucket, csvPathname, modelKey, modelPathname, timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"


    # save and restore the model
    h2o.nodes[0].save_model(model=modelKey, path=modelPathname, force=1)
    # FIX! should we remove the existing key to make sure it loads? really should try both cases (existing or not)
    h2o.nodes[0].load_model(path=modelPathname)

    start = time.time()
    glmScore = h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey, 
        vactual=y, vpredict=1, expectedAuc=0.5, doAUC=False)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))

    # Compare this glm to the first one. Since the files are replications,
    # the results should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    if self.glmScore1:
        h2o_glm.compareToFirstGlm(self, 'mse', glmScore, self.glmScore1)
    else:
        self.glmScore1 = copy.deepcopy(glmScore)
Example #29
    def test_GLM_from_import_hosts(self):
        if localhost:
            csvFilenameList = ["covtype.data"]
        else:
            csvFilenameList = [
                "covtype200x.data",
                "covtype200x.data",
                "covtype.data",
                "covtype.data",
                "covtype20x.data",
                "covtype20x.data",
            ]

        # a browser window too, just because we can
        h2b.browseTheCloud()

        importFolderPath = "/home/0xdiag/datasets/standard"
        validations1 = {}
        coefficients1 = {}
        for csvFilename in csvFilenameList:
            # have to re-import each iteration now, since the source key
            # is removed and if we re-parse it, it's not there
            h2i.setupImportFolder(None, importFolderPath, timeoutSecs=60)
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=2000)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(key=parseKey["destination_key"])
            print "\n" + csvFilename

            start = time.time()
            # can't pass lambda as kwarg because it's a python reserved word
            # FIX! just look at X=0:1 for speed, for now
            kwargs = {"y": 54, "n_folds": 2, "family": "binomial", "case": 1}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=2000, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            h2o.verboseprint("\nglm:", glm)
            h2b.browseJsonHistoryAsUrlLastMatch("GLM")

            GLMModel = glm["GLMModel"]
            coefficients = GLMModel["coefficients"]
            validationsList = GLMModel["validations"]
            validations = validationsList.pop()
            # validations['err']

            if validations1:
                h2o_glm.compareToFirstGlm(self, "err", validations, validations1)
            else:
                validations1 = copy.deepcopy(validations)

            if coefficients1:
                h2o_glm.compareToFirstGlm(self, "0", coefficients, coefficients1)
            else:
                coefficients1 = copy.deepcopy(coefficients)

            sys.stdout.write(".")
            sys.stdout.flush()
Example #30
def file_append(infile, outfile):
    h2o.verboseprint("\nAppending", infile, "to", outfile)
    start = time.time()
    # open the output in binary append mode to match the binary read,
    # and close both files when done
    in_file = open(infile, 'rb')
    out_file = open(outfile, 'ab')
    out_file.write(in_file.read())
    in_file.close()
    out_file.close()
    h2o.verboseprint("\nAppend took", (time.time() - start), "secs")
Example #31
def find_key(pattern=None):
    found = None
    kwargs = {'filter': pattern}
    storeViewResult = h2o.nodes[0].store_view(**kwargs)
    keys = storeViewResult['keys']
    if len(keys) == 0:
        return None

    if len(keys) > 1:
        h2o.verboseprint(
            "Warning: multiple imported keys match the key pattern given. Using: %s"
            % keys[0]['key'])

    return keys[0]['key']
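Typical use resolves a single imported key from a filter pattern; a hedged usage sketch (the pattern is illustrative):

key = find_key('covtype')
if key is None:
    raise Exception("no imported key matched 'covtype'")
print "using key:", key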
Example #32
def exec_expr(node,
              execExpr,
              resultKey="Result.hex",
              timeoutSecs=10,
              ignoreH2oError=False):
    start = time.time()
    # FIX! Exec has 'escape_nan' arg now. should we test?
    # 5/14/13 removed escape_nan=0
    resultExec = h2o_cmd.runExecOnly(node,
                                     expression=execExpr,
                                     timeoutSecs=timeoutSecs,
                                     ignoreH2oError=ignoreH2oError)
    h2o.verboseprint(resultExec)
    h2o.verboseprint('exec took', time.time() - start, 'seconds')
    ### print 'exec took', time.time() - start, 'seconds'

    h2o.verboseprint("\nfirst look at the default Result key")
    # new offset=-1 to get the metadata?
    defaultInspectM1 = h2o_cmd.runInspect(None, "Result.hex", offset=-1)
    checkScalarResult(defaultInspectM1, "Result.hex")

    h2o.verboseprint("\nNow look at the assigned " + resultKey + " key")
    resultInspectM1 = h2o_cmd.runInspect(None, resultKey, offset=-1)
    min_value = checkScalarResult(resultInspectM1, resultKey)

    return resultInspectM1, min_value
Example #33
    def test_rf_1ktrees_job_cancel_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()

            # without rfview, do we get the 'first' rf json?
            rfv = h2o_cmd.runRF(parseResult=parseResult,
                                trees=1000,
                                max_depth=2,
                                rfView=False,
                                timeoutSecs=600,
                                retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            # rf_model = rfv['drf_model']
            # data_key = rf_model['_dataKey']
            # model_key = rf_model['_key']
            data_key = rfv['source']['_key']
            model_key = rfv['destination_key']

            print "model_key:", model_key

            # FIX! need to get more intelligent here
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
Example #34
def check_cloud_and_setup_next():
    h2b.browseTheCloud()
    h2o.verify_cloud_size()
    h2o.check_sandbox_for_errors()
    print "Tearing down cloud of size", len(h2o.nodes)
    h2o.tear_down_cloud()
    # this will delete the flatfile in sandbox
    h2o.clean_sandbox()
    # wait to make sure no sticky ports or anything os-related
    # so let's expand the delay if larger number of jvms
    # 1 second per node seems good
    h2o.verboseprint("Waiting", node_count,
                     "seconds to avoid OS sticky port problem")
    time.sleep(node_count)
Example #35
    def test_A_putfile_to_all_nodes(self):

        csvfile = h2o.find_file(file_to_put())
        origSize = h2o.get_file_size(csvfile)

        # Putfile to each node and check the returned size
        for node in h2o.nodes:
            sys.stdout.write('.')
            sys.stdout.flush()
            h2o.verboseprint("put_file:", cvsfile, "node:", node, "origSize:",
                             origSize)
            key = node.put_file(cvsfile)
            resultSize = node.inspect(key)['value_size_bytes']
            self.assertEqual(origSize, resultSize)
Example #36
    def test_1ktrees_job_cancel_many_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2o_cmd.parseResult = h2i.import_parse(path=csvPathname,
                                                             schema='put',
                                                             hex_key=hex_key,
                                                             timeoutSecs=30)

        print "kick off jobs, then cancel them"
        for trial in range(1, 5):
            # random 0 or 1 delay
            delay = random.uniform(0, 1)
            time.sleep(delay)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trial,
                          max_depth=50,
                          rfView=False,
                          noPoll=True,
                          timeoutSecs=30,
                          retryDelaySecs=0.25)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            ### h2o_jobs.cancelAllJobs(timeoutSecs=10)
            h2o.check_sandbox_for_errors()

        # do one last good one
        rfView = h2o_cmd.runRF(parseResult=parseResult,
                               trees=trial,
                               max_depth=50,
                               timeoutSecs=600,
                               retryDelaySecs=3)
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trial)
Example #37
    def test_rf_1ktrees_job_cancel_3_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(
                    x) + " quad " + SYNDATASETS_DIR
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 20):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            parseResult = h2o_cmd.parseResult = h2i.import_parse(
                path=csvPathname,
                schema='put',
                hex_key=hex_key,
                timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            h2o_cmd.runRF(parseResult=parseResult,
                          trees=trial,
                          max_depth=2,
                          rfView=False,
                          timeoutSecs=600,
                          retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

            # FIX! need to get more intelligent here
            time.sleep(1)
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # "destination_key": "pytest_model",
            # FIX! using 'key': 'pytest_model" with no time delay causes a failure
            time.sleep(1)
            jobsList = a['jobs']
            for j in jobsList:
                b = h2o.nodes[0].jobs_cancel(key=j['key'])
                print "jobs_cancel():", h2o.dump_json(b)
Example #38
    def tryThemAll(self, set, rows):
        for eolCase in range(len(self.eolDict)):
            eol = self.eolDict[eolCase]
            # change tokens must be first
            for tokenCase in range(len(self.tokenChangeDict)):
                newRows1 = self.changeTokens(rows, tokenCase)
                for sepCase in range(len(self.sepChangeDict)):
                    newRows2 = self.changeSep(newRows1, sepCase)
                    csvPathname = SYNDATASETS_DIR + '/parsetmp_' + \
                        str(set) + "_" + \
                        str(eolCase) + "_" + \
                        str(tokenCase) + "_" + \
                        str(sepCase) + \
                        '.data'
                    self.writeRows(csvPathname, newRows2, eol)

                    # use the single_quotes param if single quote in the
                    # tokenCase (creates token wrapper)
                    if "'" in self.tokenChangeDict[tokenCase][0]:
                        single_quotes = 1
                    else:
                        single_quotes = 0
                    parseResult = h2i.import_parse(path=csvPathname,
                                                   schema='local',
                                                   single_quotes=single_quotes,
                                                   noPrint=not h2o.verbose)
                    inspect = h2o_cmd.runInspect(
                        key=parseResult['destination_key'])
                    print "\n" + csvPathname, \
                        "    num_rows:", "{:,}".format(inspect['num_rows']), \
                        "    num_cols:", "{:,}".format(inspect['num_cols'])
                    num_rows = inspect['num_rows']
                    num_cols = inspect['num_cols']
                    self.assertEqual(
                        num_cols, 4,
                        "Parsed wrong number of cols: %s" % num_cols)
                    self.assertEqual(
                        num_rows, 29,
                        "Parsed wrong number of rows: %s" % num_rows)

                    h2o_cmd.runRF(parseResult=parseResult,
                                  trees=1,
                                  timeoutSecs=10,
                                  retryDelaySecs=1.0,
                                  noPrint=True)
                    h2o.verboseprint("Set", set)
                    h2o.check_sandbox_for_errors()
                    sys.stdout.write('.')
                    sys.stdout.flush()
Example #39
 def changeTokens(self, rows, tokenCase):
     [cOpen, cClose] = self.tokenChangeDict[tokenCase]
     newRows = []
     for r in rows:
         # don't quote lines that start with #
          # can we quote lines that start with some spaces or tabs? maybe
         comment = re.match(r'^[ \t]*#', r)
         empty = re.match(r'^$', r)
         if not (comment or empty):
             r = re.sub('^', cOpen, r)
             r = re.sub('\|', cClose + '|' + cOpen, r)
             r = re.sub('$', cClose, r)
         h2o.verboseprint(r)
         newRows.append(r)
     return newRows
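For a tokenChangeDict entry like ["'", "'"], each field of a non-comment, non-empty row gets wrapped. A standalone, runnable illustration of the same transformation (the token pair is an assumption, not tied to the class above):

import re

def change_tokens(rows, cOpen, cClose):
    # standalone version of the method above, for illustration
    newRows = []
    for r in rows:
        comment = re.match(r'^[ \t]*#', r)
        empty = re.match(r'^$', r)
        if not (comment or empty):
            r = re.sub('^', cOpen, r)
            r = re.sub(r'\|', cClose + '|' + cOpen, r)
            r = re.sub('$', cClose, r)
        newRows.append(r)
    return newRows

print change_tokens(["1|2|3", "# a comment", ""], "'", "'")
# -> ["'1'|'2'|'3'", '# a comment', '']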
Example #40
def runRFOnly(node=None,
              parseKey=None,
              trees=5,
              timeoutSecs=20,
              retryDelaySecs=2,
              rfView=True,
              noise=None,
              noPrint=False,
              **kwargs):
    if not parseKey: raise Exception('No parsed key for RF specified')
    if not node: node = h2o.nodes[0]
    #! FIX! what else is in parseKey that we should check?
    h2o.verboseprint("runRFOnly parseKey:", parseKey)
    Key = parseKey['destination_key']
    rf = node.random_forest(Key, trees, timeoutSecs, **kwargs)

    if h2o.beta_features and rfView == False:
        # just return for now
        return rf
    # FIX! check all of these somehow?
    # if model_key was given to rf via **kwargs, remove it, since we're passing
    # model_key from rf. can't pass it in two places. (ok if it doesn't exist in kwargs)
    data_key = rf['data_key']
    kwargs.pop('model_key', None)
    model_key = rf['model_key']
    rfCloud = rf['response']['h2o']

    # same thing. if we use random param generation and have ntree in kwargs, get rid of it.
    kwargs.pop('ntree', None)

    # this is important. it's the only accurate value for how many trees RF was asked for.
    ntree = rf['ntree']
    response_variable = rf['response_variable']
    if rfView:
        # ugly..we apparently pass/use response_variable in RFView, gets passed thru kwargs here
        # print kwargs['response_variable']
        rfViewResult = runRFView(node,
                                 data_key,
                                 model_key,
                                 ntree,
                                 timeoutSecs,
                                 retryDelaySecs,
                                 noise=noise,
                                 noPrint=noPrint,
                                 **kwargs)
        return rfViewResult
    else:
        return rf
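
A minimal usage sketch (file names illustrative), in the style of the parity tests later in this listing:

parseKey = h2o_cmd.parseFile(csvPathname='/tmp/some.csv', key2='some.hex', timeoutSecs=30)
rfViewResult = runRFOnly(parseKey=parseKey, trees=5, timeoutSecs=60)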
Example No. 41
File: h2o_rf.py Project: zed9/h2o
def scoreRF(scoreParseKey, trainResult, **kwargs):
    # Run validation on dataset
    rfModelKey = trainResult['model_key']
    ntree = trainResult['ntree']

    start = time.time()
    data_key = scoreParseKey['destination_key']
    scoreResult = h2o_cmd.runRFView(None, data_key, rfModelKey, ntree,
                                    **kwargs)

    rftime = time.time() - start
    h2o.verboseprint("RF score results: ", scoreResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))

    scoreResult['python_call_timer'] = rftime
    return scoreResult
Example No. 42
    def testCloud(self):
        base_port = 54300
        ports_per_node = 2
        for tryNodes in range(2,8):
            sys.stdout.write('.')
            sys.stdout.flush()

            start = time.time()
            h2o.build_cloud(use_this_ip_addr="127.0.0.1", 
                base_port=base_port, node_count=tryNodes, 
                timeoutSecs=30, retryDelaySecs=2, java_heap_GB=1)
            print "Build cloud of %d in %d secs" % (tryNodes, (time.time() - start)) 

            h2o.verboseprint(h2o.nodes)
            h2o.verify_cloud_size()
            h2o.tear_down_cloud(h2o.nodes)
Example No. 43
def wait_for_live_port(ip, port, retries=3):
    h2o.verboseprint("Waiting for {0}:{1}, requiring {2} consecutive live checks...".format(
        ip, port, retries))
    if not port_live(ip, port):
        # require 'retries' consecutive successful checks before declaring the
        # port up. also bound the total wait (the 60-check cap is an added
        # assumption) so a port that never comes up can't spin forever.
        count = 0
        checks = 0
        while count < retries and checks < 60:
            if port_live(ip, port):
                count += 1
            else:
                count = 0
            checks += 1
            time.sleep(1)
            dot()
    if not port_live(ip, port):
        raise Exception(
            "[h2o_cmd] Error waiting for {0}:{1} ({2} consecutive live checks)...".format(
                ip, port, retries))
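
A minimal usage sketch, assuming port_live() and dot() are defined in the same module and that a node was just launched on the (illustrative) address below:

# block until 127.0.0.1:54321 answers 3 consecutive liveness checks
wait_for_live_port("127.0.0.1", 54321, retries=3)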
Example No. 44
    def parseFile(self, bucket, pathname, timeoutSecs, header, **kwargs):
        # this can get redirected
        if USE_LOCAL:
            schema = 'local'
        else:
            schema = 's3n'

        start = time.time()
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=pathname,
                                       schema=schema,
                                       timeoutSecs=timeoutSecs)
        parse_time = time.time() - start
        h2o.verboseprint("parse took {0} sec".format(parse_time))
        parseResult['python_call_timer'] = parse_time
        return parseResult
Example No. 45
    def test_Cloud(self):
        # FIX! weird timeout H2O exceptions with >8 nodes? maybe we
        # shouldn't go that high; don't know if we care
        base_port = 54300
        ports_per_node = 2
        for tryNodes in range(2,17):
            h2o.verboseprint("Trying cloud of", tryNodes)
            sys.stdout.write('.')
            sys.stdout.flush()

            start = time.time()
            h2o.build_cloud(tryNodes, base_port=base_port, 
                retryDelaySecs=2, timeoutSecs=max(30,10*tryNodes), java_heap_GB=1)
            print "Built cloud of %d in %d s" % (tryNodes, (time.time() - start)) 
            h2o.verify_cloud_size()
            h2o.tear_down_cloud()
Example No. 46
    def test_F_no_mc_loop(self):
        print "\nwith flatfile, with multicast disabled, and RF, 5 trials"
        allAcceptIptables()
        multicastDropReceiveIptables()
        showIptables()
        csvPathname = h2o.find_file('smalldata/poker/poker1000')

        for x in range(1, 6):  # 5 trials, matching the print above
            h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
            h2o_cmd.runRF(trees=50, timeoutSecs=10, csvPathname=csvPathname)
            h2o.tear_down_cloud()
            h2o.verboseprint("Waiting", nodes_per_host,
                             "seconds to avoid OS sticky port problem")
            time.sleep(nodes_per_host)
            print "Trial", x
            sys.stdout.write('.')
            sys.stdout.flush()
Example No. 47
    def test_rand_inspect(self):
        ### h2b.browseTheCloud()
        csvFilename = 'covtype.data'
        csvPathname = h2o.find_dataset('UCI/UCI-large/covtype/'+ csvFilename)
        print "\n" + csvPathname

        parseKey = h2o_cmd.parseFile(None, csvPathname, key=csvFilename, timeoutSecs=10)
        destination_key = parseKey['destination_key']
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", destination_key 

        def inspect_and_check(nodeX,destination_key,offset,view,inspect=None):
            inspectNew = h2o_cmd.runInspect(h2o.nodes[nodeX], destination_key, offset=offset, view=view)
            # FIX! get min/max/mean/variance for a col too?
            constantNames = [
                'num_cols',
                'num_rows',
                ]
            if inspect is not None:
                for i in constantNames:
                    self.assertEqual(inspect[i], inspectNew[i])

            return inspectNew

        # going to use this to compare against future. num_rows/num_cols should always
        # be the same, regardless of the view. just a coarse sanity check
        origInspect = inspect_and_check(0,destination_key,0,1)
        h2o.verboseprint(h2o.dump_json(origInspect))

        num_rows = origInspect['num_rows']
        num_cols = origInspect['num_cols']

        lenNodes = len(h2o.nodes)
        for i in range (1000):
            # we want to use the boundary conditions, so have two levels of random choices
            offset = good_choices(num_rows)
            view = good_choices(num_cols)
            # randomize the node used
            nodeX = random.randint(0,lenNodes-1)
            print "nodeX:", nodeX, "offset:", offset, "view:", view
            inspect_and_check(nodeX,destination_key,offset,view,origInspect)

            # do it again, once in a while
            r = random.randint(0,10)
            if (r==0):
                inspect_and_check(nodeX,destination_key,offset,view,origInspect)
Example No. 48
        def doBoth():
            h2o.verboseprint("Trial", trial)
            start = time.time()
            # make sure ntrees and max_depth are the same for both
            rfView = h2o_cmd.runRF(parseResult=parseResult,
                                   ntrees=ntrees,
                                   max_depth=40,
                                   response=response,
                                   timeoutSecs=600,
                                   retryDelaySecs=3)
            elapsed1 = time.time() - start
            (totalError1, classErrorPctList1,
             totalScores1) = h2o_rf.simpleCheckRFView(rfv=rfView)

            # restart the timer so elapsed2 measures only the SpeeDRF run
            start = time.time()
            rfView = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                        ntrees=ntrees,
                                        max_depth=40,
                                        response=response,
                                        timeoutSecs=600,
                                        retryDelaySecs=3)
            elapsed2 = time.time() - start
            (totalError2, classErrorPctList2,
             totalScores2) = h2o_rf.simpleCheckRFView(rfv=rfView)

            print "Checking that results are similar (within 20%)"
            print "DRF2 then SpeeDRF"
            print "per-class variance is large..basically we can't check very well for this dataset"
            for i, (j,
                    k) in enumerate(zip(classErrorPctList1,
                                        classErrorPctList2)):
                print "classErrorPctList[%s]:i %s %s" % (i, j, k)
                # self.assertAlmostEqual(classErrorPctList1[i], classErrorPctList2[i],
                #    delta=1 * classErrorPctList2[i], msg="Comparing RF class %s errors for DRF2 and SpeeDRF" % i)

            print "totalError: %s %s" % (totalError1, totalError2)
            self.assertAlmostEqual(
                totalError1,
                totalError2,
                delta=.2 * totalError2,
                msg="Comparing RF total error for DRF2 and SpeeDRF")
            print "elapsed: %s %s" % (elapsed1, elapsed2)
            self.assertAlmostEqual(
                elapsed1,
                elapsed2,
                delta=.5 * elapsed2,
                msg="Comparing RF times for DRF2 and SpeeDRF")
Example No. 49
    def test_F_no_mc_loop(self):
        print "\nwith flatfile, with multicast disabled, and RF, 5 trials"
        allAcceptIptables()
        multicastDropReceiveIptables()
        showIptables()

        for x in range(1, 6):  # 5 trials, matching the print above
            h2o_hosts.build_cloud_with_hosts(nodes_per_host, use_flatfile=True)
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
            h2o_cmd.runRF(parseResult=parseResult, trees=50, timeoutSecs=10)
            h2o.tear_down_cloud()
            h2o.verboseprint("Waiting", nodes_per_host,
                "seconds to avoid OS sticky port problem")
            time.sleep(nodes_per_host)
            print "Trial", x
            sys.stdout.write('.')
            sys.stdout.flush()
Example No. 50
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in xrange(1, 20, 1):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # broke out the put separately so we can iterate a test just on the RF
            key = h2o.nodes[0].put_file(csvPathname)
            parseKey = h2o.nodes[0].parse(key, key + "_" + str(trial) + ".hex")

            h2o.verboseprint("Trial", trial)
            start = time.time()
            # rfView=False inhibits waiting for RFView completion
            h2o_cmd.runRFOnly(parseKey=parseKey,
                              trees=trial,
                              depth=2,
                              rfView=False,
                              timeoutSecs=600,
                              retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

            # FIX! need to get more intelligent here
            time.sleep(1)
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # "destination_key": "pytest_model",
            # FIX! using 'key': 'pytest_model" with no time delay causes a failure
            time.sleep(1)
            jobsList = a['jobs']
            for j in jobsList:
                b = h2o.nodes[0].jobs_cancel(key=j['key'])
                print "jobs_cancel():", h2o.dump_json(b)
Example No. 51
def simpleCheckKMeans(self, kmeans, **kwargs):
    ### print h2o.dump_json(kmeans)
    warnings = None
    if 'warnings' in kmeans:
        warnings = kmeans['warnings']
        # catch the "Failed to converge" warning for now
        x = re.compile("[Ff]ailed")
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w): raise Exception(w)

    # Check other things in the json response dictionary 'kmeans' here
    if h2o.beta_features:
        destination_key = kmeans['model']['_key']
        # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame

        # can't use inspect on a model key? now?
        kmeansResult = kmeans
    else:
        destination_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=destination_key)

    if h2o.beta_features:
        model = kmeansResult['model']
        clusters = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        iterations = model["iterations"]
        normalized = model["normalized"]
        max_iter = model["max_iter"]
    else:
        h2o.verboseprint('kmeans result:', h2o.dump_json(kmeansResult))
        model = kmeansResult['KMeansModel']
        clusters = model['clusters']
        error = model["error"]

    for i, c in enumerate(clusters):
        for n in c:
            if math.isnan(float(n)):
                raise Exception("center", i, "has NaN:", n, "center:", c)

    # shouldn't have any errors
    h2o.check_sandbox_for_errors()

    return warnings
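
Hypothetical usage from inside a test, assuming a runKMeans helper analogous to runRF (the helper name and k value are assumptions, not taken from this listing):

kmeans = h2o_cmd.runKMeans(parseResult=parseResult, k=3, timeoutSecs=60)
warnings = simpleCheckKMeans(self, kmeans, k=3)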
Example No. 52
    def test(n, tries=None):
        rfView = n.random_forest_view(data_key,
                                      model_key,
                                      timeoutSecs,
                                      noise=noise,
                                      **kwargs)
        status = rfView['response']['status']
        numberBuilt = rfView['trees']['number_built']

        if status == 'done':
            if numberBuilt != ntree:
                raise Exception("RFView done but number_built!=ntree: %s %s" %
                                (numberBuilt, ntree))
            return True
        if status != 'poll': raise Exception('Unexpected status: ' + status)

        progress = rfView['response']['progress']
        progressTotal = rfView['response']['progress_total']

        # want to double check all this because it's new
        # and we had problems with races/doneness before
        errorInResponse = \
            numberBuilt<0 or ntree<0 or numberBuilt>ntree or \
            progress<0 or progressTotal<0 or progress>progressTotal or \
            ntree!=rfView['ntree']
        ## progressTotal!=ntree or
        # rfView better always agree with what RF ntree was

        if errorInResponse:
            raise Exception("\nBad values in response during RFView polling.\n" +
                "progress: %s, progressTotal: %s, ntree: %s, numberBuilt: %s, status: %s" % \
                (progress, progressTotal, ntree, numberBuilt, status))

        # don't print the useless first poll.
        # UPDATE: check for 'not poll' rather than 'done'; looking for 'done' missed completion
        if (status == 'poll'):
            if numberBuilt == 0:
                h2o.verboseprint(".")
            else:
                h2o.verboseprint(
                    "\nRFView polling #", tries,
                    "Status: %s. %s trees done of %s desired" %
                    (status, numberBuilt, ntree))

        return (status != 'poll')
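
An illustrative driver for the closure above, not the framework's actual poll loop; maxTries and retryDelaySecs are assumed names:

for tries in range(1, maxTries + 1):
    if test(h2o.nodes[0], tries=tries):
        break
    time.sleep(retryDelaySecs)
else:
    raise Exception("RFView didn't complete after %s polls" % maxTries)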
Example No. 53
    def test_GenParity1(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # always match the run below!
        # just using one file for now
        for x in [1000]:
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(x) + " quad"
            h2o.spawn_cmd_and_wait('parity.pl', shCmdString.split(), 4)
            csvFilename = "parity_128_4_" + str(x) + "_quad.data"

        # always match the gen above!
        for trial in range(1, 5):
            sys.stdout.write('.')
            sys.stdout.flush()

            csvFilename = "parity_128_4_" + str(1000) + "_quad.data"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            key2 = csvFilename + "_" + str(trial) + ".hex"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=key2,
                                         timeoutSecs=30)

            h2o.verboseprint("Trial", trial)
            start = time.time()
            rfResult = h2o_cmd.runRFOnly(parseKey=parseKey,
                                         trees=1000,
                                         depth=2,
                                         rfView=False,
                                         timeoutSecs=600,
                                         retryDelaySecs=3)
            print "RF #", trial, "started on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            model_key = rfResult['model_key']
            print "model_key:", model_key

            # FIX! need to get more intelligent here
            a = h2o.nodes[0].jobs_admin()
            print "jobs_admin():", h2o.dump_json(a)
            # this is the wrong key to cancel with
            # "destination_key": "pytest_model",
            print "cancelling with a bad key"
            b = h2o.nodes[0].jobs_cancel(key=model_key)
            print "jobs_cancel():", h2o.dump_json(b)
Example No. 54
def trainRF(trainParseResult, scoreParseResult=None, **kwargs):
    # Train RF
    start = time.time()

    if scoreParseResult:
        trainResult = h2o_cmd.runRF(
            parseResult=trainParseResult,
            validation=scoreParseResult['destination_key'],
            **kwargs)
    else:
        trainResult = h2o_cmd.runRF(parseResult=trainParseResult, **kwargs)

    rftime = time.time() - start
    h2o.verboseprint("RF train results: ", trainResult)
    h2o.verboseprint("RF computation took {0} sec".format(rftime))

    trainResult['python_call_timer'] = rftime
    return trainResult
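
A minimal sketch of how trainRF and scoreRF compose (dataset path and parameters illustrative; assumes a running cloud): train on one parsed dataset, score on another.

trainParse = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
scoreParse = h2i.import_parse(bucket='smalldata', path='poker/poker1000', schema='put')
trainResult = trainRF(trainParse, scoreParseResult=scoreParse, trees=10, timeoutSecs=60)
scoreResult = scoreRF(scoreParse, trainResult, timeoutSecs=60)
print "train secs:", trainResult['python_call_timer'], "score secs:", scoreResult['python_call_timer']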
Example No. 55
def pollWaitJobs(pattern=None,
                 timeoutSecs=30,
                 pollTimeoutSecs=30,
                 retryDelaySecs=5,
                 benchmarkLogging=None):
    anyBusy = True
    waitTime = 0
    while (anyBusy):
        # timeout checking has to move in here now! just count loops
        anyBusy = False
        a = h2o.nodes[0].jobs_admin(timeoutSecs=pollTimeoutSecs)
        ## print "jobs_admin():", h2o.dump_json(a)
        jobs = a['jobs']
        patternKeys = []
        for j in jobs:
            ### h2o.verboseprint(j)
            # save the destination keys for any job matching the pattern
            if pattern and pattern in j['destination_key']:
                patternKeys.append(j['destination_key'])

            if j['end_time'] == '':
                anyBusy = True
                h2o.verboseprint("waiting", waitTime, "secs, still not done - ",\
                    "destination_key:", j['destination_key'], \
                    "progress:",  j['progress'], \
                    "cancelled:", j['cancelled'],\
                    "end_time:",  j['end_time'])

        ### h2b.browseJsonHistoryAsUrlLastMatch("Jobs")
        if (anyBusy and waitTime > timeoutSecs):
            print h2o.dump_json(jobs)
            raise Exception("Some queued jobs haven't completed after",
                            timeoutSecs, "seconds")

        sys.stdout.write('.')
        sys.stdout.flush()
        time.sleep(retryDelaySecs)
        waitTime += retryDelaySecs

        # any time we're sitting around polling we might want to save logging info (cpu/disk/jstack)
        # test would pass ['cpu','disk','jstack'] kind of list
        if benchmarkLogging:
            h2o.cloudPerfH2O.get_log_save(benchmarkLogging)
    return patternKeys
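
A short usage sketch (pattern and timeouts illustrative, src_key assumed already importable as in the fast-locks tests below): kick off work with noPoll=True, then block until the job queue drains; destination keys containing the pattern are returned.

h2i.parse_only(pattern=src_key, hex_key='big.hex', noPoll=True, timeoutSecs=10)
doneKeys = pollWaitJobs(pattern='big', timeoutSecs=300, retryDelaySecs=5,
                        benchmarkLogging=['cpu', 'disk'])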
Example No. 56
    def test_exec2_fast_locks_overlap(self):
        csvPathname = 'iris/iris2.csv'
        src_key='iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                src_key=src_key, timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        lastHexKey = None
        for trial in range (1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult, importPattern)  = h2i.import_only(bucket='smalldata', path=csvPathname, schema='put', 
                    src_key=src_key, timeoutSecs=10)
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse gets a unique dest key (one not already in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key, hex_key=hex_key, noPoll=True,
                delete_on_done=1 if AVOID_BUG else 0, timeoutSecs=10)

            # wait until iteration 2, when lastHexKey is available, so you can operate on that
            if lastHexKey:
                execExpr="%s[,%s]=(%s[,%s]==%s)" % (lastHexKey, y+1, lastHexKey, y+1, 1)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

            lastHexKey = hex_key

            # since we are using the same source file, and potentially re-uploading if AVOID_BUG
            # we have to synchronize here. I guess we have to make sure the parse is done too, since we're going to 
            # use it next iteration
            h2o_jobs.pollWaitJobs(timeoutSecs=10)
            
        # just show the jobs still going. Shouldn't be any
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example No. 57
def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
# "grid": {
#    "destination_keys": [
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_0", 
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_1", 
#        "GLMGridResults__8222a49156af52532a34fb3ce4304308_2"
#   ]
# }, 
    destination_key = glmGridResult['grid']['destination_keys'][0]
    inspectGG = h2o.nodes[0].glm_view(destination_key)
    models = inspectGG['glm_model']['submodels']
    h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(models[0]))
    g = simpleCheckGLM(self, inspectGG, colX, allowFailWarning=allowFailWarning, **kwargs)
    # just to get some save_model testing
    for i,m in enumerate(glmGridResult['grid']['destination_keys']):
        print "Saving model", m, "to model"+str(i)
        h2o.nodes[0].save_model(model=m, path='model'+str(i), force=1)

    return g
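
Hypothetical usage, assuming a runGLMGrid helper analogous to runGLM (the helper name is an assumption, not taken from this listing):

glmGridResult = h2o_cmd.runGLMGrid(parseResult=parseResult, timeoutSecs=300)
g = simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=True)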
Example No. 58
    def test_exec2_fast_locks(self):
        csvPathname = 'iris/iris2.csv'
        src_key = 'iris.csv'
        if not AVOID_BUG:
            # need the key name (pattern) to feed to parse
            (importResult, importPattern) = h2i.import_only(bucket='smalldata',
                                                            path=csvPathname,
                                                            schema='put',
                                                            src_key=src_key,
                                                            timeoutSecs=10)
            # just as a reminder of what these returns look like
            print "importResult:", h2o.dump_json(importResult)
            print "importPattern:", h2o.dump_json(importPattern)
        y = 4

        for trial in range(1, 100):
            if AVOID_BUG:
                # need the key name (pattern) to feed to parse
                (importResult,
                 importPattern) = h2i.import_only(bucket='smalldata',
                                                  path=csvPathname,
                                                  schema='put',
                                                  src_key=src_key,
                                                  timeoutSecs=10)
                # just as a reminder of what these returns look like
                print "importResult:", h2o.dump_json(importResult)
                print "importPattern:", h2o.dump_json(importPattern)

            # make sure each parse gets a unique dest key (one not already in use)
            hex_key = "iris2_" + str(trial) + ".hex"
            # what if we kicked off another parse without waiting for it? I think the src key gets locked
            # so we'd get lock issues on the src_key
            parseResult = h2i.parse_only(pattern=src_key,
                                         hex_key=hex_key,
                                         delete_on_done=1 if AVOID_BUG else 0,
                                         timeoutSecs=10)
            execExpr = "%s[,%s]=(%s[,%s]==%s)" % (hex_key, y + 1, hex_key,
                                                  y + 1, 1)
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=10)

        # just show the jobs still going, if any. maybe none, because short (iris)
        a = h2o.nodes[0].jobs_admin()
        h2o.verboseprint("jobs_admin():", h2o.dump_json(a))
Example No. 59
    def test_exec_assign(self):
        ### h2b.browseTheCloud()

        trial = 0
        while (trial < 200):
            for execExpr in initList:
                # always a one node stream. shouldn't fail
                nodeX = 0
                resultKey="Result" + str(trial%period)
                execResultInspect, min_value = h2e.exec_expr(h2o.nodes[nodeX], execExpr,
                    resultKey=resultKey, timeoutSecs=4)

                print "trial: #" + str(trial), min_value, execExpr
                h2o.verboseprint("min: ", min_value, "trial:", trial)
                self.assertEqual(float(min_value), float((trial % period) - 1), 
                    "exec constant assigns don't seem to be getting done and visible to Inspect")

                ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                trial += 1
Example No. 60
    def parseFile(self, s3bucket, localbucket, pathname, timeoutSecs, header, **kwargs):
        if USE_LOCAL:
            schema = "/"
            bucket = localbucket
            URI = schema + bucket + pathname
            importResult = h2o.nodes[0].import_files(URI)
        else:
            schema = "s3n://"
            bucket = s3bucket
            URI = schema + bucket + pathname
            importResult = h2o.nodes[0].import_hdfs(URI)

        start      = time.time()
        # pattern match, so nfs and s3n case is the same
        parseKey = h2o.nodes[0].parse("*" + pathname, timeoutSecs=timeoutSecs, header=header)
        parse_time = time.time() - start 
        h2o.verboseprint("py-S3 parse took {0} sec".format(parse_time))
        parseKey['python_call_timer'] = parse_time
        return parseKey