Example #1
    def test_rapids_basic(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'p'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []
        for execExpr in exprList:
            r = re.match(r'\(= \!([a-zA-Z0-9_]+) ', execExpr)
            resultKey = r.group(1)
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=resultKey,
                                               timeoutSecs=4)
            if DO_ROLLUP:
                h2o_cmd.runInspect(key=resultKey)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr)
            else:
                h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

        print "\nExpressions that created keys. Shouldn't all of these expressions create keys"

        for k in keys:
            print k

        h2o.check_sandbox_for_errors()
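
Note: exprList and DO_ROLLUP are module-level names defined outside this snippet. As a rough sketch (the expression strings below are hypothetical, not from the source), each entry in exprList is a Rapids assignment whose target key the re.match above extracts:

import re

# hypothetical Rapids assignments of the shape the test's regex expects
exprList = [
    '(= !result1 (+ %p %p))',
    '(= !result2 ([ %p #0 #0))',
]
for execExpr in exprList:
    r = re.match(r'\(= \!([a-zA-Z0-9_]+) ', execExpr)
    print r.group(1)  # prints result1, then result2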
Example #2
def parse_only(node=None, pattern=None, hex_key=None, importKeyList=None, 
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, **kwargs):

    if not node: node = h2o_nodes.nodes[0]
    # Get the list of all keys and use those that match the pattern
    # FIX! this can be slow. Can we use h2o to filter the list for us?

    # HACK. to avoid the costly frames, pass the imported key list during import_parse
    # won't work for cases where we do multiple import_only, then parse (for multi-dir import)
    matchingList = []
    if importKeyList:
        # the pattern is a full path/key name, so no false matches
        for key_name in importKeyList:
            if fnmatch.fnmatch(key_name, pattern):
                matchingList.append(key_name)
    else:
        h2p.yellow_print("WARNING: using frames to look up key names for possible parse regex")
        framesResult = node.frames(timeoutSecs=timeoutSecs)
        for frame in framesResult['frames']:
            key_name = frame['key']['name']
            if fnmatch.fnmatch(key_name, pattern):
                matchingList.append(key_name)

    parseResult = node.parse(key=matchingList, hex_key=hex_key,
        timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, 
        initialDelaySecs=initialDelaySecs, pollTimeoutSecs=pollTimeoutSecs, noise=noise,
        benchmarkLogging=benchmarkLogging, noPoll=noPoll, **kwargs)

    parseResult['python_source'] = pattern
    return parseResult
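
The key filtering above uses fnmatch glob semantics rather than regex, so pattern is a shell-style wildcard matched against full key names. A standalone sketch (the key names are hypothetical):

import fnmatch

importKeyList = ['nfs://data/covtype.data', 'nfs://data/covtype20x.data']
pattern = 'nfs://data/covtype*.data'
matchingList = [k for k in importKeyList if fnmatch.fnmatch(k, pattern)]
print matchingList  # both keys match the glob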
Example #3
    def test_rapids_basic(self):
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'p'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        for execExpr in exprList:
            r = re.match(r'\(= \!([a-zA-Z0-9_]+) ', execExpr)
            resultKey = r.group(1)
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
            if DO_ROLLUP:
                h2o_cmd.runInspect(key=resultKey)
            # rows might be zero!
            if execResult['num_rows'] or execResult['num_cols']:
                keys.append(execExpr)
            else:
                h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

        print "\nExpressions that created keys. Shouldn't all of these expressions create keys"

        for k in keys:
            print k

        h2o.check_sandbox_for_errors()
Example #4
def parse_only(node=None,
               pattern=None,
               hex_key=None,
               importKeyList=None,
               timeoutSecs=30,
               retryDelaySecs=0.1,
               initialDelaySecs=0,
               pollTimeoutSecs=180,
               noise=None,
               benchmarkLogging=None,
               noPoll=False,
               **kwargs):

    if not node: node = h2o_nodes.nodes[0]
    # Get the list of all keys and use those that match the pattern
    # FIX! this can be slow. Can we use h2o to filter the list for us?

    # HACK. to avoid the costly frames, pass the imported key list during import_parse
    # won't work for cases where we do multiple import_only, then parse (for multi-dir import)
    matchingList = []
    if importKeyList:
        # the pattern is a full path/key name, so no false matches
        for key_name in importKeyList:
            if fnmatch.fnmatch(str(key_name), pattern):
                matchingList.append(key_name)
    else:
        h2p.yellow_print(
            "WARNING: using frames to look up key names for possible parse regex"
        )
        framesResult = node.frames(timeoutSecs=timeoutSecs)
        for frame in framesResult['frames']:
            key_name = frame['key']['name']
            if fnmatch.fnmatch(str(key_name), pattern):
                matchingList.append(key_name)

    if len(matchingList) == 0:
        raise Exception("Didn't find %s in key list %s or Frames result" %
                        (pattern, importKeyList))

    start = time.time()
    # put quotes on all keys
    parseResult = node.parse(key=matchingList,
                             hex_key=hex_key,
                             timeoutSecs=timeoutSecs,
                             retryDelaySecs=retryDelaySecs,
                             initialDelaySecs=initialDelaySecs,
                             pollTimeoutSecs=pollTimeoutSecs,
                             noise=noise,
                             benchmarkLogging=benchmarkLogging,
                             noPoll=noPoll,
                             **kwargs)
    # FIX! extract and print the result key name (from parseResult)
    print "\nparse took", time.time() - start, "seconds"

    parseResult['python_source'] = pattern
    return parseResult
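
A minimal usage sketch for this variant, assuming a cloud is already running and importResult came from an earlier import_only; the pattern and hex_key values are hypothetical:

# hypothetical call; passing importKeyList avoids the costly /Frames lookup
parseResult = parse_only(
    pattern='nfs://data/covtype*.data',
    hex_key='covtype.hex',
    importKeyList=importResult['keys'],
    timeoutSecs=120)
print parseResult['python_source']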
Example #5
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 

        h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
        if CHECK_WHILE_SLEEPING:        
            h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

        h2p.red_print("No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
        h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")


        while (totalTime<maxTime): # die after 4 hours
            time.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)

            ### h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:        
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1==0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #6
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 

        h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
        if CHECK_WHILE_SLEEPING:        
            h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

        h2p.red_print("No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
        h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")


        while (totalTime<maxTime): # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:        
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(datetime.datetime.now()), h2o.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1==0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #7
def build_model(self, algo, training_frame, parameters, destination_key=None, 
    timeoutSecs=60, asynchronous=False, **kwargs):
    '''
    Build a model on the h2o cluster using the given algorithm, training 
    Frame and model parameters.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders']
    builder = model_builders['model_builders'][algo]
    
    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)

    key_name = frames['frames'][0]['key']['name'] 
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_key is not None:
        parameters['destination_key'] = destination_key

    result1 = self.do_json_request('/2/ModelBuilders.json/' + algo, cmd='post', 
        timeout=timeoutSecs, postData=parameters)

    if asynchronous:
        result = result1
    elif 'validation_error_count' in result1:
        h2p.yellow_print("parameter error in model_builders")
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
    else:
        job = result1['jobs'][0]
        job_key = job['key']['name']
        verboseprint("model building job_key: " + repr(job_key))
        result = self.poll_job(job_key, timeoutSecs=timeoutSecs)

    verboseprint("result:", result)
    return result
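
A hedged usage sketch for this v2 builder; the algorithm name and parameter key below are assumptions, not taken from the source:

# hypothetical call against the v2 /ModelBuilders endpoint
parameters = {'K': 3}  # assumed kmeans parameter name
result = node.build_model(
    algo='kmeans',
    training_frame='covtype.hex',
    parameters=parameters,
    destination_key='kmeans_model',
    timeoutSecs=300)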
Example #8
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None, 
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True, 
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    if schema=='put':
        h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +  
            "\nMeans multi-machine with 'put' will fail")
        schema = 'local'

    if not node: node = h2o_nodes.nodes[0]

    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    assert len(importResult['keys']) >= 1, "No keys imported, maybe bad bucket %s or path %s" % (bucket, path)
    parseResult = parse_only(node, importPattern, hex_key, importResult['keys'],
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, 
        benchmarkLogging, noPoll, **kwargs)
    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        check_sandbox_for_errors()
        print "WARNING: not doing inspect/summary for now after parse"
        ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        ## numRows = inspect['numRows']
        ## numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma)
        ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary 
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
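
Typical use, mirroring the test snippets earlier on this page (the hex_key here is illustrative):

parseResult = h2i.import_parse(
    bucket='home-0xdiag-datasets',
    path='standard/covtype.data',
    schema='put',   # silently rewritten to 'local'; see the warning above
    hex_key='covtype.hex',
    timeoutSecs=120)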
Example #9
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4 * 3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600,
                         "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(
            h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!")
        h2p.blue_print(
            "Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs"
            % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime < maxTime):  # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #10
File: cloud.py  Project: 100star/h2o
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary)
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 
        h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime<maxTime): # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #11
def build_model(self,
                algo,
                training_frame,
                parameters,
                destination_frame=None,
                model_id=None,
                timeoutSecs=60,
                noPoll=False,
                **kwargs):

    '''
    Build a model on the h2o cluster using the given algorithm, training 
    Frame and model parameters.
    '''
    if 'destination_key' in kwargs:
        raise Exception('Change destination_key in build_model() to model_id')
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (
        algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(
        training_frame)

    key_name = frames['frames'][0]['frame_id']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_frame is not None:
        print "destination_frame should be replaced by model_id now"
        parameters['model_id'] = destination_frame

    if model_id is not None:
        parameters['model_id'] = model_id

    print "build_model parameters", parameters
    start = time.time()
    result1 = self.do_json_request('/3/ModelBuilders.json/' + algo,
                                   cmd='post',
                                   timeout=timeoutSecs,
                                   postData=parameters)
    # may get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif ('validation_error_count'
          in result1) and (result1['validation_error_count'] > 0):
        h2p.yellow_print("parameter error in model_builders: %s" % result1)
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif 'exception_msg' in result1:
        h2p.yellow_print("exception msg in model_builders: %s" %
                         result1['exception_msg'])
        result = result1
    else:
        job_result = result1['job']
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print "ModelBuilders", algo, "end on", training_frame, 'took', time.time(
        ) - start, 'seconds'
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            # can condition this with a parameter if some FAILED are expected by tests.
            if status == 'FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))

            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception(
                "build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result['python_elapsed'] = elapsed
    return result
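
Because parameter-validation failures return the raw result instead of raising, callers are expected to inspect the returned dict. A sketch (the gbm parameter name is an assumption):

# hypothetical call; 'ntrees' is an assumed gbm parameter name
result = node.build_model(algo='gbm', training_frame='covtype.hex',
                          parameters={'ntrees': 10}, model_id='gbm_model')
if result.get('validation_error_count', 0) > 0:
    print "parameter errors:", dump_json(result)
else:
    print "model built in", result['python_elapsed'], "secs"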
Example #12
def parse(self,
          key,
          hex_key=None,
          columnTypeDict=None,
          timeoutSecs=300,
          retryDelaySecs=0.2,
          initialDelaySecs=None,
          pollTimeoutSecs=180,
          noise=None,
          benchmarkLogging=None,
          noPoll=False,
          intermediateResults=False,
          **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_frames': None,
        'destination_frame': hex_key,
        'parse_type': None,  # file type 
        'separator': None,
        'single_quotes': None,
        'check_header': None,  # forces first line to be seen as column names 
        'number_columns': None,
        'column_names': None,  # a list
        'column_types': None,  # a list. or can use columnTypeDict param (see below)
        'na_strings': None,  # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception(
                "key seems to be bad in parse. Should be list or string. %s" %
                key)
        # have to put double quotes around the individual list items (single not legal)
        source_frames = "[" + ",".join(map(
            (lambda x: '"' + x + '"'), key)) + "]"

    else:
        # what if None here
        source_frames = '["' + key + '"]'  # quotes required on key

    params_dict['source_frames'] = source_frames

    # merge kwargs into params_dict
    # =None overwrites params_dict

    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict,
                                           kwargs,
                                           'parse before setup merge',
                                           print_params=False)
    # Call ParseSetup?source_frames=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_frames': source_frames}
    setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json",
                                        cmd='post',
                                        timeout=timeoutSecs,
                                        postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_frames']:
        # should these be quoted?
        source_framesStr = "[" + ",".join([
            ('"%s"' % src['name']) for src in setup_result['source_frames']
        ]) + "]"
    else:
        source_framesStr = None

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if setup_result['column_names']:
        # single quotes not legal..need double quotes
        columnNamesStr = "[" + ",".join(
            map((lambda x: '"' + x + '"'), setup_result['column_names'])) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
        # single quotes not legal..need double quotes
        naStrings = "[" + ",".join(
            map((lambda x: '"' + x + '"' if x != None else '""'),
                setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict:
        for k, v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k >= 0 and k < len(ct):
                    ct[k] = v
                else:
                    raise Exception(
                        "bad col index %s in columnTypeDict param %s" %
                        (k, columnTypeDict))
            # if a column name
            elif isinstance(k, basestring):
                # find the index; columnNames comes from the ParseSetup result
                columnNames = setup_result['column_names'] or []
                if k not in columnNames:
                    raise Exception(
                        "bad col name %s in columnTypeDict param %s. columnNames: %s"
                        % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), ct)) + "]"

    parse_params = {
        'source_frames': source_framesStr,
        'destination_frame': setup_result['destination_frame'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['column_names'] and len(
        setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print(
            "Not printing the parameters to Parse because the columnNames are too lengthy."
        )
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(
        parse_params,
        params_dict,
        'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint,
        ignoreNone=True)

    print "parse source_frames is length:", len(parse_params['source_frames'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="3/Parse.json",
                                        cmd='post',
                                        postData=parse_params,
                                        timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_frame']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status == 'FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
Example #13
def parse(self, key, hex_key=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key, 
        'pType': None, # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None, # how is this used
        'singleQuotes': None,
        'columnNames': None, # list?
        'delete_on_done': None,
        'blocking': None,
    }
        
    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?srcs=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]

    if setup_result['srcs']:
        setupSrcs = "[" + ",".join([src['name'] for src in setup_result['srcs'] ]) + "]"
    else:
        setupSrcs = None
    
    # I suppose we need a way for parameters to parse() to override these
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None


    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here. 
    tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") 
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', 
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse srcs is length:", len(parse_params['srcs'])
    print "parse columnNames is length:", len(parse_params['columnNames'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request( jsonRequest="Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        return self.jobs(job_key)

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
Example #14
def build_model(self, algo, training_frame, parameters, destination_key=None, 
    timeoutSecs=60, asynchronous=False, **kwargs):
    '''
    Build a model on the h2o cluster using the given algorithm, training 
    Frame and model parameters.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]
    
    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)

    key_name = frames['frames'][0]['key']['name'] 
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_key is not None:
        parameters['destination_key'] = destination_key

    print "build_model parameters", parameters
    result1 = self.do_json_request('/2/ModelBuilders.json/' + algo, cmd='post', 
        timeout=timeoutSecs, postData=parameters)
    verboseprint("build_model result", dump_json(result1))

    if asynchronous:
        result = result1
    elif 'validation_error_count' in result1:
        h2p.yellow_print("parameter error in model_builders")
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
    else:
        job_result = result1['jobs'][0]
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            # can condition this with a parameter if some FAILED are expected by tests.
            if status=='FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))

            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Example #15
def parse(self,
          key,
          hex_key=None,
          timeoutSecs=300,
          retryDelaySecs=0.2,
          initialDelaySecs=None,
          pollTimeoutSecs=180,
          noise=None,
          benchmarkLogging=None,
          noPoll=False,
          intermediateResults=False,
          **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key,
        'pType': None,  # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None,  # how is this used
        'singleQuotes': None,
        'columnNames': None,  # list?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception(
                "key seems to be bad in parse. Should be list or string. %s" %
                key)
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict,
                                           kwargs,
                                           'parse before setup merge',
                                           print_params=False)

    # Call ParseSetup?srcs=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json",
                                        cmd='post',
                                        timeout=timeoutSecs,
                                        postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]

    if setup_result['srcs']:
        setupSrcs = "[" + ",".join(
            [src['name'] for src in setup_result['srcs']]) + "]"
    else:
        setupSrcs = None

    # I suppose we need a way for parameters to parse() to override these
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None

    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['columnNames'] and len(
        setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print(
            "Not printing the parameters to Parse because the columnNames are too lengthy."
        )
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(
        parse_params,
        params_dict,
        'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint,
        ignoreNone=True)

    print "parse srcs is length:", len(parse_params['srcs'])
    print "parse columnNames is length:", len(parse_params['columnNames'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json",
                                        cmd='post',
                                        postData=parse_params,
                                        timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status == 'FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
Example #16
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, 
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    if schema=='put':
        h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +  
            "\nMeans multi-machine with 'put' will fail")
        schema = 'local'

    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern)  = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
           raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
           raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path: 
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath) 
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
    
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None
        
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?

        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?
          
        # folderURI = 'nfs:/' + folderPath
        folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
    
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o_nodes.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution 
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        else: 
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
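
A usage sketch, plus a summary of the folderURI shapes the schema branches above produce (offsets and node names are placeholders):

# folderURI shapes built per schema:
#   local:  nfs:/<realpath of folder>
#   s3:     s3://<bucket>/<head>
#   s3n:    s3n://<bucket>/<head>
#   hdfs:   hdfs://<name_node>/<bucket>/<head>
#   maprfs: maprfs://<name_node>/<bucket>/<head>
(importResult, importPattern) = import_only(
    schema='local',
    bucket='home-0xdiag-datasets',
    path='standard/covtype.data')
print importPattern  # folderURI + "/" + pattern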
Example #17
    def test_xl_ast_assert_ZZ(self):
        #*****************************************
        a = DF('a1') # inits to -1
        checkAst(astForInit(a))
        # I suppose use of the h2o inspect request is deprecated
        # h2o_cmd.runInspect uses Frames?
        if 1==0:
            inspect = h2o.n0.inspect(key=a) # str(a) becomes 'a1'. so this param should take type Key for key=
            print "a/a1:", dump_json(inspect)

        # let's use runSummary for fun.. it returns an OutputObj for the col
        # it will use column 0, since no column is specified
        summaryResult = h2o_cmd.runSummary(key=a)
        co = h2o_cmd.infoFromSummary(summaryResult)
        print "co.label:", co.label
        print "co.data:", co.data

        # how can we get a bunch of data?
        b = DF('b1') # inits to -1
        checkAst(astForInit(b))
        c = DF('c1') # inits to -1
        checkAst(astForInit(c))
        print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

        h2p.yellow_print("Assign compare1")
        Assign(c[0], c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare2")
        Assign(c[0], c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare3")
        Assign(c[0], c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        h2p.yellow_print("Assign compare4")
        Assign(c[0], c[0] != 0)
        checkAst("(= ([ %c1 #0 #0) (N ([ %c1 #0 #0) #0))")

        # h2o_xl.debugPrintEnable = True

        #*****************************************
        c = DF('c1')

        h2p.yellow_print("<<= compare1")
        c[0] <<= (c[0] + 0)
        checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare2")
        c[0] <<= (c[0] - 0)
        checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

        h2p.yellow_print("<<= compare3")
        c[0] <<= (c[0] == 0)
        checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

        #*****************************************
        c = DF('c1') # inits to -1
        h2p.yellow_print("compare1")
        # doesn't assign the result to a key? gets the result if scalar, otherwise gets a list or ???
        # .result can give us a scalar, list, Key, or None

        # .result could be a property that triggers a csv download, if we didn't cache the scalar/list result because it was small?
        # i.e. check whether .result_cached is None when the .result property is used (a property avoids the need for ())
        result = Expr(c[0] == -1).result
        checkAst("(n ([ %c1 #0 #0) #-1)")
        h2p.yellow_print("Expr result..Desire: python datatype/value if scalar or list,.else Key: %s %s" % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result) # real result?

        if result:
            print "true for if of result", type(result), result
        else:
            print "else for if of result", type(result), result

        #*****************************************
        # difference is this goes to a temp key, so if not scalar, you can still get the results by looking at the key
        result = Assign(None, c[0]==-1).result
        checkAst("(= !knon_0x1a34250 (n ([ %c1 #0 #0) #-1))")
        h2p.yellow_print("Assign result..Desire: python datatype/value if scalar or list,.else Key: %s %s" % (type(result), result))
        assert result == 1.0, "%s %s" % (type(result), result) # real result?

        if result:
            print "true for if of result", result
        else:
            print "false for if of result", result
Exemplo n.º 18
0
def build_model(
    self,
    algo,
    training_frame,
    parameters,
    destination_frame=None,
    model_id=None,
    timeoutSecs=60,
    noPoll=False,
    **kwargs
):

    if "destination_key" in kwargs:
        raise Exception("Change destination_key in build_model() to model_id")

    """
    Build a model on the h2o cluster using the given algorithm, training 
    Frame and model parameters.
    """
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders["model_builders"], "%s %s" % (algo, [k for k in model_builders["model_builders"]])
    builder = model_builders["model_builders"][algo]

    # TODO: test this assert; I don't think this is working...
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)

    key_name = frames["frames"][0]["frame_id"]["name"]
    assert key_name == training_frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(
        training_frame, key_name, training_frame
    )
    parameters["training_frame"] = training_frame

    if destination_frame is not None:
        print "destination_frame should be replaced by model_id now"
        parameters["model_id"] = destination_frame

    if model_id is not None:
        parameters["model_id"] = model_id

    print "build_model parameters", parameters
    start = time.time()
    result1 = self.do_json_request(
        "/3/ModelBuilders.json/" + algo, cmd="post", timeout=timeoutSecs, postData=parameters
    )
    # may get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif ("validation_error_count" in result1) and (result1["validation_error_count"] > 0):
        h2p.yellow_print("parameter error in model_builders: %s" % result1)
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif "exception_msg" in result1:
        h2p.yellow_print("exception msg in model_builders: %s" % result1["exception_msg"])
        result = result1
    else:
        job_result = result1["job"]
        job_key = job_result["key"]["name"]
        verboseprint("build_model job_key: " + repr(job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print "ModelBuilders", algo, "end on", training_frame, "took", time.time() - start, "seconds"
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        if job_result:
            jobs = job_result["jobs"][0]
            description = jobs["description"]
            dest = jobs["dest"]
            msec = jobs["msec"]
            status = jobs["status"]
            progress = jobs["progress"]

            # can condition this with a parameter if some FAILED are expected by tests.
            if status == "FAILED":
                print dump_json(job_result)
                raise Exception(
                    "Taking exception on build_model job status: %s %s %s %s" % (status, progress, msec, description)
                )

            result = job_result
        else:
            # we should always get a job_result here
            raise Exception("build_model didn't get a job_result when it expected one")

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result["python_elapsed"] = elapsed
    return result
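
A minimal usage sketch for build_model (the node object, frame key, and parameter values are hypothetical; check /ModelBuilders for each algo's actual parameter schema):

    # hypothetical: node is an h2o node, 'train.hex' is an already-parsed frame
    result = node.build_model(
        algo='kmeans',
        training_frame='train.hex',
        parameters={'k': 3},
        model_id='kmeans_model_1',
        timeoutSecs=120)
    print "python_elapsed:", result['python_elapsed']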
Exemplo n.º 19
0
def parse(self, key, hex_key=None, columnTypeDict=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_keys': None,
        'destination_key': hex_key, 
        'parse_type': None, # file type 
        'separator': None,
        'single_quotes': None,
        'check_header': None, # forces first line to be seen as column names 
        'number_columns': None,
        'column_names': None, # a list
        'column_types': None, # a list. or can use columnTypeDict param (see below)
        'na_strings': None, # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }
        
    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here; 0 is not. what if it's None or [None] here?
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        # have to put quotes around the individual list items
        source_keys = "[" + ",".join(map((lambda x: "'" + x + "'"), key)) + "]"

    else:
        # what if None here
        source_keys = "['" + key + "']" # quotes required on key

    params_dict['source_keys'] = source_keys

    # merge kwargs into params_dict
    # =None overwrites params_dict

    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)
    # Call ParseSetup?source_keys=[keys] ...

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_keys': source_keys}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_keys']:
        # should these be quoted?
        source_keysStr = "[" + ",".join([("'%s'" % src['name']) for src in setup_result['source_keys'] ]) + "]"
    else:
        source_keysStr = None
    
    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if setup_result['column_names']:
        columnNamesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), setup_result['column_names'])) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
	naStrings = "[" + ",".join(map((lambda x: "'" + x + "'" if x != None else "''"), setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # apply per-column overrides from the columnTypeDict param
    # (column names come from the setup result, since we don't pass columnNames directly)
    ct = setup_result['column_types']
    columnNames = setup_result['column_names']
    if columnTypeDict:
        for k, v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # a column index
                if 0 <= k < len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            elif isinstance(k, basestring):
                # a column name: find its index
                if not columnNames or k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), ct)) + "]"


    parse_params = {
        'source_keys': source_keysStr,
        'destination_key': setup_result['destination_key'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings, 
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names, don't print them! the output is huge.
    # just check the output of parse setup; don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") 
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', 
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse source_keys is length:", len(parse_params['source_keys'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # note: none of the kwargs are passed through to the Parse request here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_key']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # we should always get a job_result here
        raise Exception("parse didn't get a job_result when it expected one")