def test_exec2_runif(self): print "in h2o-dev, params are column, min, max, seed" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # work up to the failing case incrementally execExprList = [ # hack to make them keys? (not really needed but interesting) # params for h2o-dev runif are: column, min, max, seed AssignObj('r0.hex', KeyIndexed('r.hex', col=0)), AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1)), AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)), AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)), ] results = [] for execExpr in execExprList: start = time.time() result = execExpr.do(timeoutSecs=30) results.append(result) execResult = execExpr.execResult print "exec took", time.time() - start, "seconds" print "exec result:", result print "exec result (full):", h2o.dump_json(execResult) h2o.check_sandbox_for_errors() rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) # since there are no NAs in covtype, r.hex and s.hex should be identical? if 1 == 0: print "Comparing summary of r.hex to summary of s.hex" df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True) # time can be different print "df.difference:", h2o.dump_json(df.difference) self.assertLess(len(df.difference), 2) print "results from the individual exec expresssions (ignore last which was an apply)" print "results:", results self.assertEqual(results, [ 0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0 ])
def test_rapids_funs_basic3(self): DO_FAIL = False if DO_FAIL: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' else: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # works for 1 pass..why is execExpr set for 2nd pass? should be new instance? # if we reuse the same object in the list, it has state? # do we need to copy the object...hmm for trial in range(1): for execObj in funsList: freshObj = copy(execObj) result = freshObj.do() # rapids doesn't like complicated params right now? if DO_FAIL: a = Assign('junk', Fcn('anon', KeyIndexed('r1', col=0)), do=False) else: a = Assign('junk', Fcn('anon', 'r1'), do=False) result = a.do(timeoutSecs=60) # rows might be zero! if a.execResult['num_rows'] or a.execResult['num_cols']: keys.append(a.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_funs_1000_stmnt(self): DO_FAIL = False if DO_FAIL: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' else: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for trial in range(3): for execObj in funsList: freshObj = copy(execObj) print "ast length:", len(str(freshObj)) result = freshObj.do() # rapids doesn't like complicated params right now? if DO_FAIL: a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0))) else: a = Assign('junk', Fcn('anon', 'r1')) result = a.do(timeoutSecs=60) # rows might be zero! if a.execResult['num_rows'] or a.execResult['num_cols']: keys.append(a.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_exec2_sum(self): print "Replicating covtype.data by 2x for results comparison to 1x" filename1x = 'covtype.data' pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True) filename2x = "covtype_2x.data" pathname2x = SYNDATASETS_DIR + '/' + filename2x h2o_util.file_cat(pathname1x, pathname1x, pathname2x) csvAll = [ (pathname1x, "cA", 5, 1), (pathname2x, "cB", 5, 2), (pathname2x, "cC", 5, 2), ] # h2b.browseTheCloud() lenNodes = len(h2o.nodes) firstDone = False for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll: parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) pA = h2o_cmd.ParseObj(parseResultA) print pA.numRows print pA.numCols print pA.parse_key iA = h2o_cmd.InspectObj(pA.parse_key) k = Key(hex_key) colResultList = [] for i in range(pA.numCols): result = Expr(Fcn('sum', k[:, i], True)).result colResultList.append(result) print "\ncolResultList", colResultList if not firstDone: colResultList0 = list(colResultList) good = [float(x) for x in colResultList0] firstDone = True else: print "\n", colResultList0, "\n", colResultList # create the expected answer...i.e. N * first compare = [float(x) / resultMult for x in colResultList] print "\n", good, "\n", compare self.assertEqual( good, compare, 'compare is not equal to good (first try * resultMult)')
def test_rapids_mean(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 5, 'cA', 200), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual( numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) data_key = hex_key data_key2 = hex_key + "_2" for trial in range(4): result_key = data_key + "_" + str(trial) # copy the key Assign(data_key2, data_key) Assign(result_key, Fcn('mean', KeyIndexed(data_key2, col=0), 0, False)) trial += 1
# new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # can specify values for enums ..values are 0 thru n-1 for n enums print "FIX!: need to test the && and || reduction operators" initList = [] from h2o_xl import Key, AssignObj, Fcn DO_SUM = False r1 = Key('r1') if DO_SUM: funstr = 'sum' exprList = [ AssignObj('a', Fcn(funstr, r1[1], r1[2])), AssignObj('b', 1), AssignObj('b', Fcn(funstr, r1[1], r1[2])), AssignObj('d', 1), AssignObj('d', Fcn(funstr, r1[1], r1[2])), AssignObj('e', 1), AssignObj('e', Fcn(funstr, r1[1], r1[2])), AssignObj('f', 1), AssignObj('f', Fcn(funstr, r1[1], r1[2])), AssignObj('g', 1), AssignObj('g', Fcn(funstr, r1[1], r1[2])), AssignObj('h', 1), AssignObj('h', Fcn(funstr, r1[1], r1[2])), ] else: funstr = 'log'
import unittest, random, sys, time sys.path.extend(['.','..','../..','py']) import h2o2 as h2o import h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i # '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)', from h2o_xl import Def, Fcn, Assign, KeyIndexed from copy import copy, deepcopy print "Trying a different way, listing Rapids objects, rather than .ast() strings" # 'c' allowed # should be able to take a list of statements funsList = [ Def('anon', 'x', Assign('a', Fcn('var', 'x', None, False, None), do=False), Assign('b', Fcn('var', 'x', None, False, None), do=False), Assign('d', Fcn('var', 'x', None, False, None), do=False), Assign('e', Fcn('var', 'x', None, False, None), do=False), Assign('f', Fcn('var', 'x', None, False, None), do=False), Assign('g', Fcn('var', 'x', None, False, None), do=False), Assign('d', Fcn('var', 'x', None, False, None), do=False), Assign('i', Fcn('var', 'x', None, False, None), do=False), Assign('j', Fcn('var', 'x', None, False, None), do=False), Assign('k', Fcn('var', 'x', None, False, None), do=False), Assign('l', Fcn('var', 'x', None, False, None), do=False), Assign('m', Fcn('var', 'x', None, False, None), do=False), Assign('n', Fcn('var', 'x', None, False, None), do=False), Assign('o', Fcn('var', 'x', None, False, None), do=False), Assign('p', Fcn('var', 'x', None, False, None), do=False), Assign('q', Fcn('var', 'x', None, False, None), do=False),
import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i # '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)', from h2o_xl import Def, Fcn, Assign, KeyIndexed from copy import copy print "Trying a different way, listing Rapids objects, rather than .ast() strings" # 'c' allowed # should be able to take a list of statements keyString = 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz' keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz' keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz' funsList = [ Def('anon', 'x', [Assign(key, Fcn('var', 'x', None, False, None), do=False) for key in keyString], [Assign(key, Fcn('sum', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('max', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('min', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('xorsum', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('sd', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('ncol', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('is.factor', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('any.factor', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('length', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('sin', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('asin', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('sinh', KeyIndexed('x',col=0)), do=False) for key in keyString],
def test_rapids_overloaded_opr(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (1000000, 5, 'cA', 200), (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual( numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) # Xbase.debugOnly = True REPEAT = 1 data_key = hex_key for i in range(REPEAT): result_key = data_key + "_" + str(i) Assign('s1', Seq(range(5))) # take advantage of default params for row/col (None) # need the 'c' function, to make sure the key is created # first try as object, then method Assign('s2', Fcn('c', Seq(range(5)))) # just combine Assign('s3', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='s3') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 5 assert numCols == 1 Assign('s2', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 5 assert numCols == 1 # can't have sequence of sequences? 
# make sure key is created with c() f = Fcn( 'c', Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52))) Assign('s1', f) f = Col( Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52))) Assign('s2', f) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 313 assert numCols == 1 print "Now trying to do the functions with the alternate overloaded operators" data_key = Key(parse_key) result_key = Key() # what triggers immediate operation at h2o # as opposed to an object within a function result_key.frame = 'a1' result_key <<= data_key[Seq(range(1, 4)), :] result_key.frame = 'a2' result_key <<= data_key[Seq(range(1, 4)), :] result_key.frame = 'a3' result_key <<= data_key[Seq(range(1, 4)), :] result_key.frame = 'a4' result_key <<= data_key[Seq(range(1, 4)), 0:1] result_key.frame = 'a5' result_key <<= data_key[Seq(range(1, 4)), 0:1] result_key.frame = 'a6' result_key <<= data_key[[1, 2, 3], 1] print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols)
def test_rapids_row_range(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (1000000, 5, 'cA', 200), (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual( numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) # Xbase.debugOnly = True REPEAT = 1 data_key = hex_key for i in range(REPEAT): result_key = data_key + "_" + str(i) # Assign('s1', Seq(range(5)) ).do Assign('s1', Seq(range(5))) # take advantage of default params for row/col (None) # need the 'c' function, to make sure the key is created # first try as object, then method Assign('s2', Fcn('c', Seq(range(5)))) print dump_json(Xbase.lastExecResult) print dump_json(Xbase.lastResult) # just combine Assign('s3', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='s3') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 5 assert numCols == 1 Assign('s2', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 5 assert numCols == 1 # can't have sequence of sequences? 
# make sure key is created with c() f = Fcn( 'c', Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52))) Assign('s1', f) f = Col( Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52))) Assign('s2', f) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 313 assert numCols == 1 print "z1" Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5)))) print "z2" Assign( 's1', KeyIndexed(data_key, row=Seq(Colon(99, 400), "#2", 1, range(1, 5)))) print "z3" Assign(result_key, KeyIndexed(data_key, row='#1')).do print "z4" Assign(result_key, KeyIndexed(data_key, row=Colon('#1', '#100'))) print "z5" Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100))) # this should fail rapids because of reverse msb/lsb # illegal, detected # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1'))) print "z6" Assign(result_key, KeyIndexed(data_key, row=Colon('#-2', '#-1'))) print "z7" Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1))) # illegal, detected # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2'))) # take advantage of number to string conversion print "z8" Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount - 10))) print "z9" Assign(result_key, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, ))) # no assign print "z10" result = KeyIndexed(data_key, row=Colon('#1', rowCount - 10)).do() print "z11" # result = KeyIndexed(data_key, col=Colon('#1', colCount-1,)).do() # do some function translation print "z12" # result = Fcn('==', 1, KeyIndexed(data_key, col=Colon('#1', colCount-1,))).do() print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols)
# So you want to use the long forms only when you are certain the vectors are
# length one.
#
# You should be absolutely certain your vectors are only length 1, such as in
# cases where they are functions that return only length 1 booleans. You want
# to use the short forms if the vectors are length possibly >1. So if you're
# not absolutely sure, you should either check first, or use the short form
# and then use all and any to reduce it to length one for use in control flow
# statements, like if.
#
# The functions all and any are often used on the result of a vectorized
# comparison to see if all or any of the comparisons are true, respectively.
# The results from these functions are sure to be length 1 so they are
# appropriate for use in if clauses, while the results from the vectorized
# comparison are not. (Though those results would be appropriate for use in
# ifelse.)
#
# One final difference: the && and || only evaluate as many terms as they need
# to (which seems to be what is meant by short-circuiting). For example,
# here's a comparison using an undefined value a; if it didn't short-circuit,
# as & and | don't, it would give an error.
# also see http://www.burns-stat.com/pages/Tutor/R_inferno.pdf
initList = [
    AssignObj('a', [1, 0, 0]),
]

r1 = Key('r1')

# 'a' gets the && result directly; every other key is first set to the
# constant 1 and then overwritten with the result of its operator.
exprList = [AssignObj('a', Fcn('&&', r1[1], r1[2]))]
for targetKey, reducer in [
        ('b', '&&'),
        ('d', '&&'),
        ('e', '||'),
        ('f', '||'),
        ('g', '||'),
        ('h', '||'),
]:
    exprList.append(AssignObj(targetKey, 1))
    exprList.append(AssignObj(targetKey, Fcn(reducer, r1[1], r1[2])))
def test_rapids_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual( numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) REPEAT = 1 data_key = hex_key for i in range(REPEAT): result_key = data_key + "_" + str(i) Assign('seq1', Seq(range(5))) # take advantage of default params for row/col (None) # need the 'c' function, to make sure the key is created Assign('seq2', Fcn('c', Seq(range(5)))) inspect = h2o_cmd.runInspect(key='seq1') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) Assign('seq3', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='seq2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) # can't have sequence of sequences? 
# make sure key is created with c() Assign( 'seq4', Fcn( 'c', Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52)))) inspect = h2o_cmd.runInspect(key='seq1') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5)))) Assign( 'seq5', KeyIndexed(data_key, row=Seq(Colon(99, 400), "#2", 1, range(1, 5)))) # they need to be same size # Assign('seq6', Key('seq5') + Key('seq4') + Key('seq3')) # doesn't like my cut? complains on FALSE # Assign(result_key, Cut(KeyIndexed(data_key, col=0))) # Assign(result_key, Cut(KeyIndexed(data_key, col=1), breaks=3)) Assign(result_key, Fcn('min', KeyIndexed(data_key, col=1), True)) Assign(result_key, Fcn('max', KeyIndexed(data_key, col=1), True)) Assign(result_key, Fcn('mean', KeyIndexed(data_key, col=1), 0, False)) Assign(result_key, KeyIndexed(data_key, row='#1')) Assign(result_key, KeyIndexed(data_key, row=Colon('#1', '#100'))) Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100))) # this should fail rapids because of reverse msb/lsb # illegal, detected # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1'))) Assign(result_key, KeyIndexed(data_key, row=Colon('#-2', '#-1'))) Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1))) # illegal, detected # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2'))) # take advantage of number to string conversion Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount - 10))) Assign(result_key, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, ))) # no assign. Expr() complains when result has no key? 
Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount - 10))) Assign(result_key, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, ))) # do some function translation Assign( result_key, Fcn('==', 1, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, )))) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols)
def test_exec2_enums_rand_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ (n, 10, 9, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1, iColCount)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like # celChoice = str(random.choice(range(len(cel)))) celChoice = random.choice(range(len(cel))) cutValue[c] = celChoice cutExprList = [] pKey = Key('p') for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... 
, ] # cutExprList.append('p$C'+str(i+1)+'=='+c) # all column indexing in h2o-dev is with number e = Fcn('==', c, pKey[:, i]) cutExprList.append(e) cutExpr = None for ce in cutExprList: if cutExpr: cutExpr = Fcn('&', cutExpr, ce) else: cutExpr = ce print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] # rowExpr = '%s[%s,];' % (hex_key, cutExpr) hKey = Key(hex_key) rowExpr = hKey[cutExpr, :] print "rowExpr:", rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) # print h2o.dump_json(inspect) # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ # h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False) # error if any col has constant values # if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # build up the columns Assign('b', [1, 2, 3]) # could also append 1 col at a time, by assigning to the next col number? 
Assign('a', Cbind(['b' for i in range(colCount)])) for eKey in eKeys: Assign(eKey, 'a') ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] if 1 == 1: start = time.time() Assign(fKey, random.choice(rowExprList)).do() elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # FIX! put quantile back in? quantileTime = 0 # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
from h2o_test import dump_json, verboseprint from copy import copy print "Trying a different way, listing Rapids objects, rather than .ast() strings" # 'c' allowed # should be able to take a list of statements objList = [ Assign('e', IfElse(1, 2, IfElse(4, 5, IfElse(7, 8, 9))), do=False), Assign('f', IfElse(1, 2, IfElse(4, 5, IfElse(7, 8, 9))), do=False), Assign('g', IfElse(0, 2, IfElse(0, 5, IfElse(0, 8, 9))), do=False), Def('ms', 'x', [ IfElse(0, 2, IfElse(0, 5, IfElse(0, 8, 9))), Assign('k', IfElse(0, 12, IfElse(0, 15, IfElse(0, 18, 19))), do=False), ]), Assign('e', Fcn('ms', 2), do=False), Def('ms', 'x', [ If(0, Return(3)), IfElse(0, 5, IfElse(0, 8, 9)), Assign('k', IfElse(0, 12, IfElse(0, 15, IfElse(0, 18, 19))), do=False), If(1, Return(2)), ]), Assign('e', Fcn('ms', 2), do=False), ] resultList = [ None, None, None, None, 19,
def test_rapids_funs_1op(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (1000000, 5, 'cA', 200), (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual(numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) # Xbase.debugOnly = True REPEAT = 1 data_key = hex_key data_key2 = hex_key + "_2" trial = 0 good = [] bad = [] both = h2o_xl.xFcnOp1Set.union(h2o_xl.xFcnOp3Set) both = h2o_xl.xFcnOp1Set for fun in both: a = None try: result_key = data_key + "_" + str(trial) # copy the key Assign(data_key2, data_key) # a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True)) # a = Assign(result_key, Fcn('sum', KeyIndexed(data_key2, col=0), True)) # a = Assign(result_key, Fcn('xorsum', KeyIndexed(data_key2, col=0), True)) # a = Assign(result_key, Fcn('sqrt', KeyIndexed(data_key2, col=0))) # a = Assign(result_key, Fcn('ncol', KeyIndexed(data_key2, col=0))) # what's wrong with mean? 
if fun in ['ncol', 'asin', 'any.factor', 'sin', 'atan', 'tan', 'sign', 'log', 'exp', 'sqrt', 'abs', 'floor', 'ceiling', 'trunc','is.factor', 'is.na', 'any.na', 'nrow', 'tanh', 'length', 'acos', 'cos', 'sinh', 'cosh']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0))) good.append(fun) elif fun in ['sum', 'max', 'min', 'xorsum', 'sd']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True)) good.append(fun) elif fun in ['scale']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False)) good.append(fun) elif fun in ['round', 'signif']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1)) good.append(fun) elif fun in ['seq_len', 'rep_len']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 4)) good.append(fun) elif fun in ['seq']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1, 5, 1)) good.append(fun) elif fun in ['mean']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 0, False)) good.append(fun) elif fun in ['var']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False, False)) good.append(fun) elif fun in ['match']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), KeyIndexed(data_key2, col=0), 1, None)) good.append(fun) elif fun in ['unique']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, 10, 1)) good.append(fun) else: # bad functions kill h2o? a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), None)) bad.append(fun) # a = Fcn(fun, KeyIndexed(data_key, col=0), '%FALSE ') # a = Fcn(fun, data_key, '%FALSE') # a = Fcn(fun, data_key) # scalars? 
if 1==0: inspect = h2o_cmd.runInspect(key=result_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert numRows==1000, numRows assert numCols==1, numCols print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) except: if not a: # print dump_json(a.execResult) bad.append(fun) trial += 1 print "good:", good print "bad:", bad
sys.path.extend(['.', '..', '../..', 'py']) import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i # '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)', from h2o_xl import Def, Fcn, Assign, KeyIndexed from copy import copy, deepcopy print "Trying a different way, listing Rapids objects, rather than .ast() strings" # 'c' allowed # should be able to take a list of statements funsList = [ Def( 'anon', 'x', Assign('a', Fcn('var', 'x', None, False, None), do=False), Assign('b', Fcn('var', 'x', None, False, None), do=False), Assign('d', Fcn('var', 'x', None, False, None), do=False), Assign('e', Fcn('var', 'x', None, False, None), do=False), Assign('f', Fcn('var', 'x', None, False, None), do=False), Assign('g', Fcn('var', 'x', None, False, None), do=False), Assign('d', Fcn('var', 'x', None, False, None), do=False), Assign('i', Fcn('var', 'x', None, False, None), do=False), Assign('j', Fcn('var', 'x', None, False, None), do=False), Assign('k', Fcn('var', 'x', None, False, None), do=False), Assign('l', Fcn('var', 'x', None, False, None), do=False), Assign('m', Fcn('var', 'x', None, False, None), do=False), Assign('n', Fcn('var', 'x', None, False, None), do=False), Assign('o', Fcn('var', 'x', None, False, None), do=False), Assign('p', Fcn('var', 'x', None, False, None), do=False), Assign('q', Fcn('var', 'x', None, False, None), do=False),