예제 #1
0
    def test_exec2_runif(self):
        print "in h2o-dev, params are column, min, max, seed"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            # params for h2o-dev runif are: column, min, max, seed
            AssignObj('r0.hex', KeyIndexed('r.hex', col=0)),
            AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0),
                                    1)),
            AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1),
                                    -1)),
            AssignObj('s2.hex',
                      Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)),
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            results.append(result)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1 == 0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [
                0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567,
                1859.0, 1859.0
            ])
    def test_rapids_funs_basic3(self):
        DO_FAIL = False
        if DO_FAIL:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
        else:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []

        # works for 1 pass..why is execExpr set for 2nd pass? should be new instance?
        # if we reuse the same object in the list, it has state?
        # do we need to copy the object...hmm
        for trial in range(1):
            for execObj in funsList:
                freshObj = copy(execObj)
                result = freshObj.do()
                # rapids doesn't like complicated params right now?
                if DO_FAIL:
                    a = Assign('junk',
                               Fcn('anon', KeyIndexed('r1', col=0)),
                               do=False)
                else:
                    a = Assign('junk', Fcn('anon', 'r1'), do=False)
                result = a.do(timeoutSecs=60)

                # rows might be zero!
                if a.execResult['num_rows'] or a.execResult['num_cols']:
                    keys.append(a.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_rapids_funs_1000_stmnt(self):
        DO_FAIL = False
        if DO_FAIL:
            bucket = 'home-0xdiag-datasets'
            csvPathname = 'standard/covtype.data'
        else:
            bucket = 'smalldata'
            csvPathname = 'iris/iris_wheader.csv'

        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []

        for trial in range(3):
            for execObj in funsList:
                freshObj = copy(execObj)
                print "ast length:", len(str(freshObj))
                result = freshObj.do()

                # rapids doesn't like complicated params right now?
                if DO_FAIL:
                    a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0)))
                else:
                    a = Assign('junk', Fcn('anon', 'r1'))
                result = a.do(timeoutSecs=60)

                # rows might be zero!
                if a.execResult['num_rows'] or a.execResult['num_cols']:
                    keys.append(a.execExpr)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
예제 #4
0
    def test_exec2_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            hex_key=hex_key)
            pA = h2o_cmd.ParseObj(parseResultA)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            iA = h2o_cmd.InspectObj(pA.parse_key)

            k = Key(hex_key)
            colResultList = []
            for i in range(pA.numCols):
                result = Expr(Fcn('sum', k[:, i], True)).result
                colResultList.append(result)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
예제 #5
0
    def test_rapids_mean(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 5, 'cA', 200),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            data_key = hex_key
            data_key2 = hex_key + "_2"
            for trial in range(4):
                result_key = data_key + "_" + str(trial)
                # copy the key
                Assign(data_key2, data_key)
                Assign(result_key,
                       Fcn('mean', KeyIndexed(data_key2, col=0), 0, False))
                trial += 1
예제 #6
0
# new ...ability to reference cols
# src[ src$age<17 && src$zip=95120 && ... , ]
# can specify values for enums ..values are 0 thru n-1 for n enums
print "FIX!: need to test the && and || reduction operators"
initList = []

from h2o_xl import Key, AssignObj, Fcn

DO_SUM = False

r1 = Key('r1')

if DO_SUM:
    funstr = 'sum'
    exprList = [
        AssignObj('a', Fcn(funstr, r1[1], r1[2])),
        AssignObj('b', 1),
        AssignObj('b', Fcn(funstr, r1[1], r1[2])),
        AssignObj('d', 1),
        AssignObj('d', Fcn(funstr, r1[1], r1[2])),
        AssignObj('e', 1),
        AssignObj('e', Fcn(funstr, r1[1], r1[2])),
        AssignObj('f', 1),
        AssignObj('f', Fcn(funstr, r1[1], r1[2])),
        AssignObj('g', 1),
        AssignObj('g', Fcn(funstr, r1[1], r1[2])),
        AssignObj('h', 1),
        AssignObj('h', Fcn(funstr, r1[1], r1[2])),
    ]
else:
    funstr = 'log'
예제 #7
0
import unittest, random, sys, time
sys.path.extend(['.','..','../..','py'])
import h2o2 as h2o
import h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i
# '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)',

from h2o_xl import Def, Fcn, Assign, KeyIndexed
from copy import copy, deepcopy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
funsList = [
    Def('anon', 'x', 
        Assign('a', Fcn('var', 'x', None, False, None), do=False),
        Assign('b', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('e', Fcn('var', 'x', None, False, None), do=False),
        Assign('f', Fcn('var', 'x', None, False, None), do=False),
        Assign('g', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('i', Fcn('var', 'x', None, False, None), do=False),
        Assign('j', Fcn('var', 'x', None, False, None), do=False),
        Assign('k', Fcn('var', 'x', None, False, None), do=False),
        Assign('l', Fcn('var', 'x', None, False, None), do=False),
        Assign('m', Fcn('var', 'x', None, False, None), do=False),
        Assign('n', Fcn('var', 'x', None, False, None), do=False),
        Assign('o', Fcn('var', 'x', None, False, None), do=False),
        Assign('p', Fcn('var', 'x', None, False, None), do=False),
        Assign('q', Fcn('var', 'x', None, False, None), do=False),
import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i
# '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)',

from h2o_xl import Def, Fcn, Assign, KeyIndexed
from copy import copy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
keyString = 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz'
keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz'
keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz'
funsList = [
    Def('anon', 'x', 
        [Assign(key, Fcn('var', 'x', None, False, None), do=False) for key in keyString],
        
        [Assign(key, Fcn('sum', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('max', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('min', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('xorsum', KeyIndexed('x',col=0), False), do=False) for key in keyString],

        [Assign(key, Fcn('sd', KeyIndexed('x',col=0), False), do=False) for key in keyString],
        [Assign(key, Fcn('ncol', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('is.factor', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('any.factor', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('length', KeyIndexed('x',col=0)), do=False) for key in keyString],

        [Assign(key, Fcn('sin', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('asin', KeyIndexed('x',col=0)), do=False) for key in keyString],
        [Assign(key, Fcn('sinh', KeyIndexed('x',col=0)), do=False) for key in keyString],
    def test_rapids_overloaded_opr(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                result_key = data_key + "_" + str(i)
                Assign('s1', Seq(range(5)))

                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                # first try as object, then method
                Assign('s2', Fcn('c', Seq(range(5))))

                # just combine
                Assign('s3', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s3')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                Assign('s2', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                # can't have sequence of sequences?
                # make sure key is created with c()
                f = Fcn(
                    'c',
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s1', f)

                f = Col(
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s2', f)

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 313
                assert numCols == 1

                print "Now trying to do the functions with the alternate overloaded operators"
                data_key = Key(parse_key)
                result_key = Key()
                # what triggers immediate operation at h2o
                # as opposed to an object within a function

                result_key.frame = 'a1'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a2'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a3'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a4'
                result_key <<= data_key[Seq(range(1, 4)), 0:1]
                result_key.frame = 'a5'
                result_key <<= data_key[Seq(range(1, 4)), 0:1]

                result_key.frame = 'a6'
                result_key <<= data_key[[1, 2, 3], 1]

                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
예제 #10
0
    def test_rapids_row_range(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                result_key = data_key + "_" + str(i)
                # Assign('s1', Seq(range(5)) ).do
                Assign('s1', Seq(range(5)))

                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                # first try as object, then method
                Assign('s2', Fcn('c', Seq(range(5))))
                print dump_json(Xbase.lastExecResult)
                print dump_json(Xbase.lastResult)

                # just combine
                Assign('s3', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s3')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                Assign('s2', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                # can't have sequence of sequences?
                # make sure key is created with c()
                f = Fcn(
                    'c',
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s1', f)

                f = Col(
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s2', f)

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 313
                assert numCols == 1

                print "z1"
                Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5))))
                print "z2"
                Assign(
                    's1',
                    KeyIndexed(data_key,
                               row=Seq(Colon(99, 400), "#2", 1, range(1, 5))))

                print "z3"
                Assign(result_key, KeyIndexed(data_key, row='#1')).do
                print "z4"
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#1', '#100')))
                print "z5"
                Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100)))
                # this should fail rapids because of reverse msb/lsb
                # illegal, detected
                # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1')))
                print "z6"
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#-2', '#-1')))
                print "z7"
                Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1)))
                # illegal, detected
                # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2')))
                # take advantage of number to string conversion
                print "z8"
                Assign(result_key,
                       KeyIndexed(data_key, row=Colon('#1', rowCount - 10)))
                print "z9"
                Assign(result_key,
                       KeyIndexed(data_key, col=Colon(
                           '#1',
                           colCount - 1,
                       )))

                # no assign
                print "z10"
                result = KeyIndexed(data_key, row=Colon('#1',
                                                        rowCount - 10)).do()
                print "z11"
                # result = KeyIndexed(data_key, col=Colon('#1', colCount-1,)).do()

                # do some function translation
                print "z12"
                # result = Fcn('==', 1, KeyIndexed(data_key, col=Colon('#1', colCount-1,))).do()

                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
# So you want to use the long forms only when you are certain the vectors are length one.
#
# You should be absolutely certain your vectors are only length 1, such as in cases where they are functions that return only length 1 booleans. You want to use the short forms if the vectors are length possibly >1. So if you're not absolutely sure, you should either check first, or use the short form and then use all and any to reduce it to length one for use in control flow statements, like if.
#
# The functions all and any are often used on the result of a vectorized comparison to see if all or any of the comparisons are true, respectively. The results from these functions are sure to be length 1 so they are appropriate for use in if clauses, while the results from the vectorized comparison are not. (Though those results would be appropriate for use in ifelse.
#
# One final difference: the && and || only evaluate as many terms as they need to (which seems to be what is meant by short-circuiting). For example, here's a comparison using an undefined value a; if it didn't short-circuit, as & and | don't, it would give an error.
# also see http://www.burns-stat.com/pages/Tutor/R_inferno.pdf
initList = [
    AssignObj('a', [1, 0, 0]),
]

r1 = Key('r1')

exprList = [
    AssignObj('a', Fcn('&&', r1[1], r1[2])),
    AssignObj('b', 1),
    AssignObj('b', Fcn('&&', r1[1], r1[2])),
    AssignObj('d', 1),
    AssignObj('d', Fcn('&&', r1[1], r1[2])),
    AssignObj('e', 1),
    AssignObj('e', Fcn('||', r1[1], r1[2])),
    AssignObj('f', 1),
    AssignObj('f', Fcn('||', r1[1], r1[2])),
    AssignObj('g', 1),
    AssignObj('g', Fcn('||', r1[1], r1[2])),
    AssignObj('h', 1),
    AssignObj('h', Fcn('||', r1[1], r1[2])),
]

예제 #12
0
    def test_rapids_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 5, 'cA', 200),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                result_key = data_key + "_" + str(i)

                Assign('seq1', Seq(range(5)))
                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                Assign('seq2', Fcn('c', Seq(range(5))))
                inspect = h2o_cmd.runInspect(key='seq1')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)

                Assign('seq3', Col(Seq(range(5))))
                inspect = h2o_cmd.runInspect(key='seq2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)

                # can't have sequence of sequences?
                # make sure key is created with c()
                Assign(
                    'seq4',
                    Fcn(
                        'c',
                        Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                            range(50, 52))))

                inspect = h2o_cmd.runInspect(key='seq1')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)

                Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5))))
                Assign(
                    'seq5',
                    KeyIndexed(data_key,
                               row=Seq(Colon(99, 400), "#2", 1, range(1, 5))))

                # they need to be same size
                # Assign('seq6', Key('seq5') + Key('seq4') + Key('seq3'))

                # doesn't like my cut? complains on FALSE
                # Assign(result_key, Cut(KeyIndexed(data_key, col=0)))
                # Assign(result_key, Cut(KeyIndexed(data_key, col=1), breaks=3))

                Assign(result_key, Fcn('min', KeyIndexed(data_key, col=1),
                                       True))
                Assign(result_key, Fcn('max', KeyIndexed(data_key, col=1),
                                       True))
                Assign(result_key,
                       Fcn('mean', KeyIndexed(data_key, col=1), 0, False))

                Assign(result_key, KeyIndexed(data_key, row='#1'))
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#1', '#100')))
                Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100)))
                # this should fail rapids because of reverse msb/lsb
                # illegal, detected
                # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1')))
                Assign(result_key, KeyIndexed(data_key,
                                              row=Colon('#-2', '#-1')))
                Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1)))
                # illegal, detected
                # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2')))
                # take advantage of number to string conversion
                Assign(result_key,
                       KeyIndexed(data_key, row=Colon('#1', rowCount - 10)))
                Assign(result_key,
                       KeyIndexed(data_key, col=Colon(
                           '#1',
                           colCount - 1,
                       )))

                # no assign. Expr() complains when result has no key?
                Assign(result_key,
                       KeyIndexed(data_key, row=Colon('#1', rowCount - 10)))
                Assign(result_key,
                       KeyIndexed(data_key, col=Colon(
                           '#1',
                           colCount - 1,
                       )))

                # do some function translation
                Assign(
                    result_key,
                    Fcn('==', 1,
                        KeyIndexed(data_key, col=Colon(
                            '#1',
                            colCount - 1,
                        ))))

                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
    def test_exec2_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            (n, 10, 9, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount),
                                     random.randint(1, iColCount))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings

                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    # celChoice = str(random.choice(range(len(cel))))
                    celChoice = random.choice(range(len(cel)))
                    cutValue[c] = celChoice

                cutExprList = []

                pKey = Key('p')
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # cutExprList.append('p$C'+str(i+1)+'=='+c)
                        # all column indexing in h2o-dev is with number
                        e = Fcn('==', c, pKey[:, i])
                        cutExprList.append(e)

                cutExpr = None
                for ce in cutExprList:
                    if cutExpr:
                        cutExpr = Fcn('&', cutExpr, ce)
                    else:
                        cutExpr = ce

                print "cutExpr:", cutExpr

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                hKey = Key(hex_key)
                rowExpr = hKey[cutExpr, :]

                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            # print h2o.dump_json(inspect)

            # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            # error if any col has constant values
            # if len(constantValuesDict) != 0:
            #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # build up the columns
            Assign('b', [1, 2, 3])
            # could also append 1 col at a time, by assigning to the next col number?
            Assign('a', Cbind(['b' for i in range(colCount)]))

            for eKey in eKeys:
                Assign(eKey, 'a')
                ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                if 1 == 1:
                    start = time.time()
                    Assign(fKey, random.choice(rowExprList)).do()
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."

                    inspect = h2o_cmd.runInspect(key=fKey)
                    missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                        inspect)

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # FIX! put quantile back in?
                quantileTime = 0

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
from h2o_test import dump_json, verboseprint
from copy import copy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
objList = [
    Assign('e', IfElse(1, 2, IfElse(4, 5, IfElse(7, 8, 9))), do=False),
    Assign('f', IfElse(1, 2, IfElse(4, 5, IfElse(7, 8, 9))), do=False),
    Assign('g', IfElse(0, 2, IfElse(0, 5, IfElse(0, 8, 9))), do=False),
    Def('ms', 'x', [
        IfElse(0, 2, IfElse(0, 5, IfElse(0, 8, 9))),
        Assign('k', IfElse(0, 12, IfElse(0, 15, IfElse(0, 18, 19))), do=False),
    ]),
    Assign('e', Fcn('ms', 2), do=False),
    Def('ms', 'x', [
        If(0, Return(3)),
        IfElse(0, 5, IfElse(0, 8, 9)),
        Assign('k', IfElse(0, 12, IfElse(0, 15, IfElse(0, 18, 19))), do=False),
        If(1, Return(2)),
    ]),
    Assign('e', Fcn('ms', 2), do=False),
]

resultList = [
    None,
    None,
    None,
    None,
    19,
예제 #15
0
    def test_rapids_funs_1op(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=False)

            inspect = h2o_cmd.runInspect(key=hex_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(numCols, colCount,
                "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
            self.assertEqual(numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            data_key2 = hex_key + "_2"
            trial = 0
            good = []
            bad = []
            both = h2o_xl.xFcnOp1Set.union(h2o_xl.xFcnOp3Set)
            both = h2o_xl.xFcnOp1Set
            for fun in both:

                a = None
                try:
                    result_key = data_key + "_" + str(trial)
                    # copy the key
                    Assign(data_key2, data_key)

                    # a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True))

                    # a = Assign(result_key, Fcn('sum', KeyIndexed(data_key2, col=0), True))
                    # a = Assign(result_key, Fcn('xorsum', KeyIndexed(data_key2, col=0), True))
                    # a = Assign(result_key, Fcn('sqrt', KeyIndexed(data_key2, col=0)))
                    # a = Assign(result_key, Fcn('ncol', KeyIndexed(data_key2, col=0)))

                    # what's wrong with mean?
                    if fun in ['ncol', 'asin', 'any.factor', 'sin', 'atan', 'tan', 'sign', 'log', 'exp', 'sqrt', 'abs', 'floor', 'ceiling', 'trunc','is.factor', 'is.na', 'any.na', 'nrow', 'tanh', 'length', 'acos', 'cos', 'sinh', 'cosh']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0)))
                        good.append(fun)
                    elif fun in ['sum', 'max', 'min', 'xorsum', 'sd']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True))
                        good.append(fun)
                    elif fun in ['scale']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False))
                        good.append(fun)
                    elif fun in ['round', 'signif']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1))
                        good.append(fun)
                    elif fun in ['seq_len', 'rep_len']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 4))
                        good.append(fun)
                    elif fun in ['seq']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1, 5, 1))
                        good.append(fun)
                    elif fun in ['mean']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 0, False))
                        good.append(fun)
                    elif fun in ['var']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False, False))
                        good.append(fun)
                    elif fun in ['match']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), KeyIndexed(data_key2, col=0), 1, None))
                        good.append(fun)
                    elif fun in ['unique']:
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, 10, 1))
                        good.append(fun)
                    else:
                        # bad functions kill h2o?
                        a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), None))
                        bad.append(fun)

                        # a = Fcn(fun, KeyIndexed(data_key, col=0), '%FALSE ')
                        # a = Fcn(fun, data_key, '%FALSE')
                        # a = Fcn(fun, data_key)

                    # scalars?
                    if 1==0:
                        inspect = h2o_cmd.runInspect(key=result_key)
                        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                        assert numRows==1000, numRows
                        assert numCols==1, numCols

                        print "\n" + csvPathname, \
                            "    numRows:", "{:,}".format(numRows), \
                            "    numCols:", "{:,}".format(numCols)

                except: 
                    if not a:
                        # print dump_json(a.execResult)
                        bad.append(fun)

                trial += 1

            print "good:", good
            print "bad:", bad
예제 #16
0
sys.path.extend(['.', '..', '../..', 'py'])
import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_import as h2i
# '(def anon {x} ( (var %x "null" %FALSE "null");;(var %x "null" %FALSE "null") );;;)',

from h2o_xl import Def, Fcn, Assign, KeyIndexed
from copy import copy, deepcopy

print "Trying a different way, listing Rapids objects, rather than .ast() strings"

# 'c' allowed
# should be able to take a list of statements
funsList = [
    Def(
        'anon',
        'x',
        Assign('a', Fcn('var', 'x', None, False, None), do=False),
        Assign('b', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('e', Fcn('var', 'x', None, False, None), do=False),
        Assign('f', Fcn('var', 'x', None, False, None), do=False),
        Assign('g', Fcn('var', 'x', None, False, None), do=False),
        Assign('d', Fcn('var', 'x', None, False, None), do=False),
        Assign('i', Fcn('var', 'x', None, False, None), do=False),
        Assign('j', Fcn('var', 'x', None, False, None), do=False),
        Assign('k', Fcn('var', 'x', None, False, None), do=False),
        Assign('l', Fcn('var', 'x', None, False, None), do=False),
        Assign('m', Fcn('var', 'x', None, False, None), do=False),
        Assign('n', Fcn('var', 'x', None, False, None), do=False),
        Assign('o', Fcn('var', 'x', None, False, None), do=False),
        Assign('p', Fcn('var', 'x', None, False, None), do=False),
        Assign('q', Fcn('var', 'x', None, False, None), do=False),