Exemplo n.º 1
0
    def test_GLM2_tnc3_10(self):
        h2o.beta_features = True
        csvFilename = 'tnc3_10.csv'
        print "\n" + csvFilename
        hex_key = "tnc3.hex"
        h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, schema='put', hex_key=hex_key, timeoutSecs=10)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if (1==0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseResult
            # in any case, the destination_key in parseResult was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'


        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if (1==0):
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Exemplo n.º 2
0
    def test_tnc3_ignore(self):
        csvFilename = 'tnc3_10.csv'
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
        print "Parse result['Key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if (1==0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseKey
            # in any case, the destination_key in parseKey was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'


        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if (1==0):
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1==1):
            start = time.time()
            kwargs = {'y': 13, 'num_cross_validation_folds': 6}
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Exemplo n.º 3
0
    def test_tnc3_ignore(self):
        csvFilename = "tnc3.csv"
        csvPathname = h2o.find_file("smalldata/" + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1)
        print "Parse result['Key']:", parseKey["destination_key"]
        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if 1 == 0:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes, numExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10
            )
            print "\ncolResultList after num swap", colResultList

        if 1 == 1:
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        # ******************
        if 1 == 0:
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes, charExprList, key2, maxCol=10, incrementingResult=False, timeoutSecs=10
            )
            print "\ncolResultList after char swap", colResultList

        if 1 == 1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Exemplo n.º 4
0
    def test_tnc3_ignore(self):
        csvPathname = 'tnc3.csv'
        print "\n" + csvPathname
        hex_key = "tnc3.hex"
        ### h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put', 
            timeoutSecs=10, header=1)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if 1==1:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, ignore="boat,body")

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if 1==0:
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if 1==1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Exemplo n.º 5
0
    def test_tnc3_ignore(self):
        csvFilename = 'tnc3.csv'
        csvPathname = h2o.find_file('smalldata/' + csvFilename)
        print "\n" + csvPathname
        key2 = "tnc3.hex"
        h2b.browseTheCloud()

        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10, header=1)
        print "Parse result['Key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        if 1==1:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, ignore="boat,body", csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        if 1==0:
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, key2, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if 1==1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRF(trees=5, timeoutSecs=10, csvPathname=csvPathname)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
Exemplo n.º 6
0
    def test_GLM2_tnc3_10(self):
        h2o.beta_features = True
        csvFilename = "tnc3_10.csv"
        print "\n" + csvFilename
        hex_key = "tnc3.hex"

        parseResult = h2i.import_parse(
            bucket="smalldata", path=csvFilename, schema="put", hex_key=hex_key, timeoutSecs=10
        )
        print "Parse result['Key']:", parseResult["destination_key"]
        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
        ### time.sleep(10)

        if 1 == 0:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes, numExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10
            )
            print "\ncolResultList after num swap", colResultList

        if 1 == 1:
            start = time.time()
            kwargs = {"response": 13, "n_folds": 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseResult
            # in any case, the destination_key in parseResult was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, "took", time.time() - start, "seconds"

        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])

        # ******************
        if 1 == 0:
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes, charExprList, hex_key, maxCol=10, incrementingResult=False, timeoutSecs=10
            )
            print "\ncolResultList after char swap", colResultList

        if 1 == 1:
            start = time.time()
            kwargs = {"response": 13, "n_folds": 6}
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, "took", time.time() - start, "seconds"

        inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
    def test_GLM_many_cols_int2cat(self):
        """For several (rows, cols) sizes: write a random CSV, parse it, exec a
        colSwap/factor() over every input column (int -> enum), then run GLM
        with response y = colCount.

        Depends on module-level names not visible here: write_syn_dataset,
        paramDict, and the h2o* / h2b / h2e / h2i helper modules.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (rowCount, colCount, destination key2, GLM timeout in seconds)
        tryList = [
            (10000,  10, 'cA.hex', 100),
            (10000,  20, 'cB.hex', 200),
            (10000,  30, 'cC.hex', 300),
            (10000,  40, 'cD.hex', 400),
            (10000,  50, 'cE.hex', 500),
            ]

        ### h2b.browseTheCloud()

        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
                ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
            ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # fresh seed per file so each generated dataset is distinct
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=90)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount, 
                timeoutSecs=90, incrementingResult=False)
            print "\nexec colResultList", colResultList

            # take the first value of each paramDict entry as this run's GLM params
            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # since we add the output twice, it's no longer colCount-1
            y = colCount
            kwargs = {'y': y, 'max_iter': 50, 'case': 1}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            # only col y-1 (next to last)doesn't get renamed in coefficients 
            # due to enum/categorical expansion
            print "y:", y 
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(3)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Exemplo n.º 8
0
    def test_sum_import_hosts(self):
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"
        h2i.setupImportFolder(None, importFolderPath)

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        #    ("covtype20x.data", "cD", 50, 20),
        #    ("covtype200x.data", "cE", 50, 200),
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype.data", "cB", 5, 1),
            ("covtype.data", "cC", 5, 1),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 key2=key2,
                                                 timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes,
                exprList,
                key2,
                minCol=0,
                maxCol=54,
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Exemplo n.º 9
0
    def test_exec2_int2cat_nested(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000,  10, 'cA', 100),
            (1000,  20, 'cB', 100),
            (1000,  30, 'cC', 100),
            (1000,  40, 'cD', 100),
            (1000,  10, 'cE', 100),
            (1000,  20, 'cF', 100),
            (1000,  30, 'cG', 100),
            (1000,  40, 'cH', 100),
            ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);',
                '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);' \
                ]

        exprList = [
                '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
                '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                '<keyX>[,1] = factor(<keyX>[,<col2>]);' \
                ]

        exprList = [
                '<keyX>[,2] = factor(<keyX>[,2])',
                ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the exec commands across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount, 
                timeoutSecs=30, incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Exemplo n.º 10
0
    def test_many_cols_int2cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 10, 'cA', 100),
            (1000, 20, 'cB', 100),
            (1000, 30, 'cC', 100),
            (1000, 40, 'cD', 100),
            (1000, 10, 'cE', 100),
            (1000, 20, 'cF', 100),
            (1000, 30, 'cG', 100),
            (1000, 40, 'cH', 100),
        ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        if 1 == 0:
            exprList = [
                '<keyX> = colSwap(<keyX>,<col1>,' + 'colSwap(<keyX>,<col2>,' +
                'colSwap(<keyX>,<col1>,' + 'colSwap(<keyX>,<col2>,' +
                '<keyX>[0]' + '))))',
            ]
        else:
            exprList = [
                '<keyX> = colSwap(<keyX>,<col1>,' + '<keyX>[0]' + ')',
            ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(
                None,
                exprList,
                key2,
                maxCol=colCount,
                timeoutSecs=30,
                incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Exemplo n.º 11
0
    def test_sum_import_hosts(self):
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        #    ("covtype20x.data", "cD", 50, 20),
        #    ("covtype200x.data", "cE", 50, 200),
        csvFilenameAll = [
            ("covtype.data", "cA", 5, 1),
            ("covtype.data", "cB", 5, 1),
            ("covtype.data", "cC", 5, 1),
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ## h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        importFolderPath = "standard"

        firstDone = False
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename
            # creates csvFilename.hex from file in importFolder dir
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets',
                path=csvPathname,
                hex_key=hex_key,
                timeoutSecs=2000)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes,
                exprList,
                hex_key,
                minCol=0,
                maxCol=54,
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Exemplo n.º 12
0
    def test_rf_tnc3_fvec(self):
        h2o.beta_features = True
        csvPathname = 'tnc3.csv'
        print "\n" + csvPathname
        hex_key = "tnc3.hex"
        ### h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put', 
            timeoutSecs=10, retryDelaySecs=0.25, header=1)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")

        if 1==1:
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, numExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1==1):
            print "\nWe're not CM data getting back from RFView.json that we can check!. so look at the browser"
            print 'The good case with ignore="boat,body"'
            rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25,
                ignored_cols_by_name="boat,body")

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5)

        #******************
        if 1==0:
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, charExprList, hex_key, maxCol=10,
                incrementingResult=False, timeoutSecs=10, retryDelaySecs=0.25)
            print "\ncolResultList after char swap", colResultList

        if 1==1:
            print "\nNow the bad case (no ignore)"
            rfv = h2o_cmd.runRF(parseResult=parseResult, trees=5, timeoutSecs=10, retryDelaySecs=0.25)

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrl(retryDelaySecs=0.5)
Exemplo n.º 13
0
    def test_exec2_int2cat_nested(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 10, 'cA', 100),
            (1000, 20, 'cB', 100),
            (1000, 30, 'cC', 100),
            (1000, 40, 'cD', 100),
            (1000, 10, 'cE', 100),
            (1000, 20, 'cF', 100),
            (1000, 30, 'cG', 100),
            (1000, 40, 'cH', 100),
        ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>[,<col2>] = <keyX>[,<col1>];',
                '<keyX>[,<col1>] = <keyX>[,1];',
                '<keyX>[,1] = <keyX>[,<col2>];',
                '<keyX>[,<col2>] = <keyX>[,<col1>];',
                '<keyX>[,<col1>] = <keyX>[,1];',
                '<keyX>[,1] = <keyX>[,<col2>];' \
                ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the exec commands across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(
                None,
                exprList,
                hex_key,
                maxCol=colCount,
                timeoutSecs=30,
                incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
Exemplo n.º 14
0
    def test_sum_import_hosts(self):
        """Sum columns of covtype at 1x/20x/200x replication via import folder.

        Each replicated dataset's per-column sums, divided by its
        replication factor (resultMult), must exactly equal the sums
        from the first (1x) parse.
        """
        # just do the import folder once
        # importFolderPath = "/home/hduser/hdfs_datasets"
        importFolderPath = "/home/0xdiag/datasets"

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        if localhost:
            # single-node run: only the 1x dataset, parsed three times
            csvFilenameAll = [
                ("covtype.data", "cA", 5,  1),
                ("covtype.data", "cB", 5,  1),
                ("covtype.data", "cC", 5,  1),
            ]
        else:
            # (filename, destination key, timeoutSecs, replication multiplier)
            csvFilenameAll = [
                ("covtype.data", "cA", 5,  1),
                ("covtype20x.data", "cD", 50, 20),
                ("covtype200x.data", "cE", 50, 200),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, key2, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            h2i.setupImportFolder(None, importFolderPath)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                key2=key2, timeoutSecs=2000)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # run the sum exprs across the 54 covtype input columns
            # NOTE(review): timeoutSecs from the tuple is used here, not for parse
            colResultList = h2e.exec_expr_list_across_cols(lenNodes, exprList, key2, maxCol=54, 
                timeoutSecs=timeoutSecs)
            print "\n*************"
            print "colResultList", colResultList
            print "*************"

            if not firstDone:
                # the first dataset establishes the expected (1x) baseline sums
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0] 
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x)/resultMult for x in colResultList] 
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
# ===== Example 15 (scraped-sample separator) =====
    def test_exec2_sum(self):
        h2o.beta_features = True
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['Key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(
                lenNodes,
                exprList,
                hex_key,
                maxCol=54,
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
# ===== Example 16 (scraped-sample separator) =====
    def test_many_cols_int2cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000,  10, 'cA', 100),
            (1000,  20, 'cB', 100),
            (1000,  30, 'cC', 100),
            (1000,  40, 'cD', 100),
            (1000,  10, 'cE', 100),
            (1000,  20, 'cF', 100),
            (1000,  30, 'cG', 100),
            (1000,  40, 'cH', 100),
            ]

        ### h2b.browseTheCloud()
        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        if 1==0:
            exprList = [
                    '<keyX> = colSwap(<keyX>,<col1>,' +
                                 'colSwap(<keyX>,<col2>,' +
                                 'colSwap(<keyX>,<col1>,' +
                                 'colSwap(<keyX>,<col2>,' +
                                 '<keyX>[0]' +
                                 '))))',
                ]
        else:
            exprList = [
                    '<keyX> = colSwap(<keyX>,<col1>,' + 
                                 '<keyX>[0]' +
                                 ')',
                ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=10)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, key2, maxCol=colCount, 
                timeoutSecs=30, incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
# ===== Example 17 (scraped-sample separator) =====
    def test_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' +
                                      filename1x)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, key2, timeoutSecs, resultMult) in csvAll:
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=key2,
                                         timeoutSecs=2000)
            print "Parse result['Key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(
                lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
# ===== Example 18 (scraped-sample separator) =====
    def test_int2cat_factor_factor(self):
        """Turn input columns of random datasets into enums via factor().

        For each generated dataset, runs the colSwap/factor exec template
        across all input columns: col1 is replaced with factor(col 0),
        making the integer column categorical (and duplicating col 0).
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (rowCount, colCount, destination key, timeoutSecs)
        tryList = [
            (10000,  10, 'cA.hex', 100),
            (10000,  20, 'cB.hex', 200),
            (10000,  30, 'cC.hex', 300),
            (10000,  40, 'cD.hex', 400),
            (10000,  50, 'cE.hex', 500),
            ]

        ### h2b.browseTheCloud()

        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
                '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[0]))', 
                ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[0])',
                ### '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))', 
                ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # per-file seed so each generated dataset is independent
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=90)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            # NOTE(review): timeoutSecs from tryList is unused; exec uses a flat 90s
            colResultList = h2e.exec_expr_list_across_cols(None, exprList, hex_key, maxCol=colCount, 
                timeoutSecs=90, incrementingResult=False)
            print "\nexec colResultList", colResultList

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
# ===== Example 19 (scraped-sample separator) =====
    def test_exec2_sum(self):
        h2o.beta_features = True
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('datasets', 'UCI/UCI-large/covtype/covtype.data', returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5,  1),
            (pathname2x, "cB", 5,  2),
            (pathname2x, "cC", 5,  2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['Key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key, maxCol=54, 
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0] 
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x)/resultMult for x in colResultList] 
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
    def test_many_fp_formats_libsvm (self):
        """Parse synthetic libsvm data in a random fp format and verify it.

        For one random format case (sel in 0..47) per dataset shape:
        generate a sparse dataset, parse it, then check column count,
        row count, per-column Inspect metadata (name/size/type/min/max/
        scale/missing/enum domain) against the expected synthetic spec,
        and per-column sums (via exec) against the sums recorded while
        generating the data.
        """
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (rowCount, colCount, key prefix, timeoutSecs, sparsity distribution)
        tryList = [
            (10, 10, 'cA', 30, 'sparse50'),
            (100, 10, 'cB', 30, 'sparse'),
            (100000, 100, 'cC', 30, 'sparse'),
            (1000, 10, 'cD', 30, 'sparse50'),
            (100, 100, 'cE', 30,'sparse'),
            (100, 100, 'cF', 30,'sparse50'),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
            # for sel in range(48): # len(caseList)
            for sel in [random.randint(0,47)]: # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (synColSumDict, colNumberMax)  = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

                # make the destination key unique per format case
                selKey2 = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, timeoutSecs=timeoutSecs)
                print csvFilename, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values, 
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(y=0,
                    key=parseResult['destination_key'], timeoutSecs=300)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)


                # we might have added some zeros at the end, that our colNumberMax won't include
                print synColSumDict.keys(), colNumberMax
                self.assertEqual(colNumberMax+1, num_cols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))

                # Exec (column sums)*************************************************
                h2e.exec_zero_list(zeroList)
                # how do we know the max dimension (synthetic may not generate anything for the last col)
                # use num_cols?. num_cols should be <= colCount. 

                colSumList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                    timeoutSecs=timeoutSecs)

                self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                print "\ncolSumList:", colSumList
                print "\nsynColSumDict:", synColSumDict

                for k,v in synColSumDict.iteritems():
                    if k > colNumberMax: # ignore any extra 0 cols at the end
                        continue

                    # k should be integers that match the number of cols
                    self.assertTrue(k>=0 and k<len(colSumList), msg="k: %s len(colSumList): %s num_cols: %s" % (k, len(colSumList), num_cols))

                    # build the expected Inspect metadata for this column
                    syn = {}
                    if k==0: 
                        syn['name'] = "Target"
                        syn['size'] = {1,2} # can be two if we actually used the full range 0-255 (need extra for h2o NA)
                        syn['type'] = {'int'}
                        syn['min'] = classMin
                        syn['max'] = classMax
                        # don't check these for the col 0 'Target'
                        syn['scale'] = {1}
                        # syn['base'] = 0
                        # syn['variance'] = 0
                    elif k==1: # we forced this to always be 0
                        syn['name'] = "V" + str(k)
                        syn['size'] = {1}
                        syn['type'] = {'int'}
                        syn['min'] = 0
                        syn['max'] = 0
                        syn['scale'] = {1}
                        syn['base'] = 0
                        syn['variance'] = 0
                    else:
                        syn['name'] = "V" + str(k)
                        syn['size'] = {1,2,4,8} # can be 2, 4 or 8? maybe make this a set for membership check
                        syn['type'] = {'int', 'float'}
                        syn['min'] = valMin
                        syn['max'] = valMax
                        syn['scale'] = {1,10,100,1000}
                        # syn['base'] = 0
                        # syn['variance'] = 0

                    syn['num_missing_values'] = 0
                    syn['enum_domain_size'] = 0
                    # syn['min'] = 0
                    # syn['max'] = 0
                    # syn['mean'] = 0

                    cols = inspect['cols'][k]
                    for synKey in syn:
                        # we may not see the min/max range of values that was bounded by our gen, but 
                        # we can check that it's a subset of the allowed range
                        if synKey == 'min':
                            self.assertTrue(syn[synKey] <= cols[synKey],
                                msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'max':
                            self.assertTrue(syn[synKey] >= cols[synKey],
                                msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                            if cols[synKey] not in syn[synKey]:
                                # for debug of why it was a bad size
                                print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                                print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                                raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                        else:
                            self.assertEqual(syn[synKey], cols[synKey],
                                msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))
                    
                    colSum = colSumList[k]
                    print "\nComparing col", k, "sums:", v, colSum
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(float(v), colSum, places=0, 
                        msg='%0.6f col sum is not equal to expected %0.6f' % (v, colSum))
# ===== Example 21 (scraped-sample separator) =====
    def test_exec2_int2cat_nested(self):
        """Run factor() assignment exprs, chosen by DO_CASE, across all cols.

        DO_CASE (module-level) selects one of three exec expression lists;
        each converts columns to enums via factor() with different target
        columns and orderings.  The chosen exprList is then run across
        every input column of each generated random dataset.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (rowCount, colCount, destination key, timeoutSecs)
        tryList = [
            (100000, 10, 'cA', 100),
            (100000, 20, 'cB', 100),
            (100000, 30, 'cC', 100),
        ]

        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!

        if DO_CASE == 0:
            # rotate factored values through col2 -> col1 -> col 1, twice
            exprList = [
                    '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
                    '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                    '<keyX>[,1] = factor(<keyX>[,<col2>]);',
                    '<keyX>[,<col2>] = factor(<keyX>[,<col1>]);',
                    '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                    '<keyX>[,1] = factor(<keyX>[,<col2>]);' \
                    ]

        elif DO_CASE == 1:
            # same shape as case 0, but col1 is factored in place first
            exprList = [
                    '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
                    '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                    '<keyX>[,1] = factor(<keyX>[,<col2>]);',
                    '<keyX>[,<col1>] = factor(<keyX>[,<col1>]);',
                    '<keyX>[,<col1>] = factor(<keyX>[,1]);',
                    '<keyX>[,1] = factor(<keyX>[,<col2>]);' \
                    ]

        elif DO_CASE == 2:
            # simplest case: factor column 2 in place only
            exprList = [
                '<keyX>[,2] = factor(<keyX>[,2])',
            ]

        else:
            raise Exception("Bad case: %s" % DO_CASE)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # per-file seed so each generated dataset is independent
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the exec commands across all input cols"
            # NOTE(review): timeoutSecs from tryList is unused; exec uses a flat 30s
            colResultList = h2e.exec_expr_list_across_cols(
                None,
                exprList,
                hex_key,
                maxCol=colCount,
                timeoutSecs=30,
                incrementingResult=False)
            print "\nexec colResultList", colResultList
# ===== Example 22 (scraped-sample separator) =====
    def test_GLM_tnc3_10(self):
        """Parse smalldata tnc3_10.csv and run 6-fold GLM (y=13) twice.

        The (1 == 0) / (1 == 1) conditionals are hand-flipped developer
        toggles: both exec column-swap passes are currently disabled and
        both GLM passes are enabled.
        """
        csvFilename = 'tnc3_10.csv'
        print "\n" + csvFilename
        hex_key = "tnc3.hex"
        h2b.browseTheCloud()

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=10)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(10)

        # disabled toggle: numeric column-swap exec pass
        if (1 == 0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes,
                numExprList,
                hex_key,
                maxCol=10,
                incrementingResult=False,
                timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        # enabled: first GLM pass
        if (1 == 1):
            start = time.time()
            kwargs = {'y': 13, 'n_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseResult
            # in any case, the destination_key in parseResult was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=300,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        #******************
        # disabled toggle: char column-swap exec pass
        # NOTE(review): lenNodes is only defined inside the first disabled
        # block above; enabling this block alone would raise NameError
        if (1 == 0):
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes,
                charExprList,
                hex_key,
                maxCol=10,
                incrementingResult=False,
                timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        # enabled: second GLM pass (same params, after the exec toggles)
        if (1 == 1):
            start = time.time()
            kwargs = {'y': 13, 'n_folds': 6}
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=300,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
        ### time.sleep(3600)
        h2b.browseJsonHistoryAsUrlLastMatch("RFView")

        if not h2o.browse_disable:
            ### print "\n <ctrl-C> to quit sleeping here"
            ### time.sleep(1500)
            pass
# ===== Example 23 (scraped-sample separator) =====
    def test_GLM_many_cols_int2cat(self):
        """factor() every input column of random datasets, then run GLM.

        For each generated dataset: parse it, run the colSwap/factor exec
        template across all input columns (converting each to an enum and
        duplicating the output column), then fit a GLM with y = colCount
        and case=1, and sanity-check the model.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (rowCount, colCount, destination key, timeoutSecs)
        tryList = [
            (10000, 10, 'cA.hex', 100),
            (10000, 20, 'cB.hex', 200),
            (10000, 30, 'cC.hex', 300),
            (10000, 40, 'cD.hex', 400),
            (10000, 50, 'cE.hex', 500),
        ]

        ### h2b.browseTheCloud()

        # we're going to do a special exec across all the columns to turn them into enums
        # including the duplicate of the output!
        exprList = [
            '<keyX>= colSwap(<keyX>,<col1>,factor(<keyX>[<col1>]))',
            ### '<keyX>= colSwap(<keyX>,<col1>,<keyX>[<col1>])',
        ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # per-file seed so each generated dataset is independent
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=90)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename

            print "\nNow running the int 2 enum exec command across all input cols"
            colResultList = h2e.exec_expr_list_across_cols(
                None,
                exprList,
                key2,
                maxCol=colCount,
                timeoutSecs=90,
                incrementingResult=False)
            print "\nexec colResultList", colResultList

            # take the first value of each GLM param list as this run's config
            paramDict2 = {}
            for k in paramDict:
                paramDict2[k] = paramDict[k][0]
            # since we add the output twice, it's no longer colCount-1
            y = colCount
            kwargs = {'y': y, 'max_iter': 50, 'case': 1}
            kwargs.update(paramDict2)

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            # only col y-1 (next to last)doesn't get renamed in coefficients
            # due to enum/categorical expansion
            print "y:", y
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            if not h2o.browse_disable:
                h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                time.sleep(3)
                h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                time.sleep(3)
    def test_many_fp_formats_libsvm_fvec(self):
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, "cA", 30, "sparse50"),
            (100, 10, "cB", 30, "sparse"),
            (100000, 100, "cC", 30, "sparse"),
            (1000, 10, "cD", 30, "sparse50"),
            (100, 100, "cE", 30, "sparse"),
            (100, 100, "cF", 30, "sparse50"),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)

                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (synColSumDict, colNumberMax) = write_syn_dataset(
                    csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution
                )

                selKey2 = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=selKey2, timeoutSecs=timeoutSecs)
                print "Parse result['destination_key']:", parseResult["destination_key"]
                inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
                numCols = inspect["numCols"]
                numRows = inspect["numRows"]
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values,
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(y=0, key=parseResult["destination_key"], timeoutSecs=300)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=360)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                # we might have added some zeros at the end, that our colNumberMax won't include
                print synColSumDict.keys(), colNumberMax
                self.assertEqual(
                    colNumberMax + 1,
                    numCols,
                    msg="generated %s cols (including output). parsed to %s cols" % (colNumberMax + 1, numCols),
                )

                # Exec (column sums)*************************************************
                h2e.exec_zero_list(zeroList)
                # how do we know the max dimension (synthetic may not generate anything for the last col)
                colSumList = h2e.exec_expr_list_across_cols(
                    None, exprList, selKey2, maxCol=colNumberMax + 1, timeoutSecs=timeoutSecs
                )

                self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                print "\ncolSumList:", colSumList
                print "\nsynColSumDict:", synColSumDict

                for k, v in synColSumDict.iteritems():
                    if k > colNumberMax:  # ignore any extra 0 cols at the end
                        continue

                    # k should be integers that match the number of cols
                    self.assertTrue(
                        k >= 0 and k < len(colSumList),
                        msg="k: %s len(colSumList): %s numCols: %s" % (k, len(colSumList), numCols),
                    )

                    syn = {}
                    if k == 0:
                        syn["name"] = "C1"
                        syn["type"] = {"Int"}
                        syn["min"] = classMin
                        syn["max"] = classMax
                        # don't check these for the col 0 'Target'
                        # syn['scale'] = {1}
                    elif k == 1:  # we forced this to always be 0
                        syn["name"] = "C2"
                        syn["type"] = {"Int"}
                        syn["min"] = 0
                        syn["max"] = 0
                        # syn['scale'] = {1}
                    else:
                        syn["name"] = "C" + str(k + 1)
                        syn["type"] = {"Int", "Real"}
                        syn["min"] = valMin
                        syn["max"] = valMax
                        # syn['scale'] = {1,10,100,1000}

                    syn["naCnt"] = 0
                    syn["cardinality"] = -1
                    # syn['min'] = 0
                    # syn['max'] = 0
                    # syn['mean'] = 0

                    cols = inspect["cols"][k]
                    for synKey in syn:
                        # we may not see the min/max range of values that was bounded by our gen, but
                        # we can check that it's a subset of the allowed range
                        if synKey == "min":
                            self.assertTrue(
                                syn[synKey] <= cols[synKey],
                                msg="col %s %s %s should be <= %s" % (k, synKey, cols[synKey], syn[synKey]),
                            )
                        elif synKey == "max":
                            self.assertTrue(
                                syn[synKey] >= cols[synKey],
                                msg="col %s %s %s should be >= %s" % (k, synKey, cols[synKey], syn[synKey]),
                            )
                        elif synKey == "type":
                            if cols[synKey] not in syn[synKey]:
                                print "cols min/max:", cols["min"], cols["max"]
                                print "syn min/max:", syn["min"], syn["max"]
                                raise Exception(
                                    "col %s %s %s should be in this allowed %s" % (k, synKey, cols[synKey], syn[synKey])
                                )
                        else:
                            self.assertEqual(
                                syn[synKey],
                                cols[synKey],
                                msg="col %s %s %s should be %s" % (k, synKey, cols[synKey], syn[synKey]),
                            )

                    colSum = colSumList[k]
                    print "\nComparing col", k, "sums:", v, colSum
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(
                        float(v), colSum, places=0, msg="%0.6f col sum is not equal to expected %0.6f" % (v, colSum)
                    )
Exemplo n.º 25
0
    def test_many_fp_formats_libsvm(self):
        # h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 10, 'cA', 30, 'sparse50'),
            (100, 10, 'cB', 30, 'sparse'),
            (100000, 100, 'cC', 30, 'sparse'),
            (1000, 10, 'cD', 30, 'sparse50'),
            (100, 100, 'cE', 30, 'sparse'),
            (100, 100, 'cF', 30, 'sparse50'),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)

                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel,
                                                       rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (synColSumDict,
                 colNumberMax) = write_syn_dataset(csvPathname, rowCount,
                                                   colCount, SEEDPERFILE, sel,
                                                   distribution)

                selKey2 = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=selKey2,
                                               timeoutSecs=timeoutSecs)
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values,
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0, key=parseResult['destination_key'], timeoutSecs=300)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2,
                                                       timeoutSecs=360)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                # we might have added some zeros at the end, that our colNumberMax won't include
                print synColSumDict.keys(), colNumberMax
                self.assertEqual(
                    colNumberMax + 1,
                    numCols,
                    msg=
                    "generated %s cols (including output).  parsed to %s cols"
                    % (colNumberMax + 1, numCols))

                # Exec (column sums)*************************************************
                h2e.exec_zero_list(zeroList)
                # how do we know the max dimension (synthetic may not generate anything for the last col)
                # use numCols?. numCols should be <= colCount.

                colSumList = h2e.exec_expr_list_across_cols(
                    None,
                    exprList,
                    selKey2,
                    maxCol=colNumberMax + 1,
                    timeoutSecs=timeoutSecs)

                self.assertEqual(rowCount,
                                 numRows,
                                 msg="generated %s rows, parsed to %s rows" %
                                 (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                print "\ncolSumList:", colSumList
                print "\nsynColSumDict:", synColSumDict

                for k, v in synColSumDict.iteritems():
                    if k > colNumberMax:  # ignore any extra 0 cols at the end
                        continue

                    # k should be integers that match the number of cols
                    self.assertTrue(
                        k >= 0 and k < len(colSumList),
                        msg="k: %s len(colSumList): %s numCols: %s" %
                        (k, len(colSumList), numCols))

                    syn = {}
                    if k == 0:
                        syn['name'] = "C1"
                        syn['type'] = {'Int'}
                        syn['min'] = classMin
                        syn['max'] = classMax
                        # don't check these for the col 0 'Target'
                        # syn['scale'] = {1}
                    elif k == 1:  # we forced this to always be 0
                        syn['name'] = "C2"
                        syn['type'] = {'Int'}
                        syn['min'] = 0
                        syn['max'] = 0
                        # syn['scale'] = {1}
                    else:
                        syn['name'] = "C" + str(k + 1)
                        syn['type'] = {'Int', 'Real'}
                        syn['min'] = valMin
                        syn['max'] = valMax
                        # syn['scale'] = {1,10,100,1000}

                    syn['naCnt'] = 0
                    syn['cardinality'] = -1
                    # syn['min'] = 0
                    # syn['max'] = 0
                    # syn['mean'] = 0

                    cols = inspect['cols'][k]
                    for synKey in syn:
                        # we may not see the min/max range of values that was bounded by our gen, but
                        # we can check that it's a subset of the allowed range
                        if synKey == 'min':
                            self.assertTrue(
                                syn[synKey] <= cols[synKey],
                                msg='col %s %s %s should be <= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'max':
                            self.assertTrue(
                                syn[synKey] >= cols[synKey],
                                msg='col %s %s %s should be >= %s' %
                                (k, synKey, cols[synKey], syn[synKey]))
                        elif synKey == 'type':
                            if cols[synKey] not in syn[synKey]:
                                print "cols min/max:", cols['min'], cols['max']
                                print "syn min/max:", syn['min'], syn['max']
                                raise Exception(
                                    'col %s %s %s should be in this allowed %s'
                                    % (k, synKey, cols[synKey], syn[synKey]))
                        else:
                            self.assertEqual(
                                syn[synKey],
                                cols[synKey],
                                msg='col %s %s %s should be %s' %
                                (k, synKey, cols[synKey], syn[synKey]))

                    colSum = colSumList[k]
                    print "\nComparing col", k, "sums:", v, colSum
                    # Even though we're comparing floating point sums, the operations probably should have
                    # been done in same order, so maybe the comparison can be exact (or not!)
                    self.assertAlmostEqual(
                        float(v),
                        colSum,
                        places=0,
                        msg='%0.6f col sum is not equal to expected %0.6f' %
                        (v, colSum))
    def test_many_fp_formats_libsvm_2_fvec(self):
        #h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cA', 300, 'sparse50'),
            (100, 10000, 'cB', 300, 'sparse'),
            # (100, 40000, 'cC', 300, 'sparse50'),
            # (100, 40000, 'cD', 300, 'sparse'),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0,NUM_CASES-1)]: # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

                selKey2 = hex_key + "_" + str(sel)
                print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=selKey2, 
                    timeoutSecs=timeoutSecs, doSummary=False, parser_type='SVMLight')
                print "Parse result['destination_key']:", parseResult['destination_key']
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values, 
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(y=0,
                    key=parseResult['destination_key'], timeoutSecs=300, noPrint=True)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2, max_column_display=colNumberMax+1, timeoutSecs=timeoutSecs)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(colNumberMax+1, numCols, msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, numCols))

                # Exec (column sums)*************************************************
                if DO_COMPARE_SUM:
                    h2e.exec_zero_list(zeroList)
                    colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                        timeoutSecs=timeoutSecs, print_params=False)
                    #print "\n*************"
                    #print "colResultList", colResultList
                    #print "*************"

                self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset

                sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
                print sortedColSumDict
                for k,v in sortedColSumDict.iteritems():
                    print k
                    if DO_COMPARE_SUM:
                        # k should be integers that match the number of cols
                        self.assertTrue(k>=0 and k<len(colResultList))
                        compare = colResultList[k]
                        print "\nComparing col sums:", v, compare
                        # Even though we're comparing floating point sums, the operations probably should have
                        # been done in same order, so maybe the comparison can be exact (or not!)
                        self.assertAlmostEqual(v, compare, places=0, 
                            msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                    synMean = (v + 0.0)/rowCount
                    # enums don't have mean, but we're not enums
                    mean = float(inspect['cols'][k]['mean'])
                    # our fp formats in the syn generation sometimes only have two places?
                    if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                        execExpr = 'sum(%s[,%s])' % (selKey2, k+1)
                        resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300) 
                        print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
                        print "Result of remembered sum on failing col:..:", k, v
                        print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                        print "k: ",k , "mean: ", mean, "remembered sum/rowCount : ", synMean
                        sys.stdout.flush()
                        raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0, naCnt,
                        msg='col %s naCnt %d should be 0' % (k, naCnt))
Exemplo n.º 27
0
    def test_many_fp_formats_libsvm_2 (self):
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cA', 300, 'sparse50'),
            (100, 10000, 'cB', 300, 'sparse'),
            (100, 40000, 'cC', 300, 'sparse50'),
            (100, 40000, 'cD', 300, 'sparse'),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs, distribution) in tryList:
            # for sel in range(48): # len(caseList)
            for sel in [random.randint(0,47)]: # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution)

                selKey2 = key2 + "_" + str(sel)
                parseKey = h2o_cmd.parseFile(None, csvPathname, key2=selKey2, timeoutSecs=timeoutSecs, doSummary=False)
                print csvFilename, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']
                inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
                num_cols = inspect['num_cols']
                num_rows = inspect['num_rows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values, 
                # to see if we have x specified well
                # figures out everything from parseKey['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(y=0,
                    key=parseKey['destination_key'], timeoutSecs=300, noPrint=True)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(key=selKey2, timeoutSecs=timeoutSecs)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(colNumberMax+1, num_cols, msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, num_cols))

                # Exec (column sums)*************************************************
                if DO_COMPARE_SUM:
                    h2e.exec_zero_list(zeroList)
                    colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
                        timeoutSecs=timeoutSecs)
                    print "\n*************"
                    print "colResultList", colResultList
                    print "*************"

                self.assertEqual(rowCount, num_rows, msg="generated %s rows, parsed to %s rows" % (rowCount, num_rows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset
                ### print "\nsynColSumDict:", synColSumDict

                for k,v in synColSumDict.iteritems():
                    if DO_COMPARE_SUM:
                        # k should be integers that match the number of cols
                        self.assertTrue(k>=0 and k<len(colResultList))
                        compare = colResultList[k]
                        print "\nComparing col sums:", v, compare
                        # Even though we're comparing floating point sums, the operations probably should have
                        # been done in same order, so maybe the comparison can be exact (or not!)
                        self.assertAlmostEqual(v, compare, places=0, 
                            msg='%0.6f col sum is not equal to expected %0.6f' % (v, compare))

                    synMean = (v + 0.0)/rowCount
                    # enums don't have mean, but we're not enums
                    mean = inspect['cols'][k]['mean']
                    # our fp formats in the syn generation sometimes only have two places?
                    self.assertAlmostEqual(mean, synMean, places=0,
                        msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

                    num_missing_values = inspect['cols'][k]['num_missing_values']
                    self.assertEqual(0, num_missing_values,
                        msg='col %s num_missing_values %d should be 0' % (k, num_missing_values))
Exemplo n.º 28
0
    def test_many_fp_formats_libsvm_2_fvec(self):
        #h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cA', 300, 'sparse50'),
            (100, 10000, 'cB', 300, 'sparse'),
            # (100, 40000, 'cC', 300, 'sparse50'),
            # (100, 40000, 'cD', 300, 'sparse'),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             distribution) in tryList:
            NUM_CASES = h2o_util.fp_format()
            for sel in [random.randint(0, NUM_CASES - 1)]:  # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel,
                                                       rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax,
                 synColSumDict) = write_syn_dataset(csvPathname, rowCount,
                                                    colCount, SEEDPERFILE, sel,
                                                    distribution)

                selKey2 = hex_key + "_" + str(sel)
                print "This dataset requires telling h2o parse it's a libsvm..doesn't detect automatically"
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=selKey2,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False,
                                               parser_type='SVMLight')
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             max_column_display=colNumberMax +
                                             1,
                                             timeoutSecs=timeoutSecs)
                numCols = inspect['numCols']
                numRows = inspect['numRows']
                print "\n" + csvFilename

                # SUMMARY****************************************
                # gives us some reporting on missing values, constant values,
                # to see if we have x specified well
                # figures out everything from parseResult['destination_key']
                # needs y to avoid output column (which can be index or name)
                # assume all the configs have the same y..just check with the firs tone
                goodX = h2o_glm.goodXFromColumnInfo(
                    y=0,
                    key=parseResult['destination_key'],
                    timeoutSecs=300,
                    noPrint=True)

                if DO_SUMMARY:
                    summaryResult = h2o_cmd.runSummary(
                        key=selKey2,
                        max_column_display=colNumberMax + 1,
                        timeoutSecs=timeoutSecs)
                    h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

                self.assertEqual(
                    colNumberMax + 1,
                    numCols,
                    msg=
                    "generated %s cols (including output).  parsed to %s cols"
                    % (colNumberMax + 1, numCols))

                # Exec (column sums)*************************************************
                if DO_COMPARE_SUM:
                    h2e.exec_zero_list(zeroList)
                    colResultList = h2e.exec_expr_list_across_cols(
                        None,
                        exprList,
                        selKey2,
                        maxCol=colNumberMax + 1,
                        timeoutSecs=timeoutSecs,
                        print_params=False)
                    #print "\n*************"
                    #print "colResultList", colResultList
                    #print "*************"

                self.assertEqual(rowCount,
                                 numRows,
                                 msg="generated %s rows, parsed to %s rows" %
                                 (rowCount, numRows))
                # need to fix this for compare to expected
                # we should be able to keep the list of fp sums per col above
                # when we generate the dataset

                sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
                print sortedColSumDict
                for k, v in sortedColSumDict.iteritems():
                    print k
                    if DO_COMPARE_SUM:
                        # k should be integers that match the number of cols
                        self.assertTrue(k >= 0 and k < len(colResultList))
                        compare = colResultList[k]
                        print "\nComparing col sums:", v, compare
                        # Even though we're comparing floating point sums, the operations probably should have
                        # been done in same order, so maybe the comparison can be exact (or not!)
                        self.assertAlmostEqual(
                            v,
                            compare,
                            places=0,
                            msg='%0.6f col sum is not equal to expected %0.6f'
                            % (v, compare))

                    synMean = (v + 0.0) / rowCount
                    # enums don't have mean, but we're not enums
                    mean = float(inspect['cols'][k]['mean'])
                    # our fp formats in the syn generation sometimes only have two places?
                    if not h2o_util.approxEqual(mean, synMean, tol=1e-3):
                        execExpr = 'sum(%s[,%s])' % (selKey2, k + 1)
                        resultExec = h2o_cmd.runExec(str=execExpr,
                                                     timeoutSecs=300)
                        print "Result of exec sum on failing col:..:", k, h2o.dump_json(
                            resultExec)
                        print "Result of remembered sum on failing col:..:", k, v
                        print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
                        print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
                        sys.stdout.flush()
                        raise Exception(
                            'col %s mean %0.6f is not equal to generated mean %0.6f'
                            % (k, mean, synMean))

                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0,
                                     naCnt,
                                     msg='col %s naCnt %d should be 0' %
                                     (k, naCnt))
Exemplo n.º 29
0
    def test_GLM2_tnc3_10(self):
        h2o.beta_features = True
        csvFilename = 'tnc3_10.csv'
        print "\n" + csvFilename
        hex_key = "tnc3.hex"

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvFilename,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=10)
        print "Parse result['Key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        ### time.sleep(10)

        if (1 == 0):
            lenNodes = len(h2o.nodes)
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes,
                numExprList,
                hex_key,
                maxCol=10,
                incrementingResult=False,
                timeoutSecs=10)
            print "\ncolResultList after num swap", colResultList

        if (1 == 1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            # hmm. maybe we should update to use key as input
            # in case exec is used to change the parseResult
            # in any case, the destination_key in parseResult was what was updated
            # so if we Exec, it's correct.
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=300,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

        #******************
        if (1 == 0):
            colResultList = h2e.exec_expr_list_across_cols(
                lenNodes,
                charExprList,
                hex_key,
                maxCol=10,
                incrementingResult=False,
                timeoutSecs=10)
            print "\ncolResultList after char swap", colResultList

        if (1 == 1):
            start = time.time()
            kwargs = {'response': 13, 'n_folds': 6}
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=300,
                                 **kwargs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "glm end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])