Пример #1
0
    def test_NN_covtype_1(self):
        csvFilename = 'covtype.data'
        csvPathname = 'UCI/UCI-large/covtype/' + csvFilename
        hex_key = 'covtype.hex'
        parseResult = h2i.import_parse(bucket='datasets', path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        print "WARNING: just doing the first 33 features, for comparison to ??? numbers"
        x = ",".join(map(str,range(33)))

        response = 54
        kwargs = {
            # this is ignore??
            'cols': x, # apparently required? 
            'response': response,
            'activation': 'Tanh',
            'hidden': 500,
            'rate': 0.01,
            'l2': 1.0E-4,
            'epochs': 100,
            'destination_key': 'a.hex',
            'validation': hex_key,
        }

        timeoutSecs = 600
        start = time.time()
        h2o.beta_features = True
        nnResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
        h2o.beta_features = False
        print "Hack: neural net apparently doesn't support the right polling response yet?"
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        print "FIX! need to add something that looks at the neural net result here?"
        print "neural net end on ", csvPathname, 'took', time.time() - start, 'seconds'
Пример #2
0
    def test_NN_covtype_1(self):
        csvFilename_train = 'sumsigmoids.csv'
        csvPathname_train = 'neural/' + csvFilename_train
        csvFilename_test  = 'sumsigmoids_test.csv'
        csvPathname_test  = 'neural/' + csvFilename_test
        hex_key = 'sigmoids_train.hex'
        validation_key = 'sigmoids_test.hex'
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='local', hex_key=hex_key, timeoutSecs=10)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=30)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname_train, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        hidden = 1 #number of hidden units
        response = 'Y'
        
        kwargs = {
            'ignored_cols'    : None,
            'response'        : response,
            'activation'      : 'Tanh',
            'hidden'          : hidden,
            'rate'            : 0.01,
            'l2'              : 0.0005,
            'epochs'          : 10,
            'destination_key' : 'nn'+str(hidden)+'.hex',
            'validation'      : validation_key,
        }

        timeoutSecs = 600
        start = time.time()
        h2o.beta_features = True
        nnResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
        h2o.beta_features = False
        print "Hack: neural net apparently doesn't support the right polling response yet?"
        h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
Пример #3
0
    def test_GLM_twovalues(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + "/" + csvFilename

        rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

        twoValueList = [
            ("A", "B", 0, 14),
            ("A", "B", 1, 14),
            (0, 1, 0, 12),
            (0, 1, 1, 12),
            (0, 1, "NaN", 12),
            (1, 0, "NaN", 12),
            (-1, 1, 0, 12),
            (-1, 1, 1, 12),
            (-1e1, 1e1, 1e1, 12),
            (-1e1, 1e1, -1e1, 12),
        ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)

            # default takes 39 iterations? play with alpha/beta
            parseResult = h2i.import_parse(path=csvPathname, schema="put", hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])

            response = inspect["num_cols"] - 1
            # up to but not including
            x = ",".join(map(str, range(response)))

            kwargs = {
                # this is ignore??
                "response": response,
                "cols": x,  # apparently no longer required?
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "validation": hex_key,
                "activation": "Tanh",  # 'Rectifier'
                "hidden": 500,  # comma separated values, or from:to:step
                "rate": 0.01,  # learning rate
                "l2": 1.0e-4,  # regularization
                "epochs": 2,  # how many times dataset should be iterated
                "destination_key": "a.hex",
            }

            for iteration in range(2):
                timeoutSecs = 600
                start = time.time()
                h2o.beta_features = True
                nnResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                h2o.beta_features = False

                print "Hack: neural net apparently doesn't support the right polling response yet?"
                h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

                print "FIX! need to add something that looks at the neural net result here?"
                print "nnResult:", h2o.dump_json(nnResult)

                print "trial #", trial, "iteration #", iteration, "NN end on ", csvFilename, "took", time.time() - start, "seconds"
                # h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                h2o.check_sandbox_for_errors()

            trial += 1
Пример #4
0
    def test_NN_mnist_multi(self):
        # h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = "mnist/train.csv.gz"
        csvPathname_test = "mnist/test.csv.gz"
        hex_key = "mnist_train.hex"
        validation_key = "mnist_test.hex"
        timeoutSecs = 60
        parseResult = h2i.import_parse(
            bucket="smalldata", path=csvPathname_train, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs
        )
        parseResultV = h2i.import_parse(
            bucket="smalldata", path=csvPathname_test, schema="put", hex_key=validation_key, timeoutSecs=timeoutSecs
        )
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, "    numRows:", "{:,}".format(
            inspect["numRows"]
        ), "    numCols:", "{:,}".format(inspect["numCols"])
        response = inspect["numCols"] - 1

        modes = [
            ###'SingleThread', ### too slow (and slightly less accurate)
            "SingleNode",  ### wastes N-1 nodes, since their weight matrices are updated but never looked at...
            ###'MapReduce' ### TODO: enable, once implemented
        ]

        for mode in modes:

            # Making random id
            identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = "nn_" + identifier + ".hex"

            kwargs = {
                "ignored_cols": None,
                "response": response,
                "classification": 1,
                "mode": mode,
                "activation": "RectifierWithDropout",
                "input_dropout_ratio": 0.2,
                "hidden": "117,131,129",
                "rate": 0.005,
                "rate_annealing": 1e-6,
                "momentum_start": 0.5,
                "momentum_ramp": 100000,
                "momentum_stable": 0.9,
                "l1": 0.00001,
                "l2": 0.0000001,
                "seed": 98037452452,
                "loss": "CrossEntropy",
                "max_w2": 15,
                "warmup_samples": 0,
                "initial_weight_distribution": "UniformAdaptive",
                #'initial_weight_scale'         : 0.01,
                "epochs": 20.0,
                "destination_key": model_key,
                "validation": validation_key,
            }
            ###expectedErr = 0.0362 ## from single-threaded mode
            expectedErr = 0.03  ## observed actual value with Hogwild

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds"

            relTol = 0.02 if mode == "SingleThread" else 0.10  ### 10% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(
                self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs
            )

            ### Now score using the model, and check the validation error
            kwargs = {
                "source": validation_key,
                "max_rows": 0,
                "response": response,
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "destination_key": "score_" + identifier + ".hex",
                "model": model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

            if mode != "MapReduce":
                print "WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results."

        h2o.beta_features = False
Пример #5
0
    def test_NN_covtype_1(self):
        tryList = ["covtype.shuffled.90pct.sorted.data", "covtype.shuffled.90pct.data"]

        importFolderPath = "standard"
        for csvFilename in tryList:
            csvPathname = importFolderPath + "/" + csvFilename
            hex_key = "covtype.hex"
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="local", hex_key=hex_key, timeoutSecs=10
            )
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])

            # print "WARNING: just doing the first 33 features, for comparison to ??? numbers"
            # x = ",".join(map(str,range(33)))
            x = ""

            response = 54
            modelKey = "a.hex"
            kwargs = {
                # this is ignore??
                "response": response,
                # 'cols': x, # apparently no longer required?
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "validation": hex_key,
                "activation": "Tanh",  # 'Rectifier'
                "hidden": 500,  # comma separated values, or from:to:step
                "rate": 0.01,  # learning rate
                "l2": 1.0e-4,  # regularization
                "epochs": 1,  # how many times dataset should be iterated
                "destination_key": modelKey,
            }

            timeoutSecs = 600
            start = time.time()
            h2o.beta_features = True
            nnFirstResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            print "nnFirstResult:", h2o.dump_json(nnFirstResult)
            print "Hack: neural net apparently doesn't support the right polling response yet?"
            h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            print "neural net end on ", csvPathname, "took", time.time() - start, "seconds"

            # hack it!
            job_key = nnFirstResult["job_key"]

            # is the job finishing before polling would say it's done?
            params = {"job_key": job_key, "destination_key": modelKey}
            a = h2o.nodes[0].completion_redirect(jsonRequest="2/NeuralNetProgress.json", params=params)

            # fake it
            ## response = {'redirect_url': "2/NeuralNetProgress.json?job_key=%s&destination_key=%s" % (job_key, modelKey)}
            ## a = h2o.nodes[0].poll_url(response, timeoutSecs=30)

            print "NeuralNetProgress:", h2o.dump_json(a)

            # print 'From hack url for neural net result:', h2o.dump_json(a)

            if DO_SCORE:
                kwargs = {
                    "max_rows": 0,
                    "response": response,
                    # 'cols': x, # apparently no longer required?
                    "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                    "cols": None,  # this is not consistent with ignored_cols_by_name
                    "classification": 1,
                    "destination_key": "b.hex",
                    "model": modelKey,
                }
                nnScoreFirstResult = h2o_cmd.runNNetScore(
                    key=parseResult["destination_key"], timeoutSecs=timeoutSecs, noPoll=True, **kwargs
                )
                h2o.beta_features = False
                print "Hack: neural net apparently doesn't support the right polling response yet?"
                h2o_jobs.pollWaitJobs(
                    pattern=None, errorIfCancelled=True, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5
                )

                print "neural net score end on ", trainCsvFilename, "took", time.time() - start, "seconds"
                print "nnScoreResult:", h2o.dump_json(nnScoreResult)

            h2o.beta_features = False
Пример #6
0
    def test_NN_covtype(self):
        # h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = "covtype/covtype.20k.data"
        csvPathname_test = "covtype/covtype.20k.data"
        hex_key = "covtype.hex"
        validation_key = hex_key
        timeoutSecs = 30
        parseResult = h2i.import_parse(
            bucket="smalldata", path=csvPathname_train, schema="local", hex_key=hex_key, timeoutSecs=timeoutSecs
        )
        ###No need - use training as validation
        ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, "    numRows:", "{:,}".format(
            inspect["numRows"]
        ), "    numCols:", "{:,}".format(inspect["numCols"])
        response = inspect["numCols"] - 1

        modes = ["SingleThread", "SingleNode"]

        for mode in modes:

            # Making random id
            identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = "nn_" + identifier + ".hex"

            kwargs = {
                "ignored_cols": None,
                "response": response,
                "classification": 1,
                "mode": mode,
                "activation": "Tanh",
                #'input_dropout_ratio'          : 0.1,
                "hidden": "200,200",
                "rate": 0.005,
                "rate_annealing": 1e-5,
                "momentum_start": 0.1,
                "momentum_ramp": 100000,
                "momentum_stable": 0.3,
                "l1": 0.0000,
                "l2": 0.0000,
                "seed": 28372348842,
                "loss": "CrossEntropy",
                #'max_w2'                       : 10,
                "warmup_samples": 0,
                "initial_weight_distribution": "Normal",
                "initial_weight_scale": 1,
                "epochs": 2.0,
                "destination_key": model_key,
                "validation": validation_key,
            }
            expectedErr = 0.3413 if mode == "SingleThread" else 0.3  ## expected validation error for the above model

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds"

            relTol = 0.03 if mode == "SingleThread" else 0.15  ### 15% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(
                self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs
            )

            ### Now score using the model, and check the validation error
            kwargs = {
                "source": validation_key,
                "max_rows": 0,
                "response": response,
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "destination_key": "score_" + identifier + ".hex",
                "model": model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

        h2o.beta_features = False
Пример #7
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread', 
            'SingleNode',
            ###'MapReduce' ### TODO: enable, once implemented
            ]

        for mode in modes:

            #Making random id
            identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'mode'                         : mode,
                'activation'                   : 'RectifierWithDropout',
                'input_dropout_ratio'          : 0.2,
                'hidden'                       : '117,131,129',
                'rate'                         : 0.005,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0.5,
                'momentum_ramp'                : 100000,
                'momentum_stable'              : 0.9,
                'l1'                           : 0.00001,
                'l2'                           : 0.0000001,
                'seed'                         : 98037452452,
                'loss'                         : 'CrossEntropy',
                'max_w2'                       : 15,
                'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 2.0,
                'destination_key'              : model_key,
                'validation'                   : validation_key,
            }
            expectedErr = 0.0565 ## expected validation error for the above model on 1 thread

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

            #### Look at model progress, and check the last reported validation error
            relTol = 0.3 if mode == 'SingleThread' else 0.15
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)

            #### Now score using the model, and check the validation error
            kwargs = {
                'source' : validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

        h2o.beta_features = False
Пример #8
0
    def test_NN_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue    = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse   = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" 

        twoValueList = [
            ('A','B',0, 14),
            ('A','B',1, 14),
            (0,1,0, 12),
            (0,1,1, 12),
            (0,1,'NaN', 12),
            (1,0,'NaN', 12),
            (-1,1,0, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols'] - 1

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : 'C' + str(response),
                'classification'               : 1,
                'mode'                         : 'SingleThread',
                'activation'                   : 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden'                       : '500',
                'rate'                         : 0.01,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0,
                'momentum_ramp'                : 0,
                'momentum_stable'              : 0,
                'l1'                           : 0.0,
                'l2'                           : 1e-4,
                'seed'                         : 80023842348,
                'loss'                         : 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 1.0,
                'destination_key'              : model_key,
                'validation'                   : hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o.beta_features = True
            h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "trial #", trial, "NN end on ", csvFilename, ' took', time.time() - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.0
            relTol = 0.01
            kwargs = {
                'source' : hex_key,
                'max_rows': 0,
                'response': 'C' + str(response),
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score' + str(trial) + '.hex',
                'model': model_key
            }

            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

            h2o.check_sandbox_for_errors()

            trial += 1
Пример #9
0
    def test_NN_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            ###'SingleThread', ### too slow (and slightly less accurate)
            'SingleNode',  ### wastes N-1 nodes, since their weight matrices are updated but never looked at...
            ###'MapReduce' ### TODO: enable, once implemented
            ]

        for mode in modes:

            #Making random id
            identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'mode'                         : mode,
                'activation'                   : 'RectifierWithDropout',
                'input_dropout_ratio'          : 0.2,
                'hidden'                       : '117,131,129',
                'rate'                         : 0.005,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0.5,
                'momentum_ramp'                : 100000,
                'momentum_stable'              : 0.9,
                'l1'                           : 0.00001,
                'l2'                           : 0.0000001,
                'seed'                         : 98037452452,
                'loss'                         : 'CrossEntropy',
                'max_w2'                       : 15,
                'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 20.0,
                'destination_key'              : model_key,
                'validation'                   : validation_key,
            }
            ###expectedErr = 0.0362 ## from single-threaded mode
            expectedErr = 0.0331 ## observed actual value with Hogwild

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

            relTol = 0.02 if mode == 'SingleThread' else 0.10 ### 10% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source' : validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

            if mode != 'MapReduce':
                print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.'

        h2o.beta_features = False
Пример #10
0
    def test_NN_mnist(self):
        """Parse the MNIST test/train files and launch a Tanh neural net on them.

        For each (train file, test file, timeout) tuple in csvFilelist, both
        files are parsed from the 'home-0xdiag-datasets' bucket, NeuralNet is
        started on the training frame without polling, the test waits for all
        jobs, and the 2/NeuralNetProgress.json result is dumped. Nothing is
        asserted on the model in the portion of the test shown here.
        """
        # (trainCsvFilename, testCsvFilename, timeoutSecs) - the same pair is listed four times
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        # NOTE(review): trial is never incremented in the visible part of this
        # method, so every pass reuses the "_0" keys - confirm against the full file.
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            # NOTE(review): trialStart is not read anywhere in the visible code.
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put',
                hex_key=testKey2, timeoutSecs=timeoutSecs, noise=('StoreView', None))
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']


            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # parseResult is rebound here, so from now on it refers to the training frame
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename, schema='put',
                hex_key=trainKey2, timeoutSecs=timeoutSecs, noise=('StoreView', None))
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # NN****************************************
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + trainCsvFilename, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # NOTE(review): response and x are computed but not used below; the
            # model is given a hard-coded 'response': 0 and no 'cols'.
            response = inspect['num_cols'] - 1
            # up to but not including
            x = ",".join(map(str,range(response)))

            modelKey = 'a.hex'
            kwargs = {
                # NOTE(review): the original comment says column 0 is a pixel value,
                # yet it is used as the response - confirm which column holds the label
                'response': 0, # first column is pixel value
                # 'cols': x, # apparently no longer required?
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                # the training frame is also used as the validation frame; testKey2 is not referenced
                'validation': trainKey2,
                'activation': 'Tanh', # 'Rectifier'
                'hidden': 500, # comma separated values, or from:to:step
                'rate': 0.01,  # learning rate
                'l2': 1.0E-4, # regularization
                'epochs': 2, # how many times dataset should be iterated
                'destination_key': modelKey,
            }

            # overrides the per-file timeout taken from csvFilelist
            timeoutSecs = 600
            start = time.time()
            # NOTE(review): the module-wide flag is switched on here and not
            # switched off again in the visible part of this method.
            h2o.beta_features = True
            nnFirstResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            print "nnFirstResult:", h2o.dump_json(nnFirstResult)
            print "Hack: neural net apparently doesn't support the right polling response yet?"
            h2o_jobs.pollWaitJobs(pattern=None, errorIfCancelled=True, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
            print "neural net end on ", trainCsvFilename, 'took', time.time() - start, 'seconds'

            # hack it! fetch the progress page directly using the job key from the launch response
            job_key = nnFirstResult['job_key']
            params = {'job_key': job_key, 'destination_key': modelKey}
            a = h2o.nodes[0].completion_redirect(jsonRequest="2/NeuralNetProgress.json", params=params)
            print "NeuralNetProgress:", h2o.dump_json(a)

            # print 'From hack url for neural net result:', h2o.dump_json(a)

            if DO_SCORE:
                kwargs = {
                    'max_rows': 0,
                    'response': 0, # first column is pixel value
                    # 'cols': x, # apparently no longer required? 
                    'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                    'cols': None, # this is not consistent with ignored_cols_by_name
                    'classification': 1,
                    'destination_key': 'b.hex',
                    'model': modelKey,
                }
                nnScoreFirstResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
                h2o.beta_features = False
                print "Hack: neural net apparently doesn't support the right polling response yet?"
                h2o_jobs.pollWaitJobs(pattern=None, errorIfCancelled=True, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)


                print "neural net score end on ", trainCsvFilename, 'took', time.time() - start, 'seconds'
                print "nnScoreResult:", h2o.dump_json(nnScoreResult)

            h2o.beta_features = False
Пример #11
0
    def test_GLM_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue    = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse   = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" 

        twoValueList = [
            ('A','B',0, 14),
            ('A','B',1, 14),
            (0,1,0, 12),
            (0,1,1, 12),
            (0,1,'NaN', 12),
            (1,0,'NaN', 12),
            (-1,1,0, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)

            # default takes 39 iterations? play with alpha/beta
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            response = inspect['numCols'] - 1
            # up to but not including
            x = ",".join(map(lambda x: 'C' + str(x), range(response)))

            kwargs = {
                # this is ignore??
                'response': 'C' + str(response),
                'cols': x, # apparently no longer required? 
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'validation': hex_key,
                'activation': 'Tanh', # 'Rectifier'
                'hidden': 500, # comma separated values, or from:to:step
                'rate': 0.01,  # learning rate
                'l2': 1.0E-4, # regularization
                'epochs': 2, # how many times dataset should be iterated
                'destination_key': 'a.hex',
            }

            for iteration in range(2):
                timeoutSecs = 600
                start = time.time()
                h2o.beta_features = True
                nnResult = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

                print "FIX! need to add something that looks at the neural net result here?"
                print "nnResult:", h2o.dump_json(nnResult)

                print "trial #", trial, "iteration #", iteration, "NN end on ", csvFilename, 'took', time.time() - start, 'seconds'
                # h2b.browseJsonHistoryAsUrlLastMatch("GLM")
                h2o.check_sandbox_for_errors()

            trial += 1
Пример #12
0
    def test_NN_covtype_1(self):
        h2o.beta_features = True
        tryList = [
            ("covtype.shuffled.90pct.sorted.data", "covtype.shuffled.10pct.sorted.data"),
            ("covtype.shuffled.90pct.data", "covtype.shuffled.10pct.data"),
        ]

        importFolderPath = "standard"
        for trainFilename, testFilename in tryList:
            # Parse Train********************************
            trainPathname = importFolderPath + "/" + trainFilename
            trainHexKey = "covtype90.hex"
            trainParseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=trainPathname, schema="local", hex_key=trainHexKey, timeoutSecs=10
            )
            inspect = h2o_cmd.runInspect(None, trainParseResult["destination_key"])
            print "\n" + trainPathname, "    numRows:", "{:,}".format(
                inspect["numRows"]
            ), "    numCols:", "{:,}".format(inspect["numCols"])
            # Parse Test********************************
            testPathname = importFolderPath + "/" + testFilename
            testHexKey = "covtype10.hex"
            testParseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=testPathname, schema="local", hex_key=testHexKey, timeoutSecs=10
            )

            # NN Train********************************
            x = ""
            response = "C54"
            modelKey = "a.hex"
            kwargs = {
                # this is ignore??
                "response": response,
                # 'cols': x, # apparently no longer required?
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "validation": testHexKey,
                "activation": "Tanh",  # 'Rectifier'
                "hidden": 500,  # comma separated values, or from:to:step
                "rate": 0.01,  # learning rate
                "l2": 1.0e-2,  # regularization
                # can overfit the training data
                "epochs": 5,  # how many times dataset should be iterated
                "destination_key": modelKey,
            }

            timeoutSecs = 600
            start = time.time()
            nnResult = h2o_cmd.runNNet(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "nnResult:", h2o.dump_json(nnResult)
            print "neural net end on ", trainPathname, "took", time.time() - start, "seconds"

            # NN Score********************************
            kwargs = {
                "max_rows": 0,
                "response": response,
                # 'cols': x, # apparently no longer required?
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "destination_key": "b.hex",
                "model": modelKey,
            }
            # doesn't need polling?
            nnScoreResult = h2o_cmd.runNNetScore(
                key=testParseResult["destination_key"], timeoutSecs=timeoutSecs, noPoll=True, **kwargs
            )

            print "neural net score end on ", testPathname, "took", time.time() - start, "seconds"
            # print "nnScoreResult:", h2o.dump_json(nnScoreResult)
            cm = nnScoreResult["confusion_matrix"]
            mean_square_error = nnScoreResult["mean_square_error"]
            classification_error = nnScoreResult["classification_error"]

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)
            print "\nTest\n==========\n"
            print "classification_error:", classification_error
            print "mean_square_error:", mean_square_error
            print h2o_gbm.pp_cm(cm)
Пример #13
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread',
            'SingleNode',
            ###'MapReduce' ### TODO: enable, once implemented
        ]

        for mode in modes:

            #Making random id
            identifier = ''.join(
                random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols': None,
                'response': response,
                'classification': 1,
                'mode': mode,
                'activation': 'RectifierWithDropout',
                'input_dropout_ratio': 0.2,
                'hidden': '117,131,129',
                'rate': 0.005,
                'rate_annealing': 1e-6,
                'momentum_start': 0.5,
                'momentum_ramp': 100000,
                'momentum_stable': 0.9,
                'l1': 0.00001,
                'l2': 0.0000001,
                'seed': 98037452452,
                'loss': 'CrossEntropy',
                'max_w2': 15,
                'warmup_samples': 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 2.0,
                'destination_key': model_key,
                'validation': validation_key,
            }
            expectedErr = 0.0565  ## expected validation error for the above model on 1 thread

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
            ) - start, 'seconds'

            #### Look at model progress, and check the last reported validation error
            relTol = 0.3 if mode == 'SingleThread' else 0.15
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'],
                                            inspect['numRows'], expectedErr,
                                            relTol, **kwargs)

            #### Now score using the model, and check the validation error
            kwargs = {
                'source': validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols':
                None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

        h2o.beta_features = False
Пример #14
0
    def test_NN_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

        twoValueList = [
            ('A', 'B', 0, 14),
            ('A', 'B', 1, 14),
            (0, 1, 0, 12),
            (0, 1, 1, 12),
            (0, 1, 'NaN', 12),
            (1, 0, 'NaN', 12),
            (-1, 1, 0, 12),
            (-1, 1, 1, 12),
            (-1e1, 1e1, 1e1, 12),
            (-1e1, 1e1, -1e1, 12),
        ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse,
                              str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue,
                                                            outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols'] - 1

            kwargs = {
                'ignored_cols': None,
                'response': 'C' + str(response),
                'classification': 1,
                'mode': 'SingleThread',
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden': '500',
                'rate': 0.01,
                'rate_annealing': 1e-6,
                'momentum_start': 0,
                'momentum_ramp': 0,
                'momentum_stable': 0,
                'l1': 0.0,
                'l2': 1e-4,
                'seed': 80023842348,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 1.0,
                'destination_key': model_key,
                'validation': hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o.beta_features = True
            h2o_cmd.runNNet(parseResult=parseResult,
                            timeoutSecs=timeoutSecs,
                            **kwargs)
            print "trial #", trial, "NN end on ", csvFilename, ' took', time.time(
            ) - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.0
            relTol = 0.01
            kwargs = {
                'source': hex_key,
                'max_rows': 0,
                'response': 'C' + str(response),
                'ignored_cols':
                None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score' + str(trial) + '.hex',
                'model': model_key
            }

            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

            h2o.check_sandbox_for_errors()

            trial += 1
Пример #15
0
    def test_NN_covtype(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'covtype/covtype.20k.data'
        csvPathname_test  = 'covtype/covtype.20k.data'
        hex_key = 'covtype.hex'
        validation_key = hex_key
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs)
        ###No need - use training as validation
        ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread',
            'SingleNode',
            ]

        for mode in modes:

            #Making random id
            identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'mode'                         : mode,
                'activation'                   : 'Tanh',
                #'input_dropout_ratio'          : 0.1,
                'hidden'                       : '200,200',
                'rate'                         : 0.005,
                'rate_annealing'               : 1e-5,
                'momentum_start'               : 0.1,
                'momentum_ramp'                : 100000,
                'momentum_stable'              : 0.3,
                'l1'                           : 0.0000,
                'l2'                           : 0.0000,
                'seed'                         : 28372348842,
                'loss'                         : 'CrossEntropy',
                #'max_w2'                       : 10,
                'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'Normal',
                'initial_weight_scale'         : 1,
                'epochs'                       : 2.0,
                'destination_key'              : model_key,
                'validation'                   : validation_key,
            }
            expectedErr = 0.3413 if mode == 'SingleThread' else 0.3 ## expected validation error for the above model

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

            relTol = 0.03 if mode == 'SingleThread' else 0.20 ### 20% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source' : validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
                }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

        h2o.beta_features = False
Пример #16
0
    def test_NN_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            ###'SingleThread', ### too slow (and slightly less accurate)
            'SingleNode',  ### wastes N-1 nodes, since their weight matrices are updated but never looked at...
            ###'MapReduce' ### TODO: enable, once implemented
        ]

        for mode in modes:

            #Making random id
            identifier = ''.join(
                random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols': None,
                'response': response,
                'classification': 1,
                'mode': mode,
                'activation': 'RectifierWithDropout',
                'input_dropout_ratio': 0.2,
                'hidden': '117,131,129',
                'rate': 0.005,
                'rate_annealing': 1e-6,
                'momentum_start': 0.5,
                'momentum_ramp': 100000,
                'momentum_stable': 0.9,
                'l1': 0.00001,
                'l2': 0.0000001,
                'seed': 98037452452,
                'loss': 'CrossEntropy',
                'max_w2': 15,
                'warmup_samples': 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 20.0,
                'destination_key': model_key,
                'validation': validation_key,
            }
            ###expectedErr = 0.0362 ## from single-threaded mode
            expectedErr = 0.03  ## observed actual value with Hogwild

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
            ) - start, 'seconds'

            relTol = 0.02 if mode == 'SingleThread' else 0.10  ### 10% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'],
                                            inspect['numRows'], expectedErr,
                                            relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source': validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols':
                None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

            if mode != 'MapReduce':
                print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.'

        h2o.beta_features = False
Пример #17
0
    def test_NN_covtype(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'covtype/covtype.20k.data'
        csvPathname_test = 'covtype/covtype.20k.data'
        hex_key = 'covtype.hex'
        validation_key = hex_key
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        ###No need - use training as validation
        ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread',
            'SingleNode',
        ]

        for mode in modes:

            #Making random id
            identifier = ''.join(
                random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols': None,
                'response': response,
                'classification': 1,
                'mode': mode,
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.1,
                'hidden': '200,200',
                'rate': 0.005,
                'rate_annealing': 1e-5,
                'momentum_start': 0.1,
                'momentum_ramp': 100000,
                'momentum_stable': 0.3,
                'l1': 0.0000,
                'l2': 0.0000,
                'seed': 28372348842,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 10,
                'warmup_samples': 0,
                'initial_weight_distribution': 'Normal',
                'initial_weight_scale': 1,
                'epochs': 2.0,
                'destination_key': model_key,
                'validation': validation_key,
            }
            expectedErr = 0.35195 if mode == 'SingleThread' else 0.3  ## expected validation error for the above model

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
            ) - start, 'seconds'

            relTol = 0.03 if mode == 'SingleThread' else 0.20  ### 20% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'],
                                            inspect['numRows'], expectedErr,
                                            relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source': validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols':
                None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

        h2o.beta_features = False