Пример #1
0
    def test_NN2_params_rand2(self):
        csvPathname = 'covtype/covtype.20k.data'
        hex_key = 'covtype.20k.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
        paramDict = define_params()

        for trial in range(5):
            # params is mutable. This is default.
            params = {'response': 'C55'}
            h2o_nn.pickRandDeepLearningParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            nn = h2o_cmd.runDeepLearning(timeoutSecs=300, parseResult=parseResult, **kwargs)
            print "nn result:", h2o.dump_json(nn)
            h2o.check_sandbox_for_errors()
            # FIX! simple check?

            print "Deep Learning end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Пример #2
0
    def test_NN2_params_rand2(self):
        h2o.beta_features = True
        csvPathname = 'covtype/covtype.20k.data'
        hex_key = 'covtype.20k.hex'
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, schema='put')
        paramDict = define_params()

        for trial in range(3):
            # params is mutable. This is default.
            params = {'response': 'C55', 'epochs': '1'}
            h2o_nn.pickRandDeepLearningParams(paramDict, params)
            kwargs = params.copy()
            start = time.time()
            nn = h2o_cmd.runDeepLearning(timeoutSecs=500, parseResult=parseResult, **kwargs)
            print "nn result:", h2o.dump_json(nn)
            h2o.check_sandbox_for_errors()


            deeplearning_model = nn['deeplearning_model']
            errors = deeplearning_model['errors']
            # print "errors", h2o.dump_json(errors)
            # print "errors, classification", errors['classification']

            # assert 1==0
            # unstable = nn['model_info']['unstable']

            # unstable case caused by : 
            # normal initial distribution with amplitude 1 and input_dropout_ratio=1.  
            # blowing up numerically during propagation of all zeroes as input repeatedly.  
            # arnon added logging to stdout in addition to html in 7899b92ad67.  
            # Will have to check that first before making predictions.

            # print "unstable:", unstable

            # FIX! simple check?

            print "Deep Learning end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
Пример #3
0
    def test_DeepLearning_mnist(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '1024,1024,2048',
            'adaptive_rate'                : 1,
            'rho'                          : 0.99,
            'epsilon'                      : 1e-8,
            'train_samples_per_iteration'  : -1, ## 0: better accuracy!  -1: best scalability!  10000: best accuracy?
#            'rate'                         : 0.01,
#            'rate_annealing'               : 1e-6,
#            'momentum_start'               : 0.5,
#            'momentum_ramp'                : 1800000,
#            'momentum_stable'              : 0.99,
            'l1'                           : 1e-5,
            'l2'                           : 0.0,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            'epochs'                       : 128, #enough for 64 nodes
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000 #don't score until the end
            }

        timeoutSecs = 7200
        start = time.time()
        deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        h2o.beta_features = True
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)

        h2o.beta_features = False
Пример #4
0
    def test_DeepLearning_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        csvPathname_test  = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        validation_key = 'test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, 
            timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY)
        parseResultV = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, 
            timeoutSecs=timeoutSecs, doSummary=DO_SUMMARY)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        response = 'any_response'

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # use defaults otherwise
        # need to change epochs otherwise it takes too long
        kwargs = {
            'epochs'                       : 0.001,
            'response'                     : response,
            'destination_key'              : model_key,
            'validation'                   : validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03 ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.046
        relTol = 0.35 # allow 35% tolerance. kbn
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                            (actualErr, float(relTol)*100, expectedErr))
Пример #5
0
    def test_NN_airlines_small(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'airlines_train.hex'
        validation_key = 'airlines_test.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        # this gives the last col number, which is IsDepDelayed_REC (1 or -1)
        # response = inspect['numCols'] - 1

        # this is "YES"/"NO"
        response = 'IsDepDelayed'

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # get the column names
        colNames = [c['name'] for c in inspect['cols']]
        print "colNames:", colNames
        usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier",
                    "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance")

        ignoredCols = []
        for c in colNames:
            # don't put the response in the ignore list (is there a problem if so?)
            if c not in usedCols and c != response:
                ignoredCols.append(c)

        ignoredColsString = ",".join(ignoredCols)
        print "Telling h2o to ignore these cols:"
        print ignoredColsString

        kwargs = {
            'ignored_cols': ignoredColsString,
            'response': response,
            'classification': 1,
            'destination_key': model_key,
        }
        expectedErr = 0.45  ## expected validation error for the above model
        relTol = 0.50  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Пример #6
0
    def test_DeepLearning_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

        twoValueList = [
            ('A', 'B', 0, 14),
            ('A', 'B', 1, 14),
            (0, 1, 0, 12),
            (0, 1, 1, 12),
            (0, 1, 'NaN', 12),
            (1, 0, 'NaN', 12),
            (-1, 1, 0, 12),
            (-1, 1, 1, 12),
            (-1e1, 1e1, 1e1, 12),
            (-1e1, 1e1, -1e1, 12),
        ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse,
                              str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'
            validation_key = hex_key

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue,
                                                            outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols'] - 1

            kwargs = {
                'ignored_cols': None,
                'response': 'C' + str(response),
                'classification': 1,
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden': '500',
                'rate': 0.01,
                'rate_annealing': 1e-6,
                'momentum_start': 0,
                'momentum_stable': 0,
                'l1': 0.0,
                'l2': 1e-4,
                'seed': 80023842348,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 1.0,
                'destination_key': model_key,
                'validation': hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o_cmd.runDeepLearning(parseResult=parseResult,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)
            print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time(
            ) - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.001
            relTol = 0.01
            predict_key = 'Predict.hex'

            kwargs = {
                'data_key': validation_key,
                'destination_key': predict_key,
                'model_key': model_key
            }
            predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs,
                                               **kwargs)
            h2o_cmd.runInspect(key=predict_key, verbose=True)

            kwargs = {}

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=validation_key,
                vactual=response,
                predict=predict_key,
                vpredict='predict',
                timeoutSecs=timeoutSecs,
                **kwargs)

            cm = predictCMResult['cm']

            print h2o_gbm.pp_cm(cm)
            actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

            print "actual   classification error:" + format(actualErr)
            print "expected classification error:" + format(expectedErr)
            if actualErr != expectedErr and abs(
                (expectedErr - actualErr) / expectedErr) > relTol:
                raise Exception(
                    "Scored classification error of %s is not within %s %% relative error of %s"
                    % (actualErr, float(relTol) * 100, expectedErr))

            trial += 1
Пример #7
0
    def test_DeepLearning_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 300
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '1024,1024,2048',
            'adaptive_rate'                : 1,
            'rho'                          : 0.99,
            'epsilon'                      : 1e-8,
            'train_samples_per_iteration'  : -1, ## 0: better accuracy!  -1: best scalability!  10000: best accuracy?
#            'rate'                         : 0.01,
#            'rate_annealing'               : 1e-6,
#            'momentum_start'               : 0.5,
#            'momentum_ramp'                : 1800000,
#            'momentum_stable'              : 0.99,
            'l1'                           : 1e-5,
            'l2'                           : 0.0,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            'epochs'                       : 128, #enough for 64 nodes
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000 #don't score until the end
            }

        timeoutSecs = 7200
        start = time.time()
        deeplearning = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
Пример #8
0
    def test_NN2_mnist_multi(self):
        """Train Deep Learning on MNIST and check the score against an expected error.

        Parses the MNIST train and test CSVs, trains a three-layer model with
        a fixed seed, then runs the Deep Learning score request on the test
        frame and hands the result to ``h2o_nn.checkScoreResult`` together
        with the expected error (0.03).

        Note: ``h2o.beta_features`` is switched on and left on.
        """
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        # parseResultV is not read again; the call matters because it creates the validation frame.
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        # Index of the last column, used as the response.
        response = inspect['numCols'] - 1


        # Random suffix shared by the model key and the score key.
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '117,131,129',
            'rate'                         : 0.005,
            'rate_annealing'               : 1e-6,
            'momentum_start'               : 0.5,
            'momentum_ramp'                : 100000,
            'momentum_stable'              : 0.9,
            'l1'                           : 0.00001,
            'l2'                           : 0.0000001,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs'                       : 20.0,
            'destination_key'              : model_key,
            'validation'                   : validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03 ## observed actual value with Hogwild

        # Training gets a larger budget than parsing.
        timeoutSecs = 600
        start = time.time()
        # nn is not read again; only the side effect of building the model is needed.
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        ### Now score using the model, and check the validation error
        kwargs = {
            'source' : validation_key,
            'max_rows': 0,
            'response': response,
            'ignored_cols': None, # this is not consistent with ignored_cols_by_name
            'classification': 1,
            'destination_key': 'score_' + identifier + '.hex',
            'model': model_key,
        }
        # The request is keyed on the training frame while 'source' names the validation frame.
        nnScoreResult = h2o_cmd.runDeepLearningScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
        # NOTE(review): relTol is never assigned in this method. Unless the
        # module defines a global relTol, this line raises NameError - confirm
        # and define the intended tolerance.
        h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)
Пример #9
0
    def test_anomaly_uniform_w_NA(self):
        """Build an autoencoder on synthetic data and check the shape of the anomaly frame.

        For every entry of ``tryList`` a dataset is written with
        ``write_syn_dataset``, parsed, and used to train a Deep Learning
        autoencoder. The anomaly request is then run against the same frame
        and the resulting ``a.hex`` must have exactly one column and
        ``rowCount * (1 + NA_ROW_RATIO)`` rows. All keys are deleted at the
        end of each trial.

        Note: ``h2o.beta_features`` is switched on and left on.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # rowCount, colCount, hex_key, expectedMin, expectedMax
            (ROWS, COLS, 'x.hex', 1, 20000),
            (ROWS, COLS, 'x.hex', -5000, 0),
            (ROWS, COLS, 'x.hex', -100000, 100000),
            (ROWS, COLS, 'x.hex', -1, 1),

            (ROWS, COLS, 'A.hex', 1, 100),
            (ROWS, COLS, 'A.hex', -99, 99),

            (ROWS, COLS, 'B.hex', 1, 10000),
            (ROWS, COLS, 'B.hex', -100, 100),

            (ROWS, COLS, 'C.hex', 1, 100000),
            (ROWS, COLS, 'C.hex', -101, 101),
        ]

        # trial and x are incremented per iteration but never read.
        trial = 1
        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax) in tryList:
            # Fresh seed per file, handed to the dataset writer.
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            # csvPathnameFull is not read again in this method.
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', 
                hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "numRows:", numRows, "numCols:", numCols

            # The same model key is reused by every trial.
            model_key = "m.hex"
            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : numCols-1,
                'classification'               : 0,
                'activation'                   : 'RectifierWithDropout',
                'input_dropout_ratio'          : 0.2,
                'hidden'                       : '117',
                'adaptive_rate'                : 0,
                'rate'                         : 0.005,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0.5,
                'momentum_ramp'                : 100000,
                'momentum_stable'              : 0.9,
                'l1'                           : 0.00001,
                'l2'                           : 0.0000001,
                'seed'                         : 98037452452,
                # 'loss'                         : 'CrossEntropy',
                'max_w2'                       : 15,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 2.0,
                'destination_key'              : model_key,
                # 'validation'                   : None,
                'score_interval'               : 10000,
                'autoencoder'                  : 1,
                }

            # Training gets a larger budget than the 60 seconds set before the loop.
            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end. took", time.time() - start, "seconds"


            # Run the anomaly request on the training frame with the autoencoder just built.
            kwargs = {
                'destination_key': "a.hex",
                'source': parseResult['destination_key'],
                'dl_autoencoder_model': model_key,
                'thresh': 1.0
            }

            anomaly = h2o.nodes[0].anomaly(timeoutSecs=30, **kwargs)
            inspect = h2o_cmd.runInspect(None, "a.hex")
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "anomaly: numRows:", numRows, "numCols:", numCols
            self.assertEqual(numCols,1)
            # The parsed frame is expected to hold NA_ROW_RATIO extra rows per
            # original row because of NA injection (the original comment says
            # "twice as many rows", so presumably NA_ROW_RATIO is 1 - confirm).
            self.assertEqual(numRows,rowCount*(1 + NA_ROW_RATIO))

            # a.hex has a single column (asserted above); summarize that column.
            aSummary = h2o_cmd.runSummary(key='a.hex', cols=0)
            h2o_cmd.infoFromSummary(aSummary)


            print "anomaly:", h2o.dump_json(anomaly)
            trial += 1
            # Clean up so the reused keys do not carry over into the next trial.
            h2i.delete_keys_at_all_nodes()
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'standard/covtype.shuffled.90pct.data'
        csvPathname_test = 'standard/covtype.shuffled.10pct.data'
        hex_key = 'covtype.hex'
        validation_key = 'covtype.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname_train,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets',
                                        path=csvPathname_test,
                                        schema='local',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'adaptive_rate': 0,
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 96.0,
            'destination_key': model_key,
            'validation': validation_key,
            'score_interval': 10000
        }
        expectedErr = 0.24  ## expected validation error for the above model
        relTol = 0.20  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Пример #11
0
    def test_DeepLearning_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        validation_key = 'test.hex'
        timeoutSecs = 300
        parseResult = h2i.import_parse(path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=DO_SUMMARY)
        parseResultV = h2i.import_parse(path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=DO_SUMMARY)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        response = 'any_response'

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # use defaults otherwise
        # need to change epochs otherwise it takes too long
        kwargs = {
            'epochs': 0.001,
            'response': response,
            'destination_key': model_key,
            'validation': validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03  ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.046
        relTol = 0.35  # allow 35% tolerance. kbn
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Пример #12
0
    def test_NN_airlines_small(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test  = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'airlines_train.hex'
        validation_key = 'airlines_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        # this gives the last col number, which is IsDepDelayed_REC (1 or -1)
        # response = inspect['numCols'] - 1

        # this is "YES"/"NO"
        response = 'IsDepDelayed'

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        # get the column names
        colNames = [c['name'] for c in inspect['cols']]
        print "colNames:", colNames
        usedCols = ("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance")

        ignoredCols = []
        for c in colNames:
            # don't put the response in the ignore list (is there a problem if so?)
            if c not in usedCols and c != response:
                ignoredCols.append(c)

        ignoredColsString = ",".join(ignoredCols)
        print "Telling h2o to ignore these cols:"
        print ignoredColsString

        kwargs = {
            'ignored_cols'                 : ignoredColsString,
            'response'                     : response,
            'classification'               : 1,
            'destination_key'              : model_key,
            }
        expectedErr = 0.45 ## expected validation error for the above model
        relTol = 0.50 ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        h2o.beta_features = True
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                            (actualErr, float(relTol)*100, expectedErr))

        h2o.beta_features = False
Пример #13
0
    def test_anomaly_uniform_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, COLS, 'x.hex', 1, 20000),
            (ROWS, COLS, 'x.hex', -5000, 0),
            (ROWS, COLS, 'x.hex', -100000, 100000),
            (ROWS, COLS, 'x.hex', -1, 1),
            (ROWS, COLS, 'A.hex', 1, 100),
            (ROWS, COLS, 'A.hex', -99, 99),
            (ROWS, COLS, 'B.hex', 1, 10000),
            (ROWS, COLS, 'B.hex', -100, 100),
            (ROWS, COLS, 'C.hex', 1, 100000),
            (ROWS, COLS, 'C.hex', -101, 101),
        ]

        trial = 1
        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "numRows:", numRows, "numCols:", numCols

            model_key = "m.hex"
            kwargs = {
                'ignored_cols': None,
                'response': numCols - 1,
                'classification': 0,
                'activation': 'RectifierWithDropout',
                'input_dropout_ratio': 0.2,
                'hidden': '117',
                'adaptive_rate': 0,
                'rate': 0.005,
                'rate_annealing': 1e-6,
                'momentum_start': 0.5,
                'momentum_ramp': 100000,
                'momentum_stable': 0.9,
                'l1': 0.00001,
                'l2': 0.0000001,
                'seed': 98037452452,
                # 'loss'                         : 'CrossEntropy',
                'max_w2': 15,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 2.0,
                'destination_key': model_key,
                # 'validation'                   : None,
                'score_interval': 10000,
                'autoencoder': 1,
            }

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
            print "neural net end. took", time.time() - start, "seconds"

            kwargs = {
                'destination_key': "a.hex",
                'source': parseResult['destination_key'],
                'dl_autoencoder_model': model_key,
                'thresh': 1.0
            }

            anomaly = h2o.nodes[0].anomaly(timeoutSecs=30, **kwargs)
            inspect = h2o_cmd.runInspect(None, "a.hex")
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            print "anomaly: numRows:", numRows, "numCols:", numCols
            self.assertEqual(numCols, 1)
            # twice as many rows because of NA injection
            self.assertEqual(numRows, rowCount * (1 + NA_ROW_RATIO))

            # first col has the anomaly info. other cols are the same as orig data
            aSummary = h2o_cmd.runSummary(key='a.hex', cols=0)
            h2o_cmd.infoFromSummary(aSummary)

            print "anomaly:", h2o.dump_json(anomaly)
            trial += 1
            h2i.delete_keys_at_all_nodes()
Пример #14
0
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '117,131,129',
            'adaptive_rate'                : 0,
            'rate'                         : 0.005,
            'rate_annealing'               : 1e-6,
            'momentum_start'               : 0.5,
            'momentum_ramp'                : 100000,
            'momentum_stable'              : 0.9,
            'l1'                           : 0.00001,
            'l2'                           : 0.0000001,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs'                       : 2.0,
            'destination_key'              : model_key,
            'validation'                   : validation_key,
            'score_interval'               : 10000
            }
        expectedErr = 0.057 ## expected validation error for the above model
        relTol = 0.20 ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        predict_key = 'score_' + identifier + '.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
            }

        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)

        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {
        }

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs, **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm)/100.;

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
            raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                            (actualErr, float(relTol)*100, expectedErr))
Пример #15
0
    def test_DeepLearning_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue    = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse   = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" 

        twoValueList = [
            ('A','B',0, 14),
            ('A','B',1, 14),
            (0,1,0, 12),
            (0,1,1, 12),
            (0,1,'NaN', 12),
            (1,0,'NaN', 12),
            (-1,1,0, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'
            validation_key = hex_key

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols']
            response = 'C' + str(response)

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'activation'                   : 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden'                       : '113,71,54',
                'rate'                         : 0.01,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0,
                'momentum_stable'              : 0,
                'l1'                           : 0.0,
                'l2'                           : 1e-6,
                'seed'                         : 80023842348,
                'loss'                         : 'CrossEntropy',
                #'max_w2'                       : 15,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 100,
                'destination_key'              : model_key,
                'validation'                   : hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "trial #", trial, "Deep Learning end on ", csvFilename, ' took', time.time() - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.00
            relTol = 0.01
            predict_key = 'Predict.hex'

            kwargs = {
                'data_key': validation_key,
                'destination_key': predict_key,
                'model_key': model_key
            }
            predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
            h2o_cmd.runInspect(key=predict_key, verbose=True)

            kwargs = {
            }

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=validation_key,
                vactual=response,
                predict=predict_key,
                vpredict='predict',
                timeoutSecs=timeoutSecs, **kwargs)

            cm = predictCMResult['cm']

            print h2o_gbm.pp_cm(cm)
            actualErr = h2o_gbm.pp_cm_summary(cm)/100.

            print "actual   classification error:" + format(actualErr)
            print "expected classification error:" + format(expectedErr)
            if actualErr != expectedErr and abs((expectedErr - actualErr)/expectedErr) > relTol:
                raise Exception("Scored classification error of %s is not within %s %% relative error of %s" %
                                (actualErr, float(relTol)*100, expectedErr))


            trial += 1
Пример #16
0
    def test_NN2_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 90
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 20.0,
            'destination_key': model_key,
            'validation': validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03  ## observed actual value with Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        #### Now score using the model, and check the validation error
        expectedErr = 0.046
        relTol = 0.1
        predict_key = 'Predict.hex'

        kwargs = {
            'data_key': validation_key,
            'destination_key': predict_key,
            'model_key': model_key
        }
        predictResult = h2o_cmd.runPredict(timeoutSecs=timeoutSecs, **kwargs)
        h2o_cmd.runInspect(key=predict_key, verbose=True)

        kwargs = {}

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=validation_key,
            vactual=response,
            predict=predict_key,
            vpredict='predict',
            timeoutSecs=timeoutSecs,
            **kwargs)

        cm = predictCMResult['cm']

        print h2o_gbm.pp_cm(cm)
        actualErr = h2o_gbm.pp_cm_summary(cm) / 100.

        print "actual   classification error:" + format(actualErr)
        print "expected classification error:" + format(expectedErr)
        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Пример #17
0
    def test_NN2_mnist_multi(self):
        """Train a Deep Learning model on MNIST and check its validation and score errors.

        The train and test archives are parsed from the 'smalldata' bucket and a
        model is built with a fixed parameter set and seed. The result is then
        checked twice against the expected error: once through
        h2o_nn.checkLastValidationError on the training result, and once
        through h2o_nn.checkScoreResult on a separate scoring run over the
        validation key. h2o.beta_features is set on entry and cleared by the
        last statement.
        """
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        # the result is not used below; the test frame is referenced by validation_key
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        # column index numCols - 1 is used as the response
        response = inspect['numCols'] - 1

        # random 10-character id used in the model and score keys
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols': None,
            'response': response,
            'classification': 1,
            'activation': 'RectifierWithDropout',
            'input_dropout_ratio': 0.2,
            'hidden': '117,131,129',
            'rate': 0.005,
            'rate_annealing': 1e-6,
            'momentum_start': 0.5,
            'momentum_ramp': 100000,
            'momentum_stable': 0.9,
            'l1': 0.00001,
            'l2': 0.0000001,
            'seed': 98037452452,
            'loss': 'CrossEntropy',
            'max_w2': 15,
            'initial_weight_distribution': 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs': 20.0,
            'destination_key': model_key,
            'validation': validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03  ## observed actual value with Hogwild

        # the training run gets a longer timeout than the parses above
        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
        ) - start, 'seconds'

        relTol = 0.10  ### 10% relative error is acceptable for Hogwild
        # the training kwargs are forwarded unchanged to the checker
        # NOTE(review): reads 'neuralnet_model' from the runDeepLearning result -
        # confirm that key name matches what runDeepLearning returns
        h2o_nn.checkLastValidationError(self, nn['neuralnet_model'],
                                        inspect['numRows'], expectedErr,
                                        relTol, **kwargs)

        ### Now score using the model, and check the validation error
        kwargs = {
            'source': validation_key,
            'max_rows': 0,
            'response': response,
            'ignored_cols':
            None,  # this is not consistent with ignored_cols_by_name
            'classification': 1,
            'destination_key': 'score_' + identifier + '.hex',
            'model': model_key,
        }
        # key= is the training frame's key, while 'source' in kwargs is the validation key
        nnScoreResult = h2o_cmd.runDeepLearningScore(
            key=parseResult['destination_key'],
            timeoutSecs=timeoutSecs,
            **kwargs)
        h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                **kwargs)

        # not reached if anything above raises
        h2o.beta_features = False