Python createIgnoredCols 예제들, h2o_cmd.createIgnoredCols Python 예제들

예제 #1

0

파일 보기

    def test_GLM2_tweedie(self):
        csvFilename = "AutoClaim.csv"
        csvPathname = 'standard/' + csvFilename
        print "\nStarting", csvPathname
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
        # columns start at 0
        # regress: glm(CLM_AMT ~ CAR_USE + REVOLKED + GENDER + AREA + MARRIED + CAR_TYPE, data=AutoClaim, family=tweedie(1.34))
        
        coefs = [7, 13, 20, 27, 21, 11]
        y = 4
        ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult['destination_key'], cols=coefs, response=y)

        # sapply(c('CLM_AMT', 'CAR_USE', 'REVOLKED', 'GENDER', 'AREA', 'MARRIED', 'CAR_TYPE'), function(x) which(x==colnames(AutoClaim)) - 1)
        kwargs = {
                'family': 'tweedie',
                'tweedie_variance_power': 1.36,
                'response': y, 
                'ignored_cols' : ignored_cols,
                'max_iter': 10, 
                'lambda': 0,
                'alpha': 0,
                'n_folds': 0,
                'beta_epsilon': 1e-4,
        }

        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=15, **kwargs)

        coefficientsExpected = {'Intercept': 0, 'GENDER.M': 0.0014842488782470984, 'CAR_TYPE.Sports Car': 0.07786742314454961, 'MARRIED.Yes': 0.0007748552195851079, 'CAR_TYPE.SUV': 0.07267702940249621, 'CAR_TYPE.Pickup': 0.04952083408742968, 'CAR_TYPE.Van': 0.026422137690691405, 'CAR_TYPE.Sedan': 0.05128350794060489, 'CAR_USE.Private': -0.03050194832853935, 'REVOLKED.Yes': -0.05095942737408699}

        deltaExpected = 0.05
        (warnings, coefficients, intercept) = h2o_glm.simpleCheckGLM(self, glm, None,   
            coefficientsExpected=coefficientsExpected, deltaExpected=deltaExpected, **kwargs)
        print 'coefficients: %s' % (str(coefficients))

예제 #2

0

파일 보기

    def test_GLM2_umass(self):
        h2o.beta_features = True
        csvFilenameList = [
            ('cgd.dat', 'gaussian', 12, 30, None),
            ('chdage.dat', 'binomial', 2, 30, None),

            # leave out ID and birth weight
            ('clslowbwt.dat', 'binomial', 7, 60, [1, 2, 3, 4, 5]),
            ('icu.dat', 'binomial', 1, 60, None),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            ('lowbwt.dat', 'binomial', 1, 60, [2, 3, 4, 5, 6, 7, 8, 9]),
            ('lowbwtm11.dat', 'binomial', 1, 60, None),
            ('meexp.dat', 'gaussian', 3, 60, None),
            ('nhanes3.dat', 'binomial', 15, 60, None),
            ('pbc.dat', 'gaussian', 1, 60, None),
            ('pharynx.dat', 'gaussian', 12, 60, None),
            ('pros.dat', 'binomial', 1, 60, None),
            ('uis.dat', 'binomial', 8, 60, None),
        ]

        trial = 0
        for i in range(3):
            for (csvFilename, family, y, timeoutSecs, x) in csvFilenameList:
                csvPathname = "logreg/umass_statdata/" + csvFilename
                kwargs = {
                    'n_folds': 3,
                    'response': y,
                    'family': family,
                    'alpha': 1,
                    'lambda': 1e-4
                }

                parseResult = h2i.import_parse(bucket='smalldata',
                                               path=csvPathname,
                                               schema='put',
                                               timeoutSecs=timeoutSecs)
                if x is not None:
                    ignored_cols = h2o_cmd.createIgnoredCols(
                        key=parseResult['destination_key'], cols=x, response=y)
                    kwargs['ignored_cols'] = ignored_cols

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                print "glm end (w/check) on ", csvPathname, 'took', time.time(
                ) - start, 'seconds'
                trial += 1
                print "\nTrial #", trial

예제 #3

0

파일 보기

파일: test_GLM2_umass.py 프로젝트: jimmy0000/h2o

    def test_GLM2_umass(self):
        h2o.beta_features = True
        csvFilenameList = [
            ('cgd.dat', 'gaussian', 12, 5, None),
            ('chdage.dat', 'binomial', 2, 5, None),
    
            # leave out ID and birth weight
            ('clslowbwt.dat', 'binomial', 7, 30, [1,2,3,4,5]),
            ('icu.dat', 'binomial', 1, 30, None),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            ('lowbwt.dat', 'binomial', 1, 30, [2,3,4,5,6,7,8,9]),
            ('lowbwtm11.dat', 'binomial', 1, 30, None),
            ('meexp.dat', 'gaussian', 3, 30, None),
            ('nhanes3.dat', 'binomial', 15, 30, None),
            ('pbc.dat', 'gaussian', 1, 30, None),
            ('pharynx.dat', 'gaussian', 12, 30, None),
            ('pros.dat', 'binomial', 1, 30, None),
            ('uis.dat', 'binomial', 8, 30, None),
            ]

        trial = 0
        for i in range(3):
            for (csvFilename, family, y, timeoutSecs, x) in csvFilenameList:
                csvPathname = "logreg/umass_statdata/" + csvFilename
                kwargs = {'n_folds': 3, 'response': y, 'family': family, 'alpha': 1, 'lambda': 1e-4}


                parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put', 
                    timeoutSecs=timeoutSecs)
                if x is not None:
                    ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult['destination_key'], 
                        cols=x, response=y)
                    kwargs['ignored_cols'] = ignored_cols


                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                print "glm end (w/check) on ", csvPathname, 'took', time.time() - start, 'seconds'
                trial += 1
                print "\nTrial #", trial

예제 #4

0

파일 보기

파일: test_GLM2_umass.py 프로젝트: hectorfung/h2o

    def test_GLM2_umass(self):
        h2o.beta_features = True
        csvFilenameList = [
            ("cgd.dat", "gaussian", 12, 5, None),
            ("chdage.dat", "binomial", 2, 5, None),
            # leave out ID and birth weight
            ("clslowbwt.dat", "binomial", 7, 10, [1, 2, 3, 4, 5]),
            ("icu.dat", "binomial", 1, 10, None),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            ("lowbwt.dat", "binomial", 1, 10, [2, 3, 4, 5, 6, 7, 8, 9]),
            ("lowbwtm11.dat", "binomial", 1, 10, None),
            ("meexp.dat", "gaussian", 3, 10, None),
            ("nhanes3.dat", "binomial", 15, 10, None),
            ("pbc.dat", "gaussian", 1, 10, None),
            ("pharynx.dat", "gaussian", 12, 10, None),
            ("pros.dat", "binomial", 1, 10, None),
            ("uis.dat", "binomial", 8, 10, None),
        ]

        trial = 0
        for i in range(3):
            for (csvFilename, family, y, timeoutSecs, x) in csvFilenameList:
                csvPathname = "logreg/umass_statdata/" + csvFilename
                kwargs = {"n_folds": 2, "response": y, "family": family, "alpha": 1, "lambda": 1e-4}

                parseResult = h2i.import_parse(
                    bucket="smalldata", path=csvPathname, schema="put", timeoutSecs=timeoutSecs
                )
                if x is not None:
                    ignored_cols = h2o_cmd.createIgnoredCols(key=parseResult["destination_key"], cols=x, response=y)
                    kwargs["ignored_cols"] = ignored_cols

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                print "glm end (w/check) on ", csvPathname, "took", time.time() - start, "seconds"
                trial += 1
                print "\nTrial #", trial