示例#1
0
    def test_GLM_both(self):
        """Run H2O GLM on each listed dataset, then hand the result to R.

        Per dataset: locate the file under 'smalldata', write a cleaned copy
        into SYNDATASETS_DIR, parse that copy, build the R formula from the
        parsed column count, run GLM, check it with simpleCheckGLM and finally
        call glm_R_and_compare.

        Each list entry is (offset, csvFilename, family, y, timeoutSecs):
        'offset' is the folder under 'smalldata' (a false value selects
        'logreg/umass_statdata'), 'y' is the zero-based response column and
        'timeoutSecs' is passed to the GLM call.
        """
        # Zero-based columns kept out of the model, per dataset. One table
        # drives both the R formula and H2O's 'ignored_cols', so a column can
        # no longer be reported as skipped and still end up in the formula.
        excludedCols = {
            'benign.csv': (0, 1),
            'clslowbwt.dat': (6,),
        }

        # Developer toggle: set to False to run the longer dataset list.
        runSingleDataset = True
        if runSingleDataset:
            csvFilenameList = [
                ('logreg', 'benign.csv', 'binomial', 3, 10),
                # col is zero based
                # FIX! what's wrong here? index error
                ## ('uis.dat', 'binomial', 8, 5, False),
                ## ('pros.dat', 'binomial', 1, 10, False),
                ## ('chdage.dat', 'binomial', 2, 5, True),
                ## ('icu.dat', 'binomial', 1, 10, False),
                # how to ignore 6? '1,2,3,4,5', False),
                ## ('clslowbwt.dat', 'binomial', 7, 10, False),
                # ('cgd.dat', 'gaussian', 12, 5, False),
                # ('meexp.dat', 'gaussian', 3, 10, None),
            ]
        else:
            csvFilenameList = [
                # leave out ID and birth weight
                ('logreg', 'benign.csv', 'gaussian', 3, 10),
                (None, 'icu.dat', 'binomial', 1, 10),
                # need to exclude col 0 (ID) and col 10 (bwt)
                # but -x doesn't work..so do 2:9...range doesn't work? FIX!
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'lowbwt.dat', 'binomial', 1, 10),
                (None, 'lowbwtm11.dat', 'binomial', 1, 10),
                (None, 'meexp.dat', 'gaussian', 3, 10),
                # FIX! does this one hang in R?
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'pbc.dat', 'gaussian', 1, 10),
                (None, 'pharynx.dat', 'gaussian', 12, 10),
                (None, 'uis.dat', 'binomial', 8, 10),
            ]

        # The print calls below pass a single pre-formatted string so they
        # emit the same text whether print is a statement or a function.
        trial = 0
        for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList:
            # FIX! do something about this file munging
            folder = offset if offset else 'logreg/umass_statdata'
            csvPathname1 = folder + "/" + csvFilename
            fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)

            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
            h2o_util.file_clean_for_R(fullPathname, csvPathname2)

            # we can inspect this to get the number of cols in the dataset (trust H2O here)
            parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10)
            # we could specify key2 above but this is fine
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            num_cols = inspect['numCols']
            num_rows = inspect['numRows']
            print("num_cols %s num_rows %s" % (num_cols, num_rows))

            # Split the parsed columns into skipped and kept ones.
            skipped = excludedCols.get(csvFilename, ())
            skippedText = ",".join(str(c) for c in skipped)
            included = []
            for c in range(num_cols):
                if c in skipped:
                    print("Not including col %s for this dataset from x" % (skippedText,))
                else:
                    included.append(c)

            # R names the columns V1..Vn (one-based); H2O's x is zero-based.
            # The response column stays in col_names but is kept off the
            # right-hand side of the formula and out of x.
            predictors = [c for c in included if c != y]
            col_names = ",".join("V" + str(c + 1) for c in included)
            formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictors)
            x = ",".join(str(c) for c in predictors) if predictors else None

            print("formula: %s" % (formula,))
            print("col_names: %s" % (col_names,))
            print("x: %s" % (x,))

            kwargs = {
                'n_folds': 0,
                'response': y,
                # what about x?
                'family': family,
                'alpha': 0,
                'lambda': 0,
                'beta_epsilon': 1.0E-4,
                'max_iter': 50,
            }
            if skipped:
                kwargs['ignored_cols'] = skippedText

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, elapsed))

            h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
            # now do it thru R and compare
            (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults)

            trial += 1
            print("\nTrial # %s" % (trial,))
示例#2
0
    def test_GLM_both(self):
        """Run H2O GLM on each listed dataset, then hand the result to R.

        Sets h2o.beta_features to True first; the inspect key names and the
        GLM parameter names below are chosen from that flag.

        Per dataset: locate the file under 'smalldata', write a cleaned copy
        into SYNDATASETS_DIR, parse that copy, build the R formula from the
        parsed column count, run GLM, check it with simpleCheckGLM and finally
        call glm_R_and_compare.

        Each list entry is (offset, csvFilename, family, y, timeoutSecs):
        'offset' is the folder under 'smalldata' (a false value selects
        'logreg/umass_statdata'), 'y' is the zero-based response column and
        'timeoutSecs' is passed to the GLM call.
        """
        h2o.beta_features = True

        # Zero-based columns kept out of the model, per dataset. One table
        # drives both the R formula / x list and 'ignored_cols', so a column
        # can no longer be reported as skipped and still end up in the formula.
        excludedCols = {
            'benign.csv': (0, 1),
            'clslowbwt.dat': (6,),
        }

        # Developer toggle: set to False to run the longer dataset list.
        runSingleDataset = True
        if runSingleDataset:
            csvFilenameList = [
                ('logreg', 'benign.csv', 'binomial', 3, 10),
                # col is zero based
                # FIX! what's wrong here? index error
                ## ('uis.dat', 'binomial', 8, 5, False),
                ## ('pros.dat', 'binomial', 1, 10, False),
                ## ('chdage.dat', 'binomial', 2, 5, True),
                ## ('icu.dat', 'binomial', 1, 10, False),
                # how to ignore 6? '1,2,3,4,5', False),
                ## ('clslowbwt.dat', 'binomial', 7, 10, False),
                # ('cgd.dat', 'gaussian', 12, 5, False),
                # ('meexp.dat', 'gaussian', 3, 10, None),
            ]
        else:
            csvFilenameList = [
                # leave out ID and birth weight
                ('logreg', 'benign.csv', 'gaussian', 3, 10),
                (None, 'icu.dat', 'binomial', 1, 10),
                # need to exclude col 0 (ID) and col 10 (bwt)
                # but -x doesn't work..so do 2:9...range doesn't work? FIX!
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'lowbwt.dat', 'binomial', 1, 10),
                (None, 'lowbwtm11.dat', 'binomial', 1, 10),
                (None, 'meexp.dat', 'gaussian', 3, 10),
                # FIX! does this one hang in R?
                (None, 'nhanes3.dat', 'binomial', 15, 10),
                (None, 'pbc.dat', 'gaussian', 1, 10),
                (None, 'pharynx.dat', 'gaussian', 12, 10),
                (None, 'uis.dat', 'binomial', 8, 10),
            ]

        # The print calls below pass a single pre-formatted string so they
        # emit the same text whether print is a statement or a function.
        trial = 0
        for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList:
            # FIX! do something about this file munging
            folder = offset if offset else 'logreg/umass_statdata'
            csvPathname1 = folder + "/" + csvFilename
            fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)

            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
            h2o_util.file_clean_for_R(fullPathname, csvPathname2)

            # we can inspect this to get the number of cols in the dataset (trust H2O here)
            parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10)
            # we could specify key2 above but this is fine
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            # The inspect result uses different key names depending on the flag.
            if h2o.beta_features:
                colsKey, rowsKey = 'numCols', 'numRows'
            else:
                colsKey, rowsKey = 'num_cols', 'num_rows'
            num_cols = inspect[colsKey]
            num_rows = inspect[rowsKey]
            print("num_cols %s num_rows %s" % (num_cols, num_rows))

            # Split the parsed columns into skipped and kept ones.
            skipped = excludedCols.get(csvFilename, ())
            skippedText = ",".join(str(c) for c in skipped)
            included = []
            for c in range(num_cols):
                if c in skipped:
                    print("Not including col %s for this dataset from x" % (skippedText,))
                else:
                    included.append(c)

            # R names the columns V1..Vn (one-based); H2O's x is zero-based.
            # The response column stays in col_names but is kept off the
            # right-hand side of the formula and out of x.
            predictors = [c for c in included if c != y]
            col_names = ",".join("V" + str(c + 1) for c in included)
            formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictors)
            x = ",".join(str(c) for c in predictors) if predictors else None

            print("formula: %s" % (formula,))
            print("col_names: %s" % (col_names,))
            print("x: %s" % (x,))

            if h2o.beta_features:
                kwargs = {
                    'n_folds': 0,
                    'response': y,
                    # what about x?
                    'family': family,
                    'alpha': 0,
                    'lambda': 0,
                    'beta_epsilon': 1.0E-4,
                    'max_iter': 50,
                }
            else:
                kwargs = {
                    'n_folds': 0,
                    'y': y,
                    'x': x,
                    'family': family,
                    'alpha': 0,
                    'lambda': 1e-4,
                    'beta_epsilon': 1.0E-4,
                    'max_iter': 50,
                }

            # Set for both parameter styles, as before.
            if skipped:
                kwargs['ignored_cols'] = skippedText

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, elapsed))

            h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
            # now do it thru R and compare
            (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y, h2oResults=h2oResults)

            trial += 1
            print("\nTrial # %s" % (trial,))
示例#3
0
    def test_GLM_umass(self):
        """Run H2O GLM on the UMass stat datasets, then hand the result to R.

        Per dataset: find the file under smalldata/logreg/umass_statdata,
        write a cleaned copy into SYNDATASETS_DIR, parse that copy, build the
        R formula and the H2O x list from the parsed column count, run GLM,
        check it with simpleCheckGLM and finally call glm_R_and_compare.

        Each list entry is (csvFilename, family, y, timeoutSecs, header):
        'y' is the zero-based response column, 'timeoutSecs' is passed to the
        GLM call and 'header' is forwarded to glm_R_and_compare.
        """
        # Developer toggle: set to False to run the alternate dataset list.
        runDefaultList = True
        if runDefaultList:
            csvFilenameList = [
                # col is zero based
                # FIX! what's wrong here? index error
                ("uis.dat", "binomial", 8, 5, False),
                # ('cgd.dat', 'gaussian', 12, 5, False),
                # ('meexp.dat', 'gaussian', 3, 10, None),
                ("pros.dat", "binomial", 1, 10, False),
                ("chdage.dat", "binomial", 2, 5, True),
                ("icu.dat", "binomial", 1, 10, False),
                # how to ignore 6? '1,2,3,4,5', False),
                ("clslowbwt.dat", "binomial", 7, 10, False),
            ]
        else:
            csvFilenameList = [
                # leave out ID and birth weight
                ("icu.dat", "binomial", 1, 10, None),
                # need to exclude col 0 (ID) and col 10 (bwt)
                # but -x doesn't work..so do 2:9...range doesn't work? FIX!
                # (this entry was a 4-tuple, which cannot be unpacked into the
                # five loop variables below; header is now given explicitly)
                ("nhanes3.dat", "binomial", 15, 10, None),
                ("lowbwt.dat", "binomial", 1, 10, "2,3,4,5,6,7,8,9"),
                ("lowbwtm11.dat", "binomial", 1, 10, None),
                ("meexp.dat", "gaussian", 3, 10, None),
                # FIX! does this one hang in R?
                ("nhanes3.dat", "binomial", 15, 10, None),
                ("pbc.dat", "gaussian", 1, 10, None),
                ("pharynx.dat", "gaussian", 12, 10, None),
                ("uis.dat", "binomial", 8, 10, None),
            ]

        # The print calls below pass a single pre-formatted string so they
        # emit the same text whether print is a statement or a function.
        trial = 0
        for (csvFilename, family, y, timeoutSecs, header) in csvFilenameList:
            # FIX! do something about this file munging
            csvPathname1 = h2o.find_file("smalldata/logreg/umass_statdata/" + csvFilename)
            csvPathname2 = SYNDATASETS_DIR + "/" + csvFilename + "_2.csv"
            h2o_util.file_clean_for_R(csvPathname1, csvPathname2)

            # we can inspect this to get the number of cols in the dataset (trust H2O here)
            parseKey = h2o_cmd.parseFile(None, csvPathname2, key=csvFilename, timeoutSecs=10)
            # we could specify key2 above but this is fine
            inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
            num_cols = inspect["num_cols"]
            num_rows = inspect["num_rows"]
            print("num_cols %s num_rows %s" % (num_cols, num_rows))

            # R names the columns V1..Vn (one-based); H2O's x is zero-based.
            # Every column is listed in col_names; the response column is
            # kept off the right-hand side of the formula and out of x.
            allCols = list(range(num_cols))
            predictors = [c for c in allCols if c != y]
            col_names = ",".join("V" + str(c + 1) for c in allCols)
            formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictors)
            x = ",".join(str(c) for c in predictors) if predictors else None

            print("formula: %s" % (formula,))
            print("col_names: %s" % (col_names,))
            print("x: %s" % (x,))

            kwargs = {
                "n_folds": 0,
                "y": y,
                "x": x,
                "family": family,
                "link": "familyDefault",
                "alpha": 0,
                "lambda": 0,
                "case_mode": "=",
                "case": 1,
                "beta_eps": 1.0e-4,
                "max_iter": 50,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, elapsed))

            h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
            # now do it thru R and compare
            (warningsR, cListR, interceptR) = glm_R_and_compare(
                self, csvPathname2, family, formula, y, header=header, h2oResults=h2oResults
            )

            trial += 1
            print("\nTrial # %s" % (trial,))
示例#4
0
    def test_GLM_umass(self):
        """Run H2O GLM on the UMass stat datasets, then hand the result to R.

        Per dataset: find the file under smalldata/logreg/umass_statdata,
        write a cleaned copy into SYNDATASETS_DIR, parse that copy, build the
        R formula and the H2O x list from the parsed column count, run GLM,
        check it with simpleCheckGLM and finally call glm_R_and_compare.

        Each list entry is (csvFilename, family, y, timeoutSecs, header):
        'y' is the zero-based response column, 'timeoutSecs' is passed to the
        GLM call and 'header' is forwarded to glm_R_and_compare.
        """
        # Zero-based columns kept out of the formula and x, per dataset.
        excludedCols = {
            'clslowbwt.dat': (6,),
        }

        # Developer toggle: set to False to run the alternate dataset list.
        runDefaultList = True
        if runDefaultList:
            csvFilenameList = [
                # col is zero based
                # FIX! what's wrong here? index error
                ('uis.dat', 'binomial', 8, 5, False),
                # ('cgd.dat', 'gaussian', 12, 5, False),
                # ('meexp.dat', 'gaussian', 3, 10, None),
                ('pros.dat', 'binomial', 1, 10, False),
                ('chdage.dat', 'binomial', 2, 5, True),
                ('icu.dat', 'binomial', 1, 10, False),
                # how to ignore 6? '1,2,3,4,5', False),
                ('clslowbwt.dat', 'binomial', 7, 10, False),
            ]
        else:
            csvFilenameList = [
                # leave out ID and birth weight
                ('icu.dat', 'binomial', 1, 10, None),
                # need to exclude col 0 (ID) and col 10 (bwt)
                # but -x doesn't work..so do 2:9...range doesn't work? FIX!
                # (this entry was a 4-tuple, which cannot be unpacked into the
                # five loop variables below; header is now given explicitly)
                ('nhanes3.dat', 'binomial', 15, 10, None),
                ('lowbwt.dat', 'binomial', 1, 10, '2,3,4,5,6,7,8,9'),
                ('lowbwtm11.dat', 'binomial', 1, 10, None),
                ('meexp.dat', 'gaussian', 3, 10, None),
                # FIX! does this one hang in R?
                ('nhanes3.dat', 'binomial', 15, 10, None),
                ('pbc.dat', 'gaussian', 1, 10, None),
                ('pharynx.dat', 'gaussian', 12, 10, None),
                ('uis.dat', 'binomial', 8, 10, None),
            ]

        # The print calls below pass a single pre-formatted string so they
        # emit the same text whether print is a statement or a function.
        trial = 0
        for (csvFilename, family, y, timeoutSecs, header) in csvFilenameList:
            # FIX! do something about this file munging
            csvPathname1 = h2o.find_file("smalldata/logreg/umass_statdata/" + csvFilename)
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
            h2o_util.file_clean_for_R(csvPathname1, csvPathname2)

            # we can inspect this to get the number of cols in the dataset (trust H2O here)
            parseKey = h2o_cmd.parseFile(None, csvPathname2, key=csvFilename, timeoutSecs=10)
            # we could specify key2 above but this is fine
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print("num_cols %s num_rows %s" % (num_cols, num_rows))

            # Split the parsed columns into skipped and kept ones.
            skipped = excludedCols.get(csvFilename, ())
            skippedText = ",".join(str(c) for c in skipped)
            included = []
            for c in range(num_cols):
                if c in skipped:
                    print("Not including col %s for this dataset from x" % (skippedText,))
                else:
                    included.append(c)

            # R names the columns V1..Vn (one-based); H2O's x is zero-based.
            # The response column stays in col_names but is kept off the
            # right-hand side of the formula and out of x.
            predictors = [c for c in included if c != y]
            col_names = ",".join("V" + str(c + 1) for c in included)
            formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictors)
            x = ",".join(str(c) for c in predictors) if predictors else None

            print("formula: %s" % (formula,))
            print("col_names: %s" % (col_names,))
            print("x: %s" % (x,))

            kwargs = {
                'n_folds': 0,
                'y': y,
                'x': x,
                'family': family,
                'link': 'familyDefault',
                'alpha': 0,
                'lambda': 0,
                'case_mode': '=',
                'case': 1,
                'beta_epsilon': 1.0E-4,
                'max_iter': 50,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, elapsed))

            h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
            # now do it thru R and compare
            (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y,
                header=header, h2oResults=h2oResults)

            trial += 1
            print("\nTrial # %s" % (trial,))
示例#5
0
    def test_GLM_umass(self):
        """Run H2O GLM on the UMass stat datasets, then hand the result to R.

        Per dataset: find the file under smalldata/logreg/umass_statdata,
        write a cleaned copy into SYNDATASETS_DIR, parse that copy, build the
        R formula and the H2O x list from the parsed column count, run GLM,
        check it with simpleCheckGLM and finally call glm_R_and_compare.

        Each list entry is (csvFilename, family, y, timeoutSecs, header):
        'y' is the zero-based response column, 'timeoutSecs' is passed to the
        GLM call and 'header' is forwarded to glm_R_and_compare.
        """
        # Developer toggle: set to False to run the alternate dataset list.
        runDefaultList = True
        if runDefaultList:
            csvFilenameList = [
                # col is zero based
                # FIX! what's wrong here? index error
                ('uis.dat', 'binomial', 8, 5, False),
                # ('cgd.dat', 'gaussian', 12, 5, False),
                # ('meexp.dat', 'gaussian', 3, 10, None),
                ('pros.dat', 'binomial', 1, 10, False),
                ('chdage.dat', 'binomial', 2, 5, True),
                ('icu.dat', 'binomial', 1, 10, False),
                # how to ignore 6? '1,2,3,4,5', False),
                ('clslowbwt.dat', 'binomial', 7, 10, False),
            ]
        else:
            csvFilenameList = [
                # leave out ID and birth weight
                ('icu.dat', 'binomial', 1, 10, None),
                # need to exclude col 0 (ID) and col 10 (bwt)
                # but -x doesn't work..so do 2:9...range doesn't work? FIX!
                # (this entry was a 4-tuple, which cannot be unpacked into the
                # five loop variables below; header is now given explicitly)
                ('nhanes3.dat', 'binomial', 15, 10, None),
                ('lowbwt.dat', 'binomial', 1, 10, '2,3,4,5,6,7,8,9'),
                ('lowbwtm11.dat', 'binomial', 1, 10, None),
                ('meexp.dat', 'gaussian', 3, 10, None),
                # FIX! does this one hang in R?
                ('nhanes3.dat', 'binomial', 15, 10, None),
                ('pbc.dat', 'gaussian', 1, 10, None),
                ('pharynx.dat', 'gaussian', 12, 10, None),
                ('uis.dat', 'binomial', 8, 10, None),
            ]

        # The print calls below pass a single pre-formatted string so they
        # emit the same text whether print is a statement or a function.
        trial = 0
        for (csvFilename, family, y, timeoutSecs, header) in csvFilenameList:
            # FIX! do something about this file munging
            csvPathname1 = h2o.find_file("smalldata/logreg/umass_statdata/" + csvFilename)
            csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
            h2o_util.file_clean_for_R(csvPathname1, csvPathname2)

            # we can inspect this to get the number of cols in the dataset (trust H2O here)
            parseKey = h2o_cmd.parseFile(None, csvPathname2, key=csvFilename, timeoutSecs=10)
            # we could specify key2 above but this is fine
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
            print("num_cols %s num_rows %s" % (num_cols, num_rows))

            # R names the columns V1..Vn (one-based); H2O's x is zero-based.
            # Every column is listed in col_names; the response column is
            # kept off the right-hand side of the formula and out of x.
            allCols = list(range(num_cols))
            predictors = [c for c in allCols if c != y]
            col_names = ",".join("V" + str(c + 1) for c in allCols)
            formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictors)
            x = ",".join(str(c) for c in predictors) if predictors else None

            print("formula: %s" % (formula,))
            print("col_names: %s" % (col_names,))
            print("x: %s" % (x,))

            kwargs = {
                'num_cross_validation_folds': 0,
                'y': y,
                'x': x,
                'family': family,
                'link': 'familyDefault',
                'alpha': 0,
                'lambda': 0,
                'case_mode': '=',
                'case': 1,
                'beta_epsilon': 1.0E-4,
                'max_iter': 50,
            }

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, elapsed))

            h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)
            # now do it thru R and compare
            # NOTE(review): glm_R_and_compare is called without `self` here;
            # confirm that matches the helper's signature in this file.
            (warningsR, cListR, interceptR) = glm_R_and_compare(csvPathname2, family, formula, y,
                header=header, h2oResults=h2oResults)

            trial += 1
            print("\nTrial # %s" % (trial,))