예제 #1
0
def generate_scipy_comparison(csvPathname, col=0, h2oMedian=None, h2oMedian2=None):
    # Read one column of a CSV file and print its quantiles as computed by
    # scipy and by a sort-based helper, next to values supplied by the caller.
    #
    # csvPathname: path of the comma-delimited file to read.
    # col:         index of the column to use when the parsed data has more
    #              than one dimension; unused for one-dimensional data.
    # h2oMedian:   value printed under the "from h2o summary2:" label.
    # h2oMedian2:  not referenced in the readable part of this function;
    #              presumably the "from h2o quantile multipass:" value
    #              -- TODO confirm against an undamaged copy.
    #
    # Depends on h2p, h2o_summ and DO_MEDIAN, which are defined outside this block.
    #
    # this is some hack code for reading the csv and doing some percentile stuff in scipy
    # from numpy import loadtxt, genfromtxt, savetxt
    import numpy as np
    import scipy as sp

    # NOTE(review): the file object created by open() is never explicitly closed.
    dataset = np.genfromtxt(
        open(csvPathname, 'r'),
        delimiter=',',
        # skip_header=1,
        dtype=None); # dtype=None asks genfromtxt to guess the column types

    print "csv read for training, done"
    # NUMCLASSES is only used as the bin count of the disabled histogram
    # debug output further down.
    NUMCLASSES = 10
    # NOTE(review): same message as a few lines above, so it is printed twice.
    print "csv read for training, done"

    # Select the data to analyse: column `col` of every row when the data has
    # more than one dimension, otherwise the data as read.
    print dataset.shape
    if len(dataset.shape) > 1:
        target = [x[col] for x in dataset]
    else:
        target = dataset

    # we may have read it in as a string. coerce to number
    # NOTE(review): np.float is a deprecated alias of the builtin float and
    # is missing from recent NumPy releases -- confirm the NumPy version in use.
    targetFP = np.array(target, np.float)

    # Disabled debug output: the condition is always False, so nothing in
    # this branch runs.
    if 1==0:
        n_features = len(dataset[0]) - 1;
        print "n_features:", n_features

        # get the end
        # target = [x[-1] for x in dataset]
        # get the 2nd col

        print "histogram of target"
        print target
        print sp.histogram(target, bins=NUMCLASSES)

        print target[0]
        print target[1]

    # Probabilities handed to mquantiles; index 5 is 0.5 and index 10 is
    # 0.999, which is what the a[...] lookup below relies on.
    thresholds   = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
    print "scipy per:", thresholds
    from scipy import stats
    # a = stats.scoreatpercentile(target, per=per)
    a = stats.mstats.mquantiles(targetFP, prob=thresholds)
    # Two-decimal strings, used only for the printed report.
    a2 = ["%.2f" % v for v in a]
    h2p.red_print("scipy stats.mstats.mquantiles:", a2)

    # also get the median with a painful sort (h2o_summ.percentileOnSortedlist()
    # inplace sort
    targetFP.sort()
    # DO_MEDIAN switches the comparison between the 50% and the 99.9% quantile.
    b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
    label = '50%' if DO_MEDIAN else '99.9%'
    h2p.blue_print(label, "from sort:", b)
    # Pick the scipy result for the same probability out of `a`.
    s = a[5 if DO_MEDIAN else 10]
    h2p.blue_print(label, "from scipy:", s)
    h2p.blue_print(label, "from h2o summary2:", h2oMedian)
    # NOTE(review): the next line is damaged in this copy of the source (part
    # of the text was replaced by "******"). Whatever statements originally
    # sat between this call and the indented tail below are not visible here.
    h2p.blue_print(label, "from h2o quantile multipass:"******"%.2f" % v for v in a]
        h2p.red_print("after sort")
        h2p.red_print("scipy stats.mstats.mquantiles:", a2)
예제 #2
0
# mquantiles plotting positions; (alphap, betap) = (1, 1) is "type 7".
alphap = 1
betap = 1

from scipy import stats

# Keyword arguments shared by both mquantiles calls below.
mquantiles_kwargs = dict(prob=[OTHER_T], alphap=alphap, betap=betap)

# First pass (red output): targetFP in the order it arrived.
a1 = stats.scoreatpercentile(
    target,
    per=100 * OTHER_T,
    interpolation_method='fraction',
)
h2p.red_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP, **mquantiles_kwargs)
h2p.red_print("scipy stats.mstats.mquantiles:", a2)
# Sort in place before handing the data to the sorted-list helper.
targetFP.sort()
b = h2o_summ.percentileOnSortedList(
    targetFP, OTHER_T, interpolate='linear')
h2p.red_print("sort algo:", b)
h2p.red_print("from h2o (multi):", quantiles[0])

# Second pass (blue output): targetFP is now sorted. a1 is printed again
# without being recomputed; a2 and b are recomputed on the sorted array.
print("Now looking at the sorted list..same thing")
h2p.blue_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP, **mquantiles_kwargs)
h2p.blue_print("scipy stats.mstats.mquantiles:", a2)
b = h2o_summ.percentileOnSortedList(
    targetFP, OTHER_T, interpolate='linear')
h2p.blue_print("sort algo:", b)
h2p.blue_print("from h2o (multi):", quantiles[0])
예제 #3
0
# Quantile evaluated both by the local sort and by the h2o expression below.
QUANTILE = 0.25
print("stress the 1000 fixed binning based on (max-min)/1000")

# Test values: triples that differ only in the 8th significant digit, at
# several magnitudes on both sides of zero. The order matters, because
# initList is built from the list before it is sorted.
a = [
    -1.0000002e10, -1.0000001e10, -1.0000000e10,
    -1.0000002e9, -1.0000001e9, -1.0000000e9,
    -1.0000002e6, -1.0000001e6, -1.0000000e6,
    -1.0000002e3, -1.0000001e3, -1.0000000e3,
    -1.0, 0.0000000, 1.0,
    1.0000002e3, 1.0000001e3, 1.0000000e3,
    1.0000002e6, 1.0000001e6, 1.0000000e6,
    1.0000002e9, 1.0000001e9, 1.0000000e9,
    1.0000002e10, 1.0000001e10, 1.0000000e10,
]

# Expression that defines `ddd` from the values in their original order.
ddd_values = ",".join(str(value) for value in a)
initList = ["ddd = c(%s)" % ddd_values]

# get expected result
# (in-place sort: `a` itself is reordered from here on)
a.sort()
expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear')
print("expectedP: %s" % (expectedP,))
h2p.blue_print("sort result, expectedP:", expectedP)

# (expression, 1) pairs; the expression asks for QUANTILE of column 1 of ddd.
quantile_expr = "abc = quantile(ddd[,1], c(%s))" % QUANTILE
exprList = [(quantile_expr, 1)]


class Basic(unittest.TestCase):
    """unittest test case for the quantile expression set up above.

    NOTE(review): this excerpt ends right after the first statement of
    setUpClass, so the rest of the class is not visible here.
    """
    def tearDown(self):
        """Call h2o.check_sandbox_for_errors() after each test method."""
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        """Class-level setup; only the ``global SEED`` declaration is visible."""
        global SEED
예제 #4
0
    1.0000000e6,
    1.0000002e9,
    1.0000001e9,
    1.0000000e9,
    1.0000002e10,
    1.0000001e10,
    1.0000000e10
]

# Expression that defines `ddd` from the values of `a` in their current
# (not yet sorted) order.
ddd_values = ",".join(str(value) for value in a)
initList = ["ddd = c(%s)" % ddd_values]

# get expected result
# (in-place sort: `a` itself is reordered from here on)
a.sort()
expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear')
print("expectedP: %s" % (expectedP,))
h2p.blue_print("sort result, expectedP:", expectedP)

# (expression, 1) pairs; the expression asks for QUANTILE of column 1 of ddd.
quantile_expr = "abc = quantile(ddd[,1], c(%s))" % QUANTILE
exprList = [(quantile_expr, 1)]

class Basic(unittest.TestCase):
    """unittest test case for the quantile expression set up above.

    NOTE(review): the excerpt may end before the end of the class; only
    tearDown and the start of setUpClass are visible here.
    """
    def tearDown(self):
        """Call h2o.check_sandbox_for_errors() after each test method."""
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        """Store the result of h2o.setup_random_seed() in the module-level SEED."""
        global SEED
        SEED = h2o.setup_random_seed()
예제 #5
0
파일: binquant.py 프로젝트: earlh/h2o


# mquantiles plotting positions. (alphap, betap) = (1, 1) is "type 7".
# An approximation that was good when comparing to h2o type 2 used
# (0.4, 0.4) instead; it is kept here only as a note, since assigning it
# first and overwriting it straight away had no effect.
alphap = 1
betap = 1


from scipy import stats

# Keyword arguments shared by both mquantiles calls below.
mquantiles_kwargs = dict(prob=[OTHER_T], alphap=alphap, betap=betap)

# First pass (red output): targetFP in the order it arrived.
a1 = stats.scoreatpercentile(target, per=100 * OTHER_T, interpolation_method='fraction')
h2p.red_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP, **mquantiles_kwargs)
h2p.red_print("scipy stats.mstats.mquantiles:", a2)
# Sort in place before handing the data to the sorted-list helper.
targetFP.sort()
b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear')
h2p.red_print("sort algo:", b)
h2p.red_print("from h2o (multi):", quantiles[0])

# Second pass (blue output): targetFP is now sorted. a1 is printed again
# without being recomputed; a2 and b are recomputed on the sorted array.
# The single-argument call form prints the same text on Python 2 and 3.
print("Now looking at the sorted list..same thing")
h2p.blue_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP, **mquantiles_kwargs)
h2p.blue_print("scipy stats.mstats.mquantiles:", a2)
b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear')
h2p.blue_print("sort algo:", b)
h2p.blue_print("from h2o (multi):", quantiles[0])