示例#1
0
def test_data_indexing():
    """Index the micro data by era and by region and compare the selections.

    Each selection is compared against ``micro_data`` built from the listed
    row positions. Unknown keys are expected to raise ``IndexError``.
    """

    d = micro_data()

    # key used to index the data -> row positions passed to micro_data
    rows_by_era = [
        ('era1', [0]),
        ('era2', [1, 2]),
        ('era3', [3, 4, 5]),
        ('era4', [6]),
        ('eraX', [7, 8, 9]),
    ]
    msg = 'error indexing data by era'
    for key, rows in rows_by_era:
        ade(d[key], micro_data(rows), msg)

    rows_by_region = [
        ('train', [0, 1, 2]),
        ('validation', [3, 4, 5, 6]),
        ('test', [7, 8]),
        ('live', [9]),
    ]
    msg = 'error indexing data by region'
    for key, rows in rows_by_region:
        ade(d[key], micro_data(rows), msg)

    msg = 'error indexing data by array'
    # TODO: the array-indexing checks below are not enabled yet
    # ade(d[d.y['kazutsugi'] == 0], micro_data([0, 2, 4, 6, 8, 9]), msg)
    # ade(d[d.era == 'era4'], micro_data([6]), msg)

    # keys that are neither accepted era nor region names must be rejected
    for bad_key in ('era', 'wtf', None):
        assert_raises(IndexError, d.__getitem__, bad_key)
示例#2
0
def test_data_loc():
    """Select rows through ``data.loc`` with lists of ids.

    Each selection is compared against ``micro_data`` built from the same
    row positions, in the same order.
    """
    d = micro_data()
    msg = 'data.loc indexing error'
    for rows in ([1], [4], [4, 0], [4, 0, 2]):
        ids = ['index%d' % row for row in rows]
        ade(d.loc[ids], micro_data(rows), msg)
示例#3
0
def test_data_methods():
    """Check length, size, shape and self-equality of the micro data."""
    d = micro_data()

    n_rows = len(d)
    ok_(n_rows == 10, "wrong length")

    n_items = d.size
    ok_(n_items == 120, "wrong size")

    dims = d.shape
    ok_(dims == (10, 12), "wrong shape")

    ok_(d == d, "not equal")
示例#4
0
def test_data_y_indexing():
    """Index ``data.y`` by tournament number, tournament name and full slice.

    Invalid keys (0, unknown names, None, a partial slice) are expected to
    raise ``IndexError``.
    """

    d = micro_data()
    msg = 'y arrays not equal'

    # (tournament number, tournament name, expected target values)
    expected = [
        (1, 'bernie', [0, 1, 0, 1, 0, 1, 0, 1, 0, 0]),
        (2, 'elizabeth', [0, 1, 1, 1, 0, 1, 1, 1, 0, 1]),
        (3, 'jordan', [1, 1, 1, 0, 0, 1, 0, 1, 0, 0]),
        (4, 'ken', [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]),
        (5, 'charles', [0, 0, 1, 0, 0, 0, 1, 1, 0, 1]),
    ]
    for number, name, target in expected:
        assert_array_equal(d.y[number], target, msg)
        assert_array_equal(d.y[name], target, msg)

    # a full slice is compared against all targets, one column per tournament
    all_targets = np.vstack([target for _, _, target in expected]).T
    assert_array_equal(d.y[:], all_targets, msg)

    for bad_key in (0, 'era', 'wtf', None, slice(1)):
        assert_raises(IndexError, d.y.__getitem__, bad_key)
示例#5
0
def test_data_roundtrip():
    """Save data to a temporary file, load it back and compare.

    Covers a plain save, a save with ``compress=True`` and a save of the
    'live' selection. The temporary file is deleted in all cases.
    """

    d = micro_data()
    msg = "data corrupted during roundtrip"
    path = None

    try:
        path = testing.create_tempfile('numerox.h5')

        for save_kwargs in ({}, {'compress': True}):
            d.save(path, **save_kwargs)
            ade(d, nx.load_data(path), msg)

        live = d['live']
        live.save(path)
        ade(live, nx.load_data(path), msg)
    finally:
        testing.delete_tempfile(path)
示例#6
0
def test_data_balance():
    """Check that ``data.balance`` gives equal counts of 0 and 1 per era."""

    tournament = 1
    d = micro_data()

    # balance everything; era 'eraX' is excluded from the check
    balanced = d.balance(tournament, train_only=False)
    for era in balanced.unique_era():
        if era == 'eraX':
            continue
        target = balanced[era].y[tournament]
        zeros = (target == 0).sum()
        ones = (target == 1).sum()
        ok_(zeros == ones, "y is not balanced")

    # balance with train_only=True; only eras in the train region are checked
    balanced = d.balance(tournament, train_only=True)
    train_eras = np.unique(balanced.era[balanced.region == 'train'])
    for era in train_eras:
        target = balanced[era].y[tournament]
        zeros = (target == 0).sum()
        ones = (target == 1).sum()
        ok_(zeros == ones, "y is not balanced")

    # balance already balanced data (regression test)
    once = d.balance(tournament)
    once.balance(tournament)
示例#7
0
def test_metrics_per_era():
    """Smoke test: ``metrics_per_era`` runs, and rejects 'outer'."""
    data = micro_data()
    prediction = micro_prediction()

    metrics_per_era(data, prediction)
    for mode in ('yhat', 'inner'):
        metrics_per_era(data, prediction, mode)

    # 'outer' as the third positional argument must raise ValueError
    assert_raises(ValueError, metrics_per_era, data, prediction, 'outer')
示例#8
0
def test_metrics_per_name():
    """Smoke test: ``metrics_per_name`` runs, and rejects column 'wtf'."""
    data = testing.micro_data()
    prediction = testing.micro_prediction()

    # (third positional argument, keyword arguments) for calls that must run
    runs = [
        (1, {}),
        (2, {'join': 'yhat'}),
        (3, {'columns': ['sharpe']}),
    ]
    for number, kwargs in runs:
        metrics_per_name(data, prediction, number, **kwargs)

    assert_raises(ValueError, metrics_per_name,
                  data, prediction, 4, 'data', ['wtf'])
示例#9
0
def test_prediction_roundtrip():
    """Save a prediction to a temporary file, load it back and compare."""
    data = micro_data()
    model = nx.logistic()
    prediction = nx.production(model, data, verbosity=0)

    with tempfile.NamedTemporaryFile() as handle:
        filename = handle.name
        prediction.save(filename)
        reloaded = nx.load_prediction(filename)
        ade(prediction, reloaded, "prediction corrupted during roundtrip")
示例#10
0
def test_data_era_isnotin():
    """``era_isnotin`` and ``era_isin`` together should cover every row.

    The two selections are concatenated, reordered by the original ids and
    compared with the original data.
    """
    d = micro_data()
    eras = ['era3', 'eraX']

    pieces = [d.era_isnotin(eras), d.era_isin(eras)]
    rebuilt = nx.concat_data(pieces).loc[d.ids]

    ade(rebuilt, d, "all rows not selected")
示例#11
0
def test_metrics_per_era():
    """Smoke test: ``metrics_per_era`` runs for several tournaments and joins."""
    data = testing.micro_data()
    prediction = testing.micro_prediction()

    # (third positional argument, keyword arguments) for calls that must run
    runs = [
        (1, {}),
        (2, {'join': 'yhat'}),
        (3, {'join': 'inner'}),
    ]
    for number, kwargs in runs:
        metrics_per_era(data, prediction, number, **kwargs)

    assert_raises(ValueError, metrics_per_era, data, prediction, 4, 'outer')

    # run inside testing.HiddenPrints, as the original call does
    with testing.HiddenPrints():
        metrics_per_era(data, prediction, tournament=5, era_as_str=True)
示例#12
0
def test_data_roundtrip():
    """Save data to a temporary file, load it back and compare.

    Done once with a plain save and once with ``compress=True``.
    """
    d = micro_data()
    msg = "data corrupted during roundtrip"
    with tempfile.NamedTemporaryFile() as handle:
        filename = handle.name
        for save_kwargs in ({}, {'compress': True}):
            d.save(filename, **save_kwargs)
            ade(d, load_data(filename), msg)
示例#13
0
def test_backtest_production():
    """Smoke test: backtest and production run at verbosity levels 0-3."""
    data = testing.micro_data()
    model = fifty()
    with testing.HiddenPrints():
        levels = (0, 1, 2, 3)
        for level in levels:
            nx.backtest(model, data, kfold=2, verbosity=level)
            nx.production(model, data, verbosity=level)
        # one extra production run with name='test' at the last level (3)
        nx.production(model, data, name='test', verbosity=levels[-1])
示例#14
0
def test_data_properties():
    """Check that ids, era, region and y match the underlying dataframe.

    ``d.ids``, ``d.era_float`` and ``d.region_float`` are compared with the
    dataframe index and its ``era`` / ``region`` columns. ``d.y[:]`` is
    compared with the 'kazutsugi' column wherever it is not null.
    """

    d = micro_data()

    ok_((d.ids == d.df.index).all(), "ids is corrupted")
    ok_((d.era_float == d.df.era).all(), "era is corrupted")
    ok_((d.region_float == d.df.region).all(), "region is corrupted")

    # Only compare entries that are present; null entries are masked out.
    # NOTE(review): assumes d.y[:] has the same shape as the one-column
    # 'kazutsugi' frame - confirm against the data class.
    idx = ~pd.isnull(d.y[:])
    y = d.df[['kazutsugi']].values
    # The mask and expected values were computed but never asserted.
    ok_((d.y[:][idx] == y[idx]).all(), "y is corrupted")
示例#15
0
def test_prediction_performance():
    """Smoke test: ``prediction.performance`` runs and returns a DataFrame.

    Also runs it once for each ``sort_by`` key listed below.
    """
    data = testing.micro_data()
    prediction = testing.micro_prediction()

    result = prediction.performance(data)
    ok_(isinstance(result, pd.DataFrame), 'expecting a dataframe')

    for key in ('auc', 'acc', 'ystd', 'sharpe', 'consis'):
        prediction.performance(data, sort_by=key)
示例#16
0
def test_data_properties():
    """Check that prediction ids and y match their sources.

    A prediction is built by merging ``d.y`` twice, as 'model1' and
    'model2'. Its ids and both y columns are compared with the data's
    dataframe.
    """

    data = testing.micro_data()
    prediction = nx.Prediction()
    for model_name in ('model1', 'model2'):
        prediction = prediction.merge_arrays(data.ids, data.y, model_name)

    ok_((prediction.ids == prediction.df.index).all(), "ids is corrupted")
    ok_((prediction.ids == data.df.index).all(), "ids is corrupted")
    for column in (0, 1):
        ok_((prediction.y[:, column] == data.df.y).all(), "y is corrupted")
示例#17
0
def test_data_copies():
    """Check that ``copy()`` and the data properties do not share memory.

    The first assertion checks ``shares_memory`` itself: data must share
    memory with itself.
    """

    d = micro_data()

    ok_(shares_memory(d, d), "looks like shares_memory failed")

    # Use `not`, not `~`: on a Python bool `~` yields -1 or -2, which are
    # both truthy, so the original assertions could never fail.
    # NOTE(review): assumes shares_memory returns a plain bool - confirm.
    ok_(not shares_memory(d, d.copy()), "should be a copy")

    ok_(not shares_memory(d, d.ids), "d.ids should be a copy")
    ok_(not shares_memory(d, d.era), "d.era should be a copy")
    ok_(not shares_memory(d, d.region), "d.region should be a copy")
    ok_(not shares_memory(d, d.x), "d.x should be a copy")
    ok_(not shares_memory(d, d.y), "d.y should be a copy")
示例#18
0
def test_splitter_overlap():
    """Ids of the prediction data must not repeat across a splitter's splits."""
    d = micro_data()
    splitters = [
        tournament_splitter(d),
        validation_splitter(d),
        cheat_splitter(d),
        cv_splitter(d, kfold=2),
        split_splitter(d, fit_fraction=0.5),
    ]
    for splitter in splitters:
        # gather the ids of every prediction split of this splitter
        seen = [
            an_id
            for _, dpredict in splitter
            for an_id in dpredict.ids.tolist()
        ]
        ok_(len(seen) == len(set(seen)), "ids overlap")
示例#19
0
def test_data_era_iter():
    """``data.era_iter`` yields one era per step and covers all unique eras.

    Checked with ``as_str`` set to True and to False.
    """
    d = micro_data()
    for as_str in (True, False):
        yielded = []
        for era, idx in d.era_iter(as_str=as_str):
            yielded.append(era)
            n_unique = np.unique(d[idx].era).size
            ok_(n_unique == 1, "expecting a single era")
        expected = sorted(d.unique_era(as_str=as_str).tolist())
        ok_(sorted(yielded) == expected, "era difference found")
示例#20
0
def test_data_region_iter():
    """``data.region_iter`` yields one region per step and covers all regions.

    Checked with ``as_str`` set to True and to False.
    """
    d = micro_data()
    for as_str in (True, False):
        yielded = []
        for region, idx in d.region_iter(as_str=as_str):
            yielded.append(region)
            n_unique = np.unique(d[idx].region).size
            ok_(n_unique == 1, "expecting a single region")
        expected = sorted(d.unique_region(as_str=as_str).tolist())
        ok_(sorted(yielded) == expected, "region difference found")
示例#21
0
def test_data_properties():
    """Check that ids, era, region, y and x match the underlying dataframe."""

    d = micro_data()
    frame = d.df

    ok_((d.ids == frame.index).all(), "ids is corrupted")
    ok_((d.era == frame.era).all(), "era is corrupted")
    ok_((d.region == frame.region).all(), "region is corrupted")

    # compare y only where the dataframe's y is not NaN
    present = ~np.isnan(d.df.y)
    ok_((d.y[present] == d.df.y[present]).all(), "y is corrupted")

    # column i of x is compared with the i-th name from d._x_names()
    features = d.x
    for column, name in enumerate(d._x_names()):
        same = (features[:, column] == d.df[name]).all()
        ok_(same, "%s is corrupted" % name)
示例#22
0
def test_data_properties():
    """Check that ids, era, region, y and x match the underlying dataframe."""

    d = micro_data()

    ok_((d.ids == d.df.index).all(), "ids is corrupted")
    ok_((d.era_float == d.df.era).all(), "era is corrupted")
    ok_((d.region_float == d.df.region).all(), "region is corrupted")

    # compare y with the five tournament columns wherever y is not NaN
    tournament_columns = ['bernie', 'elizabeth', 'jordan', 'ken', 'charles']
    present = ~np.isnan(d.y[:])
    expected = d.df[tournament_columns].values
    ok_((d.y[:][present] == expected[present]).all(), "y is corrupted")

    # column i of x is compared with the i-th name from column_list
    features = d.x
    for column, name in enumerate(d.column_list(x_only=True)):
        same = (features[:, column] == d.df[name]).all()
        ok_(same, "%s is corrupted" % name)
示例#23
0
def test_empty_data():
    """Check that empty selections of the data report zero length and size."""
    d = micro_data()

    # indexing and repr with these era names must not raise
    # (presumably they are absent from micro_data - TODO confirm)
    d['eraXXX']
    repr(d['eraYYY'])

    # An all-False mask selects no rows. The builtin `bool` is used because
    # the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
    idx = np.zeros(len(d), dtype=bool)
    d0 = d[idx]
    ok_(len(d0) == 0, "empty data should have length 0")
    ok_(d0.size == 0, "empty data should have size 0")
    ok_(d0.shape[0] == 0, "empty data should have d.shape[0] == 0")
    ok_(d0.era.size == 0, "empty data should have d.era.size == 0")
    ok_(d0.region.size == 0, "empty data should have d.region.size == 0")
    ok_(d0.x.size == 0, "empty data should have d.x.size == 0")
    ok_(d0.y[:].size == 0, "empty data should have d.y.size == 0")

    # adding two selections that are expected to be empty stays empty
    d2 = d['era0'] + d[idx]
    ok_(len(d2) == 0, "empty data should have length 0")
示例#24
0
def test_data_copies():
    """Check which data properties are copies and which share memory."""

    d = micro_data()

    # checks shares_memory itself: data must share memory with itself
    ok_(shares_memory(d, d), "looks like shares_memory failed")

    # copies
    ok_(not shares_memory(d, d.copy()), "should be a copy")
    for attr in ('era', 'region', 'ids'):
        shared = shares_memory(d, getattr(d, attr))
        ok_(not shared, "d.%s should be a copy" % attr)

    # views
    for attr in ('era_float', 'region_float'):
        shared = shares_memory(d, getattr(d, attr))
        ok_(shared, "d.%s should be a view" % attr)
示例#25
0
def test_data_roundtrip():
    """Save data to a temporary file, load it back and compare.

    Covers a plain save, a save with ``compress=True`` and a save of the
    'live' selection.
    """
    d = micro_data()
    msg = "data corrupted during roundtrip"
    with tempfile.NamedTemporaryFile() as handle:
        filename = handle.name

        for save_kwargs in ({}, {'compress': True}):
            d.save(filename, **save_kwargs)
            ade(d, nx.load_data(filename), msg)

        live = d['live']
        live.save(filename)
        ade(live, nx.load_data(filename), msg)
示例#26
0
def test_report_performance_df():
    """Smoke test: ``report.performance_df`` runs and returns a DataFrame."""

    everything = micro_data()
    d = everything['train'] + everything['validation']

    # the targets themselves are appended as the predicted values
    prediction = Prediction()
    prediction.append(d.ids, d.y)

    report = Report()
    for model_name in ('model1', 'model2', 'model3'):
        report.append_prediction(prediction, model_name)

    result = report.performance_df(d)

    ok_(isinstance(result, pd.DataFrame), 'expecting a dataframe')
示例#27
0
def test_prediction_add():
    """Add a 'train' and a 'tournament' prediction together.

    Adding a prediction to a sum that already contains it, or to itself, is
    expected to raise ``IndexError``.
    """

    d = micro_data()
    rs = np.random.RandomState(0)

    # random values in [0.4, 0.6); drawn for 'train' first, then 'tournament'
    train = d['train']
    tourn = d['tournament']
    yhat_train = 0.2 * (rs.rand(len(train)) - 0.5) + 0.5
    yhat_tourn = 0.2 * (rs.rand(len(tourn)) - 0.5) + 0.5

    p1 = Prediction()
    p2 = Prediction()
    p1.append(train.ids, yhat_train)
    p2.append(tourn.ids, yhat_tourn)

    total = p1 + p2  # just make sure that it runs

    for left in (total, p1):
        assert_raises(IndexError, left.__add__, p1)
示例#28
0
def test_data_y_indexing():
    """Index ``data.y`` by tournament number and name (kazutsugi only).

    Invalid keys (0, unknown names, None, a partial slice) are expected to
    raise ``IndexError``.
    """

    d = micro_data()

    msg = 'y arrays not equal'
    expected = [0, 1, 0, 1, 0, 1, 0, 0, 0, 0]

    for key in (8, 'kazutsugi'):
        assert_array_equal(d.y[key], expected, msg)

    for bad_key in (0, 'era', 'wtf', None, slice(1)):
        assert_raises(IndexError, d.y.__getitem__, bad_key)
示例#29
0
def test_prediction_add():
    """Add a 'train' and a 'tournament' prediction together.

    Adding a prediction to a sum that already contains it, or to itself, is
    expected to raise ``ValueError``.
    """

    d = testing.micro_data()
    rs = np.random.RandomState(0)

    # random values in [0.4, 0.6); drawn for 'train' first, then 'tournament'
    train = d['train']
    tourn = d['tournament']
    y_train = 0.2 * (rs.rand(len(train)) - 0.5) + 0.5
    y_tourn = 0.2 * (rs.rand(len(tourn)) - 0.5) + 0.5

    p1 = nx.Prediction().merge_arrays(train.ids, y_train, 'model1')
    p2 = nx.Prediction().merge_arrays(tourn.ids, y_tourn, 'model1')

    total = p1 + p2  # just make sure that it runs

    for left in (total, p1):
        assert_raises(ValueError, left.__add__, p1)
示例#30
0
def test_backtest_production():
    """Smoke test: backtest and production run for several tournaments.

    Without a ``tournament`` argument, the returned prediction is expected
    to have five columns and to cover ``nx.tournament_all()``.
    """
    data = testing.micro_data()
    model = nx.fifty()

    def check_all_tournaments(prediction):
        # shared checks for runs made without a tournament argument
        ok_(prediction.shape[1] == 5, 'wrong number of tournaments')
        ok_(prediction.tournaments() == nx.tournament_all(),
            'wrong tournaments')

    with testing.HiddenPrints():
        check_all_tournaments(nx.production(model, data))
        check_all_tournaments(nx.backtest(model, data, kfold=2))

        levels = (0, 1, 2, 3)
        for level in levels:
            nx.backtest(model, data, tournament=3, kfold=2, verbosity=level)
            nx.production(model, data, tournament='ken', verbosity=level)
            nx.production(model, data, tournament=4, verbosity=level)
            nx.production(model, data, tournament=None, verbosity=level)

        # two extra production runs at the last verbosity level (3)
        for tournament in (5, 'charles'):
            nx.production(model, data, tournament=tournament,
                          verbosity=levels[-1])