Exemplo n.º 1
0
def test_prediction_setitem():
    "compare prediction.__setitem__ with merge"

    data = nx.play_data()

    # Three production runs followed by one backtest, created in this order.
    parts = [
        nx.production(nx.logistic(), data, 'bernie', verbosity=0),
        nx.production(nx.logistic(1e-5), data, 2, verbosity=0),
        nx.production(nx.logistic(1e-6), data, 3, verbosity=0),
        nx.backtest(nx.logistic(), data, 4, verbosity=0),
    ]

    # Container filled by item assignment under the keys ('logistic', 1..4).
    via_item = nx.Prediction()
    for number, part in enumerate(parts, start=1):
        via_item[('logistic', number)] = part

    # Container filled by merging the same parts one after another.
    via_merge = nx.Prediction()
    for part in parts:
        via_merge = via_merge.merge(part)

    # Both construction routes must give the same underlying dataframe.
    pd.testing.assert_frame_equal(via_item.df, via_merge.df)

    # Both of these assignments are expected to raise ValueError.
    key = ('logistic', 1)
    assert_raises(ValueError, via_item.__setitem__, key, parts[0])
    assert_raises(ValueError, via_item.__setitem__, key, via_item)
Exemplo n.º 2
0
def test_prediction_setitem():
    "compare prediction.__setitem__ with merge"

    data = nx.play_data()

    # Keys used for item assignment; note that 'model1' appears twice.
    names = ['model1', 'model2', 'model3', 'model1']

    # Three production runs followed by one backtest, created in this order.
    parts = [
        nx.production(nx.logistic(), data, 'model1', verbosity=0),
        nx.production(nx.logistic(1e-5), data, 'model2', verbosity=0),
        nx.production(nx.logistic(1e-6), data, 'model3', verbosity=0),
        nx.backtest(nx.logistic(), data, 'model1', verbosity=0),
    ]

    # Container filled by item assignment.
    via_item = nx.Prediction()
    for name, part in zip(names, parts):
        via_item[name] = part

    # Container filled by merging the same parts one after another.
    via_merge = nx.Prediction()
    for part in parts:
        via_merge = via_merge.merge(part)

    # Both construction routes must give the same underlying dataframe.
    pd.testing.assert_frame_equal(via_item.df, via_merge.df)

    # Both of these assignments are expected to raise ValueError.
    assert_raises(ValueError, via_item.__setitem__, 'model1', parts[0])
    assert_raises(ValueError, via_item.__setitem__, 'model1', via_item)
Exemplo n.º 3
0
def compare_change(data):
    """
    Run five models on `data`, rerun them on a modified copy, and compare.

    The modified copy is created with `data.xnew` from the original feature
    array stacked side by side with its element-wise square. The comparison
    is printed; nothing is returned.
    """

    def run_all(dataset):
        # Run the five models in a fixed order and combine their predictions.
        combined = nx.production(nx.logistic(), dataset, verbosity=1)
        others = (nx.extratrees, nx.randomforest, nx.mlpc, nx.logisticPCA)
        for make_model in others:
            combined += nx.production(make_model(), dataset, verbosity=1)
        return combined

    print('\nStandard dataset:\n')
    prediction = run_all(data)

    # The change could be anything; as an example, add the square of each
    # feature to the dataset.
    features = data.x
    data2 = data.xnew(np.hstack((features, features * features)))

    print('\nDataset expanded with squared features:\n')
    prediction2 = run_all(data2)

    # Compare performance on the validation subset of the original data.
    print('\nCompare (1 is regular dataset; 2 expanded dataset):\n')
    print(prediction.compare(data['validation'], prediction2))
Exemplo n.º 4
0
def test_prediction_regression():
    "regression test of prediction performance evaluation"
    d = nx.play_data()
    p = nx.production(nx.logistic(), d, tournament=None, verbosity=0)

    # Neither the validation subset nor the per-tournament mean performance
    # of `p` depends on the loop variable, so compute both once up front.
    validation = d['validation']
    df = p.performance_mean(validation, mean_of='tournament')

    # Only the tournament name is needed; the number is ignored.
    for _, name in nx.tournament_iter():
        # Run the same model for this tournament alone ...
        p2 = nx.production(nx.logistic(), d, tournament=name, verbosity=0)
        # ... and check that its mean logloss matches the row for this
        # tournament in the all-tournament run.
        logloss1 = df.loc[name]['logloss']
        logloss2 = p2.summary(validation).loc['mean']['logloss']
        diff = np.abs(logloss1 - logloss2)
        msg = 'failed on {}'.format(name)
        ok_(diff < 1e-6, msg)
Exemplo n.º 5
0
def test_model_rename():
    "Test renaming a model"
    model = nx.logistic()
    ok_(model.name == 'logistic', 'wrong name')

    # Here the return value of rename is discarded; the name of the same
    # object is expected to change.
    model.rename('LR')
    ok_(model.name == 'LR', 'wrong name')

    # Here the return value of rename is used.
    renamed = model.rename('logreg')
    ok_(renamed.name == 'logreg', 'wrong name')
    ok_(repr(renamed).startswith('logreg'), 'wrong name')

    # Renaming a fresh model to None is expected to leave the default name;
    # a non-string name is expected to be rejected.
    fresh = nx.logistic()
    ok_(fresh.rename(None).name == 'logistic', 'wrong name')
    assert_raises(ValueError, fresh.rename, 1)
Exemplo n.º 6
0
def runner_example():
    """Run three models through `nx.Runner` using a CV splitter on play data."""

    data = nx.play_data()
    splitter = nx.CVSplitter(data)

    # One dict per model; the first spells out 'prediction_file' and
    # 'csv_file' as None, the other two give only the model.
    run_list = [
        {'model': nx.logistic(), 'prediction_file': None, 'csv_file': None},
        {'model': nx.logistic(1e-4)},
        {'model': nx.extratrees()},
    ]

    # No output files are named above; verbosity=1 is passed to the runner.
    nx.Runner(run_list, splitter, verbosity=1).run()
Exemplo n.º 7
0
def test_multiple_runs():
    "test running multiple models through multiple tournaments"

    def check(pred, ncols, names=None):
        # Assert the column count and, when given, the tournament list.
        ok_(pred.shape[1] == ncols, 'wrong number of tournaments')
        if names is not None:
            ok_(pred.tournaments() == names, 'wrong tournaments')

    d = testing.play_data()
    models = [nx.logistic(), nx.fifty()]

    with testing.HiddenPrints():

        # A single tournament given by name or number: 2 columns expected.
        check(nx.production(models, d, 'bernie'), 2)
        check(nx.backtest(models, d, 2), 2)
        check(nx.run(models, nx.ValidationSplitter(d), 'ken'), 2)

        # No tournament argument: 10 columns expected.
        check(nx.production(models, d), 10)
        check(nx.backtest(models, d), 10)
        check(nx.run(models, nx.ValidationSplitter(d)), 10)

        # A list of tournaments: the reported tournaments are checked too.
        check(nx.production(models, d, [1, 5]), 4, ['bernie', 'charles'])
        check(nx.backtest(models, d, ['charles', 'bernie']), 4,
              ['bernie', 'charles'])
        check(nx.run(models, nx.ValidationSplitter(d), ['ken']), 2, ['ken'])
Exemplo n.º 8
0
def cv_warning(data, nsamples=100):
    """
    Hold out a sample of eras not rows when doing cross validation.

    For each seed in ``range(nsamples)`` a logistic model is run twice on
    ``data['train']``: once with `nx.CVSplitter` and once with
    `nx.IgnoreEraCVSplitter`. After every seed the column means of all
    performance results collected so far are printed side by side under
    the column names 'cve' and 'cv'.

    Parameters
    ----------
    data : object indexable with 'train' and accepted by the nx splitters
    nsamples : int, number of seeds to run (default 100)

    Returns
    -------
    None; results are printed.
    """

    data = data['train']
    model = nx.logistic()

    # DataFrame.append was removed in pandas 2.0, so the per-run frames are
    # kept in lists and combined with pd.concat when the means are needed.
    # Assumes prediction.performance returns a DataFrame -- TODO confirm.
    frames_cve = []
    frames_cv = []

    for i in range(nsamples):

        # cv across eras
        cve = nx.CVSplitter(data, seed=i)
        prediction = nx.run(model, cve, verbosity=0)
        frames_cve.append(prediction.performance(data))

        # cv ignoring eras but y balanced
        cv = nx.IgnoreEraCVSplitter(data, seed=i)
        prediction = nx.run(model, cv, verbosity=0)
        frames_cv.append(prediction.performance(data))

        # display the running means over all seeds processed so far
        rcve = pd.concat(frames_cve, ignore_index=True).mean(axis=0)
        rcv = pd.concat(frames_cv, ignore_index=True).mean(axis=0)
        rcve.name = 'cve'
        rcv.name = 'cv'
        r = pd.concat([rcve, rcv], axis=1)
        print("\n{} runs".format(i + 1))
        print(r)
Exemplo n.º 9
0
def test_prediction_check():
    "make sure prediction.check runs"
    data = nx.play_data()

    # Combine the predictions of two models into one object.
    combined = nx.production(nx.logistic(), data, verbosity=0)
    combined += nx.production(nx.logisticPCA(), data, verbosity=0)

    # This call form takes a list of names first and is expected to return
    # a dataframe.
    result = combined.check(['logistic'], data)
    ok_(isinstance(result, pd.DataFrame), 'expecting a dataframe')
Exemplo n.º 10
0
def test_prediction_roundtrip():
    "save/load roundtrip shouldn't change prediction"
    data = micro_data()
    original = nx.production(nx.logistic(), data, verbosity=0)

    # Save to the path of a temporary file, read it back, and compare while
    # the temporary file is still open.
    with tempfile.NamedTemporaryFile() as handle:
        path = handle.name
        original.save(path)
        reloaded = nx.load_prediction(path)
        ade(original, reloaded, "prediction corrupted during roundtrip")
Exemplo n.º 11
0
def concordance(data, tournament='bernie'):
    """
    Print the concordance of three models for one tournament.

    The models (logistic, extratrees, mlpc) are run together through
    `nx.production`. The printed message states that a concordance below
    0.12 is passing. The original author notes that `data` must be the
    full dataset for the calculation to be accurate.
    """
    prediction = nx.production(
        [nx.logistic(), nx.extratrees(), nx.mlpc()], data, tournament)
    print("\nA concordance less than 0.12 is passing")
    print(prediction.concordance(data))
Exemplo n.º 12
0
def test_prediction_check():
    "make sure prediction.check runs"
    data = nx.play_data()
    base = nx.production(nx.logistic(), data, 'ken', verbosity=0)

    # A renamed copy of the same prediction is combined with the original.
    renamed = base.copy().rename('example_predictions')
    combined = base + renamed

    # Output printed during the check is suppressed; this call form is
    # expected to return a dictionary.
    with nx.testing.HiddenPrints():
        result = combined.check(data)
    ok_(isinstance(result, dict), 'expecting a dictionary')
Exemplo n.º 13
0
def get_models():
    """Return a new list of seven model instances, created in the order listed."""
    return [
        nx.logistic(),
        nx.extratrees(),
        nx.randomforest(),
        nx.mlpc(),
        nx.logisticPCA(),
        nx.example_predictions(),
        fifty(),
    ]
Exemplo n.º 14
0
def concordance_example(data):
    """
    Print the concordance of three models run one after another.

    Predictions from the logistic, extratrees and mlpc models are combined
    into one object. The printed message states that a concordance below
    0.12 is passing. The original author notes that `data` must be the
    full dataset for the calculation to be accurate.
    """
    prediction = nx.production(nx.logistic(), data)
    for make_model in (nx.extratrees, nx.mlpc):
        prediction += nx.production(make_model(), data)

    print("\nA concordance less than 0.12 is passing")
    print(prediction.concordance(data))
Exemplo n.º 15
0
def test_run():
    "Make sure run runs"
    data = testing.play_data()
    models = [nx.logistic(), fifty()]
    splitters = [
        nx.TournamentSplitter(data),
        nx.ValidationSplitter(data),
        nx.CheatSplitter(data),
        nx.CVSplitter(data, kfold=2),
        nx.SplitSplitter(data, fit_fraction=0.5),
    ]

    # Every model is paired with every splitter, models varying slowest;
    # the same splitter objects are reused for each model.
    pairs = [(model, splitter) for model in models for splitter in splitters]
    for model, splitter in pairs:
        nx.run(model, splitter, verbosity=0)
Exemplo n.º 16
0
def main():
    """Download the dataset, run a logistic model, and write its predictions to csv."""

    archive = 'numerai_dataset.zip'

    # Fetch the dataset archive, then load it from the same file name.
    nx.download_dataset(archive, verbose=True)
    data = nx.load_zip(archive, verbose=True)

    # Logistic regression is only a placeholder; you will want to write
    # your own model.
    prediction = nx.production(nx.logistic(), data)

    # Save the predictions to a csv file for later upload to numerai.
    prediction.to_csv('logistic.csv', verbose=True)
Exemplo n.º 17
0
def numerox_example():
    """
    Prepare a submission file for the Numerai tournament with Numerox.

    Numerox can be installed with: pip install numerox
    For more information see: https://github.com/kwgoodman/numerox
    """

    # nx.download is given the archive file name and its return value is
    # used as the dataset.
    data = nx.download('numerai_dataset.zip')

    # Logistic regression is only a placeholder; you will want to write
    # your own model.
    prediction = nx.production(nx.logistic(), data, tournament='bernie')

    # Save the predictions to a csv file.
    prediction.to_csv('logistic.csv', verbose=True)
Exemplo n.º 18
0
def test_run():
    "Make sure run runs"
    d = testing.play_data()
    models = [nx.logistic(), nx.fifty()]
    splitters = [
        nx.TournamentSplitter(d),
        nx.ValidationSplitter(d),
        nx.CheatSplitter(d),
        nx.CVSplitter(d, kfold=2),
        nx.SplitSplitter(d, fit_fraction=0.5),
    ]

    # Every model is paired with every splitter, models varying slowest.
    pairs = [(model, splitter) for model in models for splitter in splitters]
    for model, splitter in pairs:
        # A single tournament, given by number and then by name.
        for tournament in (2, 'bernie'):
            nx.run(model, splitter, tournament=tournament, verbosity=0)

        # tournament=None and an omitted tournament must both give five
        # columns covering all tournaments.
        for extra in ({'tournament': None}, {}):
            p = nx.run(model, splitter, verbosity=0, **extra)
            ok_(p.shape[1] == 5, 'wrong number of tournaments')
            ok_(p.tournaments() == nx.tournament_all(), 'wrong tournaments')

    # A None model and a dict as the tournament argument must be rejected.
    assert_raises(ValueError, nx.run, None, nx.TournamentSplitter(d))
    assert_raises(ValueError, nx.run, nx.fifty(), nx.TournamentSplitter(d), {})
Exemplo n.º 19
0
def compare_models(data):
    """
    Run five models on `data` and print several comparisons between them.

    The sections, in order, are: correlation with the logistic model,
    performance, dominance across all models, dominance between two
    models, originality versus logistic, and concordance. Nothing is
    returned.
    """

    # Combine the predictions of the five models into one object.
    prediction = nx.production(nx.logistic(), data, verbosity=1)
    others = (nx.extratrees, nx.randomforest, nx.mlpc, nx.logisticPCA)
    for make_model in others:
        prediction += nx.production(make_model(), data, verbosity=1)

    # Correlation of the models with logistic regression. The return value
    # is not printed here; presumably the method prints its own report --
    # TODO confirm.
    print('\nCorrelation:\n')
    prediction.correlation('logistic')

    # Performance of the models on the validation data.
    print('\nPerformance comparison:\n')
    print(prediction.performance(data['validation'], sort_by='logloss'))

    # Dominance of the models on the validation data.
    print('\nModel dominance:\n')
    print(prediction.dominance(data['validation'], sort_by='logloss'))

    # Dominance restricted to two of the models.
    print('\nModel dominance between two models:\n')
    pair = prediction[['logistic', 'logisticPCA']]
    print(pair.dominance(data['validation']))

    # Originality given that the logistic model has already been submitted.
    print('\nModel originality (versus logistic):\n')
    print(prediction.originality(['logistic']))

    # Concordance, computed against the full `data`.
    print('\nConcordance:\n')
    print(prediction.concordance(data))
Exemplo n.º 20
0
def cv_warning(data, nsamples=100):
    """
    Compare cross validation across eras with cross validation ignoring eras.

    For each seed in ``range(nsamples)`` a logistic model is run with
    `nx.CVSplitter` and with `nx.IgnoreEraCVSplitter`; both predictions are
    added to a fresh `nx.Report` under the labels 'cve' and 'cv'. After
    every seed the column means of all results collected so far are
    printed side by side for the two labels.

    Parameters
    ----------
    data : object accepted by the nx splitters and `Report.performance_df`
    nsamples : int, number of seeds to run (default 100)

    Returns
    -------
    None; results are printed.
    """

    model = nx.logistic()

    # DataFrame.append was removed in pandas 2.0, so the per-seed frames are
    # kept in a list and combined with pd.concat.
    frames = []

    for i in range(nsamples):

        report = nx.Report()

        # cv across eras
        cve = nx.CVSplitter(data, seed=i)
        prediction = nx.run(model, cve, verbosity=0)
        report.append_prediction(prediction, 'cve')

        # cv ignoring eras but y balanced
        cv = nx.IgnoreEraCVSplitter(data, seed=i)
        prediction = nx.run(model, cv, verbosity=0)
        report.append_prediction(prediction, 'cv')

        # save performance results; the last column is relabelled so that
        # it can be used below to separate 'cve' rows from 'cv' rows
        df = report.performance_df(data)
        cols = df.columns.tolist()
        cols[-1] = 'cv_type'
        df.columns = cols
        frames.append(df)
        results = pd.concat(frames, ignore_index=True)

        # display results; numeric_only=True keeps the string column
        # 'cv_type' out of the mean, which raises TypeError in pandas >= 2.0
        is_cve = results.cv_type == 'cve'
        is_cv = results.cv_type == 'cv'
        rcve = results[is_cve].mean(axis=0, numeric_only=True)
        rcv = results[is_cv].mean(axis=0, numeric_only=True)
        rcve.name = 'cve'
        rcv.name = 'cv'
        r = pd.concat([rcve, rcv], axis=1)
        print("\n{} runs".format(i+1))
        print(r)
Exemplo n.º 21
0
def concordance_example():
    """Print the concordance of a logistic model run on the play data."""
    data = nx.play_data()
    prediction = nx.production(nx.logistic(), data)

    # This call form passes the data first and the prediction second; the
    # result is formatted as a number with four decimals.
    score = nx.concordance(data, prediction)
    print("concordance {:.4f} (less than 0.12 is passing)".format(score))
Exemplo n.º 22
0
def backtest_example(data):
    """Run `nx.backtest` on `data` with a logistic regression model."""
    # The resulting prediction is not used further; nothing is returned.
    nx.backtest(nx.logistic(), data)
Exemplo n.º 23
0
def backtest_example():
    """Run `nx.backtest` on the play data with a logistic regression model."""
    data = nx.play_data()
    # The resulting prediction is not used further; nothing is returned.
    nx.backtest(nx.logistic(), data)
Exemplo n.º 24
0
def get_models():
    """Return a list of model instances, adding xgboost when HAS_XGBOOST is true."""
    models = [nx.logistic(), nx.extratrees(), nx.randomforest()]
    if not HAS_XGBOOST:
        return models
    # The xgboost model is created only when the flag is set, after the others.
    return models + [nx.xgboost()]
Exemplo n.º 25
0
def get_model():
    """Return a new logistic model instance."""
    return nx.logistic()
Exemplo n.º 26
0
def backtest(data, tournament='bernie'):
    """Run `nx.backtest` on `data` for `tournament` with a logistic regression model."""
    # The resulting prediction is not used further; nothing is returned.
    nx.backtest(nx.logistic(), data, tournament)
Exemplo n.º 27
0
def test_prediction_concordance():
    "make sure prediction.concordance runs"
    data = testing.play_data()
    prediction = nx.production(nx.logistic(), data, 'model1', verbosity=0)

    # Only the type of the result is checked, not its contents.
    result = prediction.concordance(data)
    ok_(isinstance(result, pd.DataFrame), 'expecting a dataframe')