def cv_warning(model, data, tournament='bernie', kfold=5, nsamples=100):
    "Hold out a sample of eras not rows when doing cross validation."
    data = data['train']
    # Accumulate per-run performance frames in lists and concat once per
    # display; DataFrame.append was deprecated in pandas 1.4 and removed
    # in pandas 2.0, so pd.concat is the supported way to grow results.
    frames_cve = []
    frames_cv = []
    for i in range(nsamples):

        # cv across eras
        cve = nx.CVSplitter(data, kfold=kfold, seed=i)
        prediction = nx.run(model, cve, tournament, verbosity=0)
        frames_cve.append(prediction.performance(data, tournament))

        # cv ignoring eras but y balanced
        cv = nx.IgnoreEraCVSplitter(data, tournament=tournament,
                                    kfold=kfold, seed=i)
        prediction = nx.run(model, cv, tournament, verbosity=0)
        frames_cv.append(prediction.performance(data, tournament))

        # display running mean of the two cv styles after every sample
        rcve = pd.concat(frames_cve, ignore_index=True).mean(axis=0)
        rcv = pd.concat(frames_cv, ignore_index=True).mean(axis=0)
        rcve.name = 'cve'
        rcv.name = 'cv'
        r = pd.concat([rcve, rcv], axis=1)
        print("\n{} runs".format(i + 1))
        print(r)
def backtest(model, data, tournament=None, kfold=5, seed=0, verbosity=2):
    """
    K-fold cross validation of model over the train data.

    Parameters
    ----------
    model : nx.Model, list, tuple
        Prediction model. Can be a list or tuple of prediction models.
        Model names must be unique.
    data : nx.Data
        The data to run the model through.
    tournament : {None, int, str, list, tuple}, optional
        The tournament(s) to run the model through. By default (None) the
        model is run through all five tournaments. If a list or tuple of
        tournaments is given then it must not contain duplicate
        tournaments.
    kfold : int, optional
        Number of cross-validation folds; 5 by default.
    seed : int, optional
        Seed used to shuffle the cross-validation split; 0 by default.
    verbosity : int, optional
        An integer that determines verbosity. Zero is silent.

    Returns
    -------
    p : nx.Prediction
        A prediction object containing the predictions of the specified
        model/tournament pairs.

    """
    # Cross-validate on the train split only.
    cv = nx.CVSplitter(data, kfold=kfold, seed=seed, train_only=True)
    return run(model, cv, tournament, verbosity)
def test_cvsplitter_kfold():
    """make sure cvsplitter runs k folds"""
    data = nx.play_data()
    for kfold in (2, 3):
        # count the (fit, predict) pairs the splitter yields
        nfolds = sum(1 for _ in nx.CVSplitter(data, kfold=kfold))
        ok_(nfolds == kfold,
            "CVSplitter iterated through wrong number of folds")
def test_run():
    "Make sure run runs"
    data = testing.play_data()
    # every model through every splitter; run should not raise
    model_list = [nx.logistic(), fifty()]
    splitter_list = [nx.TournamentSplitter(data),
                     nx.ValidationSplitter(data),
                     nx.CheatSplitter(data),
                     nx.CVSplitter(data, kfold=2),
                     nx.SplitSplitter(data, fit_fraction=0.5)]
    for m in model_list:
        for s in splitter_list:
            nx.run(m, s, verbosity=0)
def runner_example():
    data = nx.play_data()
    splitter = nx.CVSplitter(data)
    # let's run 3 models
    run_list = [
        {'model': nx.logistic(), 'prediction_file': None, 'csv_file': None},
        {'model': nx.logistic(1e-4)},
        {'model': nx.extratrees()},
    ]
    # we won't save anything, just display the results
    nx.Runner(run_list, splitter, verbosity=1).run()
def test_splitter_overlap():
    "prediction data should not overlap"
    data = nx.play_data()
    for splitter in (nx.TournamentSplitter(data),
                     nx.ValidationSplitter(data),
                     nx.CheatSplitter(data),
                     nx.CVSplitter(data),
                     nx.IgnoreEraCVSplitter(data),
                     nx.SplitSplitter(data, fit_fraction=0.5)):
        # collect every prediction id across all folds
        ids = []
        for dfit, dpredict in splitter:
            ids += dpredict.ids.tolist()
        # duplicates would shrink the set
        ok_(len(ids) == len(set(ids)), "ids overlap")
def test_run():
    "Make sure run runs"
    data = testing.play_data()
    model_list = [nx.linear(), nx.fifty()]
    splitter_list = [nx.TournamentSplitter(data),
                     nx.ValidationSplitter(data),
                     nx.CheatSplitter(data),
                     nx.CVSplitter(data, kfold=2),
                     nx.SplitSplitter(data, fit_fraction=0.5)]
    for m in model_list:
        for s in splitter_list:
            prediction = nx.run(m, s, tournament=None, verbosity=0)
            ok_(prediction.shape[1] == 1, 'wrong number of tournaments')
            ok_(prediction.tournaments() == nx.tournament_all(),
                'wrong tournaments')
    # invalid model and invalid tournament argument should raise
    assert_raises(ValueError, nx.run, None, nx.TournamentSplitter(data))
    assert_raises(ValueError, nx.run, nx.fifty(),
                  nx.TournamentSplitter(data), {})
def test_splitter_reset():
    "splitter reset should not change results"
    data = nx.play_data()
    splitter_list = [nx.TournamentSplitter(data),
                     nx.ValidationSplitter(data),
                     nx.CheatSplitter(data),
                     nx.CVSplitter(data),
                     nx.IgnoreEraCVSplitter(data),
                     nx.SplitSplitter(data, fit_fraction=0.5)]
    for splitter in splitter_list:
        # record two full passes, resetting after each
        fits = ([], [])
        predicts = ([], [])
        for trial in range(2):
            for dfit, dpredict in splitter:
                fits[trial].append(dfit)
                predicts[trial].append(dpredict)
            splitter.reset()
        # both passes must yield identical splits
        ok_(fits[0] == fits[1], "splitter reset changed fit split")
        ok_(predicts[0] == predicts[1], "splitter reset changed predict split")
def cv_warning(data, nsamples=100):
    "Compare per-era cross validation with era-ignoring cross validation."
    model = nx.logistic()
    # Accumulate per-run performance frames in a list and concat once per
    # display; DataFrame.append was deprecated in pandas 1.4 and removed
    # in pandas 2.0, so pd.concat is the supported way to grow results.
    frames = []
    for i in range(nsamples):

        report = nx.Report()

        # cv across eras
        cve = nx.CVSplitter(data, seed=i)
        prediction = nx.run(model, cve, verbosity=0)
        report.append_prediction(prediction, 'cve')

        # cv ignoring eras but y balanced
        cv = nx.IgnoreEraCVSplitter(data, seed=i)
        prediction = nx.run(model, cv, verbosity=0)
        report.append_prediction(prediction, 'cv')

        # save performance results; last column holds the cv style label
        df = report.performance_df(data)
        cols = df.columns.tolist()
        cols[-1] = 'cv_type'
        df.columns = cols
        frames.append(df)
        results = pd.concat(frames, ignore_index=True)

        # display running mean of the two cv styles after every sample
        rcve = results[results.cv_type == 'cve'].mean(axis=0)
        rcv = results[results.cv_type == 'cv'].mean(axis=0)
        rcve.name = 'cve'
        rcv.name = 'cv'
        r = pd.concat([rcve, rcv], axis=1)
        print("\n{} runs".format(i + 1))
        print(r)