def test_prediction_setitem():
    """compare prediction.__setitem__ with merge"""
    data = nx.play_data()

    # construction order is kept: three production runs, then one backtest
    preds = [
        nx.production(nx.linear(), data, 'kazutsugi', verbosity=0),
        nx.production(nx.linear(), data, 8, verbosity=0),
        nx.production(nx.linear(), data, 8, verbosity=0),
        nx.backtest(nx.linear(), data, 8, verbosity=0),
    ]

    # route 1: item assignment under the keys ('linear', 1) .. ('linear', 4)
    by_setitem = nx.Prediction()
    for number, pred in enumerate(preds, start=1):
        by_setitem[('linear', number)] = pred

    # route 2: successive merges into an initially empty prediction
    by_merge = nx.Prediction()
    for pred in preds:
        by_merge = by_merge.merge(pred)

    pd.testing.assert_frame_equal(by_setitem.df, by_merge.df)

    # both of these assignments are required to raise ValueError
    taken = ('linear', 1)
    assert_raises(ValueError, by_setitem.__setitem__, taken, preds[0])
    assert_raises(ValueError, by_setitem.__setitem__, taken, by_setitem)
def test_prediction_setitem():
    "compare prediction.__setitem__ with merge"
    data = nx.play_data()

    # construction order is kept: three production runs, then one backtest
    preds = [
        nx.production(nx.logistic(), data, 'bernie', verbosity=0),
        nx.production(nx.logistic(1e-5), data, 2, verbosity=0),
        nx.production(nx.logistic(1e-6), data, 3, verbosity=0),
        nx.backtest(nx.logistic(), data, 4, verbosity=0),
    ]

    # route 1: item assignment under ('logistic', 1) .. ('logistic', 4)
    by_setitem = nx.Prediction()
    for number, pred in enumerate(preds, start=1):
        by_setitem[('logistic', number)] = pred

    # route 2: successive merges into an initially empty prediction
    by_merge = nx.Prediction()
    for pred in preds:
        by_merge = by_merge.merge(pred)

    pd.testing.assert_frame_equal(by_setitem.df, by_merge.df)

    # both of these assignments are required to raise ValueError
    taken = ('logistic', 1)
    assert_raises(ValueError, by_setitem.__setitem__, taken, preds[0])
    assert_raises(ValueError, by_setitem.__setitem__, taken, by_setitem)
def test_prediction_setitem():
    "compare prediction.__setitem__ with merge"
    data = nx.play_data()

    # construction order is kept: three production runs, then one backtest
    preds = [
        nx.production(nx.logistic(), data, 'model1', verbosity=0),
        nx.production(nx.logistic(1e-5), data, 'model2', verbosity=0),
        nx.production(nx.logistic(1e-6), data, 'model3', verbosity=0),
        nx.backtest(nx.logistic(), data, 'model1', verbosity=0),
    ]
    # the backtest result is assigned under 'model1' again, as before
    keys = ('model1', 'model2', 'model3', 'model1')

    # route 1: item assignment by name
    by_setitem = nx.Prediction()
    for key, pred in zip(keys, preds):
        by_setitem[key] = pred

    # route 2: successive merges into an initially empty prediction
    by_merge = nx.Prediction()
    for pred in preds:
        by_merge = by_merge.merge(pred)

    pd.testing.assert_frame_equal(by_setitem.df, by_merge.df)

    # both of these assignments are required to raise ValueError
    assert_raises(ValueError, by_setitem.__setitem__, 'model1', preds[0])
    assert_raises(ValueError, by_setitem.__setitem__, 'model1', by_setitem)
def run_one(model, splitter, tournament, verbosity=2):
    """
    Run a single model through a data splitter for a single tournament.

    Parameters
    ----------
    model : object
        Must expose a `name` attribute and a
        `fit_predict(data_fit, data_predict, tournament)` method that
        returns an `(ids, yhat)` pair.
    splitter : iterable
        Yields `(data_fit, data_predict)` pairs.
    tournament : int or str
        Passed through unchanged to `fit_predict`, `merge_arrays` and
        `summary`.
    verbosity : int, optional
        0 prints nothing; above 0 the model is pretty-printed; 1 prints one
        summary after all splits; above 1 prints a summary after every
        split plus the elapsed time; above 2 also prints the splitter.

    Returns
    -------
    prediction : nx.Prediction
        The predictions of every split merged together.
    """
    t0 = time.time()
    name = model.name
    if verbosity > 2:
        print(splitter)
    if verbosity > 0:
        pprint.pprint(model)

    data = None
    prediction = nx.Prediction()
    for data_fit, data_predict in splitter:
        if verbosity > 0:
            # keep the predict data seen so far, with y intact, because
            # the summaries below are computed against it
            if data is None:
                data = data_predict.copy()
            else:
                data = data + data_predict

        # the following line of code hides from your model the y
        # that you are trying to predict to prevent accidental cheating
        data_predict = data_predict.y_to_nan()

        ids, yhat = model.fit_predict(data_fit, data_predict, tournament)
        prediction = prediction.merge_arrays(ids, yhat, name, tournament)

        if verbosity > 1:
            print(prediction.summary(data.region_isnotin(['test', 'live']),
                                     tournament))

    # `data` is still None when the splitter produced no pairs; skip the
    # summary in that case rather than fail on `None.region_isnotin`
    if verbosity == 1 and data is not None:
        print(prediction.summary(data.region_isnotin(['test', 'live']),
                                 tournament))

    if verbosity > 1:
        minutes = (time.time() - t0) / 60
        print('Done in {:.2f} minutes'.format(minutes))

    return prediction
def predict(self, dpre: nx.data.Data, tournament: str) -> nx.Prediction:
    """
    Alternative to fit_predict(): predict with `self.model` only.

    dpre: must be data['tournament']
    tournament: can be int or str.

    Returns a new nx.Prediction holding the model output under `self.name`
    for the given tournament. Any exception raised while predicting or
    while building the prediction is logged and then re-raised.
    """
    empty = nx.Prediction()
    # y_to_nan() is applied before the features are read; presumably this
    # keeps the targets away from the model -- confirm against nx.data.Data
    masked = dpre.y_to_nan()

    # stage 1: run the wrapped model on the feature matrix
    try:
        LOGGER.info('Inference started...')
        yhat = self.model.predict(masked.x)
        LOGGER.info(
            'Inference complete...now preparing predictions for submission'
        )
    except Exception as e:
        LOGGER.error(f'Failure to make predictions with {e}')
        raise e

    # stage 2: attach the ids to the raw output
    try:
        return empty.merge_arrays(masked.ids, yhat, self.name, tournament)
    except Exception as e:
        LOGGER.error(f'Failure to prepare predictions with {e}')
        raise e
def test_merge_predictions():
    "test merge_predictions"
    msg = 'corruption of merge predictions'
    full = testing.micro_prediction()

    # merging a prediction with itself must be rejected
    assert_raises(ValueError, nx.merge_predictions, [full, full])

    # merging with an empty prediction must give back the original
    merged = nx.merge_predictions([full, nx.Prediction()])
    ade(merged, full, msg)

    # row-wise splits (in order, uneven, shuffled) must reassemble to `full`
    row_splits = (
        ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
        ([0, 1, 2, 3], [4, 5, 6], [7, 8, 9]),
        ([9, 4, 3, 2], [1, 8, 7], [6, 5, 0]),
    )
    for split in row_splits:
        pieces = [testing.micro_prediction(rows) for rows in split]
        merged = nx.merge_predictions(pieces)
        ade(merged, full, msg)

    # pieces split by both rows and model names must reassemble as well
    top = testing.micro_prediction([0, 1, 2, 3, 4])
    top_01 = top[['model0', 'model1']]
    top_2 = top['model2']
    bottom = testing.micro_prediction([5, 6, 7, 8, 9])
    bottom_0 = bottom['model0']
    bottom_12 = bottom[['model1', 'model2']]
    merged = nx.merge_predictions([top_01, bottom_0, bottom_12, top_2])
    ade(merged, full, msg)
def load_example_predictions(data_zip):
    """
    Load example predictions from Numerai zip archive.

    Parameters
    ----------
    data_zip : str or file-like
        Archive to read; passed directly to `zipfile.ZipFile`.

    Returns
    -------
    p : nx.Prediction
        Prediction built from the EXAMPLE_PREDICTIONS member of the archive.
        The member is parsed as csv with the first row as header and the
        first column as index, and its columns are relabelled to the single
        name 'example_predictions'.
    """
    # context managers make sure the archive and the member handle are
    # closed even when parsing fails; read_csv has consumed the stream by
    # the time the blocks exit
    with zipfile.ZipFile(data_zip) as zf:
        with zf.open(EXAMPLE_PREDICTIONS) as fid:
            df = pd.read_csv(fid, header=0, index_col=0)
    df.columns = ['example_predictions']
    return nx.Prediction(df)
def micro_prediction(index=None):
    "Returns a tiny prediction object built on top of micro_data(index)"
    data = micro_data(index)

    # fixed seed so every call yields the same yhat; values fall in [0.4, 0.6)
    rng = np.random.RandomState(0)
    centered = rng.rand(len(data)) - 0.5
    yhat = 0.2 * centered + 0.5

    prediction = nx.Prediction()
    # the return value of append is ignored, as in the original; presumably
    # append updates `prediction` in place -- confirm against nx.Prediction
    prediction.append(data.ids, yhat)
    return prediction
def load_example_predictions(data_zip, tournament):
    """
    Load example predictions from Numerai zip archive.

    Parameters
    ----------
    data_zip : str or file-like
        Archive to read; passed directly to `zipfile.ZipFile`.
    tournament : int or str
        Tournament identifier; converted with `nx.tournament_str` and used
        both in the archive member name and in the output column name.

    Returns
    -------
    p : nx.Prediction
        Prediction built from the archive member named by
        `EXAMPLE_PREDICTIONS.format(<tournament name>)`. The member is
        parsed as csv with the first row as header and the first column as
        index, and its columns are relabelled to the single name
        'example_predictions_<tournament name>'.
    """
    tourn_name = nx.tournament_str(tournament)
    filename = EXAMPLE_PREDICTIONS.format(tourn_name)
    # context managers make sure the archive and the member handle are
    # closed even when parsing fails; read_csv has consumed the stream by
    # the time the blocks exit
    with zipfile.ZipFile(data_zip) as zf:
        with zf.open(filename) as fid:
            df = pd.read_csv(fid, header=0, index_col=0)
    df.columns = ['example_predictions_{}'.format(tourn_name)]
    return nx.Prediction(df)
def test_prediction_add():
    "add two predictions together"
    data = testing.micro_data()
    first = nx.Prediction()
    second = nx.Prediction()
    train = data['train']
    tourn = data['tournament']

    # one seeded generator, drawn from in the same order as before
    rng = np.random.RandomState(0)
    y_train = 0.2 * (rng.rand(len(train)) - 0.5) + 0.5
    y_tourn = 0.2 * (rng.rand(len(tourn)) - 0.5) + 0.5

    first = first.merge_arrays(train.ids, y_train, 'model1')
    second = second.merge_arrays(tourn.ids, y_tourn, 'model1')

    total = first + second  # just make sure that it runs

    # both additions below are required to raise ValueError
    assert_raises(ValueError, total.__add__, first)
    assert_raises(ValueError, first.__add__, first)
def test_prediction_properties():
    "prediction properties should not be corrupted"
    d = nx.testing.micro_data()

    # (target column, model name, tournament number) in merge order
    specs = (('bernie', 'model1', 1), ('elizabeth', 'model2', 2))
    p = nx.Prediction()
    for target, name, number in specs:
        p = p.merge_arrays(d.ids, d.y[target], name, number)

    # ids must agree with the prediction's own frame and with the data
    for frame in (p.df, d.df):
        ok_((p.ids == frame.index).all(), "ids is corrupted")

    # column k of y must equal the k-th merged target
    for col, (target, _, _) in enumerate(specs):
        ok_((p.y[:, col] == getattr(d.df, target)).all(), "y is corrupted")
def test_data_properties():
    "prediction properties should not be corrupted"
    d = testing.micro_data()

    # the same y is merged twice, under two model names
    names = ('model1', 'model2')
    p = nx.Prediction()
    for name in names:
        p = p.merge_arrays(d.ids, d.y, name)

    # ids must agree with the prediction's own frame and with the data
    for frame in (p.df, d.df):
        ok_((p.ids == frame.index).all(), "ids is corrupted")

    # every column of the prediction must equal the data's y
    for col in range(len(names)):
        ok_((p.y[:, col] == d.df.y).all(), "y is corrupted")
def test_prediction_ynew():
    "test prediction.ynew"
    p = nx.testing.micro_prediction()
    template = p.y.copy()
    y_new = np.random.rand(*template.shape)

    # a replacement of matching shape must come back unchanged
    replaced = p.ynew(y_new)
    np.testing.assert_array_equal(replaced.y, y_new, 'prediction.ynew failed')

    # too few rows, too few columns, and a flattened array must be rejected
    wrong_shapes = (y_new[:3], y_new[:, :2], y_new.reshape(-1))
    for bad in wrong_shapes:
        assert_raises(ValueError, p.ynew, bad)

    # an empty prediction must reject the array too
    p = nx.Prediction()
    assert_raises(ValueError, p.ynew, y_new)
def test_prediction_properties():
    """prediction properties should not be corrupted"""
    d = nx.testing.micro_data()

    # the 'kazutsugi' target is merged twice: model1/1 and model2/2
    specs = (('model1', 1), ('model2', 2))
    p = nx.Prediction()
    for name, number in specs:
        p = p.merge_arrays(d.ids, d.y['kazutsugi'], name, number)

    # ids must agree with the prediction's own frame and with the data
    for frame in (p.df, d.df):
        ok_((p.ids == frame.index).all(), "ids is corrupted")

    # both prediction columns must equal the kazutsugi column of the data
    for col in range(len(specs)):
        ok_((p.y[:, col] == d.df.kazutsugi).all(), "y is corrupted")
def test_prediction_dominance():
    "make sure prediction.dominance runs"
    validation = nx.play_data()['validation']

    # three models that all carry the validation y
    p = nx.Prediction()
    for name in ('model1', 'model2', 'model3'):
        p = p.merge_arrays(validation.ids, validation.y, name)

    result = p.dominance(validation)
    ok_(isinstance(result, pd.DataFrame), 'expecting a dataframe')

    # dominance on the single-model selection is required to raise
    assert_raises(ValueError, p['model1'].dominance, validation)
def test_empty_prediction():
    "Test handling of empty predictions"
    p = nx.Prediction()
    ok_(p.names == [], "wrong name")

    # rename and drop must reject both a single name and a list of names
    for method in (p.rename, p.drop):
        for arg in ('name', ['name']):
            assert_raises(ValueError, method, arg)
    assert_raises(ValueError, p.save, 'not_used')

    # container-like properties of an empty prediction
    no_ids = np.array([], dtype=str)
    ok_((p.ids == no_ids).all(), 'empty ids')
    ok_(p.copy() == p, 'empty copy')
    ok_(p.size == 0, 'empty size')
    ok_(p.shape == (0, 0), 'empty shape')
    ok_(len(p) == 0, 'empty length')

    # only checks that building the repr does not fail
    p.__repr__()
def test_prediction_dominance():
    "make sure prediction.dominance runs"
    validation = nx.play_data()['validation']

    # (target column, model name, tournament number) in merge order
    specs = (
        ('bernie', 'model1', 1),
        ('elizabeth', 'model2', 2),
        ('jordan', 'model3', 3),
    )
    p = nx.Prediction()
    for target, name, number in specs:
        p = p.merge_arrays(validation.ids, validation.y[target], name, number)

    # the tournament is given once as a number and once as a name; only
    # the second result is inspected
    for tourn in (3, 'jordan'):
        result = p.dominance(validation, tourn)
    ok_(isinstance(result, pd.DataFrame), 'expecting a dataframe')

    # dominance on the single-pair selection is required to raise
    assert_raises(ValueError, p[('model1', 1)].dominance, validation, 1)
def test_prediction_dominance():
    """make sure prediction.dominance runs"""
    validation = nx.play_data()['validation']

    # three models, all merged from the 'kazutsugi' target as tournament 8
    p = nx.Prediction()
    for name in ('model1', 'model2', 'model3'):
        p = p.merge_arrays(validation.ids, validation.y['kazutsugi'], name, 8)

    # the tournament is given once as a number and once as a name; only
    # the second result is inspected
    for tourn in (8, 'kazutsugi'):
        result = p.dominance(validation, tourn)
    ok_(isinstance(result, pd.DataFrame), 'expecting a dataframe')

    # NOTE(review): the key below uses tournament 1 although every merge
    # above used 8 -- confirm that this lookup is intended
    assert_raises(ValueError, p[('model1', 1)].dominance, validation, 1)
def micro_prediction(index=None):
    "Returns a tiny prediction object for use in unit testing"
    # one row per id 'index0'..'index9'; one column per model
    rows = [
        [0.00, 0.01, 0.02],
        [0.10, 0.11, 0.12],
        [0.20, 0.21, 0.22],
        [0.30, 0.31, 0.32],
        [0.40, 0.41, 0.42],
        [0.50, 0.51, 0.52],
        [0.60, 0.61, 0.62],
        [0.70, 0.71, 0.72],
        [0.80, 0.81, 0.82],
        [0.90, 0.91, 0.92],
    ]
    df = pd.DataFrame(columns=['model0', 'model1', 'model2'])
    # rows are added one at a time through .loc, exactly as before
    for number, values in enumerate(rows):
        df.loc['index{}'.format(number)] = values

    # optional positional row selection
    if index is not None:
        df = df.iloc[index]
    df = df.copy()  # assure contiguous memory
    return nx.Prediction(df)
def micro_prediction(index=None):
    """Returns a tiny prediction object for use in unit testing"""
    # columns are (model name, tournament number) pairs
    pairs = [('model0', 2), ('model1', 1), ('model2', 3), ('model0', 5)]
    # one row per id 'index0'..'index9'; one value per pair
    rows = [
        [0.002, 0.011, 0.023, 0.005],
        [0.102, 0.111, 0.123, 0.105],
        [0.202, 0.211, 0.223, 0.205],
        [0.302, 0.311, 0.323, 0.305],
        [0.402, 0.411, 0.423, 0.405],
        [0.502, 0.511, 0.523, 0.505],
        [0.602, 0.611, 0.623, 0.605],
        [0.702, 0.711, 0.723, 0.705],
        [0.802, 0.811, 0.823, 0.805],
        [0.902, 0.911, 0.923, 0.905],
    ]
    df = pd.DataFrame(columns=pairs)
    # rows are added one at a time through .loc, exactly as before
    for number, values in enumerate(rows):
        df.loc['index{}'.format(number)] = values

    # optional positional row selection
    if index is not None:
        df = df.iloc[index]
    df = df.copy()  # assure contiguous memory
    return nx.Prediction(df)
def run(model, splitter, tournament=None, verbosity=2):
    """
    Run every requested model/tournament pair through a data splitter.

    Parameters
    ----------
    model : nx.Model, list, tuple
        Prediction model to run through the splitter. Can be a list or
        tuple of prediction models. Model names must be unique.
    splitter : nx.Splitter
        An iterator of fit/predict data pairs. It is reset after each run.
    tournament : {None, int, str, list, tuple}, optional
        The tournament(s) to run the model through. By default (None) the
        model is run through every tournament returned by
        `nx.tournament_all()`. A list or tuple of tournaments must not
        contain duplicates.
    verbosity : int, optional
        An integer that determines verbosity. Zero is silent.

    Returns
    -------
    p : nx.Prediction
        A prediction object containing the predictions of the specified
        model/tournament pairs.

    Raises
    ------
    ValueError
        If `model` or `tournament` has an unsupported type, if two models
        share a name, or if a tournament is listed more than once.
    """
    # normalise `model` to a sequence of models
    if isinstance(model, nx.Model):
        models = [model]
    elif isinstance(model, (list, tuple)):
        models = model
    else:
        raise ValueError('`model` must be a model, list, or tuple of models')

    model_names = [m.name for m in models]
    if len(set(model_names)) != len(model_names):
        raise ValueError('`model` cannot contain duplicate names')

    # normalise `tournament` to a list of tournament names
    if tournament is None:
        requested = nx.tournament_all()
    elif nx.isint(tournament) or nx.isstring(tournament):
        requested = [tournament]
    elif isinstance(tournament, (list, tuple)):
        requested = tournament
    else:
        msg = '`tournament` must be an integer, string, list, tuple, or None.'
        raise ValueError(msg)

    # duplicates are detected after conversion to names
    tournaments = [nx.tournament_str(t) for t in requested]
    if len(set(tournaments)) != len(tournaments):
        raise ValueError('`tournament` cannot contain duplicates')

    # loop over all model/tournament pairs
    p = nx.Prediction()
    for m in models:
        for t in tournaments:
            p += run_one(m, splitter, t, verbosity=verbosity)
            splitter.reset()
        splitter.reset()
    return p
def test_emtpy_y_raises():
    "access y on an empty prediction"
    empty = nx.Prediction()
    # NOTE(review): the test name suggests this attribute access is expected
    # to raise; nothing in this function catches or asserts that, so any
    # expectation must be declared outside of it -- confirm
    empty.y
def test_data_hash():
    "test prediction.hash"
    # hashing the same object twice must give the same value
    full = nx.testing.micro_prediction()
    first, second = full.hash(), full.hash()
    ok_(first == second, "prediction.hash not reproduceable")

    # same check for a prediction built from every other row of the frame
    strided = nx.Prediction(full.df[::2])
    first, second = strided.hash(), strided.hash()
    ok_(first == second, "prediction.hash not reproduceable")