def y_to_nan(self):
    """Copy of data with y values set to NaN.

    Returns a new Data object whose active-tournament target columns are
    all NaN; the original data is left untouched.
    """
    data = self.copy()
    # tournament_iter yields (number, name) pairs everywhere else in this
    # module; the original loop forgot to unpack them, which would make the
    # dict key a tuple and break df.assign (keywords must be strings).
    for number, name in nx.tournament_iter(active_only=True):
        kwargs = {name: np.nan}
        data.df = data.df.assign(**kwargs)
    return data
def get_user_activities(user):
    """Activity of `user` across all rounds and tournaments as dataframe"""
    napi = NumerAPI()
    activities = []
    for number, name in nx.tournament_iter():
        # the API is queried per tournament number; accumulate raw records
        activities.extend(napi.get_user_activities(user, number))
    rows = [flatten_dict(record) for record in activities]
    return pd.DataFrame.from_dict(rows)
def xnew(self, x_array):
    """Copy of data but with data.x=`x_array`; must have same number of rows.

    Raises
    ------
    ValueError
        If `x_array` does not have the same number of rows as the data.
    """
    if x_array.shape[0] != len(self):
        msg = "`x_array` must have the same number of rows as data"
        raise ValueError(msg)
    # derive the tournament-column count instead of hard-coding 5 (and the
    # matching `+ 7` total), keeping this in sync with load_zip and the
    # other xnew implementation in this project
    n = nx.tournament_count()
    # columns: era, region, features, then one target column per tournament
    shape = (x_array.shape[0], x_array.shape[1] + n + 2)
    cols = ['x' + str(i) for i in range(x_array.shape[1])]
    cols = ['era', 'region'] + cols
    cols = cols + [name for number, name in nx.tournament_iter()]
    df = pd.DataFrame(data=np.empty(shape, dtype=np.float64),
                      index=self.df.index.copy(deep=True),
                      columns=cols)
    df['era'] = self.df['era'].values.copy()
    df['region'] = self.df['region'].values.copy()
    # the feature block sits between (era, region) and the target columns
    df.values[:, 2:-n] = x_array
    for number, name in nx.tournament_iter():
        df[name] = self.df[name].values.copy()
    return Data(df)
def y_df(self):
    """Copy of targets, y, as a dataframe"""
    names = []
    arrays = []
    for number, name in nx.tournament_iter(active_only=True):
        names.append(name)
        # each target becomes one column
        arrays.append(self.y[number].reshape(-1, 1))
    stacked = np.hstack(arrays)
    return pd.DataFrame(data=stacked, columns=names, index=self.ids)
def test_tournament_int():
    """test tournament_int"""
    for t_int, t_str in nx.tournament_iter():
        # both the int and the str form must map back to the same int
        for key in (t_int, t_str):
            ok_(nx.tournament_int(key) == t_int, "tournament int do not agree")
    # invalid inputs must raise
    assert_raises(ValueError, nx.tournament_int, 0)
    assert_raises(ValueError, nx.tournament_int, 'burn')
    assert_raises(ValueError, nx.tournament_int, None)
def test_tournament_str():
    """test tournament_str"""
    for t_int, t_str in nx.tournament_iter():
        # both the int and the str form must map back to the same str
        for key in (t_int, t_str):
            ok_(nx.tournament_str(key) == t_str, "tournament str do not agree")
    # invalid inputs must raise
    assert_raises(ValueError, nx.tournament_str, 0)
    assert_raises(ValueError, nx.tournament_str, 9)
    assert_raises(ValueError, nx.tournament_str, 'burn')
    assert_raises(ValueError, nx.tournament_str, None)
def test_prediction_regression():
    "regression test of prediction performance evaluation"
    d = nx.play_data()
    # run across all tournaments at once ...
    p = nx.production(nx.logistic(), d, tournament=None, verbosity=0)
    for number, name in nx.tournament_iter():
        # ... and compare against a single-tournament run
        p2 = nx.production(nx.logistic(), d, tournament=name, verbosity=0)
        df = p.performance_mean(d['validation'], mean_of='tournament')
        ll_all = df.loc[name]['logloss']
        ll_one = p2.summary(d['validation']).loc['mean']['logloss']
        ok_(np.abs(ll_all - ll_one) < 1e-6, 'failed on {}'.format(name))
def get_stakes_cutoff(round_number=None):
    """
    Staking confidence cutoff for all tournaments in given round.

    Use this function for `round_number` greater than 112.
    """
    rows = []
    for number, name in nx.tournament_iter():
        # only the cutoff is kept; the stakes dataframe is discarded
        stakes, cutoff = get_stakes(round_number, tournament=number)
        rows.append([name, cutoff])
    df = pd.DataFrame(data=rows, columns=['tourney', 'cutoff'])
    return df.set_index('tourney')
def test_prediction_regression():
    """regression test of prediction performance evaluation"""
    d = nx.play_data()
    # run across all tournaments at once ...
    p = nx.production(nx.linear(), d, tournament=None, verbosity=0)
    for number, name in nx.tournament_iter():
        # ... and compare against a single-tournament run
        p2 = nx.production(nx.linear(), d, tournament=name, verbosity=0)
        df = p.performance_mean(d['validation'], mean_of='tournament')
        # round both values to 3 decimals before comparing
        mse1 = float('%.3f' % (df.loc[name]['mse']))
        mse2 = float('%.3f' % (p2.summary(d['validation']).loc['mean']['mse']))
        diff = np.abs(mse1 - mse2)
        ok_(diff < 1e-6, f"failed on {name}")
def xnew(self, x_array):
    """
    Copy of data but with data.x=`x_array`

    must have same number of rows
    """
    if x_array.shape[0] != len(self):
        msg = "`x_array` must have the same number of rows as data"
        raise ValueError(msg)
    n_tourn = nx.tournament_count(active_only=True)
    n_rows, n_x = x_array.shape
    names = [name for _, name in nx.tournament_iter(active_only=True)]
    # column layout: era, region, features x1..xk, then the targets
    columns = ['era', 'region']
    columns += ['x' + str(j + 1) for j in range(n_x)]
    columns += names
    df = pd.DataFrame(data=np.empty((n_rows, n_x + n_tourn + 2),
                                    dtype=np.float64),
                      index=self.df.index.copy(deep=True),
                      columns=columns)
    df['era'] = self.df['era'].values.copy()
    df['region'] = self.df['region'].values.copy()
    # the feature block lives between (era, region) and the target columns
    df.values[:, 2:-n_tourn] = x_array
    for name in names:
        df[name] = self.df[name].values.copy()
    return Data(df)
def metric_per_tournament(self, data, metric='corr'):
    """DataFrame containing given metric versus tournament"""
    frames = []
    for t_int, t_name in nx.tournament_iter(active_only=False):
        frame, info = metrics_per_name(data, self, t_int, columns=[metric],
                                       split_pairs=False)
        # one column per tournament, named after the tournament
        frame.columns = [t_name]
        frames.append(frame)
    out = pd.concat(frames, axis=1)
    # append a row-wise mean column and sort by it
    out.insert(out.shape[1], 'mean', out.mean(axis=1))
    return out.sort_values('mean')
def get_stakes_users(users, round_number=None):
    """
    Stakes for given users for all tournaments.

    Use this function for `round_number` greater than 112.
    """
    frames = []
    for number, name in nx.tournament_iter():
        s, p = get_stakes(round_number, tournament=number)
        # keep only the requested users and tag rows with the tournament
        s = s[s.index.isin(users)]
        s.insert(0, 'tourney', name)
        frames.append(s)
    return pd.concat(frames, axis=0)
def test_data_y_for_tournament():
    """test data.y_for_tournament"""
    d = nx.load_zip(TINY_DATASET_CSV)
    for number, name in nx.tournament_iter(active_only=True):
        # expected targets for the tiny fixture dataset
        expected = np.zeros(14)
        expected[[0, 4, 5, 9]] = 0.75000
        expected[[1, 6]] = 0.25000
        expected[10:] = np.nan
        yt = d.y[number]
        # indexing by number and by name must agree
        assert_array_equal(yt, d.y[name], f"y{number} indexing corrupted")
        assert_array_equal(yt, expected, f"y{number} targets corrupted")
def load_zip(file_path, verbose=False):
    """Load numerai dataset from zip archive; return Data"""
    # read both csv files out of the archive and stack them
    archive = zipfile.ZipFile(file_path)
    frames = [pd.read_csv(archive.open(f), header=0, index_col=0)
              for f in (TRAIN_FILE, TOURNAMENT_FILE)]
    df = pd.concat(frames, axis=0)
    # map raw csv column names onto the names used internally
    renames = {'data_type': 'region'}
    renames.update({'feature' + str(i): 'x' + str(i)
                    for i in range(1, N_FEATURES + 1)})
    renames.update({'target_' + name: name
                    for number, name in nx.tournament_iter()})
    df.rename(columns=renames, inplace=True)
    # era, region, and the target labels all become np.float64
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n = len(nx.tournament_all())
    df.iloc[:, -n:] = df.iloc[:, -n:].astype('float64')
    # sanity check: no way we did something wrong, right?
    expected = 2 + N_FEATURES + nx.tournament_count()
    if df.shape[1] != expected:
        raise IOError(
            "expecting {} columns; found {}".format(expected, df.shape[1]))
    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()
    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")
    data = Data(df)
    if verbose:
        print(data)
    return data
def cutoff_impact(round_number=None, nmrs=None, is_cutoff=True,
                  is_relative=False):
    """
    Impact of adding stakes of various sizes (nmr) above the cutoff.

    If `is_cutoff` is True (default) then the cutoff is returned; otherwise
    the payout ratio is returned.

    If `is_relative` is False (default) then the absolute cutoff or payout
    ratio is given; otherwise a relative value (compared to adding no
    additional stake) is given.
    """
    # avoid a mutable default argument; None selects the standard probes
    if nmrs is None:
        nmrs = [5, 10, 100, 200]
    # BUG FIX: the original reused the loop variable `name` for this label,
    # clobbering the tournament name so every row of the 'tourney' index
    # held the literal string 'cutoff' (or 'payout').
    label = 'cutoff' if is_cutoff else 'payout'
    data = []
    for number, name in nx.tournament_iter():
        s, cutoff = get_stakes(round_number, tournament=number)
        # baseline value with no additional stake added
        x0 = cutoff if is_cutoff else (1 - cutoff) / cutoff
        d = [name, x0]
        for nmr in nmrs:
            c, ignore = calc_cutoff(s, impact_probe_nmr=nmr)
            x = c if is_cutoff else (1 - c) / c
            if is_relative:
                x = x - x0
            d.append(x)
        data.append(d)
    columns = ['tourney', label] + nmrs
    df = pd.DataFrame(data=data, columns=columns)
    df = df.set_index('tourney')
    return df
def load_zip(file_path, verbose=False, include_train=True,
             single_precision=True):
    """
    Load numerai dataset from zip archive; return Data

    It includes train data by default. To work with tournament data only,
    set `include_train` to False.

    Set `single_precision` to True in order to have data in float32
    (saves memory).
    """
    # load zip
    zf = zipfile.ZipFile(file_path)
    if single_precision:
        # read first 100 rows to scan types
        # then replace all float64 types with float32
        df_test = pd.read_csv(zf.open(TOURNAMENT_FILE), nrows=100, header=0,
                              index_col=0)
        float_cols = [c for c in df_test if df_test[c].dtype == "float64"]
        float32_cols = {c: np.float32 for c in float_cols}
        # re-read the full file with the narrowed dtypes
        tourn = pd.read_csv(zf.open(TOURNAMENT_FILE), header=0, index_col=0,
                            engine='c', dtype=float32_cols)
        if include_train:
            train = pd.read_csv(zf.open(TRAIN_FILE), header=0, index_col=0,
                                engine='c', dtype=float32_cols)
            # merge train and tournament data to single dataframe
            df = pd.concat([train, tourn], axis=0)
        else:
            df = tourn
    else:
        # regular parsing, float64 will be used
        tourn = pd.read_csv(zf.open(TOURNAMENT_FILE), header=0, index_col=0)
        if include_train:
            train = pd.read_csv(zf.open(TRAIN_FILE), header=0, index_col=0)
            # merge train and tournament data to single dataframe
            df = pd.concat([train, tourn], axis=0)
        else:
            df = tourn
    # rename columns
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    for number, name in nx.tournament_iter(active_only=True):
        # NOTE(review): this assigns the same 'target' key on every
        # iteration, so the raw 'target' column ends up renamed to the LAST
        # active tournament's name — presumably there is exactly one active
        # tournament here; confirm that is intended.
        rename_map['target'] = name
    df.rename(columns=rename_map, inplace=True)
    # convert era, region, and labels to np.float32 or
    # np.float64 depending on the mode
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n = nx.tournament_count(active_only=True)
    if single_precision:
        # targets are the last n columns; era and region are the first two
        df.iloc[:, -n:] = df.iloc[:, -n:].astype('float32')
        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype('float32')
    else:
        df.iloc[:, -n:] = df.iloc[:, -n:].astype('float64')
    # no way we did something wrong, right?
    n = 2 + N_FEATURES + nx.tournament_count(active_only=True)
    if df.shape[1] != n:
        raise IOError("expecting {} columns; found {}".format(n, df.shape[1]))
    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()
    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")
    data = Data(df)
    if verbose:
        print(data)
    return data