def main():
    # current Numerai Signals universe (Bloomberg tickers)
    numerai_tickers = pd.DataFrame(numerapi.SignalsAPI().ticker_universe(),
                                   columns=['bloomberg_ticker'])
    public_ticker_map = get_public_numerai_ticker_map()
    unique_bloomberg_tickers = list(
        set(numerai_tickers.bloomberg_ticker.tolist() +
            public_ticker_map.bloomberg_ticker.tolist()))

    print(f'Tickers in current Numerai Signals tournament: {len(numerai_tickers)}')
    print(f'Tickers in public numerai mapping: {len(public_ticker_map)}')
    print(f'Total number of unique bloomberg tickers: {len(unique_bloomberg_tickers)}')

    # build the Bloomberg -> Alpha Vantage side of the map
    ticker_map = pd.DataFrame({
        'bloomberg': unique_bloomberg_tickers,
        'alpha_vantage': None
    })
    ticker_map['ticker'] = ticker_map.bloomberg.str[:-3]  # strip the ' XX' country suffix
    ticker_map['is_available_on_alpha_vantage'] = ticker_map.bloomberg.apply(
        is_available_on_alpha_vantage)
    ticker_map['alpha_vantage'] = ticker_map.apply(
        get_alpha_vantage_ticker_for_row, axis=1)
    ticker_map.drop(['ticker', 'is_available_on_alpha_vantage'],
                    axis=1,
                    inplace=True)

    # add the Bloomberg -> Yahoo side from the public map
    ticker_map = ticker_map.merge(public_ticker_map,
                                  how='left',
                                  left_on='bloomberg',
                                  right_on='bloomberg_ticker')
    ticker_map.yahoo.fillna(SYMBOL_NOT_FOUND, inplace=True)

    print_ticker_map_stats(ticker_map)

    ticker_map.sort_values('bloomberg', inplace=True)
    ticker_map.to_csv(TICKER_MAP_FNAME,
                      index=False,
                      columns=['bloomberg', 'yahoo', 'alpha_vantage'])
    print(f'done - saved tickers to file {TICKER_MAP_FNAME}')
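# --- Hypothetical sketch (not part of the original file) ---
# main() above relies on an is_available_on_alpha_vantage helper that is not shown
# in this excerpt. A minimal version could look like the sketch below, assuming an
# ALPHA_VANTAGE_API_KEY environment variable and Alpha Vantage's SYMBOL_SEARCH
# endpoint; the real helper may well work differently.
import os
import requests

def is_available_on_alpha_vantage(bloomberg_ticker: str) -> bool:
    symbol = bloomberg_ticker[:-3]  # same ' XX' suffix stripping as in main()
    response = requests.get(
        'https://www.alphavantage.co/query',
        params={
            'function': 'SYMBOL_SEARCH',
            'keywords': symbol,
            'apikey': os.environ['ALPHA_VANTAGE_API_KEY'],
        },
        timeout=30,
    )
    matches = response.json().get('bestMatches', [])
    return any(match.get('1. symbol') == symbol for match in matches)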
def download_yfinance(ticker_map: pd.DataFrame):
    napi = numerapi.SignalsAPI()
    eligible_tickers = pd.Series(napi.ticker_universe(), name='bloomberg_ticker')
    print(f"Number of eligible tickers: {len(eligible_tickers)}")
    print(f"Number of tickers in map: {len(ticker_map)}")

    # map eligible Numerai tickers to Yahoo Finance tickers
    yfinance_tickers = eligible_tickers.map(
        dict(zip(ticker_map['bloomberg_ticker'], ticker_map['yahoo']))).dropna()
    bloomberg_tickers = ticker_map['bloomberg_ticker']
    print(f'Number of eligible, mapped tickers: {len(yfinance_tickers)}')

    # download price data in chunks of 1000 tickers
    n = 1000  # chunk row size
    chunk_df = [
        yfinance_tickers.iloc[i:i + n]
        for i in range(0, len(yfinance_tickers), n)
    ]
    concat_dfs = []
    print("Downloading data...")
    for df in chunk_df:
        try:
            # set threads = True for faster performance, but tickers will fail,
            # script may hang
            # set threads = False for slower performance, but more tickers will succeed
            temp_df = yfinance.download(df.str.cat(sep=' '),
                                        start='2005-12-01',
                                        threads=True)
            temp_df = temp_df['Adj Close'].stack().reset_index()
            concat_dfs.append(temp_df)
        except:  # skip chunks that fail to download
            pass

    full_data = pd.concat(concat_dfs)
    full_data.columns = ['date', 'ticker', 'price']

    # convert Yahoo Finance tickers back to Bloomberg tickers
    full_data['bloomberg_ticker'] = full_data.ticker.map(
        dict(zip(ticker_map['yahoo'], bloomberg_tickers)))
    return full_data
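# --- Hypothetical usage sketch (not part of the original file) ---
# download_yfinance() expects a frame with 'bloomberg_ticker' and 'yahoo' columns;
# the public Numerai Signals ticker map used elsewhere in these examples has that shape.
ticker_map = pd.read_csv(
    'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
)
full_data = download_yfinance(ticker_map)
print(full_data.head())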
import logging
import os

import joblib
import numerapi
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from dateutil.relativedelta import relativedelta, FR

TARGET_NAME = "target"
PREDICTION_NAME = "signal"
TRAINED_MODEL_PREFIX = './trained_model'

# Pull model id from "MODEL_ID" environment variable
# defaults to None, change to a model id from
MODEL_ID = os.getenv('MODEL_ID', None)
MODEL = GradientBoostingRegressor(subsample=0.1)

napi = numerapi.SignalsAPI()


def download_data(live_data_date):
    eligible_tickers = pd.Series(napi.ticker_universe(),
                                 name="bloomberg_ticker")
    logging.info(f"Number of eligible tickers: {len(eligible_tickers)}")

    yfinance_tickers = map_tickers(eligible_tickers, "bloomberg_ticker", "yahoo")
    logging.info(f"Number of yahoo tickers: {len(yfinance_tickers)}")

    num_days_lag = 5
    if os.path.exists('full_data.csv'):
        full_data = pd.read_csv('full_data.csv')
        quintile_lag, rsi_diff, rsi_diff_abs = get_rsi_feature_names(
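# --- Hypothetical sketch (not part of the original file) ---
# download_data() above calls a map_tickers helper whose definition is not in this
# excerpt. Based on the mapping code in the other examples here, it likely maps one
# column of the public Numerai Signals ticker map to another; the URL and function
# body below are assumptions.
def map_tickers(tickers: pd.Series, from_col: str, to_col: str) -> pd.Series:
    ticker_map = pd.read_csv(
        'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
    )
    return tickers.map(dict(zip(ticker_map[from_col], ticker_map[to_col]))).dropna()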
def main():
    napi = numerapi.SignalsAPI()

    # Numerai Universe
    eligible_tickers = pd.Series(napi.ticker_universe(), name="bloomberg_ticker")
    print(f"Number of eligible tickers : {len(eligible_tickers)}")
    ticker_map = pd.read_csv(
        "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv"
    )

    # ----- Yahoo <-> Bloomberg mapping -----
    yfinance_tickers = eligible_tickers.map(
        dict(zip(ticker_map["bloomberg_ticker"], ticker_map["yahoo"]))).dropna()
    bloomberg_tickers = ticker_map["bloomberg_ticker"]
    print(f"Number of eligible, mapped tickers: {len(yfinance_tickers)}")
    us_ticker_map = ticker_map[ticker_map.bloomberg_ticker.str[-2:] == "US"]
    # tickers = us_ticker_map.yahoo.dropna().values  # for US tickers
    tickers = ticker_map.yahoo.dropna().values  # for all possible tickers

    # ----- Raw data loading and formatting -----
    print(f"using tickers: {len(tickers)}")
    full_data = load_data(tickers, "full_data.csv", threads=LOAD_DATA_IN_PARALLEL)
    full_data["bloomberg_ticker"] = full_data.ticker.map(
        dict(zip(ticker_map["yahoo"], bloomberg_tickers)))
    full_data = full_data[[
        "bloomberg_ticker", "open", "high", "low", "close", "adjusted close"
    ]].sort_index(ascending=True)
    full_data.dropna(inplace=True, axis=0)

    # ----- Merging targets -----
    url = "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val_bbg.csv"
    targets = pd.read_csv(url)
    targets["target"] = targets["target"].astype(np.float16)
    targets["date"] = pd.to_datetime(targets["friday_date"], format="%Y%m%d")
    gc.collect()

    # ----- Generate and select features -----
    full_data = generate_featues(full_data)
    feature_names = [f for f in full_data.columns if "quintile" in f]
    ML_data = pd.merge(
        full_data.reset_index(),
        targets,
        on=["date", "bloomberg_ticker"],
    ).set_index("date")
    print(f"Number of eras in data: {len(ML_data.index.unique())}")
    ML_data = ML_data[ML_data.index.weekday == 4]
    ML_data = ML_data[ML_data.index.value_counts() > 200]

    # ----- Train test split -----
    train_data = ML_data[ML_data["data_type"] == "train"]
    test_data = ML_data[ML_data["data_type"] == "validation"]
    corrs = train_data.groupby(train_data.index).apply(
        lambda x: x[feature_names + [TARGET_NAME]].corr()[TARGET_NAME])
    mean_corr = corrs[feature_names].mean(0)
    print(mean_corr)

    last_friday = datetime.now() + relativedelta(weekday=FR(-1))
    print(last_friday)
    date_string = last_friday.strftime("%Y-%m-%d")
    try:
        live_data = full_data.loc[date_string].copy()
    except KeyError as e:
        print(f"No ticker on {e}")
        live_data = full_data.iloc[:0].copy()
    live_data.dropna(subset=feature_names, inplace=True)
    print(len(live_data))

    # ----- Train model -----
    print("Training model...")
    model = GradientBoostingRegressor()
    model.fit(train_data[feature_names], train_data[TARGET_NAME])
    print("Model trained.")

    # ----- Predict test data -----
    train_data[PREDICTION_NAME] = model.predict(train_data[feature_names])
    test_data[PREDICTION_NAME] = model.predict(test_data[feature_names])
    live_data[PREDICTION_NAME] = model.predict(live_data[feature_names])

    diagnostic_df = pd.concat([test_data, live_data])
    diagnostic_df["friday_date"] = diagnostic_df.friday_date.fillna(
        last_friday.strftime("%Y%m%d")).astype(int)
    diagnostic_df["data_type"] = diagnostic_df.data_type.fillna("live")
    diagnostic_df[["bloomberg_ticker", "friday_date", "data_type",
                   "signal"]].reset_index(drop=True).to_csv(
                       "example_signal_alphavantage.csv", index=False)
    print(
        "Submission saved to example_signal_alphavantage.csv. Upload to signals.numer.ai for scores and diagnostics"
    )
def main():
    '''Creates example_signal_upload.csv to upload for validation and live data submission'''
    napi = numerapi.SignalsAPI()

    # read in list of active Signals tickers which can change slightly era to era
    eligible_tickers = pd.Series(napi.ticker_universe(), name='bloomberg_ticker')
    print(f"Number of eligible tickers: {len(eligible_tickers)}")

    # read in yahoo to bloomberg ticker map, still a work in progress, h/t wsouza
    ticker_map = pd.read_csv(
        'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
    )
    print(f"Number of tickers in map: {len(ticker_map)}")

    # map eligible numerai tickers to yahoo finance tickers
    yfinance_tickers = eligible_tickers.map(
        dict(zip(ticker_map['bloomberg_ticker'], ticker_map['yahoo']))).dropna()
    bloomberg_tickers = ticker_map['bloomberg_ticker']
    print(f'Number of eligible, mapped tickers: {len(yfinance_tickers)}')

    # download data
    n = 1000  # chunk row size
    chunk_df = [
        yfinance_tickers.iloc[i:i + n]
        for i in range(0, len(yfinance_tickers), n)
    ]
    concat_dfs = []
    print("Downloading data...")
    for df in chunk_df:
        try:
            # set threads = True for faster performance, but tickers will fail,
            # script may hang
            # set threads = False for slower performance, but more tickers will succeed
            temp_df = yfinance.download(df.str.cat(sep=' '),
                                        start='2005-12-01',
                                        threads=False)
            temp_df = temp_df['Adj Close'].stack().reset_index()
            concat_dfs.append(temp_df)
        except:  # simplejson.errors.JSONDecodeError:
            pass
    full_data = pd.concat(concat_dfs)

    # properly position and clean raw data, after taking adjusted close only
    full_data.columns = ['date', 'ticker', 'price']
    full_data.set_index('date', inplace=True)

    # convert yahoo finance tickers back to numerai tickers
    full_data['bloomberg_ticker'] = full_data.ticker.map(
        dict(zip(ticker_map['yahoo'], bloomberg_tickers)))
    print('Data downloaded.')
    print(f"Number of tickers with data: {len(full_data.bloomberg_ticker.unique())}")

    ticker_groups = full_data.groupby('ticker')
    full_data['RSI'] = ticker_groups['price'].transform(lambda x: RSI(x))

    # group by era (date) and create quintile labels within each era,
    # useful for learning relative ranking
    date_groups = full_data.groupby(full_data.index)
    full_data['RSI_quintile'] = date_groups['RSI'].transform(
        lambda group: pd.qcut(group, 5, labels=False, duplicates='drop'))
    full_data.dropna(inplace=True)

    # create lagged features grouped by ticker
    ticker_groups = full_data.groupby('ticker')
    num_days = 5
    # lag 0 is that day's value, lag 1 is yesterday's value, etc
    for day in range(num_days + 1):
        full_data[f'RSI_quintile_lag_{day}'] = ticker_groups[
            'RSI_quintile'].transform(lambda group: group.shift(day))

    # create difference of the lagged features and absolute difference of the
    # lagged features (change in RSI quintile by day)
    for day in range(num_days):
        full_data[f'RSI_diff_{day}'] = full_data[
            f'RSI_quintile_lag_{day}'] - full_data[f'RSI_quintile_lag_{day + 1}']
        full_data[f'RSI_abs_diff_{day}'] = np.abs(
            full_data[f'RSI_quintile_lag_{day}'] -
            full_data[f'RSI_quintile_lag_{day + 1}'])

    # define column names of features, target, and prediction
    feature_names = [f'RSI_quintile_lag_{num}' for num in range(num_days)] + [
        f'RSI_diff_{num}' for num in range(num_days)
    ] + [f'RSI_abs_diff_{num}' for num in range(num_days)]
    print(f'Features for training:\n {feature_names}')
    TARGET_NAME = 'target'
    PREDICTION_NAME = 'signal'

    # read in Signals targets
    targets = pd.read_csv('historical_targets.csv')
    targets['date'] = pd.to_datetime(targets['friday_date'], format='%Y%m%d')

    # merge our feature data with Numerai targets
    ML_data = pd.merge(full_data.reset_index(),
                       targets,
                       on=['date', 'bloomberg_ticker']).set_index('date')
    # print(f'Number of eras in data: {len(ML_data.index.unique())}')

    # for training and testing we want clean, complete data only
    ML_data.dropna(inplace=True)
    ML_data = ML_data[ML_data.index.weekday == 4]  # ensure we have only fridays
    ML_data = ML_data[ML_data.index.value_counts() > 50]  # drop eras with under 50 observations per era

    # train test split
    train_data = ML_data[ML_data['data_type'] == 'train']
    test_data = ML_data[ML_data['data_type'] == 'validation']

    # train model
    print("Training model...")
    model = GradientBoostingRegressor(subsample=0.1)
    model.fit(train_data[feature_names], train_data[TARGET_NAME])
    print("Model trained.")

    # predict test data
    test_data[PREDICTION_NAME] = model.predict(test_data[feature_names])

    # predict live data
    # choose data as of most recent friday
    last_friday = datetime.now() + relativedelta(weekday=FR(-1))
    date_string = last_friday.strftime('%Y-%m-%d')
    try:
        live_data = full_data.loc[date_string].copy()
    except KeyError as e:
        print(f"No ticker on {e}")
        live_data = full_data.iloc[:0].copy()
    live_data.dropna(subset=feature_names, inplace=True)

    # get data from the day before, for markets that were closed
    # on the most recent friday
    last_thursday = last_friday - relativedelta(days=1)
    thursday_date_string = last_thursday.strftime('%Y-%m-%d')
    thursday_data = full_data.loc[thursday_date_string]
    # only select tickers that aren't already present in live_data
    thursday_data = thursday_data[
        ~thursday_data.ticker.isin(live_data.ticker.values)].copy()
    thursday_data.dropna(subset=feature_names, inplace=True)
    live_data = pd.concat([live_data, thursday_data])
    print(f"Number of live tickers to submit: {len(live_data)}")
    live_data[PREDICTION_NAME] = model.predict(live_data[feature_names])

    # prepare and writeout example file
    diagnostic_df = pd.concat([test_data, live_data])
    diagnostic_df['friday_date'] = diagnostic_df.friday_date.fillna(
        last_friday.strftime('%Y%m%d')).astype(int)
    diagnostic_df['data_type'] = diagnostic_df.data_type.fillna('live')
    diagnostic_df[['bloomberg_ticker', 'friday_date', 'data_type',
                   'signal']].reset_index(drop=True).to_csv(
                       'example_signal_upload.csv', index=False)
    print(
        'Example submission completed. Upload to signals.numer.ai for scores and live submission'
    )
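# --- Hypothetical sketch (not part of the original files) ---
# Both this script and the EOD example below call an RSI helper that is not shown
# in these excerpts. A minimal pandas version of the standard relative strength
# index (simple moving averages of gains and losses; the original may use Wilder's
# smoothing instead) could look like this:
import pandas as pd

def RSI(prices: pd.Series, interval: int = 14) -> pd.Series:
    delta = prices.diff()
    gains = delta.clip(lower=0)
    losses = -delta.clip(upper=0)
    avg_gain = gains.rolling(interval).mean()
    avg_loss = losses.rolling(interval).mean()
    rs = avg_gain / avg_loss
    return 100 - 100 / (1 + rs)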
def main():
    # -----Tickers and mapping-----
    napi = numerapi.SignalsAPI()
    eligible_tickers = pd.Series(napi.ticker_universe(), name="bloomberg_ticker")
    ticker_map = pd.read_csv(
        "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv"
    )
    ticker_map = ticker_map[ticker_map.bloomberg_ticker.isin(eligible_tickers)]
    numerai_tickers = ticker_map["bloomberg_ticker"]
    yfinance_tickers = ticker_map["yahoo"]
    eod_tickers = pd.read_csv(
        "https://s3.amazonaws.com/quandl-production-static/end_of_day_us_stocks/ticker_list.csv"
    )
    print(f"Number of eligible tickers : {len(eligible_tickers)}")
    common_tickers = np.intersect1d(yfinance_tickers.values.astype(str),
                                    eod_tickers["Ticker"].values.astype(str))
    print(f"Number of tickers common between EOD and Bloomberg: {len(common_tickers)}")

    # downloads the whole dataset as zip and read data (takes around 1.5min)
    full_data = download_full_and_load(ticker_map,
                                       common_tickers,
                                       f_name="full_EOD.zip")

    # Building a custom feature
    full_data["day_chg"] = full_data["close"] / full_data["open"] - 1
    gc.collect()

    # -----Feature engineering-----
    ticker_groups = full_data.groupby("bloomberg_ticker")
    # RSI
    full_data["close_RSI_14"] = ticker_groups["close"].transform(lambda x: RSI(x, 14))
    full_data["close_RSI_21"] = ticker_groups["close"].transform(lambda x: RSI(x, 21))
    full_data["day_chg_RSI_14"] = ticker_groups["day_chg"].transform(lambda x: RSI(x, 14))
    full_data["day_chg_RSI_21"] = ticker_groups["day_chg"].transform(lambda x: RSI(x, 21))
    # SMA
    full_data["close_SMA_14"] = ticker_groups["close"].transform(lambda x: x.rolling(14).mean())
    full_data["close_SMA_21"] = ticker_groups["close"].transform(lambda x: x.rolling(21).mean())
    indicators = [
        "close_RSI_14", "close_RSI_21", "day_chg_RSI_14", "close_SMA_14",
        "close_SMA_21", "day_chg_RSI_21"
    ]
    full_data.dropna(axis=0, inplace=True)
    del ticker_groups

    # -----Feature engineering: Quintile-----
    # note: qcut uses 100 bins (percentiles), despite the "quintile" column names
    date_groups = full_data.groupby(full_data.index)
    print("Quintiling...")
    for indicator in indicators:
        full_data[f"{indicator}_quintile"] = (
            date_groups[indicator].transform(lambda group: pd.qcut(
                group, 100, labels=False, duplicates="drop")).astype(np.float16))
        gc.collect()
    del date_groups
    gc.collect()

    # -----Feature engineering: Quintile lag-----
    ticker_groups = full_data.groupby("ticker")
    # create lagged features, lag 0 is that day's value, lag 1 is yesterday's value, etc
    print("Calculating lag...")
    num_days = 5
    for indicator in indicators:
        for day in range(num_days + 1):
            full_data[f"{indicator}_quintile_lag_{day}"] = ticker_groups[
                f"{indicator}_quintile"].transform(lambda group: group.shift(day))
        gc.collect()
    full_data.dropna(axis=0, inplace=True)
    del ticker_groups
    gc.collect()

    print("Calculating changes in lag...")
    # create difference of the lagged features (change in RSI quintile by day)
    for indicator in indicators:
        for day in range(0, num_days):
            full_data[f"{indicator}_diff_{day}"] = (
                full_data[f"{indicator}_quintile_lag_{day}"] -
                full_data[f"{indicator}_quintile_lag_{day + 1}"]).astype(np.float16)
        gc.collect()

    # create absolute difference of the lagged features (change in RSI quintile by day)
    for indicator in indicators:
        for day in range(0, num_days):
            full_data[f"{indicator}_abs_diff_{day}"] = np.abs(
                full_data[f"{indicator}_quintile_lag_{day}"] -
                full_data[f"{indicator}_quintile_lag_{day + 1}"]).astype(np.float16)
        gc.collect()

    TARGET_NAME = "target"
    PREDICTION_NAME = "signal"

    # read in Signals targets
    numerai_targets = "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val_bbg.csv"
    targets = pd.read_csv(numerai_targets)
    targets["date"] = pd.to_datetime(targets["friday_date"], format="%Y%m%d")

    # merge our feature data with Numerai targets
    ML_data = pd.merge(full_data.reset_index(),
                       targets,
                       on=["date", "bloomberg_ticker"]).set_index("date")
    print(f"Number of eras in data: {len(ML_data.index.unique())}")

    # for training and testing we want clean, complete data only
    ML_data.dropna(inplace=True)
    ML_data = ML_data[ML_data.index.weekday == 4]  # ensure we have only fridays
    ML_data = ML_data[ML_data.index.value_counts() > 200]  # drop eras with under 200 observations per era

    feature_names = [
        f for f in ML_data.columns for y in ["lag", "diff"] if y in f
    ]
    print(f"Using {len(feature_names)} features")

    last_friday = datetime.now() + relativedelta(weekday=FR(-1))
    date_string = last_friday.strftime("%Y-%m-%d")
    try:
        live_data = full_data.loc[date_string].copy()
    except KeyError as e:
        print(f"No ticker on {e}")
        live_data = full_data.iloc[:0].copy()
    live_data.dropna(subset=feature_names, inplace=True)

    # get data from the day before, for markets that were closed
    # on the most recent friday
    last_thursday = last_friday - timedelta(days=1)
    thursday_date_string = last_thursday.strftime("%Y-%m-%d")
    thursday_data = full_data.loc[thursday_date_string]
    # only select tickers that aren't already present in live_data
    thursday_data = thursday_data[
        ~thursday_data.ticker.isin(live_data.ticker.values)].copy()
    thursday_data.dropna(subset=feature_names, inplace=True)
    live_data = pd.concat([live_data, thursday_data])

    # train test split
    train_data = ML_data[ML_data["data_type"] == "train"].copy()
    test_data = ML_data[ML_data["data_type"] == "validation"].copy()
    train_data[feature_names] /= 100.0
    test_data[feature_names] /= 100.0
    live_data[feature_names] /= 100.0
    del ML_data
    gc.collect()

    # train model
    print("Training model...")
    model = GradientBoostingRegressor(n_estimators=50)
    model.fit(train_data[feature_names], train_data[TARGET_NAME])
    print("Model trained.")

    # predict test data
    train_data[PREDICTION_NAME] = model.predict(train_data[feature_names])
    test_data[PREDICTION_NAME] = model.predict(test_data[feature_names])
    print(f"Number of live tickers to submit: {len(live_data)}")
    live_data[PREDICTION_NAME] = model.predict(live_data[feature_names])

    # prepare and writeout example file
    diagnostic_df = pd.concat([test_data, live_data])
    diagnostic_df["friday_date"] = diagnostic_df.friday_date.fillna(
        last_friday.strftime("%Y%m%d")).astype(int)
    diagnostic_df["data_type"] = diagnostic_df.data_type.fillna("live")
    diagnostic_df[["bloomberg_ticker", "friday_date", "data_type",
                   "signal"]].reset_index(drop=True).to_csv(
                       "example_quandl_signal_upload.csv", index=False)
    print(
        "Example submission completed. Upload to signals.numer.ai for scores and live submission"
    )
import configparser
import logging
import os

import numerapi
import pandas as pd
from dateutil.relativedelta import relativedelta, FR
from sklearn.ensemble import GradientBoostingRegressor

TARGET_NAME = "target"
PREDICTION_NAME = "signal"
TRAINED_MODEL_PREFIX = './trained_model'

# Define models here as (ID, model instance),
# a model ID of None is submitted as your default model
MODEL_CONFIGS = [
    (None, GradientBoostingRegressor(subsample=0.1)),
    # (YOUR MODEL ID, LinearRegression(n_jobs=10))
    # etc...
]

if os.getenv('NUMERAI_PUBLIC_ID') and os.getenv('NUMERAI_SECRET_KEY'):
    napi = numerapi.SignalsAPI()
else:
    config = configparser.ConfigParser()
    config.read('../.numerai/.keys')
    # initialize API client
    napi = numerapi.SignalsAPI(
        public_id=config['numerai']['NUMERAI_PUBLIC_ID'],
        secret_key=config['numerai']['NUMERAI_SECRET_KEY'])


def download_data(live_data_date):
    eligible_tickers = pd.Series(napi.ticker_universe(),
                                 name="bloomberg_ticker")
    logging.info(f"Number of eligible tickers: {len(eligible_tickers)}")
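# --- Hypothetical sketch (not part of the original file) ---
# The configparser fallback above expects an INI-style file at ../.numerai/.keys
# with a [numerai] section holding NUMERAI_PUBLIC_ID and NUMERAI_SECRET_KEY.
# A snippet like this would create such a file with placeholder values:
import configparser
import os

os.makedirs('../.numerai', exist_ok=True)
keys = configparser.ConfigParser()
keys['numerai'] = {
    'NUMERAI_PUBLIC_ID': 'your-public-id',
    'NUMERAI_SECRET_KEY': 'your-secret-key',
}
with open('../.numerai/.keys', 'w') as fh:
    keys.write(fh)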
line_post(notification_message)
try:
    """
    Call CatBoost and submit predictions to Numerai Signals
    """
    #!pip install numerapi
    #!pip install yfinance
    #!pip install simplejson
    # #!pip install catboost

    # Tickers that Numerai signals want. These are bloomberg tickers.
    # yfinance asks for yahoo finance tickers.

    # Data acquisition
    napi = numerapi.SignalsAPI()
    eligible_tickers = pd.Series(napi.ticker_universe(), name="bloomberg_ticker")
    print(f"Number of eligible tickers : {len(eligible_tickers)}")
    print(eligible_tickers.head(10))

    # This file has mapping from bloomberg to yahoo finance tickers. So, we can use
    # yfinance tickers to download and then map/rename them back to bloomberg tickers.
    ticker_map = pd.read_csv(
        'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
    )
    print(len(ticker_map))
def api_fixture():
    api = numerapi.SignalsAPI(verbosity='DEBUG')
    return api
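# --- Hypothetical usage sketch (not part of the original file) ---
# Assuming api_fixture is registered as a pytest fixture (e.g. with @pytest.fixture,
# which this excerpt does not show), a test could consume it like this; the test
# name and assertion are illustrative only.
def test_ticker_universe_not_empty(api_fixture):
    assert len(api_fixture.ticker_universe()) > 0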
start_date = args.startDate
end_date = args.endDate
to_date = str(
    int(
        time.mktime(
            datetime.datetime.strptime(end_date, "%d/%m/%Y").timetuple())))
from_date = str(
    int(
        time.mktime(
            datetime.datetime.strptime(start_date, "%d/%m/%Y").timetuple())))
subReddit = 'wallstreetbets'

numerAI = numerapi.SignalsAPI()
eligible_tickers = pd.Series(numerAI.ticker_universe(), name="bloomberg_ticker")
numerAI_ticker_map = pd.read_csv(
    'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
)
bb_tickers = numerAI_ticker_map["bloomberg_ticker"]


def get_Pushshift_Data(query, after, before, sub):
    reURL = 'https://api.pushshift.io/reddit/search/submission/?title=' + \
        str(query) + '&size=1000&after=' + str(after) + \
        '&before=' + str(before) + '&subreddit=' + str(sub)
    print(reURL)
    r = requests.get(reURL)
    data = json.loads(r.text)