def train(df, winsor_quantile, model_name, feature_name, param, custom_filter=None):
    """Fit an XGBoost regressor on the most recent rows of ``df`` and save it.

    Only rows whose second index level equals that level's maximum are used
    (presumably the latest date per ticker -- confirm against the caller).
    The 'Close' column is winsorized and then used as the regression target;
    every remaining column except 'Dataset' is used as a feature.

    Side effects: writes ``{model_name}.pkl`` into MODELS_DIR, and
    ``{feature_name}_features.csv`` / ``{feature_name}_target.csv`` into
    DATA_DIR.

    Args:
        df: Multi-indexed DataFrame with 'Close' and 'Dataset' columns and a
            'Ticker' index level (assumed to be the first level, because the
            selected tickers are passed straight to ``.loc``). Not modified.
        winsor_quantile: Forwarded as ``quantile`` to ``sf.winsorize`` for
            the 'Close' column.
        model_name: File stem of the pickled model.
        feature_name: Prefix of the exported feature/target CSV files.
        param: Mapping of keyword arguments for ``XGBRegressor``.
        custom_filter: Optional collection of tickers to restrict the
            training data to. ``None`` or an empty collection disables
            the filter.

    Returns:
        The fitted ``XGBRegressor``.
    """
    df = df.copy()

    # Filter dataset to current stock prices only: keep the rows at the
    # maximum value of the second index level.
    level_values = df.index.get_level_values(1)
    model_df = df[level_values == level_values.max()]

    # Explicit None/empty check: plain truthiness raises ValueError for
    # multi-element NumPy arrays and pandas objects.
    if custom_filter is not None and len(custom_filter) > 0:
        tickers = model_df.reset_index()['Ticker'].unique()
        tickers = tickers[np.isin(tickers, custom_filter)]
        model_df = model_df.loc[tickers]

    # Winsorize the target to even out the distribution.
    model_df = sf.winsorize(model_df, clip=True, columns=['Close'],
                            quantile=winsor_quantile)

    # Features and target. No train/test split is made here: the model is
    # fitted on every remaining row.
    X = model_df.drop(columns=['Close', 'Dataset'])
    y = model_df['Close']

    # Fit model.
    model = XGBRegressor(**param)
    model.fit(X, y)

    # Save the model; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    with open(MODELS_DIR/f"{model_name}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Save features and target for SHAP.
    X.to_csv(DATA_DIR/f'{feature_name}_features.csv')
    y.to_csv(DATA_DIR/f'{feature_name}_target.csv')

    return model
def plot_correlation(model, X, y):
    """Scatter-plot per-ticker mean actual vs. predicted 'Close'.

    The model is fitted on ``(X, y)`` and then used to predict on the same
    ``X``. Actual and predicted values are averaged per 'Ticker'; the plot
    uses those means after ``sf.winsorize(..., clip=False, quantile=.005)``,
    while the correlation shown in the title is computed on the
    un-winsorized means.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is refitted
            in place by this call.
        X: Feature matrix passed to ``fit`` / ``predict``.
        y: Target Series; after ``reset_index()`` it must provide 'Ticker'
            and 'Close' columns.

    Returns:
        None. The plot is drawn through seaborn.
    """
    model.fit(X, y)
    predicted = pd.Series(model.predict(X), name='Predicted Close')

    # Actual values (index flattened into columns) next to the predictions.
    pred_df = pd.concat([y.reset_index(), predicted], axis=1)

    def ticker_means():
        # Fresh per-ticker averages on every call, so the plot and the
        # title never share one DataFrame object.
        selected = pred_df[['Ticker', 'Close', 'Predicted Close']]
        return selected.groupby('Ticker').mean()

    plot_data = sf.winsorize(ticker_means(), clip=False, quantile=.005)
    g = sns.scatterplot(data=plot_data, x='Close', y='Predicted Close')

    # Off-diagonal entry of the 2x2 correlation matrix.
    correlation = ticker_means().corr().values[0][1]
    text = f"Correlation: {correlation:.2%}"
    g.set_title(text)
# Name of the new column for the returns.
TOTAL_RETURN_1_3Y = 'Total Return 1-3 Years'

# Calculate the annualized mean log-returns for all future 1-3 year periods.
df_returns_1_3y = hub.mean_log_returns(
    name=TOTAL_RETURN_1_3Y,
    future=True,
    annualized=True,
    min_years=1,
    max_years=3,
)

# Place the signals and the returns side by side (column-wise).
dfs = [df_signals, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis=1)

# Winsorize the signals and returns with the sf.winsorize defaults
# (per the original note: clipping at the 5% and 95% quantiles).
# Values are clipped rather than set to NaN, because the latter would
# remove too much data.
df_sig_rets = sf.winsorize(df_sig_rets)

# Remove all rows with missing values (NaN),
# because scikit-learn cannot handle them.
df_sig_rets = df_sig_rets.dropna(how='any')

# Keep only the tickers that have more than 200 data-rows.
df_sig_rets = df_sig_rets.groupby(TICKER).filter(
    lambda group: len(group) > 200
)

# List of all unique stock-tickers left in the dataset.
tickers = df_sig_rets.reset_index()[TICKER].unique()

# Split the tickers (not the individual rows) into training- and test-sets.
tickers_train, tickers_test = train_test_split(
    tickers,
    train_size=0.8,
    random_state=1234,
)
# Combine the signals and stock-returns.
# Only the rows selected by mask_netnet (the NetNet discounts) are used.
dfs = [df_signals.loc[mask_netnet], df_returns_1_3y.loc[mask_netnet]]
df_sig_rets = pd.concat(dfs, axis=1)

# Raw data, before any outlier handling.
sns.scatterplot(
    data=df_sig_rets.reset_index(),
    x=P_NETNET,
    y=TOTAL_RETURN_1_3Y,
    hue=TICKER,
    legend=False,
)

# -------------------- Removing outliers --------------------
# Winsorization limits (clips) the data to a range, e.g. between the
# 5% and 95% quantiles of the data.

# Select all columns except for the P/NetNet ratio.
columns = df_sig_rets.columns.drop(P_NETNET)

# Winsorize all the other signals and stock-returns.
df_sig_rets2 = sf.winsorize(df_sig_rets, columns=columns)

# Plot again to see the difference: the dots that were above or below
# the bounds are now stacked along the border.
sns.scatterplot(
    data=df_sig_rets2.reset_index(),
    x=P_NETNET,
    y=TOTAL_RETURN_1_3Y,
    hue=TICKER,
    legend=False,
)

# Winsorize all the other signals and stock-returns.
# Instead of clipping values beyond the bounds, set them to NaN.
df_sig_rets = sf.winsorize(df_sig_rets, columns=columns, clip=False)

sns.scatterplot(
    data=df_sig_rets.reset_index(),
    x=P_NETNET,
    y=TOTAL_RETURN_1_3Y,
    hue=TICKER,
    legend=False,
)

# --------------------- Linear Correlation ---------------------
# We will study the linear correlation between the signals and stock-returns,
# to roughly assess which signals might be the best predictors for stock-returns.
# We will also study the linear correlation between the signals themselves,