示例#1
0
def train(df, winsor_quantile, model_name, feature_name, param, custom_filter=None):
    """Fit an XGBoost regressor on the most recent rows of ``df`` and persist it.

    Only the rows whose second index level equals that level's maximum are
    used for training. The 'Close' column is winsorized (clipped) and used
    as the target; every remaining column except 'Dataset' is a feature.

    Side effects: the fitted model is pickled to
    ``MODELS_DIR/<model_name>.pkl`` and the features/target are written to
    ``DATA_DIR/<feature_name>_features.csv`` and
    ``DATA_DIR/<feature_name>_target.csv``.

    Args:
        df: DataFrame with at least two index levels and the columns
            'Close' and 'Dataset'. It is not modified.
        winsor_quantile: Quantile passed to ``sf.winsorize`` for 'Close'.
        model_name: File stem of the pickled model.
        feature_name: File-name prefix of the saved feature/target CSVs.
        param: Keyword arguments forwarded to ``XGBRegressor``.
        custom_filter: Optional collection of tickers to restrict the
            training rows to. ``None`` or an empty collection disables
            the filter.

    Returns:
        The fitted ``XGBRegressor``.
    """
    # Keep only the rows at the latest value of the second index level.
    # Boolean-mask selection returns a new frame, so the caller's `df` is
    # never touched and no up-front copy of the whole frame is needed.
    level_values = df.index.get_level_values(1)
    model_df = df[level_values == level_values.max()]

    # Explicit None/empty check: plain truthiness raises for NumPy arrays
    # with more than one element.
    if custom_filter is not None and len(custom_filter) > 0:
        tickers = model_df.reset_index()['Ticker'].unique()
        # np.isin is the supported replacement for the deprecated np.in1d.
        tickers = tickers[np.isin(tickers, custom_filter)]

        model_df = model_df.loc[tickers]

    # Winsorize the target to even out its distribution.
    model_df = sf.winsorize(model_df, clip=True, columns=['Close'],
                            quantile=winsor_quantile)

    # Features and target.
    X = model_df.drop(columns=['Close', 'Dataset'])
    y = model_df['Close']

    # Fit the model.
    model = XGBRegressor(**param)
    model.fit(X, y)

    # Save the model; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    with open(MODELS_DIR / f"{model_name}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Save features and target for later SHAP analysis.
    X.to_csv(DATA_DIR / f'{feature_name}_features.csv')
    y.to_csv(DATA_DIR / f'{feature_name}_target.csv')

    return model
def plot_correlation(model, X, y):
    """Plot per-ticker mean actual vs. predicted 'Close' with their correlation.

    The model is (re-)fitted on ``X``/``y`` and then used to predict on the
    same ``X``. Actual and predicted values are averaged per 'Ticker', the
    averages are drawn as a scatter plot, and the correlation between the
    two averaged columns is shown as the plot title.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X: Feature data passed to ``fit`` and ``predict``.
        y: Target whose ``reset_index()`` yields 'Ticker' and 'Close'
            columns.

    Returns:
        None. The plot is drawn through seaborn.
    """
    # The model is trained here, so the plot shows in-sample predictions.
    model.fit(X, y)
    predicted = pd.Series(model.predict(X), name='Predicted Close')

    # Actual and predicted values side by side, aligned by row position.
    pred_df = pd.concat([y.reset_index(), predicted], axis=1)

    # Mean actual/predicted value for each ticker.
    ticker_means = pred_df[['Ticker', 'Close', 'Predicted Close']] \
        .groupby('Ticker').mean()

    # Off-diagonal entry of the 2x2 correlation matrix of the two columns.
    correlation = ticker_means.corr().values[0][1]
    text = f"Correlation: {correlation:.2%}"

    # Outliers beyond the 0.5% / 99.5% quantiles are set to NaN (clip=False)
    # for the plot only; the correlation above uses the full data.
    plot_data = sf.winsorize(ticker_means, clip=False, quantile=.005)
    g = sns.scatterplot(data=plot_data, x='Close', y='Predicted Close')

    g.set_title(text)
示例#3
0
# Column name under which the future stock-returns are stored.
TOTAL_RETURN_1_3Y = 'Total Return 1-3 Years'

# Mean log-returns over all 1-3 year periods.
df_returns_1_3y = hub.mean_log_returns(
    name=TOTAL_RETURN_1_3Y,
    future=True,
    annualized=True,
    min_years=1,
    max_years=3,
)

# Signals and returns side by side in a single DataFrame.
dfs = [df_signals, df_returns_1_3y]

# Clean-up pipeline:
#   1. Winsorize: clip signals and returns at their 5% and 95% quantiles
#      (clipping rather than setting to NaN, which would remove too much data).
#   2. Drop every row with a missing value, because scikit-learn cannot
#      handle NaN.
#   3. Keep only the tickers that have more than 200 data-rows.
df_sig_rets = (
    sf.winsorize(pd.concat(dfs, axis=1))
    .dropna(how='any')
    .groupby(TICKER)
    .filter(lambda rows: len(rows) > 200)
)

# All unique stock-tickers that survived the clean-up.
tickers = df_sig_rets.reset_index()[TICKER].unique()

# Split by ticker (not by row) into training- and test-sets.
tickers_train, tickers_test = train_test_split(
    tickers, train_size=0.8, random_state=1234)
示例#4
0
# Combine the signals and stock-returns.
# Only the rows selected by mask_netnet (the NetNet discounts) are used.
dfs = [df_signals.loc[mask_netnet],
       df_returns_1_3y.loc[mask_netnet]]
df_sig_rets = pd.concat(dfs, axis=1)
# Scatter-plot of the raw (not yet winsorized) data, coloured per ticker.
sns.scatterplot(x=P_NETNET, y=TOTAL_RETURN_1_3Y, hue=TICKER,
                data=df_sig_rets.reset_index(), legend=False)


# -------------------- Removing outliers --------------------

# Winsorization limits (clips) the data to a quantile range,
# e.g. between the 5% and 95% quantiles of the data.
# Select all columns except for the P/NetNet ratio.
columns = df_sig_rets.columns.drop(P_NETNET)
# Winsorize all the other signals and stock-returns.
df_sig_rets2 = sf.winsorize(df_sig_rets, columns=columns)
# Plot again to see the difference: the points that were above or below
# the bounds are now stacked along the borders.
sns.scatterplot(x=P_NETNET, y=TOTAL_RETURN_1_3Y, hue=TICKER,
                data=df_sig_rets2.reset_index(), legend=False)

# Winsorize all the other signals and stock-returns once more, starting from
# the original df_sig_rets (not from the clipped df_sig_rets2).
# Instead of clipping values beyond the bounds, set them to NaN.
# Note that this rebinds df_sig_rets to the winsorized result.
df_sig_rets = sf.winsorize(df_sig_rets, columns=columns, clip=False)
sns.scatterplot(x=P_NETNET, y=TOTAL_RETURN_1_3Y, hue=TICKER,
                data=df_sig_rets.reset_index(), legend=False)

#---------------------Linear Correlation---------------------
#We will study the linear correlation between the signals and stock-returns, 
#to roughly assess which signals might be the best predictors for stock-returns.
#We will also study the linear correlation between the signals themselves,