def test_status_calc():
    """Test the status_calc function which generates training labels"""
    # Triples of (stock_change, index_change, outperformance) that must label '1'
    outperforming_cases = [(50, 20, 12.2), (-10, -30, 5), (15, 5, 10)]
    # Triples that must label '0'
    underperforming_cases = [(12.003, 10, 15), (-31, -30, 15)]

    for stock_change, index_change, threshold in outperforming_cases:
        assert utils.status_calc(stock_change, index_change, threshold) == 1
    for stock_change, index_change, threshold in underperforming_cases:
        assert utils.status_calc(stock_change, index_change, threshold) == 0

    # A negative outperformance threshold is invalid and must raise
    with pytest.raises(ValueError):
        utils.status_calc(12, 10, -3)
def backtest():
    """
    Split keystats.csv into train/test sets, fit a RandomForestClassifier to the
    train set, print its accuracy and precision on the test set, then compare
    the strategy's average return against passive investment in the S&P500.

    Thresholds (outperformance level, test-set fraction) are read from the
    module-level ``config`` object.
    """
    # Build the dataset, dropping any rows with missing values
    data_df = pd.read_csv("keystats.csv", index_col="Date")
    data_df.dropna(axis=0, how="any", inplace=True)
    features = data_df.columns[6:]
    X = data_df[features].values

    # Labels: positive when the stock beats the S&P500 by more than the
    # configured outperformance threshold
    y = list(
        status_calc(
            data_df["stock_p_change"],
            data_df["SP500_p_change"],
            outperformance=float(config["settings"]["outperform"]),
        )
    )

    # z keeps the raw percentage changes so realised returns can be computed
    z = np.array(data_df[["stock_p_change", "SP500_p_change"]])

    # Randomly split into train and test sets (z is split in lockstep with X/y)
    X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
        X, y, z, test_size=float(config["settings"]["test_size"])
    )

    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print("Classifier performance\n", "=" * 20)
    print(f"Accuracy score: {clf.score(X_test, y_test): .2f}")
    print(f"Precision score: {precision_score(y_test, y_pred): .2f}")

    # y_pred is an array of 1s and 0s, so its sum counts the 'buy' signals
    num_positive_predictions = sum(y_pred)
    # BUG FIX: the sum of 0/1 predictions can never be negative, so the old
    # '< 0' check was unreachable and a run with zero positive predictions
    # crashed with ZeroDivisionError below. Guard on zero and bail out early.
    if num_positive_predictions == 0:
        print(colored('[Error:] ', 'red') + "No stocks predicted!")
        return

    # For each predicted outperformer we 'buy' the stock and, for comparison,
    # 'buy' the index over the same period
    stock_returns = 1 + z_test[y_pred, 0] / 100
    market_returns = 1 + z_test[y_pred, 1] / 100

    # Average growth across the predicted 'buy' positions vs the index
    avg_predicted_stock_growth = sum(stock_returns) / num_positive_predictions
    index_growth = sum(market_returns) / num_positive_predictions
    percentage_stock_returns = 100 * (avg_predicted_stock_growth - 1)
    percentage_market_returns = 100 * (index_growth - 1)
    total_outperformance = percentage_stock_returns - percentage_market_returns

    print("\n Stock prediction performance report \n", "=" * 40)
    print(f"Total Trades:", num_positive_predictions)
    print(
        f"Average return for stock predictions: {percentage_stock_returns: .1f} %"
    )
    print(
        f"Average market return in the same period: {percentage_market_returns: .1f}% "
    )
    print(
        f"Compared to the index, our strategy earns {total_outperformance: .1f} percentage points more"
    )
def build_data_set():
    """
    Reads the keystats.csv file and prepares it for scikit-learn
    :return: X_train and y_train numpy arrays
    """
    # Load the key statistics and discard any rows with missing values
    dataset = pd.read_csv("keystats.csv", index_col="Date")
    dataset.dropna(axis=0, how="any", inplace=True)

    feature_columns = dataset.columns[6:]
    X_train = dataset[feature_columns].values

    # Generate the labels: '1' if a stock beats the S&P500 by more than 10%, else '0'.
    labels = status_calc(
        dataset["stock_p_change"], dataset["SP500_p_change"], OUTPERFORMANCE
    )
    y_train = list(labels)

    return X_train, y_train
def backtest():
    """
    A simple backtest, which splits the dataset into a train set and test set,
    then fits a Random Forest classifier to the train set. We print the precision
    and accuracy of the classifier on the test set, then run a backtest comparing
    this strategy's performance to passive investment in the S&P500.

    Please note that there is a methodological flaw in this backtest which will
    give deceptively good results, so the results here should not encourage you
    to live trade.
    """
    # Build the dataset, and drop any rows with missing values
    data_df = pd.read_csv("keystats.csv", index_col="Date")
    data_df.dropna(axis=0, how="any", inplace=True)

    features = data_df.columns[6:]
    X = data_df[features].values

    # The labels are generated by applying the status_calc to the dataframe.
    # '1' if a stock beats the S&P500 by more than x%, else '0'. Here x is the
    # outperformance parameter, which is set to 10 by default but can be redefined.
    y = list(
        status_calc(
            data_df["stock_p_change"], data_df["SP500_p_change"], outperformance=10
        )
    )

    # z is required for us to track returns
    z = np.array(data_df[["stock_p_change", "SP500_p_change"]])

    # Generate the train set and test set by randomly splitting the dataset
    X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
        X, y, z, test_size=0.2
    )

    # Instantiate a RandomForestClassifier with 100 trees, then fit it to the training data
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # Generate the predictions, then print test set accuracy and precision
    y_pred = clf.predict(X_test)
    print("Classifier performance\n", "=" * 20)
    print(f"Accuracy score: {clf.score(X_test, y_test): .2f}")
    print(f"Precision score: {precision_score(y_test, y_pred): .2f}")

    # Because y_pred is an array of 1s and 0s, the number of positive predictions
    # is equal to the sum of the array
    num_positive_predictions = sum(y_pred)
    # BUG FIX: sum(y_pred) is never negative, so the original '< 0' test could
    # never fire and the averages below divided by zero whenever no stock was
    # predicted to outperform. Test for zero and return early instead.
    if num_positive_predictions == 0:
        print("No stocks predicted!")
        return

    # Recall that z_test stores the change in stock price in column 0, and the
    # change in S&P500 price in column 1.
    # Whenever a stock is predicted to outperform (y_pred = 1), we 'buy' that stock
    # and simultaneously `buy` the index for comparison.
    stock_returns = 1 + z_test[y_pred, 0] / 100
    market_returns = 1 + z_test[y_pred, 1] / 100

    # Calculate the average growth for each stock we predicted 'buy'
    # and the corresponding index growth
    avg_predicted_stock_growth = sum(stock_returns) / num_positive_predictions
    index_growth = sum(market_returns) / num_positive_predictions
    percentage_stock_returns = 100 * (avg_predicted_stock_growth - 1)
    percentage_market_returns = 100 * (index_growth - 1)
    total_outperformance = percentage_stock_returns - percentage_market_returns

    print("\n Stock prediction performance report \n", "=" * 40)
    print(f"Total Trades:", num_positive_predictions)
    print(f"Average return for stock predictions: {percentage_stock_returns: .1f} %")
    print(
        f"Average market return in the same period: {percentage_market_returns: .1f}% "
    )
    print(
        f"Compared to the index, our strategy earns {total_outperformance: .1f} percentage points more"
    )