def predict(self, X_test, pred_index=None):
    predicted_values = X_test * self.weights
    # Classify the raw predictions when the output type is classification
    if self.global_hyperparams['output_type'] == 'C':
        predicted_values = to_class(predicted_values, self.global_hyperparams['threshold'])
    # Optionally store the predictions under the provided index
    if pred_index is not None:
        self._store_predicted_values(pred_index, predicted_values)
    return predicted_values
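# The to_class helper used above is not shown in this section; below is a minimal sketch of
# what it could look like, assuming the convention documented in __main__ (1 for a return above
# +threshold, -1 below -threshold, 0 otherwise). The actual data.to_class may differ, e.g. by
# returning the index of the class rather than the label itself, as noted in __main__.
import numpy as np

def to_class_sketch(values, threshold):
    """Map raw returns to class labels {-1, 0, 1} given an absolute threshold."""
    values = np.asarray(values, dtype=float)
    return np.where(values > threshold, 1, np.where(values < -threshold, -1, 0))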
def __init__(self, global_hyperparams, pred_val, asset_data, threshold=None):
    self.pred_val = pred_val
    self.output_type = global_hyperparams['output_type']
    # Fall back on the global threshold when no specific threshold is provided
    self.threshold = threshold if threshold is not None else global_hyperparams['threshold']
    # Make sure we work with class labels, converting regression outputs if needed
    self.pred_val_class = pred_val if self.output_type == 'C' else to_class(pred_val, self.threshold)
    self.pred_val_class = self.pred_val_class.squeeze()
    # Align the asset data on the prediction index
    self.asset_data = asset_data[self.pred_val_class.index]
def predict(self, X_test, pred_index=None): """ Predict function used in main and in the cross validation process It can accept as an X_test input either an array or a dataframe and gives a corresponding output This version of the function only works for ML algorithm and it has to be recoded for TA algorithms If a pred_index is provided, the prediction will be stored in predicted_values with this index """ if self.algo_type=="ML": predicted_values=self.model.predict(X_test) if self.global_hyperparams["output_type"]=="C" and self.model._estimator_type!='classifier': # If we use a regression model and we still need to output a class predicted_values=to_class(predicted_values, self.global_hyperparams["threshold"]) else: predicted_values=np.nan # Not integrated yet if pred_index is not None: self._store_predicted_values(pred_index, predicted_values) return predicted_values
def predict(self, X_test, pred_index=None): w = self.window_size if self.mean_type == "arithmetic": predicted_values = X_test.iloc[:, :w].mean(axis=1, skipna=None) elif self.mean_type == "geometric": # Let us note that the geometric mean should be optimized using numpy vectorized operations predicted_values = 1 for col in X_test.iloc[:, :w].columns: predicted_values = predicted_values * (1 + X_test.iloc[:, col]) predicted_values = np.power(predicted_values, 1 / w) - 1 # The output will be different in case of a regression or classification, no need to change the output for a regression if self.global_hyperparams["output_type"] == "C": threshold = self.global_hyperparams["threshold"] predicted_values = to_class(predicted_values, threshold) if pred_index is not None: self._store_predicted_values(pred_index, predicted_values) return predicted_values # here we have a redundency in the return and the side effect of the method, this is used to simplify coding
def predict(self, X_test, pred_index=None):
    w = self.window_size
    if self.mean_type == 'arithmetic':
        predicted_values = X_test.iloc[:, :w].mean(axis=1, skipna=True)
    elif self.mean_type == 'geometric':
        # Note that the geometric mean should be optimized using numpy vectorized operations
        predicted_values = 1
        for col in X_test.iloc[:, :w].columns:  # We stop at column number w
            predicted_values = predicted_values * (1 + X_test[col].values)
        predicted_values = np.power(predicted_values, 1 / w) - 1
    # We classify the predictions in case of a classification output
    if self.global_hyperparams['output_type'] == 'C':
        threshold = self.global_hyperparams['threshold']
        predicted_values = to_class(predicted_values, threshold)
    if pred_index is not None:
        self._store_predicted_values(pred_index, predicted_values)
    return predicted_values
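# The comments above note that the geometric mean loop could be vectorized. A minimal sketch with
# numpy, assuming X_test holds simple returns in its first w columns, as in the loop above; like
# the loop, it does not skip NaNs. This is only a possible optimization, not the project's code.
import numpy as np

def geometric_mean_sketch(X_test, w):
    """Vectorized geometric mean of (1 + r) over the first w columns, minus 1."""
    gross = 1 + X_test.iloc[:, :w].values        # shape (n_samples, w)
    return np.power(np.prod(gross, axis=1), 1.0 / w) - 1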
dataset = data.dataset_building('quandl', asset_ids, start_date, end_date, n_max=None)  # please recode the dataset_building function to make it support local and quandl data
dataset = data.add_returns(dataset, [0])  # creates some NaNs as a result of the returns computation
dataset.dropna(inplace=True)

# We select an asset returns time series to predict from the dataset
Y_0 = dataset[dataset.columns[1]]  # need to find a reliable way to find the index of the column

# X: include all the lags of Y and additional data
lags = range(1, rolling_window_size + 1)
X = data.lagged(dataset, lags=lags)  # In X, always include all the lags of Y that you want to use for the HM as first columns (a possible implementation of lagged is sketched after this snippet)
# max_lags=max(lags)

# We could also turn X into class data, is that meaningful?
# X=to_class(X,threshold)

# In case of classification, we classify Y
if output_type == 'C':
    Y = data.to_class(Y_0, threshold)

## Creating & calibrating the different algorithms
# First define a dictionary of algorithms associated with their names
# As arguments, please include the fixed hyperparameters of the model as named arguments
# For the hyperparameter grid to use in cross validation, please provide a dictionary using the sklearn syntax
algos = {'HM': HM(global_hyperparams, hp_grid={'window_size': [10, 100, 500]}),
         # 'LR': LR(global_hyperparams),
         # 'Lasso': LR(global_hyperparams, regularization='Lasso', hp_grid={'alpha': np.logspace(-4, 1, 10)}),
         # 'ElasticNet': LR(global_hyperparams, regularization='ElasticNet', hp_grid={'alpha': np.logspace(-3, 1, 20), 'l1_ratio': np.linspace(0, 1, 20)}),
         # 'Tree': DT(global_hyperparams, hp_grid={'max_features': ['sqrt', None], 'criterion': ['gini', 'entropy']}),
         # 'RF': RF(global_hyperparams, hp_grid={'max_features': ['sqrt', None], 'n_estimators': range(10, 200, 20)}),
         # 'ADAB': ADAB(global_hyperparams, hp_grid={'n_estimators': [1, 5, 10]}, base_algo=DT(global_hyperparams)),
         # 'MLP': MLP(global_hyperparams, hp_grid={'alpha': np.linspace(0.1, 1, 9), 'hidden_layer_sizes': [(10,), (100,), (200,)]}, activation='relu', solver='lbfgs'),
         }
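# data.lagged is not shown in this section; below is a minimal sketch of one way to build the
# lagged feature matrix with pandas.shift, assuming each (column, lag) pair becomes its own
# column. The actual helper in the data module may order or name the columns differently.
import pandas as pd

def lagged_sketch(df, lags):
    """Return a DataFrame with one column per (original column, lag) pair."""
    frames = []
    for lag in lags:
        shifted = df.shift(lag)
        shifted.columns = ['{}_lag{}'.format(c, lag) for c in df.columns]
        frames.append(shifted)
    return pd.concat(frames, axis=1)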
def __main__():
    ## Global Hyperparameters
    # The window size of the rolling window used to define each training set size
    # The models will never see more than this number of points at once
    rolling_window_size = 500
    # Output type: C for Classification, R for Regression
    # Note that for a Classification, 1 means positive return, -1 means negative, and 0 means below threshold
    output_type = "C"
    # In case of a 3-class Classification, please provide an absolute level for the zero return threshold
    # Fix it to 0 for a binary classification
    # The optimal value can also be globally optimized as a result of the PnL optimisation and will be a function of the volatility of the asset
    threshold = 0.001
    # This dictionary of global hyperparameters will be passed as an argument to all built algorithms
    global_hyperparams = {"rolling_window_size": rolling_window_size,
                          "output_type": output_type,
                          "threshold": threshold}

    ## Building the dataset
    dataset = data.dataset_building(n_max=2000)

    # We select an asset returns time series to predict from the dataset
    asset_label = "EURUSD Curncy"
    Y = dataset[[asset_label]]
    Y.dropna(inplace=True)

    # With lags, used as X; maybe this implementation is not optimal, think about a slicing way to do it?
    lags = range(1, rolling_window_size + 1)
    X = data.lagged(Y, lags=lags)
    max_lags = max(lags)

    # We could also turn X into class data, is that meaningful?
    # X=to_class(X,threshold)

    # In case of classification, we transform Y and put the class labels into global_hyperparams
    if output_type == "C":
        Y = data.to_class(Y, threshold)  # Notice that now the values of Y are the indices of the classes in classes
        classes = np.unique(Y)
        global_hyperparams["classes"] = classes

    ## Creating & calibrating the different algorithms
    # First define a dictionary of algorithms associated with their names
    # As arguments, please include the fixed hyperparameters of the model as named arguments
    # For the hyperparameter grid to use in cross validation, please provide a dictionary using the sklearn syntax
    algos = {"HM AR Full window": HM(global_hyperparams, window_size=10, hp_grid={'window_size': [1, 100]}),
             "HM GEO Full window": HM(global_hyperparams, mean_type="geometric", hp_grid={'window_size': [1, 10, 50, 100]}),
             "HM AR Short Term": HM(global_hyperparams, window_size=10),
             "LR": LR(global_hyperparams),
             "Lasso": LR(global_hyperparams, regularization="Lasso", hp_grid={"alpha": np.logspace(-4, 1, 5)})}

    # Then we allow ourselves to work/calib/fit/train only a subset of these algos
    # algos_used=algos.keys()
    algos_used = ["Lasso"]
    # algos_used=["HM AR Full window"]
    # algos_used=["HM GEO Full window"]

    for key in algos_used:
        # We let each algo select the relevant data to work on
        algos[key].select_data(X)
        for i in range(rolling_window_size + max_lags, len(Y.index)):
            # Note that i is the numeric index in Y of the predicted value
            train = range(i - rolling_window_size, i)  # should be equal to i-rolling_window_size:i-1
            test = [i]  # I am not sure of the index, we can check; it is inside [] to make sure the slicing produces a DataFrame
            pred_index = Y.index[test]  # This is the timestamp of i
            # We train the algo on the training set; this includes the calibration of hyperparameters and the fitting
            algos[key].calib(X.iloc[train], Y.iloc[train], pred_index, cross_val_type="ts_cv", n_splits=5, calib_type="GridSearch")
            # We build the predictions
            algos[key].predict(X.iloc[test], pred_index)
            # for debug
            print(i)
        # We compute the outputs
        algos[key].compute_outputs(Y)
        # for debug
        print(algos[key].best_hp)

    ## Core algorithm
    # Hyperparameters of the Core algorithm
    rolling_window_size_core = rolling_window_size
    core_algo = HM(global_hyperparams, window_size=rolling_window_size_core)  # Average of the predictions

    # We first build a new dataset with all the predictions of the algos; it will be our new X
    X_core = data.core_dataset(algos, algos_used)

    ## Trading Strategy

    ## Backtest/Plots/Trading Execution

    return 0
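# data.core_dataset is not shown in this section; below is a minimal sketch of one way to assemble
# the new feature matrix from the per-algo stored predictions, assuming each algo keeps its
# predictions in a dict-like predicted_values attribute keyed by timestamp (the same assumption as
# the _store_predicted_values sketch above). The real attribute name and format may differ.
import pandas as pd

def core_dataset_sketch(algos, algos_used):
    """Build a DataFrame with one column of predictions per selected algorithm, aligned on timestamps."""
    return pd.DataFrame({key: pd.Series(algos[key].predicted_values) for key in algos_used})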